Index: sys/amd64/include/xen/xenvar.h =================================================================== --- sys/amd64/include/xen/xenvar.h +++ sys/amd64/include/xen/xenvar.h @@ -70,7 +70,6 @@ #define set_phys_to_machine(pfn, mfn) ((void)0) #define phys_to_machine_mapping_valid(pfn) (TRUE) -#define PT_UPDATES_FLUSH() ((void)0) #else Index: sys/conf/files.amd64 =================================================================== --- sys/conf/files.amd64 +++ sys/conf/files.amd64 @@ -576,7 +576,7 @@ x86/x86/tsc.c standard x86/x86/delay.c standard x86/xen/hvm.c optional xenhvm -x86/xen/xen_intr.c optional xen | xenhvm +x86/xen/xen_intr.c optional xenhvm x86/xen/pv.c optional xenhvm x86/xen/pvcpu_enum.c optional xenhvm x86/xen/xen_apic.c optional xenhvm Index: sys/conf/files.i386 =================================================================== --- sys/conf/files.i386 +++ sys/conf/files.i386 @@ -427,16 +427,15 @@ i386/i386/atomic.c standard \ compile-with "${CC} -c ${CFLAGS} ${DEFINED_PROF:S/^$/-fomit-frame-pointer/} ${.IMPSRC}" i386/i386/autoconf.c standard -i386/i386/bios.c optional native -i386/i386/bioscall.s optional native +i386/i386/bios.c standard +i386/i386/bioscall.s standard i386/i386/bpf_jit_machdep.c optional bpf_jitter i386/i386/db_disasm.c optional ddb i386/i386/db_interface.c optional ddb i386/i386/db_trace.c optional ddb i386/i386/elan-mmcr.c optional cpu_elan | cpu_soekris i386/i386/elf_machdep.c standard -i386/i386/exception.s optional native -i386/xen/exception.s optional xen +i386/i386/exception.s standard i386/i386/gdb_machdep.c optional gdb i386/i386/geode.c optional cpu_geode i386/i386/i686_mem.c optional mem @@ -444,22 +443,17 @@ i386/i386/initcpu.c standard i386/i386/io.c optional io i386/i386/k6_mem.c optional mem -i386/i386/locore.s optional native no-obj -i386/xen/locore.s optional xen no-obj +i386/i386/locore.s standard no-obj i386/i386/longrun.c optional cpu_enable_longrun i386/i386/machdep.c standard -i386/xen/xen_machdep.c optional xen i386/i386/mem.c optional mem i386/i386/minidump_machdep.c standard i386/i386/mp_clock.c optional smp -i386/i386/mp_machdep.c optional native smp -i386/xen/mp_machdep.c optional xen smp +i386/i386/mp_machdep.c optional smp i386/i386/mp_watchdog.c optional mp_watchdog smp i386/i386/mpboot.s optional smp native -i386/xen/mptable.c optional apic xen i386/i386/perfmon.c optional perfmon -i386/i386/pmap.c optional native -i386/xen/pmap.c optional xen +i386/i386/pmap.c standard i386/i386/ptrace_machdep.c standard i386/i386/stack_machdep.c optional ddb | stack i386/i386/support.s standard @@ -488,7 +482,6 @@ i386/ibcs2/ibcs2_xenix.c optional ibcs2 i386/ibcs2/ibcs2_xenix_sysent.c optional ibcs2 i386/ibcs2/imgact_coff.c optional ibcs2 -i386/xen/clock.c optional xen i386/isa/elink.c optional ep | ie i386/isa/npx.c optional npx i386/isa/pmtimer.c optional pmtimer @@ -565,8 +558,8 @@ x86/iommu/intel_quirks.c optional acpi acpi_dmar pci x86/iommu/intel_utils.c optional acpi acpi_dmar pci x86/isa/atpic.c optional atpic -x86/isa/atrtc.c optional native -x86/isa/clock.c optional native +x86/isa/atrtc.c standard +x86/isa/clock.c standard x86/isa/elcr.c optional atpic | apic native x86/isa/isa.c optional isa x86/isa/isa_dma.c optional isa @@ -582,20 +575,20 @@ x86/x86/identcpu.c standard x86/x86/intr_machdep.c standard x86/x86/io_apic.c optional apic -x86/x86/legacy.c optional native +x86/x86/legacy.c standard x86/x86/local_apic.c optional apic x86/x86/mca.c standard x86/x86/mptable.c optional apic native x86/x86/mptable_pci.c optional apic native pci -x86/x86/mp_x86.c optional native smp +x86/x86/mp_x86.c optional smp x86/x86/msi.c optional apic pci x86/x86/nexus.c standard x86/x86/tsc.c standard x86/x86/pvclock.c standard x86/x86/delay.c standard x86/xen/hvm.c optional xenhvm -x86/xen/xen_intr.c optional xen | xenhvm +x86/xen/xen_intr.c optional xenhvm x86/xen/xen_apic.c optional xenhvm -x86/xen/xenpv.c optional xen | xenhvm -x86/xen/xen_nexus.c optional xen | xenhvm -x86/xen/xen_msi.c optional xen | xenhvm +x86/xen/xenpv.c optional xenhvm +x86/xen/xen_nexus.c optional xenhvm +x86/xen/xen_msi.c optional xenhvm Index: sys/conf/options.i386 =================================================================== --- sys/conf/options.i386 +++ sys/conf/options.i386 @@ -121,8 +121,6 @@ # BPF just-in-time compiler BPF_JITTER opt_bpf.h -NATIVE opt_global.h -XEN opt_global.h XENHVM opt_global.h # options for the Intel C600 SAS driver (isci) Index: sys/dev/xen/control/control.c =================================================================== --- sys/dev/xen/control/control.c +++ sys/dev/xen/control/control.c @@ -254,7 +254,6 @@ void *shared_info = HYPERVISOR_shared_info; HYPERVISOR_shared_info = NULL; pmap_kremove((vm_offset_t) shared_info); - PT_UPDATES_FLUSH(); xen_start_info->store_mfn = MFNTOPFN(xen_start_info->store_mfn); xen_start_info->console.domU.mfn = MFNTOPFN(xen_start_info->console.domU.mfn); Index: sys/dev/xen/netfront/netfront.c =================================================================== --- sys/dev/xen/netfront/netfront.c +++ sys/dev/xen/netfront/netfront.c @@ -907,7 +907,6 @@ * We may have allocated buffers which have entries outstanding * in the page * update queue -- make sure we flush those first! */ - PT_UPDATES_FLUSH(); if (nr_flips != 0) { #ifdef notyet /* Tell the ballon driver what is going on. */ Index: sys/i386/conf/DEFAULTS =================================================================== --- sys/i386/conf/DEFAULTS +++ sys/i386/conf/DEFAULTS @@ -26,7 +26,6 @@ options GEOM_PART_MBR # enable support for native hardware -options NATIVE device atpic options NEW_PCIB Index: sys/i386/conf/XEN =================================================================== --- sys/i386/conf/XEN +++ /dev/null @@ -1,96 +0,0 @@ -# -# XEN -- Kernel configuration for i386 XEN DomU -# -# $FreeBSD$ - -cpu I686_CPU -ident XEN - -makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols - -# The following drivers don't build with PAE or XEN enabled. -makeoptions WITHOUT_MODULES="ctl dpt drm drm2 hptmv ida" - -# The following drivers don't work with PAE enabled. -makeoptions WITHOUT_MODULES+="ncr pst" - -options SCHED_ULE # ULE scheduler -options PREEMPTION # Enable kernel thread preemption - -options INET # InterNETworking -options INET6 # IPv6 communications protocols -options SCTP # Stream Control Transmission Protocol -options FFS # Berkeley Fast Filesystem -options SOFTUPDATES # Enable FFS soft updates support -options UFS_ACL # Support for access control lists -options UFS_DIRHASH # Improve performance on big directories -options UFS_GJOURNAL # Enable gjournal-based UFS journaling -options NFSCL # Network Filesystem Client -options NFSD # Network Filesystem Server -options NFSLOCKD # Network Lock Manager -options NFS_ROOT # NFS usable as /, requires NFSCL -options MSDOSFS # MSDOS Filesystem -options CD9660 # ISO 9660 Filesystem -options PROCFS # Process filesystem (requires PSEUDOFS) -options PSEUDOFS # Pseudo-filesystem framework -options GEOM_PART_GPT # GUID Partition Tables. -options GEOM_LABEL # Provides labelization -options COMPAT_FREEBSD4 # Compatible with FreeBSD4 -options COMPAT_FREEBSD5 # Compatible with FreeBSD5 -options COMPAT_FREEBSD6 # Compatible with FreeBSD6 -options COMPAT_FREEBSD7 # Compatible with FreeBSD7 -options COMPAT_FREEBSD9 # Compatible with FreeBSD9 -options COMPAT_FREEBSD10 # Compatible with FreeBSD10 -options KTRACE # ktrace(1) support -options STACK # stack(9) support -options SYSVSHM # SYSV-style shared memory -options SYSVMSG # SYSV-style message queues -options SYSVSEM # SYSV-style semaphores -options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions -options KBD_INSTALL_CDEV # install a CDEV entry in /dev -options AUDIT # Security event auditing - -# Debugging for use in -current -options KDB # Enable kernel debugger support. -options DDB # Support DDB. -options GDB # Support remote GDB. -options DEADLKRES # Enable the deadlock resolver -options INVARIANTS # Enable calls of extra sanity checking -options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS -options WITNESS # Enable checks to detect deadlocks and cycles -options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed - -options PAE -nooption NATIVE -option XEN -nodevice atpic -nodevice isa -options MCLSHIFT=12 - -# To make an SMP kernel, the next two lines are needed -options SMP # Symmetric MultiProcessor Kernel -device apic # I/O APIC - -#device atkbdc # AT keyboard controller -#device atkbd # AT keyboard -device psm # PS/2 mouse -device pci - -#device kbdmux # keyboard multiplexer - -# Pseudo devices. -device loop # Network loopback -device random # Entropy device -device ether # Ethernet support -device tun # Packet tunnel. -device md # Memory "disks" -device gif # IPv6 and IPv4 tunneling - -# Wireless cards -options IEEE80211_SUPPORT_MESH -options AH_SUPPORT_AR5416 - -# The `bpf' device enables the Berkeley Packet Filter. -# Be aware of the administrative consequences of enabling this! -# Note that 'bpf' is required for DHCP. -device bpf # Berkeley packet filter Index: sys/i386/i386/apic_vector.s =================================================================== --- sys/i386/i386/apic_vector.s +++ sys/i386/i386/apic_vector.s @@ -247,7 +247,6 @@ /* * Handler for IPIs sent via the per-cpu IPI bitmap. */ -#ifndef XEN .text SUPERALIGN_TEXT IDTVEC(ipi_intr_bitmap_handler) @@ -262,7 +261,7 @@ call ipi_bitmap_handler MEXITCOUNT jmp doreti -#endif + /* * Executed by a CPU when it receives an IPI_STOP from another CPU. */ @@ -282,7 +281,6 @@ /* * Executed by a CPU when it receives an IPI_SUSPEND from another CPU. */ -#ifndef XEN .text SUPERALIGN_TEXT IDTVEC(cpususpend) @@ -295,7 +293,6 @@ POP_FRAME jmp doreti_iret -#endif /* * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU. Index: sys/i386/i386/genassym.c =================================================================== --- sys/i386/i386/genassym.c +++ sys/i386/i386/genassym.c @@ -238,11 +238,6 @@ ASSYM(BUS_SPACE_HANDLE_IAT, offsetof(struct bus_space_handle, bsh_iat)); #endif -#ifdef XEN -ASSYM(PC_CR3, offsetof(struct pcpu, pc_cr3)); -ASSYM(XEN_HYPERVISOR_VIRT_START, HYPERVISOR_VIRT_START); -#endif - #ifdef HWPMC_HOOKS ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN); #endif Index: sys/i386/i386/machdep.c =================================================================== --- sys/i386/i386/machdep.c +++ sys/i386/i386/machdep.c @@ -160,24 +160,6 @@ uint32_t arch_i386_xbox_memsize = 0; #endif -#ifdef XEN -/* XEN includes */ -#include -#include -#include -#include -#include - -void Xhypervisor_callback(void); -void failsafe_callback(void); - -extern trap_info_t trap_table[]; -struct proc_ldt default_proc_ldt; -extern int init_first; -int running_xen = 1; -extern unsigned long physfree; -#endif /* XEN */ - /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); @@ -356,9 +338,7 @@ */ bufinit(); vm_pager_bufferinit(); -#ifndef XEN cpu_setregs(); -#endif } /* @@ -1291,13 +1271,8 @@ int _default_ldt; -#ifdef XEN -union descriptor *gdt; -union descriptor *ldt; -#else union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ -#endif static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ struct region_descriptor r_gdt, r_idt; /* table descriptors */ @@ -1397,7 +1372,6 @@ .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, -#ifndef XEN /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, @@ -1489,7 +1463,6 @@ .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, -#endif /* !XEN */ }; static struct soft_segment_descriptor ldt_segs[] = { @@ -1641,7 +1614,7 @@ ssd->ssd_gran = sd->sd_gran; } -#if !defined(PC98) && !defined(XEN) +#if !defined(PC98) static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) @@ -1748,9 +1721,8 @@ if (!add_smap_entry(smap, physmap, physmap_idxp)) break; } -#endif /* !PC98 && !XEN */ +#endif /* !PC98 */ -#ifndef XEN static void basemem_setup(void) { @@ -1798,7 +1770,6 @@ for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; } -#endif /* !XEN */ /* * Populate the (physmap) array with base/bound pairs describing the @@ -2074,8 +2045,6 @@ for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); - - PT_UPDATES_FLUSH(); } #else /* PC98 */ static void @@ -2086,7 +2055,6 @@ vm_paddr_t physmap[PHYSMAP_SIZE]; pt_entry_t *pte; quad_t dcons_addr, dcons_size, physmem_tunable; -#ifndef XEN int hasbrokenint12, i, res; u_int extmem; struct vm86frame vmf; @@ -2094,17 +2062,8 @@ vm_paddr_t pa; struct bios_smap *smap, *smapbase; caddr_t kmdp; -#endif has_smap = 0; -#if defined(XEN) - Maxmem = xen_start_info->nr_pages - init_first; - physmem = Maxmem; - basemem = 0; - physmap[0] = init_first << PAGE_SHIFT; - physmap[1] = ptoa(Maxmem) - round_page(msgbufsize); - physmap_idx = 0; -#else #ifdef XBOX if (arch_i386_is_xbox) { /* @@ -2247,7 +2206,6 @@ physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: -#endif /* * Now, physmap contains a map of physical memory. */ @@ -2321,7 +2279,6 @@ getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; -#ifndef XEN /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. @@ -2442,13 +2399,6 @@ } *pte = 0; invltlb(); -#else - phys_avail[0] = physfree; - phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; - dump_avail[0] = 0; - dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE; - -#endif /* * XXX @@ -2472,272 +2422,9 @@ for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); - - PT_UPDATES_FLUSH(); } #endif /* PC98 */ -#ifdef XEN -#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) - -register_t -init386(first) - int first; -{ - unsigned long gdtmachpfn; - int error, gsel_tss, metadata_missing, x, pa; - struct pcpu *pc; -#ifdef CPU_ENABLE_SSE - struct xstate_hdr *xhdr; -#endif - struct callback_register event = { - .type = CALLBACKTYPE_event, - .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback }, - }; - struct callback_register failsafe = { - .type = CALLBACKTYPE_failsafe, - .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback }, - }; - - thread0.td_kstack = proc0kstack; - thread0.td_kstack_pages = KSTACK_PAGES; - - /* - * This may be done better later if it gets more high level - * components in it. If so just link td->td_proc here. - */ - proc_linkup0(&proc0, &thread0); - - metadata_missing = 0; - if (xen_start_info->mod_start) { - preload_metadata = (caddr_t)xen_start_info->mod_start; - preload_bootstrap_relocate(KERNBASE); - } else { - metadata_missing = 1; - } - if (envmode == 1) - kern_envp = static_env; - else if ((caddr_t)xen_start_info->cmd_line) - kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line); - - boothowto |= xen_boothowto(kern_envp); - - /* Init basic tunables, hz etc */ - init_param1(); - - /* - * XEN occupies a portion of the upper virtual address space - * At its base it manages an array mapping machine page frames - * to physical page frames - hence we need to be able to - * access 4GB - (64MB - 4MB + 64k) - */ - gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE); - - pc = &__pcpu[0]; - gdt_segs[GPRIV_SEL].ssd_base = (int) pc; - gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; - - PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW); - bzero(gdt, PAGE_SIZE); - for (x = 0; x < NGDT; x++) - ssdtosd(&gdt_segs[x], &gdt[x].sd); - - mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); - - gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT; - PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V); - PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0); - lgdt(&r_gdt); - gdtset = 1; - - if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) { - panic("set_trap_table failed - error %d\n", error); - } - - error = HYPERVISOR_callback_op(CALLBACKOP_register, &event); - if (error == 0) - error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); -#if CONFIG_XEN_COMPAT <= 0x030002 - if (error == -ENOXENSYS) - HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL), - (unsigned long)Xhypervisor_callback, - GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback); -#endif - pcpu_init(pc, 0, sizeof(struct pcpu)); - for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) - pmap_kenter(pa + KERNBASE, pa); - dpcpu_init((void *)(first + KERNBASE), 0); - first += DPCPU_SIZE; - physfree += DPCPU_SIZE; - init_first += DPCPU_SIZE / PAGE_SIZE; - - PCPU_SET(prvspace, pc); - PCPU_SET(curthread, &thread0); - - /* - * Initialize mutexes. - * - * icu_lock: in order to allow an interrupt to occur in a critical - * section, to set pcpu->ipending (etc...) properly, we - * must be able to get the icu lock, so it can't be - * under witness. - */ - mutex_init(); - mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); - - /* make ldt memory segments */ - PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW); - bzero(ldt, PAGE_SIZE); - ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); - ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); - for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) - ssdtosd(&ldt_segs[x], &ldt[x].sd); - - default_proc_ldt.ldt_base = (caddr_t)ldt; - default_proc_ldt.ldt_len = 6; - _default_ldt = (int)&default_proc_ldt; - PCPU_SET(currentldt, _default_ldt); - PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW); - xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0])); - -#if defined(XEN_PRIVILEGED) - /* - * Initialize the i8254 before the console so that console - * initialization can use DELAY(). - */ - i8254_init(); -#endif - - /* - * Initialize the console before we print anything out. - */ - cninit(); - - if (metadata_missing) - printf("WARNING: loader(8) metadata is missing!\n"); - -#ifdef DEV_ISA -#ifdef DEV_ATPIC - elcr_probe(); - atpic_startup(); -#else - /* Reset and mask the atpics and leave them shut down. */ - atpic_reset(); - - /* - * Point the ICU spurious interrupt vectors at the APIC spurious - * interrupt handler. - */ - setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); -#endif -#endif - -#ifdef DDB - db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab); -#endif - - kdb_init(); - -#ifdef KDB - if (boothowto & RB_KDB) - kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); -#endif - - finishidentcpu(); /* Final stage of CPU initialization */ - setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - initializecpu(); /* Initialize CPU registers */ - initializecpucache(); - - /* pointer to selector slot for %fs/%gs */ - PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); - - dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = - dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; - dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = - dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); -#if defined(PAE) || defined(PAE_TABLES) - dblfault_tss.tss_cr3 = (int)IdlePDPT; -#else - dblfault_tss.tss_cr3 = (int)IdlePTD; -#endif - dblfault_tss.tss_eip = (int)dblfault_handler; - dblfault_tss.tss_eflags = PSL_KERNEL; - dblfault_tss.tss_ds = dblfault_tss.tss_es = - dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); - dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); - dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); - dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); - - vm86_initialize(); - getmemsize(first); - init_param2(physmem); - - /* now running on new page tables, configured,and u/iom is accessible */ - - msgbufinit(msgbufp, msgbufsize); -#ifdef DEV_NPX - npxinit(true); -#endif - /* - * Set up thread0 pcb after npxinit calculated pcb + fpu save - * area size. Zero out the extended state header in fpu save - * area. - */ - thread0.td_pcb = get_pcb_td(&thread0); - bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); -#ifdef CPU_ENABLE_SSE - if (use_xsave) { - xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + - 1); - xhdr->xstate_bv = xsave_mask; - } -#endif - PCPU_SET(curpcb, thread0.td_pcb); - /* make an initial tss so cpu can get interrupt stack on syscall! */ - /* Note: -16 is so we can grow the trapframe if we came from vm86 */ - PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16); - PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); - gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); - HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), - PCPU_GET(common_tss.tss_esp0)); - - /* transfer to user mode */ - - _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); - _udatasel = GSEL(GUDATA_SEL, SEL_UPL); - - /* setup proc 0's pcb */ - thread0.td_pcb->pcb_flags = 0; -#if defined(PAE) || defined(PAE_TABLES) - thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; -#else - thread0.td_pcb->pcb_cr3 = (int)IdlePTD; -#endif - thread0.td_pcb->pcb_ext = 0; - thread0.td_frame = &proc0_tf; - thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0]; - thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1]; - - cpu_probe_amdc1e(); - - /* Location of kernel stack for locore */ - return ((register_t)thread0.td_pcb); -} - -#else register_t init386(first) int first; @@ -3061,7 +2748,6 @@ /* Location of kernel stack for locore */ return ((register_t)thread0.td_pcb); } -#endif void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) Index: sys/i386/i386/minidump_machdep.c =================================================================== --- sys/i386/i386/minidump_machdep.c +++ sys/i386/i386/minidump_machdep.c @@ -68,10 +68,6 @@ static uint64_t counter, progress; CTASSERT(sizeof(*vm_page_dump) == 4); -#ifndef XEN -#define xpmap_mtop(x) (x) -#define xpmap_ptom(x) (x) -#endif static int @@ -205,7 +201,7 @@ j = va >> PDRSHIFT; if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V)) { /* This is an entire 2M page. */ - pa = xpmap_mtop(pd[j] & PG_PS_FRAME); + pa = pd[j] & PG_PS_FRAME; for (k = 0; k < NPTEPG; k++) { if (is_dumpable(pa)) dump_add_page(pa); @@ -215,10 +211,10 @@ } if ((pd[j] & PG_V) == PG_V) { /* set bit for each valid page in this 2MB block */ - pt = pmap_kenter_temporary(xpmap_mtop(pd[j] & PG_FRAME), 0); + pt = pmap_kenter_temporary(pd[j] & PG_FRAME, 0); for (k = 0; k < NPTEPG; k++) { if ((pt[k] & PG_V) == PG_V) { - pa = xpmap_mtop(pt[k] & PG_FRAME); + pa = pt[k] & PG_FRAME; if (is_dumpable(pa)) dump_add_page(pa); } @@ -318,24 +314,8 @@ continue; } if ((pd[j] & PG_V) == PG_V) { - pa = xpmap_mtop(pd[j] & PG_FRAME); -#ifndef XEN + pa = pd[j] & PG_FRAME; error = blk_write(di, 0, pa, PAGE_SIZE); -#else - pt = pmap_kenter_temporary(pa, 0); - memcpy(fakept, pt, PAGE_SIZE); - for (i = 0; i < NPTEPG; i++) - fakept[i] = xpmap_mtop(fakept[i]); - error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE); - if (error) - goto fail; - /* flush, in case we reuse fakept in the same block */ - error = blk_flush(di); - if (error) - goto fail; - bzero(fakept, sizeof(fakept)); -#endif - if (error) goto fail; } else { Index: sys/i386/i386/support.s =================================================================== --- sys/i386/i386/support.s +++ sys/i386/i386/support.s @@ -695,11 +695,9 @@ */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) -#ifndef XEN /* reload the descriptor table */ movl 4(%esp),%eax lgdt (%eax) -#endif /* flush the prefetch q */ jmp 1f Index: sys/i386/i386/swtch.s =================================================================== --- sys/i386/i386/swtch.s +++ sys/i386/i386/swtch.s @@ -88,7 +88,7 @@ movl 8(%esp),%ecx /* New thread */ movl TD_PCB(%ecx),%edx movl PCB_CR3(%edx),%eax - LOAD_CR3(%eax) + movl %eax,%cr3 /* set bit in new pm_active */ movl TD_PROC(%ecx),%eax movl P_VMSPACE(%eax), %ebx @@ -174,10 +174,10 @@ /* switch address space */ movl PCB_CR3(%edx),%eax - READ_CR3(%ebx) /* The same address space? */ + movl %cr3,%ebx /* The same address space? */ cmpl %ebx,%eax je sw0 - LOAD_CR3(%eax) /* new address space */ + movl %eax,%cr3 /* new address space */ movl %esi,%eax movl PCPU(CPUID),%esi SETOP %eax,TD_LOCK(%edi) /* Switchout td_lock */ @@ -204,18 +204,6 @@ SETOP %esi,TD_LOCK(%edi) /* Switchout td_lock */ sw1: BLOCK_SPIN(%ecx) -#ifdef XEN - pushl %eax - pushl %ecx - pushl %edx - call xen_handle_thread_switch - popl %edx - popl %ecx - popl %eax - /* - * XXX set IOPL - */ -#else /* * At this point, we've switched address spaces and are ready * to load up the rest of the next context. @@ -264,7 +252,7 @@ movl 12(%esi), %ebx movl %eax, 8(%edi) movl %ebx, 12(%edi) -#endif + /* Restore context. */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp @@ -290,7 +278,7 @@ movl _default_ldt,%eax cmpl PCPU(CURRENTLDT),%eax je 2f - LLDT(_default_ldt) + lldt _default_ldt movl %eax,PCPU(CURRENTLDT) jmp 2f 1: Index: sys/i386/i386/sys_machdep.c =================================================================== --- sys/i386/i386/sys_machdep.c +++ sys/i386/i386/sys_machdep.c @@ -59,20 +59,6 @@ #include -#ifdef XEN -#include - -void i386_reset_ldt(struct proc_ldt *pldt); - -void -i386_reset_ldt(struct proc_ldt *pldt) -{ - xen_set_ldt((vm_offset_t)pldt->ldt_base, pldt->ldt_len); -} -#else -#define i386_reset_ldt(x) -#endif - #include /* for kernel_map */ #define MAX_LD 8192 @@ -211,12 +197,7 @@ */ sd.sd_lobase = base & 0xffffff; sd.sd_hibase = (base >> 24) & 0xff; -#ifdef XEN - /* need to do nosegneg like Linux */ - sd.sd_lolimit = (HYPERVISOR_VIRT_START >> 12) & 0xffff; -#else sd.sd_lolimit = 0xffff; /* 4GB limit, wraps around */ -#endif sd.sd_hilimit = 0xf; sd.sd_type = SDT_MEMRWA; sd.sd_dpl = SEL_UPL; @@ -226,12 +207,7 @@ sd.sd_gran = 1; critical_enter(); td->td_pcb->pcb_fsd = sd; -#ifdef XEN - HYPERVISOR_update_descriptor(vtomach(&PCPU_GET(fsgs_gdt)[0]), - *(uint64_t *)&sd); -#else PCPU_GET(fsgs_gdt)[0] = sd; -#endif critical_exit(); td->td_frame->tf_fs = GSEL(GUFS_SEL, SEL_UPL); } @@ -252,12 +228,7 @@ sd.sd_lobase = base & 0xffffff; sd.sd_hibase = (base >> 24) & 0xff; -#ifdef XEN - /* need to do nosegneg like Linux */ - sd.sd_lolimit = (HYPERVISOR_VIRT_START >> 12) & 0xffff; -#else sd.sd_lolimit = 0xffff; /* 4GB limit, wraps around */ -#endif sd.sd_hilimit = 0xf; sd.sd_type = SDT_MEMRWA; sd.sd_dpl = SEL_UPL; @@ -267,12 +238,7 @@ sd.sd_gran = 1; critical_enter(); td->td_pcb->pcb_gsd = sd; -#ifdef XEN - HYPERVISOR_update_descriptor(vtomach(&PCPU_GET(fsgs_gdt)[1]), - *(uint64_t *)&sd); -#else PCPU_GET(fsgs_gdt)[1] = sd; -#endif critical_exit(); load_gs(GSEL(GUGS_SEL, SEL_UPL)); } @@ -434,10 +400,6 @@ } pldt = mdp->md_ldt; -#ifdef XEN - i386_reset_ldt(pldt); - PCPU_SET(currentldt, (int)pldt); -#else #ifdef SMP gdt[PCPU_GET(cpuid) * NGDT + GUSERLDT_SEL].sd = pldt->ldt_sd; #else @@ -445,7 +407,6 @@ #endif lldt(GSEL(GUSERLDT_SEL, SEL_KPL)); PCPU_SET(currentldt, GSEL(GUSERLDT_SEL, SEL_KPL)); -#endif /* XEN */ if (dtlocked) mtx_unlock_spin(&dt_lock); } @@ -464,43 +425,6 @@ } #endif -#ifdef XEN - -/* - * dt_lock must be held. Returns with dt_lock held. - */ -struct proc_ldt * -user_ldt_alloc(struct mdproc *mdp, int len) -{ - struct proc_ldt *pldt, *new_ldt; - - mtx_assert(&dt_lock, MA_OWNED); - mtx_unlock_spin(&dt_lock); - new_ldt = malloc(sizeof(struct proc_ldt), - M_SUBPROC, M_WAITOK); - - new_ldt->ldt_len = len = NEW_MAX_LD(len); - new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena, - round_page(len * sizeof(union descriptor)), M_WAITOK); - new_ldt->ldt_refcnt = 1; - new_ldt->ldt_active = 0; - - mtx_lock_spin(&dt_lock); - if ((pldt = mdp->md_ldt)) { - if (len > pldt->ldt_len) - len = pldt->ldt_len; - bcopy(pldt->ldt_base, new_ldt->ldt_base, - len * sizeof(union descriptor)); - } else { - bcopy(ldt, new_ldt->ldt_base, PAGE_SIZE); - } - mtx_unlock_spin(&dt_lock); /* XXX kill once pmap locking fixed. */ - pmap_map_readonly(kernel_pmap, (vm_offset_t)new_ldt->ldt_base, - new_ldt->ldt_len*sizeof(union descriptor)); - mtx_lock_spin(&dt_lock); /* XXX kill once pmap locking fixed. */ - return (new_ldt); -} -#else /* * dt_lock must be held. Returns with dt_lock held. */ @@ -535,7 +459,6 @@ return (new_ldt); } -#endif /* !XEN */ /* * Must be called with dt_lock held. Returns with dt_lock unheld. @@ -553,13 +476,8 @@ } if (td == curthread) { -#ifdef XEN - i386_reset_ldt(&default_proc_ldt); - PCPU_SET(currentldt, (int)&default_proc_ldt); -#else lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); -#endif } mdp->md_ldt = NULL; @@ -785,27 +703,7 @@ td->td_retval[0] = uap->start; return (error); } -#ifdef XEN -static int -i386_set_ldt_data(struct thread *td, int start, int num, - union descriptor *descs) -{ - struct mdproc *mdp = &td->td_proc->p_md; - struct proc_ldt *pldt = mdp->md_ldt; - mtx_assert(&dt_lock, MA_OWNED); - - while (num) { - xen_update_descriptor( - &((union descriptor *)(pldt->ldt_base))[start], - descs); - num--; - start++; - descs++; - } - return (0); -} -#else static int i386_set_ldt_data(struct thread *td, int start, int num, union descriptor *descs) @@ -821,7 +719,6 @@ num * sizeof(union descriptor)); return (0); } -#endif /* !XEN */ static int i386_ldt_grow(struct thread *td, int len) Index: sys/i386/i386/vm_machdep.c =================================================================== --- sys/i386/i386/vm_machdep.c +++ sys/i386/i386/vm_machdep.c @@ -89,9 +89,6 @@ #include #include -#ifdef XEN -#include -#endif #ifdef PC98 #include #else @@ -304,10 +301,8 @@ /* Setup to release spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; - /* - * XXX XEN need to check on PSL_USER is handled - */ td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I; + /* * Now, cpu_switch() can schedule the new process. * pcb_esp is loaded pointing to the cpu_switch() stack frame @@ -698,12 +693,6 @@ #endif disable_intr(); -#ifdef XEN - if (smp_processor_id() == 0) - HYPERVISOR_shutdown(SHUTDOWN_reboot); - else - HYPERVISOR_shutdown(SHUTDOWN_poweroff); -#endif #ifdef CPU_ELAN if (elan_mmcr != NULL) elan_mmcr->RESCFG = 1; @@ -797,13 +786,8 @@ */ ptep = vtopte(sf->kva); opte = *ptep; -#ifdef XEN - PT_SET_MA(sf->kva, xpmap_ptom(VM_PAGE_TO_PHYS(sf->m)) | pgeflag - | PG_RW | PG_V | pmap_cache_bits(sf->m->md.pat_mode, 0)); -#else *ptep = VM_PAGE_TO_PHYS(sf->m) | pgeflag | PG_RW | PG_V | pmap_cache_bits(sf->m->md.pat_mode, 0); -#endif /* * Avoid unnecessary TLB invalidations: If the sf_buf's old @@ -854,15 +838,8 @@ int sf_buf_unmap(struct sf_buf *sf) { -#ifdef XEN - /* - * Xen doesn't like having dangling R/W mappings - */ - pmap_qremove(sf->kva, 1); - return (1); -#else + return (0); -#endif } static void Index: sys/i386/include/asmacros.h =================================================================== --- sys/i386/include/asmacros.h +++ sys/i386/include/asmacros.h @@ -176,37 +176,6 @@ movl $KPSEL, %eax ; /* reload with per-CPU data segment */ \ movl %eax, %fs -#ifdef XEN -#define LOAD_CR3(reg) \ - movl reg,PCPU(CR3); \ - pushl %ecx ; \ - pushl %edx ; \ - pushl %esi ; \ - pushl reg ; \ - call xen_load_cr3 ; \ - addl $4,%esp ; \ - popl %esi ; \ - popl %edx ; \ - popl %ecx ; \ - -#define READ_CR3(reg) movl PCPU(CR3),reg; -#define LLDT(arg) \ - pushl %edx ; \ - pushl %eax ; \ - xorl %eax,%eax ; \ - movl %eax,%gs ; \ - call i386_reset_ldt ; \ - popl %eax ; \ - popl %edx -#define CLI call ni_cli -#else -#define LOAD_CR3(reg) movl reg,%cr3; -#define READ_CR3(reg) movl %cr3,reg; -#define LLDT(arg) lldt arg; -#define CLI cli -#endif /* !XEN */ - - #endif /* LOCORE */ #ifdef __STDC__ Index: sys/i386/include/cpufunc.h =================================================================== --- sys/i386/include/cpufunc.h +++ sys/i386/include/cpufunc.h @@ -42,17 +42,6 @@ #error this file needs sys/cdefs.h as a prerequisite #endif -#ifdef XEN -extern void xen_cli(void); -extern void xen_sti(void); -extern u_int xen_rcr2(void); -extern void xen_load_cr3(u_int data); -extern void xen_tlb_flush(void); -extern void xen_invlpg(u_int addr); -extern void write_eflags(u_int eflags); -extern u_int read_eflags(void); -#endif - struct region_descriptor; #define readb(va) (*(volatile uint8_t *) (va)) @@ -106,11 +95,8 @@ static __inline void disable_intr(void) { -#ifdef XEN - xen_cli(); -#else + __asm __volatile("cli" : : : "memory"); -#endif } static __inline void @@ -132,11 +118,8 @@ static __inline void enable_intr(void) { -#ifdef XEN - xen_sti(); -#else + __asm __volatile("sti"); -#endif } static __inline void @@ -325,11 +308,7 @@ } static __inline u_int -#ifdef XEN -_read_eflags(void) -#else read_eflags(void) -#endif { u_int ef; @@ -389,11 +368,7 @@ } static __inline void -#ifdef XEN -_write_eflags(u_int ef) -#else write_eflags(u_int ef) -#endif { __asm __volatile("pushl %0; popfl" : : "r" (ef)); } @@ -425,9 +400,6 @@ { u_int data; -#ifdef XEN - return (xen_rcr2()); -#endif __asm __volatile("movl %%cr2,%0" : "=r" (data)); return (data); } @@ -435,11 +407,8 @@ static __inline void load_cr3(u_int data) { -#ifdef XEN - xen_load_cr3(data); -#else + __asm __volatile("movl %0,%%cr3" : : "r" (data) : "memory"); -#endif } static __inline u_int @@ -491,11 +460,8 @@ static __inline void invltlb(void) { -#ifdef XEN - xen_tlb_flush(); -#else + load_cr3(rcr3()); -#endif } /* @@ -506,11 +472,7 @@ invlpg(u_int addr) { -#ifdef XEN - xen_invlpg(addr); -#else __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory"); -#endif } static __inline u_short Index: sys/i386/include/intr_machdep.h =================================================================== --- sys/i386/include/intr_machdep.h +++ sys/i386/include/intr_machdep.h @@ -58,13 +58,7 @@ (FIRST_MSI_INT + NUM_MSI_INTS) #define LAST_EVTCHN_INT \ (FIRST_EVTCHN_INT + NUM_EVTCHN_INTS - 1) -#elif defined(XEN) -#include -#define NUM_EVTCHN_INTS NR_EVENT_CHANNELS -#define FIRST_EVTCHN_INT 0 -#define LAST_EVTCHN_INT \ - (FIRST_EVTCHN_INT + NUM_EVTCHN_INTS - 1) -#else /* !XEN && !XENHVM */ +#else /* !XENHVM */ #define NUM_EVTCHN_INTS 0 #endif #define NUM_IO_INTS (FIRST_MSI_INT + NUM_MSI_INTS + NUM_EVTCHN_INTS) Index: sys/i386/include/pcpu.h =================================================================== --- sys/i386/include/pcpu.h +++ sys/i386/include/pcpu.h @@ -44,34 +44,6 @@ * other processors" */ -#if defined(XEN) - -/* These are peridically updated in shared_info, and then copied here. */ -struct shadow_time_info { - uint64_t tsc_timestamp; /* TSC at last update of time vals. */ - uint64_t system_timestamp; /* Time, in nanosecs, since boot. */ - uint32_t tsc_to_nsec_mul; - uint32_t tsc_to_usec_mul; - int tsc_shift; - uint32_t version; -}; - -#define PCPU_XEN_FIELDS \ - ; \ - u_int pc_cr3; /* track cr3 for R1/R3*/ \ - vm_paddr_t *pc_pdir_shadow; \ - uint64_t pc_processed_system_time; \ - struct shadow_time_info pc_shadow_time; \ - char __pad[185] - -#else /* !XEN */ - -#define PCPU_XEN_FIELDS \ - ; \ - char __pad[233] - -#endif - #define PCPU_MD_FIELDS \ char pc_monitorbuf[128] __aligned(128); /* cache line */ \ struct pcpu *pc_prvspace; /* Self-reference */ \ @@ -85,8 +57,8 @@ u_int pc_apic_id; \ int pc_private_tss; /* Flag indicating private tss*/\ u_int pc_cmci_mask; /* MCx banks for CMCI */ \ - u_int pc_vcpu_id /* Xen vCPU ID */ \ - PCPU_XEN_FIELDS + u_int pc_vcpu_id; /* Xen vCPU ID */ \ + char __pad[233] #ifdef _KERNEL Index: sys/i386/include/pmap.h =================================================================== --- sys/i386/include/pmap.h +++ sys/i386/include/pmap.h @@ -219,76 +219,6 @@ */ #define vtophys(va) pmap_kextract((vm_offset_t)(va)) -#if defined(XEN) -#include - -#include - -#include -#include - -extern pt_entry_t pg_nx; - -#define PG_KERNEL (PG_V | PG_A | PG_RW | PG_M) - -#define MACH_TO_VM_PAGE(ma) PHYS_TO_VM_PAGE(xpmap_mtop((ma))) -#define VM_PAGE_TO_MACH(m) xpmap_ptom(VM_PAGE_TO_PHYS((m))) - -#define VTOM(va) xpmap_ptom(VTOP(va)) - -static __inline vm_paddr_t -pmap_kextract_ma(vm_offset_t va) -{ - vm_paddr_t ma; - if ((ma = PTD[va >> PDRSHIFT]) & PG_PS) { - ma = (ma & ~(NBPDR - 1)) | (va & (NBPDR - 1)); - } else { - ma = (*vtopte(va) & PG_FRAME) | (va & PAGE_MASK); - } - return ma; -} - -static __inline vm_paddr_t -pmap_kextract(vm_offset_t va) -{ - return xpmap_mtop(pmap_kextract_ma(va)); -} -#define vtomach(va) pmap_kextract_ma(((vm_offset_t) (va))) - -vm_paddr_t pmap_extract_ma(struct pmap *pmap, vm_offset_t va); - -void pmap_kenter_ma(vm_offset_t va, vm_paddr_t pa); -void pmap_map_readonly(struct pmap *pmap, vm_offset_t va, int len); -void pmap_map_readwrite(struct pmap *pmap, vm_offset_t va, int len); - -static __inline pt_entry_t -pte_load_store(pt_entry_t *ptep, pt_entry_t v) -{ - pt_entry_t r; - - r = *ptep; - PT_SET_VA(ptep, v, TRUE); - return (r); -} - -static __inline pt_entry_t -pte_load_store_ma(pt_entry_t *ptep, pt_entry_t v) -{ - pt_entry_t r; - - r = *ptep; - PT_SET_VA_MA(ptep, v, TRUE); - return (r); -} - -#define pte_load_clear(ptep) pte_load_store((ptep), (pt_entry_t)0ULL) - -#define pte_store(ptep, pte) pte_load_store((ptep), (pt_entry_t)pte) -#define pte_store_ma(ptep, pte) pte_load_store_ma((ptep), (pt_entry_t)pte) -#define pde_store_ma(ptep, pte) pte_load_store_ma((ptep), (pt_entry_t)pte) - -#elif !defined(XEN) - /* * KPTmap is a linear mapping of the kernel page table. It differs from the * recursive mapping in two ways: (1) it only provides access to kernel page @@ -328,13 +258,8 @@ } return (pa); } -#endif - -#if !defined(XEN) -#define PT_UPDATES_FLUSH() -#endif -#if (defined(PAE) || defined(PAE_TABLES)) && !defined(XEN) +#if (defined(PAE) || defined(PAE_TABLES)) #define pde_cmpset(pdep, old, new) atomic_cmpset_64_i586(pdep, old, new) #define pte_load_store(ptep, pte) atomic_swap_64_i586(ptep, pte) @@ -343,7 +268,7 @@ extern pt_entry_t pg_nx; -#elif !defined(PAE) && !defined(PAE_TABLES) && !defined(XEN) +#else /* !(PAE || PAE_TABLES) */ #define pde_cmpset(pdep, old, new) atomic_cmpset_int(pdep, old, new) #define pte_load_store(ptep, pte) atomic_swap_int(ptep, pte) @@ -352,7 +277,7 @@ *(u_int *)(ptep) = (u_int)(pte); \ } while (0) -#endif /* PAE */ +#endif /* !(PAE || PAE_TABLES) */ #define pte_clear(ptep) pte_store(ptep, 0) Index: sys/i386/include/segments.h =================================================================== --- sys/i386/include/segments.h +++ sys/i386/include/segments.h @@ -82,14 +82,8 @@ #ifdef _KERNEL extern int _default_ldt; -#ifdef XEN -extern struct proc_ldt default_proc_ldt; -extern union descriptor *gdt; -extern union descriptor *ldt; -#else extern union descriptor gdt[]; extern union descriptor ldt[NLDT]; -#endif extern struct soft_segment_descriptor gdt_segs[]; extern struct gate_descriptor *idt; extern struct region_descriptor r_gdt, r_idt; Index: sys/i386/include/smp.h =================================================================== --- sys/i386/include/smp.h +++ sys/i386/include/smp.h @@ -90,9 +90,7 @@ void assign_cpu_ids(void); void cpu_add(u_int apic_id, char boot_cpu); void cpustop_handler(void); -#ifndef XEN void cpususpend_handler(void); -#endif void init_secondary_tail(void); void invltlb_handler(void); void invlpg_handler(void); @@ -101,9 +99,7 @@ void init_secondary(void); void ipi_startup(int apic_id, int vector); void ipi_all_but_self(u_int ipi); -#ifndef XEN void ipi_bitmap_handler(struct trapframe frame); -#endif void ipi_cpu(int cpu, u_int ipi); int ipi_nmi_handler(void); void ipi_selected(cpuset_t cpus, u_int ipi); @@ -121,9 +117,6 @@ void topo_probe(void); void ipi_send_cpu(int cpu, u_int ipi); -#ifdef XEN -void ipi_to_irq_init(void); -#endif #endif /* !LOCORE */ #endif /* SMP */ Index: sys/i386/include/vmparam.h =================================================================== --- sys/i386/include/vmparam.h +++ sys/i386/include/vmparam.h @@ -135,11 +135,7 @@ * Kernel physical load address. */ #ifndef KERNLOAD -#if defined(XEN) && !defined(XEN_PRIVILEGED_GUEST) -#define KERNLOAD 0 -#else #define KERNLOAD (1 << PDRSHIFT) -#endif #endif /* !defined(KERNLOAD) */ /* @@ -149,11 +145,7 @@ * messy at times, but hey, we'll do anything to save a page :-) */ -#ifdef XEN -#define VM_MAX_KERNEL_ADDRESS HYPERVISOR_VIRT_START -#else #define VM_MAX_KERNEL_ADDRESS VADDR(KPTDI+NKPDE-1, NPTEPG-1) -#endif #define VM_MIN_KERNEL_ADDRESS VADDR(PTDPTDI, PTDPTDI) Index: sys/i386/isa/npx.c =================================================================== --- sys/i386/isa/npx.c +++ sys/i386/isa/npx.c @@ -69,10 +69,6 @@ #include #include -#ifdef XEN -#include -#include -#endif #ifdef DEV_ISA #include @@ -157,13 +153,8 @@ #endif /* __GNUCLIKE_ASM && !lint */ -#ifdef XEN -#define start_emulating() (HYPERVISOR_fpu_taskswitch(1)) -#define stop_emulating() (HYPERVISOR_fpu_taskswitch(0)) -#else #define start_emulating() load_cr0(rcr0() | CR0_TS) #define stop_emulating() clts() -#endif #ifdef CPU_ENABLE_SSE #define GET_FPU_CW(thread) \ Index: sys/i386/pci/pci_cfgreg.c =================================================================== --- sys/i386/pci/pci_cfgreg.c +++ sys/i386/pci/pci_cfgreg.c @@ -93,9 +93,7 @@ int bytes); static int pcireg_cfgread(int bus, int slot, int func, int reg, int bytes); static void pcireg_cfgwrite(int bus, int slot, int func, int reg, int data, int bytes); -#ifndef XEN static int pcireg_cfgopen(void); -#endif static int pciereg_cfgread(int bus, unsigned slot, unsigned func, unsigned reg, unsigned bytes); static void pciereg_cfgwrite(int bus, unsigned slot, unsigned func, @@ -116,7 +114,6 @@ return (line); } -#ifndef XEN static u_int16_t pcibios_get_version(void) { @@ -137,7 +134,6 @@ } return (args.ebx & 0xffff); } -#endif /* * Initialise access to PCI configuration space @@ -145,9 +141,6 @@ int pci_cfgregopen(void) { -#ifdef XEN - return (0); -#else static int opened = 0; uint64_t pciebar; u_int16_t vid, did; @@ -202,7 +195,6 @@ } return(1); -#endif } static uint32_t @@ -390,7 +382,6 @@ mtx_unlock_spin(&pcicfg_mtx); } -#ifndef XEN /* check whether the configuration mechanism has been correctly identified */ static int pci_cfgcheck(int maxdev) @@ -607,7 +598,6 @@ return (1); } -#endif /* !XEN */ #define PCIE_PADDR(base, reg, bus, slot, func) \ ((base) + \ Index: sys/i386/pci/pci_pir.c =================================================================== --- sys/i386/pci/pci_pir.c +++ sys/i386/pci/pci_pir.c @@ -137,9 +137,6 @@ int i; uint8_t ck, *cv; -#ifdef XEN - return; -#else /* Don't try if we've already found a table. */ if (pci_route_table != NULL) return; @@ -150,7 +147,7 @@ sigaddr = bios_sigsearch(0, "_PIR", 4, 16, 0); if (sigaddr == 0) return; -#endif + /* If we found something, check the checksum and length. */ /* XXX - Use pmap_mapdev()? */ pt = (struct PIR_table *)(uintptr_t)BIOS_PADDRTOVADDR(sigaddr); @@ -481,11 +478,7 @@ args.eax = PCIBIOS_ROUTE_INTERRUPT; args.ebx = (bus << 8) | (device << 3) | func; args.ecx = (irq << 8) | (0xa + pin); -#ifdef XEN - return (0); -#else return (bios32(&args, PCIbios.ventry, GSEL(GCODE_SEL, SEL_KPL))); -#endif } Index: sys/i386/xen/clock.c =================================================================== --- sys/i386/xen/clock.c +++ /dev/null @@ -1,570 +0,0 @@ -/*- - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * William Jolitz and Don Ahn. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * from: @(#)clock.c 7.2 (Berkeley) 5/12/91 - */ - -#include -__FBSDID("$FreeBSD$"); - -/* #define DELAYDEBUG */ -/* - * Routines to handle clock hardware. - */ - -#include "opt_ddb.h" -#include "opt_clock.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#if defined(SMP) -#include -#endif -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * 32-bit time_t's can't reach leap years before 1904 or after 2036, so we - * can use a simple formula for leap years. - */ -#define LEAPYEAR(y) (!((y) % 4)) -#define DAYSPERYEAR (28+30*4+31*7) - -#ifndef TIMER_FREQ -#define TIMER_FREQ 1193182 -#endif - -#ifdef CYC2NS_SCALE_FACTOR -#undef CYC2NS_SCALE_FACTOR -#endif -#define CYC2NS_SCALE_FACTOR 10 - -/* Values for timerX_state: */ -#define RELEASED 0 -#define RELEASE_PENDING 1 -#define ACQUIRED 2 -#define ACQUIRE_PENDING 3 - -struct mtx clock_lock; -#define RTC_LOCK_INIT \ - mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE) -#define RTC_LOCK mtx_lock_spin(&clock_lock) -#define RTC_UNLOCK mtx_unlock_spin(&clock_lock) -#define NS_PER_TICK (1000000000ULL/hz) - -int adjkerntz; /* local offset from UTC in seconds */ -int clkintr_pending; -int pscnt = 1; -int psdiv = 1; -int wall_cmos_clock; -u_int timer_freq = TIMER_FREQ; -static u_long cyc2ns_scale; -static uint64_t processed_system_time; /* stime (ns) at last processing. */ - -#define do_div(n,base) ({ \ - unsigned long __upper, __low, __high, __mod, __base; \ - __base = (base); \ - __asm("":"=a" (__low), "=d" (__high):"A" (n)); \ - __upper = __high; \ - if (__high) { \ - __upper = __high % (__base); \ - __high = __high / (__base); \ - } \ - __asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \ - __asm("":"=A" (n):"a" (__low),"d" (__high)); \ - __mod; \ -}) - - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_mhz * 10^6)) - * ns = cycles * (10^3 / cpu_mhz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^3 * SC / cpu_mhz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -static inline void set_cyc2ns_scale(unsigned long cpu_mhz) -{ - cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return ((cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR); -} - -static uint32_t -getit(void) -{ - return (pvclock_get_last_cycles()); -} - - -/* - * XXX: timer needs more SMP work. - */ -void -i8254_init(void) -{ - - RTC_LOCK_INIT; -} - -/* - * Wait "n" microseconds. - * Relies on timer 1 counting down from (timer_freq / hz) - * Note: timer had better have been programmed before this is first used! - */ -void -i8254_delay(int n) -{ - int delta, ticks_left; - uint32_t tick, prev_tick; -#ifdef DELAYDEBUG - int getit_calls = 1; - int n1; - static int state = 0; - - if (state == 0) { - state = 1; - for (n1 = 1; n1 <= 10000000; n1 *= 10) - DELAY(n1); - state = 2; - } - if (state == 1) - printf("DELAY(%d)...", n); -#endif - /* - * Read the counter first, so that the rest of the setup overhead is - * counted. Guess the initial overhead is 20 usec (on most systems it - * takes about 1.5 usec for each of the i/o's in getit(). The loop - * takes about 6 usec on a 486/33 and 13 usec on a 386/20. The - * multiplications and divisions to scale the count take a while). - * - * However, if ddb is active then use a fake counter since reading - * the i8254 counter involves acquiring a lock. ddb must not go - * locking for many reasons, but it calls here for at least atkbd - * input. - */ - prev_tick = getit(); - - n -= 0; /* XXX actually guess no initial overhead */ - /* - * Calculate (n * (timer_freq / 1e6)) without using floating point - * and without any avoidable overflows. - */ - if (n <= 0) - ticks_left = 0; - else if (n < 256) - /* - * Use fixed point to avoid a slow division by 1000000. - * 39099 = 1193182 * 2^15 / 10^6 rounded to nearest. - * 2^15 is the first power of 2 that gives exact results - * for n between 0 and 256. - */ - ticks_left = ((u_int)n * 39099 + (1 << 15) - 1) >> 15; - else - /* - * Don't bother using fixed point, although gcc-2.7.2 - * generates particularly poor code for the long long - * division, since even the slow way will complete long - * before the delay is up (unless we're interrupted). - */ - ticks_left = ((u_int)n * (long long)timer_freq + 999999) - / 1000000; - - while (ticks_left > 0) { - tick = getit(); -#ifdef DELAYDEBUG - ++getit_calls; -#endif - delta = tick - prev_tick; - prev_tick = tick; - if (delta < 0) { - /* - * Guard against timer0_max_count being wrong. - * This shouldn't happen in normal operation, - * but it may happen if set_timer_freq() is - * traced. - */ - /* delta += timer0_max_count; ??? */ - if (delta < 0) - delta = 0; - } - ticks_left -= delta; - } -#ifdef DELAYDEBUG - if (state == 1) - printf(" %d calls to getit() at %d usec each\n", - getit_calls, (n + 5) / getit_calls); -#endif -} - -void -startrtclock() -{ - uint64_t __cpu_khz; - uint32_t cpu_khz; - struct vcpu_time_info *info; - - __cpu_khz = 1000000ULL << 32; - info = &HYPERVISOR_shared_info->vcpu_info[0].time; - - (void)do_div(__cpu_khz, info->tsc_to_system_mul); - if ( info->tsc_shift < 0 ) - cpu_khz = __cpu_khz << -info->tsc_shift; - else - cpu_khz = __cpu_khz >> info->tsc_shift; - - printf("Xen reported: %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - - /* (10^6 * 2^32) / cpu_hz = (10^3 * 2^32) / cpu_khz = - (2^32 * 1 / (clocks/us)) */ - - set_cyc2ns_scale(cpu_khz/1000); - tsc_freq = cpu_khz * 1000; -} - -/* - * RTC support routines - */ - - -static __inline int -readrtc(int port) -{ - return(bcd2bin(rtcin(port))); -} - - -#ifdef XEN_PRIVILEGED_GUEST - -/* - * Initialize the time of day register, based on the time base which is, e.g. - * from a filesystem. - */ -static void -domu_inittodr(time_t base) -{ - unsigned long sec; - int s, y; - struct timespec ts; - - update_wallclock(); - add_uptime_to_wallclock(); - - RTC_LOCK; - - if (base) { - ts.tv_sec = base; - ts.tv_nsec = 0; - tc_setclock(&ts); - } - - sec += tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); - - y = time_second - shadow_tv.tv_sec; - if (y <= -2 || y >= 2) { - /* badly off, adjust it */ - tc_setclock(&shadow_tv); - } - RTC_UNLOCK; -} - -/* - * Write system time back to RTC. - */ -static void -domu_resettodr(void) -{ - unsigned long tm; - int s; - dom0_op_t op; - struct shadow_time_info *shadow; - struct pcpu *pc; - - pc = pcpu_find(smp_processor_id()); - shadow = &pc->pc_shadow_time; - if (xen_disable_rtc_set) - return; - - s = splclock(); - tm = time_second; - splx(s); - - tm -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); - - if ((xen_start_info->flags & SIF_INITDOMAIN) && - !independent_wallclock) - { - op.cmd = DOM0_SETTIME; - op.u.settime.secs = tm; - op.u.settime.nsecs = 0; - op.u.settime.system_time = shadow->system_timestamp; - HYPERVISOR_dom0_op(&op); - update_wallclock(); - add_uptime_to_wallclock(); - } else if (independent_wallclock) { - /* notyet */ - ; - } -} - -/* - * Initialize the time of day register, based on the time base which is, e.g. - * from a filesystem. - */ -void -inittodr(time_t base) -{ - unsigned long sec, days; - int year, month; - int y, m, s; - struct timespec ts; - - if (!(xen_start_info->flags & SIF_INITDOMAIN)) { - domu_inittodr(base); - return; - } - - if (base) { - s = splclock(); - ts.tv_sec = base; - ts.tv_nsec = 0; - tc_setclock(&ts); - splx(s); - } - - /* Look if we have a RTC present and the time is valid */ - if (!(rtcin(RTC_STATUSD) & RTCSD_PWR)) - goto wrong_time; - - /* wait for time update to complete */ - /* If RTCSA_TUP is zero, we have at least 244us before next update */ - s = splhigh(); - while (rtcin(RTC_STATUSA) & RTCSA_TUP) { - splx(s); - s = splhigh(); - } - - days = 0; -#ifdef USE_RTC_CENTURY - year = readrtc(RTC_YEAR) + readrtc(RTC_CENTURY) * 100; -#else - year = readrtc(RTC_YEAR) + 1900; - if (year < 1970) - year += 100; -#endif - if (year < 1970) { - splx(s); - goto wrong_time; - } - month = readrtc(RTC_MONTH); - for (m = 1; m < month; m++) - days += daysinmonth[m-1]; - if ((month > 2) && LEAPYEAR(year)) - days ++; - days += readrtc(RTC_DAY) - 1; - for (y = 1970; y < year; y++) - days += DAYSPERYEAR + LEAPYEAR(y); - sec = ((( days * 24 + - readrtc(RTC_HRS)) * 60 + - readrtc(RTC_MIN)) * 60 + - readrtc(RTC_SEC)); - /* sec now contains the number of seconds, since Jan 1 1970, - in the local time zone */ - - sec += tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); - - y = time_second - sec; - if (y <= -2 || y >= 2) { - /* badly off, adjust it */ - ts.tv_sec = sec; - ts.tv_nsec = 0; - tc_setclock(&ts); - } - splx(s); - return; - - wrong_time: - printf("Invalid time in real time clock.\n"); - printf("Check and reset the date immediately!\n"); -} - - -/* - * Write system time back to RTC - */ -void -resettodr() -{ - unsigned long tm; - int y, m, s; - - if (!(xen_start_info->flags & SIF_INITDOMAIN)) { - domu_resettodr(); - return; - } - - if (xen_disable_rtc_set) - return; - - s = splclock(); - tm = time_second; - splx(s); - - /* Disable RTC updates and interrupts. */ - writertc(RTC_STATUSB, RTCSB_HALT | RTCSB_24HR); - - /* Calculate local time to put in RTC */ - - tm -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0); - - writertc(RTC_SEC, bin2bcd(tm%60)); tm /= 60; /* Write back Seconds */ - writertc(RTC_MIN, bin2bcd(tm%60)); tm /= 60; /* Write back Minutes */ - writertc(RTC_HRS, bin2bcd(tm%24)); tm /= 24; /* Write back Hours */ - - /* We have now the days since 01-01-1970 in tm */ - writertc(RTC_WDAY, (tm + 4) % 7 + 1); /* Write back Weekday */ - for (y = 1970, m = DAYSPERYEAR + LEAPYEAR(y); - tm >= m; - y++, m = DAYSPERYEAR + LEAPYEAR(y)) - tm -= m; - - /* Now we have the years in y and the day-of-the-year in tm */ - writertc(RTC_YEAR, bin2bcd(y%100)); /* Write back Year */ -#ifdef USE_RTC_CENTURY - writertc(RTC_CENTURY, bin2bcd(y/100)); /* ... and Century */ -#endif - for (m = 0; ; m++) { - int ml; - - ml = daysinmonth[m]; - if (m == 1 && LEAPYEAR(y)) - ml++; - if (tm < ml) - break; - tm -= ml; - } - - writertc(RTC_MONTH, bin2bcd(m + 1)); /* Write back Month */ - writertc(RTC_DAY, bin2bcd(tm + 1)); /* Write back Month Day */ - - /* Reenable RTC updates and interrupts. */ - writertc(RTC_STATUSB, RTCSB_24HR); - rtcin(RTC_INTR); -} -#endif - -/* - * Start clocks running. - */ -void -cpu_initclocks(void) -{ - cpu_initclocks_bsp(); -} - -/* Return system time offset by ticks */ -uint64_t -get_system_time(int ticks) -{ - return (processed_system_time + (ticks * NS_PER_TICK)); -} - -int -timer_spkr_acquire(void) -{ - - return (0); -} - -int -timer_spkr_release(void) -{ - - return (0); -} - -void -timer_spkr_setfreq(int freq) -{ - -} - Index: sys/i386/xen/exception.s =================================================================== --- sys/i386/xen/exception.s +++ /dev/null @@ -1,494 +0,0 @@ -/*- - * Copyright (c) 1989, 1990 William F. Jolitz. - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#include "opt_apic.h" -#include "opt_npx.h" - -#include -#include -#include - -#include "assym.s" - -#define SEL_RPL_MASK 0x0002 -#define __HYPERVISOR_iret 23 - -/* Offsets into shared_info_t. */ - -#define evtchn_upcall_pending /* 0 */ -#define evtchn_upcall_mask 1 - -#define sizeof_vcpu_shift 6 - - -#ifdef SMP -#define GET_VCPU_INFO(reg) movl PCPU(CPUID),reg ; \ - shl $sizeof_vcpu_shift,reg ; \ - addl HYPERVISOR_shared_info,reg -#else -#define GET_VCPU_INFO(reg) movl HYPERVISOR_shared_info,reg -#endif - -#define __DISABLE_INTERRUPTS(reg) movb $1,evtchn_upcall_mask(reg) -#define __ENABLE_INTERRUPTS(reg) movb $0,evtchn_upcall_mask(reg) -#define DISABLE_INTERRUPTS(reg) GET_VCPU_INFO(reg) ; \ - __DISABLE_INTERRUPTS(reg) -#define ENABLE_INTERRUPTS(reg) GET_VCPU_INFO(reg) ; \ - __ENABLE_INTERRUPTS(reg) -#define __TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg) - -#define POPA \ - popl %edi; \ - popl %esi; \ - popl %ebp; \ - popl %ebx; \ - popl %ebx; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - - .text - -/*****************************************************************************/ -/* Trap handling */ -/*****************************************************************************/ -/* - * Trap and fault vector routines. - * - * Most traps are 'trap gates', SDT_SYS386TGT. A trap gate pushes state on - * the stack that mostly looks like an interrupt, but does not disable - * interrupts. A few of the traps we are use are interrupt gates, - * SDT_SYS386IGT, which are nearly the same thing except interrupts are - * disabled on entry. - * - * The cpu will push a certain amount of state onto the kernel stack for - * the current process. The amount of state depends on the type of trap - * and whether the trap crossed rings or not. See i386/include/frame.h. - * At the very least the current EFLAGS (status register, which includes - * the interrupt disable state prior to the trap), the code segment register, - * and the return instruction pointer are pushed by the cpu. The cpu - * will also push an 'error' code for certain traps. We push a dummy - * error code for those traps where the cpu doesn't in order to maintain - * a consistent frame. We also push a contrived 'trap number'. - * - * The cpu does not push the general registers, we must do that, and we - * must restore them prior to calling 'iret'. The cpu adjusts the %cs and - * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we - * must load them with appropriate values for supervisor mode operation. - */ - -MCOUNT_LABEL(user) -MCOUNT_LABEL(btrap) - -#define TRAP(a) pushl $(a) ; jmp alltraps - -IDTVEC(div) - pushl $0; TRAP(T_DIVIDE) -IDTVEC(dbg) - pushl $0; TRAP(T_TRCTRAP) -IDTVEC(nmi) - pushl $0; TRAP(T_NMI) -IDTVEC(bpt) - pushl $0; TRAP(T_BPTFLT) -IDTVEC(ofl) - pushl $0; TRAP(T_OFLOW) -IDTVEC(bnd) - pushl $0; TRAP(T_BOUND) -IDTVEC(ill) - pushl $0; TRAP(T_PRIVINFLT) -IDTVEC(dna) - pushl $0; TRAP(T_DNA) -IDTVEC(fpusegm) - pushl $0; TRAP(T_FPOPFLT) -IDTVEC(tss) - TRAP(T_TSSFLT) -IDTVEC(missing) - TRAP(T_SEGNPFLT) -IDTVEC(stk) - TRAP(T_STKFLT) -IDTVEC(prot) - TRAP(T_PROTFLT) -IDTVEC(page) - TRAP(T_PAGEFLT) -IDTVEC(mchk) - pushl $0; TRAP(T_MCHK) -IDTVEC(rsvd) - pushl $0; TRAP(T_RESERVED) -IDTVEC(fpu) - pushl $0; TRAP(T_ARITHTRAP) -IDTVEC(align) - TRAP(T_ALIGNFLT) -IDTVEC(xmm) - pushl $0; TRAP(T_XMMFLT) - -IDTVEC(hypervisor_callback) - pushl $0; - pushl $0; - pushal - pushl %ds - pushl %es - pushl %fs -upcall_with_regs_pushed: - SET_KERNEL_SREGS - FAKE_MCOUNT(TF_EIP(%esp)) -call_evtchn_upcall: - movl TF_EIP(%esp),%eax - cmpl $scrit,%eax - jb 10f - cmpl $ecrit,%eax - jb critical_region_fixup - -10: pushl %esp - call xen_intr_handle_upcall - addl $4,%esp - - /* - * Return via doreti to handle ASTs. - */ - MEXITCOUNT - jmp doreti - - -hypervisor_callback_pending: - DISABLE_INTERRUPTS(%esi) /* cli */ - jmp 10b - /* - * alltraps entry point. Interrupts are enabled if this was a trap - * gate (TGT), else disabled if this was an interrupt gate (IGT). - * Note that int0x80_syscall is a trap gate. Only page faults - * use an interrupt gate. - */ - SUPERALIGN_TEXT - .globl alltraps - .type alltraps,@function -alltraps: - pushal - pushl %ds - pushl %es - pushl %fs - -alltraps_with_regs_pushed: - SET_KERNEL_SREGS - FAKE_MCOUNT(TF_EIP(%esp)) - -calltrap: - push %esp - call trap - add $4, %esp - - /* - * Return via doreti to handle ASTs. - */ - MEXITCOUNT - jmp doreti - -/* - * SYSCALL CALL GATE (old entry point for a.out binaries) - * - * The intersegment call has been set up to specify one dummy parameter. - * - * This leaves a place to put eflags so that the call frame can be - * converted to a trap frame. Note that the eflags is (semi-)bogusly - * pushed into (what will be) tf_err and then copied later into the - * final spot. It has to be done this way because esp can't be just - * temporarily altered for the pushfl - an interrupt might come in - * and clobber the saved cs/eip. - */ - SUPERALIGN_TEXT -IDTVEC(lcall_syscall) - pushfl /* save eflags */ - popl 8(%esp) /* shuffle into tf_eflags */ - pushl $7 /* sizeof "lcall 7,0" */ - subl $4,%esp /* skip over tf_trapno */ - pushal - pushl %ds - pushl %es - pushl %fs - SET_KERNEL_SREGS - FAKE_MCOUNT(TF_EIP(%esp)) - pushl %esp - call syscall - add $4, %esp - MEXITCOUNT - jmp doreti - -/* - * Call gate entry for FreeBSD ELF and Linux/NetBSD syscall (int 0x80) - * - * Even though the name says 'int0x80', this is actually a TGT (trap gate) - * rather then an IGT (interrupt gate). Thus interrupts are enabled on - * entry just as they are for a normal syscall. - */ - SUPERALIGN_TEXT -IDTVEC(int0x80_syscall) - pushl $2 /* sizeof "int 0x80" */ - pushl $0xBEEF /* for debug */ - pushal - pushl %ds - pushl %es - pushl %fs - SET_KERNEL_SREGS - FAKE_MCOUNT(TF_EIP(%esp)) - pushl %esp - call syscall - add $4, %esp - MEXITCOUNT - jmp doreti - -ENTRY(fork_trampoline) - pushl %esp /* trapframe pointer */ - pushl %ebx /* arg1 */ - pushl %esi /* function */ - call fork_exit - addl $12,%esp - /* cut from syscall */ - - /* - * Return via doreti to handle ASTs. - */ - MEXITCOUNT - jmp doreti - - -/* - * To efficiently implement classification of trap and interrupt handlers - * for profiling, there must be only trap handlers between the labels btrap - * and bintr, and only interrupt handlers between the labels bintr and - * eintr. This is implemented (partly) by including files that contain - * some of the handlers. Before including the files, set up a normal asm - * environment so that the included files doen't need to know that they are - * included. - */ - - .data - .p2align 4 - .text - SUPERALIGN_TEXT -MCOUNT_LABEL(bintr) - -#ifdef DEV_APIC - .data - .p2align 4 - .text - SUPERALIGN_TEXT - -#include -#endif - - .data - .p2align 4 - .text - SUPERALIGN_TEXT -#include - - .text -MCOUNT_LABEL(eintr) - -/* - * void doreti(struct trapframe) - * - * Handle return from interrupts, traps and syscalls. - */ - .text - SUPERALIGN_TEXT - .type doreti,@function -doreti: - FAKE_MCOUNT($bintr) /* init "from" bintr -> doreti */ -doreti_next: -#ifdef notyet - /* - * Check if ASTs can be handled now. PSL_VM must be checked first - * since segment registers only have an RPL in non-VM86 mode. - */ - testl $PSL_VM,TF_EFLAGS(%esp) /* are we in vm86 mode? */ - jz doreti_notvm86 - movl PCPU(CURPCB),%ecx - testl $PCB_VM86CALL,PCB_FLAGS(%ecx) /* are we in a vm86 call? */ - jz doreti_ast /* can handle ASTS now if not */ - jmp doreti_exit - -doreti_notvm86: -#endif - testb $SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */ - jz doreti_exit /* can't handle ASTs now if not */ - -doreti_ast: - /* - * Check for ASTs atomically with returning. Disabling CPU - * interrupts provides sufficient locking even in the SMP case, - * since we will be informed of any new ASTs by an IPI. - */ - DISABLE_INTERRUPTS(%esi) /* cli */ - movl PCPU(CURTHREAD),%eax - testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%eax) - je doreti_exit - ENABLE_INTERRUPTS(%esi) /* sti */ - pushl %esp /* pass a pointer to the trapframe */ - call ast - add $4,%esp - jmp doreti_ast - - /* - * doreti_exit: pop registers, iret. - * - * The segment register pop is a special case, since it may - * fault if (for example) a sigreturn specifies bad segment - * registers. The fault is handled in trap.c. - */ -doreti_exit: - ENABLE_INTERRUPTS(%esi) # reenable event callbacks (sti) - - .globl scrit -scrit: - __TEST_PENDING(%esi) - jnz hypervisor_callback_pending /* More to go */ - - MEXITCOUNT - - .globl doreti_popl_fs -doreti_popl_fs: - popl %fs - .globl doreti_popl_es -doreti_popl_es: - popl %es - .globl doreti_popl_ds -doreti_popl_ds: - popl %ds - - /* - * This is important: as nothing is atomic over here (we can get - * interrupted any time), we use the critical_region_fixup() in - * order to figure out where out stack is. Therefore, do NOT use - * 'popal' here without fixing up the table! - */ - POPA - addl $8,%esp - .globl doreti_iret -doreti_iret: - jmp hypercall_page + (__HYPERVISOR_iret * 32) - .globl ecrit -ecrit: - /* - * doreti_iret_fault and friends. Alternative return code for - * the case where we get a fault in the doreti_exit code - * above. trap() (i386/i386/trap.c) catches this specific - * case, sends the process a signal and continues in the - * corresponding place in the code below. - */ - ALIGN_TEXT - .globl doreti_iret_fault -doreti_iret_fault: - subl $8,%esp - pushal - pushl %ds - .globl doreti_popl_ds_fault -doreti_popl_ds_fault: - pushl %es - .globl doreti_popl_es_fault -doreti_popl_es_fault: - pushl %fs - .globl doreti_popl_fs_fault -doreti_popl_fs_fault: - movl $0,TF_ERR(%esp) /* XXX should be the error code */ - movl $T_PROTFLT,TF_TRAPNO(%esp) - jmp alltraps_with_regs_pushed - - /* -# [How we do the fixup]. We want to merge the current stack frame with the -# just-interrupted frame. How we do this depends on where in the critical -# region the interrupted handler was executing, and so how many saved -# registers are in each frame. We do this quickly using the lookup table -# 'critical_fixup_table'. For each byte offset in the critical region, it -# provides the number of bytes which have already been popped from the -# interrupted stack frame. -*/ - -.globl critical_region_fixup -critical_region_fixup: - addl $critical_fixup_table-scrit,%eax - movzbl (%eax),%eax # %eax contains num bytes popped - movl %esp,%esi - add %eax,%esi # %esi points at end of src region - movl %esp,%edi - add $0x40,%edi # %edi points at end of dst region - movl %eax,%ecx - shr $2,%ecx # convert bytes to words - je 16f # skip loop if nothing to copy -15: subl $4,%esi # pre-decrementing copy loop - subl $4,%edi - movl (%esi),%eax - movl %eax,(%edi) - loop 15b -16: movl %edi,%esp # final %edi is top of merged stack - jmp hypervisor_callback_pending - - -critical_fixup_table: -.byte 0x0,0x0,0x0 #testb $0x1,(%esi) -.byte 0x0,0x0,0x0,0x0,0x0,0x0 #jne ea -.byte 0x0,0x0 #pop %fs -.byte 0x04 #pop %es -.byte 0x08 #pop %ds -.byte 0x0c #pop %edi -.byte 0x10 #pop %esi -.byte 0x14 #pop %ebp -.byte 0x18 #pop %ebx -.byte 0x1c #pop %ebx -.byte 0x20 #pop %edx -.byte 0x24 #pop %ecx -.byte 0x28 #pop %eax -.byte 0x2c,0x2c,0x2c #add $0x8,%esp -#if 0 - .byte 0x34 #iret -#endif -.byte 0x34,0x34,0x34,0x34,0x34 #HYPERVISOR_iret - - -/* # Hypervisor uses this for application faults while it executes.*/ -ENTRY(failsafe_callback) - pushal - call xen_failsafe_handler -/*# call install_safe_pf_handler */ - movl 28(%esp),%ebx -1: movl %ebx,%ds - movl 32(%esp),%ebx -2: movl %ebx,%es - movl 36(%esp),%ebx -3: movl %ebx,%fs - movl 40(%esp),%ebx -4: movl %ebx,%gs -/*# call install_normal_pf_handler */ - popal - addl $12,%esp - iret - - Index: sys/i386/xen/locore.s =================================================================== --- sys/i386/xen/locore.s +++ /dev/null @@ -1,360 +0,0 @@ -/*- - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * William Jolitz. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * from: @(#)locore.s 7.3 (Berkeley) 5/13/91 - * $FreeBSD$ - * - * originally from: locore.s, by William F. Jolitz - * - * Substantially rewritten by David Greenman, Rod Grimes, - * Bruce Evans, Wolfgang Solfrank, Poul-Henning Kamp - * and many others. - */ - -#include "opt_bootp.h" -#include "opt_compat.h" -#include "opt_nfsroot.h" -#include "opt_pmap.h" - -#include -#include - -#include -#include -#include -#include -#include - -#define __ASSEMBLY__ -#include - -/* The defines below have been lifted out of */ -#define FLAT_RING1_CS 0xe019 /* GDT index 259 */ -#define FLAT_RING1_DS 0xe021 /* GDT index 260 */ -#define KERNEL_CS FLAT_RING1_CS -#define KERNEL_DS FLAT_RING1_DS - -#include "assym.s" - -.section __xen_guest - .ascii "LOADER=generic,GUEST_OS=freebsd,GUEST_VER=7.0,XEN_VER=xen-3.0,BSD_SYMTAB,VIRT_BASE=0xc0000000" - .byte 0 - - ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "FreeBSD") - ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "HEAD") - ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") - ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, KERNBASE) - ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, KERNBASE) - ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, btext) - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, XEN_HYPERVISOR_VIRT_START) -#if 0 - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") -#endif - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|supervisor_mode_kernel|writable_descriptor_tables") - -#ifdef PAE - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") - ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, PG_V, PG_V) -#else - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") - ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, PG_V, PG_V) -#endif - ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") - ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) - - - -/* - * XXX - * - * Note: This version greatly munged to avoid various assembler errors - * that may be fixed in newer versions of gas. Perhaps newer versions - * will have more pleasant appearance. - */ - -/* - * PTmap is recursive pagemap at top of virtual address space. - * Within PTmap, the page directory can be found (third indirection). - */ - .globl PTmap,PTD,PTDpde - .set PTmap,(PTDPTDI << PDRSHIFT) - .set PTD,PTmap + (PTDPTDI * PAGE_SIZE) - .set PTDpde,PTD + (PTDPTDI * PDESIZE) - -/* - * Compiled KERNBASE location and the kernel load address - */ - .globl kernbase - .set kernbase,KERNBASE - .globl kernload - .set kernload,KERNLOAD - -/* - * Globals - */ - .data - ALIGN_DATA /* just to be sure */ - - .space 0x2000 /* space for tmpstk - temporary stack */ -tmpstk: - - .globl bootinfo -bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ - - .globl KERNend -KERNend: .long 0 /* phys addr end of kernel (just after bss) */ - .globl physfree -physfree: .long 0 /* phys addr of next free page */ - - .globl IdlePTD -IdlePTD: .long 0 /* phys addr of kernel PTD */ - -#ifdef PAE - .globl IdlePDPT -IdlePDPT: .long 0 /* phys addr of kernel PDPT */ -#endif - -#ifdef SMP - .globl KPTphys -#endif -KPTphys: .long 0 /* phys addr of kernel page tables */ - .globl gdtset -gdtset: .long 0 /* GDT is valid */ - - .globl proc0kstack -proc0kstack: .long 0 /* address of proc 0 kstack space */ -p0kpa: .long 0 /* phys addr of proc0's STACK */ - -vm86phystk: .long 0 /* PA of vm86/bios stack */ - - .globl vm86paddr, vm86pa -vm86paddr: .long 0 /* address of vm86 region */ -vm86pa: .long 0 /* phys addr of vm86 region */ - -#ifdef PC98 - .globl pc98_system_parameter -pc98_system_parameter: - .space 0x240 -#endif - - .globl avail_space -avail_space: .long 0 - -/********************************************************************** - * - * Some handy macros - * - */ - -/* - * We're already in protected mode, so no remapping is needed. - */ -#define R(foo) (foo) - -#define ALLOCPAGES(foo) \ - movl R(physfree), %esi ; \ - movl $((foo)*PAGE_SIZE), %eax ; \ - addl %esi, %eax ; \ - movl %eax, R(physfree) ; \ - movl %esi, %edi ; \ - movl $((foo)*PAGE_SIZE),%ecx ; \ - xorl %eax,%eax ; \ - cld ; \ - rep ; \ - stosb - -/* - * fillkpt - * eax = page frame address - * ebx = index into page table - * ecx = how many pages to map - * base = base address of page dir/table - * prot = protection bits - */ -#define fillkpt(base, prot) \ - shll $PTESHIFT,%ebx ; \ - addl base,%ebx ; \ - orl $PG_V,%eax ; \ - orl prot,%eax ; \ -1: movl %eax,(%ebx) ; \ - addl $PAGE_SIZE,%eax ; /* increment physical address */ \ - addl $PTESIZE,%ebx ; /* next pte */ \ - loop 1b - -/* - * fillkptphys(prot) - * eax = physical address - * ecx = how many pages to map - * prot = protection bits - */ -#define fillkptphys(prot) \ - movl %eax, %ebx ; \ - shrl $PAGE_SHIFT, %ebx ; \ - fillkpt(R(KPTphys), prot) - -/* Temporary stack */ -.space 8192 -tmpstack: - .long tmpstack, KERNEL_DS - - .text - -.p2align 12, 0x90 - -#define HYPERCALL_PAGE_OFFSET 0x1000 -.org HYPERCALL_PAGE_OFFSET -ENTRY(hypercall_page) - .cfi_startproc - .skip 0x1000 - .cfi_endproc - -/********************************************************************** - * - * This is where the bootblocks start us, set the ball rolling... - * - */ -NON_GPROF_ENTRY(btext) - /* At the end of our stack, we shall have free space - so store it */ - movl %esp,%ebx - movl %ebx,R(avail_space) - - lss tmpstack,%esp - - pushl %esi - call initvalues - popl %esi - - /* Store the CPUID information */ - xorl %eax,%eax - cpuid # cpuid 0 - movl %eax,R(cpu_high) # highest capability - movl %ebx,R(cpu_vendor) # store vendor string - movl %edx,R(cpu_vendor+4) - movl %ecx,R(cpu_vendor+8) - movb $0,R(cpu_vendor+12) - - movl $1,%eax - cpuid # cpuid 1 - movl %eax,R(cpu_id) # store cpu_id - movl %ebx,R(cpu_procinfo) # store cpu_procinfo - movl %edx,R(cpu_feature) # store cpu_feature - movl %ecx,R(cpu_feature2) # store cpu_feature2 - rorl $8,%eax # extract family type - andl $15,%eax - cmpl $5,%eax - movl $CPU_686,R(cpu) - - movl proc0kstack,%eax - leal (KSTACK_PAGES*PAGE_SIZE-PCB_SIZE)(%eax),%esp - xorl %ebp,%ebp /* mark end of frames */ -#ifdef PAE - movl IdlePDPT,%esi -#else - movl IdlePTD,%esi -#endif - movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax) - pushl physfree - call init386 - addl $4, %esp - call mi_startup - /* NOTREACHED */ - int $3 - -/* - * Signal trampoline, copied to top of user stack - */ -NON_GPROF_ENTRY(sigcode) - calll *SIGF_HANDLER(%esp) - leal SIGF_UC(%esp),%eax /* get ucontext */ - pushl %eax - testl $PSL_VM,UC_EFLAGS(%eax) - jne 1f - mov UC_GS(%eax), %gs /* restore %gs */ -1: - movl $SYS_sigreturn,%eax - pushl %eax /* junk to fake return addr. */ - int $0x80 /* enter kernel with args */ - /* on stack */ -1: - jmp 1b - -#ifdef COMPAT_FREEBSD4 - ALIGN_TEXT -freebsd4_sigcode: - calll *SIGF_HANDLER(%esp) - leal SIGF_UC4(%esp),%eax /* get ucontext */ - pushl %eax - testl $PSL_VM,UC4_EFLAGS(%eax) - jne 1f - mov UC4_GS(%eax),%gs /* restore %gs */ -1: - movl $344,%eax /* 4.x SYS_sigreturn */ - pushl %eax /* junk to fake return addr. */ - int $0x80 /* enter kernel with args */ - /* on stack */ -1: - jmp 1b -#endif - -#ifdef COMPAT_43 - ALIGN_TEXT -osigcode: - call *SIGF_HANDLER(%esp) /* call signal handler */ - lea SIGF_SC(%esp),%eax /* get sigcontext */ - pushl %eax - testl $PSL_VM,SC_PS(%eax) - jne 9f - movl SC_GS(%eax),%gs /* restore %gs */ -9: - movl $103,%eax /* 3.x SYS_sigreturn */ - pushl %eax /* junk to fake return addr. */ - int $0x80 /* enter kernel with args */ -0: jmp 0b -#endif /* COMPAT_43 */ - - ALIGN_TEXT -esigcode: - - .data - .globl szsigcode -szsigcode: - .long esigcode-sigcode -#ifdef COMPAT_FREEBSD4 - .globl szfreebsd4_sigcode -szfreebsd4_sigcode: - .long esigcode-freebsd4_sigcode -#endif -#ifdef COMPAT_43 - .globl szosigcode -szosigcode: - .long esigcode-osigcode -#endif Index: sys/i386/xen/mp_machdep.c =================================================================== --- sys/i386/xen/mp_machdep.c +++ /dev/null @@ -1,1292 +0,0 @@ -/*- - * Copyright (c) 1996, by Steve Passe - * Copyright (c) 2008, by Kip Macy - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. The name of the developer may NOT be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#include "opt_apic.h" -#include "opt_cpu.h" -#include "opt_kstack_pages.h" -#include "opt_mp_watchdog.h" -#include "opt_pmap.h" -#include "opt_sched.h" -#include "opt_smp.h" - -#if !defined(lint) -#if !defined(SMP) -#error How did you get here? -#endif - -#ifndef DEV_APIC -#error The apic device is required for SMP, add "device apic" to your config file. -#endif -#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) -#error SMP not supported with CPU_DISABLE_CMPXCHG -#endif -#endif /* not lint */ - -#include -#include -#include -#include /* cngetc() */ -#include -#ifdef GPROF -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/*---------------------------- Extern Declarations ---------------------------*/ -extern struct pcpu __pcpu[]; - -extern void Xhypervisor_callback(void); -extern void failsafe_callback(void); - -/*--------------------------- Forward Declarations ---------------------------*/ -static driver_filter_t smp_reschedule_interrupt; -static driver_filter_t smp_call_function_interrupt; -static int start_all_aps(void); -static int start_ap(int apic_id); -static void release_aps(void *dummy); - -/*---------------------------------- Macros ----------------------------------*/ -#define IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS) - -/*-------------------------------- Local Types -------------------------------*/ -typedef void call_data_func_t(uintptr_t , uintptr_t); - -struct xen_ipi_handler -{ - driver_filter_t *filter; - const char *description; -}; - -enum { - RESCHEDULE_VECTOR, - CALL_FUNCTION_VECTOR, -}; - -/*-------------------------------- Global Data -------------------------------*/ -static u_int hyperthreading_cpus; -static cpuset_t hyperthreading_cpus_mask; - -int mp_naps; /* # of Applications processors */ -int boot_cpu_id = -1; /* designated BSP */ - -int bootAP; -static union descriptor *bootAPgdt; - -/* Free these after use */ -void *bootstacks[MAXCPU]; - -struct pcb stoppcbs[MAXCPU]; - -/* Variables needed for SMP tlb shootdown. */ -vm_offset_t smp_tlb_addr1; -vm_offset_t smp_tlb_addr2; -volatile int smp_tlb_wait; - -static u_int logical_cpus; -static volatile cpuset_t ipi_nmi_pending; - -/* used to hold the AP's until we are ready to release them */ -struct mtx ap_boot_mtx; - -/* Set to 1 once we're ready to let the APs out of the pen. */ -volatile int aps_ready = 0; - -/* - * Store data from cpu_add() until later in the boot when we actually setup - * the APs. - */ -struct cpu_info cpu_info[MAX_APIC_ID + 1]; -int cpu_apic_ids[MAXCPU]; -int apic_cpuids[MAX_APIC_ID + 1]; - -/* Holds pending bitmap based IPIs per CPU */ -volatile u_int cpu_ipi_pending[MAXCPU]; - -int cpu_logical; -int cpu_cores; - -static const struct xen_ipi_handler xen_ipis[] = -{ - [RESCHEDULE_VECTOR] = { smp_reschedule_interrupt, "resched" }, - [CALL_FUNCTION_VECTOR] = { smp_call_function_interrupt,"callfunc" } -}; - -/*------------------------------- Per-CPU Data -------------------------------*/ -DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]); -DPCPU_DEFINE(struct vcpu_info *, vcpu_info); - -/*------------------------------ Implementation ------------------------------*/ -struct cpu_group * -cpu_topo(void) -{ - if (cpu_cores == 0) - cpu_cores = 1; - if (cpu_logical == 0) - cpu_logical = 1; - if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { - printf("WARNING: Non-uniform processors.\n"); - printf("WARNING: Using suboptimal topology.\n"); - return (smp_topo_none()); - } - /* - * No multi-core or hyper-threaded. - */ - if (cpu_logical * cpu_cores == 1) - return (smp_topo_none()); - /* - * Only HTT no multi-core. - */ - if (cpu_logical > 1 && cpu_cores == 1) - return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); - /* - * Only multi-core no HTT. - */ - if (cpu_cores > 1 && cpu_logical == 1) - return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0)); - /* - * Both HTT and multi-core. - */ - return (smp_topo_2level(CG_SHARE_NONE, cpu_cores, - CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); -} - -/* - * Calculate usable address in base memory for AP trampoline code. - */ -u_int -mp_bootaddress(u_int basemem) -{ - - return (basemem); -} - -void -cpu_add(u_int apic_id, char boot_cpu) -{ - - if (apic_id > MAX_APIC_ID) { - panic("SMP: APIC ID %d too high", apic_id); - return; - } - KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", - apic_id)); - cpu_info[apic_id].cpu_present = 1; - if (boot_cpu) { - KASSERT(boot_cpu_id == -1, - ("CPU %d claims to be BSP, but CPU %d already is", apic_id, - boot_cpu_id)); - boot_cpu_id = apic_id; - cpu_info[apic_id].cpu_bsp = 1; - } - if (mp_ncpus < MAXCPU) - mp_ncpus++; - if (bootverbose) - printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : - "AP"); -} - -void -cpu_mp_setmaxid(void) -{ - - mp_maxid = MAXCPU - 1; -} - -int -cpu_mp_probe(void) -{ - - /* - * Always record BSP in CPU map so that the mbuf init code works - * correctly. - */ - CPU_SETOF(0, &all_cpus); - if (mp_ncpus == 0) { - /* - * No CPUs were found, so this must be a UP system. Setup - * the variables to represent a system with a single CPU - * with an id of 0. - */ - mp_ncpus = 1; - return (0); - } - - /* At least one CPU was found. */ - if (mp_ncpus == 1) { - /* - * One CPU was found, so this must be a UP system with - * an I/O APIC. - */ - return (0); - } - - /* At least two CPUs were found. */ - return (1); -} - -/* - * Initialize the IPI handlers and start up the AP's. - */ -void -cpu_mp_start(void) -{ - int i; - - /* Initialize the logical ID to APIC ID table. */ - for (i = 0; i < MAXCPU; i++) { - cpu_apic_ids[i] = -1; - cpu_ipi_pending[i] = 0; - } - - /* Set boot_cpu_id if needed. */ - if (boot_cpu_id == -1) { - boot_cpu_id = PCPU_GET(apic_id); - cpu_info[boot_cpu_id].cpu_bsp = 1; - } else - KASSERT(boot_cpu_id == PCPU_GET(apic_id), - ("BSP's APIC ID doesn't match boot_cpu_id")); - cpu_apic_ids[0] = boot_cpu_id; - apic_cpuids[boot_cpu_id] = 0; - - assign_cpu_ids(); - - /* Start each Application Processor */ - start_all_aps(); - - /* Setup the initial logical CPUs info. */ - logical_cpus = 0; - CPU_ZERO(&logical_cpus_mask); - if (cpu_feature & CPUID_HTT) - logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; - - set_interrupt_apic_ids(); -} - - -static void -iv_rendezvous(uintptr_t a, uintptr_t b) -{ - smp_rendezvous_action(); -} - -static void -iv_invltlb(uintptr_t a, uintptr_t b) -{ - xen_tlb_flush(); -} - -static void -iv_invlpg(uintptr_t a, uintptr_t b) -{ - xen_invlpg(a); -} - -static void -iv_invlrng(uintptr_t a, uintptr_t b) -{ - vm_offset_t start = (vm_offset_t)a; - vm_offset_t end = (vm_offset_t)b; - - while (start < end) { - xen_invlpg(start); - start += PAGE_SIZE; - } -} - - -static void -iv_invlcache(uintptr_t a, uintptr_t b) -{ - - wbinvd(); - atomic_add_int(&smp_tlb_wait, 1); -} - -/* - * These start from "IPI offset" APIC_IPI_INTS - */ -static call_data_func_t *ipi_vectors[5] = -{ - iv_rendezvous, - iv_invltlb, - iv_invlpg, - iv_invlrng, - iv_invlcache, -}; - -/* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. - */ -static int -smp_reschedule_interrupt(void *unused) -{ - int cpu = PCPU_GET(cpuid); - u_int ipi_bitmap; - - ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); - - if (ipi_bitmap & (1 << IPI_PREEMPT)) { -#ifdef COUNT_IPIS - (*ipi_preempt_counts[cpu])++; -#endif - sched_preempt(curthread); - } - - if (ipi_bitmap & (1 << IPI_AST)) { -#ifdef COUNT_IPIS - (*ipi_ast_counts[cpu])++; -#endif - /* Nothing to do for AST */ - } - return (FILTER_HANDLED); -} - -struct _call_data { - uint16_t func_id; - uint16_t wait; - uintptr_t arg1; - uintptr_t arg2; - atomic_t started; - atomic_t finished; -}; - -static struct _call_data *call_data; - -static int -smp_call_function_interrupt(void *unused) -{ - call_data_func_t *func; - uintptr_t arg1 = call_data->arg1; - uintptr_t arg2 = call_data->arg2; - int wait = call_data->wait; - atomic_t *started = &call_data->started; - atomic_t *finished = &call_data->finished; - - /* We only handle function IPIs, not bitmap IPIs */ - if (call_data->func_id < APIC_IPI_INTS || - call_data->func_id > IPI_BITMAP_VECTOR) - panic("invalid function id %u", call_data->func_id); - - func = ipi_vectors[IPI_TO_IDX(call_data->func_id)]; - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ - (*func)(arg1, arg2); - - if (wait) { - mb(); - atomic_inc(finished); - } - atomic_add_int(&smp_tlb_wait, 1); - return (FILTER_HANDLED); -} - -/* - * Print various information about the SMP system hardware and setup. - */ -void -cpu_mp_announce(void) -{ - int i, x; - - /* List CPUs */ - printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); - for (i = 1, x = 0; x <= MAX_APIC_ID; x++) { - if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp) - continue; - if (cpu_info[x].cpu_disabled) - printf(" cpu (AP): APIC ID: %2d (disabled)\n", x); - else { - KASSERT(i < mp_ncpus, - ("mp_ncpus and actual cpus are out of whack")); - printf(" cpu%d (AP): APIC ID: %2d\n", i++, x); - } - } -} - -static int -xen_smp_cpu_init(unsigned int cpu) -{ - xen_intr_handle_t *ipi_handle; - const struct xen_ipi_handler *ipi; - int idx, rc; - - ipi_handle = DPCPU_ID_GET(cpu, ipi_handle); - for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) { - - /* - * The PCPU variable pc_device is not initialized on i386 PV, - * so we have to use the root_bus device in order to setup - * the IPIs. - */ - rc = xen_intr_alloc_and_bind_ipi(root_bus, cpu, - ipi->filter, INTR_TYPE_TTY, &ipi_handle[idx]); - if (rc != 0) { - printf("Unable to allocate a XEN IPI port. " - "Error %d\n", rc); - break; - } - xen_intr_describe(ipi_handle[idx], "%s", ipi->description); - } - - for (;idx < nitems(xen_ipis); idx++) - ipi_handle[idx] = NULL; - - if (rc == 0) - return (0); - - /* Either all are successfully mapped, or none at all. */ - for (idx = 0; idx < nitems(xen_ipis); idx++) { - if (ipi_handle[idx] == NULL) - continue; - - xen_intr_unbind(ipi_handle[idx]); - ipi_handle[idx] = NULL; - } - - return (rc); -} - -static void -xen_smp_intr_init_cpus(void *unused) -{ - int i; - - for (i = 0; i < mp_ncpus; i++) - xen_smp_cpu_init(i); -} - -static void -xen_smp_intr_setup_cpus(void *unused) -{ - int i; - - for (i = 0; i < mp_ncpus; i++) - DPCPU_ID_SET(i, vcpu_info, - &HYPERVISOR_shared_info->vcpu_info[i]); -} - -#define MTOPSIZE (1<<(14 + PAGE_SHIFT)) - -/* - * AP CPU's call this to initialize themselves. - */ -void -init_secondary(void) -{ - vm_offset_t addr; - u_int cpuid; - int gsel_tss; - - - /* bootAP is set in start_ap() to our ID. */ - PCPU_SET(currentldt, _default_ldt); - gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); -#if 0 - gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; -#endif - PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ - PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); - PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); -#if 0 - PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd); - - PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); -#endif - PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); - - /* - * Set to a known state: - * Set by mpboot.s: CR0_PG, CR0_PE - * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM - */ - /* - * signal our startup to the BSP. - */ - mp_naps++; - - /* Spin until the BSP releases the AP's. */ - while (!aps_ready) - ia32_pause(); - - /* BSP may have changed PTD while we were waiting */ - invltlb(); - for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) - invlpg(addr); - -#if 0 - /* set up SSE/NX */ - initializecpu(); -#endif - - /* set up FPU state on the AP */ - npxinit(false); -#if 0 - /* A quick check from sanity claus */ - if (PCPU_GET(apic_id) != lapic_id()) { - printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); - printf("SMP: actual apic_id = %d\n", lapic_id()); - printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); - panic("cpuid mismatch! boom!!"); - } -#endif - - /* Initialize curthread. */ - KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); - PCPU_SET(curthread, PCPU_GET(idlethread)); - - mtx_lock_spin(&ap_boot_mtx); -#if 0 - - /* Init local apic for irq's */ - lapic_setup(1); -#endif - smp_cpus++; - - cpuid = PCPU_GET(cpuid); - CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); - printf("SMP: AP CPU #%d Launched!\n", cpuid); - - /* Determine if we are a logical CPU. */ - if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0) - CPU_SET(cpuid, &logical_cpus_mask); - - /* Determine if we are a hyperthread. */ - if (hyperthreading_cpus > 1 && - PCPU_GET(apic_id) % hyperthreading_cpus != 0) - CPU_SET(cpuid, &hyperthreading_cpus_mask); -#if 0 - if (bootverbose) - lapic_dump("AP"); -#endif - if (smp_cpus == mp_ncpus) { - /* enable IPI's, tlb shootdown, freezes etc */ - atomic_store_rel_int(&smp_started, 1); - } - - mtx_unlock_spin(&ap_boot_mtx); - - /* wait until all the AP's are up */ - while (smp_started == 0) - ia32_pause(); - - PCPU_SET(curthread, PCPU_GET(idlethread)); - - /* Start per-CPU event timers. */ - cpu_initclocks_ap(); - - /* enter the scheduler */ - sched_throw(NULL); - - panic("scheduler returned us to %s", __func__); - /* NOTREACHED */ -} - -/******************************************************************* - * local functions and data - */ - -/* - * We tell the I/O APIC code about all the CPUs we want to receive - * interrupts. If we don't want certain CPUs to receive IRQs we - * can simply not tell the I/O APIC code about them in this function. - * We also do not tell it about the BSP since it tells itself about - * the BSP internally to work with UP kernels and on UP machines. - */ -void -set_interrupt_apic_ids(void) -{ - u_int i, apic_id; - - for (i = 0; i < MAXCPU; i++) { - apic_id = cpu_apic_ids[i]; - if (apic_id == -1) - continue; - if (cpu_info[apic_id].cpu_bsp) - continue; - if (cpu_info[apic_id].cpu_disabled) - continue; - - /* Don't let hyperthreads service interrupts. */ - if (hyperthreading_cpus > 1 && - apic_id % hyperthreading_cpus != 0) - continue; - - intr_add_cpu(i); - } -} - -/* - * Assign logical CPU IDs to local APICs. - */ -void -assign_cpu_ids(void) -{ - u_int i; - - /* Check for explicitly disabled CPUs. */ - for (i = 0; i <= MAX_APIC_ID; i++) { - if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) - continue; - - /* Don't use this CPU if it has been disabled by a tunable. */ - if (resource_disabled("lapic", i)) { - cpu_info[i].cpu_disabled = 1; - continue; - } - } - - /* - * Assign CPU IDs to local APIC IDs and disable any CPUs - * beyond MAXCPU. CPU 0 has already been assigned to the BSP, - * so we only have to assign IDs for APs. - */ - mp_ncpus = 1; - for (i = 0; i <= MAX_APIC_ID; i++) { - if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || - cpu_info[i].cpu_disabled) - continue; - - if (mp_ncpus < MAXCPU) { - cpu_apic_ids[mp_ncpus] = i; - apic_cpuids[i] = mp_ncpus; - mp_ncpus++; - } else - cpu_info[i].cpu_disabled = 1; - } - KASSERT(mp_maxid >= mp_ncpus - 1, - ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, - mp_ncpus)); -} - -/* - * start each AP in our list - */ -/* Lowest 1MB is already mapped: don't touch*/ -#define TMPMAP_START 1 -int -start_all_aps(void) -{ - int x,apic_id, cpu; - struct pcpu *pc; - - mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); - - /* set up temporary P==V mapping for AP boot */ - /* XXX this is a hack, we should boot the AP on its own stack/PTD */ - - /* start each AP */ - for (cpu = 1; cpu < mp_ncpus; cpu++) { - apic_id = cpu_apic_ids[cpu]; - - - bootAP = cpu; - bootAPgdt = gdt + (512*cpu); - - /* Get per-cpu data */ - pc = &__pcpu[bootAP]; - pcpu_init(pc, bootAP, sizeof(struct pcpu)); - dpcpu_init((void *)kmem_malloc(kernel_arena, DPCPU_SIZE, - M_WAITOK | M_ZERO), bootAP); - pc->pc_apic_id = cpu_apic_ids[bootAP]; - pc->pc_vcpu_id = cpu_apic_ids[bootAP]; - pc->pc_prvspace = pc; - pc->pc_curthread = 0; - - gdt_segs[GPRIV_SEL].ssd_base = (int) pc; - gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; - - PT_SET_MA(bootAPgdt, VTOM(bootAPgdt) | PG_V | PG_RW); - bzero(bootAPgdt, PAGE_SIZE); - for (x = 0; x < NGDT; x++) - ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd); - PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V); -#ifdef notyet - - if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) { - apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id); - acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id); -#ifdef CONFIG_ACPI - if (acpiid != 0xff) - x86_acpiid_to_apicid[acpiid] = apicid; -#endif - } -#endif - - /* attempt to start the Application Processor */ - if (!start_ap(cpu)) { - printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); - /* better panic as the AP may be running loose */ - printf("panic y/n? [y] "); - if (cngetc() != 'n') - panic("bye-bye"); - } - - CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ - } - - - pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); - - /* number of APs actually started */ - return (mp_naps); -} - -extern uint8_t *pcpu_boot_stack; -extern trap_info_t trap_table[]; - -static void -smp_trap_init(trap_info_t *trap_ctxt) -{ - const trap_info_t *t = trap_table; - - for (t = trap_table; t->address; t++) { - trap_ctxt[t->vector].flags = t->flags; - trap_ctxt[t->vector].cs = t->cs; - trap_ctxt[t->vector].address = t->address; - } -} - -extern struct rwlock pvh_global_lock; -extern int nkpt; -static void -cpu_initialize_context(unsigned int cpu) -{ - /* vcpu_guest_context_t is too large to allocate on the stack. - * Hence we allocate statically and protect it with a lock */ - vm_page_t m[NPGPTD + 2]; - static vcpu_guest_context_t ctxt; - vm_offset_t boot_stack; - vm_offset_t newPTD; - vm_paddr_t ma[NPGPTD]; - int i; - - /* - * Page 0,[0-3] PTD - * Page 1, [4] boot stack - * Page [5] PDPT - * - */ - for (i = 0; i < NPGPTD + 2; i++) { - m[i] = vm_page_alloc(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | - VM_ALLOC_ZERO); - - pmap_zero_page(m[i]); - - } - boot_stack = kva_alloc(PAGE_SIZE); - newPTD = kva_alloc(NPGPTD * PAGE_SIZE); - ma[0] = VM_PAGE_TO_MACH(m[0])|PG_V; - -#ifdef PAE - pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1])); - for (i = 0; i < NPGPTD; i++) { - ((vm_paddr_t *)boot_stack)[i] = - ma[i] = VM_PAGE_TO_MACH(m[i])|PG_V; - } -#endif - - /* - * Copy cpu0 IdlePTD to new IdlePTD - copying only - * kernel mappings - */ - pmap_qenter(newPTD, m, 4); - - memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t), - (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t), - nkpt*sizeof(vm_paddr_t)); - - pmap_qremove(newPTD, 4); - kva_free(newPTD, 4 * PAGE_SIZE); - /* - * map actual idle stack to boot_stack - */ - pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD])); - - - xen_pgdpt_pin(VM_PAGE_TO_MACH(m[NPGPTD + 1])); - rw_wlock(&pvh_global_lock); - for (i = 0; i < 4; i++) { - int pdir = (PTDPTDI + i) / NPDEPG; - int curoffset = (PTDPTDI + i) % NPDEPG; - - xen_queue_pt_update((vm_paddr_t) - ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))), - ma[i]); - } - PT_UPDATES_FLUSH(); - rw_wunlock(&pvh_global_lock); - - memset(&ctxt, 0, sizeof(ctxt)); - ctxt.flags = VGCF_IN_KERNEL; - ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL); - ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL); - ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL); - ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL); - ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL); - ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL); - ctxt.user_regs.eip = (unsigned long)init_secondary; - ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */ - - memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); - - smp_trap_init(ctxt.trap_ctxt); - - ctxt.ldt_ents = 0; - ctxt.gdt_frames[0] = - (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT); - ctxt.gdt_ents = 512; - -#ifdef __i386__ - ctxt.user_regs.esp = boot_stack + PAGE_SIZE; - - ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); - ctxt.kernel_sp = boot_stack + PAGE_SIZE; - - ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL); - ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback; - ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL); - ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; - - ctxt.ctrlreg[3] = VM_PAGE_TO_MACH(m[NPGPTD + 1]); -#else /* __x86_64__ */ - ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); - ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); - ctxt.kernel_sp = idle->thread.rsp0; - - ctxt.event_callback_eip = (unsigned long)hypervisor_callback; - ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; - ctxt.syscall_callback_eip = (unsigned long)system_call; - - ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); - - ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu)); -#endif - - printf("gdtpfn=%lx pdptpfn=%lx\n", - ctxt.gdt_frames[0], - ctxt.ctrlreg[3] >> PAGE_SHIFT); - - PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt)); - DELAY(3000); - PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)); -} - -/* - * This function starts the AP (application processor) identified - * by the APIC ID 'physicalCpu'. It does quite a "song and dance" - * to accomplish this. This is necessary because of the nuances - * of the different hardware we might encounter. It isn't pretty, - * but it seems to work. - */ - -int cpus; -static int -start_ap(int apic_id) -{ - int ms; - - /* used as a watchpoint to signal AP startup */ - cpus = mp_naps; - - cpu_initialize_context(apic_id); - - /* Wait up to 5 seconds for it to start. */ - for (ms = 0; ms < 5000; ms++) { - if (mp_naps > cpus) - return (1); /* return SUCCESS */ - DELAY(1000); - } - return (0); /* return FAILURE */ -} - -static void -ipi_pcpu(int cpu, u_int ipi) -{ - KASSERT((ipi <= nitems(xen_ipis)), ("invalid IPI")); - xen_intr_signal(DPCPU_ID_GET(cpu, ipi_handle[ipi])); -} - -/* - * send an IPI to a specific CPU. - */ -void -ipi_send_cpu(int cpu, u_int ipi) -{ - u_int bitmap, old_pending, new_pending; - - if (IPI_IS_BITMAPED(ipi)) { - bitmap = 1 << ipi; - ipi = IPI_BITMAP_VECTOR; - do { - old_pending = cpu_ipi_pending[cpu]; - new_pending = old_pending | bitmap; - } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], - old_pending, new_pending)); - if (!old_pending) - ipi_pcpu(cpu, RESCHEDULE_VECTOR); - } else { - KASSERT(call_data != NULL, ("call_data not set")); - ipi_pcpu(cpu, CALL_FUNCTION_VECTOR); - } -} - -/* - * Flush the TLB on all other CPU's - */ -static void -smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) -{ - u_int ncpu; - struct _call_data data; - - ncpu = mp_ncpus - 1; /* does not shootdown self */ - if (ncpu < 1) - return; /* no other cpus */ - if (!(read_eflags() & PSL_I)) - panic("%s: interrupts disabled", __func__); - mtx_lock_spin(&smp_ipi_mtx); - KASSERT(call_data == NULL, ("call_data isn't null?!")); - call_data = &data; - call_data->func_id = vector; - call_data->arg1 = addr1; - call_data->arg2 = addr2; - atomic_store_rel_int(&smp_tlb_wait, 0); - ipi_all_but_self(vector); - while (smp_tlb_wait < ncpu) - ia32_pause(); - call_data = NULL; - mtx_unlock_spin(&smp_ipi_mtx); -} - -static void -smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, - vm_offset_t addr2) -{ - int cpu, ncpu, othercpus; - struct _call_data data; - - othercpus = mp_ncpus - 1; - if (CPU_ISFULLSET(&mask)) { - if (othercpus < 1) - return; - } else { - CPU_CLR(PCPU_GET(cpuid), &mask); - if (CPU_EMPTY(&mask)) - return; - } - if (!(read_eflags() & PSL_I)) - panic("%s: interrupts disabled", __func__); - mtx_lock_spin(&smp_ipi_mtx); - KASSERT(call_data == NULL, ("call_data isn't null?!")); - call_data = &data; - call_data->func_id = vector; - call_data->arg1 = addr1; - call_data->arg2 = addr2; - atomic_store_rel_int(&smp_tlb_wait, 0); - if (CPU_ISFULLSET(&mask)) { - ncpu = othercpus; - ipi_all_but_self(vector); - } else { - ncpu = 0; - while ((cpu = CPU_FFS(&mask)) != 0) { - cpu--; - CPU_CLR(cpu, &mask); - CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, - vector); - ipi_send_cpu(cpu, vector); - ncpu++; - } - } - while (smp_tlb_wait < ncpu) - ia32_pause(); - call_data = NULL; - mtx_unlock_spin(&smp_ipi_mtx); -} - -void -smp_cache_flush(void) -{ - - if (smp_started) - smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); -} - -void -smp_invltlb(void) -{ - - if (smp_started) { - smp_tlb_shootdown(IPI_INVLTLB, 0, 0); - } -} - -void -smp_invlpg(vm_offset_t addr) -{ - - if (smp_started) { - smp_tlb_shootdown(IPI_INVLPG, addr, 0); - } -} - -void -smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) -{ - - if (smp_started) { - smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); - } -} - -void -smp_masked_invltlb(cpuset_t mask) -{ - - if (smp_started) { - smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); - } -} - -void -smp_masked_invlpg(cpuset_t mask, vm_offset_t addr) -{ - - if (smp_started) { - smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); - } -} - -void -smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2) -{ - - if (smp_started) { - smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); - } -} - -/* - * send an IPI to a set of cpus. - */ -void -ipi_selected(cpuset_t cpus, u_int ipi) -{ - int cpu; - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - if (ipi == IPI_STOP_HARD) - CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); - - while ((cpu = CPU_FFS(&cpus)) != 0) { - cpu--; - CPU_CLR(cpu, &cpus); - CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); - ipi_send_cpu(cpu, ipi); - } -} - -/* - * send an IPI to a specific CPU. - */ -void -ipi_cpu(int cpu, u_int ipi) -{ - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - if (ipi == IPI_STOP_HARD) - CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); - - CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); - ipi_send_cpu(cpu, ipi); -} - -/* - * send an IPI to all CPUs EXCEPT myself - */ -void -ipi_all_but_self(u_int ipi) -{ - cpuset_t other_cpus; - - /* - * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit - * of help in order to understand what is the source. - * Set the mask of receiving CPUs for this purpose. - */ - other_cpus = all_cpus; - CPU_CLR(PCPU_GET(cpuid), &other_cpus); - if (ipi == IPI_STOP_HARD) - CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); - - CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); - ipi_selected(other_cpus, ipi); -} - -int -ipi_nmi_handler() -{ - u_int cpuid; - - /* - * As long as there is not a simple way to know about a NMI's - * source, if the bitmask for the current CPU is present in - * the global pending bitword an IPI_STOP_HARD has been issued - * and should be handled. - */ - cpuid = PCPU_GET(cpuid); - if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) - return (1); - - CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); - cpustop_handler(); - return (0); -} - -/* - * Handle an IPI_STOP by saving our current context and spinning until we - * are resumed. - */ -void -cpustop_handler(void) -{ - int cpu; - - cpu = PCPU_GET(cpuid); - - savectx(&stoppcbs[cpu]); - - /* Indicate that we are stopped */ - CPU_SET_ATOMIC(cpu, &stopped_cpus); - - /* Wait for restart */ - while (!CPU_ISSET(cpu, &started_cpus)) - ia32_pause(); - - CPU_CLR_ATOMIC(cpu, &started_cpus); - CPU_CLR_ATOMIC(cpu, &stopped_cpus); - - if (cpu == 0 && cpustop_restartfunc != NULL) { - cpustop_restartfunc(); - cpustop_restartfunc = NULL; - } -} - -/* - * Handlers for TLB related IPIs - * - * On i386 Xen PV this are no-ops since this port doesn't support SMP. - */ -void -invltlb_handler(void) -{ -} - -void -invlpg_handler(void) -{ -} - -void -invlrng_handler(void) -{ -} - -void -invlcache_handler(void) -{ -} - -/* - * This is called once the rest of the system is up and running and we're - * ready to let the AP's out of the pen. - */ -static void -release_aps(void *dummy __unused) -{ - - if (mp_ncpus == 1) - return; - atomic_store_rel_int(&aps_ready, 1); - while (smp_started == 0) - ia32_pause(); -} -SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); -SYSINIT(start_ipis, SI_SUB_SMP, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL); -SYSINIT(start_cpu, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_setup_cpus, NULL); Index: sys/i386/xen/mptable.c =================================================================== --- sys/i386/xen/mptable.c +++ /dev/null @@ -1,109 +0,0 @@ -/*- - * Copyright (c) 2003 John Baldwin - * Copyright (c) 1996, by Steve Passe - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. The name of the developer may NOT be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - - -static int mptable_probe(void); -static int mptable_probe_cpus(void); -static void mptable_register(void *dummy); -static int mptable_setup_local(void); -static int mptable_setup_io(void); - -static struct apic_enumerator mptable_enumerator = { - "MPTable", - mptable_probe, - mptable_probe_cpus, - mptable_setup_local, - mptable_setup_io -}; - -static int -mptable_probe(void) -{ - - return (-100); -} - -static int -mptable_probe_cpus(void) -{ - int i, rc; - - for (i = 0; i < MAXCPU; i++) { - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) - cpu_add(i, (i == 0)); - } - - return (0); -} - -/* - * Initialize the local APIC on the BSP. - */ -static int -mptable_setup_local(void) -{ - - PCPU_SET(apic_id, 0); - PCPU_SET(vcpu_id, 0); - return (0); -} - -static int -mptable_setup_io(void) -{ - - return (0); -} - -static void -mptable_register(void *dummy __unused) -{ - - apic_register_enumerator(&mptable_enumerator); -} -SYSINIT(mptable_register, SI_SUB_TUNABLES - 1, SI_ORDER_FIRST, mptable_register, - NULL); Index: sys/i386/xen/pmap.c =================================================================== --- sys/i386/xen/pmap.c +++ /dev/null @@ -1,4420 +0,0 @@ -/*- - * Copyright (c) 1991 Regents of the University of California. - * All rights reserved. - * Copyright (c) 1994 John S. Dyson - * All rights reserved. - * Copyright (c) 1994 David Greenman - * All rights reserved. - * Copyright (c) 2005 Alan L. Cox - * All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * the Systems Programming Group of the University of Utah Computer - * Science Department and William Jolitz of UUNET Technologies Inc. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - */ -/*- - * Copyright (c) 2003 Networks Associates Technology, Inc. - * All rights reserved. - * - * This software was developed for the FreeBSD Project by Jake Burkholder, - * Safeport Network Services, and Network Associates Laboratories, the - * Security Research Division of Network Associates, Inc. under - * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA - * CHATS research program. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -/* - * Manages physical address maps. - * - * Since the information managed by this module is - * also stored by the logical address mapping module, - * this module may throw away valid virtual-to-physical - * mappings at almost any time. However, invalidations - * of virtual-to-physical mappings must be done as - * requested. - * - * In order to cope with hardware architectures which - * make virtual-to-physical map invalidates expensive, - * this module may delay invalidate or reduced protection - * operations until such time as they are actually - * necessary. This module is given full information as - * to which processors are currently using which maps, - * and to when physical maps must be made correct. - */ - -#include "opt_cpu.h" -#include "opt_pmap.h" -#include "opt_smp.h" -#include "opt_xbox.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef SMP -#include -#else -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#ifdef SMP -#include -#endif - -#ifdef XBOX -#include -#endif - -#include -#include -#include -#include -#include - -#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) -#define CPU_ENABLE_SSE -#endif - -#ifndef PMAP_SHPGPERPROC -#define PMAP_SHPGPERPROC 200 -#endif - -#define DIAGNOSTIC - -#if !defined(DIAGNOSTIC) -#ifdef __GNUC_GNU_INLINE__ -#define PMAP_INLINE __attribute__((__gnu_inline__)) inline -#else -#define PMAP_INLINE extern inline -#endif -#else -#define PMAP_INLINE -#endif - -#ifdef PV_STATS -#define PV_STAT(x) do { x ; } while (0) -#else -#define PV_STAT(x) do { } while (0) -#endif - -/* - * Get PDEs and PTEs for user/kernel address space - */ -#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) -#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) - -#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) -#define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) -#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) -#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) -#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) - -#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) - -#define HAMFISTED_LOCKING -#ifdef HAMFISTED_LOCKING -static struct mtx createdelete_lock; -#endif - -struct pmap kernel_pmap_store; -LIST_HEAD(pmaplist, pmap); -static struct pmaplist allpmaps; -static struct mtx allpmaps_lock; - -vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ -vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ -int pgeflag = 0; /* PG_G or-in */ -int pseflag = 0; /* PG_PS or-in */ - -int nkpt; -vm_offset_t kernel_vm_end; -extern u_int32_t KERNend; - -#ifdef PAE -pt_entry_t pg_nx; -#endif - -static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); - -static int pat_works; /* Is page attribute table sane? */ - -/* - * This lock is defined as static in other pmap implementations. It cannot, - * however, be defined as static here, because it is (ab)used to serialize - * queued page table changes in other sources files. - */ -struct rwlock pvh_global_lock; - -/* - * Data for the pv entry allocation mechanism - */ -static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); -static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; -static int shpgperproc = PMAP_SHPGPERPROC; - -struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ -int pv_maxchunks; /* How many chunks we have KVA for */ -vm_offset_t pv_vafree; /* freelist stored in the PTE */ - -/* - * All those kernel PT submaps that BSD is so fond of - */ -struct sysmaps { - struct mtx lock; - pt_entry_t *CMAP1; - pt_entry_t *CMAP2; - caddr_t CADDR1; - caddr_t CADDR2; -}; -static struct sysmaps sysmaps_pcpu[MAXCPU]; -pt_entry_t *CMAP3; -caddr_t ptvmmap = 0; -caddr_t CADDR3; -struct msgbuf *msgbufp = 0; - -/* - * Crashdump maps. - */ -static caddr_t crashdumpmap; - -static pt_entry_t *PMAP1 = 0, *PMAP2; -static pt_entry_t *PADDR1 = 0, *PADDR2; -#ifdef SMP -static int PMAP1cpu; -static int PMAP1changedcpu; -SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, - &PMAP1changedcpu, 0, - "Number of times pmap_pte_quick changed CPU with same PMAP1"); -#endif -static int PMAP1changed; -SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, - &PMAP1changed, 0, - "Number of times pmap_pte_quick changed PMAP1"); -static int PMAP1unchanged; -SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, - &PMAP1unchanged, 0, - "Number of times pmap_pte_quick didn't change PMAP1"); -static struct mtx PMAP2mutex; - -static void free_pv_chunk(struct pv_chunk *pc); -static void free_pv_entry(pmap_t pmap, pv_entry_t pv); -static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); -static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); -static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, - vm_offset_t va); - -static vm_page_t pmap_enter_quick_locked(multicall_entry_t **mcl, int *count, pmap_t pmap, vm_offset_t va, - vm_page_t m, vm_prot_t prot, vm_page_t mpte); -static void pmap_flush_page(vm_page_t m); -static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); -static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, - vm_page_t *free); -static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, - vm_page_t *free); -static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, - vm_offset_t va); -static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, - vm_page_t m); - -static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); - -static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); -static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free); -static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); -static void pmap_pte_release(pt_entry_t *pte); -static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *); -static boolean_t pmap_is_prefaultable_locked(pmap_t pmap, vm_offset_t addr); - -static __inline void pagezero(void *page); - -CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); -CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); - -/* - * If you get an error here, then you set KVA_PAGES wrong! See the - * description of KVA_PAGES in sys/i386/include/pmap.h. It must be - * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. - */ -CTASSERT(KERNBASE % (1 << 24) == 0); - -void -pd_set(struct pmap *pmap, int ptepindex, vm_paddr_t val, int type) -{ - vm_paddr_t pdir_ma = vtomach(&pmap->pm_pdir[ptepindex]); - - switch (type) { - case SH_PD_SET_VA: -#if 0 - xen_queue_pt_update(shadow_pdir_ma, - xpmap_ptom(val & ~(PG_RW))); -#endif - xen_queue_pt_update(pdir_ma, - xpmap_ptom(val)); - break; - case SH_PD_SET_VA_MA: -#if 0 - xen_queue_pt_update(shadow_pdir_ma, - val & ~(PG_RW)); -#endif - xen_queue_pt_update(pdir_ma, val); - break; - case SH_PD_SET_VA_CLEAR: -#if 0 - xen_queue_pt_update(shadow_pdir_ma, 0); -#endif - xen_queue_pt_update(pdir_ma, 0); - break; - } -} - -/* - * Bootstrap the system enough to run with virtual memory. - * - * On the i386 this is called after mapping has already been enabled - * and just syncs the pmap module with what has already been done. - * [We can't call it easily with mapping off since the kernel is not - * mapped with PA == VA, hence we would have to relocate every address - * from the linked base (virtual) address "KERNBASE" to the actual - * (physical) address starting relative to 0] - */ -void -pmap_bootstrap(vm_paddr_t firstaddr) -{ - vm_offset_t va; - pt_entry_t *pte, *unused; - struct sysmaps *sysmaps; - int i; - - /* - * Initialize the first available kernel virtual address. However, - * using "firstaddr" may waste a few pages of the kernel virtual - * address space, because locore may not have mapped every physical - * page that it allocated. Preferably, locore would provide a first - * unused virtual address in addition to "firstaddr". - */ - virtual_avail = (vm_offset_t) KERNBASE + firstaddr; - - virtual_end = VM_MAX_KERNEL_ADDRESS; - - /* - * Initialize the kernel pmap (which is statically allocated). - */ - PMAP_LOCK_INIT(kernel_pmap); - kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); -#ifdef PAE - kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); -#endif - CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ - TAILQ_INIT(&kernel_pmap->pm_pvchunk); - - /* - * Initialize the global pv list lock. - */ - rw_init_flags(&pvh_global_lock, "pmap pv global", RW_RECURSE); - - LIST_INIT(&allpmaps); - mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); - mtx_lock_spin(&allpmaps_lock); - LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); - mtx_unlock_spin(&allpmaps_lock); - if (nkpt == 0) - nkpt = NKPT; - - /* - * Reserve some special page table entries/VA space for temporary - * mapping of pages. - */ -#define SYSMAP(c, p, v, n) \ - v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); - - va = virtual_avail; - pte = vtopte(va); - - /* - * CMAP1/CMAP2 are used for zeroing and copying pages. - * CMAP3 is used for the idle process page zeroing. - */ - for (i = 0; i < MAXCPU; i++) { - sysmaps = &sysmaps_pcpu[i]; - mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); - SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) - SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) - PT_SET_MA(sysmaps->CADDR1, 0); - PT_SET_MA(sysmaps->CADDR2, 0); - } - SYSMAP(caddr_t, CMAP3, CADDR3, 1) - PT_SET_MA(CADDR3, 0); - - /* - * Crashdump maps. - */ - SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) - - /* - * ptvmmap is used for reading arbitrary physical pages via /dev/mem. - */ - SYSMAP(caddr_t, unused, ptvmmap, 1) - - /* - * msgbufp is used to map the system message buffer. - */ - SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) - - /* - * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), - * respectively. - */ - SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) - SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) - - mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); - - virtual_avail = va; - - /* - * Leave in place an identity mapping (virt == phys) for the low 1 MB - * physical memory region that is used by the ACPI wakeup code. This - * mapping must not have PG_G set. - */ -#ifndef XEN - /* - * leave here deliberately to show that this is not supported - */ -#ifdef XBOX - /* FIXME: This is gross, but needed for the XBOX. Since we are in such - * an early stadium, we cannot yet neatly map video memory ... :-( - * Better fixes are very welcome! */ - if (!arch_i386_is_xbox) -#endif - for (i = 1; i < NKPT; i++) - PTD[i] = 0; - - /* Initialize the PAT MSR if present. */ - pmap_init_pat(); - - /* Turn on PG_G on kernel page(s) */ - pmap_set_pg(); -#endif - -#ifdef HAMFISTED_LOCKING - mtx_init(&createdelete_lock, "pmap create/delete", NULL, MTX_DEF); -#endif -} - -/* - * Setup the PAT MSR. - */ -void -pmap_init_pat(void) -{ - uint64_t pat_msr; - - /* Bail if this CPU doesn't implement PAT. */ - if (!(cpu_feature & CPUID_PAT)) - return; - - if (cpu_vendor_id != CPU_VENDOR_INTEL || - (CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) { - /* - * Leave the indices 0-3 at the default of WB, WT, UC, and UC-. - * Program 4 and 5 as WP and WC. - * Leave 6 and 7 as UC and UC-. - */ - pat_msr = rdmsr(MSR_PAT); - pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5)); - pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) | - PAT_VALUE(5, PAT_WRITE_COMBINING); - pat_works = 1; - } else { - /* - * Due to some Intel errata, we can only safely use the lower 4 - * PAT entries. Thus, just replace PAT Index 2 with WC instead - * of UC-. - * - * Intel Pentium III Processor Specification Update - * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B - * or Mode C Paging) - * - * Intel Pentium IV Processor Specification Update - * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) - */ - pat_msr = rdmsr(MSR_PAT); - pat_msr &= ~PAT_MASK(2); - pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); - pat_works = 0; - } - wrmsr(MSR_PAT, pat_msr); -} - -/* - * Initialize a vm_page's machine-dependent fields. - */ -void -pmap_page_init(vm_page_t m) -{ - - TAILQ_INIT(&m->md.pv_list); - m->md.pat_mode = PAT_WRITE_BACK; -} - -/* - * ABuse the pte nodes for unmapped kva to thread a kva freelist through. - * Requirements: - * - Must deal with pages in order to ensure that none of the PG_* bits - * are ever set, PG_V in particular. - * - Assumes we can write to ptes without pte_store() atomic ops, even - * on PAE systems. This should be ok. - * - Assumes nothing will ever test these addresses for 0 to indicate - * no mapping instead of correctly checking PG_V. - * - Assumes a vm_offset_t will fit in a pte (true for i386). - * Because PG_V is never set, there can be no mappings to invalidate. - */ -static int ptelist_count = 0; -static vm_offset_t -pmap_ptelist_alloc(vm_offset_t *head) -{ - vm_offset_t va; - vm_offset_t *phead = (vm_offset_t *)*head; - - if (ptelist_count == 0) { - printf("out of memory!!!!!!\n"); - return (0); /* Out of memory */ - } - ptelist_count--; - va = phead[ptelist_count]; - return (va); -} - -static void -pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) -{ - vm_offset_t *phead = (vm_offset_t *)*head; - - phead[ptelist_count++] = va; -} - -static void -pmap_ptelist_init(vm_offset_t *head, void *base, int npages) -{ - int i, nstackpages; - vm_offset_t va; - vm_page_t m; - - nstackpages = (npages + PAGE_SIZE/sizeof(vm_offset_t) - 1)/ (PAGE_SIZE/sizeof(vm_offset_t)); - for (i = 0; i < nstackpages; i++) { - va = (vm_offset_t)base + i * PAGE_SIZE; - m = vm_page_alloc(NULL, i, - VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | - VM_ALLOC_ZERO); - pmap_qenter(va, &m, 1); - } - - *head = (vm_offset_t)base; - for (i = npages - 1; i >= nstackpages; i--) { - va = (vm_offset_t)base + i * PAGE_SIZE; - pmap_ptelist_free(head, va); - } -} - - -/* - * Initialize the pmap module. - * Called by vm_init, to initialize any structures that the pmap - * system needs to map virtual memory. - */ -void -pmap_init(void) -{ - - /* - * Initialize the address space (zone) for the pv entries. Set a - * high water mark so that the system can recover from excessive - * numbers of pv entries. - */ - TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); - pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; - TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); - pv_entry_max = roundup(pv_entry_max, _NPCPV); - pv_entry_high_water = 9 * (pv_entry_max / 10); - - pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); - pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); - if (pv_chunkbase == NULL) - panic("pmap_init: not enough kvm for pv chunks"); - pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); -} - - -SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, - "Max number of PV entries"); -SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, - "Page share factor per proc"); - -static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, - "2/4MB page mapping counters"); - -static u_long pmap_pde_mappings; -SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, - &pmap_pde_mappings, 0, "2/4MB page mappings"); - -/*************************************************** - * Low level helper routines..... - ***************************************************/ - -/* - * Determine the appropriate bits to set in a PTE or PDE for a specified - * caching mode. - */ -int -pmap_cache_bits(int mode, boolean_t is_pde) -{ - int pat_flag, pat_index, cache_bits; - - /* The PAT bit is different for PTE's and PDE's. */ - pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; - - /* If we don't support PAT, map extended modes to older ones. */ - if (!(cpu_feature & CPUID_PAT)) { - switch (mode) { - case PAT_UNCACHEABLE: - case PAT_WRITE_THROUGH: - case PAT_WRITE_BACK: - break; - case PAT_UNCACHED: - case PAT_WRITE_COMBINING: - case PAT_WRITE_PROTECTED: - mode = PAT_UNCACHEABLE; - break; - } - } - - /* Map the caching mode to a PAT index. */ - if (pat_works) { - switch (mode) { - case PAT_UNCACHEABLE: - pat_index = 3; - break; - case PAT_WRITE_THROUGH: - pat_index = 1; - break; - case PAT_WRITE_BACK: - pat_index = 0; - break; - case PAT_UNCACHED: - pat_index = 2; - break; - case PAT_WRITE_COMBINING: - pat_index = 5; - break; - case PAT_WRITE_PROTECTED: - pat_index = 4; - break; - default: - panic("Unknown caching mode %d\n", mode); - } - } else { - switch (mode) { - case PAT_UNCACHED: - case PAT_UNCACHEABLE: - case PAT_WRITE_PROTECTED: - pat_index = 3; - break; - case PAT_WRITE_THROUGH: - pat_index = 1; - break; - case PAT_WRITE_BACK: - pat_index = 0; - break; - case PAT_WRITE_COMBINING: - pat_index = 2; - break; - default: - panic("Unknown caching mode %d\n", mode); - } - } - - /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ - cache_bits = 0; - if (pat_index & 0x4) - cache_bits |= pat_flag; - if (pat_index & 0x2) - cache_bits |= PG_NC_PCD; - if (pat_index & 0x1) - cache_bits |= PG_NC_PWT; - return (cache_bits); -} -#ifdef SMP -/* - * For SMP, these functions have to use the IPI mechanism for coherence. - * - * N.B.: Before calling any of the following TLB invalidation functions, - * the calling processor must ensure that all stores updating a non- - * kernel page table are globally performed. Otherwise, another - * processor could cache an old, pre-update entry without being - * invalidated. This can happen one of two ways: (1) The pmap becomes - * active on another processor after its pm_active field is checked by - * one of the following functions but before a store updating the page - * table is globally performed. (2) The pmap becomes active on another - * processor before its pm_active field is checked but due to - * speculative loads one of the following functions stills reads the - * pmap as inactive on the other processor. - * - * The kernel page table is exempt because its pm_active field is - * immutable. The kernel page table is always active on every - * processor. - */ -void -pmap_invalidate_page(pmap_t pmap, vm_offset_t va) -{ - cpuset_t other_cpus; - u_int cpuid; - - CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x", - pmap, va); - - sched_pin(); - if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { - invlpg(va); - smp_invlpg(va); - } else { - cpuid = PCPU_GET(cpuid); - other_cpus = all_cpus; - CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) - invlpg(va); - CPU_AND(&other_cpus, &pmap->pm_active); - if (!CPU_EMPTY(&other_cpus)) - smp_masked_invlpg(other_cpus, va); - } - sched_unpin(); - PT_UPDATES_FLUSH(); -} - -void -pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) -{ - cpuset_t other_cpus; - vm_offset_t addr; - u_int cpuid; - - CTR3(KTR_PMAP, "pmap_invalidate_page: pmap=%p eva=0x%x sva=0x%x", - pmap, sva, eva); - - sched_pin(); - if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); - smp_invlpg_range(sva, eva); - } else { - cpuid = PCPU_GET(cpuid); - other_cpus = all_cpus; - CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); - CPU_AND(&other_cpus, &pmap->pm_active); - if (!CPU_EMPTY(&other_cpus)) - smp_masked_invlpg_range(other_cpus, sva, eva); - } - sched_unpin(); - PT_UPDATES_FLUSH(); -} - -void -pmap_invalidate_all(pmap_t pmap) -{ - cpuset_t other_cpus; - u_int cpuid; - - CTR1(KTR_PMAP, "pmap_invalidate_page: pmap=%p", pmap); - - sched_pin(); - if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { - invltlb(); - smp_invltlb(); - } else { - cpuid = PCPU_GET(cpuid); - other_cpus = all_cpus; - CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) - invltlb(); - CPU_AND(&other_cpus, &pmap->pm_active); - if (!CPU_EMPTY(&other_cpus)) - smp_masked_invltlb(other_cpus); - } - sched_unpin(); -} - -void -pmap_invalidate_cache(void) -{ - - sched_pin(); - wbinvd(); - smp_cache_flush(); - sched_unpin(); -} -#else /* !SMP */ -/* - * Normal, non-SMP, 486+ invalidation functions. - * We inline these within pmap.c for speed. - */ -PMAP_INLINE void -pmap_invalidate_page(pmap_t pmap, vm_offset_t va) -{ - CTR2(KTR_PMAP, "pmap_invalidate_page: pmap=%p va=0x%x", - pmap, va); - - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - invlpg(va); - PT_UPDATES_FLUSH(); -} - -PMAP_INLINE void -pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) -{ - vm_offset_t addr; - - if (eva - sva > PAGE_SIZE) - CTR3(KTR_PMAP, "pmap_invalidate_range: pmap=%p sva=0x%x eva=0x%x", - pmap, sva, eva); - - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); - PT_UPDATES_FLUSH(); -} - -PMAP_INLINE void -pmap_invalidate_all(pmap_t pmap) -{ - - CTR1(KTR_PMAP, "pmap_invalidate_all: pmap=%p", pmap); - - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) - invltlb(); -} - -PMAP_INLINE void -pmap_invalidate_cache(void) -{ - - wbinvd(); -} -#endif /* !SMP */ - -#define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) - -void -pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) -{ - - if (force) { - sva &= ~(vm_offset_t)cpu_clflush_line_size; - } else { - KASSERT((sva & PAGE_MASK) == 0, - ("pmap_invalidate_cache_range: sva not page-aligned")); - KASSERT((eva & PAGE_MASK) == 0, - ("pmap_invalidate_cache_range: eva not page-aligned")); - } - - if ((cpu_feature & CPUID_SS) != 0 && !force) - ; /* If "Self Snoop" is supported, do nothing. */ - else if ((cpu_feature & CPUID_CLFSH) != 0 && - eva - sva < PMAP_CLFLUSH_THRESHOLD) { - - /* - * Otherwise, do per-cache line flush. Use the mfence - * instruction to insure that previous stores are - * included in the write-back. The processor - * propagates flush to other processors in the cache - * coherence domain. - */ - mfence(); - for (; sva < eva; sva += cpu_clflush_line_size) - clflush(sva); - mfence(); - } else { - - /* - * No targeted cache flush methods are supported by CPU, - * or the supplied range is bigger than 2MB. - * Globally invalidate cache. - */ - pmap_invalidate_cache(); - } -} - -void -pmap_invalidate_cache_pages(vm_page_t *pages, int count) -{ - int i; - - if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || - (cpu_feature & CPUID_CLFSH) == 0) { - pmap_invalidate_cache(); - } else { - for (i = 0; i < count; i++) - pmap_flush_page(pages[i]); - } -} - -/* - * Are we current address space or kernel? N.B. We return FALSE when - * a pmap's page table is in use because a kernel thread is borrowing - * it. The borrowed page table can change spontaneously, making any - * dependence on its continued use subject to a race condition. - */ -static __inline int -pmap_is_current(pmap_t pmap) -{ - - return (pmap == kernel_pmap || - (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && - (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); -} - -/* - * If the given pmap is not the current or kernel pmap, the returned pte must - * be released by passing it to pmap_pte_release(). - */ -pt_entry_t * -pmap_pte(pmap_t pmap, vm_offset_t va) -{ - pd_entry_t newpf; - pd_entry_t *pde; - - pde = pmap_pde(pmap, va); - if (*pde & PG_PS) - return (pde); - if (*pde != 0) { - /* are we current address space or kernel? */ - if (pmap_is_current(pmap)) - return (vtopte(va)); - mtx_lock(&PMAP2mutex); - newpf = *pde & PG_FRAME; - if ((*PMAP2 & PG_FRAME) != newpf) { - PT_SET_MA(PADDR2, newpf | PG_V | PG_A | PG_M); - CTR3(KTR_PMAP, "pmap_pte: pmap=%p va=0x%x newpte=0x%08x", - pmap, va, (*PMAP2 & 0xffffffff)); - } - return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); - } - return (NULL); -} - -/* - * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte - * being NULL. - */ -static __inline void -pmap_pte_release(pt_entry_t *pte) -{ - - if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) { - CTR1(KTR_PMAP, "pmap_pte_release: pte=0x%jx", - *PMAP2); - rw_wlock(&pvh_global_lock); - PT_SET_VA(PMAP2, 0, TRUE); - rw_wunlock(&pvh_global_lock); - mtx_unlock(&PMAP2mutex); - } -} - -static __inline void -invlcaddr(void *caddr) -{ - - invlpg((u_int)caddr); - PT_UPDATES_FLUSH(); -} - -/* - * Super fast pmap_pte routine best used when scanning - * the pv lists. This eliminates many coarse-grained - * invltlb calls. Note that many of the pv list - * scans are across different pmaps. It is very wasteful - * to do an entire invltlb for checking a single mapping. - * - * If the given pmap is not the current pmap, pvh_global_lock - * must be held and curthread pinned to a CPU. - */ -static pt_entry_t * -pmap_pte_quick(pmap_t pmap, vm_offset_t va) -{ - pd_entry_t newpf; - pd_entry_t *pde; - - pde = pmap_pde(pmap, va); - if (*pde & PG_PS) - return (pde); - if (*pde != 0) { - /* are we current address space or kernel? */ - if (pmap_is_current(pmap)) - return (vtopte(va)); - rw_assert(&pvh_global_lock, RA_WLOCKED); - KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); - newpf = *pde & PG_FRAME; - if ((*PMAP1 & PG_FRAME) != newpf) { - PT_SET_MA(PADDR1, newpf | PG_V | PG_A | PG_M); - CTR3(KTR_PMAP, "pmap_pte_quick: pmap=%p va=0x%x newpte=0x%08x", - pmap, va, (u_long)*PMAP1); - -#ifdef SMP - PMAP1cpu = PCPU_GET(cpuid); -#endif - PMAP1changed++; - } else -#ifdef SMP - if (PMAP1cpu != PCPU_GET(cpuid)) { - PMAP1cpu = PCPU_GET(cpuid); - invlcaddr(PADDR1); - PMAP1changedcpu++; - } else -#endif - PMAP1unchanged++; - return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); - } - return (0); -} - -/* - * Routine: pmap_extract - * Function: - * Extract the physical page address associated - * with the given map/virtual_address pair. - */ -vm_paddr_t -pmap_extract(pmap_t pmap, vm_offset_t va) -{ - vm_paddr_t rtval; - pt_entry_t *pte; - pd_entry_t pde; - pt_entry_t pteval; - - rtval = 0; - PMAP_LOCK(pmap); - pde = pmap->pm_pdir[va >> PDRSHIFT]; - if (pde != 0) { - if ((pde & PG_PS) != 0) { - rtval = xpmap_mtop(pde & PG_PS_FRAME) | (va & PDRMASK); - PMAP_UNLOCK(pmap); - return rtval; - } - pte = pmap_pte(pmap, va); - pteval = *pte ? xpmap_mtop(*pte) : 0; - rtval = (pteval & PG_FRAME) | (va & PAGE_MASK); - pmap_pte_release(pte); - } - PMAP_UNLOCK(pmap); - return (rtval); -} - -/* - * Routine: pmap_extract_ma - * Function: - * Like pmap_extract, but returns machine address - */ -vm_paddr_t -pmap_extract_ma(pmap_t pmap, vm_offset_t va) -{ - vm_paddr_t rtval; - pt_entry_t *pte; - pd_entry_t pde; - - rtval = 0; - PMAP_LOCK(pmap); - pde = pmap->pm_pdir[va >> PDRSHIFT]; - if (pde != 0) { - if ((pde & PG_PS) != 0) { - rtval = (pde & ~PDRMASK) | (va & PDRMASK); - PMAP_UNLOCK(pmap); - return rtval; - } - pte = pmap_pte(pmap, va); - rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); - pmap_pte_release(pte); - } - PMAP_UNLOCK(pmap); - return (rtval); -} - -/* - * Routine: pmap_extract_and_hold - * Function: - * Atomically extract and hold the physical page - * with the given pmap and virtual address pair - * if that mapping permits the given protection. - */ -vm_page_t -pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) -{ - pd_entry_t pde; - pt_entry_t pte, *ptep; - vm_page_t m; - vm_paddr_t pa; - - pa = 0; - m = NULL; - PMAP_LOCK(pmap); -retry: - pde = PT_GET(pmap_pde(pmap, va)); - if (pde != 0) { - if (pde & PG_PS) { - if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { - if (vm_page_pa_tryrelock(pmap, (pde & - PG_PS_FRAME) | (va & PDRMASK), &pa)) - goto retry; - m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | - (va & PDRMASK)); - vm_page_hold(m); - } - } else { - ptep = pmap_pte(pmap, va); - pte = PT_GET(ptep); - pmap_pte_release(ptep); - if (pte != 0 && - ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { - if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, - &pa)) - goto retry; - m = PHYS_TO_VM_PAGE(pte & PG_FRAME); - vm_page_hold(m); - } - } - } - PA_UNLOCK_COND(pa); - PMAP_UNLOCK(pmap); - return (m); -} - -/*************************************************** - * Low level mapping routines..... - ***************************************************/ - -/* - * Add a wired page to the kva. - * Note: not SMP coherent. - * - * This function may be used before pmap_bootstrap() is called. - */ -void -pmap_kenter(vm_offset_t va, vm_paddr_t pa) -{ - - PT_SET_MA(va, xpmap_ptom(pa)| PG_RW | PG_V | pgeflag); -} - -void -pmap_kenter_ma(vm_offset_t va, vm_paddr_t ma) -{ - pt_entry_t *pte; - - pte = vtopte(va); - pte_store_ma(pte, ma | PG_RW | PG_V | pgeflag); -} - -static __inline void -pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) -{ - - PT_SET_MA(va, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); -} - -/* - * Remove a page from the kernel pagetables. - * Note: not SMP coherent. - * - * This function may be used before pmap_bootstrap() is called. - */ -PMAP_INLINE void -pmap_kremove(vm_offset_t va) -{ - pt_entry_t *pte; - - pte = vtopte(va); - PT_CLEAR_VA(pte, FALSE); -} - -/* - * Used to map a range of physical addresses into kernel - * virtual address space. - * - * The value passed in '*virt' is a suggested virtual address for - * the mapping. Architectures which can support a direct-mapped - * physical to virtual region can return the appropriate address - * within that region, leaving '*virt' unchanged. Other - * architectures should map the pages starting at '*virt' and - * update '*virt' with the first usable address after the mapped - * region. - */ -vm_offset_t -pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) -{ - vm_offset_t va, sva; - - va = sva = *virt; - CTR4(KTR_PMAP, "pmap_map: va=0x%x start=0x%jx end=0x%jx prot=0x%x", - va, start, end, prot); - while (start < end) { - pmap_kenter(va, start); - va += PAGE_SIZE; - start += PAGE_SIZE; - } - pmap_invalidate_range(kernel_pmap, sva, va); - *virt = va; - return (sva); -} - - -/* - * Add a list of wired pages to the kva - * this routine is only used for temporary - * kernel mappings that do not need to have - * page modification or references recorded. - * Note that old mappings are simply written - * over. The page *must* be wired. - * Note: SMP coherent. Uses a ranged shootdown IPI. - */ -void -pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) -{ - pt_entry_t *endpte, *pte; - vm_paddr_t pa; - vm_offset_t va = sva; - int mclcount = 0; - multicall_entry_t mcl[16]; - multicall_entry_t *mclp = mcl; - int error; - - CTR2(KTR_PMAP, "pmap_qenter:sva=0x%x count=%d", va, count); - pte = vtopte(sva); - endpte = pte + count; - while (pte < endpte) { - pa = VM_PAGE_TO_MACH(*ma) | pgeflag | PG_RW | PG_V | PG_M | PG_A; - - mclp->op = __HYPERVISOR_update_va_mapping; - mclp->args[0] = va; - mclp->args[1] = (uint32_t)(pa & 0xffffffff); - mclp->args[2] = (uint32_t)(pa >> 32); - mclp->args[3] = (*pte & PG_V) ? UVMF_INVLPG|UVMF_ALL : 0; - - va += PAGE_SIZE; - pte++; - ma++; - mclp++; - mclcount++; - if (mclcount == 16) { - error = HYPERVISOR_multicall(mcl, mclcount); - mclp = mcl; - mclcount = 0; - KASSERT(error == 0, ("bad multicall %d", error)); - } - } - if (mclcount) { - error = HYPERVISOR_multicall(mcl, mclcount); - KASSERT(error == 0, ("bad multicall %d", error)); - } - -#ifdef INVARIANTS - for (pte = vtopte(sva), mclcount = 0; mclcount < count; mclcount++, pte++) - KASSERT(*pte, ("pte not set for va=0x%x", sva + mclcount*PAGE_SIZE)); -#endif -} - -/* - * This routine tears out page mappings from the - * kernel -- it is meant only for temporary mappings. - * Note: SMP coherent. Uses a ranged shootdown IPI. - */ -void -pmap_qremove(vm_offset_t sva, int count) -{ - vm_offset_t va; - - CTR2(KTR_PMAP, "pmap_qremove: sva=0x%x count=%d", sva, count); - va = sva; - rw_wlock(&pvh_global_lock); - critical_enter(); - while (count-- > 0) { - pmap_kremove(va); - va += PAGE_SIZE; - } - PT_UPDATES_FLUSH(); - pmap_invalidate_range(kernel_pmap, sva, va); - critical_exit(); - rw_wunlock(&pvh_global_lock); -} - -/*************************************************** - * Page table page management routines..... - ***************************************************/ -static __inline void -pmap_free_zero_pages(vm_page_t free) -{ - vm_page_t m; - - while (free != NULL) { - m = free; - free = (void *)m->object; - m->object = NULL; - vm_page_free_zero(m); - } -} - -/* - * Decrements a page table page's wire count, which is used to record the - * number of valid page table entries within the page. If the wire count - * drops to zero, then the page table page is unmapped. Returns TRUE if the - * page table page was unmapped and FALSE otherwise. - */ -static inline boolean_t -pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free) -{ - - --m->wire_count; - if (m->wire_count == 0) { - _pmap_unwire_ptp(pmap, m, free); - return (TRUE); - } else - return (FALSE); -} - -static void -_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free) -{ - vm_offset_t pteva; - - PT_UPDATES_FLUSH(); - /* - * unmap the page table page - */ - xen_pt_unpin(pmap->pm_pdir[m->pindex]); - /* - * page *might* contain residual mapping :-/ - */ - PD_CLEAR_VA(pmap, m->pindex, TRUE); - pmap_zero_page(m); - --pmap->pm_stats.resident_count; - - /* - * This is a release store so that the ordinary store unmapping - * the page table page is globally performed before TLB shoot- - * down is begun. - */ - atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); - - /* - * Do an invltlb to make the invalidated mapping - * take effect immediately. - */ - pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); - pmap_invalidate_page(pmap, pteva); - - /* - * Put page on a list so that it is released after - * *ALL* TLB shootdown is done - */ - m->object = (void *)*free; - *free = m; -} - -/* - * After removing a page table entry, this routine is used to - * conditionally free the page, and manage the hold/wire counts. - */ -static int -pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free) -{ - pd_entry_t ptepde; - vm_page_t mpte; - - if (va >= VM_MAXUSER_ADDRESS) - return (0); - ptepde = PT_GET(pmap_pde(pmap, va)); - mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); - return (pmap_unwire_ptp(pmap, mpte, free)); -} - -/* - * Initialize the pmap for the swapper process. - */ -void -pmap_pinit0(pmap_t pmap) -{ - - PMAP_LOCK_INIT(pmap); - /* - * Since the page table directory is shared with the kernel pmap, - * which is already included in the list "allpmaps", this pmap does - * not need to be inserted into that list. - */ - pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); -#ifdef PAE - pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); -#endif - CPU_ZERO(&pmap->pm_active); - PCPU_SET(curpmap, pmap); - TAILQ_INIT(&pmap->pm_pvchunk); - bzero(&pmap->pm_stats, sizeof pmap->pm_stats); -} - -/* - * Initialize a preallocated and zeroed pmap structure, - * such as one in a vmspace structure. - */ -int -pmap_pinit(pmap_t pmap) -{ - vm_page_t m, ptdpg[NPGPTD + 1]; - int npgptd = NPGPTD + 1; - int i; - -#ifdef HAMFISTED_LOCKING - mtx_lock(&createdelete_lock); -#endif - - /* - * No need to allocate page table space yet but we do need a valid - * page directory table. - */ - if (pmap->pm_pdir == NULL) { - pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); - if (pmap->pm_pdir == NULL) { -#ifdef HAMFISTED_LOCKING - mtx_unlock(&createdelete_lock); -#endif - return (0); - } -#ifdef PAE - pmap->pm_pdpt = (pd_entry_t *)kva_alloc(1); -#endif - } - - /* - * allocate the page directory page(s) - */ - for (i = 0; i < npgptd;) { - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO); - if (m == NULL) - VM_WAIT; - else { - ptdpg[i++] = m; - } - } - - pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); - - for (i = 0; i < NPGPTD; i++) - if ((ptdpg[i]->flags & PG_ZERO) == 0) - pagezero(pmap->pm_pdir + (i * NPDEPG)); - - mtx_lock_spin(&allpmaps_lock); - LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); - /* Copy the kernel page table directory entries. */ - bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); - mtx_unlock_spin(&allpmaps_lock); - -#ifdef PAE - pmap_qenter((vm_offset_t)pmap->pm_pdpt, &ptdpg[NPGPTD], 1); - if ((ptdpg[NPGPTD]->flags & PG_ZERO) == 0) - bzero(pmap->pm_pdpt, PAGE_SIZE); - for (i = 0; i < NPGPTD; i++) { - vm_paddr_t ma; - - ma = VM_PAGE_TO_MACH(ptdpg[i]); - pmap->pm_pdpt[i] = ma | PG_V; - - } -#endif - for (i = 0; i < NPGPTD; i++) { - pt_entry_t *pd; - vm_paddr_t ma; - - ma = VM_PAGE_TO_MACH(ptdpg[i]); - pd = pmap->pm_pdir + (i * NPDEPG); - PT_SET_MA(pd, *vtopte((vm_offset_t)pd) & ~(PG_M|PG_A|PG_U|PG_RW)); -#if 0 - xen_pgd_pin(ma); -#endif - } - -#ifdef PAE - PT_SET_MA(pmap->pm_pdpt, *vtopte((vm_offset_t)pmap->pm_pdpt) & ~PG_RW); -#endif - rw_wlock(&pvh_global_lock); - xen_flush_queue(); - xen_pgdpt_pin(VM_PAGE_TO_MACH(ptdpg[NPGPTD])); - for (i = 0; i < NPGPTD; i++) { - vm_paddr_t ma = VM_PAGE_TO_MACH(ptdpg[i]); - PT_SET_VA_MA(&pmap->pm_pdir[PTDPTDI + i], ma | PG_V | PG_A, FALSE); - } - xen_flush_queue(); - rw_wunlock(&pvh_global_lock); - CPU_ZERO(&pmap->pm_active); - TAILQ_INIT(&pmap->pm_pvchunk); - bzero(&pmap->pm_stats, sizeof pmap->pm_stats); - -#ifdef HAMFISTED_LOCKING - mtx_unlock(&createdelete_lock); -#endif - return (1); -} - -/* - * this routine is called if the page table page is not - * mapped correctly. - */ -static vm_page_t -_pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) -{ - vm_paddr_t ptema; - vm_page_t m; - - /* - * Allocate a page table page. - */ - if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { - if ((flags & PMAP_ENTER_NOSLEEP) == 0) { - PMAP_UNLOCK(pmap); - rw_wunlock(&pvh_global_lock); - VM_WAIT; - rw_wlock(&pvh_global_lock); - PMAP_LOCK(pmap); - } - - /* - * Indicate the need to retry. While waiting, the page table - * page may have been allocated. - */ - return (NULL); - } - if ((m->flags & PG_ZERO) == 0) - pmap_zero_page(m); - - /* - * Map the pagetable page into the process address space, if - * it isn't already there. - */ - - pmap->pm_stats.resident_count++; - - ptema = VM_PAGE_TO_MACH(m); - xen_pt_pin(ptema); - PT_SET_VA_MA(&pmap->pm_pdir[ptepindex], - (ptema | PG_U | PG_RW | PG_V | PG_A | PG_M), TRUE); - - KASSERT(pmap->pm_pdir[ptepindex], - ("_pmap_allocpte: ptepindex=%d did not get mapped", ptepindex)); - return (m); -} - -static vm_page_t -pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) -{ - u_int ptepindex; - pd_entry_t ptema; - vm_page_t m; - - /* - * Calculate pagetable page index - */ - ptepindex = va >> PDRSHIFT; -retry: - /* - * Get the page directory entry - */ - ptema = pmap->pm_pdir[ptepindex]; - - /* - * This supports switching from a 4MB page to a - * normal 4K page. - */ - if (ptema & PG_PS) { - /* - * XXX - */ - pmap->pm_pdir[ptepindex] = 0; - ptema = 0; - pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; - pmap_invalidate_all(kernel_pmap); - } - - /* - * If the page table page is mapped, we just increment the - * hold count, and activate it. - */ - if (ptema & PG_V) { - m = PHYS_TO_VM_PAGE(xpmap_mtop(ptema) & PG_FRAME); - m->wire_count++; - } else { - /* - * Here if the pte page isn't mapped, or if it has - * been deallocated. - */ - CTR3(KTR_PMAP, "pmap_allocpte: pmap=%p va=0x%08x flags=0x%x", - pmap, va, flags); - m = _pmap_allocpte(pmap, ptepindex, flags); - if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) - goto retry; - - KASSERT(pmap->pm_pdir[ptepindex], ("ptepindex=%d did not get mapped", ptepindex)); - } - return (m); -} - - -/*************************************************** -* Pmap allocation/deallocation routines. - ***************************************************/ - - -/* - * Release any resources held by the given physical map. - * Called when a pmap initialized by pmap_pinit is being released. - * Should only be called if the map contains no valid mappings. - */ -void -pmap_release(pmap_t pmap) -{ - vm_page_t m, ptdpg[2*NPGPTD+1]; - vm_paddr_t ma; - int i; -#ifdef PAE - int npgptd = NPGPTD + 1; -#else - int npgptd = NPGPTD; -#endif - - KASSERT(pmap->pm_stats.resident_count == 0, - ("pmap_release: pmap resident count %ld != 0", - pmap->pm_stats.resident_count)); - PT_UPDATES_FLUSH(); - -#ifdef HAMFISTED_LOCKING - mtx_lock(&createdelete_lock); -#endif - - KASSERT(CPU_EMPTY(&pmap->pm_active), - ("releasing active pmap %p", pmap)); - mtx_lock_spin(&allpmaps_lock); - LIST_REMOVE(pmap, pm_list); - mtx_unlock_spin(&allpmaps_lock); - - for (i = 0; i < NPGPTD; i++) - ptdpg[i] = PHYS_TO_VM_PAGE(vtophys(pmap->pm_pdir + (i*NPDEPG)) & PG_FRAME); - pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); -#ifdef PAE - ptdpg[NPGPTD] = PHYS_TO_VM_PAGE(vtophys(pmap->pm_pdpt)); -#endif - - for (i = 0; i < npgptd; i++) { - m = ptdpg[i]; - ma = VM_PAGE_TO_MACH(m); - /* unpinning L1 and L2 treated the same */ -#if 0 - xen_pgd_unpin(ma); -#else - if (i == NPGPTD) - xen_pgd_unpin(ma); -#endif -#ifdef PAE - if (i < NPGPTD) - KASSERT(VM_PAGE_TO_MACH(m) == (pmap->pm_pdpt[i] & PG_FRAME), - ("pmap_release: got wrong ptd page")); -#endif - m->wire_count--; - atomic_subtract_int(&vm_cnt.v_wire_count, 1); - vm_page_free(m); - } -#ifdef PAE - pmap_qremove((vm_offset_t)pmap->pm_pdpt, 1); -#endif - -#ifdef HAMFISTED_LOCKING - mtx_unlock(&createdelete_lock); -#endif -} - -static int -kvm_size(SYSCTL_HANDLER_ARGS) -{ - unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; - - return (sysctl_handle_long(oidp, &ksize, 0, req)); -} -SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, - 0, 0, kvm_size, "IU", "Size of KVM"); - -static int -kvm_free(SYSCTL_HANDLER_ARGS) -{ - unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; - - return (sysctl_handle_long(oidp, &kfree, 0, req)); -} -SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, - 0, 0, kvm_free, "IU", "Amount of KVM free"); - -/* - * grow the number of kernel page table entries, if needed - */ -void -pmap_growkernel(vm_offset_t addr) -{ - struct pmap *pmap; - vm_paddr_t ptppaddr; - vm_page_t nkpg; - pd_entry_t newpdir; - - mtx_assert(&kernel_map->system_mtx, MA_OWNED); - if (kernel_vm_end == 0) { - kernel_vm_end = KERNBASE; - nkpt = 0; - while (pdir_pde(PTD, kernel_vm_end)) { - kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); - nkpt++; - if (kernel_vm_end - 1 >= kernel_map->max_offset) { - kernel_vm_end = kernel_map->max_offset; - break; - } - } - } - addr = roundup2(addr, NBPDR); - if (addr - 1 >= kernel_map->max_offset) - addr = kernel_map->max_offset; - while (kernel_vm_end < addr) { - if (pdir_pde(PTD, kernel_vm_end)) { - kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; - if (kernel_vm_end - 1 >= kernel_map->max_offset) { - kernel_vm_end = kernel_map->max_offset; - break; - } - continue; - } - - nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, - VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | - VM_ALLOC_ZERO); - if (nkpg == NULL) - panic("pmap_growkernel: no memory to grow kernel"); - - nkpt++; - - if ((nkpg->flags & PG_ZERO) == 0) - pmap_zero_page(nkpg); - ptppaddr = VM_PAGE_TO_PHYS(nkpg); - newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); - rw_wlock(&pvh_global_lock); - PD_SET_VA(kernel_pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE); - mtx_lock_spin(&allpmaps_lock); - LIST_FOREACH(pmap, &allpmaps, pm_list) - PD_SET_VA(pmap, (kernel_vm_end >> PDRSHIFT), newpdir, TRUE); - - mtx_unlock_spin(&allpmaps_lock); - rw_wunlock(&pvh_global_lock); - - kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; - if (kernel_vm_end - 1 >= kernel_map->max_offset) { - kernel_vm_end = kernel_map->max_offset; - break; - } - } -} - - -/*************************************************** - * page management routines. - ***************************************************/ - -CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); -CTASSERT(_NPCM == 11); -CTASSERT(_NPCPV == 336); - -static __inline struct pv_chunk * -pv_to_chunk(pv_entry_t pv) -{ - - return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); -} - -#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) - -#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ -#define PC_FREE10 0x0000fffful /* Free values for index 10 */ - -static const uint32_t pc_freemask[_NPCM] = { - PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, - PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, - PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, - PC_FREE0_9, PC_FREE10 -}; - -SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, - "Current number of pv entries"); - -#ifdef PV_STATS -static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; - -SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, - "Current number of pv entry chunks"); -SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, - "Current number of pv entry chunks allocated"); -SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, - "Current number of pv entry chunks frees"); -SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, - "Number of times tried to get a chunk page but failed."); - -static long pv_entry_frees, pv_entry_allocs; -static int pv_entry_spare; - -SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, - "Current number of pv entry frees"); -SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, - "Current number of pv entry allocs"); -SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, - "Current number of spare pv entries"); -#endif - -/* - * We are in a serious low memory condition. Resort to - * drastic measures to free some pages so we can allocate - * another pv entry chunk. - */ -static vm_page_t -pmap_pv_reclaim(pmap_t locked_pmap) -{ - struct pch newtail; - struct pv_chunk *pc; - pmap_t pmap; - pt_entry_t *pte, tpte; - pv_entry_t pv; - vm_offset_t va; - vm_page_t free, m, m_pc; - uint32_t inuse; - int bit, field, freed; - - PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); - pmap = NULL; - free = m_pc = NULL; - TAILQ_INIT(&newtail); - while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || - free == NULL)) { - TAILQ_REMOVE(&pv_chunks, pc, pc_lru); - if (pmap != pc->pc_pmap) { - if (pmap != NULL) { - pmap_invalidate_all(pmap); - if (pmap != locked_pmap) - PMAP_UNLOCK(pmap); - } - pmap = pc->pc_pmap; - /* Avoid deadlock and lock recursion. */ - if (pmap > locked_pmap) - PMAP_LOCK(pmap); - else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { - pmap = NULL; - TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); - continue; - } - } - - /* - * Destroy every non-wired, 4 KB page mapping in the chunk. - */ - freed = 0; - for (field = 0; field < _NPCM; field++) { - for (inuse = ~pc->pc_map[field] & pc_freemask[field]; - inuse != 0; inuse &= ~(1UL << bit)) { - bit = bsfl(inuse); - pv = &pc->pc_pventry[field * 32 + bit]; - va = pv->pv_va; - pte = pmap_pte(pmap, va); - tpte = *pte; - if ((tpte & PG_W) == 0) - tpte = pte_load_clear(pte); - pmap_pte_release(pte); - if ((tpte & PG_W) != 0) - continue; - KASSERT(tpte != 0, - ("pmap_pv_reclaim: pmap %p va %x zero pte", - pmap, va)); - if ((tpte & PG_G) != 0) - pmap_invalidate_page(pmap, va); - m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); - if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) - vm_page_dirty(m); - if ((tpte & PG_A) != 0) - vm_page_aflag_set(m, PGA_REFERENCED); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_aflag_clear(m, PGA_WRITEABLE); - pc->pc_map[field] |= 1UL << bit; - pmap_unuse_pt(pmap, va, &free); - freed++; - } - } - if (freed == 0) { - TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); - continue; - } - /* Every freed mapping is for a 4 KB page. */ - pmap->pm_stats.resident_count -= freed; - PV_STAT(pv_entry_frees += freed); - PV_STAT(pv_entry_spare += freed); - pv_entry_count -= freed; - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - for (field = 0; field < _NPCM; field++) - if (pc->pc_map[field] != pc_freemask[field]) { - TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, - pc_list); - TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); - - /* - * One freed pv entry in locked_pmap is - * sufficient. - */ - if (pmap == locked_pmap) - goto out; - break; - } - if (field == _NPCM) { - PV_STAT(pv_entry_spare -= _NPCPV); - PV_STAT(pc_chunk_count--); - PV_STAT(pc_chunk_frees++); - /* Entire chunk is free; return it. */ - m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); - pmap_qremove((vm_offset_t)pc, 1); - pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); - break; - } - } -out: - TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); - if (pmap != NULL) { - pmap_invalidate_all(pmap); - if (pmap != locked_pmap) - PMAP_UNLOCK(pmap); - } - if (m_pc == NULL && pv_vafree != 0 && free != NULL) { - m_pc = free; - free = (void *)m_pc->object; - /* Recycle a freed page table page. */ - m_pc->wire_count = 1; - atomic_add_int(&vm_cnt.v_wire_count, 1); - } - pmap_free_zero_pages(free); - return (m_pc); -} - -/* - * free the pv_entry back to the free list - */ -static void -free_pv_entry(pmap_t pmap, pv_entry_t pv) -{ - struct pv_chunk *pc; - int idx, field, bit; - - rw_assert(&pvh_global_lock, RA_WLOCKED); - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - PV_STAT(pv_entry_frees++); - PV_STAT(pv_entry_spare++); - pv_entry_count--; - pc = pv_to_chunk(pv); - idx = pv - &pc->pc_pventry[0]; - field = idx / 32; - bit = idx % 32; - pc->pc_map[field] |= 1ul << bit; - for (idx = 0; idx < _NPCM; idx++) - if (pc->pc_map[idx] != pc_freemask[idx]) { - /* - * 98% of the time, pc is already at the head of the - * list. If it isn't already, move it to the head. - */ - if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != - pc)) { - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, - pc_list); - } - return; - } - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - free_pv_chunk(pc); -} - -static void -free_pv_chunk(struct pv_chunk *pc) -{ - vm_page_t m; - - TAILQ_REMOVE(&pv_chunks, pc, pc_lru); - PV_STAT(pv_entry_spare -= _NPCPV); - PV_STAT(pc_chunk_count--); - PV_STAT(pc_chunk_frees++); - /* entire chunk is free, return it */ - m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); - pmap_qremove((vm_offset_t)pc, 1); - vm_page_unwire(m, PQ_INACTIVE); - vm_page_free(m); - pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); -} - -/* - * get a new pv_entry, allocating a block from the system - * when needed. - */ -static pv_entry_t -get_pv_entry(pmap_t pmap, boolean_t try) -{ - static const struct timeval printinterval = { 60, 0 }; - static struct timeval lastprint; - int bit, field; - pv_entry_t pv; - struct pv_chunk *pc; - vm_page_t m; - - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - rw_assert(&pvh_global_lock, RA_WLOCKED); - PV_STAT(pv_entry_allocs++); - pv_entry_count++; - if (pv_entry_count > pv_entry_high_water) - if (ratecheck(&lastprint, &printinterval)) - printf("Approaching the limit on PV entries, consider " - "increasing either the vm.pmap.shpgperproc or the " - "vm.pmap.pv_entry_max tunable.\n"); -retry: - pc = TAILQ_FIRST(&pmap->pm_pvchunk); - if (pc != NULL) { - for (field = 0; field < _NPCM; field++) { - if (pc->pc_map[field]) { - bit = bsfl(pc->pc_map[field]); - break; - } - } - if (field < _NPCM) { - pv = &pc->pc_pventry[field * 32 + bit]; - pc->pc_map[field] &= ~(1ul << bit); - /* If this was the last item, move it to tail */ - for (field = 0; field < _NPCM; field++) - if (pc->pc_map[field] != 0) { - PV_STAT(pv_entry_spare--); - return (pv); /* not full, return */ - } - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); - PV_STAT(pv_entry_spare--); - return (pv); - } - } - /* - * Access to the ptelist "pv_vafree" is synchronized by the page - * queues lock. If "pv_vafree" is currently non-empty, it will - * remain non-empty until pmap_ptelist_alloc() completes. - */ - if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | - VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { - if (try) { - pv_entry_count--; - PV_STAT(pc_chunk_tryfail++); - return (NULL); - } - m = pmap_pv_reclaim(pmap); - if (m == NULL) - goto retry; - } - PV_STAT(pc_chunk_count++); - PV_STAT(pc_chunk_allocs++); - pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); - pmap_qenter((vm_offset_t)pc, &m, 1); - if ((m->flags & PG_ZERO) == 0) - pagezero(pc); - pc->pc_pmap = pmap; - pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ - for (field = 1; field < _NPCM; field++) - pc->pc_map[field] = pc_freemask[field]; - TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); - pv = &pc->pc_pventry[0]; - TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); - PV_STAT(pv_entry_spare += _NPCPV - 1); - return (pv); -} - -static __inline pv_entry_t -pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) -{ - pv_entry_t pv; - - rw_assert(&pvh_global_lock, RA_WLOCKED); - TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { - if (pmap == PV_PMAP(pv) && va == pv->pv_va) { - TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); - break; - } - } - return (pv); -} - -static void -pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) -{ - pv_entry_t pv; - - pv = pmap_pvh_remove(pvh, pmap, va); - KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); - free_pv_entry(pmap, pv); -} - -static void -pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) -{ - - rw_assert(&pvh_global_lock, RA_WLOCKED); - pmap_pvh_free(&m->md, pmap, va); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_aflag_clear(m, PGA_WRITEABLE); -} - -/* - * Conditionally create a pv entry. - */ -static boolean_t -pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) -{ - pv_entry_t pv; - - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - rw_assert(&pvh_global_lock, RA_WLOCKED); - if (pv_entry_count < pv_entry_high_water && - (pv = get_pv_entry(pmap, TRUE)) != NULL) { - pv->pv_va = va; - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); - return (TRUE); - } else - return (FALSE); -} - -/* - * pmap_remove_pte: do the things to unmap a page in a process - */ -static int -pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free) -{ - pt_entry_t oldpte; - vm_page_t m; - - CTR3(KTR_PMAP, "pmap_remove_pte: pmap=%p *ptq=0x%x va=0x%x", - pmap, (u_long)*ptq, va); - - rw_assert(&pvh_global_lock, RA_WLOCKED); - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - oldpte = *ptq; - PT_SET_VA_MA(ptq, 0, TRUE); - KASSERT(oldpte != 0, - ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); - if (oldpte & PG_W) - pmap->pm_stats.wired_count -= 1; - /* - * Machines that don't support invlpg, also don't support - * PG_G. - */ - if (oldpte & PG_G) - pmap_invalidate_page(kernel_pmap, va); - pmap->pm_stats.resident_count -= 1; - if (oldpte & PG_MANAGED) { - m = PHYS_TO_VM_PAGE(xpmap_mtop(oldpte) & PG_FRAME); - if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) - vm_page_dirty(m); - if (oldpte & PG_A) - vm_page_aflag_set(m, PGA_REFERENCED); - pmap_remove_entry(pmap, m, va); - } - return (pmap_unuse_pt(pmap, va, free)); -} - -/* - * Remove a single page from a process address space - */ -static void -pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free) -{ - pt_entry_t *pte; - - CTR2(KTR_PMAP, "pmap_remove_page: pmap=%p va=0x%x", - pmap, va); - - rw_assert(&pvh_global_lock, RA_WLOCKED); - KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - if ((pte = pmap_pte_quick(pmap, va)) == NULL || (*pte & PG_V) == 0) - return; - pmap_remove_pte(pmap, pte, va, free); - pmap_invalidate_page(pmap, va); - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - -} - -/* - * Remove the given range of addresses from the specified map. - * - * It is assumed that the start and end are properly - * rounded to the page size. - */ -void -pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) -{ - vm_offset_t pdnxt; - pd_entry_t ptpaddr; - pt_entry_t *pte; - vm_page_t free = NULL; - int anyvalid; - - CTR3(KTR_PMAP, "pmap_remove: pmap=%p sva=0x%x eva=0x%x", - pmap, sva, eva); - - /* - * Perform an unsynchronized read. This is, however, safe. - */ - if (pmap->pm_stats.resident_count == 0) - return; - - anyvalid = 0; - - rw_wlock(&pvh_global_lock); - sched_pin(); - PMAP_LOCK(pmap); - - /* - * special handling of removing one page. a very - * common operation and easy to short circuit some - * code. - */ - if ((sva + PAGE_SIZE == eva) && - ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { - pmap_remove_page(pmap, sva, &free); - goto out; - } - - for (; sva < eva; sva = pdnxt) { - u_int pdirindex; - - /* - * Calculate index for next page table. - */ - pdnxt = (sva + NBPDR) & ~PDRMASK; - if (pdnxt < sva) - pdnxt = eva; - if (pmap->pm_stats.resident_count == 0) - break; - - pdirindex = sva >> PDRSHIFT; - ptpaddr = pmap->pm_pdir[pdirindex]; - - /* - * Weed out invalid mappings. Note: we assume that the page - * directory table is always allocated, and in kernel virtual. - */ - if (ptpaddr == 0) - continue; - - /* - * Check for large page. - */ - if ((ptpaddr & PG_PS) != 0) { - PD_CLEAR_VA(pmap, pdirindex, TRUE); - pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; - anyvalid = 1; - continue; - } - - /* - * Limit our scan to either the end of the va represented - * by the current page table page, or to the end of the - * range being removed. - */ - if (pdnxt > eva) - pdnxt = eva; - - for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, - sva += PAGE_SIZE) { - if ((*pte & PG_V) == 0) - continue; - - /* - * The TLB entry for a PG_G mapping is invalidated - * by pmap_remove_pte(). - */ - if ((*pte & PG_G) == 0) - anyvalid = 1; - if (pmap_remove_pte(pmap, pte, sva, &free)) - break; - } - } - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_VA_MA(PMAP1, 0, TRUE); -out: - if (anyvalid) - pmap_invalidate_all(pmap); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); - pmap_free_zero_pages(free); -} - -/* - * Routine: pmap_remove_all - * Function: - * Removes this physical page from - * all physical maps in which it resides. - * Reflects back modify bits to the pager. - * - * Notes: - * Original versions of this routine were very - * inefficient because they iteratively called - * pmap_remove (slow...) - */ - -void -pmap_remove_all(vm_page_t m) -{ - pv_entry_t pv; - pmap_t pmap; - pt_entry_t *pte, tpte; - vm_page_t free; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_remove_all: page %p is not managed", m)); - free = NULL; - rw_wlock(&pvh_global_lock); - sched_pin(); - while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pmap->pm_stats.resident_count--; - pte = pmap_pte_quick(pmap, pv->pv_va); - tpte = *pte; - PT_SET_VA_MA(pte, 0, TRUE); - KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", - pmap, pv->pv_va)); - if (tpte & PG_W) - pmap->pm_stats.wired_count--; - if (tpte & PG_A) - vm_page_aflag_set(m, PGA_REFERENCED); - - /* - * Update the vm_page_t clean and reference bits. - */ - if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) - vm_page_dirty(m); - pmap_unuse_pt(pmap, pv->pv_va, &free); - pmap_invalidate_page(pmap, pv->pv_va); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); - free_pv_entry(pmap, pv); - PMAP_UNLOCK(pmap); - } - vm_page_aflag_clear(m, PGA_WRITEABLE); - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - pmap_free_zero_pages(free); -} - -/* - * Set the physical protection on the - * specified range of this map as requested. - */ -void -pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) -{ - vm_offset_t pdnxt; - pd_entry_t ptpaddr; - pt_entry_t *pte; - int anychanged; - - CTR4(KTR_PMAP, "pmap_protect: pmap=%p sva=0x%x eva=0x%x prot=0x%x", - pmap, sva, eva, prot); - - if ((prot & VM_PROT_READ) == VM_PROT_NONE) { - pmap_remove(pmap, sva, eva); - return; - } - -#ifdef PAE - if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == - (VM_PROT_WRITE|VM_PROT_EXECUTE)) - return; -#else - if (prot & VM_PROT_WRITE) - return; -#endif - - anychanged = 0; - - rw_wlock(&pvh_global_lock); - sched_pin(); - PMAP_LOCK(pmap); - for (; sva < eva; sva = pdnxt) { - pt_entry_t obits, pbits; - u_int pdirindex; - - pdnxt = (sva + NBPDR) & ~PDRMASK; - if (pdnxt < sva) - pdnxt = eva; - - pdirindex = sva >> PDRSHIFT; - ptpaddr = pmap->pm_pdir[pdirindex]; - - /* - * Weed out invalid mappings. Note: we assume that the page - * directory table is always allocated, and in kernel virtual. - */ - if (ptpaddr == 0) - continue; - - /* - * Check for large page. - */ - if ((ptpaddr & PG_PS) != 0) { - if ((prot & VM_PROT_WRITE) == 0) - pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); -#ifdef PAE - if ((prot & VM_PROT_EXECUTE) == 0) - pmap->pm_pdir[pdirindex] |= pg_nx; -#endif - anychanged = 1; - continue; - } - - if (pdnxt > eva) - pdnxt = eva; - - for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, - sva += PAGE_SIZE) { - vm_page_t m; - -retry: - /* - * Regardless of whether a pte is 32 or 64 bits in - * size, PG_RW, PG_A, and PG_M are among the least - * significant 32 bits. - */ - obits = pbits = *pte; - if ((pbits & PG_V) == 0) - continue; - - if ((prot & VM_PROT_WRITE) == 0) { - if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == - (PG_MANAGED | PG_M | PG_RW)) { - m = PHYS_TO_VM_PAGE(xpmap_mtop(pbits) & - PG_FRAME); - vm_page_dirty(m); - } - pbits &= ~(PG_RW | PG_M); - } -#ifdef PAE - if ((prot & VM_PROT_EXECUTE) == 0) - pbits |= pg_nx; -#endif - - if (pbits != obits) { - obits = *pte; - PT_SET_VA_MA(pte, pbits, TRUE); - if (*pte != pbits) - goto retry; - if (obits & PG_G) - pmap_invalidate_page(pmap, sva); - else - anychanged = 1; - } - } - } - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_VA_MA(PMAP1, 0, TRUE); - if (anychanged) - pmap_invalidate_all(pmap); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); -} - -/* - * Insert the given physical page (p) at - * the specified virtual address (v) in the - * target physical map with the protection requested. - * - * If specified, the page will be wired down, meaning - * that the related pte can not be reclaimed. - * - * NB: This is the only routine which MAY NOT lazy-evaluate - * or lose information. That is, this routine must actually - * insert this page into the given map NOW. - */ -int -pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - u_int flags, int8_t psind __unused) -{ - pd_entry_t *pde; - pt_entry_t *pte; - pt_entry_t newpte, origpte; - pv_entry_t pv; - vm_paddr_t opa, pa; - vm_page_t mpte, om; - boolean_t invlva, wired; - - CTR5(KTR_PMAP, - "pmap_enter: pmap=%08p va=0x%08x ma=0x%08x prot=0x%x flags=0x%x", - pmap, va, VM_PAGE_TO_MACH(m), prot, flags); - va = trunc_page(va); - KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); - KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, - ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", - va)); - if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) - VM_OBJECT_ASSERT_LOCKED(m->object); - - mpte = NULL; - wired = (flags & PMAP_ENTER_WIRED) != 0; - - rw_wlock(&pvh_global_lock); - PMAP_LOCK(pmap); - sched_pin(); - - /* - * In the case that a page table page is not - * resident, we are creating it here. - */ - if (va < VM_MAXUSER_ADDRESS) { - mpte = pmap_allocpte(pmap, va, flags); - if (mpte == NULL) { - KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, - ("pmap_allocpte failed with sleep allowed")); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); - return (KERN_RESOURCE_SHORTAGE); - } - } - - pde = pmap_pde(pmap, va); - if ((*pde & PG_PS) != 0) - panic("pmap_enter: attempted pmap_enter on 4MB page"); - pte = pmap_pte_quick(pmap, va); - - /* - * Page Directory table entry not valid, we need a new PT page - */ - if (pte == NULL) { - panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", - (uintmax_t)pmap->pm_pdir[va >> PDRSHIFT], va); - } - - pa = VM_PAGE_TO_PHYS(m); - om = NULL; - opa = origpte = 0; - -#if 0 - KASSERT((*pte & PG_V) || (*pte == 0), ("address set but not valid pte=%p *pte=0x%016jx", - pte, *pte)); -#endif - origpte = *pte; - if (origpte) - origpte = xpmap_mtop(origpte); - opa = origpte & PG_FRAME; - - /* - * Mapping has not changed, must be protection or wiring change. - */ - if (origpte && (opa == pa)) { - /* - * Wiring change, just update stats. We don't worry about - * wiring PT pages as they remain resident as long as there - * are valid mappings in them. Hence, if a user page is wired, - * the PT page will be also. - */ - if (wired && ((origpte & PG_W) == 0)) - pmap->pm_stats.wired_count++; - else if (!wired && (origpte & PG_W)) - pmap->pm_stats.wired_count--; - - /* - * Remove extra pte reference - */ - if (mpte) - mpte->wire_count--; - - if (origpte & PG_MANAGED) { - om = m; - pa |= PG_MANAGED; - } - goto validate; - } - - pv = NULL; - - /* - * Mapping has changed, invalidate old range and fall through to - * handle validating new mapping. - */ - if (opa) { - if (origpte & PG_W) - pmap->pm_stats.wired_count--; - if (origpte & PG_MANAGED) { - om = PHYS_TO_VM_PAGE(opa); - pv = pmap_pvh_remove(&om->md, pmap, va); - } else if (va < VM_MAXUSER_ADDRESS) - printf("va=0x%x is unmanaged :-( \n", va); - - if (mpte != NULL) { - mpte->wire_count--; - KASSERT(mpte->wire_count > 0, - ("pmap_enter: missing reference to page table page," - " va: 0x%x", va)); - } - } else - pmap->pm_stats.resident_count++; - - /* - * Enter on the PV list if part of our managed memory. - */ - if ((m->oflags & VPO_UNMANAGED) == 0) { - KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, - ("pmap_enter: managed mapping within the clean submap")); - if (pv == NULL) - pv = get_pv_entry(pmap, FALSE); - pv->pv_va = va; - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); - pa |= PG_MANAGED; - } else if (pv != NULL) - free_pv_entry(pmap, pv); - - /* - * Increment counters - */ - if (wired) - pmap->pm_stats.wired_count++; - -validate: - /* - * Now validate mapping with desired protection/wiring. - */ - newpte = (pt_entry_t)(pa | PG_V); - if ((prot & VM_PROT_WRITE) != 0) { - newpte |= PG_RW; - if ((newpte & PG_MANAGED) != 0) - vm_page_aflag_set(m, PGA_WRITEABLE); - } -#ifdef PAE - if ((prot & VM_PROT_EXECUTE) == 0) - newpte |= pg_nx; -#endif - if (wired) - newpte |= PG_W; - if (va < VM_MAXUSER_ADDRESS) - newpte |= PG_U; - if (pmap == kernel_pmap) - newpte |= pgeflag; - - critical_enter(); - /* - * if the mapping or permission bits are different, we need - * to update the pte. - */ - if ((origpte & ~(PG_M|PG_A)) != newpte) { - if (origpte) { - invlva = FALSE; - origpte = *pte; - PT_SET_VA(pte, newpte | PG_A, FALSE); - if (origpte & PG_A) { - if (origpte & PG_MANAGED) - vm_page_aflag_set(om, PGA_REFERENCED); - if (opa != VM_PAGE_TO_PHYS(m)) - invlva = TRUE; -#ifdef PAE - if ((origpte & PG_NX) == 0 && - (newpte & PG_NX) != 0) - invlva = TRUE; -#endif - } - if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { - if ((origpte & PG_MANAGED) != 0) - vm_page_dirty(om); - if ((prot & VM_PROT_WRITE) == 0) - invlva = TRUE; - } - if ((origpte & PG_MANAGED) != 0 && - TAILQ_EMPTY(&om->md.pv_list)) - vm_page_aflag_clear(om, PGA_WRITEABLE); - if (invlva) - pmap_invalidate_page(pmap, va); - } else{ - PT_SET_VA(pte, newpte | PG_A, FALSE); - } - - } - PT_UPDATES_FLUSH(); - critical_exit(); - if (*PMAP1) - PT_SET_VA_MA(PMAP1, 0, TRUE); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); - return (KERN_SUCCESS); -} - -/* - * Maps a sequence of resident pages belonging to the same object. - * The sequence begins with the given page m_start. This page is - * mapped at the given virtual address start. Each subsequent page is - * mapped at a virtual address that is offset from start by the same - * amount as the page is offset from m_start within the object. The - * last page in the sequence is the page with the largest offset from - * m_start that can be mapped at a virtual address less than the given - * virtual address end. Not every virtual page between start and end - * is mapped; only those for which a resident page exists with the - * corresponding offset from m_start are mapped. - */ -void -pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, - vm_page_t m_start, vm_prot_t prot) -{ - vm_page_t m, mpte; - vm_pindex_t diff, psize; - multicall_entry_t mcl[16]; - multicall_entry_t *mclp = mcl; - int error, count = 0; - - VM_OBJECT_ASSERT_LOCKED(m_start->object); - - psize = atop(end - start); - mpte = NULL; - m = m_start; - rw_wlock(&pvh_global_lock); - PMAP_LOCK(pmap); - while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { - mpte = pmap_enter_quick_locked(&mclp, &count, pmap, start + ptoa(diff), m, - prot, mpte); - m = TAILQ_NEXT(m, listq); - if (count == 16) { - error = HYPERVISOR_multicall(mcl, count); - KASSERT(error == 0, ("bad multicall %d", error)); - mclp = mcl; - count = 0; - } - } - if (count) { - error = HYPERVISOR_multicall(mcl, count); - KASSERT(error == 0, ("bad multicall %d", error)); - } - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); -} - -/* - * this code makes some *MAJOR* assumptions: - * 1. Current pmap & pmap exists. - * 2. Not wired. - * 3. Read access. - * 4. No page table pages. - * but is *MUCH* faster than pmap_enter... - */ - -void -pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) -{ - multicall_entry_t mcl, *mclp; - int count = 0; - mclp = &mcl; - - CTR4(KTR_PMAP, "pmap_enter_quick: pmap=%p va=0x%x m=%p prot=0x%x", - pmap, va, m, prot); - - rw_wlock(&pvh_global_lock); - PMAP_LOCK(pmap); - (void)pmap_enter_quick_locked(&mclp, &count, pmap, va, m, prot, NULL); - if (count) - HYPERVISOR_multicall(&mcl, count); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); -} - -#ifdef notyet -void -pmap_enter_quick_range(pmap_t pmap, vm_offset_t *addrs, vm_page_t *pages, vm_prot_t *prots, int count) -{ - int i, error, index = 0; - multicall_entry_t mcl[16]; - multicall_entry_t *mclp = mcl; - - PMAP_LOCK(pmap); - for (i = 0; i < count; i++, addrs++, pages++, prots++) { - if (!pmap_is_prefaultable_locked(pmap, *addrs)) - continue; - - (void) pmap_enter_quick_locked(&mclp, &index, pmap, *addrs, *pages, *prots, NULL); - if (index == 16) { - error = HYPERVISOR_multicall(mcl, index); - mclp = mcl; - index = 0; - KASSERT(error == 0, ("bad multicall %d", error)); - } - } - if (index) { - error = HYPERVISOR_multicall(mcl, index); - KASSERT(error == 0, ("bad multicall %d", error)); - } - - PMAP_UNLOCK(pmap); -} -#endif - -static vm_page_t -pmap_enter_quick_locked(multicall_entry_t **mclpp, int *count, pmap_t pmap, vm_offset_t va, vm_page_t m, - vm_prot_t prot, vm_page_t mpte) -{ - pt_entry_t *pte; - vm_paddr_t pa; - vm_page_t free; - multicall_entry_t *mcl = *mclpp; - - KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || - (m->oflags & VPO_UNMANAGED) != 0, - ("pmap_enter_quick_locked: managed mapping within the clean submap")); - rw_assert(&pvh_global_lock, RA_WLOCKED); - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - - /* - * In the case that a page table page is not - * resident, we are creating it here. - */ - if (va < VM_MAXUSER_ADDRESS) { - u_int ptepindex; - pd_entry_t ptema; - - /* - * Calculate pagetable page index - */ - ptepindex = va >> PDRSHIFT; - if (mpte && (mpte->pindex == ptepindex)) { - mpte->wire_count++; - } else { - /* - * Get the page directory entry - */ - ptema = pmap->pm_pdir[ptepindex]; - - /* - * If the page table page is mapped, we just increment - * the hold count, and activate it. - */ - if (ptema & PG_V) { - if (ptema & PG_PS) - panic("pmap_enter_quick: unexpected mapping into 4MB page"); - mpte = PHYS_TO_VM_PAGE(xpmap_mtop(ptema) & PG_FRAME); - mpte->wire_count++; - } else { - mpte = _pmap_allocpte(pmap, ptepindex, - PMAP_ENTER_NOSLEEP); - if (mpte == NULL) - return (mpte); - } - } - } else { - mpte = NULL; - } - - /* - * This call to vtopte makes the assumption that we are - * entering the page into the current pmap. In order to support - * quick entry into any pmap, one would likely use pmap_pte_quick. - * But that isn't as quick as vtopte. - */ - KASSERT(pmap_is_current(pmap), ("entering pages in non-current pmap")); - pte = vtopte(va); - if (*pte & PG_V) { - if (mpte != NULL) { - mpte->wire_count--; - mpte = NULL; - } - return (mpte); - } - - /* - * Enter on the PV list if part of our managed memory. - */ - if ((m->oflags & VPO_UNMANAGED) == 0 && - !pmap_try_insert_pv_entry(pmap, va, m)) { - if (mpte != NULL) { - free = NULL; - if (pmap_unwire_ptp(pmap, mpte, &free)) { - pmap_invalidate_page(pmap, va); - pmap_free_zero_pages(free); - } - - mpte = NULL; - } - return (mpte); - } - - /* - * Increment counters - */ - pmap->pm_stats.resident_count++; - - pa = VM_PAGE_TO_PHYS(m); -#ifdef PAE - if ((prot & VM_PROT_EXECUTE) == 0) - pa |= pg_nx; -#endif - -#if 0 - /* - * Now validate mapping with RO protection - */ - if ((m->oflags & VPO_UNMANAGED) != 0) - pte_store(pte, pa | PG_V | PG_U); - else - pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); -#else - /* - * Now validate mapping with RO protection - */ - if ((m->oflags & VPO_UNMANAGED) != 0) - pa = xpmap_ptom(pa | PG_V | PG_U); - else - pa = xpmap_ptom(pa | PG_V | PG_U | PG_MANAGED); - - mcl->op = __HYPERVISOR_update_va_mapping; - mcl->args[0] = va; - mcl->args[1] = (uint32_t)(pa & 0xffffffff); - mcl->args[2] = (uint32_t)(pa >> 32); - mcl->args[3] = 0; - *mclpp = mcl + 1; - *count = *count + 1; -#endif - return (mpte); -} - -/* - * Make a temporary mapping for a physical address. This is only intended - * to be used for panic dumps. - */ -void * -pmap_kenter_temporary(vm_paddr_t pa, int i) -{ - vm_offset_t va; - vm_paddr_t ma = xpmap_ptom(pa); - - va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); - PT_SET_MA(va, (ma & ~PAGE_MASK) | PG_V | pgeflag); - invlpg(va); - return ((void *)crashdumpmap); -} - -/* - * This code maps large physical mmap regions into the - * processor address space. Note that some shortcuts - * are taken, but the code works. - */ -void -pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, - vm_pindex_t pindex, vm_size_t size) -{ - pd_entry_t *pde; - vm_paddr_t pa, ptepa; - vm_page_t p; - int pat_mode; - - VM_OBJECT_ASSERT_WLOCKED(object); - KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, - ("pmap_object_init_pt: non-device object")); - if (pseflag && - (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { - if (!vm_object_populate(object, pindex, pindex + atop(size))) - return; - p = vm_page_lookup(object, pindex); - KASSERT(p->valid == VM_PAGE_BITS_ALL, - ("pmap_object_init_pt: invalid page %p", p)); - pat_mode = p->md.pat_mode; - - /* - * Abort the mapping if the first page is not physically - * aligned to a 2/4MB page boundary. - */ - ptepa = VM_PAGE_TO_PHYS(p); - if (ptepa & (NBPDR - 1)) - return; - - /* - * Skip the first page. Abort the mapping if the rest of - * the pages are not physically contiguous or have differing - * memory attributes. - */ - p = TAILQ_NEXT(p, listq); - for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; - pa += PAGE_SIZE) { - KASSERT(p->valid == VM_PAGE_BITS_ALL, - ("pmap_object_init_pt: invalid page %p", p)); - if (pa != VM_PAGE_TO_PHYS(p) || - pat_mode != p->md.pat_mode) - return; - p = TAILQ_NEXT(p, listq); - } - - /* - * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and - * "size" is a multiple of 2/4M, adding the PAT setting to - * "pa" will not affect the termination of this loop. - */ - PMAP_LOCK(pmap); - for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + - size; pa += NBPDR) { - pde = pmap_pde(pmap, addr); - if (*pde == 0) { - pde_store(pde, pa | PG_PS | PG_M | PG_A | - PG_U | PG_RW | PG_V); - pmap->pm_stats.resident_count += NBPDR / - PAGE_SIZE; - pmap_pde_mappings++; - } - /* Else continue on if the PDE is already valid. */ - addr += NBPDR; - } - PMAP_UNLOCK(pmap); - } -} - -/* - * Clear the wired attribute from the mappings for the specified range of - * addresses in the given pmap. Every valid mapping within that range - * must have the wired attribute set. In contrast, invalid mappings - * cannot have the wired attribute set, so they are ignored. - * - * The wired attribute of the page table entry is not a hardware feature, - * so there is no need to invalidate any TLB entries. - */ -void -pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) -{ - vm_offset_t pdnxt; - pd_entry_t *pde; - pt_entry_t *pte; - - CTR3(KTR_PMAP, "pmap_unwire: pmap=%p sva=0x%x eva=0x%x", pmap, sva, - eva); - rw_wlock(&pvh_global_lock); - sched_pin(); - PMAP_LOCK(pmap); - for (; sva < eva; sva = pdnxt) { - pdnxt = (sva + NBPDR) & ~PDRMASK; - if (pdnxt < sva) - pdnxt = eva; - pde = pmap_pde(pmap, sva); - if ((*pde & PG_V) == 0) - continue; - if ((*pde & PG_PS) != 0) - panic("pmap_unwire: unexpected PG_PS in pde %#jx", - (uintmax_t)*pde); - if (pdnxt > eva) - pdnxt = eva; - for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, - sva += PAGE_SIZE) { - if ((*pte & PG_V) == 0) - continue; - if ((*pte & PG_W) == 0) - panic("pmap_unwire: pte %#jx is missing PG_W", - (uintmax_t)*pte); - PT_SET_VA_MA(pte, *pte & ~PG_W, FALSE); - pmap->pm_stats.wired_count--; - } - } - if (*PMAP1) - PT_CLEAR_VA(PMAP1, FALSE); - PT_UPDATES_FLUSH(); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); -} - - -/* - * Copy the range specified by src_addr/len - * from the source map to the range dst_addr/len - * in the destination map. - * - * This routine is only advisory and need not do anything. - */ - -void -pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, - vm_offset_t src_addr) -{ - vm_page_t free; - vm_offset_t addr; - vm_offset_t end_addr = src_addr + len; - vm_offset_t pdnxt; - - if (dst_addr != src_addr) - return; - - if (!pmap_is_current(src_pmap)) { - CTR2(KTR_PMAP, - "pmap_copy, skipping: pdir[PTDPTDI]=0x%jx PTDpde[0]=0x%jx", - (src_pmap->pm_pdir[PTDPTDI] & PG_FRAME), (PTDpde[0] & PG_FRAME)); - - return; - } - CTR5(KTR_PMAP, "pmap_copy: dst_pmap=%p src_pmap=%p dst_addr=0x%x len=%d src_addr=0x%x", - dst_pmap, src_pmap, dst_addr, len, src_addr); - -#ifdef HAMFISTED_LOCKING - mtx_lock(&createdelete_lock); -#endif - - rw_wlock(&pvh_global_lock); - if (dst_pmap < src_pmap) { - PMAP_LOCK(dst_pmap); - PMAP_LOCK(src_pmap); - } else { - PMAP_LOCK(src_pmap); - PMAP_LOCK(dst_pmap); - } - sched_pin(); - for (addr = src_addr; addr < end_addr; addr = pdnxt) { - pt_entry_t *src_pte, *dst_pte; - vm_page_t dstmpte, srcmpte; - pd_entry_t srcptepaddr; - u_int ptepindex; - - KASSERT(addr < UPT_MIN_ADDRESS, - ("pmap_copy: invalid to pmap_copy page tables")); - - pdnxt = (addr + NBPDR) & ~PDRMASK; - if (pdnxt < addr) - pdnxt = end_addr; - ptepindex = addr >> PDRSHIFT; - - srcptepaddr = PT_GET(&src_pmap->pm_pdir[ptepindex]); - if (srcptepaddr == 0) - continue; - - if (srcptepaddr & PG_PS) { - if (dst_pmap->pm_pdir[ptepindex] == 0) { - PD_SET_VA(dst_pmap, ptepindex, srcptepaddr & ~PG_W, TRUE); - dst_pmap->pm_stats.resident_count += - NBPDR / PAGE_SIZE; - } - continue; - } - - srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); - KASSERT(srcmpte->wire_count > 0, - ("pmap_copy: source page table page is unused")); - - if (pdnxt > end_addr) - pdnxt = end_addr; - - src_pte = vtopte(addr); - while (addr < pdnxt) { - pt_entry_t ptetemp; - ptetemp = *src_pte; - /* - * we only virtual copy managed pages - */ - if ((ptetemp & PG_MANAGED) != 0) { - dstmpte = pmap_allocpte(dst_pmap, addr, - PMAP_ENTER_NOSLEEP); - if (dstmpte == NULL) - goto out; - dst_pte = pmap_pte_quick(dst_pmap, addr); - if (*dst_pte == 0 && - pmap_try_insert_pv_entry(dst_pmap, addr, - PHYS_TO_VM_PAGE(xpmap_mtop(ptetemp) & PG_FRAME))) { - /* - * Clear the wired, modified, and - * accessed (referenced) bits - * during the copy. - */ - KASSERT(ptetemp != 0, ("src_pte not set")); - PT_SET_VA_MA(dst_pte, ptetemp & ~(PG_W | PG_M | PG_A), TRUE /* XXX debug */); - KASSERT(*dst_pte == (ptetemp & ~(PG_W | PG_M | PG_A)), - ("no pmap copy expected: 0x%jx saw: 0x%jx", - ptetemp & ~(PG_W | PG_M | PG_A), *dst_pte)); - dst_pmap->pm_stats.resident_count++; - } else { - free = NULL; - if (pmap_unwire_ptp(dst_pmap, dstmpte, - &free)) { - pmap_invalidate_page(dst_pmap, - addr); - pmap_free_zero_pages(free); - } - goto out; - } - if (dstmpte->wire_count >= srcmpte->wire_count) - break; - } - addr += PAGE_SIZE; - src_pte++; - } - } -out: - PT_UPDATES_FLUSH(); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(src_pmap); - PMAP_UNLOCK(dst_pmap); - -#ifdef HAMFISTED_LOCKING - mtx_unlock(&createdelete_lock); -#endif -} - -static __inline void -pagezero(void *page) -{ -#if defined(I686_CPU) - if (cpu_class == CPUCLASS_686) { -#if defined(CPU_ENABLE_SSE) - if (cpu_feature & CPUID_SSE2) - sse2_pagezero(page); - else -#endif - i686_pagezero(page); - } else -#endif - bzero(page, PAGE_SIZE); -} - -/* - * pmap_zero_page zeros the specified hardware page by mapping - * the page into KVM and using bzero to clear its contents. - */ -void -pmap_zero_page(vm_page_t m) -{ - struct sysmaps *sysmaps; - - sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; - mtx_lock(&sysmaps->lock); - if (*sysmaps->CMAP2) - panic("pmap_zero_page: CMAP2 busy"); - sched_pin(); - PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | VM_PAGE_TO_MACH(m) | PG_A | PG_M); - pagezero(sysmaps->CADDR2); - PT_SET_MA(sysmaps->CADDR2, 0); - sched_unpin(); - mtx_unlock(&sysmaps->lock); -} - -/* - * pmap_zero_page_area zeros the specified hardware page by mapping - * the page into KVM and using bzero to clear its contents. - * - * off and size may not cover an area beyond a single hardware page. - */ -void -pmap_zero_page_area(vm_page_t m, int off, int size) -{ - struct sysmaps *sysmaps; - - sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; - mtx_lock(&sysmaps->lock); - if (*sysmaps->CMAP2) - panic("pmap_zero_page_area: CMAP2 busy"); - sched_pin(); - PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | VM_PAGE_TO_MACH(m) | PG_A | PG_M); - - if (off == 0 && size == PAGE_SIZE) - pagezero(sysmaps->CADDR2); - else - bzero((char *)sysmaps->CADDR2 + off, size); - PT_SET_MA(sysmaps->CADDR2, 0); - sched_unpin(); - mtx_unlock(&sysmaps->lock); -} - -/* - * pmap_zero_page_idle zeros the specified hardware page by mapping - * the page into KVM and using bzero to clear its contents. This - * is intended to be called from the vm_pagezero process only and - * outside of Giant. - */ -void -pmap_zero_page_idle(vm_page_t m) -{ - - if (*CMAP3) - panic("pmap_zero_page_idle: CMAP3 busy"); - sched_pin(); - PT_SET_MA(CADDR3, PG_V | PG_RW | VM_PAGE_TO_MACH(m) | PG_A | PG_M); - pagezero(CADDR3); - PT_SET_MA(CADDR3, 0); - sched_unpin(); -} - -/* - * pmap_copy_page copies the specified (machine independent) - * page by mapping the page into virtual memory and using - * bcopy to copy the page, one machine dependent page at a - * time. - */ -void -pmap_copy_page(vm_page_t src, vm_page_t dst) -{ - struct sysmaps *sysmaps; - - sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; - mtx_lock(&sysmaps->lock); - if (*sysmaps->CMAP1) - panic("pmap_copy_page: CMAP1 busy"); - if (*sysmaps->CMAP2) - panic("pmap_copy_page: CMAP2 busy"); - sched_pin(); - PT_SET_MA(sysmaps->CADDR1, PG_V | VM_PAGE_TO_MACH(src) | PG_A); - PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | VM_PAGE_TO_MACH(dst) | PG_A | PG_M); - bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); - PT_SET_MA(sysmaps->CADDR1, 0); - PT_SET_MA(sysmaps->CADDR2, 0); - sched_unpin(); - mtx_unlock(&sysmaps->lock); -} - -int unmapped_buf_allowed = 1; - -void -pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], - vm_offset_t b_offset, int xfersize) -{ - struct sysmaps *sysmaps; - vm_page_t a_pg, b_pg; - char *a_cp, *b_cp; - vm_offset_t a_pg_offset, b_pg_offset; - int cnt; - - sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; - mtx_lock(&sysmaps->lock); - if (*sysmaps->CMAP1 != 0) - panic("pmap_copy_pages: CMAP1 busy"); - if (*sysmaps->CMAP2 != 0) - panic("pmap_copy_pages: CMAP2 busy"); - sched_pin(); - while (xfersize > 0) { - a_pg = ma[a_offset >> PAGE_SHIFT]; - a_pg_offset = a_offset & PAGE_MASK; - cnt = min(xfersize, PAGE_SIZE - a_pg_offset); - b_pg = mb[b_offset >> PAGE_SHIFT]; - b_pg_offset = b_offset & PAGE_MASK; - cnt = min(cnt, PAGE_SIZE - b_pg_offset); - PT_SET_MA(sysmaps->CADDR1, PG_V | VM_PAGE_TO_MACH(a_pg) | PG_A); - PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | - VM_PAGE_TO_MACH(b_pg) | PG_A | PG_M); - a_cp = sysmaps->CADDR1 + a_pg_offset; - b_cp = sysmaps->CADDR2 + b_pg_offset; - bcopy(a_cp, b_cp, cnt); - a_offset += cnt; - b_offset += cnt; - xfersize -= cnt; - } - PT_SET_MA(sysmaps->CADDR1, 0); - PT_SET_MA(sysmaps->CADDR2, 0); - sched_unpin(); - mtx_unlock(&sysmaps->lock); -} - -/* - * Returns true if the pmap's pv is one of the first - * 16 pvs linked to from this page. This count may - * be changed upwards or downwards in the future; it - * is only necessary that true be returned for a small - * subset of pmaps for proper page aging. - */ -boolean_t -pmap_page_exists_quick(pmap_t pmap, vm_page_t m) -{ - pv_entry_t pv; - int loops = 0; - boolean_t rv; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_page_exists_quick: page %p is not managed", m)); - rv = FALSE; - rw_wlock(&pvh_global_lock); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - if (PV_PMAP(pv) == pmap) { - rv = TRUE; - break; - } - loops++; - if (loops >= 16) - break; - } - rw_wunlock(&pvh_global_lock); - return (rv); -} - -/* - * pmap_page_wired_mappings: - * - * Return the number of managed mappings to the given physical page - * that are wired. - */ -int -pmap_page_wired_mappings(vm_page_t m) -{ - pv_entry_t pv; - pt_entry_t *pte; - pmap_t pmap; - int count; - - count = 0; - if ((m->oflags & VPO_UNMANAGED) != 0) - return (count); - rw_wlock(&pvh_global_lock); - sched_pin(); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pte = pmap_pte_quick(pmap, pv->pv_va); - if ((*pte & PG_W) != 0) - count++; - PMAP_UNLOCK(pmap); - } - sched_unpin(); - rw_wunlock(&pvh_global_lock); - return (count); -} - -/* - * Returns TRUE if the given page is mapped. Otherwise, returns FALSE. - */ -boolean_t -pmap_page_is_mapped(vm_page_t m) -{ - - if ((m->oflags & VPO_UNMANAGED) != 0) - return (FALSE); - return (!TAILQ_EMPTY(&m->md.pv_list)); -} - -/* - * Remove all pages from specified address space - * this aids process exit speeds. Also, this code - * is special cased for current process only, but - * can have the more generic (and slightly slower) - * mode enabled. This is much faster than pmap_remove - * in the case of running down an entire address space. - */ -void -pmap_remove_pages(pmap_t pmap) -{ - pt_entry_t *pte, tpte; - vm_page_t m, free = NULL; - pv_entry_t pv; - struct pv_chunk *pc, *npc; - int field, idx; - int32_t bit; - uint32_t inuse, bitmask; - int allfree; - - CTR1(KTR_PMAP, "pmap_remove_pages: pmap=%p", pmap); - - if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { - printf("warning: pmap_remove_pages called with non-current pmap\n"); - return; - } - rw_wlock(&pvh_global_lock); - KASSERT(pmap_is_current(pmap), ("removing pages from non-current pmap")); - PMAP_LOCK(pmap); - sched_pin(); - TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { - KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, - pc->pc_pmap)); - allfree = 1; - for (field = 0; field < _NPCM; field++) { - inuse = ~pc->pc_map[field] & pc_freemask[field]; - while (inuse != 0) { - bit = bsfl(inuse); - bitmask = 1UL << bit; - idx = field * 32 + bit; - pv = &pc->pc_pventry[idx]; - inuse &= ~bitmask; - - pte = vtopte(pv->pv_va); - tpte = *pte ? xpmap_mtop(*pte) : 0; - - if (tpte == 0) { - printf( - "TPTE at %p IS ZERO @ VA %08x\n", - pte, pv->pv_va); - panic("bad pte"); - } - -/* - * We cannot remove wired pages from a process' mapping at this time - */ - if (tpte & PG_W) { - allfree = 0; - continue; - } - - m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); - KASSERT(m->phys_addr == (tpte & PG_FRAME), - ("vm_page_t %p phys_addr mismatch %016jx %016jx", - m, (uintmax_t)m->phys_addr, - (uintmax_t)tpte)); - - KASSERT(m < &vm_page_array[vm_page_array_size], - ("pmap_remove_pages: bad tpte %#jx", - (uintmax_t)tpte)); - - - PT_CLEAR_VA(pte, FALSE); - - /* - * Update the vm_page_t clean/reference bits. - */ - if (tpte & PG_M) - vm_page_dirty(m); - - TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); - if (TAILQ_EMPTY(&m->md.pv_list)) - vm_page_aflag_clear(m, PGA_WRITEABLE); - - pmap_unuse_pt(pmap, pv->pv_va, &free); - - /* Mark free */ - PV_STAT(pv_entry_frees++); - PV_STAT(pv_entry_spare++); - pv_entry_count--; - pc->pc_map[field] |= bitmask; - pmap->pm_stats.resident_count--; - } - } - PT_UPDATES_FLUSH(); - if (allfree) { - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - free_pv_chunk(pc); - } - } - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - - sched_unpin(); - pmap_invalidate_all(pmap); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); - pmap_free_zero_pages(free); -} - -/* - * pmap_is_modified: - * - * Return whether or not the specified physical page was modified - * in any physical maps. - */ -boolean_t -pmap_is_modified(vm_page_t m) -{ - pv_entry_t pv; - pt_entry_t *pte; - pmap_t pmap; - boolean_t rv; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_is_modified: page %p is not managed", m)); - rv = FALSE; - - /* - * If the page is not exclusive busied, then PGA_WRITEABLE cannot be - * concurrently set while the object is locked. Thus, if PGA_WRITEABLE - * is clear, no PTEs can have PG_M set. - */ - VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) - return (rv); - rw_wlock(&pvh_global_lock); - sched_pin(); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pte = pmap_pte_quick(pmap, pv->pv_va); - rv = (*pte & PG_M) != 0; - PMAP_UNLOCK(pmap); - if (rv) - break; - } - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - return (rv); -} - -/* - * pmap_is_prefaultable: - * - * Return whether or not the specified virtual address is elgible - * for prefault. - */ -static boolean_t -pmap_is_prefaultable_locked(pmap_t pmap, vm_offset_t addr) -{ - pt_entry_t *pte; - boolean_t rv = FALSE; - - return (rv); - - if (pmap_is_current(pmap) && *pmap_pde(pmap, addr)) { - pte = vtopte(addr); - rv = (*pte == 0); - } - return (rv); -} - -boolean_t -pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) -{ - boolean_t rv; - - PMAP_LOCK(pmap); - rv = pmap_is_prefaultable_locked(pmap, addr); - PMAP_UNLOCK(pmap); - return (rv); -} - -boolean_t -pmap_is_referenced(vm_page_t m) -{ - pv_entry_t pv; - pt_entry_t *pte; - pmap_t pmap; - boolean_t rv; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_is_referenced: page %p is not managed", m)); - rv = FALSE; - rw_wlock(&pvh_global_lock); - sched_pin(); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pte = pmap_pte_quick(pmap, pv->pv_va); - rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); - PMAP_UNLOCK(pmap); - if (rv) - break; - } - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - return (rv); -} - -void -pmap_map_readonly(pmap_t pmap, vm_offset_t va, int len) -{ - int i, npages = round_page(len) >> PAGE_SHIFT; - for (i = 0; i < npages; i++) { - pt_entry_t *pte; - pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE)); - rw_wlock(&pvh_global_lock); - pte_store(pte, xpmap_mtop(*pte & ~(PG_RW|PG_M))); - rw_wunlock(&pvh_global_lock); - PMAP_MARK_PRIV(xpmap_mtop(*pte)); - pmap_pte_release(pte); - } -} - -void -pmap_map_readwrite(pmap_t pmap, vm_offset_t va, int len) -{ - int i, npages = round_page(len) >> PAGE_SHIFT; - for (i = 0; i < npages; i++) { - pt_entry_t *pte; - pte = pmap_pte(pmap, (vm_offset_t)(va + i*PAGE_SIZE)); - PMAP_MARK_UNPRIV(xpmap_mtop(*pte)); - rw_wlock(&pvh_global_lock); - pte_store(pte, xpmap_mtop(*pte) | (PG_RW|PG_M)); - rw_wunlock(&pvh_global_lock); - pmap_pte_release(pte); - } -} - -/* - * Clear the write and modified bits in each of the given page's mappings. - */ -void -pmap_remove_write(vm_page_t m) -{ - pv_entry_t pv; - pmap_t pmap; - pt_entry_t oldpte, *pte; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_remove_write: page %p is not managed", m)); - - /* - * If the page is not exclusive busied, then PGA_WRITEABLE cannot be - * set by another thread while the object is locked. Thus, - * if PGA_WRITEABLE is clear, no page table entries need updating. - */ - VM_OBJECT_ASSERT_WLOCKED(m->object); - if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) - return; - rw_wlock(&pvh_global_lock); - sched_pin(); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pte = pmap_pte_quick(pmap, pv->pv_va); -retry: - oldpte = *pte; - if ((oldpte & PG_RW) != 0) { - vm_paddr_t newpte = oldpte & ~(PG_RW | PG_M); - - /* - * Regardless of whether a pte is 32 or 64 bits - * in size, PG_RW and PG_M are among the least - * significant 32 bits. - */ - PT_SET_VA_MA(pte, newpte, TRUE); - if (*pte != newpte) - goto retry; - - if ((oldpte & PG_M) != 0) - vm_page_dirty(m); - pmap_invalidate_page(pmap, pv->pv_va); - } - PMAP_UNLOCK(pmap); - } - vm_page_aflag_clear(m, PGA_WRITEABLE); - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - sched_unpin(); - rw_wunlock(&pvh_global_lock); -} - -/* - * pmap_ts_referenced: - * - * Return a count of reference bits for a page, clearing those bits. - * It is not necessary for every reference bit to be cleared, but it - * is necessary that 0 only be returned when there are truly no - * reference bits set. - * - * XXX: The exact number of bits to check and clear is a matter that - * should be tested and standardized at some point in the future for - * optimal aging of shared pages. - */ -int -pmap_ts_referenced(vm_page_t m) -{ - pv_entry_t pv, pvf, pvn; - pmap_t pmap; - pt_entry_t *pte; - int rtval = 0; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_ts_referenced: page %p is not managed", m)); - rw_wlock(&pvh_global_lock); - sched_pin(); - if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { - pvf = pv; - do { - pvn = TAILQ_NEXT(pv, pv_next); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pte = pmap_pte_quick(pmap, pv->pv_va); - if ((*pte & PG_A) != 0) { - PT_SET_VA_MA(pte, *pte & ~PG_A, FALSE); - pmap_invalidate_page(pmap, pv->pv_va); - rtval++; - if (rtval > 4) - pvn = NULL; - } - PMAP_UNLOCK(pmap); - } while ((pv = pvn) != NULL && pv != pvf); - } - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_MA(PADDR1, 0); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - return (rtval); -} - -/* - * Apply the given advice to the specified range of addresses within the - * given pmap. Depending on the advice, clear the referenced and/or - * modified flags in each mapping and set the mapped page's dirty field. - */ -void -pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) -{ - pd_entry_t oldpde; - pt_entry_t *pte; - vm_offset_t pdnxt; - vm_page_t m; - boolean_t anychanged; - - if (advice != MADV_DONTNEED && advice != MADV_FREE) - return; - anychanged = FALSE; - rw_wlock(&pvh_global_lock); - sched_pin(); - PMAP_LOCK(pmap); - for (; sva < eva; sva = pdnxt) { - pdnxt = (sva + NBPDR) & ~PDRMASK; - if (pdnxt < sva) - pdnxt = eva; - oldpde = pmap->pm_pdir[sva >> PDRSHIFT]; - if ((oldpde & (PG_PS | PG_V)) != PG_V) - continue; - if (pdnxt > eva) - pdnxt = eva; - for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, - sva += PAGE_SIZE) { - if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | - PG_V)) - continue; - else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { - if (advice == MADV_DONTNEED) { - /* - * Future calls to pmap_is_modified() - * can be avoided by making the page - * dirty now. - */ - m = PHYS_TO_VM_PAGE(xpmap_mtop(*pte) & - PG_FRAME); - vm_page_dirty(m); - } - PT_SET_VA_MA(pte, *pte & ~(PG_M | PG_A), TRUE); - } else if ((*pte & PG_A) != 0) - PT_SET_VA_MA(pte, *pte & ~PG_A, TRUE); - else - continue; - if ((*pte & PG_G) != 0) - pmap_invalidate_page(pmap, sva); - else - anychanged = TRUE; - } - } - PT_UPDATES_FLUSH(); - if (*PMAP1) - PT_SET_VA_MA(PMAP1, 0, TRUE); - if (anychanged) - pmap_invalidate_all(pmap); - sched_unpin(); - rw_wunlock(&pvh_global_lock); - PMAP_UNLOCK(pmap); -} - -/* - * Clear the modify bits on the specified physical page. - */ -void -pmap_clear_modify(vm_page_t m) -{ - pv_entry_t pv; - pmap_t pmap; - pt_entry_t *pte; - - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_clear_modify: page %p is not managed", m)); - VM_OBJECT_ASSERT_WLOCKED(m->object); - KASSERT(!vm_page_xbusied(m), - ("pmap_clear_modify: page %p is exclusive busied", m)); - - /* - * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. - * If the object containing the page is locked and the page is not - * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. - */ - if ((m->aflags & PGA_WRITEABLE) == 0) - return; - rw_wlock(&pvh_global_lock); - sched_pin(); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - pmap = PV_PMAP(pv); - PMAP_LOCK(pmap); - pte = pmap_pte_quick(pmap, pv->pv_va); - if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { - /* - * Regardless of whether a pte is 32 or 64 bits - * in size, PG_M is among the least significant - * 32 bits. - */ - PT_SET_VA_MA(pte, *pte & ~PG_M, FALSE); - pmap_invalidate_page(pmap, pv->pv_va); - } - PMAP_UNLOCK(pmap); - } - sched_unpin(); - rw_wunlock(&pvh_global_lock); -} - -/* - * Miscellaneous support routines follow - */ - -/* - * Map a set of physical memory pages into the kernel virtual - * address space. Return a pointer to where it is mapped. This - * routine is intended to be used for mapping device memory, - * NOT real memory. - */ -void * -pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) -{ - vm_offset_t va, offset; - vm_size_t tmpsize; - - offset = pa & PAGE_MASK; - size = round_page(offset + size); - pa = pa & PG_FRAME; - - if (pa < KERNLOAD && pa + size <= KERNLOAD) - va = KERNBASE + pa; - else - va = kva_alloc(size); - if (!va) - panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); - - for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) - pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); - pmap_invalidate_range(kernel_pmap, va, va + tmpsize); - pmap_invalidate_cache_range(va, va + size, FALSE); - return ((void *)(va + offset)); -} - -void * -pmap_mapdev(vm_paddr_t pa, vm_size_t size) -{ - - return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); -} - -void * -pmap_mapbios(vm_paddr_t pa, vm_size_t size) -{ - - return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); -} - -void -pmap_unmapdev(vm_offset_t va, vm_size_t size) -{ - vm_offset_t base, offset; - - if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) - return; - base = trunc_page(va); - offset = va & PAGE_MASK; - size = round_page(offset + size); - kva_free(base, size); -} - -/* - * Sets the memory attribute for the specified page. - */ -void -pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) -{ - - m->md.pat_mode = ma; - if ((m->flags & PG_FICTITIOUS) != 0) - return; - - /* - * If "m" is a normal page, flush it from the cache. - * See pmap_invalidate_cache_range(). - * - * First, try to find an existing mapping of the page by sf - * buffer. sf_buf_invalidate_cache() modifies mapping and - * flushes the cache. - */ - if (sf_buf_invalidate_cache(m)) - return; - - /* - * If page is not mapped by sf buffer, but CPU does not - * support self snoop, map the page transient and do - * invalidation. In the worst case, whole cache is flushed by - * pmap_invalidate_cache_range(). - */ - if ((cpu_feature & CPUID_SS) == 0) - pmap_flush_page(m); -} - -static void -pmap_flush_page(vm_page_t m) -{ - struct sysmaps *sysmaps; - vm_offset_t sva, eva; - - if ((cpu_feature & CPUID_CLFSH) != 0) { - sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; - mtx_lock(&sysmaps->lock); - if (*sysmaps->CMAP2) - panic("pmap_flush_page: CMAP2 busy"); - sched_pin(); - PT_SET_MA(sysmaps->CADDR2, PG_V | PG_RW | - VM_PAGE_TO_MACH(m) | PG_A | PG_M | - pmap_cache_bits(m->md.pat_mode, 0)); - invlcaddr(sysmaps->CADDR2); - sva = (vm_offset_t)sysmaps->CADDR2; - eva = sva + PAGE_SIZE; - - /* - * Use mfence despite the ordering implied by - * mtx_{un,}lock() because clflush is not guaranteed - * to be ordered by any other instruction. - */ - mfence(); - for (; sva < eva; sva += cpu_clflush_line_size) - clflush(sva); - mfence(); - PT_SET_MA(sysmaps->CADDR2, 0); - sched_unpin(); - mtx_unlock(&sysmaps->lock); - } else - pmap_invalidate_cache(); -} - -/* - * Changes the specified virtual address range's memory type to that given by - * the parameter "mode". The specified virtual address range must be - * completely contained within either the kernel map. - * - * Returns zero if the change completed successfully, and either EINVAL or - * ENOMEM if the change failed. Specifically, EINVAL is returned if some part - * of the virtual address range was not mapped, and ENOMEM is returned if - * there was insufficient memory available to complete the change. - */ -int -pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) -{ - vm_offset_t base, offset, tmpva; - pt_entry_t *pte; - u_int opte, npte; - pd_entry_t *pde; - boolean_t changed; - - base = trunc_page(va); - offset = va & PAGE_MASK; - size = round_page(offset + size); - - /* Only supported on kernel virtual addresses. */ - if (base <= VM_MAXUSER_ADDRESS) - return (EINVAL); - - /* 4MB pages and pages that aren't mapped aren't supported. */ - for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { - pde = pmap_pde(kernel_pmap, tmpva); - if (*pde & PG_PS) - return (EINVAL); - if ((*pde & PG_V) == 0) - return (EINVAL); - pte = vtopte(va); - if ((*pte & PG_V) == 0) - return (EINVAL); - } - - changed = FALSE; - - /* - * Ok, all the pages exist and are 4k, so run through them updating - * their cache mode. - */ - for (tmpva = base; size > 0; ) { - pte = vtopte(tmpva); - - /* - * The cache mode bits are all in the low 32-bits of the - * PTE, so we can just spin on updating the low 32-bits. - */ - do { - opte = *(u_int *)pte; - npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT); - npte |= pmap_cache_bits(mode, 0); - PT_SET_VA_MA(pte, npte, TRUE); - } while (npte != opte && (*pte != npte)); - if (npte != opte) - changed = TRUE; - tmpva += PAGE_SIZE; - size -= PAGE_SIZE; - } - - /* - * Flush CPU caches to make sure any data isn't cached that - * shouldn't be, etc. - */ - if (changed) { - pmap_invalidate_range(kernel_pmap, base, tmpva); - pmap_invalidate_cache_range(base, tmpva, FALSE); - } - return (0); -} - -/* - * perform the pmap work for mincore - */ -int -pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) -{ - pt_entry_t *ptep, pte; - vm_paddr_t pa; - int val; - - PMAP_LOCK(pmap); -retry: - ptep = pmap_pte(pmap, addr); - pte = (ptep != NULL) ? PT_GET(ptep) : 0; - pmap_pte_release(ptep); - val = 0; - if ((pte & PG_V) != 0) { - val |= MINCORE_INCORE; - if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) - val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; - if ((pte & PG_A) != 0) - val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; - } - if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != - (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && - (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { - pa = pte & PG_FRAME; - /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ - if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) - goto retry; - } else - PA_UNLOCK_COND(*locked_pa); - PMAP_UNLOCK(pmap); - return (val); -} - -void -pmap_activate(struct thread *td) -{ - pmap_t pmap, oldpmap; - u_int cpuid; - u_int32_t cr3; - - critical_enter(); - pmap = vmspace_pmap(td->td_proc->p_vmspace); - oldpmap = PCPU_GET(curpmap); - cpuid = PCPU_GET(cpuid); -#if defined(SMP) - CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); - CPU_SET_ATOMIC(cpuid, &pmap->pm_active); -#else - CPU_CLR(cpuid, &oldpmap->pm_active); - CPU_SET(cpuid, &pmap->pm_active); -#endif -#ifdef PAE - cr3 = vtophys(pmap->pm_pdpt); -#else - cr3 = vtophys(pmap->pm_pdir); -#endif - /* - * pmap_activate is for the current thread on the current cpu - */ - td->td_pcb->pcb_cr3 = cr3; - PT_UPDATES_FLUSH(); - load_cr3(cr3); - PCPU_SET(curpmap, pmap); - critical_exit(); -} - -void -pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) -{ -} - -/* - * Increase the starting virtual address of the given mapping if a - * different alignment might result in more superpage mappings. - */ -void -pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, - vm_offset_t *addr, vm_size_t size) -{ - vm_offset_t superpage_offset; - - if (size < NBPDR) - return; - if (object != NULL && (object->flags & OBJ_COLORED) != 0) - offset += ptoa(object->pg_color); - superpage_offset = offset & PDRMASK; - if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || - (*addr & PDRMASK) == superpage_offset) - return; - if ((*addr & PDRMASK) < superpage_offset) - *addr = (*addr & ~PDRMASK) + superpage_offset; - else - *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; -} - -void -pmap_suspend() -{ - pmap_t pmap; - int i, pdir, offset; - vm_paddr_t pdirma; - mmu_update_t mu[4]; - - /* - * We need to remove the recursive mapping structure from all - * our pmaps so that Xen doesn't get confused when it restores - * the page tables. The recursive map lives at page directory - * index PTDPTDI. We assume that the suspend code has stopped - * the other vcpus (if any). - */ - LIST_FOREACH(pmap, &allpmaps, pm_list) { - for (i = 0; i < 4; i++) { - /* - * Figure out which page directory (L2) page - * contains this bit of the recursive map and - * the offset within that page of the map - * entry - */ - pdir = (PTDPTDI + i) / NPDEPG; - offset = (PTDPTDI + i) % NPDEPG; - pdirma = pmap->pm_pdpt[pdir] & PG_FRAME; - mu[i].ptr = pdirma + offset * sizeof(pd_entry_t); - mu[i].val = 0; - } - HYPERVISOR_mmu_update(mu, 4, NULL, DOMID_SELF); - } -} - -void -pmap_resume() -{ - pmap_t pmap; - int i, pdir, offset; - vm_paddr_t pdirma; - mmu_update_t mu[4]; - - /* - * Restore the recursive map that we removed on suspend. - */ - LIST_FOREACH(pmap, &allpmaps, pm_list) { - for (i = 0; i < 4; i++) { - /* - * Figure out which page directory (L2) page - * contains this bit of the recursive map and - * the offset within that page of the map - * entry - */ - pdir = (PTDPTDI + i) / NPDEPG; - offset = (PTDPTDI + i) % NPDEPG; - pdirma = pmap->pm_pdpt[pdir] & PG_FRAME; - mu[i].ptr = pdirma + offset * sizeof(pd_entry_t); - mu[i].val = (pmap->pm_pdpt[i] & PG_FRAME) | PG_V; - } - HYPERVISOR_mmu_update(mu, 4, NULL, DOMID_SELF); - } -} - -#if defined(PMAP_DEBUG) -pmap_pid_dump(int pid) -{ - pmap_t pmap; - struct proc *p; - int npte = 0; - int index; - - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - if (p->p_pid != pid) - continue; - - if (p->p_vmspace) { - int i,j; - index = 0; - pmap = vmspace_pmap(p->p_vmspace); - for (i = 0; i < NPDEPTD; i++) { - pd_entry_t *pde; - pt_entry_t *pte; - vm_offset_t base = i << PDRSHIFT; - - pde = &pmap->pm_pdir[i]; - if (pde && pmap_pde_v(pde)) { - for (j = 0; j < NPTEPG; j++) { - vm_offset_t va = base + (j << PAGE_SHIFT); - if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { - if (index) { - index = 0; - printf("\n"); - } - sx_sunlock(&allproc_lock); - return (npte); - } - pte = pmap_pte(pmap, va); - if (pte && pmap_pte_v(pte)) { - pt_entry_t pa; - vm_page_t m; - pa = PT_GET(pte); - m = PHYS_TO_VM_PAGE(pa & PG_FRAME); - printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", - va, pa, m->hold_count, m->wire_count, m->flags); - npte++; - index++; - if (index >= 2) { - index = 0; - printf("\n"); - } else { - printf(" "); - } - } - } - } - } - } - } - sx_sunlock(&allproc_lock); - return (npte); -} -#endif - -#if defined(DEBUG) - -static void pads(pmap_t pm); -void pmap_pvdump(vm_paddr_t pa); - -/* print address space of pmap*/ -static void -pads(pmap_t pm) -{ - int i, j; - vm_paddr_t va; - pt_entry_t *ptep; - - if (pm == kernel_pmap) - return; - for (i = 0; i < NPDEPTD; i++) - if (pm->pm_pdir[i]) - for (j = 0; j < NPTEPG; j++) { - va = (i << PDRSHIFT) + (j << PAGE_SHIFT); - if (pm == kernel_pmap && va < KERNBASE) - continue; - if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) - continue; - ptep = pmap_pte(pm, va); - if (pmap_pte_v(ptep)) - printf("%x:%x ", va, *ptep); - }; - -} - -void -pmap_pvdump(vm_paddr_t pa) -{ - pv_entry_t pv; - pmap_t pmap; - vm_page_t m; - - printf("pa %x", pa); - m = PHYS_TO_VM_PAGE(pa); - TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { - pmap = PV_PMAP(pv); - printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); - pads(pmap); - } - printf(" "); -} -#endif Index: sys/i386/xen/xen_machdep.c =================================================================== --- sys/i386/xen/xen_machdep.c +++ /dev/null @@ -1,1236 +0,0 @@ -/* - * - * Copyright (c) 2004 Christian Limpach. - * Copyright (c) 2004-2006,2008 Kip Macy - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Christian Limpach. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - - -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef SMP -#include -#endif - - -#include - - -#define IDTVEC(name) __CONCAT(X,name) - -extern inthand_t -IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), - IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), - IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), - IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), - IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); - - -int xendebug_flags; -start_info_t *xen_start_info; -start_info_t *HYPERVISOR_start_info; -shared_info_t *HYPERVISOR_shared_info; -xen_pfn_t *xen_machine_phys = machine_to_phys_mapping; -xen_pfn_t *xen_phys_machine; -xen_pfn_t *xen_pfn_to_mfn_frame_list[16]; -xen_pfn_t *xen_pfn_to_mfn_frame_list_list; -int preemptable, init_first; -extern unsigned int avail_space; -int xen_vector_callback_enabled = 0; -enum xen_domain_type xen_domain_type = XEN_PV_DOMAIN; - -void ni_cli(void); -void ni_sti(void); - - -void -ni_cli(void) -{ - CTR0(KTR_SPARE2, "ni_cli disabling interrupts"); - __asm__("pushl %edx;" - "pushl %eax;" - ); - __cli(); - __asm__("popl %eax;" - "popl %edx;" - ); -} - - -void -ni_sti(void) -{ - __asm__("pushl %edx;" - "pushl %esi;" - "pushl %eax;" - ); - __sti(); - __asm__("popl %eax;" - "popl %esi;" - "popl %edx;" - ); -} - -void -force_evtchn_callback(void) -{ - (void)HYPERVISOR_xen_version(0, NULL); -} - -/* - * Modify the cmd_line by converting ',' to NULLs so that it is in a format - * suitable for the static env vars. - */ -char * -xen_setbootenv(char *cmd_line) -{ - char *cmd_line_next; - - /* Skip leading spaces */ - for (; *cmd_line == ' '; cmd_line++); - - xc_printf("xen_setbootenv(): cmd_line='%s'\n", cmd_line); - - for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;); - return cmd_line; -} - -int -xen_boothowto(char *envp) -{ - int i, howto = 0; - - /* get equivalents from the environment */ - for (i = 0; howto_names[i].ev != NULL; i++) - if (kern_getenv(howto_names[i].ev) != NULL) - howto |= howto_names[i].mask; - return howto; -} - - -#define XPQUEUE_SIZE 128 - -struct mmu_log { - char *file; - int line; -}; - -#ifdef SMP -/* per-cpu queues and indices */ -#ifdef INVARIANTS -static struct mmu_log xpq_queue_log[XEN_LEGACY_MAX_VCPUS][XPQUEUE_SIZE]; -#endif - -static int xpq_idx[XEN_LEGACY_MAX_VCPUS]; -static mmu_update_t xpq_queue[XEN_LEGACY_MAX_VCPUS][XPQUEUE_SIZE]; - -#define XPQ_QUEUE_LOG xpq_queue_log[vcpu] -#define XPQ_QUEUE xpq_queue[vcpu] -#define XPQ_IDX xpq_idx[vcpu] -#define SET_VCPU() int vcpu = smp_processor_id() -#else - -static mmu_update_t xpq_queue[XPQUEUE_SIZE]; -#ifdef INVARIANTS -static struct mmu_log xpq_queue_log[XPQUEUE_SIZE]; -#endif -static int xpq_idx = 0; - -#define XPQ_QUEUE_LOG xpq_queue_log -#define XPQ_QUEUE xpq_queue -#define XPQ_IDX xpq_idx -#define SET_VCPU() -#endif /* !SMP */ - -#define XPQ_IDX_INC atomic_add_int(&XPQ_IDX, 1); - -#if 0 -static void -xen_dump_queue(void) -{ - int _xpq_idx = XPQ_IDX; - int i; - - if (_xpq_idx <= 1) - return; - - xc_printf("xen_dump_queue(): %u entries\n", _xpq_idx); - for (i = 0; i < _xpq_idx; i++) { - xc_printf(" val: %llx ptr: %llx\n", XPQ_QUEUE[i].val, - XPQ_QUEUE[i].ptr); - } -} -#endif - - -static __inline void -_xen_flush_queue(void) -{ - SET_VCPU(); - int _xpq_idx = XPQ_IDX; - int error, i; - -#ifdef INVARIANTS - if (__predict_true(gdtset)) - CRITICAL_ASSERT(curthread); -#endif - - XPQ_IDX = 0; - /* Make sure index is cleared first to avoid double updates. */ - error = HYPERVISOR_mmu_update((mmu_update_t *)&XPQ_QUEUE, - _xpq_idx, NULL, DOMID_SELF); - -#if 0 - if (__predict_true(gdtset)) - for (i = _xpq_idx; i > 0;) { - if (i >= 3) { - CTR6(KTR_PMAP, "mmu:val: %lx ptr: %lx val: %lx " - "ptr: %lx val: %lx ptr: %lx", - (XPQ_QUEUE[i-1].val & 0xffffffff), - (XPQ_QUEUE[i-1].ptr & 0xffffffff), - (XPQ_QUEUE[i-2].val & 0xffffffff), - (XPQ_QUEUE[i-2].ptr & 0xffffffff), - (XPQ_QUEUE[i-3].val & 0xffffffff), - (XPQ_QUEUE[i-3].ptr & 0xffffffff)); - i -= 3; - } else if (i == 2) { - CTR4(KTR_PMAP, "mmu: val: %lx ptr: %lx val: %lx ptr: %lx", - (XPQ_QUEUE[i-1].val & 0xffffffff), - (XPQ_QUEUE[i-1].ptr & 0xffffffff), - (XPQ_QUEUE[i-2].val & 0xffffffff), - (XPQ_QUEUE[i-2].ptr & 0xffffffff)); - i = 0; - } else { - CTR2(KTR_PMAP, "mmu: val: %lx ptr: %lx", - (XPQ_QUEUE[i-1].val & 0xffffffff), - (XPQ_QUEUE[i-1].ptr & 0xffffffff)); - i = 0; - } - } -#endif - if (__predict_false(error < 0)) { - for (i = 0; i < _xpq_idx; i++) - printf("val: %llx ptr: %llx\n", - XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr); - panic("Failed to execute MMU updates: %d", error); - } - -} - -void -xen_flush_queue(void) -{ - SET_VCPU(); - - if (__predict_true(gdtset)) - critical_enter(); - if (XPQ_IDX != 0) _xen_flush_queue(); - if (__predict_true(gdtset)) - critical_exit(); -} - -static __inline void -xen_increment_idx(void) -{ - SET_VCPU(); - - XPQ_IDX++; - if (__predict_false(XPQ_IDX == XPQUEUE_SIZE)) - xen_flush_queue(); -} - -void -xen_check_queue(void) -{ -#ifdef INVARIANTS - SET_VCPU(); - - KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX)); -#endif -} - -void -xen_invlpg(vm_offset_t va) -{ - struct mmuext_op op; - op.cmd = MMUEXT_INVLPG_ALL; - op.arg1.linear_addr = va & ~PAGE_MASK; - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_load_cr3(u_int val) -{ - struct mmuext_op op; -#ifdef INVARIANTS - SET_VCPU(); - - KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX)); -#endif - op.cmd = MMUEXT_NEW_BASEPTR; - op.arg1.mfn = xpmap_ptom(val) >> PAGE_SHIFT; - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -#ifdef KTR -static __inline u_int -rebp(void) -{ - u_int data; - - __asm __volatile("movl 4(%%ebp),%0" : "=r" (data)); - return (data); -} -#endif - -u_int -read_eflags(void) -{ - vcpu_info_t *_vcpu; - u_int eflags; - - eflags = _read_eflags(); - _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; - if (_vcpu->evtchn_upcall_mask) - eflags &= ~PSL_I; - - return (eflags); -} - -void -write_eflags(u_int eflags) -{ - u_int intr; - - CTR2(KTR_SPARE2, "%x xen_restore_flags eflags %x", rebp(), eflags); - intr = ((eflags & PSL_I) == 0); - __restore_flags(intr); - _write_eflags(eflags); -} - -void -xen_cli(void) -{ - CTR1(KTR_SPARE2, "%x xen_cli disabling interrupts", rebp()); - __cli(); -} - -void -xen_sti(void) -{ - CTR1(KTR_SPARE2, "%x xen_sti enabling interrupts", rebp()); - __sti(); -} - -u_int -xen_rcr2(void) -{ - - return (HYPERVISOR_shared_info->vcpu_info[curcpu].arch.cr2); -} - -void -_xen_machphys_update(vm_paddr_t mfn, vm_paddr_t pfn, char *file, int line) -{ - SET_VCPU(); - - if (__predict_true(gdtset)) - critical_enter(); - XPQ_QUEUE[XPQ_IDX].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; - XPQ_QUEUE[XPQ_IDX].val = pfn; -#ifdef INVARIANTS - XPQ_QUEUE_LOG[XPQ_IDX].file = file; - XPQ_QUEUE_LOG[XPQ_IDX].line = line; -#endif - xen_increment_idx(); - if (__predict_true(gdtset)) - critical_exit(); -} - -extern struct rwlock pvh_global_lock; - -void -_xen_queue_pt_update(vm_paddr_t ptr, vm_paddr_t val, char *file, int line) -{ - SET_VCPU(); - - if (__predict_true(gdtset)) - rw_assert(&pvh_global_lock, RA_WLOCKED); - - KASSERT((ptr & 7) == 0, ("misaligned update")); - - if (__predict_true(gdtset)) - critical_enter(); - - XPQ_QUEUE[XPQ_IDX].ptr = ((uint64_t)ptr) | MMU_NORMAL_PT_UPDATE; - XPQ_QUEUE[XPQ_IDX].val = (uint64_t)val; -#ifdef INVARIANTS - XPQ_QUEUE_LOG[XPQ_IDX].file = file; - XPQ_QUEUE_LOG[XPQ_IDX].line = line; -#endif - xen_increment_idx(); - if (__predict_true(gdtset)) - critical_exit(); -} - -void -xen_pgdpt_pin(vm_paddr_t ma) -{ - struct mmuext_op op; - op.cmd = MMUEXT_PIN_L3_TABLE; - op.arg1.mfn = ma >> PAGE_SHIFT; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_pgd_pin(vm_paddr_t ma) -{ - struct mmuext_op op; - op.cmd = MMUEXT_PIN_L2_TABLE; - op.arg1.mfn = ma >> PAGE_SHIFT; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_pgd_unpin(vm_paddr_t ma) -{ - struct mmuext_op op; - op.cmd = MMUEXT_UNPIN_TABLE; - op.arg1.mfn = ma >> PAGE_SHIFT; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_pt_pin(vm_paddr_t ma) -{ - struct mmuext_op op; - op.cmd = MMUEXT_PIN_L1_TABLE; - op.arg1.mfn = ma >> PAGE_SHIFT; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_pt_unpin(vm_paddr_t ma) -{ - struct mmuext_op op; - op.cmd = MMUEXT_UNPIN_TABLE; - op.arg1.mfn = ma >> PAGE_SHIFT; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_set_ldt(vm_paddr_t ptr, unsigned long len) -{ - struct mmuext_op op; - op.cmd = MMUEXT_SET_LDT; - op.arg1.linear_addr = ptr; - op.arg2.nr_ents = len; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_tlb_flush(void) -{ - struct mmuext_op op; - op.cmd = MMUEXT_TLB_FLUSH_LOCAL; - xen_flush_queue(); - PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void -xen_update_descriptor(union descriptor *table, union descriptor *entry) -{ - vm_paddr_t pa; - pt_entry_t *ptp; - - ptp = vtopte((vm_offset_t)table); - pa = (*ptp & PG_FRAME) | ((vm_offset_t)table & PAGE_MASK); - if (HYPERVISOR_update_descriptor(pa, *(uint64_t *)entry)) - panic("HYPERVISOR_update_descriptor failed\n"); -} - - -#if 0 -/* - * Bitmap is indexed by page number. If bit is set, the page is part of a - * xen_create_contiguous_region() area of memory. - */ -unsigned long *contiguous_bitmap; - -static void -contiguous_bitmap_set(unsigned long first_page, unsigned long nr_pages) -{ - unsigned long start_off, end_off, curr_idx, end_idx; - - curr_idx = first_page / BITS_PER_LONG; - start_off = first_page & (BITS_PER_LONG-1); - end_idx = (first_page + nr_pages) / BITS_PER_LONG; - end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); - - if (curr_idx == end_idx) { - contiguous_bitmap[curr_idx] |= - ((1UL<> PAGE_SHIFT; - mfn = PFNTOMFN(pfn); - PFNTOMFN(pfn) = INVALID_P2M_ENTRY; - PANIC_IF(HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation) != 1); - } - - - /* 2. Get a new contiguous memory extent. */ - reservation.extent_order = order; - /* xenlinux hardcodes this because of aacraid - maybe set to 0 if we're not - * running with a broxen driver XXXEN - */ - reservation.address_bits = 31; - if (HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1) - goto fail; - - /* 3. Map the new extent in place of old pages. */ - for (i = 0; i < (1 << order); i++) { - int pfn; - pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; - xen_machphys_update(mfn+i, pfn); - PFNTOMFN(pfn) = mfn+i; - } - - xen_tlb_flush(); - -#if 0 - contiguous_bitmap_set(VM_PAGE_TO_PHYS(&pages[0]) >> PAGE_SHIFT, 1UL << order); -#endif - - balloon_unlock(flags); - - return 0; - - fail: - reservation.extent_order = 0; - reservation.address_bits = 0; - - for (i = 0; i < (1 << order); i++) { - int pfn; - pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT; - PANIC_IF(HYPERVISOR_memory_op( - XENMEM_increase_reservation, &reservation) != 1); - xen_machphys_update(mfn, pfn); - PFNTOMFN(pfn) = mfn; - } - - xen_tlb_flush(); - - balloon_unlock(flags); - - return ENOMEM; -} - -void -xen_destroy_contiguous_region(void *addr, int npages) -{ - unsigned long mfn, i, flags, order, pfn0; - struct xen_memory_reservation reservation = { - .nr_extents = 1, - .extent_order = 0, - .domid = DOMID_SELF - }; - set_xen_guest_handle(reservation.extent_start, &mfn); - - pfn0 = vtophys(addr) >> PAGE_SHIFT; -#if 0 - scrub_pages(vstart, 1 << order); -#endif - /* can currently only handle power of two allocation */ - PANIC_IF(ffs(npages) != fls(npages)); - - /* 0. determine order */ - order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages); - - balloon_lock(flags); - -#if 0 - contiguous_bitmap_clear(vtophys(addr) >> PAGE_SHIFT, 1UL << order); -#endif - - /* 1. Zap current PTEs, giving away the underlying pages. */ - for (i = 0; i < (1 << order); i++) { - int pfn; - uint64_t new_val = 0; - pfn = vtomach((char *)addr + i*PAGE_SIZE) >> PAGE_SHIFT; - - PANIC_IF(HYPERVISOR_update_va_mapping((vm_offset_t)((char *)addr + (i * PAGE_SIZE)), new_val, 0)); - PFNTOMFN(pfn) = INVALID_P2M_ENTRY; - PANIC_IF(HYPERVISOR_memory_op( - XENMEM_decrease_reservation, &reservation) != 1); - } - - /* 2. Map new pages in place of old pages. */ - for (i = 0; i < (1 << order); i++) { - int pfn; - uint64_t new_val; - pfn = pfn0 + i; - PANIC_IF(HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1); - - new_val = mfn << PAGE_SHIFT; - PANIC_IF(HYPERVISOR_update_va_mapping((vm_offset_t)addr + (i * PAGE_SIZE), - new_val, PG_KERNEL)); - xen_machphys_update(mfn, pfn); - PFNTOMFN(pfn) = mfn; - } - - xen_tlb_flush(); - - balloon_unlock(flags); -} - -extern vm_offset_t proc0kstack; -extern int vm86paddr, vm86phystk; -char *bootmem_start, *bootmem_current, *bootmem_end; - -pteinfo_t *pteinfo_list; -void initvalues(start_info_t *startinfo); - -void * -bootmem_alloc(unsigned int size) -{ - char *retptr; - - retptr = bootmem_current; - PANIC_IF(retptr + size > bootmem_end); - bootmem_current += size; - - return retptr; -} - -void -bootmem_free(void *ptr, unsigned int size) -{ - char *tptr; - - tptr = ptr; - PANIC_IF(tptr != bootmem_current - size || - bootmem_current - size < bootmem_start); - - bootmem_current -= size; -} - -#if 0 -static vm_paddr_t -xpmap_mtop2(vm_paddr_t mpa) -{ - return ((machine_to_phys_mapping[mpa >> PAGE_SHIFT] << PAGE_SHIFT) - ) | (mpa & ~PG_FRAME); -} - -static pd_entry_t -xpmap_get_bootpde(vm_paddr_t va) -{ - - return ((pd_entry_t *)xen_start_info->pt_base)[va >> 22]; -} - -static pd_entry_t -xpmap_get_vbootpde(vm_paddr_t va) -{ - pd_entry_t pde; - - pde = xpmap_get_bootpde(va); - if ((pde & PG_V) == 0) - return (pde & ~PG_FRAME); - return (pde & ~PG_FRAME) | - (xpmap_mtop2(pde & PG_FRAME) + KERNBASE); -} - -static pt_entry_t 8* -xpmap_get_bootptep(vm_paddr_t va) -{ - pd_entry_t pde; - - pde = xpmap_get_vbootpde(va); - if ((pde & PG_V) == 0) - return (void *)-1; -#define PT_MASK 0x003ff000 /* page table address bits */ - return &(((pt_entry_t *)(pde & PG_FRAME))[(va & PT_MASK) >> PAGE_SHIFT]); -} - -static pt_entry_t -xpmap_get_bootpte(vm_paddr_t va) -{ - - return xpmap_get_bootptep(va)[0]; -} -#endif - - -#ifdef ADD_ISA_HOLE -static void -shift_phys_machine(unsigned long *phys_machine, int nr_pages) -{ - - unsigned long *tmp_page, *current_page, *next_page; - int i; - - tmp_page = bootmem_alloc(PAGE_SIZE); - current_page = phys_machine + nr_pages - (PAGE_SIZE/sizeof(unsigned long)); - next_page = current_page - (PAGE_SIZE/sizeof(unsigned long)); - bcopy(phys_machine, tmp_page, PAGE_SIZE); - - while (current_page > phys_machine) { - /* save next page */ - bcopy(next_page, tmp_page, PAGE_SIZE); - /* shift down page */ - bcopy(current_page, next_page, PAGE_SIZE); - /* finish swap */ - bcopy(tmp_page, current_page, PAGE_SIZE); - - current_page -= (PAGE_SIZE/sizeof(unsigned long)); - next_page -= (PAGE_SIZE/sizeof(unsigned long)); - } - bootmem_free(tmp_page, PAGE_SIZE); - - for (i = 0; i < nr_pages; i++) { - xen_machphys_update(phys_machine[i], i); - } - memset(phys_machine, INVALID_P2M_ENTRY, PAGE_SIZE); - -} -#endif /* ADD_ISA_HOLE */ - -/* - * Build a directory of the pages that make up our Physical to Machine - * mapping table. The Xen suspend/restore code uses this to find our - * mapping table. - */ -static void -init_frame_list_list(void *arg) -{ - unsigned long nr_pages = xen_start_info->nr_pages; -#define FPP (PAGE_SIZE/sizeof(xen_pfn_t)) - int i, j, k; - - xen_pfn_to_mfn_frame_list_list = malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK); - for (i = 0, j = 0, k = -1; i < nr_pages; - i += FPP, j++) { - if ((j & (FPP - 1)) == 0) { - k++; - xen_pfn_to_mfn_frame_list[k] = - malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK); - xen_pfn_to_mfn_frame_list_list[k] = - VTOMFN(xen_pfn_to_mfn_frame_list[k]); - j = 0; - } - xen_pfn_to_mfn_frame_list[k][j] = - VTOMFN(&xen_phys_machine[i]); - } - - HYPERVISOR_shared_info->arch.max_pfn = nr_pages; - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list - = VTOMFN(xen_pfn_to_mfn_frame_list_list); -} -SYSINIT(init_fll, SI_SUB_DEVFS, SI_ORDER_ANY, init_frame_list_list, NULL); - -extern unsigned long physfree; - -int pdir, curoffset; -extern int nkpt; - -extern uint32_t kernbase; - -void -initvalues(start_info_t *startinfo) -{ - vm_offset_t cur_space, cur_space_pt; - struct physdev_set_iopl set_iopl; - - int l3_pages, l2_pages, l1_pages, offset; - vm_paddr_t console_page_ma, xen_store_ma; - vm_offset_t tmpva; - vm_paddr_t shinfo; -#ifdef PAE - vm_paddr_t IdlePDPTma, IdlePDPTnewma; - vm_paddr_t IdlePTDnewma[4]; - pd_entry_t *IdlePDPTnew, *IdlePTDnew; - vm_paddr_t IdlePTDma[4]; -#else - vm_paddr_t IdlePTDma[1]; -#endif - unsigned long i; - int ncpus = MAXCPU; - - nkpt = min( - min( - max((startinfo->nr_pages >> NPGPTD_SHIFT), nkpt), - NPGPTD*NPDEPG - KPTDI), - (HYPERVISOR_VIRT_START - KERNBASE) >> PDRSHIFT); - - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); -#ifdef notyet - /* - * need to install handler - */ - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify); -#endif - xen_start_info = startinfo; - HYPERVISOR_start_info = startinfo; - xen_phys_machine = (xen_pfn_t *)startinfo->mfn_list; - - IdlePTD = (pd_entry_t *)((uint8_t *)startinfo->pt_base + PAGE_SIZE); - l1_pages = 0; - -#ifdef PAE - l3_pages = 1; - l2_pages = 0; - IdlePDPT = (pd_entry_t *)startinfo->pt_base; - IdlePDPTma = VTOM(startinfo->pt_base); - for (i = (KERNBASE >> 30); - (i < 4) && (IdlePDPT[i] != 0); i++) - l2_pages++; - /* - * Note that only one page directory has been allocated at this point. - * Thus, if KERNBASE - */ - for (i = 0; i < l2_pages; i++) - IdlePTDma[i] = VTOM(IdlePTD + i*PAGE_SIZE); - - l2_pages = (l2_pages == 0) ? 1 : l2_pages; -#else - l3_pages = 0; - l2_pages = 1; -#endif - for (i = (((KERNBASE>>18) & PAGE_MASK)>>PAGE_SHIFT); - (i>PDRSHIFT)); i++) { - - if (IdlePTD[i] == 0) - break; - l1_pages++; - } - - /* number of pages allocated after the pts + 1*/; - cur_space = xen_start_info->pt_base + - (l3_pages + l2_pages + l1_pages + 1)*PAGE_SIZE; - - xc_printf("initvalues(): wooh - availmem=%x,%x\n", avail_space, - cur_space); - - xc_printf("KERNBASE=%x,pt_base=%lx, VTOPFN(base)=%x, nr_pt_frames=%lx\n", - KERNBASE,xen_start_info->pt_base, VTOPFN(xen_start_info->pt_base), - xen_start_info->nr_pt_frames); - xendebug_flags = 0; /* 0xffffffff; */ - -#ifdef ADD_ISA_HOLE - shift_phys_machine(xen_phys_machine, xen_start_info->nr_pages); -#endif - XENPRINTF("IdlePTD %p\n", IdlePTD); - XENPRINTF("nr_pages: %ld shared_info: 0x%lx flags: 0x%x pt_base: 0x%lx " - "mod_start: 0x%lx mod_len: 0x%lx\n", - xen_start_info->nr_pages, xen_start_info->shared_info, - xen_start_info->flags, xen_start_info->pt_base, - xen_start_info->mod_start, xen_start_info->mod_len); - -#ifdef PAE - IdlePDPTnew = (pd_entry_t *)cur_space; cur_space += PAGE_SIZE; - bzero(IdlePDPTnew, PAGE_SIZE); - - IdlePDPTnewma = VTOM(IdlePDPTnew); - IdlePTDnew = (pd_entry_t *)cur_space; cur_space += 4*PAGE_SIZE; - bzero(IdlePTDnew, 4*PAGE_SIZE); - - for (i = 0; i < 4; i++) - IdlePTDnewma[i] = VTOM((uint8_t *)IdlePTDnew + i*PAGE_SIZE); - /* - * L3 - * - * Copy the 4 machine addresses of the new PTDs in to the PDPT - * - */ - for (i = 0; i < 4; i++) - IdlePDPTnew[i] = IdlePTDnewma[i] | PG_V; - - __asm__("nop;"); - /* - * - * re-map the new PDPT read-only - */ - PT_SET_MA(IdlePDPTnew, IdlePDPTnewma | PG_V); - /* - * - * Unpin the current PDPT - */ - xen_pt_unpin(IdlePDPTma); - -#endif /* PAE */ - - /* Map proc0's KSTACK */ - proc0kstack = cur_space; cur_space += (KSTACK_PAGES * PAGE_SIZE); - xc_printf("proc0kstack=%u\n", proc0kstack); - - /* vm86/bios stack */ - cur_space += PAGE_SIZE; - - /* Map space for the vm86 region */ - vm86paddr = (vm_offset_t)cur_space; - cur_space += (PAGE_SIZE * 3); - - /* allocate 4 pages for bootmem allocator */ - bootmem_start = bootmem_current = (char *)cur_space; - cur_space += (4 * PAGE_SIZE); - bootmem_end = (char *)cur_space; - - /* allocate pages for gdt */ - gdt = (union descriptor *)cur_space; - cur_space += PAGE_SIZE*ncpus; - - /* allocate page for ldt */ - ldt = (union descriptor *)cur_space; cur_space += PAGE_SIZE; - cur_space += PAGE_SIZE; - - /* unmap remaining pages from initial chunk - * - */ - for (tmpva = cur_space; tmpva < (((uint32_t)&kernbase) + (l1_pages<> 18)), - ((uint8_t *)IdlePTD) + ((KERNBASE >> 18) & PAGE_MASK), - l1_pages*sizeof(pt_entry_t)); - - for (i = 0; i < 4; i++) { - PT_SET_MA((uint8_t *)IdlePTDnew + i*PAGE_SIZE, - IdlePTDnewma[i] | PG_V); - } - xen_load_cr3(VTOP(IdlePDPTnew)); - xen_pgdpt_pin(VTOM(IdlePDPTnew)); - - /* allocate remainder of nkpt pages */ - cur_space_pt = cur_space; - for (offset = (KERNBASE >> PDRSHIFT), i = l1_pages; i < nkpt; - i++, cur_space += PAGE_SIZE) { - pdir = (offset + i) / NPDEPG; - curoffset = ((offset + i) % NPDEPG); - if (((offset + i) << PDRSHIFT) == VM_MAX_KERNEL_ADDRESS) - break; - - /* - * make sure that all the initial page table pages - * have been zeroed - */ - PT_SET_MA(cur_space, VTOM(cur_space) | PG_V | PG_RW); - bzero((char *)cur_space, PAGE_SIZE); - PT_SET_MA(cur_space, (vm_paddr_t)0); - xen_pt_pin(VTOM(cur_space)); - xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[pdir] + - curoffset*sizeof(vm_paddr_t)), - VTOM(cur_space) | PG_KERNEL); - PT_UPDATES_FLUSH(); - } - - for (i = 0; i < 4; i++) { - pdir = (PTDPTDI + i) / NPDEPG; - curoffset = (PTDPTDI + i) % NPDEPG; - - xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[pdir] + - curoffset*sizeof(vm_paddr_t)), - IdlePTDnewma[i] | PG_V); - } - - PT_UPDATES_FLUSH(); - - IdlePTD = IdlePTDnew; - IdlePDPT = IdlePDPTnew; - IdlePDPTma = IdlePDPTnewma; - - HYPERVISOR_shared_info = (shared_info_t *)cur_space; - cur_space += PAGE_SIZE; - - xen_store = (struct xenstore_domain_interface *)cur_space; - cur_space += PAGE_SIZE; - - console_page = (char *)cur_space; - cur_space += PAGE_SIZE; - - /* - * shared_info is an unsigned long so this will randomly break if - * it is allocated above 4GB - I guess people are used to that - * sort of thing with Xen ... sigh - */ - shinfo = xen_start_info->shared_info; - PT_SET_MA(HYPERVISOR_shared_info, shinfo | PG_KERNEL); - - xc_printf("#4\n"); - - xen_store_ma = (((vm_paddr_t)xen_start_info->store_mfn) << PAGE_SHIFT); - PT_SET_MA(xen_store, xen_store_ma | PG_KERNEL); - console_page_ma = (((vm_paddr_t)xen_start_info->console.domU.mfn) << PAGE_SHIFT); - PT_SET_MA(console_page, console_page_ma | PG_KERNEL); - - xc_printf("#5\n"); - - set_iopl.iopl = 1; - PANIC_IF(HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl)); - xc_printf("#6\n"); -#if 0 - /* add page table for KERNBASE */ - xen_queue_pt_update(IdlePTDma + KPTDI*sizeof(vm_paddr_t), - VTOM(cur_space) | PG_KERNEL); - xen_flush_queue(); -#ifdef PAE - xen_queue_pt_update(pdir_shadow_ma[3] + KPTDI*sizeof(vm_paddr_t), - VTOM(cur_space) | PG_V | PG_A); -#else - xen_queue_pt_update(pdir_shadow_ma + KPTDI*sizeof(vm_paddr_t), - VTOM(cur_space) | PG_V | PG_A); -#endif - xen_flush_queue(); - cur_space += PAGE_SIZE; - xc_printf("#6\n"); -#endif /* 0 */ -#ifdef notyet - if (xen_start_info->flags & SIF_INITDOMAIN) { - /* Map first megabyte */ - for (i = 0; i < (256 << PAGE_SHIFT); i += PAGE_SIZE) - PT_SET_MA(KERNBASE + i, i | PG_KERNEL | PG_NC_PCD); - xen_flush_queue(); - } -#endif - /* - * re-map kernel text read-only - * - */ - for (i = (((vm_offset_t)&btext) & ~PAGE_MASK); - i < (((vm_offset_t)&etext) & ~PAGE_MASK); i += PAGE_SIZE) - PT_SET_MA(i, VTOM(i) | PG_V | PG_A); - - xc_printf("#7\n"); - physfree = VTOP(cur_space); - init_first = physfree >> PAGE_SHIFT; - IdlePTD = (pd_entry_t *)VTOP(IdlePTD); - IdlePDPT = (pd_entry_t *)VTOP(IdlePDPT); - setup_xen_features(); - xc_printf("#8, proc0kstack=%u\n", proc0kstack); -} - - -trap_info_t trap_table[] = { - { 0, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(div)}, - { 1, 0|4, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dbg)}, - { 3, 3|4, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bpt)}, - { 4, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ofl)}, - /* This is UPL on Linux and KPL on BSD */ - { 5, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bnd)}, - { 6, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ill)}, - { 7, 0|4, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dna)}, - /* - * { 8, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(XXX)}, - * no handler for double fault - */ - { 9, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpusegm)}, - {10, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(tss)}, - {11, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(missing)}, - {12, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(stk)}, - {13, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(prot)}, - {14, 0|4, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(page)}, - {15, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(rsvd)}, - {16, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpu)}, - {17, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(align)}, - {18, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(mchk)}, - {19, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(xmm)}, - {0x80, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(int0x80_syscall)}, - { 0, 0, 0, 0 } -}; - -/* Perform a multicall and check that individual calls succeeded. */ -int -HYPERVISOR_multicall(struct multicall_entry * call_list, int nr_calls) -{ - int ret = 0; - int i; - - /* Perform the multicall. */ - PANIC_IF(_HYPERVISOR_multicall(call_list, nr_calls)); - - /* Check the results of individual hypercalls. */ - for (i = 0; i < nr_calls; i++) - if (__predict_false(call_list[i].result < 0)) - ret++; - if (__predict_false(ret > 0)) - panic("%d multicall(s) failed: cpu %d\n", - ret, smp_processor_id()); - - /* If we didn't panic already, everything succeeded. */ - return (0); -} - -/********** CODE WORTH KEEPING ABOVE HERE *****************/ - -void xen_failsafe_handler(void); - -void -xen_failsafe_handler(void) -{ - - panic("xen_failsafe_handler called!\n"); -} - -void xen_handle_thread_switch(struct pcb *pcb); - -/* This is called by cpu_switch() when switching threads. */ -/* The pcb arg refers to the process control block of the */ -/* next thread which is to run */ -void -xen_handle_thread_switch(struct pcb *pcb) -{ - uint32_t *a = (uint32_t *)&PCPU_GET(fsgs_gdt)[0]; - uint32_t *b = (uint32_t *)&pcb->pcb_fsd; - multicall_entry_t mcl[3]; - int i = 0; - - /* Notify Xen of task switch */ - mcl[i].op = __HYPERVISOR_stack_switch; - mcl[i].args[0] = GSEL(GDATA_SEL, SEL_KPL); - mcl[i++].args[1] = (unsigned long)pcb; - - /* Check for update of fsd */ - if (*a != *b || *(a+1) != *(b+1)) { - mcl[i].op = __HYPERVISOR_update_descriptor; - *(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a); - *(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b; - } - - a += 2; - b += 2; - - /* Check for update of gsd */ - if (*a != *b || *(a+1) != *(b+1)) { - mcl[i].op = __HYPERVISOR_update_descriptor; - *(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a); - *(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b; - } - - (void)HYPERVISOR_multicall(mcl, i); -} Index: sys/kern/kern_intr.c =================================================================== --- sys/kern/kern_intr.c +++ sys/kern/kern_intr.c @@ -1455,12 +1455,7 @@ /* Schedule the ithread if needed. */ if (thread) { error = intr_event_schedule_thread(ie); -#ifndef XEN KASSERT(error == 0, ("bad stray interrupt")); -#else - if (error != 0) - log(LOG_WARNING, "bad stray interrupt"); -#endif } critical_exit(); td->td_intr_nesting_level--; Index: sys/kern/kern_synch.c =================================================================== --- sys/kern/kern_synch.c +++ sys/kern/kern_synch.c @@ -66,12 +66,6 @@ #include -#ifdef XEN -#include -#include -#include -#endif - #define KTDSTATE(td) \ (((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \ ((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \ @@ -475,9 +469,6 @@ "lockname:\"%s\"", td->td_lockname); #endif SDT_PROBE0(sched, , , preempt); -#ifdef XEN - PT_UPDATES_FLUSH(); -#endif sched_switch(td, newtd, flags); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); Index: sys/kern/subr_param.c =================================================================== --- sys/kern/subr_param.c +++ sys/kern/subr_param.c @@ -99,11 +99,7 @@ long maxswzone; /* max swmeta KVA storage */ long maxbcache; /* max buffer cache KVA storage */ long maxpipekva; /* Limit on pipe KVA */ -#ifdef XEN -int vm_guest = VM_GUEST_XEN; -#else int vm_guest = VM_GUEST_NO; /* Running as virtual machine guest? */ -#endif u_long maxtsiz; /* max text size */ u_long dfldsiz; /* initial data size limit */ u_long maxdsiz; /* max data size */ Index: sys/kern/subr_trap.c =================================================================== --- sys/kern/subr_trap.c +++ sys/kern/subr_trap.c @@ -79,12 +79,6 @@ #include #endif -#ifdef XEN -#include -#include -#include -#endif - #ifdef HWPMC_HOOKS #include #endif @@ -135,9 +129,6 @@ * Let the scheduler adjust our priority etc. */ sched_userret(td); -#ifdef XEN - PT_UPDATES_FLUSH(); -#endif /* * Check for misbehavior. Index: sys/x86/include/segments.h =================================================================== --- sys/x86/include/segments.h +++ sys/x86/include/segments.h @@ -46,11 +46,7 @@ */ #define SEL_RPL_MASK 3 /* requester priv level */ #define ISPL(s) ((s)&3) /* priority level of a selector */ -#ifdef XEN -#define SEL_KPL 1 /* kernel priority level */ -#else #define SEL_KPL 0 /* kernel priority level */ -#endif #define SEL_UPL 3 /* user priority level */ #define ISLDT(s) ((s)&SEL_LDT) /* is it local or global */ #define SEL_LDT 4 /* local descriptor table */ @@ -244,11 +240,7 @@ #define GBIOSUTIL_SEL 16 /* BIOS interface (Utility) */ #define GBIOSARGS_SEL 17 /* BIOS interface (Arguments) */ #define GNDIS_SEL 18 /* For the NDIS layer */ -#ifdef XEN -#define NGDT 9 -#else #define NGDT 19 -#endif /* * Entries in the Local Descriptor Table (LDT) Index: sys/x86/x86/busdma_bounce.c =================================================================== --- sys/x86/x86/busdma_bounce.c +++ sys/x86/x86/busdma_bounce.c @@ -147,11 +147,6 @@ static int _bus_dmamap_reserve_pages(bus_dma_tag_t dmat, bus_dmamap_t map, int flags); -#ifdef XEN -#undef pmap_kextract -#define pmap_kextract pmap_kextract_ma -#endif - /* * Allocate a device specific dma_tag. */ Index: sys/x86/x86/cpu_machdep.c =================================================================== --- sys/x86/x86/cpu_machdep.c +++ sys/x86/x86/cpu_machdep.c @@ -100,15 +100,6 @@ #include #include -#ifdef XEN -/* XEN includes */ -#include -#include -#include -#include -#include -#endif - /* * Machine dependent boot() routine * @@ -193,33 +184,6 @@ return (0); } -#if defined(__i386__) && defined(XEN) - -static void -idle_block(void) -{ - - HYPERVISOR_sched_op(SCHEDOP_block, 0); -} - -void -cpu_halt(void) -{ - HYPERVISOR_shutdown(SHUTDOWN_poweroff); -} - -int scheduler_running; - -static void -cpu_idle_hlt(sbintime_t sbt) -{ - - scheduler_running = 1; - enable_intr(); - idle_block(); -} - -#else /* * Shutdown the CPU as much as possible */ @@ -230,8 +194,6 @@ halt(); } -#endif - void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ @@ -263,7 +225,6 @@ } #endif /* !PC98 */ -#if !defined(__i386__) || !defined(XEN) static void cpu_idle_hlt(sbintime_t sbt) { @@ -295,7 +256,6 @@ __asm __volatile("sti; hlt"); *state = STATE_RUNNING; } -#endif static void cpu_idle_mwait(sbintime_t sbt) @@ -370,7 +330,7 @@ } } -#if defined(__i386__) && (defined(PC98) || defined(XEN)) +#if defined(__i386__) && defined(PC98) void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt; #else void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; @@ -379,17 +339,15 @@ void cpu_idle(int busy) { -#if !defined(__i386__) || !defined(XEN) uint64_t msr; -#endif sbintime_t sbt = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); -#if defined(MP_WATCHDOG) && (!defined(__i386__) || !defined(XEN)) +#ifdef MP_WATCHDOG ap_watchdog(PCPU_GET(cpuid)); #endif -#if !defined(__i386__) || !defined(XEN) + /* If we are busy - try to use fast methods. */ if (busy) { if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { @@ -397,7 +355,6 @@ goto out; } } -#endif /* If we have time - switch timers into idle mode. */ if (!busy) { @@ -405,14 +362,12 @@ sbt = cpu_idleclock(); } -#if !defined(__i386__) || !defined(XEN) /* Apply AMD APIC timer C1E workaround. */ if (cpu_ident_amdc1e && cpu_disable_c3_sleep) { msr = rdmsr(MSR_AMDK8_IPM); if (msr & AMDK8_CMPHALT) wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); } -#endif /* Call main idle method. */ cpu_idle_fn(sbt); @@ -422,9 +377,7 @@ cpu_activeclock(); critical_exit(); } -#if !defined(__i386__) || !defined(XEN) out: -#endif CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } Index: sys/x86/x86/identcpu.c =================================================================== --- sys/x86/x86/identcpu.c +++ sys/x86/x86/identcpu.c @@ -1190,7 +1190,6 @@ SYSINIT(hook_tsc_freq, SI_SUB_CONFIGURE, SI_ORDER_ANY, hook_tsc_freq, NULL); -#ifndef XEN static const char *const vm_bnames[] = { "QEMU", /* QEMU */ "Plex86", /* Plex86 */ @@ -1281,7 +1280,6 @@ freeenv(p); } } -#endif /* * Final stage of CPU identification. @@ -1314,9 +1312,7 @@ cpu_feature2 = regs[2]; #endif -#ifndef XEN identify_hypervisor(); -#endif cpu_vendor_id = find_cpu_vendor_id(); /* Index: sys/x86/x86/intr_machdep.c =================================================================== --- sys/x86/x86/intr_machdep.c +++ sys/x86/x86/intr_machdep.c @@ -532,13 +532,6 @@ struct intsrc *isrc; int i; -#ifdef XEN - /* - * Doesn't work yet - */ - return; -#endif - /* Don't bother on UP. */ if (mp_ncpus == 1) return; Index: sys/x86/x86/local_apic.c =================================================================== --- sys/x86/x86/local_apic.c +++ sys/x86/x86/local_apic.c @@ -1579,17 +1579,13 @@ * Local APIC must be registered before other PICs and pseudo PICs * for proper suspend/resume order. */ -#ifndef XEN intr_register_pic(&lapic_pic); -#endif retval = best_enum->apic_setup_io(); if (retval != 0) printf("%s: Failed to setup I/O APICs: returned %d\n", best_enum->apic_name, retval); -#ifdef XEN - return; -#endif + /* * Finish setting up the local APIC on the BSP once we know * how to properly program the LINT pins. In particular, this Index: sys/x86/xen/xen_nexus.c =================================================================== --- sys/x86/xen/xen_nexus.c +++ sys/x86/xen/xen_nexus.c @@ -66,14 +66,11 @@ nexus_xen_attach(device_t dev) { int error; -#ifndef XEN device_t acpi_dev = NULL; -#endif nexus_init_resources(); bus_generic_probe(dev); -#ifndef XEN if (xen_initial_domain()) { /* Disable some ACPI devices that are not usable by Dom0 */ acpi_cpu_disabled = true; @@ -84,13 +81,10 @@ if (acpi_dev == NULL) panic("Unable to add ACPI bus to Xen Dom0"); } -#endif error = bus_generic_attach(dev); -#ifndef XEN if (xen_initial_domain() && (error == 0)) acpi_install_wakeup_handler(device_get_softc(acpi_dev)); -#endif return (error); }