Index: TODO
===================================================================
--- /dev/null
+++ TODO
@@ -0,0 +1,20 @@
++ PAE
+- no, NULL deref needs two PDEs: move base from 8M to 4M (and 2M for PAE) ?
++ ISA Hole double-mapping ?
++ NULL deref protection
++ TSS and LDT in trampoline map
+audit all KERNBASE usage:
+ + i386/i386/minidump
+ + kern/link_elf.c and kern/link_elf_obj.c
+coredump
++ LOR on copyout ("w")
++ NMI: always reload %cr3
++ MCHK: same
++ Comment explaining KVA layout.
++ lcall
++ superpage mapping of the kernel text
++ enable PTD[0] for sleep/wakeup
+
+Usermode:
++ 1. trampoline
++ 2. sys_machdep. ldt and tss allocation
\ No newline at end of file
Index: gnu/usr.bin/gdb/kgdb/trgt_i386.c
===================================================================
--- gnu/usr.bin/gdb/kgdb/trgt_i386.c
+++ gnu/usr.bin/gdb/kgdb/trgt_i386.c
@@ -279,12 +279,26 @@
 	char buf[MAX_REGISTER_SIZE];
 	struct kgdb_frame_cache *cache;
 	char *pname;
+	CORE_ADDR pcx;
+	uintptr_t addr, setidt_disp;
 
 	cache = *this_cache;
 	if (cache == NULL) {
 		cache = FRAME_OBSTACK_ZALLOC(struct kgdb_frame_cache);
 		*this_cache = cache;
-		cache->pc = frame_func_unwind(next_frame);
+		pcx = frame_func_unwind(next_frame);
+		if (pcx >= 0xffc00000) {
+			addr = kgdb_lookup("setidt_disp");
+			if (addr != 0) {
+				if (kvm_read(kvm, addr, &setidt_disp,
+				    sizeof(setidt_disp)) !=
+				    sizeof(setidt_disp))
+					warnx("kvm_read: %s", kvm_geterr(kvm));
+				else
+					pcx += setidt_disp;
+			}
+		}
+		cache->pc = pcx;
 		find_pc_partial_function(cache->pc, &pname, NULL, NULL);
 		if (pname[0] != 'X')
 			cache->frame_type = FT_NORMAL;
Index: sys/conf/files.i386
===================================================================
--- sys/conf/files.i386
+++ sys/conf/files.i386
@@ -472,6 +472,7 @@
 i386/i386/bios.c		standard
 i386/i386/bioscall.s		standard
 i386/i386/bpf_jit_machdep.c	optional bpf_jitter
+i386/i386/copyout.c		standard
 i386/i386/db_disasm.c		optional ddb
 i386/i386/db_interface.c	optional ddb
 i386/i386/db_trace.c		optional ddb
Index: sys/conf/ldscript.i386
===================================================================
--- sys/conf/ldscript.i386
+++ sys/conf/ldscript.i386
@@ -6,7 +6,7 @@
 SECTIONS
 {
   /* Read-only sections, merged into text segment: */
-  . = kernbase + kernload + SIZEOF_HEADERS;
+  . = kernbase + SIZEOF_HEADERS;
   .interp         : { *(.interp) }
   .hash           : { *(.hash) }
   .gnu.hash       : { *(.gnu.hash) }
Index: sys/dev/dcons/dcons_crom.c
===================================================================
--- sys/dev/dcons/dcons_crom.c
+++ sys/dev/dcons/dcons_crom.c
@@ -109,7 +109,11 @@
 	static off_t idt_paddr;
 	/* XXX */
+#ifdef __amd64__
 	idt_paddr = (char *)idt - (char *)KERNBASE;
+#else /* __i386__ */
+	idt_paddr = (off_t)pmap_kextract((vm_offset_t)idt);
+#endif
 	crom_add_entry(&sc->unit, DCONS_CSR_KEY_RESET_HI,
 	    ADDR_HI(idt_paddr));
 	crom_add_entry(&sc->unit, DCONS_CSR_KEY_RESET_LO,
 	    ADDR_LO(idt_paddr));
Index: sys/dev/dcons/dcons_os.c
===================================================================
--- sys/dev/dcons/dcons_os.c
+++ sys/dev/dcons/dcons_os.c
@@ -309,11 +309,16 @@
	 * Allow read/write access to dcons buffer.
*/ for (pa = trunc_page(addr); pa < addr + size; pa += PAGE_SIZE) - *vtopte(KERNBASE + pa) |= PG_RW; + *vtopte(pa) |= PG_RW; invltlb(); #endif /* XXX P to V */ +#ifdef __i386__ + dg.buf = (struct dcons_buf *)((vm_offset_t)PMAP_MAP_LOW + + addr); +#else /* __amd64__ */ dg.buf = (struct dcons_buf *)(vm_offset_t)(KERNBASE + addr); +#endif dg.size = size; if (dcons_load_buffer(dg.buf, dg.size, sc) < 0) dg.buf = NULL; Index: sys/dev/hyperv/vmbus/i386/vmbus_vector.S =================================================================== --- sys/dev/hyperv/vmbus/i386/vmbus_vector.S +++ sys/dev/hyperv/vmbus/i386/vmbus_vector.S @@ -26,11 +26,11 @@ * $FreeBSD$ */ +#include "assym.s" + #include #include -#include "assym.s" - /* * This is the Hyper-V vmbus channel direct callback interrupt. * Only used when it is running on Hyper-V. @@ -42,6 +42,7 @@ PUSH_FRAME SET_KERNEL_SREGS cld + KENTER FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp call vmbus_handle_intr Index: sys/i386/conf/NOTES =================================================================== --- sys/i386/conf/NOTES +++ sys/i386/conf/NOTES @@ -951,22 +951,6 @@ ##################################################################### # VM OPTIONS -# Disable the 4 MByte page PSE CPU feature. The PSE feature allows the -# kernel to use 4 MByte pages to map the kernel instead of 4k pages. -# This saves on the amount of memory needed for page tables needed to -# map the kernel. You should only disable this feature as a temporary -# workaround if you are having problems with it enabled. -# -#options DISABLE_PSE - -# Disable the global pages PGE CPU feature. The PGE feature allows pages -# to be marked with the PG_G bit. TLB entries for these pages are not -# flushed from the cache when %cr3 is reloaded. This can make context -# switches less expensive. You should only disable this feature as a -# temporary workaround if you are having problems with it enabled. -# -#options DISABLE_PG_G - # KSTACK_PAGES is the number of memory pages to assign to the kernel # stack of each thread. Index: sys/i386/conf/X =================================================================== --- sys/i386/conf/X +++ sys/i386/conf/X @@ -5,6 +5,7 @@ options HWPMC_HOOKS device apic device acpi +options EARLY_AP_STARTUP options INVARIANTS options INVARIANT_SUPPORT Index: sys/i386/i386/apic_vector.s =================================================================== --- sys/i386/i386/apic_vector.s +++ sys/i386/i386/apic_vector.s @@ -67,34 +67,39 @@ * translates that into a vector, and passes the vector to the * lapic_handle_intr() function. 
*/ -#define ISR_VEC(index, vec_name) \ - .text ; \ - SUPERALIGN_TEXT ; \ -IDTVEC(vec_name ## _pti) ; \ -IDTVEC(vec_name) ; \ - PUSH_FRAME ; \ - SET_KERNEL_SREGS ; \ - cld ; \ - FAKE_MCOUNT(TF_EIP(%esp)) ; \ - cmpl $0,x2apic_mode ; \ - je 1f ; \ - movl $(MSR_APIC_ISR0 + index),%ecx ; \ - rdmsr ; \ - jmp 2f ; \ -1: ; \ - movl lapic_map, %edx ;/* pointer to local APIC */ \ - movl LA_ISR + 16 * (index)(%edx), %eax ; /* load ISR */ \ -2: ; \ - bsrl %eax, %eax ; /* index of highest set bit in ISR */ \ - jz 3f ; \ - addl $(32 * index),%eax ; \ - pushl %esp ; \ - pushl %eax ; /* pass the IRQ */ \ - call lapic_handle_intr ; \ - addl $8, %esp ; /* discard parameter */ \ -3: ; \ - MEXITCOUNT ; \ + .macro ISR_VEC index, vec_name + .text + SUPERALIGN_TEXT + .globl X\()\vec_name\()_pti, X\()\vec_name + +X\()\vec_name\()_pti: +X\()\vec_name: + PUSH_FRAME + SET_KERNEL_SREGS + cld + KENTER + FAKE_MCOUNT(TF_EIP(%esp)) + cmpl $0,x2apic_mode + je 2f + movl $(MSR_APIC_ISR0 + \index),%ecx + rdmsr + jmp 3f +2: + movl lapic_map, %edx /* pointer to local APIC */ + movl LA_ISR + 16 * \index(%edx), %eax /* load ISR */ +3: + bsrl %eax, %eax /* index of highest set bit in ISR */ + jz 4f + addl $(32 * \index),%eax + pushl %esp + pushl %eax /* pass the IRQ */ + movl $lapic_handle_intr, %eax + call *%eax + addl $8, %esp /* discard parameter */ +4: + MEXITCOUNT jmp doreti + .endm /* * Handle "spurious INTerrupts". @@ -111,13 +116,13 @@ iret - ISR_VEC(1, apic_isr1) - ISR_VEC(2, apic_isr2) - ISR_VEC(3, apic_isr3) - ISR_VEC(4, apic_isr4) - ISR_VEC(5, apic_isr5) - ISR_VEC(6, apic_isr6) - ISR_VEC(7, apic_isr7) + ISR_VEC 1, apic_isr1 + ISR_VEC 2, apic_isr2 + ISR_VEC 3, apic_isr3 + ISR_VEC 4, apic_isr4 + ISR_VEC 5, apic_isr5 + ISR_VEC 6, apic_isr6 + ISR_VEC 7, apic_isr7 /* * Local APIC periodic timer handler. 
@@ -129,9 +134,11 @@ PUSH_FRAME SET_KERNEL_SREGS cld + KENTER FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp - call lapic_handle_timer + movl $lapic_handle_timer, %eax + call *%eax add $4, %esp MEXITCOUNT jmp doreti @@ -146,8 +153,10 @@ PUSH_FRAME SET_KERNEL_SREGS cld + KENTER FAKE_MCOUNT(TF_EIP(%esp)) - call lapic_handle_cmc + movl $lapic_handle_cmc, %eax + call *%eax MEXITCOUNT jmp doreti @@ -161,8 +170,10 @@ PUSH_FRAME SET_KERNEL_SREGS cld + KENTER FAKE_MCOUNT(TF_EIP(%esp)) - call lapic_handle_error + movl $lapic_handle_error, %eax + call *%eax MEXITCOUNT jmp doreti @@ -177,9 +188,11 @@ PUSH_FRAME SET_KERNEL_SREGS cld + KENTER FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp - call xen_intr_handle_upcall + movl $xen_intr_handle_upcall, %eax + call *%eax add $4, %esp MEXITCOUNT jmp doreti @@ -200,9 +213,9 @@ PUSH_FRAME SET_KERNEL_SREGS cld - - call invltlb_handler - + KENTER + movl $invltlb_handler, %eax + call *%eax jmp invltlb_ret /* @@ -214,9 +227,9 @@ PUSH_FRAME SET_KERNEL_SREGS cld - - call invlpg_handler - + KENTER + movl $invlpg_handler, %eax + call *%eax jmp invltlb_ret /* @@ -228,9 +241,9 @@ PUSH_FRAME SET_KERNEL_SREGS cld - - call invlrng_handler - + KENTER + movl $invlrng_handler, %eax + call *%eax jmp invltlb_ret /* @@ -242,9 +255,9 @@ PUSH_FRAME SET_KERNEL_SREGS cld - - call invlcache_handler - + KENTER + movl $invlcache_handler, %eax + call *%eax jmp invltlb_ret /* @@ -256,12 +269,11 @@ PUSH_FRAME SET_KERNEL_SREGS cld - + KENTER call as_lapic_eoi - FAKE_MCOUNT(TF_EIP(%esp)) - - call ipi_bitmap_handler + movl $ipi_bitmap_handler, %eax + call *%eax MEXITCOUNT jmp doreti @@ -274,9 +286,10 @@ PUSH_FRAME SET_KERNEL_SREGS cld - + KENTER call as_lapic_eoi - call cpustop_handler + movl $cpustop_handler, %eax + call *%eax jmp doreti /* @@ -288,9 +301,10 @@ PUSH_FRAME SET_KERNEL_SREGS cld - + KENTER call as_lapic_eoi - call cpususpend_handler + movl $cpususpend_handler, %eax + call *%eax jmp doreti /* @@ -304,14 +318,14 @@ PUSH_FRAME SET_KERNEL_SREGS cld - + KENTER #ifdef COUNT_IPIS movl PCPU(CPUID), %eax movl ipi_rendezvous_counts(,%eax,4), %eax incl (%eax) #endif - call smp_rendezvous_action - + movl $smp_rendezvous_action, %eax + call *%eax call as_lapic_eoi jmp doreti Index: sys/i386/i386/atpic_vector.s =================================================================== --- sys/i386/i386/atpic_vector.s +++ sys/i386/i386/atpic_vector.s @@ -43,37 +43,41 @@ /* * Macros for interrupt entry, call to handler, and exit. 
*/ -#define INTR(irq_num, vec_name) \ - .text ; \ - SUPERALIGN_TEXT ; \ -IDTVEC(vec_name ##_pti) ; \ -IDTVEC(vec_name) ; \ - PUSH_FRAME ; \ - SET_KERNEL_SREGS ; \ - cld ; \ -; \ - FAKE_MCOUNT(TF_EIP(%esp)) ; \ - pushl %esp ; \ - pushl $irq_num; /* pass the IRQ */ \ - call atpic_handle_intr ; \ - addl $8, %esp ; /* discard the parameters */ \ -; \ - MEXITCOUNT ; \ + .macro INTR irq_num, vec_name + .text + SUPERALIGN_TEXT + .globl X\()\vec_name\()_pti, X\()\vec_name + +X\()\vec_name\()_pti: +X\()\vec_name: + PUSH_FRAME + SET_KERNEL_SREGS + cld + KENTER + FAKE_MCOUNT(TF_EIP(%esp)) + pushl %esp + pushl $\irq_num /* pass the IRQ */ + movl $atpic_handle_intr, %eax + call *%eax + addl $8, %esp /* discard the parameters */ + + MEXITCOUNT jmp doreti + .endm - INTR(0, atpic_intr0) - INTR(1, atpic_intr1) - INTR(2, atpic_intr2) - INTR(3, atpic_intr3) - INTR(4, atpic_intr4) - INTR(5, atpic_intr5) - INTR(6, atpic_intr6) - INTR(7, atpic_intr7) - INTR(8, atpic_intr8) - INTR(9, atpic_intr9) - INTR(10, atpic_intr10) - INTR(11, atpic_intr11) - INTR(12, atpic_intr12) - INTR(13, atpic_intr13) - INTR(14, atpic_intr14) - INTR(15, atpic_intr15) + INTR 0, atpic_intr0 + INTR 1, atpic_intr1 + INTR 2, atpic_intr2 + INTR 3, atpic_intr3 + INTR 4, atpic_intr4 + INTR 5, atpic_intr5 + INTR 6, atpic_intr6 + INTR 7, atpic_intr7 + INTR 8, atpic_intr8 + INTR 9, atpic_intr9 + INTR 10, atpic_intr10 + INTR 11, atpic_intr11 + INTR 12, atpic_intr12 + INTR 13, atpic_intr13 + INTR 14, atpic_intr14 + INTR 15, atpic_intr15 Index: sys/i386/i386/bios.c =================================================================== --- sys/i386/i386/bios.c +++ sys/i386/i386/bios.c @@ -305,6 +305,7 @@ } extern int vm86pa; +extern u_long vm86phystk; extern void bios16_jmp(void); /* @@ -329,7 +330,7 @@ int flags = BIOSCODE_FLAG | BIOSDATA_FLAG; u_int i, arg_start, arg_end; pt_entry_t *pte; - pd_entry_t *ptd; + pd_entry_t *ptd, orig_ptd; arg_start = 0xffffffff; arg_end = 0; @@ -390,27 +391,14 @@ args->seg.code32.base = (u_int)&bios16_jmp & PG_FRAME; args->seg.code32.limit = 0xffff; - ptd = (pd_entry_t *)rcr3(); -#if defined(PAE) || defined(PAE_TABLES) - if (ptd == IdlePDPT) -#else - if (ptd == IdlePTD) -#endif - { - /* - * no page table, so create one and install it. - */ - pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK); - ptd = (pd_entry_t *)((u_int)IdlePTD + KERNBASE); - *pte = (vm86pa - PAGE_SIZE) | PG_RW | PG_V; - *ptd = vtophys(pte) | PG_RW | PG_V; - } else { - /* - * this is a user-level page table - */ - pte = PTmap; - *pte = (vm86pa - PAGE_SIZE) | PG_RW | PG_V; - } + /* + * no page table, so create one and install it. + */ + pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK); + ptd = IdlePTD; + *pte = vm86phystk | PG_RW | PG_V; + orig_ptd = *ptd; + *ptd = vtophys(pte) | PG_RW | PG_V; pmap_invalidate_all(kernel_pmap); /* XXX insurance for now */ stack_top = stack; @@ -464,20 +452,12 @@ i = bios16_call(&args->r, stack_top); - if (pte == PTmap) { - *pte = 0; /* remove entry */ - /* - * XXX only needs to be invlpg(0) but that doesn't work on the 386 - */ - pmap_invalidate_all(kernel_pmap); - } else { - *ptd = 0; /* remove page table */ - /* - * XXX only needs to be invlpg(0) but that doesn't work on the 386 - */ - pmap_invalidate_all(kernel_pmap); - free(pte, M_TEMP); /* ... and free it */ - } + *ptd = orig_ptd; /* remove page table */ + /* + * XXX only needs to be invlpg(0) but that doesn't work on the 386 + */ + pmap_invalidate_all(kernel_pmap); + free(pte, M_TEMP); /* ... 
and free it */ return (i); } Index: sys/i386/i386/copyout.c =================================================================== --- /dev/null +++ sys/i386/i386/copyout.c @@ -0,0 +1,385 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +cp_fast0(vm_offset_t uva, size_t len, bool write, + void (*f)(vm_offset_t, void *), void *arg) +{ + struct pcpu *pc; + vm_page_t m[2]; + pt_entry_t *pte; + vm_offset_t kaddr; + int error, i, plen; + bool sleepable; + + plen = howmany(uva - trunc_page(uva) + len, PAGE_SIZE); + MPASS(plen <= nitems(m)); + error = 0; + i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, uva, len, + (write ? 
VM_PROT_WRITE : VM_PROT_READ) | VM_PROT_QUICK_NOFAULT, + m, nitems(m)); + if (i != plen) + return (EFAULT); + sched_pin(); + pc = get_pcpu(); + if (curthread->td_vslock_sz > 0) { + sleepable = false; + mtx_lock(&pc->pc_copyout_mlock); + kaddr = pc->pc_copyout_maddr; + } else { + sleepable = true; + sx_xlock(&pc->pc_copyout_slock); + kaddr = pc->pc_copyout_saddr; + } + for (i = 0, pte = vtopte(kaddr); i < plen; i++, pte++) { + *pte = PG_V | PG_RW | PG_A | PG_M | VM_PAGE_TO_PHYS(m[i]) | + pmap_cache_bits(pmap_page_get_memattr(m[i]), FALSE); + invlpg(kaddr + ptoa(i)); + } + kaddr += uva - trunc_page(uva); + f(kaddr, arg); + sched_unpin(); + if (sleepable) + sx_xunlock(&pc->pc_copyout_slock); + else + mtx_unlock(&pc->pc_copyout_mlock); + for (i = 0; i < plen; i++) { + vm_page_lock(m[i]); + vm_page_unhold(m[i]); + vm_page_unlock(m[i]); + } + return (error); +} + +struct copyinstr_arg0 { + vm_offset_t kc; + size_t len; + size_t alen; + bool end; +}; + +static void +copyinstr_fast0(vm_offset_t kva, void *arg) +{ + struct copyinstr_arg0 *ca; + char c; + + ca = arg; + MPASS(ca->alen == 0 && ca->len > 0 && !ca->end); + while (ca->alen < ca->len && !ca->end) { + c = *(char *)(kva + ca->alen); + *(char *)ca->kc = c; + ca->alen++; + ca->kc++; + if (c == '\0') + ca->end = true; + } +} + +int +copyinstr(const void *udaddr, void *kaddr, size_t maxlen, size_t *lencopied) +{ + struct copyinstr_arg0 ca; + vm_offset_t uc; + size_t plen; + int error; + + error = 0; + ca.end = false; + for (plen = 0, uc = (vm_offset_t)udaddr, ca.kc = (vm_offset_t)kaddr; + plen < maxlen && !ca.end; uc += ca.alen, plen += ca.alen) { + ca.len = round_page(uc) - uc; + if (ca.len == 0) + ca.len = PAGE_SIZE; + if (plen + ca.len > maxlen) + ca.len = maxlen - plen; + ca.alen = 0; + if (cp_fast0(uc, ca.len, false, copyinstr_fast0, &ca) != 0) { + error = EFAULT; + break; + } + } + if (!ca.end && plen == maxlen && error == 0) + error = ENAMETOOLONG; + if (lencopied != NULL) + *lencopied = plen; + return (error); +} + +struct copyin_arg0 { + vm_offset_t kc; + size_t len; +}; + +static void +copyin_fast0(vm_offset_t kva, void *arg) +{ + struct copyin_arg0 *ca; + + ca = arg; + bcopy((void *)kva, (void *)ca->kc, ca->len); +} + +int +copyin(const void *udaddr, void *kaddr, size_t len) +{ + struct copyin_arg0 ca; + vm_offset_t uc; + size_t plen; + + for (plen = 0, uc = (vm_offset_t)udaddr, ca.kc = (vm_offset_t)kaddr; + plen < len; uc += ca.len, ca.kc += ca.len, plen += ca.len) { + ca.len = round_page(uc) - uc; + if (ca.len == 0) + ca.len = PAGE_SIZE; + if (plen + ca.len > len) + ca.len = len - plen; + if (cp_fast0(uc, ca.len, false, copyin_fast0, &ca) != 0) + return (EFAULT); + } + return (0); +} + +static void +copyout_fast0(vm_offset_t kva, void *arg) +{ + struct copyin_arg0 *ca; + + ca = arg; + bcopy((void *)ca->kc, (void *)kva, ca->len); +} + +int +copyout(const void *kaddr, void *udaddr, size_t len) +{ + struct copyin_arg0 ca; + vm_offset_t uc; + size_t plen; + + for (plen = 0, uc = (vm_offset_t)udaddr, ca.kc = (vm_offset_t)kaddr; + plen < len; uc += ca.len, ca.kc += ca.len, plen += ca.len) { + ca.len = round_page(uc) - uc; + if (ca.len == 0) + ca.len = PAGE_SIZE; + if (plen + ca.len > len) + ca.len = len - plen; + if (cp_fast0(uc, ca.len, true, copyout_fast0, &ca) != 0) + return (EFAULT); + } + return (0); +} + +static void +fubyte_fast0(vm_offset_t kva, void *arg) +{ + + *(int *)arg = *(u_char *)kva; +} + +int +fubyte(volatile const void *base) +{ + int res; + + if (cp_fast0((vm_offset_t)base, sizeof(char), false, fubyte_fast0, + &res) 
!= 0) + return (-1); + return (res); +} + +static void +fuword16_fast0(vm_offset_t kva, void *arg) +{ + + *(int *)arg = *(uint16_t *)kva; +} + +int +fuword16(volatile const void *base) +{ + int res; + + if (cp_fast0((vm_offset_t)base, sizeof(uint16_t), false, fuword16_fast0, + &res) != 0) + return (-1); + return (res); +} + +static void +fueword_fast0(vm_offset_t kva, void *arg) +{ + + *(uint32_t *)arg = *(uint32_t *)kva; +} + +int +fueword(volatile const void *base, long *val) +{ + uint32_t res; + + if (cp_fast0((vm_offset_t)base, sizeof(long), false, fueword_fast0, + &res) != 0) + return (-1); + *val = res; + return (0); +} + +int +fueword32(volatile const void *base, int32_t *val) +{ + uint32_t res; + + if (cp_fast0((vm_offset_t)base, sizeof(int32_t), false, fueword_fast0, + &res) != 0) + return (-1); + *val = res; + return (0); +} + +static void +subyte_fast0(vm_offset_t kva, void *arg) +{ + + *(u_char *)kva = *(int *)arg; +} + +int +subyte(volatile void *base, int byte) +{ + + return (cp_fast0((vm_offset_t)base, sizeof(u_char), true, subyte_fast0, + &byte) != 0 ? -1 : 0); +} + +static void +suword16_fast0(vm_offset_t kva, void *arg) +{ + + *(int *)kva = *(uint16_t *)arg; +} + +int +suword16(volatile void *base, int word) +{ + + return (cp_fast0((vm_offset_t)base, sizeof(int16_t), true, + suword16_fast0, &word) != 0 ? -1 : 0); +} + +static void +suword_fast0(vm_offset_t kva, void *arg) +{ + + *(int *)kva = *(uint32_t *)arg; +} + +int +suword(volatile void *base, long word) +{ + + return (cp_fast0((vm_offset_t)base, sizeof(long), true, + suword_fast0, &word) != 0 ? -1 : 0); +} + +int +suword32(volatile void *base, int32_t word) +{ + + + return (cp_fast0((vm_offset_t)base, sizeof(int32_t), true, + suword_fast0, &word) != 0 ? -1 : 0); +} + +struct casueword_arg0 { + uint32_t oldval; + uint32_t newval; +}; + +static void +casueword_fast0(vm_offset_t kva, void *arg) +{ + struct casueword_arg0 *ca; + + ca = arg; + atomic_fcmpset_int((u_int *)kva, &ca->oldval, ca->newval); +} + +int +casueword32(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp, + uint32_t newval) +{ + struct casueword_arg0 ca; + int res; + + ca.oldval = oldval; + ca.newval = newval; + res = cp_fast0((vm_offset_t)base, sizeof(int32_t), true, + casueword_fast0, &ca); + if (res == 0) { + *oldvalp = ca.oldval; + return (0); + } + return (-1); +} + +int +casueword(volatile u_long *base, u_long oldval, u_long *oldvalp, u_long newval) +{ + struct casueword_arg0 ca; + int res; + + ca.oldval = oldval; + ca.newval = newval; + res = cp_fast0((vm_offset_t)base, sizeof(int32_t), true, + casueword_fast0, &ca); + if (res == 0) { + *oldvalp = ca.oldval; + return (0); + } + return (-1); +} Index: sys/i386/i386/db_interface.c =================================================================== --- sys/i386/i386/db_interface.c +++ sys/i386/i386/db_interface.c @@ -115,4 +115,7 @@ db_printf("APIC ID = %d\n", pc->pc_apic_id); db_printf("currentldt = 0x%x\n", pc->pc_currentldt); + db_printf("trampstk = 0x%x\n", pc->pc_trampstk); + db_printf("kesp0 = 0x%x\n", pc->pc_kesp0); + db_printf("common_tssp = 0x%x\n", (u_int)pc->pc_common_tssp); } Index: sys/i386/i386/db_trace.c =================================================================== --- sys/i386/i386/db_trace.c +++ sys/i386/i386/db_trace.c @@ -317,7 +317,12 @@ * actually made the call. 
*/ frame_type = NORMAL; - sym = db_search_symbol(eip - 1, DB_STGY_ANY, &offset); + if (eip >= PMAP_TRM_MIN_ADDRESS) { + sym = db_search_symbol(eip - 1 - setidt_disp, DB_STGY_ANY, + &offset); + } else { + sym = db_search_symbol(eip - 1, DB_STGY_ANY, &offset); + } db_symbol_values(sym, &name, NULL); if (name != NULL) { if (strcmp(name, "calltrap") == 0 || @@ -357,9 +362,9 @@ * switch to a known good state. */ if (frame_type == DOUBLE_FAULT) { - esp = PCPU_GET(common_tss.tss_esp); - eip = PCPU_GET(common_tss.tss_eip); - ebp = PCPU_GET(common_tss.tss_ebp); + esp = PCPU_GET(common_tssp)->tss_esp; + eip = PCPU_GET(common_tssp)->tss_eip; + ebp = PCPU_GET(common_tssp)->tss_ebp; db_printf( "--- trap 0x17, eip = %#r, esp = %#r, ebp = %#r ---\n", eip, esp, ebp); @@ -379,29 +384,26 @@ else tf = (struct trapframe *)((int)*fp + 12); - if (INKERNEL((int) tf)) { - esp = get_esp(tf); - eip = tf->tf_eip; - ebp = tf->tf_ebp; - switch (frame_type) { - case TRAP: - db_printf("--- trap %#r", tf->tf_trapno); - break; - case SYSCALL: - db_printf("--- syscall"); - decode_syscall(tf->tf_eax, td); - break; - case TRAP_TIMERINT: - case TRAP_INTERRUPT: - case INTERRUPT: - db_printf("--- interrupt"); - break; - default: - panic("The moon has moved again."); - } - db_printf(", eip = %#r, esp = %#r, ebp = %#r ---\n", eip, - esp, ebp); + esp = get_esp(tf); + eip = tf->tf_eip; + ebp = tf->tf_ebp; + switch (frame_type) { + case TRAP: + db_printf("--- trap %#r", tf->tf_trapno); + break; + case SYSCALL: + db_printf("--- syscall"); + decode_syscall(tf->tf_eax, td); + break; + case TRAP_TIMERINT: + case TRAP_INTERRUPT: + case INTERRUPT: + db_printf("--- interrupt"); + break; + default: + panic("The moon has moved again."); } + db_printf(", eip = %#r, esp = %#r, ebp = %#r ---\n", eip, esp, ebp); *ip = (db_addr_t) eip; *fp = (struct i386_frame *) ebp; @@ -432,6 +434,10 @@ return (0); } + /* 'frame' can be null initially. Just print the pc then. */ + if (frame == NULL) + goto out; + /* * If an indirect call via an invalid pointer caused a trap, * %pc contains the invalid address while the return address @@ -540,15 +546,21 @@ db_nextframe(&frame, &pc, td); - if (INKERNEL((int)pc) && !INKERNEL((int) frame)) { +out: + /* + * 'frame' can be null here, either because it was initially + * null or because db_nextframe() found no frame. + * db_nextframe() may also have found a non-kernel frame. + * !INKERNEL() classifies both. Stop tracing if either, + * after printing the pc if it is the kernel. + */ + if (INKERNEL((int)pc) && (!INKERNEL((int)frame) || + frame == NULL || frame <= actframe)) { sym = db_search_symbol(pc, DB_STGY_ANY, &offset); db_symbol_values(sym, &name, NULL); db_print_stack_entry(name, 0, 0, 0, pc, frame); break; } - if (!INKERNEL((int) frame)) { - break; - } } return (0); Index: sys/i386/i386/elf_machdep.c =================================================================== --- sys/i386/i386/elf_machdep.c +++ sys/i386/i386/elf_machdep.c @@ -137,7 +137,6 @@ (sysinit_cfunc_t) elf32_insert_brand_entry, &kfreebsd_brand_info); - void elf32_dump_thread(struct thread *td, void *dst, size_t *off) { Index: sys/i386/i386/exception.s =================================================================== --- sys/i386/i386/exception.s +++ sys/i386/i386/exception.s @@ -1,7 +1,7 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. - * Copyright (c) 2007 The FreeBSD Foundation + * Copyright (c) 2007, 2018 The FreeBSD Foundation * All rights reserved. 
* * Portions of this software were developed by A. Joseph Koshy under @@ -38,15 +38,12 @@ #include "opt_atpic.h" #include "opt_hwpmc_hooks.h" +#include "assym.s" + #include #include #include -#include "assym.s" - -#define SEL_RPL_MASK 0x0003 -#define GSEL_KPL 0x0020 /* GSEL(GCODE_SEL, SEL_KPL) */ - #ifdef KDTRACE_HOOKS .bss .globl dtrace_invop_jump_addr @@ -63,9 +60,10 @@ .zero 8 #endif .text -#ifdef HWPMC_HOOKS - ENTRY(start_exceptions) -#endif +ENTRY(start_exceptions) + .globl tramp_idleptd +tramp_idleptd: .long 0 + /*****************************************************************************/ /* Trap handling */ /*****************************************************************************/ @@ -92,6 +90,10 @@ * must restore them prior to calling 'iret'. The cpu adjusts the %cs and * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we * must load them with appropriate values for supervisor mode operation. + * + * This code is not executed at the address of linking. It is copied to the + * trampoline area. As the consequence, all code there and in included files + * must be PIC. */ MCOUNT_LABEL(user) @@ -103,8 +105,6 @@ pushl $0; TRAP(T_DIVIDE) IDTVEC(dbg) pushl $0; TRAP(T_TRCTRAP) -IDTVEC(nmi) - pushl $0; TRAP(T_NMI) IDTVEC(bpt) pushl $0; TRAP(T_BPTFLT) IDTVEC(dtrace_ret) @@ -124,15 +124,16 @@ IDTVEC(tss) TRAP(T_TSSFLT) IDTVEC(missing) - TRAP(T_SEGNPFLT) + pushl $T_SEGNPFLT + jmp irettraps IDTVEC(stk) - TRAP(T_STKFLT) + pushl $T_STKFLT + jmp irettraps IDTVEC(prot) - TRAP(T_PROTFLT) + pushl $T_PROTFLT + jmp irettraps IDTVEC(page) TRAP(T_PAGEFLT) -IDTVEC(mchk) - pushl $0; TRAP(T_MCHK) IDTVEC(rsvd_pti) IDTVEC(rsvd) pushl $0; TRAP(T_RESERVED) @@ -144,7 +145,8 @@ pushl $0; TRAP(T_XMMFLT) /* - * All traps except ones for syscalls jump to alltraps. If + * All traps except ones for syscalls or invalid segment, + * jump to alltraps. If * interrupts were enabled when the trap occurred, then interrupts * are enabled now if the trap was through a trap gate, else * disabled if the trap was through an interrupt gate. Note that @@ -156,20 +158,16 @@ .globl alltraps .type alltraps,@function alltraps: - pushal - pushl $0 - movw %ds,(%esp) - pushl $0 - movw %es,(%esp) - pushl $0 - movw %fs,(%esp) + PUSH_FRAME2 alltraps_with_regs_pushed: SET_KERNEL_SREGS cld + KENTER FAKE_MCOUNT(TF_EIP(%esp)) calltrap: pushl %esp - call trap + movl $trap,%eax + call *%eax add $4, %esp /* @@ -178,20 +176,43 @@ MEXITCOUNT jmp doreti + .globl irettraps + .type irettraps,@function +irettraps: + PUSH_FRAME2 + SET_KERNEL_SREGS + cld + /* XXXKIB vm86 */ + testb $SEL_RPL_MASK, TF_CS(%esp) + jnz 2f + call 1f +1: popl %ebx + leal (doreti_iret - 1b)(%ebx), %ecx + cmpl %ecx, TF_EIP(%esp) + je 2f + leal (doreti_popl_ds - 1b)(%ebx), %ecx + cmpl %ecx, TF_EIP(%esp) + je 2f + leal (doreti_popl_es - 1b)(%ebx), %ecx + cmpl %ecx, TF_EIP(%esp) + je 2f + leal (doreti_popl_fs - 1b)(%ebx), %ecx + cmpl %ecx, TF_EIP(%esp) + je 2f + /* kernel mode */ + FAKE_MCOUNT(TF_EIP(%esp)) + jmp calltrap +2: /* user mode, or kernel mode with user %cr3 and trampoline stack */ + MOVE_STACKS + FAKE_MCOUNT(TF_EIP(%esp)) + jmp calltrap + /* * Privileged instruction fault. */ #ifdef KDTRACE_HOOKS SUPERALIGN_TEXT IDTVEC(ill) - /* - * Check if a DTrace hook is registered. The default (data) segment - * cannot be used for this since %ds is not known good until we - * verify that the entry was from kernel mode. - */ - cmpl $0,%ss:dtrace_invop_jump_addr - je norm_ill - /* * Check if this is a user fault. If so, just handle it as a normal * trap. 
@@ -201,6 +222,13 @@ testl $PSL_VM, 8(%esp) /* and vm86 mode. */ jnz norm_ill + /* + * Check if a DTrace hook is registered. The trampoline cannot + * be instrumented. + */ + cmpl $0, dtrace_invop_jump_addr + je norm_ill + /* * This is a kernel instruction fault that might have been caused * by a DTrace provider. @@ -221,43 +249,39 @@ * Process the instruction fault in the normal way. */ norm_ill: - pushl $0 - TRAP(T_PRIVINFLT) + pushl $0 + pushl $T_PRIVINFLT + jmp alltraps #endif -/* - * Call gate entry for syscalls (lcall 7,0). - * This is used by FreeBSD 1.x a.out executables and "old" NetBSD executables. - * - * The intersegment call has been set up to specify one dummy parameter. - * This leaves a place to put eflags so that the call frame can be - * converted to a trap frame. Note that the eflags is (semi-)bogusly - * pushed into (what will be) tf_err and then copied later into the - * final spot. It has to be done this way because esp can't be just - * temporarily altered for the pushfl - an interrupt might come in - * and clobber the saved cs/eip. - */ - SUPERALIGN_TEXT -IDTVEC(lcall_syscall) - pushfl /* save eflags */ - popl 8(%esp) /* shuffle into tf_eflags */ - pushl $7 /* sizeof "lcall 7,0" */ - pushl $0 /* tf_trapno */ - pushal - pushl $0 - movw %ds,(%esp) +IDTVEC(mchk) pushl $0 - movw %es,(%esp) + pushl $T_MCHK + jmp nmi_mchk_common + +IDTVEC(nmi) pushl $0 - movw %fs,(%esp) + pushl $T_NMI +nmi_mchk_common: + PUSH_FRAME2 SET_KERNEL_SREGS cld + /* + * Save %cr3 into tf_err. There is no good place to put it. + * Always reload %cr3, since we might have interrupted the + * kernel entry or exit. + * Do not switch to the thread kernel stack, otherwise we might + * obliterate the previous context partially copied from the + * trampoline stack. + */ + movl %cr3, %eax + movl %eax, TF_ERR(%esp) + call 1f +1: popl %eax + movl (tramp_idleptd - 1b)(%eax), %eax + movl %eax, %cr3 FAKE_MCOUNT(TF_EIP(%esp)) - pushl %esp - call syscall - add $4, %esp - MEXITCOUNT - jmp doreti + jmp calltrap /* * Trap gate entry for syscalls (int 0x80). @@ -272,18 +296,15 @@ IDTVEC(int0x80_syscall) pushl $2 /* sizeof "int 0x80" */ pushl $0 /* tf_trapno */ - pushal - pushl $0 - movw %ds,(%esp) - pushl $0 - movw %es,(%esp) - pushl $0 - movw %fs,(%esp) + PUSH_FRAME2 SET_KERNEL_SREGS cld + MOVE_STACKS + sti FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp - call syscall + movl $syscall, %eax + call *%eax add $4, %esp MEXITCOUNT jmp doreti @@ -292,7 +313,8 @@ pushl %esp /* trapframe pointer */ pushl %ebx /* arg1 */ pushl %esi /* function */ - call fork_exit + movl $fork_exit, %eax + call *%eax addl $12,%esp /* cut from syscall */ @@ -393,7 +415,8 @@ je doreti_exit sti pushl %esp /* pass a pointer to the trapframe */ - call ast + movl $ast, %eax + call *%eax add $4,%esp jmp doreti_ast @@ -407,6 +430,23 @@ doreti_exit: MEXITCOUNT + cmpl $T_NMI, TF_TRAPNO(%esp) + je doreti_iret_nmi + cmpl $T_MCHK, TF_TRAPNO(%esp) + je doreti_iret_nmi + testl $SEL_RPL_MASK, TF_CS(%esp) + jz doreti_popl_fs + movl %esp, %esi + movl PCPU(TRAMPSTK), %edx + movl $TF_SZ, %ecx + subl %ecx, %edx + movl %edx, %edi + rep; movsb + movl %edx, %esp + movl PCPU(CURPCB),%eax + movl PCB_CR3(%eax), %eax + movl %eax, %cr3 + .globl doreti_popl_fs doreti_popl_fs: popl %fs @@ -422,6 +462,11 @@ doreti_iret: iret +doreti_iret_nmi: + movl TF_ERR(%esp), %eax + movl %eax, %cr3 + jmp doreti_popl_fs + /* * doreti_iret_fault and friends. 
Alternative return code for * the case where we get a fault in the doreti_exit code @@ -489,12 +534,21 @@ jz doreti_exit testl $TDP_CALLCHAIN,TD_PFLAGS(%eax) /* flagged for capture? */ jz doreti_exit + /* + * Switch to thread stack. Reset tf_trapno to not indicate NMI, + * to cause normal userspace exit. + */ + movl $T_RESERVED, TF_TRAPNO(%esp) + NMOVE_STACKS /* * Take the processor out of NMI mode by executing a fake "iret". */ pushfl pushl %cs - pushl $outofnmi + call 1f +1: popl %eax + leal (outofnmi-1b)(%eax),%eax + pushl %eax iret outofnmi: /* @@ -511,5 +565,6 @@ call *%ecx addl $12,%esp jmp doreti_ast - ENTRY(end_exceptions) #endif + +ENTRY(end_exceptions) Index: sys/i386/i386/genassym.c =================================================================== --- sys/i386/i386/genassym.c +++ sys/i386/i386/genassym.c @@ -75,6 +75,7 @@ #include #endif #include +#include #include #include #include @@ -142,6 +143,8 @@ ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); +ASSYM(PCB_EXT_TSS, offsetof(struct pcb_ext, ext_tss)); + ASSYM(PCB_FSD, offsetof(struct pcb, pcb_fsd)); ASSYM(PCB_GSD, offsetof(struct pcb, pcb_gsd)); ASSYM(PCB_VM86, offsetof(struct pcb, pcb_vm86)); @@ -165,6 +168,7 @@ ASSYM(TF_EIP, offsetof(struct trapframe, tf_eip)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags)); +ASSYM(TF_SZ, sizeof(struct trapframe)); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); #ifdef COMPAT_43 @@ -207,7 +211,7 @@ ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread)); ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); -ASSYM(PC_COMMON_TSS, offsetof(struct pcpu, pc_common_tss)); +ASSYM(PC_COMMON_TSSP, offsetof(struct pcpu, pc_common_tssp)); ASSYM(PC_COMMON_TSSD, offsetof(struct pcpu, pc_common_tssd)); ASSYM(PC_TSS_GDT, offsetof(struct pcpu, pc_tss_gdt)); ASSYM(PC_FSGS_GDT, offsetof(struct pcpu, pc_fsgs_gdt)); @@ -215,6 +219,8 @@ ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_PRIVATE_TSS, offsetof(struct pcpu, pc_private_tss)); +ASSYM(PC_KESP0, offsetof(struct pcpu, pc_kesp0)); +ASSYM(PC_TRAMPSTK, offsetof(struct pcpu, pc_trampstk)); #ifdef DEV_APIC ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL); @@ -228,6 +234,7 @@ ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); +ASSYM(VM86_STACK_SPACE, VM86_STACK_SPACE); #ifdef HWPMC_HOOKS ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN); Index: sys/i386/i386/locore.s =================================================================== --- sys/i386/i386/locore.s +++ sys/i386/i386/locore.s @@ -54,14 +54,6 @@ #include "assym.s" -/* - * XXX - * - * Note: This version greatly munged to avoid various assembler errors - * that may be fixed in newer versions of gas. Perhaps newer versions - * will have more pleasant appearance. - */ - /* * PTmap is recursive pagemap at top of virtual address space. * Within PTmap, the page directory can be found (third indirection). @@ -72,7 +64,7 @@ .set PTDpde,PTD + (PTDPTDI * PDESIZE) /* - * Compiled KERNBASE location and the kernel load address + * Compiled KERNBASE location and the kernel load address, now identical. 
*/ .globl kernbase .set kernbase,KERNBASE @@ -91,83 +83,6 @@ .globl bootinfo bootinfo: .space BOOTINFO_SIZE /* bootinfo that we can handle */ - .globl KERNend -KERNend: .long 0 /* phys addr end of kernel (just after bss) */ -physfree: .long 0 /* phys addr of next free page */ - - .globl IdlePTD -IdlePTD: .long 0 /* phys addr of kernel PTD */ - -#if defined(PAE) || defined(PAE_TABLES) - .globl IdlePDPT -IdlePDPT: .long 0 /* phys addr of kernel PDPT */ -#endif - - .globl KPTmap -KPTmap: .long 0 /* address of kernel page tables */ - - .globl KPTphys -KPTphys: .long 0 /* phys addr of kernel page tables */ - - .globl proc0kstack -proc0kstack: .long 0 /* address of proc 0 kstack space */ -p0kpa: .long 0 /* phys addr of proc0's STACK */ - -vm86phystk: .long 0 /* PA of vm86/bios stack */ - - .globl vm86paddr, vm86pa -vm86paddr: .long 0 /* address of vm86 region */ -vm86pa: .long 0 /* phys addr of vm86 region */ - -/********************************************************************** - * - * Some handy macros - * - */ - -#define R(foo) ((foo)-KERNBASE) - -#define ALLOCPAGES(foo) \ - movl R(physfree), %esi ; \ - movl $((foo)*PAGE_SIZE), %eax ; \ - addl %esi, %eax ; \ - movl %eax, R(physfree) ; \ - movl %esi, %edi ; \ - movl $((foo)*PAGE_SIZE),%ecx ; \ - xorl %eax,%eax ; \ - cld ; \ - rep ; \ - stosb - -/* - * fillkpt - * eax = page frame address - * ebx = index into page table - * ecx = how many pages to map - * base = base address of page dir/table - * prot = protection bits - */ -#define fillkpt(base, prot) \ - shll $PTESHIFT,%ebx ; \ - addl base,%ebx ; \ - orl $PG_V,%eax ; \ - orl prot,%eax ; \ -1: movl %eax,(%ebx) ; \ - addl $PAGE_SIZE,%eax ; /* increment physical address */ \ - addl $PTESIZE,%ebx ; /* next pte */ \ - loop 1b - -/* - * fillkptphys(prot) - * eax = physical address - * ecx = how many pages to map - * prot = protection bits - */ -#define fillkptphys(prot) \ - movl %eax, %ebx ; \ - shrl $PAGE_SHIFT, %ebx ; \ - fillkpt(R(KPTphys), prot) - .text /********************************************************************** * @@ -180,6 +95,7 @@ movw $0x1234,0x472 /* Set up a real frame in case the double return in newboot is executed. */ + xorl %ebp,%ebp pushl %ebp movl %esp, %ebp @@ -205,8 +121,8 @@ * inactive from now until we switch to new ones, since we don't load any * more segment registers or permit interrupts until after the switch. */ - movl $R(end),%ecx - movl $R(edata),%edi + movl $end,%ecx + movl $edata,%edi subl %edi,%ecx xorl %eax,%eax cld @@ -221,48 +137,10 @@ * the old stack, but it need not be, since recover_bootinfo actually * returns via the old frame. */ - movl $R(tmpstk),%esp + movl $tmpstk,%esp call identify_cpu - call create_pagetables - -/* - * If the CPU has support for VME, turn it on. - */ - testl $CPUID_VME, R(cpu_feature) - jz 1f - movl %cr4, %eax - orl $CR4_VME, %eax - movl %eax, %cr4 -1: - -/* Now enable paging */ -#if defined(PAE) || defined(PAE_TABLES) - movl R(IdlePDPT), %eax - movl %eax, %cr3 - movl %cr4, %edx - orl $CR4_PAE, %edx - movl %edx, %cr4 -#else - movl R(IdlePTD), %eax - movl %eax,%cr3 /* load ptd addr into mmu */ -#endif - movl %cr0,%edx /* get control word */ - orl $CR0_PE|CR0_PG,%edx /* enable paging */ - movl %edx,%cr0 /* and let's page NOW! */ - - pushl $begin /* jump to high virtualized address */ - ret - -begin: - /* - * Now running relocated at KERNBASE where the system is linked to run. - * - * Remove the lowest part of the double mapping of low memory to get - * some null pointer checks. 
- */ - movl $0,PTD - movl %eax,%cr3 /* invalidate TLB */ + call pmap_cold /* set up bootstrap stack */ movl proc0kstack,%eax /* location of in-kernel stack */ @@ -376,7 +254,7 @@ cmpl $0,%esi je 2f /* No kernelname */ movl $MAXPATHLEN,%ecx /* Brute force!!! */ - movl $R(kernelname),%edi + movl $kernelname,%edi cmpb $'/',(%esi) /* Make sure it starts with a slash */ je 1f movb $'/',(%edi) @@ -404,7 +282,7 @@ * Copy the common part of the bootinfo struct */ movl %ebx,%esi - movl $R(bootinfo),%edi + movl $bootinfo,%edi cmpl $BOOTINFO_SIZE,%ecx jbe got_common_bi_size movl $BOOTINFO_SIZE,%ecx @@ -421,12 +299,12 @@ movl BI_NFS_DISKLESS(%ebx),%esi cmpl $0,%esi je olddiskboot - movl $R(nfs_diskless),%edi + movl $nfs_diskless,%edi movl $NFSDISKLESS_SIZE,%ecx cld rep movsb - movl $R(nfs_diskless_valid),%edi + movl $nfs_diskless_valid,%edi movl $1,(%edi) #endif #endif @@ -439,9 +317,9 @@ */ olddiskboot: movl 8(%ebp),%eax - movl %eax,R(boothowto) + movl %eax,boothowto movl 12(%ebp),%eax - movl %eax,R(bootdev) + movl %eax,bootdev ret @@ -479,16 +357,16 @@ divl %ecx jz trynexgen popfl - movl $CPU_386,R(cpu) + movl $CPU_386,cpu jmp 3f trynexgen: popfl - movl $CPU_NX586,R(cpu) - movl $0x4778654e,R(cpu_vendor) # store vendor string - movl $0x72446e65,R(cpu_vendor+4) - movl $0x6e657669,R(cpu_vendor+8) - movl $0,R(cpu_vendor+12) + movl $CPU_NX586,cpu + movl $0x4778654e,cpu_vendor # store vendor string + movl $0x72446e65,cpu_vendor+4 + movl $0x6e657669,cpu_vendor+8 + movl $0,cpu_vendor+12 jmp 3f try486: /* Try to toggle identification flag; does not exist on early 486s. */ @@ -507,7 +385,7 @@ testl %eax,%eax jnz trycpuid - movl $CPU_486,R(cpu) + movl $CPU_486,cpu /* * Check Cyrix CPU @@ -534,250 +412,46 @@ * CPU, we couldn't distinguish it from Cyrix's (including IBM * brand of Cyrix CPUs). */ - movl $0x69727943,R(cpu_vendor) # store vendor string - movl $0x736e4978,R(cpu_vendor+4) - movl $0x64616574,R(cpu_vendor+8) + movl $0x69727943,cpu_vendor # store vendor string + movl $0x736e4978,cpu_vendor+4 + movl $0x64616574,cpu_vendor+8 jmp 3f trycpuid: /* Use the `cpuid' instruction. */ xorl %eax,%eax cpuid # cpuid 0 - movl %eax,R(cpu_high) # highest capability - movl %ebx,R(cpu_vendor) # store vendor string - movl %edx,R(cpu_vendor+4) - movl %ecx,R(cpu_vendor+8) - movb $0,R(cpu_vendor+12) + movl %eax,cpu_high # highest capability + movl %ebx,cpu_vendor # store vendor string + movl %edx,cpu_vendor+4 + movl %ecx,cpu_vendor+8 + movb $0,cpu_vendor+12 movl $1,%eax cpuid # cpuid 1 - movl %eax,R(cpu_id) # store cpu_id - movl %ebx,R(cpu_procinfo) # store cpu_procinfo - movl %edx,R(cpu_feature) # store cpu_feature - movl %ecx,R(cpu_feature2) # store cpu_feature2 + movl %eax,cpu_id # store cpu_id + movl %ebx,cpu_procinfo # store cpu_procinfo + movl %edx,cpu_feature # store cpu_feature + movl %ecx,cpu_feature2 # store cpu_feature2 rorl $8,%eax # extract family type andl $15,%eax cmpl $5,%eax jae 1f /* less than Pentium; must be 486 */ - movl $CPU_486,R(cpu) + movl $CPU_486,cpu jmp 3f 1: /* a Pentium? */ cmpl $5,%eax jne 2f - movl $CPU_586,R(cpu) + movl $CPU_586,cpu jmp 3f 2: /* Greater than Pentium...call it a Pentium Pro */ - movl $CPU_686,R(cpu) + movl $CPU_686,cpu 3: ret - -/********************************************************************** - * - * Create the first page directory and its page tables. - * - */ - -create_pagetables: - -/* Find end of kernel image (rounded up to a page boundary). */ - movl $R(_end),%esi - -/* Include symbols, if any. 
*/ - movl R(bootinfo+BI_ESYMTAB),%edi - testl %edi,%edi - je over_symalloc - movl %edi,%esi - movl $KERNBASE,%edi - addl %edi,R(bootinfo+BI_SYMTAB) - addl %edi,R(bootinfo+BI_ESYMTAB) -over_symalloc: - -/* If we are told where the end of the kernel space is, believe it. */ - movl R(bootinfo+BI_KERNEND),%edi - testl %edi,%edi - je no_kernend - movl %edi,%esi -no_kernend: - - addl $PDRMASK,%esi /* Play conservative for now, and */ - andl $~PDRMASK,%esi /* ... round up to PDR boundary */ - movl %esi,R(KERNend) /* save end of kernel */ - movl %esi,R(physfree) /* next free page is at end of kernel */ - -/* Allocate Kernel Page Tables */ - ALLOCPAGES(NKPT) - movl %esi,R(KPTphys) - addl $(KERNBASE-(KPTDI<<(PDRSHIFT-PAGE_SHIFT+PTESHIFT))),%esi - movl %esi,R(KPTmap) - -/* Allocate Page Table Directory */ -#if defined(PAE) || defined(PAE_TABLES) - /* XXX only need 32 bytes (easier for now) */ - ALLOCPAGES(1) - movl %esi,R(IdlePDPT) -#endif - ALLOCPAGES(NPGPTD) - movl %esi,R(IdlePTD) - -/* Allocate KSTACK */ - ALLOCPAGES(TD0_KSTACK_PAGES) - movl %esi,R(p0kpa) - addl $KERNBASE, %esi - movl %esi, R(proc0kstack) - - ALLOCPAGES(1) /* vm86/bios stack */ - movl %esi,R(vm86phystk) - - ALLOCPAGES(3) /* pgtable + ext + IOPAGES */ - movl %esi,R(vm86pa) - addl $KERNBASE, %esi - movl %esi, R(vm86paddr) - -/* - * Enable PSE and PGE. - */ -#ifndef DISABLE_PSE - testl $CPUID_PSE, R(cpu_feature) - jz 1f - movl $PG_PS, R(pseflag) - movl %cr4, %eax - orl $CR4_PSE, %eax - movl %eax, %cr4 -1: -#endif -#ifndef DISABLE_PG_G - testl $CPUID_PGE, R(cpu_feature) - jz 2f - movl $PG_G, R(pgeflag) - movl %cr4, %eax - orl $CR4_PGE, %eax - movl %eax, %cr4 -2: -#endif - -/* - * Initialize page table pages mapping physical address zero through the - * (physical) end of the kernel. Many of these pages must be reserved, - * and we reserve them all and map them linearly for convenience. We do - * this even if we've enabled PSE above; we'll just switch the corresponding - * kernel PDEs before we turn on paging. - * - * XXX: We waste some pages here in the PSE case! - * - * This and all other page table entries allow read and write access for - * various reasons. Kernel mappings never have any access restrictions. - */ - xorl %eax, %eax - movl R(KERNend),%ecx - shrl $PAGE_SHIFT,%ecx - fillkptphys($PG_RW) - -/* Map page table pages. */ - movl R(KPTphys),%eax - movl $NKPT,%ecx - fillkptphys($PG_RW) - -/* Map page directory. */ -#if defined(PAE) || defined(PAE_TABLES) - movl R(IdlePDPT), %eax - movl $1, %ecx - fillkptphys($PG_RW) -#endif - - movl R(IdlePTD), %eax - movl $NPGPTD, %ecx - fillkptphys($PG_RW) - -/* Map proc0's KSTACK in the physical way ... */ - movl R(p0kpa), %eax - movl $(TD0_KSTACK_PAGES), %ecx - fillkptphys($PG_RW) - -/* Map ISA hole */ - movl $ISA_HOLE_START, %eax - movl $ISA_HOLE_LENGTH>>PAGE_SHIFT, %ecx - fillkptphys($PG_RW) - -/* Map space for the vm86 region */ - movl R(vm86phystk), %eax - movl $4, %ecx - fillkptphys($PG_RW) - -/* Map page 0 into the vm86 page table */ - movl $0, %eax - movl $0, %ebx - movl $1, %ecx - fillkpt(R(vm86pa), $PG_RW|PG_U) - -/* ...likewise for the ISA hole */ - movl $ISA_HOLE_START, %eax - movl $ISA_HOLE_START>>PAGE_SHIFT, %ebx - movl $ISA_HOLE_LENGTH>>PAGE_SHIFT, %ecx - fillkpt(R(vm86pa), $PG_RW|PG_U) - -/* - * Create an identity mapping for low physical memory, including the kernel. - * This is only used to map the 2 instructions for jumping to 'begin' in - * locore (we map everything to avoid having to determine where these - * instructions are). 
ACPI resume will transiently restore the first PDE in - * this mapping (and depend on this PDE's page table created here not being - * destroyed). See pmap_bootstrap() for more details. - * - * Note: There are errata concerning large pages and physical address zero, - * so a PG_PS mapping should not be used for PDE 0. Our double mapping - * avoids this automatically by not using PG_PS for PDE #KPDI so that PAT - * bits can be set at the page level for i/o pages below 1 MB. - */ - movl R(KPTphys), %eax - xorl %ebx, %ebx - movl $NKPT, %ecx - fillkpt(R(IdlePTD), $PG_RW) - -/* - * Install PDEs for PTs covering enough kva to bootstrap. Then for the PSE - * case, replace the PDEs whose coverage is strictly within the kernel - * (between KERNLOAD (rounded up) and KERNend) by large-page PDEs. - */ - movl R(KPTphys), %eax - movl $KPTDI, %ebx - movl $NKPT, %ecx - fillkpt(R(IdlePTD), $PG_RW) - cmpl $0,R(pseflag) - je done_pde - - movl R(KERNend), %ecx - movl $(KERNLOAD + PDRMASK) & ~PDRMASK, %eax - subl %eax, %ecx - shrl $PDRSHIFT, %ecx - movl $KPTDI + ((KERNLOAD + PDRMASK) >> PDRSHIFT), %ebx - shll $PDESHIFT, %ebx - addl R(IdlePTD), %ebx - orl $(PG_V|PG_RW|PG_PS), %eax -1: movl %eax, (%ebx) - addl $(1 << PDRSHIFT), %eax - addl $PDESIZE, %ebx - loop 1b - -done_pde: -/* install a pde recursively mapping page directory as a page table */ - movl R(IdlePTD), %eax - movl $PTDPTDI, %ebx - movl $NPGPTD,%ecx - fillkpt(R(IdlePTD), $PG_RW) - -#if defined(PAE) || defined(PAE_TABLES) - movl R(IdlePTD), %eax - xorl %ebx, %ebx - movl $NPGPTD, %ecx - fillkpt(R(IdlePDPT), $0x0) -#endif - - ret - #ifdef XENHVM /* Xen Hypercall page */ .text Index: sys/i386/i386/machdep.c =================================================================== --- sys/i386/i386/machdep.c +++ sys/i386/i386/machdep.c @@ -1,6 +1,7 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * + * Copyright (c) 2018 The FreeBSD Foundation * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. @@ -8,6 +9,9 @@ * This code is derived from software contributed to Berkeley by * William Jolitz. * + * Portions of this software were developed by A. Joseph Koshy under + * sponsorship from the FreeBSD Foundation and Google, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -82,9 +86,7 @@ #include #include #include -#ifdef SMP #include -#endif #include #include #include @@ -129,6 +131,7 @@ #include #include #include +#include #include #include #ifdef PERFMON @@ -152,8 +155,8 @@ /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); -extern register_t init386(int first); -extern void dblfault_handler(void); +register_t init386(int first); +void dblfault_handler(void); static void cpu_startup(void *); static void fpstate_drop(struct thread *td); @@ -210,14 +213,18 @@ struct mem_range_softc mem_range_softc; - /* Default init_ops implementation. */ - struct init_ops init_ops = { +extern char start_exceptions[], end_exceptions[]; + +extern struct sysentvec elf32_freebsd_sysvec; + +/* Default init_ops implementation. 
*/ +struct init_ops init_ops = { .early_clock_source_init = i8254_init, .early_delay = i8254_delay, #ifdef DEV_APIC .msi_init = msi_init, #endif - }; +}; static void cpu_startup(dummy) @@ -1098,24 +1105,59 @@ return (EJUSTRETURN); } +#ifdef COMPAT_43 +static void +setup_priv_lcall_gate(struct proc *p) +{ + struct i386_ldt_args uap; + union descriptor desc; + u_int lcall_addr; + + bzero(&uap, sizeof(uap)); + uap.start = 0; + uap.num = 1; + lcall_addr = p->p_sysent->sv_psstrings - sz_lcall_tramp; + bzero(&desc, sizeof(desc)); + desc.sd.sd_type = SDT_MEMERA; + desc.sd.sd_dpl = SEL_UPL; + desc.sd.sd_p = 1; + desc.sd.sd_def32 = 1; + desc.sd.sd_gran = 1; + desc.sd.sd_lolimit = 0xffff; + desc.sd.sd_hilimit = 0xf; + desc.sd.sd_lobase = lcall_addr; + desc.sd.sd_hibase = lcall_addr >> 24; + i386_set_ldt(curthread, &uap, &desc); +} +#endif + /* * Reset registers to default values on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { - struct trapframe *regs = td->td_frame; - struct pcb *pcb = td->td_pcb; + struct trapframe *regs; + struct pcb *pcb; + + regs = td->td_frame; + pcb = td->td_pcb; /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ pcb->pcb_gs = _udatasel; load_gs(_udatasel); mtx_lock_spin(&dt_lock); - if (td->td_proc->p_md.md_ldt) + if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); else mtx_unlock_spin(&dt_lock); + +#ifdef COMPAT_43 + if (td->td_proc->p_sysent->sv_psstrings != + elf32_freebsd_sysvec.sv_psstrings) + setup_priv_lcall_gate(td->td_proc); +#endif /* * Reset the fs and gs bases. The values from the old address @@ -1217,18 +1259,22 @@ int _default_ldt; -union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ -union descriptor ldt[NLDT]; /* local descriptor table */ +struct mtx dt_lock; /* lock for GDT and LDT */ + +union descriptor gdt0[NGDT]; /* initial global descriptor table */ +union descriptor *gdt = gdt0; /* global descriptor table */ + +union descriptor *ldt; /* local descriptor table */ + static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ -struct region_descriptor r_gdt, r_idt; /* table descriptors */ -struct mtx dt_lock; /* lock for GDT and LDT */ -static struct i386tss dblfault_tss; -static char dblfault_stack[PAGE_SIZE]; +static struct i386tss *dblfault_tss; +static char *dblfault_stack; -extern vm_offset_t proc0kstack; +static struct i386tss common_tss0; +vm_offset_t proc0kstack; /* * software prototypes -- in more palatable form. @@ -1329,8 +1375,8 @@ .ssd_def32 = 0, .ssd_gran = 0 }, /* GLDT_SEL 10 LDT Descriptor */ -{ .ssd_base = (int) ldt, - .ssd_limit = sizeof(ldt)-1, +{ .ssd_base = 0, + .ssd_limit = sizeof(union descriptor) * NLDT - 1, .ssd_type = SDT_SYSLDT, .ssd_dpl = SEL_UPL, .ssd_p = 1, @@ -1338,7 +1384,7 @@ .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 User LDT Descriptor per process */ -{ .ssd_base = (int) ldt, +{ .ssd_base = 0, .ssd_limit = (512 * sizeof(union descriptor)-1), .ssd_type = SDT_SYSLDT, .ssd_dpl = 0, @@ -1347,7 +1393,7 @@ .ssd_def32 = 0, .ssd_gran = 0 }, /* GPANIC_SEL 12 Panic Tss Descriptor */ -{ .ssd_base = (int) &dblfault_tss, +{ .ssd_base = 0, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, @@ -1468,25 +1514,31 @@ .ssd_gran = 1 }, }; +uintptr_t setidt_disp; + void -setidt(idx, func, typ, dpl, selec) - int idx; - inthand_t *func; - int typ; - int dpl; - int selec; +setidt(int idx, inthand_t *func, int typ, int dpl, int selec) +{ + uintptr_t off; + + off = func != NULL ? 
(uintptr_t)func + setidt_disp : 0; + setidt_nodisp(idx, off, typ, dpl, selec); +} + +void +setidt_nodisp(int idx, uintptr_t off, int typ, int dpl, int selec) { struct gate_descriptor *ip; ip = idt + idx; - ip->gd_looffset = (int)func; + ip->gd_looffset = off; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; - ip->gd_hioffset = ((int)func)>>16 ; + ip->gd_hioffset = ((u_int)off) >> 16 ; } extern inthand_t @@ -1501,7 +1553,7 @@ #ifdef XENHVM IDTVEC(xen_intr_upcall), #endif - IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); + IDTVEC(int0x80_syscall); #ifdef DDB /* @@ -1512,15 +1564,29 @@ { struct gate_descriptor *ip; int idx; - uintptr_t func; + uintptr_t func, func_trm; + bool trm; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { - func = (ip->gd_hioffset << 16 | ip->gd_looffset); - if (func != (uintptr_t)&IDTVEC(rsvd)) { - db_printf("%3d\t", idx); - db_printsym(func, DB_STGY_PROC); - db_printf("\n"); + if (ip->gd_type == SDT_SYSTASKGT) { + db_printf("%3d\t\n", idx); + } else { + func = (ip->gd_hioffset << 16 | ip->gd_looffset); + if (func >= PMAP_TRM_MIN_ADDRESS) { + func_trm = func; + func -= setidt_disp; + trm = true; + } else + trm = false; + if (func != (uintptr_t)&IDTVEC(rsvd)) { + db_printf("%3d\t", idx); + db_printsym(func, DB_STGY_PROC); + if (trm) + db_printf(" (trampoline %#x)", + func_trm); + db_printf("\n"); + } } ip++; } @@ -1693,7 +1759,6 @@ static void basemem_setup(void) { - vm_paddr_t pa; pt_entry_t *pte; int i; @@ -1703,30 +1768,6 @@ basemem = 640; } - /* - * XXX if biosbasemem is now < 640, there is a `hole' - * between the end of base memory and the start of - * ISA memory. The hole may be empty or it may - * contain BIOS code or data. Map it read/write so - * that the BIOS can write to it. (Memory from 0 to - * the physical end of the kernel is mapped read-only - * to begin with and then parts of it are remapped. - * The parts that aren't remapped form holes that - * remain read-only and are unused by the kernel. - * The base memory area is below the physical end of - * the kernel and right now forms a read-only hole. - * The part of it from PAGE_SIZE to - * (trunc_page(biosbasemem * 1024) - 1) will be - * remapped and used by the kernel later.) - * - * This code is similar to the code used in - * pmap_mapdev, but since no memory needs to be - * allocated we simply change the mapping. - */ - for (pa = trunc_page(basemem * 1024); - pa < ISA_HOLE_START; pa += PAGE_SIZE) - pmap_kenter(KERNBASE + pa, pa); - /* * Map pages between basemem and ISA_HOLE_START, if any, r/w into * the vm86 page table so that vm86 can scribble on them using @@ -1807,9 +1848,9 @@ * the kernel page table so we can use it as a buffer. The * kernel will unmap this page later. 
*/ - pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT); + pmap_kenter(1 << PAGE_SHIFT, 1 << PAGE_SHIFT); vmc.npages = 0; - smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); + smap = (void *)vm86_addpage(&vmc, 1, 1 << PAGE_SHIFT); res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); KASSERT(res != 0, ("vm86_getptr() failed: address not found")); @@ -2130,13 +2171,119 @@ #endif } +static void +fixup_idt(void) +{ + struct gate_descriptor *ip; + uintptr_t off; + int x; + + for (x = 0; x < NIDT; x++) { + ip = &idt[x]; + if (ip->gd_type != SDT_SYS386IGT && + ip->gd_type != SDT_SYS386TGT) + continue; + off = ip->gd_looffset + (((u_int)ip->gd_hioffset) << 16); + KASSERT(off >= (uintptr_t)start_exceptions && + off < (uintptr_t)end_exceptions, + ("IDT[%d] type %d off %#x", x, ip->gd_type, off)); + off += setidt_disp; + MPASS(off >= PMAP_TRM_MIN_ADDRESS && + off < PMAP_TRM_MAX_ADDRESS); + ip->gd_looffset = off; + ip->gd_hioffset = off >> 16; + } +} + +static void +i386_setidt1(void) +{ + int x; + + /* exceptions */ + for (x = 0; x < NIDT; x++) + setidt(x, &IDTVEC(rsvd), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_DE, &IDTVEC(div), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386IGT, SEL_UPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, + SEL_KPL)); + setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386IGT, + SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_AC, &IDTVEC(align), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), + SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); +#ifdef KDTRACE_HOOKS + setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), + SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif +#ifdef XENHVM + setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif +} + +static void +i386_setidt2(void) +{ + + setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL, + GSEL(GCODE_SEL, SEL_KPL)); +} + +#if defined(DEV_ISA) && !defined(DEV_ATPIC) +static void +i386_setidt3(void) +{ + + setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + setidt(IDT_IO_INTS + 
15, IDTVEC(spuriousint), + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +} +#endif + register_t init386(int first) { - struct gate_descriptor *gdp; + struct region_descriptor r_gdt, r_idt; /* table descriptors */ int gsel_tss, metadata_missing, x, pa; struct pcpu *pc; struct xstate_hdr *xhdr; + vm_offset_t addend; int late_console; thread0.td_kstack = proc0kstack; @@ -2148,18 +2295,23 @@ */ proc_linkup0(&proc0, &thread0); - metadata_missing = 0; if (bootinfo.bi_modulep) { - preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; - preload_bootstrap_relocate(KERNBASE); + metadata_missing = 0; + addend = (vm_paddr_t)bootinfo.bi_modulep < KERNBASE ? + PMAP_MAP_LOW : 0; + preload_metadata = (caddr_t)bootinfo.bi_modulep + addend; + preload_bootstrap_relocate(addend); } else { metadata_missing = 1; } - if (bootinfo.bi_envp != 0) - init_static_kenv((char *)bootinfo.bi_envp + KERNBASE, 0); - else + if (bootinfo.bi_envp != 0) { + addend = (vm_paddr_t)bootinfo.bi_envp < KERNBASE ? + PMAP_MAP_LOW : 0; + init_static_kenv((char *)bootinfo.bi_envp + addend, 0); + } else { init_static_kenv(NULL, 0); + } identify_hypervisor(); @@ -2179,8 +2331,8 @@ pc = &__pcpu[0]; gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); - gdt_segs[GPRIV_SEL].ssd_base = (int) pc; - gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; + gdt_segs[GPRIV_SEL].ssd_base = (int)pc; + gdt_segs[GPROC0_SEL].ssd_base = (int)&common_tss0; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); @@ -2192,8 +2344,8 @@ pcpu_init(pc, 0, sizeof(struct pcpu)); for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) - pmap_kenter(pa + KERNBASE, pa); - dpcpu_init((void *)(first + KERNBASE), 0); + pmap_kenter(pa, pa); + dpcpu_init((void *)first, 0); first += DPCPU_SIZE; PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); @@ -2210,67 +2362,7 @@ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); - /* make ldt memory segments */ - ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); - ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); - for (x = 0; x < nitems(ldt_segs); x++) - ssdtosd(&ldt_segs[x], &ldt[x].sd); - - _default_ldt = GSEL(GLDT_SEL, SEL_KPL); - lldt(_default_ldt); - PCPU_SET(currentldt, _default_ldt); - - /* exceptions */ - for (x = 0; x < NIDT; x++) - setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL - , GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); - setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - 
setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, - GSEL(GCODE_SEL, SEL_KPL)); -#ifdef KDTRACE_HOOKS - setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL, - GSEL(GCODE_SEL, SEL_KPL)); -#endif -#ifdef XENHVM - setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); -#endif + i386_setidt1(); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; @@ -2283,41 +2375,21 @@ clock_init(); finishidentcpu(); /* Final stage of CPU initialization */ - setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); + i386_setidt2(); initializecpu(); /* Initialize CPU registers */ initializecpucache(); /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); - dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = - dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; - dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = - dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); -#if defined(PAE) || defined(PAE_TABLES) - dblfault_tss.tss_cr3 = (int)IdlePDPT; -#else - dblfault_tss.tss_cr3 = (int)IdlePTD; -#endif - dblfault_tss.tss_eip = (int)dblfault_handler; - dblfault_tss.tss_eflags = PSL_KERNEL; - dblfault_tss.tss_ds = dblfault_tss.tss_es = - dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); - dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); - dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); - dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); - /* Initialize the tss (except for the final esp0) early for vm86. */ - PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + - thread0.td_kstack_pages * PAGE_SIZE - 16); - PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); + common_tss0.tss_esp0 = thread0.td_kstack + thread0.td_kstack_pages * + PAGE_SIZE - VM86_STACK_SPACE; + common_tss0.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); + common_tss0.tss_ioopt = sizeof(struct i386tss) << 16; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); - PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); ltr(gsel_tss); /* Initialize the PIC early for vm86 calls. */ @@ -2333,10 +2405,7 @@ * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ - setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); - setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, - GSEL(GCODE_SEL, SEL_KPL)); + i386_setidt3(); #endif #endif @@ -2386,22 +2455,10 @@ PCPU_SET(curpcb, thread0.td_pcb); /* Move esp0 in the tss to its final place. 
*/ /* Note: -16 is so we can grow the trapframe if we came from vm86 */ - PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16); + common_tss0.tss_esp0 = (vm_offset_t)thread0.td_pcb - VM86_STACK_SPACE; gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */ ltr(gsel_tss); - /* make a call gate to reenter kernel with */ - gdp = &ldt[LSYS5CALLS_SEL].gd; - - x = (int) &IDTVEC(lcall_syscall); - gdp->gd_looffset = x; - gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); - gdp->gd_stkcpy = 1; - gdp->gd_type = SDT_SYS386CGT; - gdp->gd_dpl = SEL_UPL; - gdp->gd_p = 1; - gdp->gd_hioffset = x >> 16; - /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); @@ -2427,6 +2484,126 @@ return ((register_t)thread0.td_pcb); } +extern u_int tramp_idleptd; + +static void +machdep_init_trampoline(void) +{ + struct region_descriptor r_gdt, r_idt; + struct i386tss *tss; + char *trampoline, *tramp_stack_base; + u_int *tramp_idleptd_reloced; + int x; + + gdt = pmap_trm_alloc(sizeof(union descriptor) * NGDT * mp_ncpus, + M_NOWAIT | M_ZERO); + bcopy(gdt0, gdt, sizeof(union descriptor) * NGDT); + r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; + r_gdt.rd_base = (int)gdt; + lgdt(&r_gdt); + + tss = pmap_trm_alloc(sizeof(struct i386tss) * mp_ncpus, + M_NOWAIT | M_ZERO); + bcopy(&common_tss0, tss, sizeof(struct i386tss)); + gdt[GPROC0_SEL].sd.sd_lobase = (int)tss; + gdt[GPROC0_SEL].sd.sd_hibase = (u_int)tss >> 24; + gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; + ltr(GSEL(GPROC0_SEL, SEL_KPL)); + + PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); + PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); + PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); + PCPU_SET(common_tssp, tss); + + trampoline = pmap_trm_alloc(end_exceptions - start_exceptions, + M_NOWAIT); + bcopy(start_exceptions, trampoline, end_exceptions - start_exceptions); + tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT); + PCPU_SET(trampstk, (uintptr_t)tramp_stack_base - VM86_STACK_SPACE); + tss[0].tss_esp0 = PCPU_GET(trampstk); + + idt = pmap_trm_alloc(sizeof(idt0), M_NOWAIT | M_ZERO); + bcopy(idt0, idt, sizeof(idt0)); + + /* Re-initialize new IDT since the handlers were relocated */ + setidt_disp = trampoline - start_exceptions; + fixup_idt(); + + tramp_idleptd_reloced = (u_int *)((uintptr_t)&tramp_idleptd + + setidt_disp); +#if defined(PAE) || defined(PAE_TABLES) + *tramp_idleptd_reloced = (u_int)IdlePDPT; +#else + *tramp_idleptd_reloced = (u_int)IdlePTD; +#endif + + r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1; + r_idt.rd_base = (int)idt; + lidt(&r_idt); + + /* dblfault TSS */ + dblfault_tss = pmap_trm_alloc(sizeof(struct i386tss), M_NOWAIT | M_ZERO); + dblfault_stack = pmap_trm_alloc(PAGE_SIZE, M_NOWAIT); + dblfault_tss->tss_esp = dblfault_tss->tss_esp0 = + dblfault_tss->tss_esp1 = dblfault_tss->tss_esp2 = + (int)dblfault_stack; + dblfault_tss->tss_ss = dblfault_tss->tss_ss0 = dblfault_tss->tss_ss1 = + dblfault_tss->tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); +#if defined(PAE) || defined(PAE_TABLES) + dblfault_tss->tss_cr3 = (int)IdlePDPT; +#else + dblfault_tss->tss_cr3 = (int)IdlePTD; +#endif + dblfault_tss->tss_eip = (int)dblfault_handler + setidt_disp; + dblfault_tss->tss_eflags = PSL_KERNEL; + dblfault_tss->tss_ds = dblfault_tss->tss_es = + dblfault_tss->tss_gs = GSEL(GDATA_SEL, SEL_KPL); + dblfault_tss->tss_fs = GSEL(GPRIV_SEL, SEL_KPL); + dblfault_tss->tss_cs = GSEL(GCODE_SEL, SEL_KPL); + dblfault_tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL); + gdt[GPANIC_SEL].sd.sd_lobase = (int)dblfault_tss; + gdt[GPANIC_SEL].sd.sd_hibase = (u_int)dblfault_tss >> 24; 
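All of the descriptor tables above (the relocated GDT, TSS, IDT, the double-fault TSS and its stack, and the LDT set up just below) are carved out of the trampoline region through the pmap_trm_alloc()/pmap_trm_free() interface that this change introduces. The following is a minimal usage sketch, not part of the patch itself: the table type and wrapper names are placeholders, and only pmap_trm_alloc(), pmap_trm_free() and the M_WAITOK/M_NOWAIT/M_ZERO flags are taken from the change. Allocations come from the PMAP_TRM_MIN_ADDRESS..PMAP_TRM_MAX_ADDRESS window, so the returned pointers remain mapped while the trampoline page table is active.

/*
 * Illustrative sketch only, not part of the patch.  "struct foo_table"
 * and the wrapper names are hypothetical; pmap_trm_alloc() and
 * pmap_trm_free() are the interfaces this change adds for memory that
 * must stay reachable from the trampoline mapping.
 */
struct foo_table {
	uint32_t	ent[16];
};

static struct foo_table *
foo_table_alloc(void)
{

	/* M_WAITOK may sleep; early-boot callers in this patch pass M_NOWAIT. */
	return (pmap_trm_alloc(sizeof(struct foo_table), M_WAITOK | M_ZERO));
}

static void
foo_table_free(struct foo_table *t)
{

	pmap_trm_free(t, sizeof(struct foo_table));
}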
+ + /* make ldt memory segments */ + ldt = pmap_trm_alloc(sizeof(union descriptor) * NLDT, + M_NOWAIT | M_ZERO); + gdt[GLDT_SEL].sd.sd_lobase = (int)ldt; + gdt[GLDT_SEL].sd.sd_hibase = (u_int)ldt >> 24; + ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); + ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); + for (x = 0; x < nitems(ldt_segs); x++) + ssdtosd(&ldt_segs[x], &ldt[x].sd); + + _default_ldt = GSEL(GLDT_SEL, SEL_KPL); + lldt(_default_ldt); + PCPU_SET(currentldt, _default_ldt); +} +SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_SECOND, machdep_init_trampoline, NULL); + +static void +i386_setup_lcall_gate(void) +{ + struct sysentvec *sv; + struct user_segment_descriptor desc; + u_int lcall_addr; + + sv = &elf32_freebsd_sysvec; + lcall_addr = (uintptr_t)sv->sv_psstrings - sz_lcall_tramp; + + bzero(&desc, sizeof(desc)); + desc.sd_type = SDT_MEMERA; + desc.sd_dpl = SEL_UPL; + desc.sd_p = 1; + desc.sd_def32 = 1; + desc.sd_gran = 1; + desc.sd_lolimit = 0xffff; + desc.sd_hilimit = 0xf; + desc.sd_lobase = lcall_addr; + desc.sd_hibase = lcall_addr >> 24; + bcopy(&desc, &ldt[LSYS5CALLS_SEL], sizeof(desc)); +} +SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_ANY, i386_setup_lcall_gate, NULL); + void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { @@ -2507,6 +2684,7 @@ static void f00f_hack(void *unused) { + struct region_descriptor r_idt; struct gate_descriptor *new_idt; vm_offset_t tmp; @@ -2517,16 +2695,19 @@ printf("Intel Pentium detected, installing workaround for F00F bug\n"); - tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO); + tmp = (vm_offset_t)pmap_trm_alloc(PAGE_SIZE * 3, M_NOWAIT | M_ZERO); if (tmp == 0) panic("kmem_malloc returned 0"); + tmp = round_page(tmp); /* Put the problematic entry (#6) at the end of the lower page. */ - new_idt = (struct gate_descriptor*) + new_idt = (struct gate_descriptor *) (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (u_int)new_idt; + r_idt.rd_limit = sizeof(idt0) - 1; lidt(&r_idt); + /* SMP machines do not need the F00F hack. */ idt = new_idt; pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ); } Index: sys/i386/i386/mem.c =================================================================== --- sys/i386/i386/mem.c +++ sys/i386/i386/mem.c @@ -92,9 +92,6 @@ return EIO; if (dev2unit(dev) == CDEV_MINOR_KMEM && uio->uio_resid > 0) { - if (uio->uio_offset < (vm_offset_t)VADDR(PTDPTDI, 0)) - return (EFAULT); - if (!kernacc((caddr_t)(int)uio->uio_offset, uio->uio_resid, uio->uio_rw == UIO_READ ? VM_PROT_READ : VM_PROT_WRITE)) return (EFAULT); Index: sys/i386/i386/minidump_machdep.c =================================================================== --- sys/i386/i386/minidump_machdep.c +++ sys/i386/i386/minidump_machdep.c @@ -190,7 +190,7 @@ * page written corresponds to 2MB of space */ ptesize += PAGE_SIZE; - pd = (pd_entry_t *)((uintptr_t)IdlePTD + KERNBASE); /* always mapped! */ + pd = IdlePTD; /* always mapped! */ j = va >> PDRSHIFT; if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V)) { /* This is an entire 2M page. */ @@ -281,7 +281,7 @@ /* Dump kernel page table pages */ for (va = KERNBASE; va < kernel_vm_end; va += NBPDR) { /* We always write a page, even if it is zero */ - pd = (pd_entry_t *)((uintptr_t)IdlePTD + KERNBASE); /* always mapped! */ + pd = IdlePTD; /* always mapped! */ j = va >> PDRSHIFT; if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V)) { /* This is a single 2M block. 
Generate a fake PTP */ Index: sys/i386/i386/mp_machdep.c =================================================================== --- sys/i386/i386/mp_machdep.c +++ sys/i386/i386/mp_machdep.c @@ -83,8 +83,8 @@ #include #define WARMBOOT_TARGET 0 -#define WARMBOOT_OFF (KERNBASE + 0x0467) -#define WARMBOOT_SEG (KERNBASE + 0x0469) +#define WARMBOOT_OFF (PMAP_MAP_LOW + 0x0467) +#define WARMBOOT_SEG (PMAP_MAP_LOW + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) @@ -140,6 +140,7 @@ static int start_ap(int apic_id); static u_int boot_address; +static char *ap_tramp_stack_base; /* * Calculate usable address in base memory for AP trampoline code. @@ -223,10 +224,10 @@ init_secondary(void) { struct pcpu *pc; - vm_offset_t addr; - int gsel_tss; - int x, myid; - u_int cr0; + struct i386tss *common_tssp; + struct region_descriptor r_gdt, r_idt; + int gsel_tss, myid, x; + u_int cr0; /* bootAP is set in start_ap() to our ID. */ myid = bootAP; @@ -240,11 +241,13 @@ pc->pc_apic_id = cpu_apic_ids[myid]; pc->pc_prvspace = pc; pc->pc_curthread = 0; + pc->pc_common_tssp = common_tssp = &(__pcpu[0].pc_common_tssp)[myid]; fix_cpuid(); - gdt_segs[GPRIV_SEL].ssd_base = (int) pc; - gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; + gdt_segs[GPRIV_SEL].ssd_base = (int)pc; + gdt_segs[GPROC0_SEL].ssd_base = (int)common_tssp; + gdt_segs[GLDT_SEL].ssd_base = (int)ldt; for (x = 0; x < NGDT; x++) { ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd); @@ -254,16 +257,20 @@ r_gdt.rd_base = (int) &gdt[myid * NGDT]; lgdt(&r_gdt); /* does magic intra-segment return */ + r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1; + r_idt.rd_base = (int)idt; lidt(&r_idt); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); + PCPU_SET(trampstk, (uintptr_t)ap_tramp_stack_base - VM86_STACK_SPACE); + gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; - PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ - PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); - PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); + common_tssp->tss_esp0 = PCPU_GET(trampstk); + common_tssp->tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); + common_tssp->tss_ioopt = sizeof(struct i386tss) << 16; PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); ltr(gsel_tss); @@ -290,8 +297,6 @@ /* BSP may have changed PTD while we were waiting */ invltlb(); - for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) - invlpg(addr); #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); @@ -303,17 +308,20 @@ /* * start each AP in our list */ -/* Lowest 1MB is already mapped: don't touch*/ #define TMPMAP_START 1 static int start_all_aps(void) { u_char mpbiosreason; u_int32_t mpbioswarmvec; - int apic_id, cpu, i; + int apic_id, cpu; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); + /* Remap lowest 1MB */ + IdlePTD[0] = IdlePTD[1]; + load_cr3(rcr3()); /* invalidate TLB */ + /* install the AP 1st level boot code */ install_ap_tramp(); @@ -322,11 +330,7 @@ outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); - /* set up temporary P==V mapping for AP boot */ - /* XXX this is a hack, we should boot the AP on its own stack/PTD */ - for (i = TMPMAP_START; i < NKPT; i++) - PTD[i] = PTD[KPTDI + i]; - invltlb(); + /* take advantage of the P==V mapping for PTD[0] for AP boot */ /* start each AP */ for (cpu = 1; cpu < mp_ncpus; cpu++) { @@ -348,6 +352,8 @@ PAGE_SIZE - 4; bootAP = cpu; + ap_tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, 
M_NOWAIT); + /* attempt to start the Application Processor */ CHECK_INIT(99); /* setup checkpoints */ if (!start_ap(apic_id)) { @@ -363,17 +369,16 @@ CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ } + /* Unmap lowest 1MB again */ + IdlePTD[0] = 0; + load_cr3(rcr3()); + /* restore the warmstart vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); - /* Undo V==P hack from above */ - for (i = TMPMAP_START; i < NKPT; i++) - PTD[i] = 0; - pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); - /* number of APs actually started */ return mp_naps; } @@ -395,7 +400,7 @@ { int x; int size = *(int *) ((u_long) & bootMP_size); - vm_offset_t va = boot_address + KERNBASE; + vm_offset_t va = boot_address; u_char *src = (u_char *) ((u_long) bootMP); u_char *dst = (u_char *) va; u_int boot_base = (u_int) bootMP; @@ -425,7 +430,7 @@ /* modify the ljmp target for MPentry() */ dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); - *dst32 = ((u_int) MPentry - KERNBASE); + *dst32 = (u_int)MPentry; /* modify the target for boot code segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); Index: sys/i386/i386/mpboot.s =================================================================== --- sys/i386/i386/mpboot.s +++ sys/i386/i386/mpboot.s @@ -37,8 +37,6 @@ #include "assym.s" -#define R(x) ((x)-KERNBASE) - /* * this code MUST be enabled here and in mp_machdep.c * it follows the very early stages of AP boot by placing values in CMOS ram. @@ -80,18 +78,14 @@ movl $1,%eax cpuid /* Retrieve features */ movl %cr4,%eax -#ifndef DISABLE_PSE testl $CPUID_PSE,%edx jz 1f orl $CR4_PSE,%eax /* Enable PSE */ 1: -#endif -#ifndef DISABLE_PG_G testl $CPUID_PGE,%edx jz 1f orl $CR4_PGE,%eax /* Enable PGE */ 1: -#endif testl $CPUID_VME,%edx jz 1f orl $CR4_VME,%eax /* Enable VME */ @@ -100,13 +94,13 @@ /* Now enable paging mode */ #if defined(PAE) || defined(PAE_TABLES) - movl R(IdlePDPT), %eax + movl IdlePDPT, %eax movl %eax, %cr3 movl %cr4, %eax orl $CR4_PAE, %eax movl %eax, %cr4 #else - movl R(IdlePTD), %eax + movl IdlePTD, %eax movl %eax,%cr3 #endif movl %cr0,%eax Index: sys/i386/i386/pmap.c =================================================================== --- sys/i386/i386/pmap.c +++ sys/i386/i386/pmap.c @@ -47,6 +47,8 @@ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. + * Copyright (c) 2018 The FreeBSD Foundation + * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the @@ -54,6 +56,10 @@ * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * + * Portions of this software were developed by + * Konstantin Belousov under sponsorship from + * the FreeBSD Foundation. 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -121,6 +127,7 @@ #include #include #include +#include #include #include @@ -141,6 +148,7 @@ #include #include #endif +#include #include #include #include @@ -190,9 +198,6 @@ #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) struct pmap kernel_pmap_store; -LIST_HEAD(pmaplist, pmap); -static struct pmaplist allpmaps; -static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ @@ -200,9 +205,7 @@ int pseflag = 0; /* PG_PS or-in */ static int nkpt = NKPT; -vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; -extern u_int32_t KERNend; -extern u_int32_t KPTphys; +vm_offset_t kernel_vm_end = /* 0 + */ NKPT * NBPDR; #if defined(PAE) || defined(PAE_TABLES) pt_entry_t pg_nx; @@ -343,29 +346,204 @@ static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait); #endif -static void pmap_set_pg(void); +static void pmap_init_trm(void); static __inline void pagezero(void *page); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); +void pmap_cold(void); +extern char _end[]; +u_long physfree; /* phys addr of next free page */ +u_long vm86phystk; /* PA of vm86/bios stack */ +u_long vm86paddr; /* address of vm86 region */ +int vm86pa; /* phys addr of vm86 region */ +u_long KERNend; /* phys addr end of kernel (just after bss) */ +pd_entry_t *IdlePTD; /* phys addr of kernel PTD */ +#if defined(PAE) || defined(PAE_TABLES) +pdpt_entry_t *IdlePDPT; /* phys addr of kernel PDPT */ +#endif +pt_entry_t *KPTmap; /* address of kernel page tables */ +u_long KPTphys; /* phys addr of kernel page tables */ + +static u_long +allocpages(u_int cnt, u_long *physfree) +{ + u_long res; + + res = *physfree; + *physfree += PAGE_SIZE * cnt; + bzero((void *)res, PAGE_SIZE * cnt); + return (res); +} + +static void +pmap_cold_map(u_long pa, u_long va, u_long cnt) +{ + pt_entry_t *pt; + + for (pt = (pt_entry_t *)KPTphys + atop(va); cnt > 0; + cnt--, pt++, va += PAGE_SIZE, pa += PAGE_SIZE) + *pt = pa | PG_V | PG_RW | PG_A | PG_M; +} + +static void +pmap_cold_mapident(u_long pa, u_long cnt) +{ + + pmap_cold_map(pa, pa, cnt); +} + +_Static_assert(2 * NBPDR == KERNBASE, "Broken double-map of zero PTD"); + /* - * If you get an error here, then you set KVA_PAGES wrong! See the - * description of KVA_PAGES in sys/i386/include/pmap.h. It must be - * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. + * Called from locore.s before paging is enabled. Sets up the first + * kernel page table. Since kernel is mapped with PA == VA, this code + * does not require relocations. 
*/ -CTASSERT(KERNBASE % (1 << 24) == 0); +void +pmap_cold(void) +{ + pt_entry_t *pt; + u_long a; + u_int cr3, ncr4; + + physfree = (u_long)&_end; + if (bootinfo.bi_esymtab != 0) + physfree = bootinfo.bi_esymtab; + if (bootinfo.bi_kernend != 0) + physfree = bootinfo.bi_kernend; + physfree = roundup2(physfree, NBPDR); + KERNend = physfree; + + /* Allocate Kernel Page Tables */ + KPTphys = allocpages(NKPT, &physfree); + KPTmap = (pt_entry_t *)KPTphys; + + /* Allocate Page Table Directory */ +#if defined(PAE) || defined(PAE_TABLES) + /* XXX only need 32 bytes (easier for now) */ + IdlePDPT = (pdpt_entry_t *)allocpages(1, &physfree); +#endif + IdlePTD = (pd_entry_t *)allocpages(NPGPTD, &physfree); + + /* Allocate KSTACK */ + proc0kstack = allocpages(TD0_KSTACK_PAGES, &physfree); + + /* vm86/bios stack */ + vm86phystk = allocpages(1, &physfree); + + /* pgtable + ext + IOPAGES */ + vm86paddr = vm86pa = allocpages(3, &physfree); + + /* Install page tables into PTD. Page table page 1 is wasted. */ + for (a = 0; a < NKPT; a++) + IdlePTD[a] = (KPTphys + ptoa(a)) | PG_V | PG_RW | PG_A | PG_M; + +#if defined(PAE) || defined(PAE_TABLES) + /* PAE install PTD pointers into PDPT */ + for (a = 0; a < NPGPTD; a++) + IdlePDPT[a] = ((u_int)IdlePTD + ptoa(a)) | PG_V; +#endif + + /* + * Install recursive mapping for kernel page tables into + * itself. + */ + for (a = 0; a < NPGPTD; a++) + IdlePTD[PTDPTDI + a] = ((u_int)IdlePTD + ptoa(a)) | PG_V | + PG_RW; + + /* + * Initialize page table pages mapping physical address zero + * through the (physical) end of the kernel. Many of these + * pages must be reserved, and we reserve them all and map + * them linearly for convenience. We do this even if we've + * enabled PSE above; we'll just switch the corresponding + * kernel PDEs before we turn on paging. + * + * This and all other page table entries allow read and write + * access for various reasons. Kernel mappings never have any + * access restrictions. + */ + pmap_cold_mapident(0, atop(NBPDR)); + pmap_cold_map(0, NBPDR, atop(NBPDR)); + pmap_cold_mapident(KERNBASE, atop(KERNend - KERNBASE)); + + /* Map page table directory */ +#if defined(PAE) || defined(PAE_TABLES) + pmap_cold_mapident((u_long)IdlePDPT, 1); +#endif + pmap_cold_mapident((u_long)IdlePTD, NPGPTD); + + /* Map proc0kstack */ + pmap_cold_mapident(proc0kstack, TD0_KSTACK_PAGES); + /* ISA hole already mapped */ + + pmap_cold_mapident(vm86phystk, 1); + pmap_cold_mapident(vm86pa, 3); + + /* Map page 0 into the vm86 page table */ + *(pt_entry_t *)vm86pa = 0 | PG_RW | PG_U | PG_A | PG_M | PG_V; + + /* ...likewise for the ISA hole for vm86 */ + for (pt = (pt_entry_t *)vm86pa + atop(ISA_HOLE_START), a = 0; + a < atop(ISA_HOLE_LENGTH); a++, pt++) + *pt = (ISA_HOLE_START + ptoa(a)) | PG_RW | PG_U | PG_A | + PG_M | PG_V; + + /* Enable PSE, PGE, VME, and PAE if configured. */ + ncr4 = 0; + if ((cpu_feature & CPUID_PSE) != 0) { + ncr4 |= CR4_PSE; + /* + * Superpage mapping of the kernel text. Existing 4k + * page table pages are wasted. + */ + for (a = KERNBASE; a < KERNend; a += NBPDR) + IdlePTD[a >> PDRSHIFT] = a | PG_PS | PG_A | PG_M | + PG_RW | PG_V; + } + if ((cpu_feature & CPUID_PGE) != 0) { + ncr4 |= CR4_PGE; + pgeflag = PG_G; + } + ncr4 |= (cpu_feature & CPUID_VME) != 0 ? 
CR4_VME : 0; +#if defined(PAE) || defined(PAE_TABLES) + ncr4 |= CR4_PAE; +#endif + if (ncr4 != 0) + load_cr4(rcr4() | ncr4); + + /* Now enable paging */ +#if defined(PAE) || defined(PAE_TABLES) + cr3 = (u_int)IdlePDPT; +#else + cr3 = (u_int)IdlePTD; +#endif + load_cr3(cr3); + load_cr0(rcr0() | CR0_PG); + + /* + * Now running relocated at KERNBASE where the system is + * linked to run. + */ + + /* + * Remove the lowest part of the double mapping of low memory + * to get some null pointer checks. + */ + IdlePTD[0] = 0; + load_cr3(cr3); /* invalidate TLB */ +} /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled + * in locore.s with the page table created in pmap_cold(), * and just syncs the pmap module with what has already been done. - * [We can't call it easily with mapping off since the kernel is not - * mapped with PA == VA, hence we would have to relocate every address - * from the linked base (virtual) address "KERNBASE" to the actual - * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t firstaddr) @@ -391,7 +569,7 @@ * page that it allocated. Preferably, locore would provide a first * unused virtual address in addition to "firstaddr". */ - virtual_avail = (vm_offset_t) KERNBASE + firstaddr; + virtual_avail = (vm_offset_t)firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; @@ -399,9 +577,9 @@ * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); - kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); + kernel_pmap->pm_pdir = IdlePTD; #if defined(PAE) || defined(PAE_TABLES) - kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); + kernel_pmap->pm_pdpt = IdlePDPT; #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); @@ -411,19 +589,6 @@ */ rw_init(&pvh_global_lock, "pmap pv global"); - LIST_INIT(&allpmaps); - - /* - * Request a spin mutex so that changes to allpmaps cannot be - * preempted by smp_rendezvous_cpus(). Otherwise, - * pmap_update_pde_kernel() could access allpmaps while it is - * being changed. - */ - mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); - mtx_lock_spin(&allpmaps_lock); - LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); - mtx_unlock_spin(&allpmaps_lock); - /* * Reserve some special page table entries/VA space for temporary * mapping of pages. @@ -474,14 +639,7 @@ SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) for (i = 0; i < NKPT; i++) - KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V; - - /* - * Adjust the start of the KPTD and KPTmap so that the implementation - * of pmap_kextract() and pmap_growkernel() can be made simpler. - */ - KPTD -= KPTDI; - KPTmap -= i386_btop(KPTDI << PDRSHIFT); + KPTD[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V; /* * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), @@ -494,18 +652,6 @@ virtual_avail = va; - /* - * Finish removing the identity mapping (virt == phys) of low memory. - * It was only used for 2 instructions in locore. locore then - * unmapped the first PTD to get some null pointer checks. ACPI - * wakeup will map the first PTD transiently to use it for 1 - * instruction. The double mapping for low memory is not usable in - * normal operation since it breaks trapping of null pointers and - * causes inconsistencies in page tables when combined with PG_G. - */ - for (i = 1; i < NKPT; i++) - PTD[i] = 0; - /* * Initialize the PAT MSR if present. 
* pmap_init_pat() clears and sets CR4_PGE, which, as a @@ -515,9 +661,6 @@ * comes with PAT. Both features were added for Pentium Pro. */ pmap_init_pat(); - - /* Turn on PG_G on kernel page(s) */ - pmap_set_pg(); } static void @@ -529,21 +672,32 @@ CPU_FOREACH(i) { pc = pcpu_find(i); + mtx_init(&pc->pc_copyout_mlock, "cpmlk", NULL, MTX_DEF | + MTX_NEW); + pc->pc_copyout_maddr = kva_alloc(ptoa(2)); + if (pc->pc_copyout_maddr == 0) + panic("unable to allocate non-sleepable copyout KVA"); + sx_init(&pc->pc_copyout_slock, "cpslk"); + pc->pc_copyout_saddr = kva_alloc(ptoa(2)); + if (pc->pc_copyout_saddr == 0) + panic("unable to allocate sleepable copyout KVA"); + /* - * Skip if the mapping has already been initialized, + * Skip if the mappings have already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap_addr1 != 0) continue; + mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) - panic("%s: unable to allocate KVA", __func__); + panic("unable to allocate CMAP KVA"); pc->pc_cmap_pte1 = vtopte(pages); pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); pc->pc_cmap_addr1 = (caddr_t)pages; pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); - pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); + pc->pc_qmap_addr = pages + atop(2); } } @@ -653,39 +807,6 @@ load_cr4(cr4); } -/* - * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. - */ -static void -pmap_set_pg(void) -{ - pt_entry_t *pte; - vm_offset_t va, endva; - - if (pgeflag == 0) - return; - - endva = KERNBASE + KERNend; - - if (pseflag) { - va = KERNBASE + roundup2(KERNLOAD, NBPDR); - while (va < endva) { - pdir_pde(PTD, va) |= pgeflag; - invltlb(); /* Flush non-PG_G entries. */ - va += NBPDR; - } - } else { - va = (vm_offset_t)btext; - while (va < endva) { - pte = vtopte(va); - if (*pte) - *pte |= pgeflag; - invltlb(); /* Flush non-PG_G entries. */ - va += PAGE_SIZE; - } - } -} - /* * Initialize a vm_page's machine-dependent fields. */ @@ -783,12 +904,12 @@ * page table pages. */ for (i = 0; i < NKPT; i++) { - mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); + mpte = PHYS_TO_VM_PAGE(KPTphys + ptoa(i)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; - mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); + mpte->phys_addr = KPTphys + ptoa(i); } /* @@ -859,6 +980,8 @@ #endif pmap_initialized = 1; + pmap_init_trm(); + if (!bootverbose) return; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { @@ -868,6 +991,7 @@ printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); } + } @@ -935,21 +1059,9 @@ pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) { pd_entry_t *pde; - pmap_t pmap; - boolean_t PTD_updated; - - PTD_updated = FALSE; - mtx_lock_spin(&allpmaps_lock); - LIST_FOREACH(pmap, &allpmaps, pm_list) { - if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & - PG_FRAME)) - PTD_updated = TRUE; - pde = pmap_pde(pmap, va); - pde_store(pde, newpde); - } - mtx_unlock_spin(&allpmaps_lock); - KASSERT(PTD_updated, - ("pmap_kenter_pde: current page table is not in allpmaps")); + + pde = pmap_pde(kernel_pmap, va); + pde_store(pde, newpde); } /* @@ -962,47 +1074,23 @@ static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) { - u_long cr4; if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. 
*/ invlpg(va); - else if ((newpde & PG_G) == 0) + else /* if ((newpde & PG_G) == 0) */ /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); - else { - /* - * Promotion: flush every 4KB page mapping from the TLB, - * including any global (PG_G) mappings. - */ - cr4 = rcr4(); - load_cr4(cr4 & ~CR4_PGE); - /* - * Although preemption at this point could be detrimental to - * performance, it would not lead to an error. PG_G is simply - * ignored if CR4.PGE is clear. Moreover, in case this block - * is re-entered, the load_cr4() either above or below will - * modify CR4.PGE flushing the TLB. - */ - load_cr4(cr4 | CR4_PGE); - } } void invltlb_glob(void) { - uint64_t cr4; - if (pgeflag == 0) { - invltlb(); - } else { - cr4 = rcr4(); - load_cr4(cr4 & ~CR4_PGE); - load_cr4(cr4 | CR4_PGE); - } + invltlb(); } @@ -1033,15 +1121,15 @@ u_int cpuid; sched_pin(); - if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { + if (pmap == kernel_pmap) { invlpg(va); mask = &all_cpus; + } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { + mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) - invlpg(va); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } @@ -1065,17 +1153,16 @@ } sched_pin(); - if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { + if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; + } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { + mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } @@ -1091,17 +1178,14 @@ sched_pin(); if (pmap == kernel_pmap) { - invltlb_glob(); + invltlb(); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { - invltlb(); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); - if (CPU_ISSET(cpuid, &pmap->pm_active)) - invltlb(); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } @@ -1132,19 +1216,10 @@ { struct pde_action *act = arg; pd_entry_t *pde; - pmap_t pmap; if (act->store == PCPU_GET(cpuid)) { - - /* - * Elsewhere, this operation requires allpmaps_lock for - * synchronization. Here, it does not because it is being - * performed in the context of an all_cpus rendezvous. 
- */ - LIST_FOREACH(pmap, &allpmaps, pm_list) { - pde = pmap_pde(pmap, act->va); - pde_store(pde, act->newpde); - } + pde = pmap_pde(kernel_pmap, act->va); + pde_store(pde, act->newpde); } } @@ -1219,7 +1294,7 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) + if (pmap == kernel_pmap) invlpg(va); } @@ -1228,7 +1303,7 @@ { vm_offset_t addr; - if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) + if (pmap == kernel_pmap) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } @@ -1238,8 +1313,6 @@ { if (pmap == kernel_pmap) - invltlb_glob(); - else if (!CPU_EMPTY(&pmap->pm_active)) invltlb(); } @@ -1371,8 +1444,7 @@ pmap_is_current(pmap_t pmap) { - return (pmap == kernel_pmap || pmap == - vmspace_pmap(curthread->td_proc->p_vmspace)); + return (pmap == kernel_pmap); } /* @@ -1570,7 +1642,7 @@ pt_entry_t *pte; pte = vtopte(va); - pte_store(pte, pa | PG_RW | PG_V | pgeflag); + pte_store(pte, pa | PG_RW | PG_V); } static __inline void @@ -1579,7 +1651,7 @@ pt_entry_t *pte; pte = vtopte(va); - pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); + pte_store(pte, pa | PG_RW | PG_V | pmap_cache_bits(mode, 0)); } /* @@ -1638,7 +1710,7 @@ pseflag) { KASSERT((va & PDRMASK) == 0, ("pmap_map: misaligned va %#x", va)); - newpde = start | PG_PS | pgeflag | PG_RW | PG_V; + newpde = start | PG_PS | PG_RW | PG_V; pmap_kenter_pde(va, newpde); va += NBPDR; start += NBPDR; @@ -1678,9 +1750,9 @@ if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { oldpte |= *pte; #if defined(PAE) || defined(PAE_TABLES) - pte_store(pte, pa | pgeflag | pg_nx | PG_RW | PG_V); + pte_store(pte, pa | pg_nx | PG_RW | PG_V); #else - pte_store(pte, pa | pgeflag | PG_RW | PG_V); + pte_store(pte, pa | PG_RW | PG_V); #endif } pte++; @@ -1809,7 +1881,7 @@ pd_entry_t ptepde; vm_page_t mpte; - if (va >= VM_MAXUSER_ADDRESS) + if (pmap == kernel_pmap) return (0); ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); @@ -1824,14 +1896,9 @@ { PMAP_LOCK_INIT(pmap); - /* - * Since the page table directory is shared with the kernel pmap, - * which is already included in the list "allpmaps", this pmap does - * not need to be inserted into that list. - */ - pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); + pmap->pm_pdir = IdlePTD; #if defined(PAE) || defined(PAE_TABLES) - pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); + pmap->pm_pdpt = IdlePDPT; #endif pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); @@ -1847,8 +1914,7 @@ int pmap_pinit(pmap_t pmap) { - vm_page_t m, ptdpg[NPGPTD]; - vm_paddr_t pa; + vm_page_t m; int i; /* @@ -1878,32 +1944,25 @@ for (i = 0; i < NPGPTD;) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); - if (m == NULL) + if (m == NULL) { vm_wait(NULL); - else - ptdpg[i++] = m; + } else { + pmap->pm_ptdpg[i] = m; +#if defined(PAE) || defined(PAE_TABLES) + pmap->pm_pdpt[i] = VM_PAGE_TO_PHYS(m) | PG_V; +#endif + i++; + } } - pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); + pmap_qenter((vm_offset_t)pmap->pm_pdir, pmap->pm_ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) - if ((ptdpg[i]->flags & PG_ZERO) == 0) + if ((pmap->pm_ptdpg[i]->flags & PG_ZERO) == 0) pagezero(pmap->pm_pdir + (i * NPDEPG)); - mtx_lock_spin(&allpmaps_lock); - LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); - /* Copy the kernel page table directory entries. 
*/ - bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); - mtx_unlock_spin(&allpmaps_lock); - - /* install self-referential address mapping entry(s) */ - for (i = 0; i < NPGPTD; i++) { - pa = VM_PAGE_TO_PHYS(ptdpg[i]); - pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; -#if defined(PAE) || defined(PAE_TABLES) - pmap->pm_pdpt[i] = pa | PG_V; -#endif - } + /* Install the trampoline mapping. */ + pmap->pm_pdir[TRPTDI] = PTD[TRPTDI]; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); @@ -2016,7 +2075,7 @@ void pmap_release(pmap_t pmap) { - vm_page_t m, ptdpg[NPGPTD]; + vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, @@ -2027,27 +2086,16 @@ KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); - mtx_lock_spin(&allpmaps_lock); - LIST_REMOVE(pmap, pm_list); - mtx_unlock_spin(&allpmaps_lock); - - for (i = 0; i < NPGPTD; i++) - ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & - PG_FRAME); - - bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * - sizeof(*pmap->pm_pdir)); - pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { - m = ptdpg[i]; + m = pmap->pm_ptdpg[i]; #if defined(PAE) || defined(PAE_TABLES) KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif vm_page_unwire_noq(m); - vm_page_free_zero(m); + vm_page_free(m); } } @@ -2107,7 +2155,7 @@ pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); - pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; + pdir_pde(KPTD, kernel_vm_end) = newpdir; pmap_kenter_pde(kernel_vm_end, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; @@ -2665,7 +2713,7 @@ " in pmap %p", va, pmap); return (FALSE); } - if (va < VM_MAXUSER_ADDRESS) + if (pmap != kernel_pmap) pmap->pm_stats.resident_count++; } mptepa = VM_PAGE_TO_PHYS(mpte); @@ -2676,7 +2724,7 @@ * temporarily map the page table page (mpte) into the kernel's * address space at either PADDR1 or PADDR2. */ - if (va >= KERNBASE) + if (pmap == kernel_pmap) firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if ((*PMAP1 & PG_FRAME) != mptepa) { @@ -3471,9 +3519,11 @@ mpte = NULL; wired = (flags & PMAP_ENTER_WIRED) != 0; - KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); - KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, - ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", + KASSERT((pmap == kernel_pmap && va < VM_MAX_KERNEL_ADDRESS) || + (pmap != kernel_pmap && va < VM_MAXUSER_ADDRESS), + ("pmap_enter: toobig k%d %#x", pmap == kernel_pmap, va)); + KASSERT(va < PMAP_TRM_MIN_ADDRESS, + ("pmap_enter: invalid to pmap_enter into trampoline (va: 0x%x)", va)); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); @@ -3483,7 +3533,7 @@ sched_pin(); pde = pmap_pde(pmap, va); - if (va < VM_MAXUSER_ADDRESS) { + if (pmap != kernel_pmap) { /* * va is for UVA. * In the case that a page table page is not resident, @@ -3582,7 +3632,8 @@ * Enter on the PV list if part of our managed memory. 
*/ if ((m->oflags & VPO_UNMANAGED) == 0) { - KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, + KASSERT(pmap != kernel_pmap || va < kmi.clean_sva || + va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if (pv == NULL) pv = get_pv_entry(pmap, FALSE); @@ -3614,10 +3665,8 @@ #endif if (wired) newpte |= PG_W; - if (va < VM_MAXUSER_ADDRESS) + if (pmap != kernel_pmap) newpte |= PG_U; - if (pmap == kernel_pmap) - newpte |= pgeflag; /* * if the mapping or permission bits are different, we need @@ -3802,8 +3851,8 @@ vm_paddr_t pa; struct spglist free; - KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || - (m->oflags & VPO_UNMANAGED) != 0, + KASSERT(pmap != kernel_pmap || va < kmi.clean_sva || + va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); @@ -3812,7 +3861,7 @@ * In the case that a page table page is not * resident, we are creating it here. */ - if (va < VM_MAXUSER_ADDRESS) { + if (pmap != kernel_pmap) { u_int ptepindex; pd_entry_t ptepa; @@ -3848,18 +3897,14 @@ mpte = NULL; } - /* - * This call to vtopte makes the assumption that we are - * entering the page into the current pmap. In order to support - * quick entry into any pmap, one would likely use pmap_pte_quick. - * But that isn't as quick as vtopte. - */ - pte = vtopte(va); + /* XXXKIB: pmap_pte_quick() instead ? */ + pte = pmap_pte(pmap, va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } + pmap_pte_release(pte); return (mpte); } @@ -3877,6 +3922,7 @@ mpte = NULL; } + pmap_pte_release(pte); return (mpte); } @@ -3898,6 +3944,7 @@ pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); + pmap_pte_release(pte); return (mpte); } @@ -4101,6 +4148,9 @@ vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; + /* XXXKIB */ + return; + if (dst_addr != src_addr) return; @@ -4122,8 +4172,8 @@ pd_entry_t srcptepaddr; u_int ptepindex; - KASSERT(addr < UPT_MIN_ADDRESS, - ("pmap_copy: invalid to pmap_copy page tables")); + KASSERT(addr < PMAP_TRM_MIN_ADDRESS, + ("pmap_copy: invalid to pmap_copy the trampoline")); pdnxt = (addr + NBPDR) & ~PDRMASK; if (pdnxt < addr) @@ -4519,7 +4569,7 @@ pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; if ((tpte & PG_PS) == 0) { - pte = vtopte(pv->pv_va); + pte = pmap_pte_quick(pmap, pv->pv_va); tpte = *pte & ~PG_PTE_PAT; } @@ -4685,8 +4735,10 @@ PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (*pde != 0 && (*pde & PG_PS) == 0) { - pte = vtopte(addr); - rv = *pte == 0; + pte = pmap_pte(pmap, addr); + if (pte != NULL) + rv = *pte == 0; + pmap_pte_release(pte); } PMAP_UNLOCK(pmap); return (rv); @@ -5188,8 +5240,8 @@ size = round_page(offset + size); pa = pa & PG_FRAME; - if (pa < KERNLOAD && pa + size <= KERNLOAD) - va = KERNBASE + pa; + if (pa < PMAP_MAP_LOW && pa + size <= PMAP_MAP_LOW) + va = pa + PMAP_MAP_LOW; else if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { @@ -5248,7 +5300,7 @@ vm_offset_t offset; int i; - if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) + if (va >= PMAP_MAP_LOW && va <= KERNBASE && va + size <= KERNBASE) return; offset = va & PAGE_MASK; size = round_page(offset + size); @@ -5545,7 +5597,6 @@ * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; - load_cr3(cr3); PCPU_SET(curpmap, pmap); critical_exit(); } @@ -5613,6 +5664,75 @@ critical_exit(); } +static vmem_t *pmap_trm_arena; +static 
vmem_addr_t pmap_trm_arena_last = PMAP_TRM_MIN_ADDRESS; + +static int +pmap_trm_import(void *unused __unused, vmem_size_t size, int flags, + vmem_addr_t *addrp) +{ + vm_page_t m; + vmem_addr_t af, addr, prev_addr; + pt_entry_t *trm_pte; + + prev_addr = atomic_load_long(&pmap_trm_arena_last); + size = round_page(size); + for (;;) { + if (prev_addr + size < prev_addr || prev_addr + size < size || + prev_addr + size > PMAP_TRM_MAX_ADDRESS) + return (ENOMEM); + addr = prev_addr + size; + if (atomic_fcmpset_int(&pmap_trm_arena_last, &prev_addr, addr)) + break; + } + trm_pte = PTmap + atop(prev_addr); + for (af = prev_addr; af < addr; af += PAGE_SIZE) { + m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY | + VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); + pte_store(&trm_pte[atop(af - prev_addr)], VM_PAGE_TO_PHYS(m) | + PG_M | PG_A | PG_RW | PG_V | pgeflag | + pmap_cache_bits(VM_MEMATTR_DEFAULT, FALSE)); + } + *addrp = prev_addr; + return (0); +} + +static +void pmap_init_trm(void) +{ + vm_page_t pd_m; + + pmap_trm_arena = vmem_create("i386trampoline", 0, 0, 1, 0, M_WAITOK); + vmem_set_import(pmap_trm_arena, pmap_trm_import, NULL, NULL, PAGE_SIZE); + pd_m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY | + VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK | VM_ALLOC_ZERO); + if ((pd_m->flags & PG_ZERO) == 0) + pmap_zero_page(pd_m); + PTD[TRPTDI] = VM_PAGE_TO_PHYS(pd_m) | PG_M | PG_A | PG_RW | PG_V | + pmap_cache_bits(VM_MEMATTR_DEFAULT, TRUE); +} + +void * +pmap_trm_alloc(size_t size, int flags) +{ + vmem_addr_t res; + int error; + + MPASS((flags & ~(M_WAITOK | M_NOWAIT | M_ZERO)) == 0); + error = vmem_xalloc(pmap_trm_arena, roundup2(size, 4), sizeof(int), + 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags | M_FIRSTFIT, &res); + if (error != 0) + return (NULL); + return ((void *)res); +} + +void +pmap_trm_free(void *addr, size_t size) +{ + + vmem_free(pmap_trm_arena, (uintptr_t)addr, roundup2(size, 4)); +} + #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { Index: sys/i386/i386/sigtramp.s =================================================================== --- sys/i386/i386/sigtramp.s +++ sys/i386/i386/sigtramp.s @@ -97,6 +97,45 @@ pushl %eax /* junk to fake return addr. */ int $0x80 /* enter kernel with args */ 0: jmp 0b + +/* + * The lcall $7,$0 handler cannot use the call gate that does an + * inter-privilege transition. The reason is that the call gate + * does not disable interrupts, and, before the kernel page table is + * activated on MMU, we would have a window where the ring 0 code is + * executed with the wrong page table and interrupts enabled. + * + * Instead, set LDT descriptor 0 as code segment, which reflects + * the lcall $7,$0 back to ring 3 trampoline. The trampoline sets up + * the frame for int $0x80. + */ + ALIGN_TEXT +lcall_tramp: + cmpl $SYS_vfork,%eax + je 1f + pushl %ebp + movl %esp,%ebp + pushl 0x24(%ebp) /* arg 6 */ + pushl 0x20(%ebp) + pushl 0x1c(%ebp) + pushl 0x18(%ebp) + pushl 0x14(%ebp) + pushl 0x10(%ebp) /* arg 1 */ + subl $4,%esp /* gap */ + int $0x80 + leavel + lretl +1: + /* + * vfork handling is special and relies on the libc stub saving + * the return ip in %ecx. Also, we assume that the call was done + * with ucode32 selector in %cs. 
+ */ + int $0x80 + movl $0x33,4(%esp) /* GUCODE32_SEL | SEL_UPL */ + movl %ecx,(%esp) + lretl + #endif /* COMPAT_43 */ ALIGN_TEXT @@ -115,4 +154,7 @@ .globl szosigcode szosigcode: .long esigcode-osigcode + .globl sz_lcall_tramp +sz_lcall_tramp: + .long esigcode-lcall_tramp #endif Index: sys/i386/i386/support.s =================================================================== --- sys/i386/i386/support.s +++ sys/i386/i386/support.s @@ -251,196 +251,6 @@ ret END(memcpy) -/*****************************************************************************/ -/* copyout and fubyte family */ -/*****************************************************************************/ -/* - * Access user memory from inside the kernel. These routines and possibly - * the math- and DOS emulators should be the only places that do this. - * - * We have to access the memory with user's permissions, so use a segment - * selector with RPL 3. For writes to user space we have to additionally - * check the PTE for write permission, because the 386 does not check - * write permissions when we are executing with EPL 0. The 486 does check - * this if the WP bit is set in CR0, so we can use a simpler version here. - * - * These routines set curpcb->pcb_onfault for the time they execute. When a - * protection violation occurs inside the functions, the trap handler - * returns to *curpcb->pcb_onfault instead of the function. - */ - -/* - * copyout(from_kernel, to_user, len) - MP SAFE - */ -ENTRY(copyout) - movl PCPU(CURPCB),%eax - movl $copyout_fault,PCB_ONFAULT(%eax) - pushl %esi - pushl %edi - pushl %ebx - movl 16(%esp),%esi - movl 20(%esp),%edi - movl 24(%esp),%ebx - testl %ebx,%ebx /* anything to do? */ - jz done_copyout - - /* - * Check explicitly for non-user addresses. This check is essential - * because it prevents usermode from writing into the kernel. We do - * not verify anywhere else that the user did not specify a rogue - * address. - */ - /* - * First, prevent address wrapping. - */ - movl %edi,%eax - addl %ebx,%eax - jc copyout_fault -/* - * XXX STOP USING VM_MAXUSER_ADDRESS. - * It is an end address, not a max, so every time it is used correctly it - * looks like there is an off by one error, and of course it caused an off - * by one error in several places. 
- */ - cmpl $VM_MAXUSER_ADDRESS,%eax - ja copyout_fault - - /* bcopy(%esi, %edi, %ebx) */ - movl %ebx,%ecx - - shrl $2,%ecx - rep - movsl - movb %bl,%cl - andb $3,%cl - rep - movsb - -done_copyout: - popl %ebx - popl %edi - popl %esi - xorl %eax,%eax - movl PCPU(CURPCB),%edx - movl %eax,PCB_ONFAULT(%edx) - ret -END(copyout) - - ALIGN_TEXT -copyout_fault: - popl %ebx - popl %edi - popl %esi - movl PCPU(CURPCB),%edx - movl $0,PCB_ONFAULT(%edx) - movl $EFAULT,%eax - ret - -/* - * copyin(from_user, to_kernel, len) - MP SAFE - */ -ENTRY(copyin) - movl PCPU(CURPCB),%eax - movl $copyin_fault,PCB_ONFAULT(%eax) - pushl %esi - pushl %edi - movl 12(%esp),%esi /* caddr_t from */ - movl 16(%esp),%edi /* caddr_t to */ - movl 20(%esp),%ecx /* size_t len */ - - /* - * make sure address is valid - */ - movl %esi,%edx - addl %ecx,%edx - jc copyin_fault - cmpl $VM_MAXUSER_ADDRESS,%edx - ja copyin_fault - - movb %cl,%al - shrl $2,%ecx /* copy longword-wise */ - rep - movsl - movb %al,%cl - andb $3,%cl /* copy remaining bytes */ - rep - movsb - - popl %edi - popl %esi - xorl %eax,%eax - movl PCPU(CURPCB),%edx - movl %eax,PCB_ONFAULT(%edx) - ret -END(copyin) - - ALIGN_TEXT -copyin_fault: - popl %edi - popl %esi - movl PCPU(CURPCB),%edx - movl $0,PCB_ONFAULT(%edx) - movl $EFAULT,%eax - ret - -/* - * casueword. Compare and set user word. Returns -1 on fault, - * 0 on non-faulting access. The current value is in *oldp. - */ -ALTENTRY(casueword32) -ENTRY(casueword) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx /* dst */ - movl 8(%esp),%eax /* old */ - movl 16(%esp),%ecx /* new */ - - cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ - ja fusufault - -#ifdef SMP - lock -#endif - cmpxchgl %ecx,(%edx) /* Compare and set. */ - - /* - * The old value is in %eax. If the store succeeded it will be the - * value we expected (old) from before the store, otherwise it will - * be the current value. - */ - - movl PCPU(CURPCB),%ecx - movl $0,PCB_ONFAULT(%ecx) - movl 12(%esp),%edx /* oldp */ - movl %eax,(%edx) - xorl %eax,%eax - ret -END(casueword32) -END(casueword) - -/* - * Fetch (load) a 32-bit word, a 16-bit word, or an 8-bit byte from user - * memory. - */ - -ALTENTRY(fueword32) -ENTRY(fueword) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx /* from */ - - cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ - ja fusufault - - movl (%edx),%eax - movl $0,PCB_ONFAULT(%ecx) - movl 8(%esp),%edx - movl %eax,(%edx) - xorl %eax,%eax - ret -END(fueword32) -END(fueword) - /* * fuswintr() and suswintr() are specialized variants of fuword16() and * suword16(), respectively. They are called from the profiling code, @@ -455,167 +265,6 @@ END(suswintr) END(fuswintr) -ENTRY(fuword16) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx - - cmpl $VM_MAXUSER_ADDRESS-2,%edx - ja fusufault - - movzwl (%edx),%eax - movl $0,PCB_ONFAULT(%ecx) - ret -END(fuword16) - -ENTRY(fubyte) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx - - cmpl $VM_MAXUSER_ADDRESS-1,%edx - ja fusufault - - movzbl (%edx),%eax - movl $0,PCB_ONFAULT(%ecx) - ret -END(fubyte) - - ALIGN_TEXT -fusufault: - movl PCPU(CURPCB),%ecx - xorl %eax,%eax - movl %eax,PCB_ONFAULT(%ecx) - decl %eax - ret - -/* - * Store a 32-bit word, a 16-bit word, or an 8-bit byte to user memory. - * All these functions are MPSAFE. 
- */ - -ALTENTRY(suword32) -ENTRY(suword) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx - - cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address validity */ - ja fusufault - - movl 8(%esp),%eax - movl %eax,(%edx) - xorl %eax,%eax - movl PCPU(CURPCB),%ecx - movl %eax,PCB_ONFAULT(%ecx) - ret -END(suword32) -END(suword) - -ENTRY(suword16) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx - - cmpl $VM_MAXUSER_ADDRESS-2,%edx /* verify address validity */ - ja fusufault - - movw 8(%esp),%ax - movw %ax,(%edx) - xorl %eax,%eax - movl PCPU(CURPCB),%ecx /* restore trashed register */ - movl %eax,PCB_ONFAULT(%ecx) - ret -END(suword16) - -ENTRY(subyte) - movl PCPU(CURPCB),%ecx - movl $fusufault,PCB_ONFAULT(%ecx) - movl 4(%esp),%edx - - cmpl $VM_MAXUSER_ADDRESS-1,%edx /* verify address validity */ - ja fusufault - - movb 8(%esp),%al - movb %al,(%edx) - xorl %eax,%eax - movl PCPU(CURPCB),%ecx /* restore trashed register */ - movl %eax,PCB_ONFAULT(%ecx) - ret -END(subyte) - -/* - * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE - * - * copy a string from 'from' to 'to', stop when a 0 character is reached. - * return ENAMETOOLONG if string is longer than maxlen, and - * EFAULT on protection violations. If lencopied is non-zero, - * return the actual length in *lencopied. - */ -ENTRY(copyinstr) - pushl %esi - pushl %edi - movl PCPU(CURPCB),%ecx - movl $cpystrflt,PCB_ONFAULT(%ecx) - - movl 12(%esp),%esi /* %esi = from */ - movl 16(%esp),%edi /* %edi = to */ - movl 20(%esp),%edx /* %edx = maxlen */ - - movl $VM_MAXUSER_ADDRESS,%eax - - /* make sure 'from' is within bounds */ - subl %esi,%eax - jbe cpystrflt - - /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ - cmpl %edx,%eax - jae 1f - movl %eax,%edx - movl %eax,20(%esp) -1: - incl %edx - -2: - decl %edx - jz 3f - - lodsb - stosb - orb %al,%al - jnz 2b - - /* Success -- 0 byte reached */ - decl %edx - xorl %eax,%eax - jmp cpystrflt_x -3: - /* edx is zero - return ENAMETOOLONG or EFAULT */ - cmpl $VM_MAXUSER_ADDRESS,%esi - jae cpystrflt -4: - movl $ENAMETOOLONG,%eax - jmp cpystrflt_x - -cpystrflt: - movl $EFAULT,%eax - -cpystrflt_x: - /* set *lencopied and return %eax */ - movl PCPU(CURPCB),%ecx - movl $0,PCB_ONFAULT(%ecx) - movl 20(%esp),%ecx - subl %edx,%ecx - movl 24(%esp),%edx - testl %edx,%edx - jz 1f - movl %ecx,(%edx) -1: - popl %edi - popl %esi - ret -END(copyinstr) - /* * copystr(from, to, maxlen, int *lencopied) - MP SAFE */ Index: sys/i386/i386/swtch.s =================================================================== --- sys/i386/i386/swtch.s +++ sys/i386/i386/swtch.s @@ -86,8 +86,6 @@ 1: movl 8(%esp),%ecx /* New thread */ movl TD_PCB(%ecx),%edx - movl PCB_CR3(%edx),%eax - movl %eax,%cr3 /* set bit in new pm_active */ movl TD_PROC(%ecx),%eax movl P_VMSPACE(%eax), %ebx @@ -157,7 +155,7 @@ popl %eax 1: - /* Save is done. Now fire up new thread. Leave old vmspace. */ + /* Save is done. Now fire up new thread. */ movl 4(%esp),%edi movl 8(%esp),%ecx /* New thread */ movl 12(%esp),%esi /* New lock */ @@ -167,15 +165,10 @@ #endif movl TD_PCB(%ecx),%edx - /* switch address space */ - movl PCB_CR3(%edx),%eax - movl %cr3,%ebx /* The same address space? 
*/ - cmpl %ebx,%eax - je sw0 - movl %eax,%cr3 /* new address space */ + /* Switchout td_lock */ movl %esi,%eax movl PCPU(CPUID),%esi - SETOP %eax,TD_LOCK(%edi) /* Switchout td_lock */ + SETOP %eax,TD_LOCK(%edi) /* Release bit from old pmap->pm_active */ movl PCPU(CURPMAP), %ebx @@ -200,26 +193,28 @@ sw1: BLOCK_SPIN(%ecx) /* - * At this point, we've switched address spaces and are ready + * At this point, we have managed thread locks and are ready * to load up the rest of the next context. */ + + /* Load a pointer to the thread kernel stack into PCPU. */ + leal -16(%edx), %eax /* leave space for vm86 */ + movl %eax, PCPU(KESP0) + cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f /* If not, use the default */ movl $1, PCPU(PRIVATE_TSS) /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ + movl PCPU(TRAMPSTK), %ebx + movl %ebx, PCB_EXT_TSS+TSS_ESP0(%edi) jmp 2f /* Load it up */ 1: /* * Use the common default TSS instead of our own. - * Set our stack pointer into the TSS, it's set to just - * below the PCB. In C, common_tss.tss_esp0 = &pcb - 16; - */ - leal -16(%edx), %ebx /* leave space for vm86 */ - movl %ebx, PCPU(COMMON_TSS) + TSS_ESP0 - - /* - * Test this CPU's bit in the bitmap to see if this - * CPU was using a private TSS. + * Stack pointer in the common TSS points to the trampoline stack + * already and should be not changed. + * + * Test this CPU's flag to see if this CPU was using a private TSS. */ cmpl $0, PCPU(PRIVATE_TSS) /* Already using the common? */ je 3f /* if so, skip reloading */ Index: sys/i386/i386/sys_machdep.c =================================================================== --- sys/i386/i386/sys_machdep.c +++ sys/i386/i386/sys_machdep.c @@ -294,10 +294,8 @@ 0 /* granularity */ }; - ext = (struct pcb_ext *)kmem_malloc(kernel_arena, ctob(IOPAGES+1), - M_WAITOK | M_ZERO); + ext = pmap_trm_alloc(ctob(IOPAGES + 1), M_WAITOK | M_ZERO); /* -16 is so we can convert a trapframe into vm86trapframe inplace */ - ext->ext_tss.tss_esp0 = (vm_offset_t)td->td_pcb - 16; ext->ext_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); /* * The last byte of the i/o map must be followed by an 0xff byte. @@ -323,6 +321,7 @@ /* Switch to the new TSS. */ critical_enter(); + ext->ext_tss.tss_esp0 = PCPU_GET(trampstk); td->td_pcb->pcb_ext = ext; PCPU_SET(private_tss, 1); *PCPU_GET(tss_gdt) = ext->ext_tssd; @@ -457,8 +456,8 @@ new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK); new_ldt->ldt_len = len = NEW_MAX_LD(len); - new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena, - len * sizeof(union descriptor), M_WAITOK | M_ZERO); + new_ldt->ldt_base = pmap_trm_alloc(len * sizeof(union descriptor), + M_WAITOK | M_ZERO); new_ldt->ldt_refcnt = 1; new_ldt->ldt_active = 0; @@ -473,7 +472,7 @@ bcopy(pldt->ldt_base, new_ldt->ldt_base, len * sizeof(union descriptor)); } else - bcopy(ldt, new_ldt->ldt_base, sizeof(ldt)); + bcopy(ldt, new_ldt->ldt_base, sizeof(union descriptor) * NLDT); return (new_ldt); } @@ -510,8 +509,8 @@ mtx_assert(&dt_lock, MA_OWNED); if (--pldt->ldt_refcnt == 0) { mtx_unlock_spin(&dt_lock); - kmem_free(kernel_arena, (vm_offset_t)pldt->ldt_base, - pldt->ldt_len * sizeof(union descriptor)); + pmap_trm_free(pldt->ldt_base, pldt->ldt_len * + sizeof(union descriptor)); free(pldt, M_SUBPROC); } else mtx_unlock_spin(&dt_lock); @@ -767,8 +766,7 @@ * free the new object and return. 
*/ mtx_unlock_spin(&dt_lock); - kmem_free(kernel_arena, - (vm_offset_t)new_ldt->ldt_base, + pmap_trm_free(new_ldt->ldt_base, new_ldt->ldt_len * sizeof(union descriptor)); free(new_ldt, M_SUBPROC); mtx_lock_spin(&dt_lock); @@ -801,8 +799,8 @@ mtx_unlock_spin(&dt_lock); #endif if (old_ldt_base != NULL_LDT_BASE) { - kmem_free(kernel_arena, (vm_offset_t)old_ldt_base, - old_ldt_len * sizeof(union descriptor)); + pmap_trm_free(old_ldt_base, old_ldt_len * + sizeof(union descriptor)); free(new_ldt, M_SUBPROC); } mtx_lock_spin(&dt_lock); Index: sys/i386/i386/trap.c =================================================================== --- sys/i386/i386/trap.c +++ sys/i386/i386/trap.c @@ -116,45 +116,59 @@ static void trap_fatal(struct trapframe *, vm_offset_t); void dblfault_handler(void); -extern inthand_t IDTVEC(lcall_syscall); - #define MAX_TRAP_MSG 32 -static char *trap_msg[] = { - "", /* 0 unused */ - "privileged instruction fault", /* 1 T_PRIVINFLT */ - "", /* 2 unused */ - "breakpoint instruction fault", /* 3 T_BPTFLT */ - "", /* 4 unused */ - "", /* 5 unused */ - "arithmetic trap", /* 6 T_ARITHTRAP */ - "", /* 7 unused */ - "", /* 8 unused */ - "general protection fault", /* 9 T_PROTFLT */ - "trace trap", /* 10 T_TRCTRAP */ - "", /* 11 unused */ - "page fault", /* 12 T_PAGEFLT */ - "", /* 13 unused */ - "alignment fault", /* 14 T_ALIGNFLT */ - "", /* 15 unused */ - "", /* 16 unused */ - "", /* 17 unused */ - "integer divide fault", /* 18 T_DIVIDE */ - "non-maskable interrupt trap", /* 19 T_NMI */ - "overflow trap", /* 20 T_OFLOW */ - "FPU bounds check fault", /* 21 T_BOUND */ - "FPU device not available", /* 22 T_DNA */ - "double fault", /* 23 T_DOUBLEFLT */ - "FPU operand fetch fault", /* 24 T_FPOPFLT */ - "invalid TSS fault", /* 25 T_TSSFLT */ - "segment not present fault", /* 26 T_SEGNPFLT */ - "stack fault", /* 27 T_STKFLT */ - "machine check trap", /* 28 T_MCHK */ - "SIMD floating-point exception", /* 29 T_XMMFLT */ - "reserved (unknown) fault", /* 30 T_RESERVED */ - "", /* 31 unused (reserved) */ - "DTrace pid return trap", /* 32 T_DTRACE_RET */ + +struct trap_data { + bool ei; + const char *msg; +}; + +static const struct trap_data trap_data[] = { + [T_PRIVINFLT] = { .ei = true, .msg = "privileged instruction fault" }, + [T_BPTFLT] = { .ei = false, .msg = "breakpoint instruction fault" }, + [T_ARITHTRAP] = { .ei = true, .msg = "arithmetic trap" }, + [T_PROTFLT] = { .ei = true, .msg = "general protection fault" }, + [T_TRCTRAP] = { .ei = false, .msg = "trace trap" }, + [T_PAGEFLT] = { .ei = false, .msg = "page fault" }, + [T_ALIGNFLT] = { .ei = true, .msg = "alignment fault" }, + [T_DIVIDE] = { .ei = true, .msg = "integer divide fault" }, + [T_NMI] = { .ei = false, .msg = "non-maskable interrupt trap" }, + [T_OFLOW] = { .ei = true, .msg = "overflow trap" }, + [T_BOUND] = { .ei = true, .msg = "FPU bounds check fault" }, + [T_DNA] = { .ei = true, .msg = "FPU device not available" }, + [T_DOUBLEFLT] = { .ei = false, .msg = "double fault" }, + [T_FPOPFLT] = { .ei = true, .msg = "FPU operand fetch fault" }, + [T_TSSFLT] = { .ei = true, .msg = "invalid TSS fault" }, + [T_SEGNPFLT] = { .ei = true, .msg = "segment not present fault" }, + [T_STKFLT] = { .ei = true, .msg = "stack fault" }, + [T_MCHK] = { .ei = true, .msg = "machine check trap" }, + [T_XMMFLT] = { .ei = true, .msg = "SIMD floating-point exception" }, + [T_DTRACE_RET] ={ .ei = true, .msg = "DTrace pid return trap" }, }; +static bool +trap_enable_intr(int trapno) +{ + + if (trapno < nitems(trap_data) && trap_data[trapno].msg != NULL) 
+ return (trap_data[trapno].ei); + return (false); +} + +static const char * +trap_msg(int trapno) +{ + const char *res; + static const char unkn[] = "UNKNOWN"; + + res = NULL; + if (trapno < nitems(trap_data)) + res = trap_data[trapno].msg; + if (res == NULL) + res = unkn; + return (res); +} + #if defined(I586_CPU) && !defined(NO_F00F_HACK) int has_f00f_bug = 0; /* Initialized so that it can be patched. */ #endif @@ -199,6 +213,8 @@ VM_CNT_INC(v_trap); type = frame->tf_trapno; + if (trap_enable_intr(type)) + enable_intr(); #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ @@ -582,24 +598,40 @@ * problem here and not have to check all the * selectors and pointers when the user changes * them. + * + * N.B. Comparing to long mode, 32-bit mode + * does not push %esp on the trap frame, + * because iretl faulted while in ring 0. As + * the consequence, there is no need to fixup + * the stack pointer for doreti_iret_fault, + * the fixup and the complimentary trap() call + * are executed on the main thread stack, not + * on the trampoline stack. */ - if (frame->tf_eip == (int)doreti_iret) { - frame->tf_eip = (int)doreti_iret_fault; + if (frame->tf_eip == (int)doreti_iret + setidt_disp) { + frame->tf_eip = (int)doreti_iret_fault + + setidt_disp; return; } if (type == T_STKFLT) break; - if (frame->tf_eip == (int)doreti_popl_ds) { - frame->tf_eip = (int)doreti_popl_ds_fault; + if (frame->tf_eip == (int)doreti_popl_ds + + setidt_disp) { + frame->tf_eip = (int)doreti_popl_ds_fault + + setidt_disp; return; } - if (frame->tf_eip == (int)doreti_popl_es) { - frame->tf_eip = (int)doreti_popl_es_fault; + if (frame->tf_eip == (int)doreti_popl_es + + setidt_disp) { + frame->tf_eip = (int)doreti_popl_es_fault + + setidt_disp; return; } - if (frame->tf_eip == (int)doreti_popl_fs) { - frame->tf_eip = (int)doreti_popl_fs_fault; + if (frame->tf_eip == (int)doreti_popl_fs + + setidt_disp) { + frame->tf_eip = (int)doreti_popl_fs_fault + + setidt_disp; return; } if (curpcb->pcb_onfault != NULL) { @@ -626,23 +658,6 @@ case T_TRCTRAP: /* trace trap */ kernel_trctrap: - if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) { - /* - * We've just entered system mode via the - * syscall lcall. Continue single stepping - * silently until the syscall handler has - * saved the flags. - */ - return; - } - if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) { - /* - * The syscall handler has now saved the - * flags. Stop single stepping it. - */ - frame->tf_eflags &= ~PSL_T; - return; - } /* * Ignore debug register trace traps due to * accesses in the user's address space, which @@ -710,10 +725,11 @@ ksi.ksi_trapno = type; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %x code %d type %d " - "addr 0x%x esp 0x%08x eip 0x%08x " + "addr 0x%x ss 0x%04x esp 0x%08x cs 0x%04x eip 0x%08x " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type, - addr, frame->tf_esp, frame->tf_eip, + addr, frame->tf_ss, frame->tf_esp, frame->tf_cs, + frame->tf_eip, fubyte((void *)(frame->tf_eip + 0)), fubyte((void *)(frame->tf_eip + 1)), fubyte((void *)(frame->tf_eip + 2)), @@ -790,7 +806,7 @@ } } va = trunc_page(eva); - if (va >= KERNBASE) { + if (va >= PMAP_TRM_MIN_ADDRESS) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid @@ -805,20 +821,17 @@ #endif if (usermode) return (SIGSEGV); - - map = kernel_map; + trap_fatal(frame, eva); + return (-1); } else { - map = &p->p_vmspace->vm_map; + map = usermode ? 
&p->p_vmspace->vm_map : kernel_map; /* - * When accessing a user-space address, kernel must be - * ready to accept the page fault, and provide a - * handling routine. Since accessing the address - * without the handler is a bug, do not try to handle - * it normally, and panic immediately. + * Kernel cannot access a user-space address directly + * because user pages are not mapped. Also, page + * faults must not be caused during the interrupts. */ - if (!usermode && (td->td_intr_nesting_level != 0 || - curpcb->pcb_onfault == NULL)) { + if (!usermode && td->td_intr_nesting_level != 0) { trap_fatal(frame, eva); return (-1); } @@ -881,17 +894,12 @@ int code, ss, esp; u_int type; struct soft_segment_descriptor softseg; - char *msg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); - if (type <= MAX_TRAP_MSG) - msg = trap_msg[type]; - else - msg = "UNKNOWN"; - printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, + printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg(type), frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP @@ -954,8 +962,8 @@ } #endif printf("trap number = %d\n", type); - if (type <= MAX_TRAP_MSG) - panic("%s", trap_msg[type]); + if (trap_msg(type) != NULL) + panic("%s", trap_msg(type)); else panic("unknown/reserved trap"); } @@ -973,16 +981,16 @@ * of this is that "trace " in ddb won't work. */ void -dblfault_handler() +dblfault_handler(void) { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault:\n"); - printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); - printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); - printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); + printf("eip = 0x%x\n", PCPU_GET(common_tssp)->tss_eip); + printf("esp = 0x%x\n", PCPU_GET(common_tssp)->tss_esp); + printf("ebp = 0x%x\n", PCPU_GET(common_tssp)->tss_ebp); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); Index: sys/i386/i386/vm86bios.s =================================================================== --- sys/i386/i386/vm86bios.s +++ sys/i386/i386/vm86bios.s @@ -104,9 +104,8 @@ movl %cr3,%eax pushl %eax /* save address space */ - movl IdlePTD,%ecx + movl IdlePTD,%ecx /* va (and pa) of Idle PTD */ movl %ecx,%ebx - addl $KERNBASE,%ebx /* va of Idle PTD */ movl 0(%ebx),%eax pushl %eax /* old ptde != 0 when booting */ pushl %ebx /* keep for reuse */ Index: sys/i386/i386/vm_machdep.c =================================================================== --- sys/i386/i386/vm_machdep.c +++ sys/i386/i386/vm_machdep.c @@ -217,9 +217,11 @@ * Create a new fresh stack for the new process. * Copy the trap frame for the return to user mode as if from a * syscall. This copies most of the user mode register values. - * The -16 is so we can expand the trapframe if we go to vm86. + * The -VM86_STACK_SPACE (-16) is so we can expand the trapframe + * if we go to vm86. 
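+	 * A vm86 trapframe carries four extra segment register slots
+	 * (%es, %ds, %fs and %gs), 16 bytes in total, which is where
+	 * VM86_STACK_SPACE comes from.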
*/ - td2->td_frame = (struct trapframe *)((caddr_t)td2->td_pcb - 16) - 1; + td2->td_frame = (struct trapframe *)((caddr_t)td2->td_pcb - + VM86_STACK_SPACE) - 1; bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe)); td2->td_frame->tf_eax = 0; /* Child returns zero */ @@ -251,7 +253,7 @@ pcb2->pcb_ebp = 0; pcb2->pcb_esp = (int)td2->td_frame - sizeof(void *); pcb2->pcb_ebx = (int)td2; /* fork_trampoline argument */ - pcb2->pcb_eip = (int)fork_trampoline; + pcb2->pcb_eip = (int)fork_trampoline + setidt_disp; /*- * pcb2->pcb_dr*: cloned above. * pcb2->pcb_savefpu: cloned above. @@ -357,8 +359,7 @@ * XXX do we need to move the TSS off the allocated pages * before freeing them? (not done here) */ - kmem_free(kernel_arena, (vm_offset_t)pcb->pcb_ext, - ctob(IOPAGES + 1)); + pmap_trm_free(pcb->pcb_ext, ctob(IOPAGES + 1)); pcb->pcb_ext = NULL; } } @@ -380,7 +381,8 @@ struct xstate_hdr *xhdr; td->td_pcb = pcb = get_pcb_td(td); - td->td_frame = (struct trapframe *)((caddr_t)pcb - 16) - 1; + td->td_frame = (struct trapframe *)((caddr_t)pcb - + VM86_STACK_SPACE) - 1; pcb->pcb_ext = NULL; pcb->pcb_save = get_pcb_user_save_pcb(pcb); if (use_xsave) { @@ -475,7 +477,7 @@ pcb2->pcb_ebp = 0; pcb2->pcb_esp = (int)td->td_frame - sizeof(void *); /* trampoline arg */ pcb2->pcb_ebx = (int)td; /* trampoline arg */ - pcb2->pcb_eip = (int)fork_trampoline; + pcb2->pcb_eip = (int)fork_trampoline + setidt_disp; pcb2->pcb_gs = rgs(); /* * If we didn't copy the pcb, we'd need to do the following registers: @@ -729,7 +731,7 @@ */ ptep = vtopte(sf->kva); opte = *ptep; - *ptep = VM_PAGE_TO_PHYS(sf->m) | pgeflag | PG_RW | PG_V | + *ptep = VM_PAGE_TO_PHYS(sf->m) | PG_RW | PG_V | pmap_cache_bits(sf->m->md.pat_mode, 0); /* Index: sys/i386/include/asmacros.h =================================================================== --- sys/i386/include/asmacros.h +++ sys/i386/include/asmacros.h @@ -1,3 +1,4 @@ +/* -*- mode: asm -*- */ /*- * SPDX-License-Identifier: BSD-3-Clause * @@ -135,6 +136,10 @@ #endif /* GPROF */ #ifdef LOCORE + +#define GSEL_KPL 0x0020 /* GSEL(GCODE_SEL, SEL_KPL) */ +#define SEL_RPL_MASK 0x0003 + /* * Convenience macro for declaring interrupt entry points. */ @@ -144,16 +149,21 @@ /* * Macros to create and destroy a trap frame. */ -#define PUSH_FRAME \ - pushl $0 ; /* dummy error code */ \ - pushl $0 ; /* dummy trap type */ \ - pushal ; /* 8 ints */ \ - pushl $0 ; /* save data and extra segments ... */ \ - movw %ds,(%esp) ; \ - pushl $0 ; \ - movw %es,(%esp) ; \ - pushl $0 ; \ + .macro PUSH_FRAME2 + pushal + pushl $0 + movw %ds,(%esp) + pushl $0 + movw %es,(%esp) + pushl $0 movw %fs,(%esp) + .endm + + .macro PUSH_FRAME + pushl $0 /* dummy error code */ + pushl $0 /* dummy trap type */ + PUSH_FRAME2 + .endm /* * Access per-CPU data. @@ -167,12 +177,39 @@ /* * Setup the kernel segment registers. 
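+ * Note that %eax is used as scratch and is not preserved.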
*/ -#define SET_KERNEL_SREGS \ - movl $KDSEL, %eax ; /* reload with kernel's data segment */ \ - movl %eax, %ds ; \ - movl %eax, %es ; \ - movl $KPSEL, %eax ; /* reload with per-CPU data segment */ \ + .macro SET_KERNEL_SREGS + movl $KDSEL, %eax /* reload with kernel's data segment */ + movl %eax, %ds + movl %eax, %es + movl $KPSEL, %eax /* reload with per-CPU data segment */ movl %eax, %fs + .endm + + .macro NMOVE_STACKS + movl PCPU(KESP0), %edx + movl $TF_SZ, %ecx + subl %ecx, %edx + movl %edx, %edi + movl %esp, %esi + rep; movsb + movl %edx, %esp + .endm + + .macro MOVE_STACKS + call 1000f +1000: popl %eax + movl (tramp_idleptd - 1000b)(%eax), %eax + movl %eax, %cr3 + NMOVE_STACKS + .endm + + .macro KENTER +/* XXXKIB vm86 */ + testb $SEL_RPL_MASK, TF_CS(%esp) + jz 1f + MOVE_STACKS +1: + .endm #endif /* LOCORE */ Index: sys/i386/include/frame.h =================================================================== --- sys/i386/include/frame.h +++ sys/i386/include/frame.h @@ -41,4 +41,7 @@ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) +#define TRAMP_STACK_SZ 4096 +#define VM86_STACK_SPACE 16 + #endif /* _I386_FRAME_H_ */ Index: sys/i386/include/md_var.h =================================================================== --- sys/i386/include/md_var.h +++ sys/i386/include/md_var.h @@ -45,8 +45,11 @@ #endif #ifdef COMPAT_43 extern int szosigcode; +extern int sz_lcall_tramp; #endif extern uint32_t *vm_page_dump; +extern vm_offset_t proc0kstack; +extern uintptr_t setidt_disp; struct segment_descriptor; union savefpu; @@ -71,6 +74,7 @@ void set_fsbase(struct thread *td, uint32_t base); void set_gsbase(struct thread *td, uint32_t base); void setidt(int idx, alias_for_inthand_t *func, int typ, int dpl, int selec); +void setidt_nodisp(int idx, uintptr_t func, int typ, int dpl, int selec); union savefpu *get_pcb_user_save_td(struct thread *td); union savefpu *get_pcb_user_save_pcb(struct pcb *pcb); Index: sys/i386/include/param.h =================================================================== --- sys/i386/include/param.h +++ sys/i386/include/param.h @@ -164,7 +164,6 @@ #define pgtok(x) ((x) * (PAGE_SIZE / 1024)) -#define INKERNEL(va) (((vm_offset_t)(va)) >= VM_MAXUSER_ADDRESS && \ - ((vm_offset_t)(va)) < VM_MAX_KERNEL_ADDRESS) +#define INKERNEL(va) (TRUE) #endif /* !_I386_INCLUDE_PARAM_H_ */ Index: sys/i386/include/pc/bios.h =================================================================== --- sys/i386/include/pc/bios.h +++ sys/i386/include/pc/bios.h @@ -267,8 +267,8 @@ }; #ifdef _KERNEL -#define BIOS_PADDRTOVADDR(x) ((x) + KERNBASE) -#define BIOS_VADDRTOPADDR(x) ((x) - KERNBASE) +#define BIOS_PADDRTOVADDR(x) ((x) + PMAP_MAP_LOW) +#define BIOS_VADDRTOPADDR(x) ((x) - PMAP_MAP_LOW) struct bios_oem_signature { char * anchor; /* search anchor string in BIOS memory */ Index: sys/i386/include/pcpu.h =================================================================== --- sys/i386/include/pcpu.h +++ sys/i386/include/pcpu.h @@ -42,21 +42,23 @@ #include /* - * The SMP parts are setup in pmap.c and locore.s for the BSP, and - * mp_machdep.c sets up the data for the AP's to "see" when they awake. - * The reason for doing it via a struct is so that an array of pointers - * to each CPU's data can be set up for things like "check curproc on all - * other processors" + * The SMP parts are setup in pmap.c and machdep.c for the BSP, and + * pmap.c and mp_machdep.c sets up the data for the AP's to "see" when + * they awake. 
The reason for doing it via a struct is so that an + * array of pointers to each CPU's data can be set up for things like + * "check curproc on all other processors" */ #define PCPU_MD_FIELDS \ char pc_monitorbuf[128] __aligned(128); /* cache line */ \ struct pcpu *pc_prvspace; /* Self-reference */ \ struct pmap *pc_curpmap; \ - struct i386tss pc_common_tss; \ struct segment_descriptor pc_common_tssd; \ struct segment_descriptor *pc_tss_gdt; \ struct segment_descriptor *pc_fsgs_gdt; \ + struct i386tss *pc_common_tssp; \ + u_int pc_kesp0; \ + u_int pc_trampstk; \ int pc_currentldt; \ u_int pc_acpi_id; /* ACPI CPU id */ \ u_int pc_apic_id; \ @@ -69,8 +71,12 @@ caddr_t pc_cmap_addr1; \ caddr_t pc_cmap_addr2; \ vm_offset_t pc_qmap_addr; /* KVA for temporary mappings */\ + vm_offset_t pc_copyout_maddr; \ + vm_offset_t pc_copyout_saddr; \ + struct mtx pc_copyout_mlock; \ + struct sx pc_copyout_slock; \ uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \ - char __pad[445] + char __pad[550] #ifdef _KERNEL Index: sys/i386/include/pmap.h =================================================================== --- sys/i386/include/pmap.h +++ sys/i386/include/pmap.h @@ -112,12 +112,10 @@ * For PAE, the page table page unit size is 2MB. This means that 512 pages * is 1 Gigabyte. Double everything. It must be a multiple of 8 for PAE. */ -#ifndef KVA_PAGES #if defined(PAE) || defined(PAE_TABLES) -#define KVA_PAGES 512 +#define KVA_PAGES (512*4) #else -#define KVA_PAGES 256 -#endif +#define KVA_PAGES (256*4) #endif /* @@ -150,12 +148,13 @@ /* * The *PTDI values control the layout of virtual memory - * - * XXX This works for now, but I am not real happy with it, I'll fix it - * right after I fix locore.s and the magic 28K hole */ -#define KPTDI (NPDEPTD-NKPDE) /* start of kernel virtual pde's */ -#define PTDPTDI (KPTDI-NPGPTD) /* ptd entry that points to ptd! */ +#define KPTDI 0 /* start of kernel virtual pde's */ +#define LOWPTDI 1 /* low memory map pde */ +#define KERNPTDI 2 /* start of kernel text pde */ +#define PTDPTDI (NPDEPTD - 1 - NPGPTD) /* ptd entry that points + to ptd! */ +#define TRPTDI (NPDEPTD - 1) /* u/k trampoline ptd */ /* * XXX doesn't really belong here I guess... @@ -311,6 +310,7 @@ table */ #endif struct vm_radix pm_root; /* spare page table pages */ + vm_page_t pm_ptdpg[NPGPTD]; }; typedef struct pmap *pmap_t; @@ -396,6 +396,8 @@ void pmap_invalidate_cache_pages(vm_page_t *pages, int count); void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force); +void *pmap_trm_alloc(size_t size, int flags); +void pmap_trm_free(void *addr, size_t size); void invltlb_glob(void); Index: sys/i386/include/segments.h =================================================================== --- sys/i386/include/segments.h +++ sys/i386/include/segments.h @@ -84,11 +84,10 @@ #ifdef _KERNEL extern int _default_ldt; -extern union descriptor gdt[]; -extern union descriptor ldt[NLDT]; +extern union descriptor *gdt; +extern union descriptor *ldt; extern struct soft_segment_descriptor gdt_segs[]; extern struct gate_descriptor *idt; -extern struct region_descriptor r_gdt, r_idt; void lgdt(struct region_descriptor *rdp); void sdtossd(struct segment_descriptor *sdp, Index: sys/i386/include/vmparam.h =================================================================== --- sys/i386/include/vmparam.h +++ sys/i386/include/vmparam.h @@ -136,7 +136,7 @@ * Kernel physical load address. 
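+ * With KERNPTDI == 2 this evaluates to 8M for non-PAE page tables and
+ * to 4M for PAE.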
 */
 #ifndef KERNLOAD
-#define	KERNLOAD		(1 << PDRSHIFT)
+#define	KERNLOAD		(KERNPTDI << PDRSHIFT)
 #endif /* !defined(KERNLOAD) */
 
 /*
@@ -146,23 +146,47 @@
  * messy at times, but hey, we'll do anything to save a page :-)
  */
 
-#define VM_MAX_KERNEL_ADDRESS	VADDR(KPTDI+NKPDE-1, NPTEPG-1)
+#define VM_MAX_KERNEL_ADDRESS	VADDR(PTDPTDI, 0)
 
-#define VM_MIN_KERNEL_ADDRESS	VADDR(PTDPTDI, PTDPTDI)
+#define VM_MIN_KERNEL_ADDRESS	0
 
-#define	KERNBASE		VADDR(KPTDI, 0)
+#define	KERNBASE		KERNLOAD
 
 #define UPT_MAX_ADDRESS		VADDR(PTDPTDI, PTDPTDI)
 #define UPT_MIN_ADDRESS		VADDR(PTDPTDI, 0)
 
-#define VM_MAXUSER_ADDRESS	VADDR(PTDPTDI, 0)
+#define	VM_MAXUSER_ADDRESS	VADDR(TRPTDI, 0)
 
 #define	SHAREDPAGE		(VM_MAXUSER_ADDRESS - PAGE_SIZE)
 #define	USRSTACK		SHAREDPAGE
 
-#define VM_MAX_ADDRESS		VADDR(PTDPTDI, PTDPTDI)
+#define VM_MAX_ADDRESS		VADDR(PTDPTDI, 0)
 #define VM_MIN_ADDRESS		((vm_offset_t)0)
 
+#define	PMAP_TRM_MIN_ADDRESS	VM_MAXUSER_ADDRESS
+#define	PMAP_TRM_MAX_ADDRESS	0xffffffff
+
+#define	PMAP_MAP_LOW		VADDR(LOWPTDI, 0)
+
+/*
+ * KVA layout.  The unit of the system allocation is a single PDE, which
+ * represents NBPDR bytes, aligned to NBPDR.  NBPDR is 4M for non-PAE
+ * page tables, and 2M for PAE.  Addresses below are shown for non-PAE.
+ *
+ * 0x00000000 - 0x001fffff	Transient identity map of low memory (0-4M),
+ *				normally disabled to catch NULL derefs.
+ * 0x00200000 - 0x003fffff	Fixed mapping of the low memory (0-4M).
+ * 0x00400000 - 0xffbfffff	KERNBASE (VA) == KERNLOAD (PA), kernel
+ *				text + data and all kernel maps.  Managed
+ *				by MI VM.
+ * 0xffc00000 - 0xffdfffff	Recursive kernel page table mapping, pointed
+ *				to by PTmap.  PTD[] recursively points
+ *				into PTmap.
+ * 0xffe00000 - 0xffffffff	Kernel/User mode shared PDE, contains GDT,
+ *				IDT, TSS, LDT, trampoline code and stacks.
+ *				Managed by pmap_trm_alloc().
+ */
+
 /*
  * How many physical pages per kmem arena virtual page.
*/ Index: sys/kern/imgact_aout.c =================================================================== --- sys/kern/imgact_aout.c +++ sys/kern/imgact_aout.c @@ -67,7 +67,12 @@ static int exec_aout_imgact(struct image_params *imgp); static int aout_fixup(register_t **stack_base, struct image_params *imgp); +#define AOUT32_USRSTACK 0xbfc00000 + #if defined(__i386__) + +#define AOUT32_PS_STRINGS (AOUT32_USRSTACK - sizeof(struct ps_strings)) + struct sysentvec aout_sysvec = { .sv_size = SYS_MAXSYSCALL, .sv_table = sysent, @@ -85,9 +90,9 @@ .sv_minsigstksz = MINSIGSTKSZ, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, - .sv_maxuser = VM_MAXUSER_ADDRESS, - .sv_usrstack = USRSTACK, - .sv_psstrings = PS_STRINGS, + .sv_maxuser = AOUT32_USRSTACK, + .sv_usrstack = AOUT32_USRSTACK, + .sv_psstrings = AOUT32_PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = exec_copyout_strings, .sv_setregs = exec_setregs, @@ -104,10 +109,9 @@ #elif defined(__amd64__) -#define AOUT32_USRSTACK 0xbfc00000 #define AOUT32_PS_STRINGS \ (AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings)) -#define AOUT32_MINUSER FREEBSD32_MINUSER +#define AOUT32_MINUSER FREEBSD32_MINUSER extern const char *freebsd32_syscallnames[]; extern u_long ia32_maxssiz; Index: sys/kern/kern_thread.c =================================================================== --- sys/kern/kern_thread.c +++ sys/kern/kern_thread.c @@ -81,9 +81,9 @@ "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0x104, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x468, +_Static_assert(offsetof(struct thread, td_frame) == 0x470, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x510, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x518, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb0, "struct proc KBI p_flag"); @@ -101,9 +101,9 @@ "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xa0, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x2e4, +_Static_assert(offsetof(struct thread, td_frame) == 0x2e8, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x330, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x334, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x68, "struct proc KBI p_flag"); Index: sys/kern/subr_trap.c =================================================================== --- sys/kern/subr_trap.c +++ sys/kern/subr_trap.c @@ -178,6 +178,8 @@ ("userret: Returning with stop signals deferred")); KASSERT(td->td_su == NULL, ("userret: Returning with SU cleanup request not handled")); + KASSERT(td->td_vslock_sz == 0, + ("userret: Returning with vslock-wired space")); #ifdef VIMAGE /* Unfortunately td_vnet_lpush needs VNET_DEBUG. 
*/ VNET_ASSERT(curvnet == NULL, Index: sys/kern/vfs_vnops.c =================================================================== --- sys/kern/vfs_vnops.c +++ sys/kern/vfs_vnops.c @@ -956,23 +956,30 @@ vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio, struct thread *td) { + int error, save; + error = 0; + save = vm_fault_disable_pagefaults(); switch (args->kind) { case VN_IO_FAULT_FOP: - return ((args->args.fop_args.doio)(args->args.fop_args.fp, - uio, args->cred, args->flags, td)); + error = (args->args.fop_args.doio)(args->args.fop_args.fp, + uio, args->cred, args->flags, td); + break; case VN_IO_FAULT_VOP: if (uio->uio_rw == UIO_READ) { - return (VOP_READ(args->args.vop_args.vp, uio, - args->flags, args->cred)); + error = VOP_READ(args->args.vop_args.vp, uio, + args->flags, args->cred); } else if (uio->uio_rw == UIO_WRITE) { - return (VOP_WRITE(args->args.vop_args.vp, uio, - args->flags, args->cred)); + error = VOP_WRITE(args->args.vop_args.vp, uio, + args->flags, args->cred); } break; + default: + panic("vn_io_fault_doio: unknown kind of io %d %d", + args->kind, uio->uio_rw); } - panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind, - uio->uio_rw); + vm_fault_enable_pagefaults(save); + return (error); } static int @@ -1047,7 +1054,7 @@ vm_offset_t addr, end; size_t len, resid; ssize_t adv; - int error, cnt, save, saveheld, prev_td_ma_cnt; + int error, cnt, saveheld, prev_td_ma_cnt; if (vn_io_fault_prefault) { error = vn_io_fault_prefault_user(uio); @@ -1073,7 +1080,6 @@ short_uio.uio_rw = uio->uio_rw; short_uio.uio_td = uio->uio_td; - save = vm_fault_disable_pagefaults(); error = vn_io_fault_doio(args, uio, td); if (error != EFAULT) goto out; @@ -1144,7 +1150,6 @@ td->td_ma_cnt = prev_td_ma_cnt; curthread_pflags_restore(saveheld); out: - vm_fault_enable_pagefaults(save); free(uio_clone, M_IOV); return (error); } Index: sys/sys/proc.h =================================================================== --- sys/sys/proc.h +++ sys/sys/proc.h @@ -297,6 +297,7 @@ void *td_su; /* (k) FFS SU private */ sbintime_t td_sleeptimo; /* (t) Sleep timeout. */ int td_rtcgen; /* (s) rtc_generation of abs. sleep */ + size_t td_vslock_sz; /* (k) amount of vslock-ed space */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ Index: sys/vm/vm.h =================================================================== --- sys/vm/vm.h +++ sys/vm/vm.h @@ -80,7 +80,9 @@ #define VM_PROT_WRITE ((vm_prot_t) 0x02) #define VM_PROT_EXECUTE ((vm_prot_t) 0x04) #define VM_PROT_COPY ((vm_prot_t) 0x08) /* copy-on-read */ -#define VM_PROT_FAULT_LOOKUP ((vm_prot_t) 0x010) +#define VM_PROT_PRIV_FLAG ((vm_prot_t) 0x010) +#define VM_PROT_FAULT_LOOKUP VM_PROT_PRIV_FLAG +#define VM_PROT_QUICK_NOFAULT VM_PROT_PRIV_FLAG /* same to save bits */ #define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE) #define VM_PROT_RW (VM_PROT_READ|VM_PROT_WRITE) Index: sys/vm/vm_fault.c =================================================================== --- sys/vm/vm_fault.c +++ sys/vm/vm_fault.c @@ -1526,6 +1526,9 @@ * mapping had insufficient permissions. Attempt to fault in * and hold these pages. 
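+	 * However, when the caller asked for a quick attempt only
+	 * (VM_PROT_QUICK_NOFAULT) and page faults are currently
+	 * disallowed for the thread, fail instead of faulting.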
*/ + if ((prot & VM_PROT_QUICK_NOFAULT) != 0 && + (curthread->td_pflags & TDP_NOFAULTING) != 0) + goto error; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) if (*mp == NULL && vm_fault_hold(map, va, prot, VM_FAULT_NORMAL, mp) != KERN_SUCCESS) Index: sys/vm/vm_glue.c =================================================================== --- sys/vm/vm_glue.c +++ sys/vm/vm_glue.c @@ -196,11 +196,16 @@ #endif error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); + if (error == KERN_SUCCESS) { + curthread->td_vslock_sz += len; + return (0); + } + /* * Return EFAULT on error to match copy{in,out}() behaviour * rather than returning ENOMEM like mlock() would. */ - return (error == KERN_SUCCESS ? 0 : EFAULT); + return (EFAULT); } void @@ -208,6 +213,8 @@ { /* Rely on the parameter sanity checks performed by vslock(). */ + MPASS(curthread->td_vslock_sz >= len); + curthread->td_vslock_sz -= len; (void)vm_map_unwire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); Index: sys/x86/acpica/acpi_wakeup.c =================================================================== --- sys/x86/acpica/acpi_wakeup.c +++ sys/x86/acpica/acpi_wakeup.c @@ -186,7 +186,7 @@ * cpususpend_handler() and we will release them soon. Then each * will invalidate its TLB. */ - kernel_pmap->pm_pdir[0] = 0; + PTD[KPTDI] = 0; invltlb_glob(); #endif @@ -256,7 +256,7 @@ * be careful to use the kernel map (PTD[0] is for curthread * which may be a user thread in deprecated APIs). */ - kernel_pmap->pm_pdir[0] = PTD[KPTDI]; + PTD[KPTDI] = PTD[LOWPTDI]; #endif /* Call ACPICA to enter the desired sleep state */ Index: sys/x86/x86/local_apic.c =================================================================== --- sys/x86/x86/local_apic.c +++ sys/x86/x86/local_apic.c @@ -78,11 +78,9 @@ #ifdef __amd64__ #define SDT_APIC SDT_SYSIGT -#define SDT_APICT SDT_SYSIGT #define GSEL_APIC 0 #else #define SDT_APIC SDT_SYS386IGT -#define SDT_APICT SDT_SYS386TGT #define GSEL_APIC GSEL(GCODE_SEL, SEL_KPL) #endif @@ -517,7 +515,7 @@ /* Local APIC CMCI. */ setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint), - SDT_APICT, SEL_KPL, GSEL_APIC); + SDT_APIC, SEL_KPL, GSEL_APIC); if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) { arat = 0; @@ -1605,7 +1603,7 @@ * We can not currently clear the idt entry because other cpus * may have a valid vector at this offset. */ - setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT, + setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APIC, SEL_KPL, GSEL_APIC); #endif } @@ -2146,7 +2144,7 @@ KASSERT(func != (uintptr_t)&IDTVEC(rsvd) && func != (uintptr_t)&IDTVEC(rsvd_pti), ("invalid idtfunc %#lx", func)); - setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT, + setidt(vector, pti ? 
&IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APIC, SEL_KPL, GSEL_APIC); mtx_unlock_spin(&icu_lock); } Index: sys/x86/x86/mp_x86.c =================================================================== --- sys/x86/x86/mp_x86.c +++ sys/x86/x86/mp_x86.c @@ -1632,8 +1632,10 @@ generation = smp_tlb_generation; if (smp_tlb_pmap == kernel_pmap) invltlb_glob(); +#ifdef __amd64__ else invltlb(); +#endif PCPU_SET(smp_tlb_done, generation); } @@ -1650,7 +1652,10 @@ #endif /* COUNT_IPIS */ generation = smp_tlb_generation; /* Overlap with serialization */ - invlpg(smp_tlb_addr1); +#ifdef __i386__ + if (smp_tlb_pmap == kernel_pmap) +#endif + invlpg(smp_tlb_addr1); PCPU_SET(smp_tlb_done, generation); } @@ -1670,10 +1675,13 @@ addr = smp_tlb_addr1; addr2 = smp_tlb_addr2; generation = smp_tlb_generation; /* Overlap with serialization */ - do { - invlpg(addr); - addr += PAGE_SIZE; - } while (addr < addr2); +#ifdef __i386__ + if (smp_tlb_pmap == kernel_pmap) +#endif + do { + invlpg(addr); + addr += PAGE_SIZE; + } while (addr < addr2); PCPU_SET(smp_tlb_done, generation); }
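With the kernel now running on page tables that do not map user space, the
copyin/copyout and fu*/su* primitives removed from support.s above can no
longer reach user memory directly; the per-CPU pc_copyout_maddr/
pc_copyout_saddr windows and their locks added to pcpu.h above appear to be
the mapping windows used by the replacement C implementation.  What follows
is only a minimal sketch of that window technique, assuming the pcpu fields
are initialized during pmap bootstrap; the helper name trm_copyin_slow() is
invented here for illustration and is not the actual replacement code, which
also has to cover the write direction, the sleepable sx-protected window,
fault-disabled contexts, and the word/byte accessors.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <machine/cpufunc.h>

/*
 * Illustration only: copy a user buffer into the kernel by aliasing the
 * user pages through this CPU's copyout window.
 */
static int
trm_copyin_slow(const void *uaddr, void *kaddr, size_t len)
{
	vm_page_t m;
	vm_offset_t kva, uva;
	size_t plen;
	int error;

	error = 0;
	uva = (vm_offset_t)uaddr;
	while (len > 0) {
		/* Copy at most up to the end of the current user page. */
		plen = PAGE_SIZE - (uva & PAGE_MASK);
		if (plen > len)
			plen = len;

		/* Wire the user page; this may fault and sleep. */
		if (vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
		    uva, plen, VM_PROT_READ, &m, 1) < 0) {
			error = EFAULT;
			break;
		}

		/*
		 * Stay on this CPU and take its window lock, then point
		 * the window PTE at the held page and copy through it.
		 */
		sched_pin();
		mtx_lock(PCPU_PTR(copyout_mlock));
		kva = PCPU_GET(copyout_maddr);
		pmap_kenter(kva, VM_PAGE_TO_PHYS(m));
		invlpg(kva);
		bcopy((char *)kva + (uva & PAGE_MASK), kaddr, plen);
		mtx_unlock(PCPU_PTR(copyout_mlock));
		sched_unpin();

		vm_page_unhold_pages(&m, 1);
		uva += plen;
		kaddr = (char *)kaddr + plen;
		len -= plen;
	}
	return (error);
}

Pinning the thread before taking the window lock keeps it on the CPU whose
PTE it just rewrote, and the explicit invlpg is needed because pmap_kenter()
does not flush the stale translation on i386.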