Index: projects/amd64_xen_pv/sys/amd64/include/cpufunc.h =================================================================== --- projects/amd64_xen_pv/sys/amd64/include/cpufunc.h (revision 260906) +++ projects/amd64_xen_pv/sys/amd64/include/cpufunc.h (revision 260907) @@ -1,881 +1,883 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Functions to provide access to special i386 instructions. * This is included in sys/systm.h, and that file should be * used in preference to this.
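 *
 * A minimal usage sketch (editor's illustration, not part of the
 * original header): consumers pull these wrappers in via sys/systm.h
 * and need not care whether they run native or under XEN:
 *
 *	register_t s;
 *	s = intr_disable();	// 'cli' natively, xen_cli() under XEN
 *	// ... short critical section ...
 *	intr_restore(s);	// restore the saved rflags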
*/ #ifndef _MACHINE_CPUFUNC_H_ #define _MACHINE_CPUFUNC_H_ #ifndef _SYS_CDEFS_H_ #error this file needs sys/cdefs.h as a prerequisite #endif #ifdef XEN struct pcb; /* Forward declaration */ extern void xen_cli(void); extern void xen_sti(void); extern u_long xen_rcr2(void); extern void xen_load_cr3(u_long data); extern void xen_tlb_flush(void); extern void xen_invlpg(vm_offset_t addr); +extern void xen_load_kgsbase(uint64_t kgsbase); +extern void xen_load_tls(struct pcb *pcb); extern void xen_set_proc(struct pcb *newpcb); extern void write_rflags(u_long rflags); extern u_long read_rflags(void); #endif /* XEN */ struct region_descriptor; #define readb(va) (*(volatile uint8_t *) (va)) #define readw(va) (*(volatile uint16_t *) (va)) #define readl(va) (*(volatile uint32_t *) (va)) #define readq(va) (*(volatile uint64_t *) (va)) #define writeb(va, d) (*(volatile uint8_t *) (va) = (d)) #define writew(va, d) (*(volatile uint16_t *) (va) = (d)) #define writel(va, d) (*(volatile uint32_t *) (va) = (d)) #define writeq(va, d) (*(volatile uint64_t *) (va) = (d)) #if defined(__GNUCLIKE_ASM) && defined(__CC_SUPPORTS___INLINE) static __inline void breakpoint(void) { __asm __volatile("int $3"); } static __inline u_int bsfl(u_int mask) { u_int result; __asm __volatile("bsfl %1,%0" : "=r" (result) : "rm" (mask)); return (result); } static __inline u_long bsfq(u_long mask) { u_long result; __asm __volatile("bsfq %1,%0" : "=r" (result) : "rm" (mask)); return (result); } static __inline u_int bsrl(u_int mask) { u_int result; __asm __volatile("bsrl %1,%0" : "=r" (result) : "rm" (mask)); return (result); } static __inline u_long bsrq(u_long mask) { u_long result; __asm __volatile("bsrq %1,%0" : "=r" (result) : "rm" (mask)); return (result); } static __inline void clflush(u_long addr) { __asm __volatile("clflush %0" : : "m" (*(char *)addr)); } static __inline void clts(void) { __asm __volatile("clts"); } static __inline void disable_intr(void) { #ifdef XEN xen_cli(); #else __asm __volatile("cli" : : : "memory"); #endif } static __inline void do_cpuid(u_int ax, u_int *p) { __asm __volatile("cpuid" : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) : "0" (ax)); } static __inline void cpuid_count(u_int ax, u_int cx, u_int *p) { __asm __volatile("cpuid" : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) : "0" (ax), "c" (cx)); } static __inline void enable_intr(void) { #ifdef XEN xen_sti(); #else __asm __volatile("sti"); #endif } #ifdef _KERNEL #define HAVE_INLINE_FFS #define ffs(x) __builtin_ffs(x) #define HAVE_INLINE_FFSL static __inline int ffsl(long mask) { return (mask == 0 ? mask : (int)bsfq((u_long)mask) + 1); } #define HAVE_INLINE_FLS static __inline int fls(int mask) { return (mask == 0 ? mask : (int)bsrl((u_int)mask) + 1); } #define HAVE_INLINE_FLSL static __inline int flsl(long mask) { return (mask == 0 ? 
mask : (int)bsrq((u_long)mask) + 1); } #endif /* _KERNEL */ static __inline void halt(void) { __asm __volatile("hlt"); } static __inline u_char inb(u_int port) { u_char data; __asm __volatile("inb %w1, %0" : "=a" (data) : "Nd" (port)); return (data); } static __inline u_int inl(u_int port) { u_int data; __asm __volatile("inl %w1, %0" : "=a" (data) : "Nd" (port)); return (data); } static __inline void insb(u_int port, void *addr, size_t count) { __asm __volatile("cld; rep; insb" : "+D" (addr), "+c" (count) : "d" (port) : "memory"); } static __inline void insw(u_int port, void *addr, size_t count) { __asm __volatile("cld; rep; insw" : "+D" (addr), "+c" (count) : "d" (port) : "memory"); } static __inline void insl(u_int port, void *addr, size_t count) { __asm __volatile("cld; rep; insl" : "+D" (addr), "+c" (count) : "d" (port) : "memory"); } static __inline void invd(void) { __asm __volatile("invd"); } static __inline u_short inw(u_int port) { u_short data; __asm __volatile("inw %w1, %0" : "=a" (data) : "Nd" (port)); return (data); } static __inline void outb(u_int port, u_char data) { __asm __volatile("outb %0, %w1" : : "a" (data), "Nd" (port)); } static __inline void outl(u_int port, u_int data) { __asm __volatile("outl %0, %w1" : : "a" (data), "Nd" (port)); } static __inline void outsb(u_int port, const void *addr, size_t count) { __asm __volatile("cld; rep; outsb" : "+S" (addr), "+c" (count) : "d" (port)); } static __inline void outsw(u_int port, const void *addr, size_t count) { __asm __volatile("cld; rep; outsw" : "+S" (addr), "+c" (count) : "d" (port)); } static __inline void outsl(u_int port, const void *addr, size_t count) { __asm __volatile("cld; rep; outsl" : "+S" (addr), "+c" (count) : "d" (port)); } static __inline void outw(u_int port, u_short data) { __asm __volatile("outw %0, %w1" : : "a" (data), "Nd" (port)); } static __inline u_long popcntq(u_long mask) { u_long result; __asm __volatile("popcntq %1,%0" : "=r" (result) : "rm" (mask)); return (result); } static __inline void lfence(void) { __asm __volatile("lfence" : : : "memory"); } static __inline void mfence(void) { __asm __volatile("mfence" : : : "memory"); } static __inline void ia32_pause(void) { __asm __volatile("pause"); } static __inline u_long #ifdef XEN _read_rflags(void) #else read_rflags(void) #endif { u_long rf; __asm __volatile("pushfq; popq %0" : "=r" (rf)); return (rf); } static __inline uint64_t rdmsr(u_int msr) { uint32_t low, high; __asm __volatile("rdmsr" : "=a" (low), "=d" (high) : "c" (msr)); return (low | ((uint64_t)high << 32)); } static __inline uint64_t rdpmc(u_int pmc) { uint32_t low, high; __asm __volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (pmc)); return (low | ((uint64_t)high << 32)); } static __inline uint64_t rdtsc(void) { uint32_t low, high; __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); return (low | ((uint64_t)high << 32)); } static __inline uint32_t rdtsc32(void) { uint32_t rv; __asm __volatile("rdtsc" : "=a" (rv) : : "edx"); return (rv); } static __inline void wbinvd(void) { __asm __volatile("wbinvd"); } static __inline void #ifdef XEN _write_rflags(u_long rf) #else write_rflags(u_long rf) #endif { __asm __volatile("pushq %0; popfq" : : "r" (rf)); } static __inline void wrmsr(u_int msr, uint64_t newval) { uint32_t low, high; low = newval; high = newval >> 32; __asm __volatile("wrmsr" : : "a" (low), "d" (high), "c" (msr)); } static __inline void load_cr0(u_long data) { __asm __volatile("movq %0,%%cr0" : : "r" (data)); } static __inline u_long rcr0(void) { u_long data; __asm 
__volatile("movq %%cr0,%0" : "=r" (data)); return (data); } static __inline u_long rcr2(void) { u_long data; #ifdef XEN return (xen_rcr2()); #endif __asm __volatile("movq %%cr2,%0" : "=r" (data)); return (data); } static __inline void load_cr3(u_long data) { #ifdef XEN xen_load_cr3(data); #else __asm __volatile("movq %0,%%cr3" : : "r" (data) : "memory"); #endif } static __inline u_long rcr3(void) { u_long data; __asm __volatile("movq %%cr3,%0" : "=r" (data)); return (data); } static __inline void load_cr4(u_long data) { __asm __volatile("movq %0,%%cr4" : : "r" (data)); } static __inline u_long rcr4(void) { u_long data; __asm __volatile("movq %%cr4,%0" : "=r" (data)); return (data); } static __inline u_long rxcr(u_int reg) { u_int low, high; __asm __volatile("xgetbv" : "=a" (low), "=d" (high) : "c" (reg)); return (low | ((uint64_t)high << 32)); } static __inline void load_xcr(u_int reg, u_long val) { u_int low, high; low = val; high = val >> 32; __asm __volatile("xsetbv" : : "c" (reg), "a" (low), "d" (high)); } /* * Global TLB flush (except for thise for pages marked PG_G) */ static __inline void invltlb(void) { #ifdef XEN xen_tlb_flush(); #else load_cr3(rcr3()); #endif } #ifndef CR4_PGE #define CR4_PGE 0x00000080 /* Page global enable */ #endif /* * Perform the guaranteed invalidation of all TLB entries. This * includes the global entries, and entries in all PCIDs, not only the * current context. The function works both on non-PCID CPUs and CPUs * with the PCID turned off or on. See IA-32 SDM Vol. 3a 4.10.4.1 * Operations that Invalidate TLBs and Paging-Structure Caches. */ static __inline void invltlb_globpcid(void) { uint64_t cr4; cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* * Although preemption at this point could be detrimental to * performance, it would not lead to an error. PG_G is simply * ignored if CR4.PGE is clear. Moreover, in case this block * is re-entered, the load_cr4() either above or below will * modify CR4.PGE flushing the TLB. */ load_cr4(cr4 | CR4_PGE); } /* * TLB flush for an individual page (even if it has PG_G). * Only works on 486+ CPUs (i386 does not have PG_G). 
*/ static __inline void invlpg(u_long addr) { #ifdef XEN xen_invlpg(addr); #else __asm __volatile("invlpg %0" : : "m" (*(char *)addr) : "memory"); #endif } #define INVPCID_ADDR 0 #define INVPCID_CTX 1 #define INVPCID_CTXGLOB 2 #define INVPCID_ALLCTX 3 struct invpcid_descr { uint64_t pcid:12 __packed; uint64_t pad:52 __packed; uint64_t addr; } __packed; static __inline void invpcid(struct invpcid_descr *d, int type) { /* invpcid (%rdx),%rax */ __asm __volatile(".byte 0x66,0x0f,0x38,0x82,0x02" : : "d" (d), "a" ((u_long)type) : "memory"); } static __inline u_short rfs(void) { u_short sel; __asm __volatile("movw %%fs,%0" : "=rm" (sel)); return (sel); } static __inline u_short rgs(void) { u_short sel; __asm __volatile("movw %%gs,%0" : "=rm" (sel)); return (sel); } static __inline u_short rss(void) { u_short sel; __asm __volatile("movw %%ss,%0" : "=rm" (sel)); return (sel); } static __inline void load_ds(u_short sel) { __asm __volatile("movw %0,%%ds" : : "rm" (sel)); } static __inline void load_es(u_short sel) { __asm __volatile("movw %0,%%es" : : "rm" (sel)); } static __inline void cpu_monitor(const void *addr, u_long extensions, u_int hints) { __asm __volatile("monitor" : : "a" (addr), "c" (extensions), "d" (hints)); } static __inline void cpu_mwait(u_long extensions, u_int hints) { __asm __volatile("mwait" : : "a" (hints), "c" (extensions)); } #ifdef _KERNEL /* This is defined in but is too painful to get to */ #ifndef MSR_FSBASE #define MSR_FSBASE 0xc0000100 #endif static __inline void load_fs(u_short sel) { /* Preserve the fsbase value across the selector load */ __asm __volatile("rdmsr; movw %0,%%fs; wrmsr" : : "rm" (sel), "c" (MSR_FSBASE) : "eax", "edx"); } #ifndef MSR_GSBASE #define MSR_GSBASE 0xc0000101 #endif static __inline void load_gs(u_short sel) { /* * Preserve the gsbase value across the selector load. * Note that we have to disable interrupts because the gsbase * being trashed happens to be the kernel gsbase at the time. 
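 *
 * In effect (a C sketch of the asm sequence below, using wrappers
 * defined in this file):
 *
 *	flags = read_rflags(); disable_intr();
 *	base = rdmsr(MSR_GSBASE);	// save: the selector load zaps it
 *	// movw sel, %gs		   ... the actual selector load
 *	wrmsr(MSR_GSBASE, base);	// put the kernel gsbase back
 *	write_rflags(flags);		// popfq restores the previous IF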
*/ __asm __volatile("pushfq; cli; rdmsr; movw %0,%%gs; wrmsr; popfq" : : "rm" (sel), "c" (MSR_GSBASE) : "eax", "edx"); } #else /* Usable by userland */ static __inline void load_fs(u_short sel) { __asm __volatile("movw %0,%%fs" : : "rm" (sel)); } static __inline void load_gs(u_short sel) { __asm __volatile("movw %0,%%gs" : : "rm" (sel)); } #endif static __inline void lidt(struct region_descriptor *addr) { __asm __volatile("lidt (%0)" : : "r" (addr)); } static __inline void lldt(u_short sel) { __asm __volatile("lldt %0" : : "r" (sel)); } static __inline void ltr(u_short sel) { __asm __volatile("ltr %0" : : "r" (sel)); } static __inline uint64_t rdr0(void) { uint64_t data; __asm __volatile("movq %%dr0,%0" : "=r" (data)); return (data); } static __inline void load_dr0(uint64_t dr0) { __asm __volatile("movq %0,%%dr0" : : "r" (dr0)); } static __inline uint64_t rdr1(void) { uint64_t data; __asm __volatile("movq %%dr1,%0" : "=r" (data)); return (data); } static __inline void load_dr1(uint64_t dr1) { __asm __volatile("movq %0,%%dr1" : : "r" (dr1)); } static __inline uint64_t rdr2(void) { uint64_t data; __asm __volatile("movq %%dr2,%0" : "=r" (data)); return (data); } static __inline void load_dr2(uint64_t dr2) { __asm __volatile("movq %0,%%dr2" : : "r" (dr2)); } static __inline uint64_t rdr3(void) { uint64_t data; __asm __volatile("movq %%dr3,%0" : "=r" (data)); return (data); } static __inline void load_dr3(uint64_t dr3) { __asm __volatile("movq %0,%%dr3" : : "r" (dr3)); } static __inline uint64_t rdr4(void) { uint64_t data; __asm __volatile("movq %%dr4,%0" : "=r" (data)); return (data); } static __inline void load_dr4(uint64_t dr4) { __asm __volatile("movq %0,%%dr4" : : "r" (dr4)); } static __inline uint64_t rdr5(void) { uint64_t data; __asm __volatile("movq %%dr5,%0" : "=r" (data)); return (data); } static __inline void load_dr5(uint64_t dr5) { __asm __volatile("movq %0,%%dr5" : : "r" (dr5)); } static __inline uint64_t rdr6(void) { uint64_t data; __asm __volatile("movq %%dr6,%0" : "=r" (data)); return (data); } static __inline void load_dr6(uint64_t dr6) { __asm __volatile("movq %0,%%dr6" : : "r" (dr6)); } static __inline uint64_t rdr7(void) { uint64_t data; __asm __volatile("movq %%dr7,%0" : "=r" (data)); return (data); } static __inline void load_dr7(uint64_t dr7) { __asm __volatile("movq %0,%%dr7" : : "r" (dr7)); } static __inline register_t intr_disable(void) { register_t rflags; rflags = read_rflags(); disable_intr(); return (rflags); } static __inline void intr_restore(register_t rflags) { write_rflags(rflags); } #else /* !(__GNUCLIKE_ASM && __CC_SUPPORTS___INLINE) */ int breakpoint(void); u_int bsfl(u_int mask); u_int bsrl(u_int mask); void clflush(u_long addr); void clts(void); void cpuid_count(u_int ax, u_int cx, u_int *p); void disable_intr(void); void do_cpuid(u_int ax, u_int *p); void enable_intr(void); void halt(void); void ia32_pause(void); u_char inb(u_int port); u_int inl(u_int port); void insb(u_int port, void *addr, size_t count); void insl(u_int port, void *addr, size_t count); void insw(u_int port, void *addr, size_t count); register_t intr_disable(void); void intr_restore(register_t rf); void invd(void); void invlpg(u_int addr); void invltlb(void); u_short inw(u_int port); void lidt(struct region_descriptor *addr); void lldt(u_short sel); void load_cr0(u_long cr0); void load_cr3(u_long cr3); void load_cr4(u_long cr4); void load_dr0(uint64_t dr0); void load_dr1(uint64_t dr1); void load_dr2(uint64_t dr2); void load_dr3(uint64_t dr3); void load_dr4(uint64_t dr4); void 
load_dr5(uint64_t dr5); void load_dr6(uint64_t dr6); void load_dr7(uint64_t dr7); void load_fs(u_short sel); void load_gs(u_short sel); void ltr(u_short sel); void outb(u_int port, u_char data); void outl(u_int port, u_int data); void outsb(u_int port, const void *addr, size_t count); void outsl(u_int port, const void *addr, size_t count); void outsw(u_int port, const void *addr, size_t count); void outw(u_int port, u_short data); u_long rcr0(void); u_long rcr2(void); u_long rcr3(void); u_long rcr4(void); uint64_t rdmsr(u_int msr); uint64_t rdpmc(u_int pmc); uint64_t rdr0(void); uint64_t rdr1(void); uint64_t rdr2(void); uint64_t rdr3(void); uint64_t rdr4(void); uint64_t rdr5(void); uint64_t rdr6(void); uint64_t rdr7(void); uint64_t rdtsc(void); u_long read_rflags(void); u_int rfs(void); u_int rgs(void); void wbinvd(void); void write_rflags(u_int rf); void wrmsr(u_int msr, uint64_t newval); #endif /* __GNUCLIKE_ASM && __CC_SUPPORTS___INLINE */ void reset_dbregs(void); #ifdef _KERNEL int rdmsr_safe(u_int msr, uint64_t *val); int wrmsr_safe(u_int msr, uint64_t newval); #endif #endif /* !_MACHINE_CPUFUNC_H_ */ Index: projects/amd64_xen_pv/sys/amd64/xen/exception.S =================================================================== --- projects/amd64_xen_pv/sys/amd64/xen/exception.S (revision 260906) +++ projects/amd64_xen_pv/sys/amd64/xen/exception.S (revision 260907) @@ -1,575 +1,635 @@ /*- * Copyright (c) 2011-2012 Spectra Logic Corporation * All rights reserved. * * This software was developed by Cherry G. Mathew * under sponsorship from Spectra Logic Corporation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * $FreeBSD$ */ #include #include #include #include #include "assym.s" #define T_EVENT T_RESERVED /* XXX: */ #define VGCF_IN_SYSCALL 256 /* See: xen/interface/arch-x86/xen-x86_64.h */ /* * We're guaranteed that sizeof(struct vcpu_info) == 64 bytes. 
* log2(64) == 6; * See: interface/xen.h */ #define EVENTS_MASK(reg) \ movq PCPU(CPUID), reg ; \ shlq $6, reg /* cpuid * sizeof(struct vcpu_info) */ ; \ addq HYPERVISOR_shared_info, reg /* shared_info.vcpu_info[cpuid] */ ; \ movq $1, EVTCHN_UPCALL_MASK(reg) #define EVENTS_UNMASK(reg) \ movq PCPU(CPUID), reg ; \ shlq $6, reg /* cpuid * sizeof(struct vcpu_info) */ ; \ addq HYPERVISOR_shared_info, reg /* shared_info.vcpu_info[cpuid] */ ; \ movq $0, EVTCHN_UPCALL_MASK(reg) /* Save all general registers to the stack */ #define SAVE_GENERAL_REGS \ movq %rdi, TF_RDI(%rsp) ; \ movq %rsi, TF_RSI(%rsp) ; \ movq %rdx, TF_RDX(%rsp) ; \ movq %rcx, TF_RCX(%rsp) ; \ movq %r8, TF_R8(%rsp) ; \ movq %r9, TF_R9(%rsp) ; \ movq %rax, TF_RAX(%rsp) ; \ movq %rbx, TF_RBX(%rsp) ; \ movq %rbp, TF_RBP(%rsp) ; \ movq %r10, TF_R10(%rsp) ; \ movq %r11, TF_R11(%rsp) ; \ movq %r12, TF_R12(%rsp) ; \ movq %r13, TF_R13(%rsp) ; \ movq %r14, TF_R14(%rsp) ; \ movq %r15, TF_R15(%rsp) /* Restore all general registers from the stack */ #define RESTORE_GENERAL_REGS \ movq TF_RDI(%rsp), %rdi ; \ movq TF_RSI(%rsp), %rsi ; \ movq TF_RDX(%rsp), %rdx ; \ movq TF_RCX(%rsp), %rcx ; \ movq TF_R8(%rsp), %r8 ; \ movq TF_R9(%rsp), %r9 ; \ movq TF_RAX(%rsp), %rax ; \ movq TF_RBX(%rsp), %rbx ; \ movq TF_RBP(%rsp), %rbp ; \ movq TF_R10(%rsp), %r10 ; \ movq TF_R11(%rsp), %r11 ; \ movq TF_R12(%rsp), %r12 ; \ movq TF_R13(%rsp), %r13 ; \ movq TF_R14(%rsp), %r14 ; \ movq TF_R15(%rsp), %r15 /* Note: %fs/%gs are saved/restored by the hypervisor */ /* Save generic data segment registers to the stack */ #define SAVE_SEGMENT_REGS \ movw %es, TF_ES(%rsp) ; \ movw %ds, TF_DS(%rsp) ; \ - movw %fs, TF_FS(%rsp) + movw %fs, TF_FS(%rsp) ; \ + movw %gs, TF_GS(%rsp) /* Restore generic data segment registers from the stack */ +/* This macro overwrites some general registers */ #define RESTORE_SEGMENT_REGS \ call restore_segment_regs /* stackframe management for trap() * * Xen creates a "bounce frame" in the following format: * { RCX, R11, [DS-GS,] [ERRCODE,] RIP, CS, RFLAGS, RSP, SS } * * Erratum: Comments in the Xen sources talk about [CR2] saved on the * stackframe, but the code for this is not to be found. * c.f: xen/arch/x86/x86_64/entry.S * * [DS-GS,] is only saved for the failsafe callback. * * [ERRCODE], is optional, depending on the type of (hardware) exception. * See: the "AMD64 Architecture Programmer's Manual, Volume 2: * System Programming: Section 8.2 for individual error code * reporting status * */ /* * Prepare the frame for a non-failsafe entry point. * We frob the stack so it looks like the native entry point. * See: "hardware defined" part of x86/frame.h struct trapframe; */ #define TRAP_FRAME_PREPARE \ movq (%rsp), %rcx ; \ movq 8(%rsp), %r11 ; \ addq $16, %rsp /* * Set up the trapframe for exceptions for which the CPU does not * push an error code on the stack. */ #define TRAP_FRAME_ENTER_NOERR \ TRAP_FRAME_PREPARE ; \ subq $TF_RIP, %rsp ; \ movq $0, TF_ERR(%rsp) /* * Set up the trapframe for exceptions for which the CPU pushes an * error code on the stack. */ #define TRAP_FRAME_ENTER_ERR \ TRAP_FRAME_PREPARE ; \ subq $TF_ERR, %rsp /* * Setting up the exit stackframe involves resetting the stack layout * identically to that of an exception without error code. The reason * for this is that in order to "iret", we make a hypervisor call, and * this hypervisor call is a syscall which expects an 'error code' on * the stack. We accomplish this by pushing quadword '0' onto the * stack in the INTR_EXIT() stub.
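 *
 * (Sketch) The hypercall page holds one 32-byte stub per hypercall
 * number, so the INTR_EXIT() jump target works out to:
 *
 *	hypercall_page + __HYPERVISOR_iret * 32
 *
 * with the quadword 0 pushed just beforehand filling the slot the
 * hypercall expects on the stack.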
*/ #define TRAP_FRAME_EXIT_NOERR \ addq $TF_RIP, %rsp #define TRAP_FRAME_EXIT_ERR \ addq $TF_RIP, %rsp #define TRAP_PROLOGUE(a) \ movl $(a), TF_TRAPNO(%rsp) ; \ movq $0, TF_ADDR(%rsp) /* Fetch page fault address (%cr2) from hypervisor */ #define SETUP_TF_ADDR /* Clobbers %rsi %rdi */ \ movq PCPU(CPUID), %rsi ; \ shlq $6, %rsi /* cpuid * sizeof(struct vcpu_info) */ ; \ addq HYPERVISOR_shared_info, %rsi /* shared_info.vcpu_info[cpuid] */ ; \ movq VCPU_RCR2(%rsi), %rdi ; \ movq %rdi, TF_ADDR(%rsp) #define CALLTRAP \ cld ; \ movq %rsp, %rdi ; \ call trap #define EVENT_UPCALL \ cld ; \ movq %rsp, %rdi ; \ call xen_intr_handle_upcall #define DO_AST_MAYBE \ testb $SEL_RPL_MASK, TF_CS(%rsp) /* are we returning to user mode? */ ; \ jz 2f /* can't handle ASTs now if not */ ; \ 1: \ /* XXX: cli */ \ movq PCPU(CURTHREAD), %rax ; \ testl $TDF_ASTPENDING | TDF_NEEDRESCHED, TD_FLAGS(%rax) ; \ je 2f ; \ /* XXX: sti */ \ movq %rsp, %rdi ; \ call ast ; \ jmp 1b ; \ 2: #define DO_STI_MAYBE \ testl $PSL_I, TF_RFLAGS(%rsp) ; \ jz 1f ; \ EVENTS_UNMASK(%rdi) ; \ 1: #define INTR_EXIT \ pushq $0 ; /* struct iret_context.flags */ \ jmp hypercall_page + (__HYPERVISOR_iret * 32) #define CALLSYSCALL \ cld ;\ movq PCPU(CURTHREAD),%rdi ;\ movq %rsp, TD_FRAME(%rdi) ;\ movl TF_RFLAGS(%rsp),%esi ;\ andl $PSL_T,%esi ;\ call amd64_syscall #define SYSRET \ /* XXX: watch out for: * http://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2006-0744 * Explained here: * http://blog.xen.org/index.php/2012/06/13/the-intel-sysret-privilege-escalation/ * Also see comments in trap.c */ pushq $VGCF_IN_SYSCALL ;\ jmp hypercall_page + (__HYPERVISOR_iret * 32) NON_GPROF_ENTRY(restore_segment_regs) + .globl doreti_iret .globl ld_es .globl ld_ds .globl ld_fs .globl ld_gs +doreti_iret: + /* Note: The trapframe is on the *caller* stackframe */ + movq PCPU(CURPCB),%r8 + + /* + * Do not reload segment registers for kernel. + * Since we do not reload segment registers with sane + * values on kernel entry, descriptors referenced by + * segment registers might not be valid. This is fatal + * for user mode, but is not a problem for the kernel. + */ + testb $SEL_RPL_MASK,(TF_CS + 8)(%rsp) + jz segs_done + testl $PCB_FULL_IRET,PCB_FLAGS(%r8) + jz segs_done + testl $TF_HASSEGS,(TF_FLAGS + 8)(%rsp) + jne 1f + + /* reload with sane values */ + movw $KUDSEL,%ax + movw %ax,(TF_DS + 8)(%rsp) + movw %ax,(TF_ES + 8)(%rsp) + movw $KUF32SEL,(TF_FS + 8)(%rsp) + movw $KUG32SEL,(TF_GS + 8)(%rsp) + +1: +ld_fs: + xorq %rbx, %rbx + movw (TF_FS + 8)(%rsp), %ax + movw %ax, %fs /* blew away fsbase here */ + cmpw $KUF32SEL, %ax + jne 2f + movq $0x666, %rbx /* Kernel doesn't use %fs */ + +2: +ld_gs: + movl PCPU(CPUID), %edi + callq pcpu_find + movq %rax, %rdi /* kgsbase == pcpu_find(PCPU_GET(cpuid)) */ + + movw (TF_GS + 8)(%rsp), %ax + movw %ax, %gs /* blew away (k)gsbase here */ + + /* Kernel gsbase reload */ + callq xen_load_kgsbase /* reload kgsbase from %rdi */ + + movw %gs, %ax + cmpw $KUG32SEL, %ax + jne 3f + movq $0x666, %rbx +3: + cmpq $0x666, %rbx /* reload user %fs/%gs ?
*/ + jne 4f + + movq PCPU(CURPCB), %rdi + callq xen_load_tls /* Update user %fs/%gs to pcb_fsbase and pcb_gsbase */ + +4: /* done with %fs/%gs */ + ld_es: - movw (TF_ES + 8)(%rsp,1), %es ; /* Save on previous frame */ + movw (TF_ES + 8)(%rsp), %es ; ld_ds: - movw (TF_DS + 8)(%rsp,1), %ds ; /* Save on previous frame */ -ld_fs: /* %fs == 0 and the per-proc base is updated via xen_set_proc() */ -ld_gs: /* XEN manages %gs (swapgs) */ + movw (TF_DS + 8)(%rsp), %ds ; + +segs_done: ret /* The path below should not have been reached. */ - .globl doreti_iret .globl doreti_iret_fault .globl ld_gsbase .globl ld_fsbase .globl gsbase_load_fault .globl fsbase_load_fault .globl mca_intr .globl fs_load_fault /* XXX: revisit */ .globl gs_load_fault /* XXX: revisit */ -doreti_iret: doreti_iret_fault: ld_gsbase: ld_fsbase: gsbase_load_fault: fsbase_load_fault: fs_load_fault: gs_load_fault: mca_intr: movq msgflt, %rdi call panic /* panic("..."); */ msgflt: .asciz "Unknown kernel fault rip location\n" .globl ds_load_fault .globl es_load_fault ds_load_fault: es_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) movq %rsp,%rdi call trap movw $KUDSEL,TF_ES(%rsp) DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; IDTVEC(de) /* Divide-By-Zero-Error */ TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_DIVIDE) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; IDTVEC(db) /* Debug */ TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_TRCTRAP); - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; IDTVEC(nmi) /* Non-Maskable-Interrupt */ TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_NMI) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; IDTVEC(bp) /* Breakpoint */ TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_BPTFLT) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; IDTVEC(of) /* Overflow */ TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_OFLOW) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; IDTVEC(br) /* Bound-Range */ TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_BOUND) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; IDTVEC(ud) /* Invalid-Opcode */ TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_PRIVINFLT) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ 
RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; IDTVEC(nm) /* Device-Not-Available */ TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_DNA) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; IDTVEC(df) /* Double-Fault */ TRAP_FRAME_ENTER_ERR ; TRAP_PROLOGUE(T_DOUBLEFLT) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_ERR ; INTR_EXIT ; IDTVEC(ts) /* Invalid-TSS */ TRAP_FRAME_ENTER_ERR ; TRAP_PROLOGUE(T_TSSFLT) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; - RESTORE_GENERAL_REGS ; RESTORE_SEGMENT_REGS ; + RESTORE_GENERAL_REGS ; /* overwrites some general registers */ TRAP_FRAME_EXIT_ERR ; INTR_EXIT ; IDTVEC(np) /* Segment-Not-Present */ TRAP_FRAME_ENTER_ERR ; TRAP_PROLOGUE(T_SEGNPFLT) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; - RESTORE_GENERAL_REGS ; RESTORE_SEGMENT_REGS ; + RESTORE_GENERAL_REGS ; /* overwrites some general registers */ TRAP_FRAME_EXIT_ERR ; INTR_EXIT ; IDTVEC(ss) /* Stack */ TRAP_FRAME_ENTER_ERR ; TRAP_PROLOGUE(T_STKFLT) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_ERR ; INTR_EXIT ; IDTVEC(gp) /* General-Protection */ TRAP_FRAME_ENTER_ERR ; TRAP_PROLOGUE(T_PROTFLT); SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ TRAP_FRAME_EXIT_ERR ; INTR_EXIT ; IDTVEC(pf) /* Page-Fault */ TRAP_FRAME_ENTER_ERR ; TRAP_PROLOGUE(T_PAGEFLT); - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; SETUP_TF_ADDR ; /* Fault Address - clobbers %rsi %rdi */ CALLTRAP ; /* %rsi is ignored */ DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_ERR ; INTR_EXIT ; IDTVEC(mf) /* x87 Floating-Point Exception Pending */ TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_ARITHTRAP) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; IDTVEC(ac) /* Alignment-Check */ TRAP_FRAME_ENTER_ERR ; TRAP_PROLOGUE(T_ALIGNFLT) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_ERR ; INTR_EXIT ; IDTVEC(mc) /* Machine-Check */ TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_MCHK) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; IDTVEC(xf) /* SIMD Floating-Point */ 
TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_XMMFLT) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; IDTVEC(rs) /* Reserved */ TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_RESERVED) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; DO_STI_MAYBE ; CALLTRAP ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; ENTRY(fork_trampoline) movq %r12,%rdi /* function */ movq %rbx,%rsi /* arg1 */ movq %rsp,%rdx /* trapframe pointer */ call fork_exit DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; IDTVEC(hypervisor_callback) /* Xen only */ TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_EVENT) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; EVENT_UPCALL ; DO_STI_MAYBE ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; ENTRY(failsafe_callback) movq msgfailsafe, %rdi ; call panic ; /* panic("..."); */ msgfailsafe: .asciz "Failsafe upcall triggered\n" IDTVEC(syscall_callback) TRAP_FRAME_ENTER_NOERR ; TRAP_PROLOGUE(T_USER) ; - SAVE_SEGMENT_REGS ; SAVE_GENERAL_REGS ; + SAVE_SEGMENT_REGS ; movq %r11, TF_RFLAGS(%rsp) ; /* Tweak for INTR_EXIT */ movq %r10, TF_RCX(%rsp) ; /* Translate to C abi. see trap.c:cpu_fetch_syscall_args() */ DO_STI_MAYBE ; /* Clobbers %rdi */ movq TF_RDI(%rsp), %rdi ; CALLSYSCALL ; DO_AST_MAYBE ; + RESTORE_SEGMENT_REGS ; /* overwrites some general registers */ RESTORE_GENERAL_REGS ; /* XXX: optimise for SYSRET */ - RESTORE_SEGMENT_REGS ; TRAP_FRAME_EXIT_NOERR ; INTR_EXIT ; /* XXX: SYSRET is more optimal */ Index: projects/amd64_xen_pv/sys/amd64/xen/machdep.c =================================================================== --- projects/amd64_xen_pv/sys/amd64/xen/machdep.c (revision 260906) +++ projects/amd64_xen_pv/sys/amd64/xen/machdep.c (revision 260907) @@ -1,1774 +1,1802 @@ /* $FreeBSD$ */ /*- * Copyright (c) 2012, 2013 Spectra Logic Corporation * All rights reserved. * * Portions of this software were developed by * Cherry G. Mathew under sponsorship * from Spectra Logic Corporation. * * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include "opt_compat.h" #include "opt_cpu.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_smp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX: remove with RB_XXX */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define CS_SECURE(cs) (0) /* XXX: TODO */ #define EFL_SECURE(ef, oef) (0) /* XXX: TODO */ int _udatasel, _ucodesel, _ufssel, _ugssel; int cold = 1; int gdtset = 0; long Maxmem = 0; long realmem = 0; unsigned long physfree; start_info_t *xen_start_info; shared_info_t *HYPERVISOR_shared_info; xen_pfn_t *xen_machine_phys = machine_to_phys_mapping; xen_pfn_t *xen_phys_machine; xen_pfn_t *xen_pfn_to_mfn_frame_list[16]; /* XXX: TODO init for suspend/resume */ xen_pfn_t *xen_pfn_to_mfn_frame_list_list; /* XXX: TODO init for suspend/resume */ int xen_vector_callback_enabled = 0; enum xen_domain_type xen_domain_type = XEN_PV_DOMAIN; #define PHYSMAP_SIZE (2 * VM_PHYSSEG_MAX) vm_offset_t pa_index = 0; vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; struct kva_md_info kmi; static struct trapframe proc0_tf; struct pcpu __pcpu[MAXCPU]; struct user_segment_descriptor gdt[512] __aligned(PAGE_SIZE); /* vcpu0 global descriptor tables */ struct mtx icu_lock; struct mtx dt_lock; /* lock for GDT and LDT */ /* XXX : please review its use */ /* callback prototypes */ void Xhypervisor_callback(void); void failsafe_callback(void); void Xsyscall_callback(void); vm_paddr_t initxen(struct start_info *); extern void printcpuinfo(void); /* XXX header file */ extern void identify_cpu(void); /* XXX header file */ extern void panicifcpuunsupported(void); /* XXX header file */ static void get_fpcontext(struct thread *td, mcontext_t *mcp); static int set_fpcontext(struct thread *td, const mcontext_t *mcp, char *xfpustate, size_t xfpustate_len); /*------------------------------- Per-CPU Data -------------------------------*/ DPCPU_DEFINE(struct vcpu_info *, vcpu_info); /* Expects a zero-ed page aligned page */ static void setup_gdt(struct user_segment_descriptor *thisgdt) { uint32_t base, limit; uint8_t type, dpl, p, l, def32, gran; int i; for (i = 0; i < NGDT; i++) { base = 0; limit = 0; type = 0; dpl = 0; p = 0; l = 0; def32 = 0; gran = 0; switch (i) { #if 0 /* xen manages user/kernel stack switches by 
itself (not via tss) */ case GPROC0_SEL: /* kernel TSS (64bit) first half */ /* Second half is all zeroes */ limit = sizeof(struct amd64tss) + IOPAGES * PAGE_SIZE - 1; type = SDT_SYSTSS; dpl = SEL_KPL; p = 1; break; #endif /* 0 */ case GUFS32_SEL: case GUGS32_SEL: case GUDATA_SEL: limit = 0xfffff; type = SDT_MEMRWA; dpl = SEL_UPL; p = 1; def32 = 1; gran = 1; break; case GUCODE_SEL: limit = 0xfffff; type = SDT_MEMERA; dpl = SEL_UPL; p = 1; l = 1; gran = 1; break; case GCODE_SEL: limit = 0xfffff; type = SDT_MEMERA; dpl = SEL_KPL; p = 1; l = 1; gran = 1; break; case GDATA_SEL: limit = 0xfffff; type = SDT_MEMRWA; dpl = SEL_KPL; p = 1; l = 1; gran = 1; break; case GUCODE32_SEL: limit = 0xfffff; type = SDT_MEMERA; dpl = SEL_UPL; p = 1; def32 = 1; gran = 1; break; } USD_SETBASE(&thisgdt[i], base); USD_SETLIMIT(&thisgdt[i], limit); thisgdt[i].sd_type = type; thisgdt[i].sd_dpl = dpl; thisgdt[i].sd_p = p; thisgdt[i].sd_long = l; thisgdt[i].sd_def32 = def32; thisgdt[i].sd_gran = gran; thisgdt[i].sd_xx = 0; } } /* * Tell xen about our exception handlers. Unlike page tables, this is * a "fire-and-forget" xen setup - we only need to pass a template of * the vector table which xen then makes a copy of. Each time this * function is called, the entire trap table is updated. * * Note: We have a page worth of boot stack, so it is OK to put the * template on the stack. */ extern int Xde, Xdb, Xnmi, Xbp, Xof, Xbr, Xud, Xnm, Xdf, Xts, Xnp, Xss, Xgp, Xpf, Xmf, Xac, Xmc, Xxf; static void init_exception_table(void) { /* * The vector mapping is dictated by the Intel 64 and * IA-32 Architectures Software Developer's Manual, Volume 3, * System Programming Guide.... Table 6.1 "Exceptions and * Interrupts". * * Note: Xen only re-routes exceptions via this * mechanism. Hardware Interrupts are managed via an "event" * mechanism, elsewhere.
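 *
 * (Sketch) Each entry hands Xen one vector -> (cs:address, dpl)
 * mapping; a hypothetical one-entry table for the breakpoint trap
 * would look like:
 *
 *	struct trap_info ti[] = {
 *		{ 3, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long)&Xbp },
 *		{ 0, 0, 0, 0 }		// terminator: .address == 0
 *	};
 *	HYPERVISOR_set_trap_table(ti);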
*/ struct trap_info exception_table[] = { /* .vector, .flags, .cs, .address */ { 0, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xde }, { 1, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xdb }, { 2, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xnmi }, /* XXX: masking */ { 3, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xbp }, { 4, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xof }, { 5, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xbr }, { 6, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xud }, { 7, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xnm }, { 8, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xdf }, { 10, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xts }, { 11, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xnp }, { 12, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xss }, { 13, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xgp }, { 14, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xpf }, { 15, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xmf }, { 16, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xac }, { 17, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xmc }, /* XXX: investigate MCA on XEN */ { 18, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &Xxf }, { 0, 0, 0, 0 } /* End of table marker * .address == 0 */ }; PANIC_IF(HYPERVISOR_set_trap_table(exception_table)); } static void init_event_callbacks(void) { struct callback_register cbr; cbr.type = CALLBACKTYPE_event; cbr.address = (unsigned long)Xhypervisor_callback; PANIC_IF(HYPERVISOR_callback_op(CALLBACKOP_register, &cbr)); cbr.type = CALLBACKTYPE_failsafe; cbr.address = (unsigned long)failsafe_callback; PANIC_IF(HYPERVISOR_callback_op(CALLBACKOP_register, &cbr)); cbr.type = CALLBACKTYPE_syscall; cbr.address = (unsigned long)Xsyscall_callback; PANIC_IF(HYPERVISOR_callback_op(CALLBACKOP_register, &cbr)); /* XXX: syscall32, sysenter */ } #define XEN_CPUID_LEAF_HYPERCALL XEN_CPUID_LEAF(3 - 1) void xen_set_hypercall_page(vm_paddr_t); extern char hypercall_page[]; /* locore.s */ extern uint64_t xenstack; /* start of Xen provided stack */ void force_evtchn_callback(void) { (void)HYPERVISOR_xen_version(0, NULL); } /* * Modify the cmd_line by converting ',' to NULLs so that it is in a format * suitable for the static env vars. 
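 *
 * e.g. (sketch) a cmd_line of "boot_verbose,vfs.root.mountfrom=ufs:ad0"
 * is rewritten in place into the NUL-separated env strings
 * "boot_verbose" and "vfs.root.mountfrom=ufs:ad0".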
* XXX: nicked from, unify with i386/xen_machdep.c */ static char * xen_setbootenv(char *cmd_line) { char *cmd_line_next; /* Skip leading spaces */ for (; *cmd_line == ' '; cmd_line++); printk("xen_setbootenv(): cmd_line='%s'\n", cmd_line); for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;); return cmd_line; } static struct { const char *ev; int mask; } howto_names[] = { {"boot_askname", RB_ASKNAME}, {"boot_single", RB_SINGLE}, {"boot_nosync", RB_NOSYNC}, {"boot_halt", RB_ASKNAME}, {"boot_serial", RB_SERIAL}, {"boot_cdrom", RB_CDROM}, {"boot_gdb", RB_GDB}, {"boot_gdb_pause", RB_RESERVED1}, {"boot_verbose", RB_VERBOSE}, {"boot_multicons", RB_MULTIPLE}, {NULL, 0} }; static int xen_boothowto(char *envp) { int i, howto = 0; /* get equivalents from the environment */ for (i = 0; howto_names[i].ev != NULL; i++) if (getenv(howto_names[i].ev) != NULL) howto |= howto_names[i].mask; return howto; } static void xen_rootconf(void) { char *rdevpath; rdevpath = getenv("root"); if (rdevpath == NULL) { return; } /* Do not overwrite existing variable */ if (getenv("vfs.root.mountfrom") == NULL) { setenv("vfs.root.mountfrom", rdevpath); } } SYSINIT(xen_rootconf, SI_SUB_ROOT_CONF, SI_ORDER_ANY, xen_rootconf, NULL); /* * Setup early kernel environment, based on start_info passed to us by * xen */ vm_paddr_t initxen(struct start_info *si) { char *env; caddr_t kmdp; size_t kstack0_sz; struct pcpu *pc; KASSERT(si != NULL, ("start_info invalid")); /* global variables */ xen_start_info = si; /* xen variables */ xen_phys_machine = (xen_pfn_t *)si->mfn_list; physmem = si->nr_pages; Maxmem = si->nr_pages + 1; memset(phys_avail, 0, sizeof phys_avail); memset(dump_avail, 0 , sizeof dump_avail); /* * Setup kernel PCPU base. pcpu needs them, and other * parts of the early startup path use pcpu variables before * we have loaded the new Global Descriptor Table. + * XXX: revisit */ pc = &__pcpu[0]; - HYPERVISOR_set_segment_base (SEGBASE_FS, 0); HYPERVISOR_set_segment_base (SEGBASE_GS_KERNEL, (uint64_t) pc); - HYPERVISOR_set_segment_base (SEGBASE_GS_USER, 0); /* Setup paging */ /* * We'll reclaim the space taken by bootstrap PT and bootstrap * stack by marking them later as an available chunk via * phys_avail[] to the vm subsystem. */ /* Address of lowest unused page */ physfree = VTOP(si->pt_base + si->nr_pt_frames * PAGE_SIZE); /* Init basic tunables, hz, msgbufsize etc */ init_param1(); /* page tables */ pmap_bootstrap(&physfree); /* Setup thread context */ thread0.td_kstack = PTOV(physfree); thread0.td_kstack_pages = KSTACK_PAGES; kstack0_sz = ptoa(thread0.td_kstack_pages); bzero((void *)thread0.td_kstack, kstack0_sz); thread0.td_pcb = get_pcb_td(&thread0); physfree += kstack0_sz; /* Make sure we are still inside of available mapped va. */ KASSERT(PTOV(physfree) <= (xenstack + 512 * 1024), ("Attempt to use unmapped va\n")); /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. 
*/ proc_linkup0(&proc0, &thread0); KASSERT(si->mod_start == 0, ("MISMATCH")); if (si->mod_start != 0) { /* we have a ramdisk or kernel module */ preload_metadata = (caddr_t)(si->mod_start); preload_bootstrap_relocate(KERNBASE); } kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); if (envmode == 1) kern_envp = static_env; else if ((caddr_t)xen_start_info->cmd_line) kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line); boothowto |= xen_boothowto(kern_envp); + boothowto |= RB_SINGLE; #ifdef DDB /* XXX: */ ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); #endif /* gdt */ vm_paddr_t gdt0_frame = phystomach(VTOP(gdt)); vm_paddr_t gdt0_frame_mfn = PFNTOMFN(VTOPFN(gdt)); memset(gdt, 0, sizeof gdt); setup_gdt(gdt); /* gdt resides in R/O memory. Update mappings */ if (HYPERVISOR_update_va_mapping((vm_offset_t)gdt, gdt0_frame | PG_U | PG_V, UVMF_INVLPG)) { printk("HYPERVISOR_update_va_mapping() failed\n"); cpu_halt(); /* NOTREACHED */ } if (HYPERVISOR_set_gdt((unsigned long *)&gdt0_frame_mfn, NGDT) != 0) { printk("HYPERVISOR_set_gdt() failed\n"); cpu_halt(); /* NOTREACHED */ } - lgdt(NULL); /* See: support.S */ + lgdt(NULL); /* Load all segment registers - See: support.S */ /* * Refresh kernel tls registers since we've blown them away - * via new GDT load. pcpu needs them. + * via new GDT load and segment reloads. pcpu needs them. */ + + HYPERVISOR_set_segment_base (SEGBASE_FS, 0); HYPERVISOR_set_segment_base (SEGBASE_GS_KERNEL, (uint64_t) pc); + HYPERVISOR_set_segment_base (SEGBASE_GS_USER, (uint64_t) 0); /* per cpu structures for cpu0 */ pcpu_init(pc, 0, sizeof(struct pcpu)); - dpcpu_init((void *)(PTOV(physfree)), 0); physfree += DPCPU_SIZE; /* XXX: This is a hack until we have MP */ DPCPU_ID_SET(0, vcpu_info, &HYPERVISOR_shared_info->vcpu_info[0]); /* Register the rest of free physical memory with phys_avail[] */ /* dump_avail[] starts at index 1 */ phys_avail[pa_index++] = physfree; dump_avail[pa_index] = physfree; phys_avail[pa_index++] = ptoa(physmem); dump_avail[pa_index] = ptoa(physmem); PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); PCPU_SET(tssp, &common_tss[0]); /* Dummy - see definition */ PCPU_SET(commontssp, &common_tss[0]); /* Dummy - see definition */ + /* XXX: ldt */ PCPU_SET(fs32p, (void *)xpmap_ptom(VTOP(&gdt[GUFS32_SEL]))); /* Note: On Xen PV, we set the machine address. */ PCPU_SET(gs32p, (void *)xpmap_ptom(VTOP(&gdt[GUGS32_SEL]))); /* Note: On Xen PV, we set the machine address. */ /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. 
*/ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exception handling */ init_exception_table(); /* Event handling */ init_event_callbacks(); cninit(); /* Console subsystem init */ kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif identify_cpu(); /* Final stage of CPU initialization */ initializecpu(); initializecpucache(); init_param2(physmem); bzero(msgbufp, msgbufsize); msgbufinit(msgbufp, msgbufsize); /* Enable write permissions for code patching */ static vm_offset_t xsave_cpage; xsave_cpage = (vm_offset_t) ctx_switch_xsave & ~PAGE_MASK; if (use_xsave) PT_SET_MA(xsave_cpage, phystomach(VTOP(xsave_cpage)) | PG_V | PG_U | PG_RW); fpuinit(); if (use_xsave) PT_SET_MA(xsave_cpage, phystomach(VTOP(xsave_cpage)) | PG_V | PG_U); /* * Set up thread0 pcb after fpuinit calculated pcb + fpu save * area size. Zero out the extended state header in fpu save * area. */ thread0.td_pcb = get_pcb_td(&thread0); bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); PCPU_SET(rsp0, (vm_offset_t) thread0.td_pcb & ~0xFul /* 16 byte aligned */); PCPU_SET(curpcb, thread0.td_pcb); HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), (unsigned long) PCPU_GET(rsp0)); /* Tell xen about the kernel stack */ /* setup user mode selector glue */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); + /* XXX: _ucode32sel & compat_32 */ _ufssel = GSEL(GUFS32_SEL, SEL_UPL); _ugssel = GSEL(GUGS32_SEL, SEL_UPL); - /* Load thread0 context */ - load_ds(_udatasel); - load_es(_udatasel); - load_fs(0); /* reset %fs to 0 before 64bit base load */ - HYPERVISOR_set_segment_base (SEGBASE_FS, 0); - HYPERVISOR_set_segment_base (SEGBASE_GS_USER_SEL, (uint64_t) 0); - HYPERVISOR_set_segment_base (SEGBASE_GS_USER, (uint64_t) 0); + /* + * Native does a "transfer to user mode" - which seems rather + * suspect^wunfinished to me (cherry@). + * + * We don't do this on xen, since this thread eventually + * becomes vm/vm_glue.c:swapper() , which assumes that it is + * running in kernel mode. + * + * Note, cherry@: I don't think it's worth the trouble setting + * up a separate "swapper" user context for this thread, + * unless a strong case for performance savings (TLB hits ?) + * can be made. + */ /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; thread0.td_pcb->pcb_cr3 = xpmap_ptom(VTOP(KPML4phys)); thread0.td_frame = &proc0_tf; + thread0.td_pcb->pcb_gsbase = (uint64_t) pc; env = getenv("kernelname"); if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); return (u_int64_t) thread0.td_pcb & ~0xFul /* 16 byte aligned */; } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* Not applicable */ } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { uint64_t tsc1, tsc2; register_t reg; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); #ifdef SMP if (smp_cpus > 1) { /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); } #endif /* Calibrate by measuring a short delay. 
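 *
 * DELAY(1000) spins for roughly 1000us, so (tsc2 - tsc1) is a tick
 * count per millisecond and the scaling below gives ticks per second:
 *
 *	*rate = (tsc2 - tsc1) * 1000;	// ticks/ms -> ticks/s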
*/ reg = intr_disable(); tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); *rate = (tsc2 - tsc1) * 1000; #ifdef SMP if (smp_cpus > 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif return (0); } void cpu_halt(void) { HYPERVISOR_shutdown(SHUTDOWN_poweroff); } #define STATE_RUNNING 0x0 #define STATE_MWAIT 0x1 #define STATE_SLEEPING 0x2 int scheduler_running; static void idle_block(void) { HYPERVISOR_sched_op(SCHEDOP_block, 0); } void cpu_idle(int busy) { CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); cpu_idleclock(); } /* Call main idle method. */ scheduler_running = 1; enable_intr(); idle_block(); /* Switch timers back into active mode. */ if (!busy) { cpu_activeclock(); critical_exit(); } CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } int cpu_idle_wakeup(int cpu) { struct pcpu *pcpu; int *state; pcpu = pcpu_find(cpu); state = (int *)pcpu->pc_monitorbuf; /* * This doesn't need to be atomic since missing the race will * simply result in unnecessary IPIs. */ if (*state == STATE_SLEEPING) return (0); if (*state == STATE_MWAIT) *state = STATE_RUNNING; return (1); } static void cpu_startup(void *dummy) { uintmax_t memsize; /* * Good {morning,afternoon,evening,night}. */ startrtclock(); //printcpuinfo(); //panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif realmem = Maxmem; /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; if (memsize < ptoa((uintmax_t)cnt.v_free_count)) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)cnt.v_free_count), ptoa((uintmax_t)cnt.v_free_count) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* XXX: Unify with "native" machdep.c */ /* * Reset registers to default values on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; mtx_lock(&dt_lock); if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); else mtx_unlock(&dt_lock); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __INITIAL_FPUCW__; set_pcb_flags(pcb, PCB_FULL_IRET); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; regs->tf_rdi = stack; /* argv */ regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; td->td_retval[1] = 0; /* XXX: we don't do PCB_DBREGS */ /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again.
*/ fpstate_drop(td); } void cpu_setregs(void) { /* XXX: */ } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } /* XXX: Delete when we get mp_machdep.c */ /* Dummy defines to get x86/x86/local_apic.c to link. */ int Xspuriousint, Xtimerint, Xerrorint, Xcmcint, Xapic_isr1; int Xapic_isr2, Xapic_isr3, Xapic_isr4, Xapic_isr5; int Xapic_isr6, Xapic_isr7; int cmc_intr; void setidt(int idx, alias_for_inthand_t *func, int typ, int dpl, int ist) { } struct cpu_info { int cpu_present:1; int cpu_bsp:1; int cpu_disabled:1; }; /* We don't want to #include */ static struct cpu_info cpu_info[MAX_APIC_ID + 1]; int boot_cpu_id = -1; void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > MAX_APIC_ID) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %d claims to be BSP, but CPU %d already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (mp_ncpus < MAXCPU) mp_ncpus++; if (bootverbose) printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } /* Delete when we get mp_machdep.c : XXX*/ void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; critical_exit(); flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(flags); } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace.
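 *
 * Only the callee-saved registers (%rbx, %rbp, %r12-%r15) plus
 * %rip/%rsp are copied below; at a call boundary the caller-saved
 * registers are dead, which is all a stack unwinder needs.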
/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	return (0);
}

int
ptrace_single_step(struct thread *td)
{

	td->td_frame->tf_rflags |= PSL_T;
	return (0);
}

int
ptrace_clear_single_step(struct thread *td)
{

	td->td_frame->tf_rflags &= ~PSL_T;
	return (0);
}

int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{

	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9 = tp->tf_r9;
	regs->r_r8 = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	return (0);
}

int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9 = regs->r_r9;
	tp->tf_r8 = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
		set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	}
	return (0);
}
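set_regs() refuses a trapframe whose %cs or rflags a debugger could use to escalate privilege. CS_SECURE() boils down to checking the selector's privilege bits; a standalone sketch of that style of check follows (the real macro lives in the amd64 headers; the constants here are illustrative):

#include <assert.h>
#include <stdio.h>

#define	SEL_RPL_MASK	3	/* low selector bits: requested privilege */
#define	SEL_UPL		3	/* user privilege level */

static int
cs_looks_user(unsigned int cs)
{
	/* A user-supplied code selector must carry RPL 3. */
	return ((cs & SEL_RPL_MASK) == SEL_UPL);
}

int
main(void)
{
	assert(cs_looks_user(0x33));	/* typical 64-bit user %cs */
	assert(!cs_looks_user(0x08));	/* kernel selector: rejected */
	printf("selector check ok\n");
	return (0);
}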
/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td),
	    ("not suspended thread %p", td));
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}

/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9 = tp->tf_r9;
	mcp->mc_r8 = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}
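The GET_MC_CLEAR_RET branch in get_mcontext() scrubs the return registers and the carry flag (FreeBSD's syscall error indicator in rflags) so that a context captured inside an interrupted syscall resumes as a clean zero return. A compact userland sketch of that behaviour, with all names local to the sketch:

#include <stdint.h>
#include <stdio.h>

#define	MODEL_PSL_C	0x1ul	/* carry: syscall error flag */
#define	MODEL_CLEAR_RET	0x1

struct mini_mcontext {
	uint64_t mc_rax, mc_rdx, mc_rflags;
};

static void
mini_get_mcontext(struct mini_mcontext *mcp, uint64_t rax, uint64_t rdx,
    uint64_t rflags, int flags)
{
	mcp->mc_rflags = rflags;
	if (flags & MODEL_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~MODEL_PSL_C;	/* no error reported */
	} else {
		mcp->mc_rax = rax;
		mcp->mc_rdx = rdx;
	}
}

int
main(void)
{
	struct mini_mcontext mc;

	/* Capture mid-syscall: rax held EINTR (4) and carry was set. */
	mini_get_mcontext(&mc, 4, 0, MODEL_PSL_C, MODEL_CLEAR_RET);
	printf("rax=%llu carry=%llu\n", (unsigned long long)mc.mc_rax,
	    (unsigned long long)(mc.mc_rflags & MODEL_PSL_C));
	return (0);
}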
/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 */
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9 = mcp->mc_r9;
	tp->tf_r8 = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	set_pcb_flags(pcb, PCB_FULL_IRET);
	return (0);
}

static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{

	mcp->mc_ownedfp = fpugetregs(td);
	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate,
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = fpuformat();
}

static int
set_fpcontext(struct thread *td, const mcontext_t *mcp, char *xfpustate,
    size_t xfpustate_len)
{
	struct savefpu *fpstate;
	int error;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
		error = 0;
	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		fpstate = (struct savefpu *)&mcp->mc_fpstate;
		fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
		error = fpusetregs(td, fpstate, xfpustate, xfpustate_len);
	} else
		return (EINVAL);
	return (error);
}

void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu. The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop. Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig(). But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
	 * have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}

int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{

	KASSERT(0, ("XXX: TODO"));
	return -1;
}

int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{

	KASSERT(0, ("XXX: TODO"));
	return -1;
}

void
reset_dbregs(void)
{

	KASSERT(0, ("XXX: TODO"));
}
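set_fpcontext() above masks the user-supplied MXCSR with cpu_mxcsr_mask before the value can ever reach ldmxcsr, because loading reserved MXCSR bits raises #GP. A standalone sketch of the sanitization; the mask constant below is a commonly seen fxsave default and is an assumption of the sketch, real code reads the mask from the CPU:

#include <stdint.h>
#include <stdio.h>

#define	MXCSR_MASK_GUESS	0xffbfu	/* assumed default; read via fxsave */

static uint32_t
sanitize_mxcsr(uint32_t user_mxcsr)
{
	/* Clear every bit the CPU did not advertise as implemented. */
	return (user_mxcsr & MXCSR_MASK_GUESS);
}

int
main(void)
{
	uint32_t hostile = 0xffffffffu;	/* reserved bits set */

	printf("0x%08x -> 0x%08x\n", hostile, sanitize_mxcsr(hostile));
	return (0);
}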
#define	PRINTK_BUFSIZE	1024
void
printk(const char *fmt, ...)
{
	__va_list ap;

	va_start(ap, fmt);
	vprintk(fmt, ap);
	va_end(ap);
}

int
vprintk(const char *fmt, __va_list ap)
{
	int retval;
	static char buf[PRINTK_BUFSIZE];

	retval = vsnprintf(buf, PRINTK_BUFSIZE - 1, fmt, ap);
	if (retval >= PRINTK_BUFSIZE - 1)
		retval = PRINTK_BUFSIZE - 2; /* vsnprintf reports the untruncated length */
	buf[retval] = 0;
	(void)HYPERVISOR_console_write(buf, retval);
	return retval;
}

#ifdef KTR
static __inline u_long
rrbp(void)
{
	u_long data;

	/* The saved return address sits at 8(%rbp) on amd64, not 4. */
	__asm __volatile("movq 8(%%rbp),%0" : "=r" (data));
	return (data);
}
#endif

u_long
read_rflags(void)
{
	vcpu_info_t *_vcpu;
	u_long rflags;

	rflags = _read_rflags();
	_vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()];
	if (_vcpu->evtchn_upcall_mask)
		rflags &= ~PSL_I;

	return (rflags);
}

void
write_rflags(u_long rflags)
{
	u_int intr;

	CTR2(KTR_SPARE2, "%x xen_restore_flags rflags %x", rrbp(), rflags);
	intr = ((rflags & PSL_I) == 0);
	__restore_flags(intr);
	_write_rflags(rflags);
}

void
xen_cli(void)
{
	CTR1(KTR_SPARE2, "%x xen_cli disabling interrupts", rrbp());
	__cli();
}

void
xen_sti(void)
{
	CTR1(KTR_SPARE2, "%x xen_sti enabling interrupts", rrbp());
	__sti();
}

u_long
xen_rcr2(void)
{

	return (HYPERVISOR_shared_info->vcpu_info[curcpu].arch.cr2);
}

+/*
+ * Set the kernel %gs base.
+ * This is required after a %gs reload from kernel context.
+ */
void
+xen_load_kgsbase(uint64_t gsbase)
+{
+	HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, gsbase);
+}
+
+/* Set usermode TLS registers from pcb context. */
+void
+xen_load_tls(struct pcb *pcb)
+{
+	/* XXX: compat32 */
+	if (pcb->pcb_flags & PCB_32BIT) {
+		struct user_segment_descriptor gsd;
+
+		gsd = gdt[GUGS32_SEL];
+		USD_SETBASE(&gsd, pcb->pcb_gsbase);
+		xen_set_descriptor((vm_paddr_t)PCPU_GET(gs32p), (void *)&gsd);
+
+		/* XXX: this PCB_32BIT re-check duplicates the test above. */
+		if (pcb->pcb_flags & PCB_32BIT) {
+			gsd = gdt[GUFS32_SEL];
+			USD_SETBASE(&gsd, pcb->pcb_fsbase);
+			xen_set_descriptor((vm_paddr_t)PCPU_GET(fs32p),
+			    (void *)&gsd);
+		}
+	} else {
+		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL,
+		    _ugssel);
+		HYPERVISOR_set_segment_base(SEGBASE_GS_USER,
+		    pcb->pcb_gsbase);
+		HYPERVISOR_set_segment_base(SEGBASE_FS,
+		    pcb->pcb_fsbase);
+	}
+}
+
+void
xen_set_proc(struct pcb *newpcb)
{
	HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL),
	    (unsigned long) newpcb & ~0xFul);

	if (!(curthread->td_pflags & TDP_KTHREAD)) {
		/* Only for user proc */
-		/* XXX: compat32 */
-		if (newpcb->pcb_flags & PCB_32BIT) {
-			struct user_segment_descriptor gsd;
-			gsd = gdt[GUGS32_SEL];
-			USD_SETBASE(&gsd, newpcb->pcb_gsbase);
-			xen_set_descriptor((vm_paddr_t)PCPU_GET(gs32p), (void *)&gsd);
-
-			if (newpcb->pcb_flags & PCB_32BIT) {
-				gsd = gdt[GUFS32_SEL];
-				USD_SETBASE(&gsd, newpcb->pcb_fsbase);
-				xen_set_descriptor((vm_paddr_t)PCPU_GET(fs32p), (void *)&gsd);
-			}
-		} else {
-			HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL,
-			    0);
-			HYPERVISOR_set_segment_base(SEGBASE_GS_USER,
-			    newpcb->pcb_gsbase);
-			HYPERVISOR_set_segment_base(SEGBASE_FS,
-			    newpcb->pcb_fsbase);
-		}
+		xen_load_tls(newpcb);
	}
}

char *console_page;

/*
 * We don't use the tss on xen pv - this is a dummy to not break
 * common assembler code - see cpu_switch.S:cpu_switch
 */
#include <machine/tss.h>
struct amd64tss common_tss[MAXCPU];

void
sdtossd(sd, ssd)
	struct user_segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{

	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_long = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}
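sdtossd() above reassembles a segment base that the descriptor format stores split into a 24-bit low part and a high part. A self-contained sketch of that packing and its inverse for a 32-bit base; the struct and helper names are local to the sketch, not the kernel's USD_SETBASE():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct mini_usd {
	uint32_t sd_lobase;	/* base bits 0..23 */
	uint32_t sd_hibase;	/* base bits 24..31 */
};

static void
mini_setbase(struct mini_usd *sd, uint32_t base)
{
	sd->sd_lobase = base & 0xffffffu;
	sd->sd_hibase = base >> 24;
}

static uint32_t
mini_getbase(const struct mini_usd *sd)
{
	return ((sd->sd_hibase << 24) | sd->sd_lobase);
}

int
main(void)
{
	struct mini_usd sd;

	mini_setbase(&sd, 0xdeadbeefu);
	assert(mini_getbase(&sd) == 0xdeadbeefu);
	printf("base round-trips: 0x%08x\n", mini_getbase(&sd));
	return (0);
}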
void
ssdtosyssd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct system_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space. Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{

	KASSERT(0, ("XXX: TODO\n"));
	return -1;
}

#include
#include

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below. After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext);
	fpstate_drop(td);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size -
		    sizeof(struct sigframe);
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Translate the signal if appropriate. */
	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	set_pcb_flags(pcb, PCB_FULL_IRET);
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
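When no alternate stack applies, sendsig() places the frame below the interrupted %rsp, first skipping 128 bytes so the amd64 red zone survives, then aligning down to 16 bytes. A standalone sketch of that arithmetic; FRAME_SIZE is a stand-in for sizeof(struct sigframe) and is an assumption of the sketch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	RED_ZONE	128	/* amd64 ABI scratch area below %rsp */
#define	FRAME_SIZE	960	/* stand-in for sizeof(struct sigframe) */

static uint64_t
place_sigframe(uint64_t rsp)
{
	uint64_t sp = rsp - FRAME_SIZE - RED_ZONE;

	return (sp & ~(uint64_t)0xf);	/* align to 16 bytes */
}

int
main(void)
{
	uint64_t rsp = 0x7fffffffe123ull;
	uint64_t sfp = place_sigframe(rsp);

	assert((sfp & 0xf) == 0);			/* aligned */
	assert(sfp + FRAME_SIZE + RED_ZONE <= rsp);	/* red zone intact */
	printf("sigframe at 0x%llx\n", (unsigned long long)sfp);
	return (0);
}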
/*
 * System call to cleanup state after a signal
 * has been taken. Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const struct __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	/*
	 * XXX do allow users to change the privileged flag PSL_RF.
	 * The cpu sets PSL_RF in tf_rflags for faults. Debuggers
	 * should sometimes set it there too. tf_rflags is kept in
	 * the signal context during signal handling and there is no
	 * other place to remember it, so the PSL_RF bit may be
	 * corrupted by the signal handler without us knowing.
	 * Corruption of the PSL_RF bit at worst causes one more or
	 * one less debugger trap, so allowing it is fairly harmless.
	 */
	if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}
	/*
	 * Don't allow users to load a valid privileged %cs. Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
			    "pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	set_pcb_flags(pcb, PCB_FULL_IRET);
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif
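The rflags validation in sys_sigreturn() above lets user-changeable bits differ freely, deliberately ignores PSL_RF on both sides (per the long XXX comment), and rejects any other difference. A standalone sketch of that comparison; the PSL_USERCHANGE value here is an illustrative subset of the real definition:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	MODEL_PSL_USERCHANGE	0xcd5ul	 /* user-modifiable flags (subset) */
#define	MODEL_PSL_RF		0x10000ul /* resume flag: tolerated */

static int
rflags_secure(uint64_t newf, uint64_t oldf)
{
	newf &= ~MODEL_PSL_RF;
	oldf &= ~MODEL_PSL_RF;
	/* Only user-changeable bits may differ between old and new. */
	return (((newf ^ oldf) & ~MODEL_PSL_USERCHANGE) == 0);
}

int
main(void)
{
	assert(rflags_secure(0x202 | MODEL_PSL_RF, 0x202)); /* RF ignored */
	assert(!rflags_secure(0x202 | 0x200000ul, 0x202));  /* VIP: refused */
	printf("rflags check ok\n");
	return (0);
}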