Index: releng/11.4/sys/amd64/amd64/cpu_switch.S =================================================================== --- releng/11.4/sys/amd64/amd64/cpu_switch.S (revision 361587) +++ releng/11.4/sys/amd64/amd64/cpu_switch.S (revision 361588) @@ -1,501 +1,503 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include "assym.s" #include "opt_sched.h" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ .text #ifdef SMP #define LK lock ; #else #define LK #endif #if defined(SCHED_ULE) && defined(SMP) #define SETLK xchgq #else #define SETLK movq #endif /* * cpu_throw() * * This is the second half of cpu_switch(). It is used when the current * thread is either a dummy or slated to die, and we no longer care * about its state. This is only a slight optimization and is probably * not worth it anymore. Note that we need to clear the pm_active bits so * we do need the old proc if it still exists. * %rdi = oldtd * %rsi = newtd */ ENTRY(cpu_throw) movq %rsi,%r12 movq %rsi,%rdi call pmap_activate_sw jmp sw1 END(cpu_throw) /* * cpu_switch(old, new, mtx) * * Save the current thread state, then select the next thread to run * and load its state. * %rdi = oldtd * %rsi = newtd * %rdx = mtx */ ENTRY(cpu_switch) /* Switch to new thread. First, save context. */ movq TD_PCB(%rdi),%r8 movq (%rsp),%rax /* Hardware registers */ movq %r15,PCB_R15(%r8) movq %r14,PCB_R14(%r8) movq %r13,PCB_R13(%r8) movq %r12,PCB_R12(%r8) movq %rbp,PCB_RBP(%r8) movq %rsp,PCB_RSP(%r8) movq %rbx,PCB_RBX(%r8) movq %rax,PCB_RIP(%r8) testl $PCB_FULL_IRET,PCB_FLAGS(%r8) jnz 2f orl $PCB_FULL_IRET,PCB_FLAGS(%r8) testl $TDP_KTHREAD,TD_PFLAGS(%rdi) jnz 2f testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) jz 2f movl %fs,%eax cmpl $KUF32SEL,%eax jne 1f rdfsbase %rax movq %rax,PCB_FSBASE(%r8) 1: movl %gs,%eax cmpl $KUG32SEL,%eax jne 2f movq %rdx,%r12 movl $MSR_KGSBASE,%ecx /* Read user gs base */ rdmsr shlq $32,%rdx orq %rdx,%rax movq %rax,PCB_GSBASE(%r8) movq %r12,%rdx 2: testl $PCB_DBREGS,PCB_FLAGS(%r8) jnz store_dr /* static predict not taken */ done_store_dr: /* have we used fp, and need a save? */ cmpq %rdi,PCPU(FPCURTHREAD) jne 2f movq PCB_SAVEFPU(%r8),%r8 clts cmpl $0,use_xsave(%rip) jne 1f fxsave (%r8) jmp 2f 1: movq %rdx,%rcx movl xsave_mask,%eax movl xsave_mask+4,%edx .globl ctx_switch_xsave ctx_switch_xsave: /* This is patched to xsaveopt if supported, see fpuinit_bsp1() */ xsave (%r8) movq %rcx,%rdx 2: /* Save is done. Now fire up new thread. Leave old vmspace. */ movq %rsi,%r12 movq %rdi,%r13 movq %rdx,%r15 movq %rsi,%rdi callq pmap_activate_sw SETLK %r15,TD_LOCK(%r13) /* Release the old thread */ sw1: movq TD_PCB(%r12),%r8 #if defined(SCHED_ULE) && defined(SMP) /* Wait for the new thread to become unblocked */ movq $blocked_lock, %rdx 1: movq TD_LOCK(%r12),%rcx cmpq %rcx, %rdx pause je 1b #endif /* * At this point, we've switched address spaces and are ready * to load up the rest of the next context. */ /* Skip loading LDT and user fsbase/gsbase for kthreads */ testl $TDP_KTHREAD,TD_PFLAGS(%r12) jnz do_kthread /* * Load ldt register */ movq TD_PROC(%r12),%rcx cmpq $0, P_MD+MD_LDT(%rcx) jne do_ldt xorl %eax,%eax ld_ldt: lldt %ax /* Restore fs base in GDT */ movl PCB_FSBASE(%r8),%eax movq PCPU(FS32P),%rdx movw %ax,2(%rdx) shrl $16,%eax movb %al,4(%rdx) shrl $8,%eax movb %al,7(%rdx) /* Restore gs base in GDT */ movl PCB_GSBASE(%r8),%eax movq PCPU(GS32P),%rdx movw %ax,2(%rdx) shrl $16,%eax movb %al,4(%rdx) shrl $8,%eax movb %al,7(%rdx) do_kthread: /* Do we need to reload tss ? */ movq PCPU(TSSP),%rax movq PCB_TSSP(%r8),%rdx testq %rdx,%rdx cmovzq PCPU(COMMONTSSP),%rdx cmpq %rax,%rdx jne do_tss done_tss: movq %r8,PCPU(RSP0) movq %r8,PCPU(CURPCB) /* Update the TSS_RSP0 pointer for the next interrupt */ cmpq $~0,PCPU(UCR3) je 1f movq PCPU(PTI_RSP0),%rax movq %rax,TSS_RSP0(%rdx) jmp 2f 1: movq %r8,TSS_RSP0(%rdx) 2: movq %r12,PCPU(CURTHREAD) /* into next thread */ /* Test if debug registers should be restored. */ testl $PCB_DBREGS,PCB_FLAGS(%r8) jnz load_dr /* static predict not taken */ done_load_dr: /* Restore context. */ movq PCB_R15(%r8),%r15 movq PCB_R14(%r8),%r14 movq PCB_R13(%r8),%r13 movq PCB_R12(%r8),%r12 movq PCB_RBP(%r8),%rbp movq PCB_RSP(%r8),%rsp movq PCB_RBX(%r8),%rbx movq PCB_RIP(%r8),%rax movq %rax,(%rsp) movq PCPU(CURTHREAD),%rdi call fpu_activate_sw + cmpb $0,cpu_flush_rsb_ctxsw(%rip) + jne rsb_flush ret /* * We order these strangely for several reasons. * 1: I wanted to use static branch prediction hints * 2: Most athlon64/opteron cpus don't have them. They define * a forward branch as 'predict not taken'. Intel cores have * the 'rep' prefix to invert this. * So, to make it work on both forms of cpu we do the detour. * We use jumps rather than call in order to avoid the stack. */ store_dr: movq %dr7,%rax /* yes, do the save */ movq %dr0,%r15 movq %dr1,%r14 movq %dr2,%r13 movq %dr3,%r12 movq %dr6,%r11 movq %r15,PCB_DR0(%r8) movq %r14,PCB_DR1(%r8) movq %r13,PCB_DR2(%r8) movq %r12,PCB_DR3(%r8) movq %r11,PCB_DR6(%r8) movq %rax,PCB_DR7(%r8) andq $0x0000fc00, %rax /* disable all watchpoints */ movq %rax,%dr7 jmp done_store_dr load_dr: movq %dr7,%rax movq PCB_DR0(%r8),%r15 movq PCB_DR1(%r8),%r14 movq PCB_DR2(%r8),%r13 movq PCB_DR3(%r8),%r12 movq PCB_DR6(%r8),%r11 movq PCB_DR7(%r8),%rcx movq %r15,%dr0 movq %r14,%dr1 /* Preserve reserved bits in %dr7 */ andq $0x0000fc00,%rax andq $~0x0000fc00,%rcx movq %r13,%dr2 movq %r12,%dr3 orq %rcx,%rax movq %r11,%dr6 movq %rax,%dr7 jmp done_load_dr do_tss: movq %rdx,PCPU(TSSP) movq %rdx,%rcx movq PCPU(TSS),%rax movw %cx,2(%rax) shrq $16,%rcx movb %cl,4(%rax) shrq $8,%rcx movb %cl,7(%rax) shrq $8,%rcx movl %ecx,8(%rax) movb $0x89,5(%rax) /* unset busy */ movl $TSSSEL,%eax ltr %ax jmp done_tss do_ldt: movq PCPU(LDT),%rax movq P_MD+MD_LDT_SD(%rcx),%rdx movq %rdx,(%rax) movq P_MD+MD_LDT_SD+8(%rcx),%rdx movq %rdx,8(%rax) movl $LDTSEL,%eax jmp ld_ldt END(cpu_switch) /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* Save caller's return address. */ movq (%rsp),%rax movq %rax,PCB_RIP(%rdi) movq %rbx,PCB_RBX(%rdi) movq %rsp,PCB_RSP(%rdi) movq %rbp,PCB_RBP(%rdi) movq %r12,PCB_R12(%rdi) movq %r13,PCB_R13(%rdi) movq %r14,PCB_R14(%rdi) movq %r15,PCB_R15(%rdi) movq %cr0,%rax movq %rax,PCB_CR0(%rdi) movq %cr2,%rax movq %rax,PCB_CR2(%rdi) movq %cr3,%rax movq %rax,PCB_CR3(%rdi) movq %cr4,%rax movq %rax,PCB_CR4(%rdi) movq %dr0,%rax movq %rax,PCB_DR0(%rdi) movq %dr1,%rax movq %rax,PCB_DR1(%rdi) movq %dr2,%rax movq %rax,PCB_DR2(%rdi) movq %dr3,%rax movq %rax,PCB_DR3(%rdi) movq %dr6,%rax movq %rax,PCB_DR6(%rdi) movq %dr7,%rax movq %rax,PCB_DR7(%rdi) movl $MSR_FSBASE,%ecx rdmsr movl %eax,PCB_FSBASE(%rdi) movl %edx,PCB_FSBASE+4(%rdi) movl $MSR_GSBASE,%ecx rdmsr movl %eax,PCB_GSBASE(%rdi) movl %edx,PCB_GSBASE+4(%rdi) movl $MSR_KGSBASE,%ecx rdmsr movl %eax,PCB_KGSBASE(%rdi) movl %edx,PCB_KGSBASE+4(%rdi) movl $MSR_EFER,%ecx rdmsr movl %eax,PCB_EFER(%rdi) movl %edx,PCB_EFER+4(%rdi) movl $MSR_STAR,%ecx rdmsr movl %eax,PCB_STAR(%rdi) movl %edx,PCB_STAR+4(%rdi) movl $MSR_LSTAR,%ecx rdmsr movl %eax,PCB_LSTAR(%rdi) movl %edx,PCB_LSTAR+4(%rdi) movl $MSR_CSTAR,%ecx rdmsr movl %eax,PCB_CSTAR(%rdi) movl %edx,PCB_CSTAR+4(%rdi) movl $MSR_SF_MASK,%ecx rdmsr movl %eax,PCB_SFMASK(%rdi) movl %edx,PCB_SFMASK+4(%rdi) sgdt PCB_GDT(%rdi) sidt PCB_IDT(%rdi) sldt PCB_LDT(%rdi) str PCB_TR(%rdi) movl $1,%eax ret END(savectx) /* * resumectx(pcb) * Resuming processor state from pcb. */ ENTRY(resumectx) /* Switch to KPML4phys. */ movq KPML4phys,%rax movq %rax,%cr3 /* Force kernel segment registers. */ movl $KDSEL,%eax movw %ax,%ds movw %ax,%es movw %ax,%ss movl $KUF32SEL,%eax movw %ax,%fs movl $KUG32SEL,%eax movw %ax,%gs movl $MSR_FSBASE,%ecx movl PCB_FSBASE(%rdi),%eax movl 4 + PCB_FSBASE(%rdi),%edx wrmsr movl $MSR_GSBASE,%ecx movl PCB_GSBASE(%rdi),%eax movl 4 + PCB_GSBASE(%rdi),%edx wrmsr movl $MSR_KGSBASE,%ecx movl PCB_KGSBASE(%rdi),%eax movl 4 + PCB_KGSBASE(%rdi),%edx wrmsr /* Restore EFER one more time. */ movl $MSR_EFER,%ecx movl PCB_EFER(%rdi),%eax wrmsr /* Restore fast syscall stuff. */ movl $MSR_STAR,%ecx movl PCB_STAR(%rdi),%eax movl 4 + PCB_STAR(%rdi),%edx wrmsr movl $MSR_LSTAR,%ecx movl PCB_LSTAR(%rdi),%eax movl 4 + PCB_LSTAR(%rdi),%edx wrmsr movl $MSR_CSTAR,%ecx movl PCB_CSTAR(%rdi),%eax movl 4 + PCB_CSTAR(%rdi),%edx wrmsr movl $MSR_SF_MASK,%ecx movl PCB_SFMASK(%rdi),%eax wrmsr /* Restore CR0, CR2, CR4 and CR3. */ movq PCB_CR0(%rdi),%rax movq %rax,%cr0 movq PCB_CR2(%rdi),%rax movq %rax,%cr2 movq PCB_CR4(%rdi),%rax movq %rax,%cr4 movq PCB_CR3(%rdi),%rax movq %rax,%cr3 /* Restore descriptor tables. */ lidt PCB_IDT(%rdi) lldt PCB_LDT(%rdi) #define SDT_SYSTSS 9 #define SDT_SYSBSY 11 /* Clear "task busy" bit and reload TR. */ movq PCPU(TSS),%rax andb $(~SDT_SYSBSY | SDT_SYSTSS),5(%rax) movw PCB_TR(%rdi),%ax ltr %ax #undef SDT_SYSTSS #undef SDT_SYSBSY /* Restore debug registers. */ movq PCB_DR0(%rdi),%rax movq %rax,%dr0 movq PCB_DR1(%rdi),%rax movq %rax,%dr1 movq PCB_DR2(%rdi),%rax movq %rax,%dr2 movq PCB_DR3(%rdi),%rax movq %rax,%dr3 movq PCB_DR6(%rdi),%rax movq %rax,%dr6 movq PCB_DR7(%rdi),%rax movq %rax,%dr7 /* Restore other callee saved registers. */ movq PCB_R15(%rdi),%r15 movq PCB_R14(%rdi),%r14 movq PCB_R13(%rdi),%r13 movq PCB_R12(%rdi),%r12 movq PCB_RBP(%rdi),%rbp movq PCB_RSP(%rdi),%rsp movq PCB_RBX(%rdi),%rbx /* Restore return address. */ movq PCB_RIP(%rdi),%rax movq %rax,(%rsp) xorl %eax,%eax ret END(resumectx) Index: releng/11.4/sys/amd64/amd64/initcpu.c =================================================================== --- releng/11.4/sys/amd64/amd64/initcpu.c (revision 361587) +++ releng/11.4/sys/amd64/amd64/initcpu.c (revision 361588) @@ -1,295 +1,309 @@ /*- * Copyright (c) KATO Takenori, 1997, 1998. * * All rights reserved. Unpublished rights reserved under the copyright * laws of Japan. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer as * the first lines of this file unmodified. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include #include #include #include #include #include #include #include #include #include static int hw_instruction_sse; SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD, &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU"); static int lower_sharedpage_init; int hw_lower_amd64_sharedpage; SYSCTL_INT(_hw, OID_AUTO, lower_amd64_sharedpage, CTLFLAG_RDTUN, &hw_lower_amd64_sharedpage, 0, "Lower sharedpage to work around Ryzen issue with executing code near the top of user memory"); /* * -1: automatic (default) * 0: keep enable CLFLUSH * 1: force disable CLFLUSH */ static int hw_clflush_disable = -1; static void init_amd(void) { uint64_t msr; /* * Work around Erratum 721 for Family 10h and 12h processors. * These processors may incorrectly update the stack pointer * after a long series of push and/or near-call instructions, * or a long series of pop and/or near-return instructions. * * http://support.amd.com/us/Processor_TechDocs/41322_10h_Rev_Gd.pdf * http://support.amd.com/us/Processor_TechDocs/44739_12h_Rev_Gd.pdf * * Hypervisors do not provide access to the errata MSR, * causing #GP exception on attempt to apply the errata. The * MSR write shall be done on host and persist globally * anyway, so do not try to do it when under virtualization. */ switch (CPUID_TO_FAMILY(cpu_id)) { case 0x10: case 0x12: if ((cpu_feature2 & CPUID2_HV) == 0) wrmsr(0xc0011029, rdmsr(0xc0011029) | 1); break; } /* * BIOS may fail to set InitApicIdCpuIdLo to 1 as it should per BKDG. * So, do it here or otherwise some tools could be confused by * Initial Local APIC ID reported with CPUID Function 1 in EBX. */ if (CPUID_TO_FAMILY(cpu_id) == 0x10) { if ((cpu_feature2 & CPUID2_HV) == 0) { msr = rdmsr(MSR_NB_CFG1); msr |= (uint64_t)1 << 54; wrmsr(MSR_NB_CFG1, msr); } } /* * BIOS may configure Family 10h processors to convert WC+ cache type * to CD. That can hurt performance of guest VMs using nested paging. * The relevant MSR bit is not documented in the BKDG, * the fix is borrowed from Linux. */ if (CPUID_TO_FAMILY(cpu_id) == 0x10) { if ((cpu_feature2 & CPUID2_HV) == 0) { msr = rdmsr(0xc001102a); msr &= ~((uint64_t)1 << 24); wrmsr(0xc001102a, msr); } } /* * Work around Erratum 793: Specific Combination of Writes to Write * Combined Memory Types and Locked Instructions May Cause Core Hang. * See Revision Guide for AMD Family 16h Models 00h-0Fh Processors, * revision 3.04 or later, publication 51810. */ if (CPUID_TO_FAMILY(cpu_id) == 0x16 && CPUID_TO_MODEL(cpu_id) <= 0xf) { if ((cpu_feature2 & CPUID2_HV) == 0) { msr = rdmsr(MSR_LS_CFG); msr |= (uint64_t)1 << 15; wrmsr(MSR_LS_CFG, msr); } } /* Ryzen erratas. */ if (CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1 && (cpu_feature2 & CPUID2_HV) == 0) { /* 1021 */ msr = rdmsr(0xc0011029); msr |= 0x2000; wrmsr(0xc0011029, msr); /* 1033 */ msr = rdmsr(MSR_LS_CFG); msr |= 0x10; wrmsr(MSR_LS_CFG, msr); /* 1049 */ msr = rdmsr(0xc0011028); msr |= 0x10; wrmsr(0xc0011028, msr); /* 1095 */ msr = rdmsr(MSR_LS_CFG); msr |= 0x200000000000000; wrmsr(MSR_LS_CFG, msr); } /* * Work around a problem on Ryzen that is triggered by executing * code near the top of user memory, in our case the signal * trampoline code in the shared page on amd64. * * This function is executed once for the BSP before tunables take * effect so the value determined here can be overridden by the * tunable. This function is then executed again for each AP and * also on resume. Set a flag the first time so that value set by * the tunable is not overwritten. * * The stepping and/or microcode versions should be checked after * this issue is fixed by AMD so that we don't use this mode if not * needed. */ if (lower_sharedpage_init == 0) { lower_sharedpage_init = 1; if (CPUID_TO_FAMILY(cpu_id) == 0x17) { hw_lower_amd64_sharedpage = 1; } } } /* * Initialize special VIA features */ static void init_via(void) { u_int regs[4], val; /* * Check extended CPUID for PadLock features. * * http://www.via.com.tw/en/downloads/whitepapers/initiatives/padlock/programming_guide.pdf */ do_cpuid(0xc0000000, regs); if (regs[0] >= 0xc0000001) { do_cpuid(0xc0000001, regs); val = regs[3]; } else return; /* Enable RNG if present. */ if ((val & VIA_CPUID_HAS_RNG) != 0) { via_feature_rng = VIA_HAS_RNG; wrmsr(0x110B, rdmsr(0x110B) | VIA_CPUID_DO_RNG); } /* Enable PadLock if present. */ if ((val & VIA_CPUID_HAS_ACE) != 0) via_feature_xcrypt |= VIA_HAS_AES; if ((val & VIA_CPUID_HAS_ACE2) != 0) via_feature_xcrypt |= VIA_HAS_AESCTR; if ((val & VIA_CPUID_HAS_PHE) != 0) via_feature_xcrypt |= VIA_HAS_SHA; if ((val & VIA_CPUID_HAS_PMM) != 0) via_feature_xcrypt |= VIA_HAS_MM; if (via_feature_xcrypt != 0) wrmsr(0x1107, rdmsr(0x1107) | (1 << 28)); } /* * Initialize CPU control registers */ void initializecpu(void) { uint64_t msr; uint32_t cr4; cr4 = rcr4(); if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) { cr4 |= CR4_FXSR | CR4_XMM; cpu_fxsr = hw_instruction_sse = 1; } if (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) cr4 |= CR4_FSGSBASE; /* + * If SMEP is present, we only need to flush RSB (by default) + * on context switches, to prevent cross-process ret2spec + * attacks. Do it automatically if ibrs_disable is set, to + * complete the mitigation. + * * Postpone enabling the SMEP on the boot CPU until the page * tables are switched from the boot loader identity mapping * to the kernel tables. The boot loader enables the U bit in * its tables. */ - if (!IS_BSP() && (cpu_stdext_feature & CPUID_STDEXT_SMEP)) - cr4 |= CR4_SMEP; + if (IS_BSP()) { + if (cpu_stdext_feature & CPUID_STDEXT_SMEP && + !TUNABLE_INT_FETCH( + "machdep.mitigations.cpu_flush_rsb_ctxsw", + &cpu_flush_rsb_ctxsw) && + hw_ibrs_disable) + cpu_flush_rsb_ctxsw = 1; + } else { + if (cpu_stdext_feature & CPUID_STDEXT_SMEP) + cr4 |= CR4_SMEP; + } load_cr4(cr4); if ((amd_feature & AMDID_NX) != 0) { msr = rdmsr(MSR_EFER) | EFER_NXE; wrmsr(MSR_EFER, msr); pg_nx = PG_NX; } hw_ibrs_recalculate(false); hw_ssb_recalculate(false); switch (cpu_vendor_id) { case CPU_VENDOR_AMD: init_amd(); break; case CPU_VENDOR_CENTAUR: init_via(); break; } if ((amd_feature & AMDID_RDTSCP) != 0 || (cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0) wrmsr(MSR_TSC_AUX, PCPU_GET(cpuid)); } void initializecpucache(void) { /* * CPUID with %eax = 1, %ebx returns * Bits 15-8: CLFLUSH line size * (Value * 8 = cache line size in bytes) */ if ((cpu_feature & CPUID_CLFSH) != 0) cpu_clflush_line_size = ((cpu_procinfo >> 8) & 0xff) * 8; /* * XXXKIB: (temporary) hack to work around traps generated * when CLFLUSHing APIC register window under virtualization * environments. These environments tend to disable the * CPUID_SS feature even though the native CPU supports it. */ TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable); if (vm_guest != VM_GUEST_NO && hw_clflush_disable == -1) { cpu_feature &= ~CPUID_CLFSH; cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT; } /* * The kernel's use of CLFLUSH{,OPT} can be disabled manually * by setting the hw.clflush_disable tunable. */ if (hw_clflush_disable == 1) { cpu_feature &= ~CPUID_CLFSH; cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT; } } Index: releng/11.4/sys/amd64/amd64/support.S =================================================================== --- releng/11.4/sys/amd64/amd64/support.S (revision 361587) +++ releng/11.4/sys/amd64/amd64/support.S (revision 361588) @@ -1,1168 +1,1171 @@ /*- * Copyright (c) 2018-2019 The FreeBSD Foundation * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Portions of this software were developed by * Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_ddb.h" #include #include #include #include "assym.s" .text /* * bcopy family * void bzero(void *buf, u_int len) */ /* done */ ENTRY(bzero) PUSH_FRAME_POINTER movq %rsi,%rcx xorl %eax,%eax shrq $3,%rcx rep stosq movq %rsi,%rcx andq $7,%rcx rep stosb POP_FRAME_POINTER ret END(bzero) /* Address: %rdi */ ENTRY(pagezero) PUSH_FRAME_POINTER movq $PAGE_SIZE/8,%rcx xorl %eax,%eax rep stosq POP_FRAME_POINTER ret END(pagezero) ENTRY(bcmp) PUSH_FRAME_POINTER movq %rdx,%rcx shrq $3,%rcx repe cmpsq jne 1f movq %rdx,%rcx andq $7,%rcx repe cmpsb 1: setne %al movsbl %al,%eax POP_FRAME_POINTER ret END(bcmp) /* * bcopy(src, dst, cnt) * rdi, rsi, rdx * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ENTRY(bcopy) PUSH_FRAME_POINTER xchgq %rsi,%rdi movq %rdx,%rcx movq %rdi,%rax subq %rsi,%rax cmpq %rcx,%rax /* overlapping && src < dst? */ jb 1f shrq $3,%rcx /* copy by 64-bit words */ rep movsq movq %rdx,%rcx andq $7,%rcx /* any bytes left? */ rep movsb POP_FRAME_POINTER ret /* ALIGN_TEXT */ 1: addq %rcx,%rdi /* copy backwards */ addq %rcx,%rsi decq %rdi decq %rsi andq $7,%rcx /* any fractional bytes? */ std rep movsb movq %rdx,%rcx /* copy remainder by 32-bit words */ shrq $3,%rcx subq $7,%rsi subq $7,%rdi rep movsq cld POP_FRAME_POINTER ret END(bcopy) /* * Note: memcpy does not support overlapping copies */ ENTRY(memcpy) PUSH_FRAME_POINTER movq %rdi,%rax movq %rdx,%rcx shrq $3,%rcx /* copy by 64-bit words */ rep movsq movq %rdx,%rcx andq $7,%rcx /* any bytes left? */ rep movsb POP_FRAME_POINTER ret END(memcpy) /* * pagecopy(%rdi=from, %rsi=to) */ ENTRY(pagecopy) PUSH_FRAME_POINTER movq $-PAGE_SIZE,%rax movq %rax,%rdx subq %rax,%rdi subq %rax,%rsi 1: prefetchnta (%rdi,%rax) addq $64,%rax jne 1b 2: movq (%rdi,%rdx),%rax movnti %rax,(%rsi,%rdx) movq 8(%rdi,%rdx),%rax movnti %rax,8(%rsi,%rdx) movq 16(%rdi,%rdx),%rax movnti %rax,16(%rsi,%rdx) movq 24(%rdi,%rdx),%rax movnti %rax,24(%rsi,%rdx) addq $32,%rdx jne 2b sfence POP_FRAME_POINTER ret END(pagecopy) /* fillw(pat, base, cnt) */ /* %rdi,%rsi, %rdx */ ENTRY(fillw) PUSH_FRAME_POINTER movq %rdi,%rax movq %rsi,%rdi movq %rdx,%rcx rep stosw POP_FRAME_POINTER ret END(fillw) /*****************************************************************************/ /* copyout and fubyte family */ /*****************************************************************************/ /* * Access user memory from inside the kernel. These routines should be * the only places that do this. * * These routines set curpcb->pcb_onfault for the time they execute. When a * protection violation occurs inside the functions, the trap handler * returns to *curpcb->pcb_onfault instead of the function. */ /* * copyout(from_kernel, to_user, len) * %rdi, %rsi, %rdx */ ENTRY(copyout) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rax movq $copyout_fault,PCB_ONFAULT(%rax) testq %rdx,%rdx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. This check is essential * because it prevents usermode from writing into the kernel. We do * not verify anywhere else that the user did not specify a rogue * address. */ /* * First, prevent address wrapping. */ movq %rsi,%rax addq %rdx,%rax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ movq $VM_MAXUSER_ADDRESS,%rcx cmpq %rcx,%rax ja copyout_fault xchgq %rdi,%rsi /* bcopy(%rsi, %rdi, %rdx) */ movq %rdx,%rcx shrq $3,%rcx rep movsq movb %dl,%cl andb $7,%cl rep movsb done_copyout: xorl %eax,%eax movq PCPU(CURPCB),%rdx movq %rax,PCB_ONFAULT(%rdx) POP_FRAME_POINTER ret ALIGN_TEXT copyout_fault: movq PCPU(CURPCB),%rdx movq $0,PCB_ONFAULT(%rdx) movq $EFAULT,%rax POP_FRAME_POINTER ret END(copyout) /* * copyin(from_user, to_kernel, len) * %rdi, %rsi, %rdx */ ENTRY(copyin) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rax movq $copyin_fault,PCB_ONFAULT(%rax) testq %rdx,%rdx /* anything to do? */ jz done_copyin /* * make sure address is valid */ movq %rdi,%rax addq %rdx,%rax jc copyin_fault movq $VM_MAXUSER_ADDRESS,%rcx cmpq %rcx,%rax ja copyin_fault xchgq %rdi,%rsi movq %rdx,%rcx movb %cl,%al shrq $3,%rcx /* copy longword-wise */ rep movsq movb %al,%cl andb $7,%cl /* copy remaining bytes */ rep movsb done_copyin: xorl %eax,%eax movq PCPU(CURPCB),%rdx movq %rax,PCB_ONFAULT(%rdx) POP_FRAME_POINTER ret ALIGN_TEXT copyin_fault: movq PCPU(CURPCB),%rdx movq $0,PCB_ONFAULT(%rdx) movq $EFAULT,%rax POP_FRAME_POINTER ret END(copyin) /* * casueword32. Compare and set user integer. Returns -1 on fault, * 0 if access was successful. Old value is written to *oldp. * dst = %rdi, old = %esi, oldp = %rdx, new = %ecx */ ENTRY(casueword32) PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $fusufault,PCB_ONFAULT(%r8) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault movl %esi,%eax /* old */ #ifdef SMP lock #endif cmpxchgl %ecx,(%rdi) /* new = %ecx */ /* * The old value is in %eax. If the store succeeded it will be the * value we expected (old) from before the store, otherwise it will * be the current value. Save %eax into %esi to prepare the return * value. */ movl %eax,%esi xorl %eax,%eax movq %rax,PCB_ONFAULT(%r8) /* * Access the oldp after the pcb_onfault is cleared, to correctly * catch corrupted pointer. */ movl %esi,(%rdx) /* oldp = %rdx */ POP_FRAME_POINTER ret END(casueword32) /* * casueword. Compare and set user long. Returns -1 on fault, * 0 if access was successful. Old value is written to *oldp. * dst = %rdi, old = %rsi, oldp = %rdx, new = %rcx */ ENTRY(casueword) PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $fusufault,PCB_ONFAULT(%r8) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault movq %rsi,%rax /* old */ #ifdef SMP lock #endif cmpxchgq %rcx,(%rdi) /* new = %rcx */ /* * The old value is in %rax. If the store succeeded it will be the * value we expected (old) from before the store, otherwise it will * be the current value. */ movq %rax,%rsi xorl %eax,%eax movq %rax,PCB_ONFAULT(%r8) movq %rsi,(%rdx) POP_FRAME_POINTER ret END(casueword) /* * Fetch (load) a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit * byte from user memory. * addr = %rdi, valp = %rsi */ ALTENTRY(fueword64) ENTRY(fueword) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-8,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault xorl %eax,%eax movq (%rdi),%r11 movq %rax,PCB_ONFAULT(%rcx) movq %r11,(%rsi) POP_FRAME_POINTER ret END(fueword64) END(fueword) ENTRY(fueword32) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address is valid */ ja fusufault xorl %eax,%eax movl (%rdi),%r11d movq %rax,PCB_ONFAULT(%rcx) movl %r11d,(%rsi) POP_FRAME_POINTER ret END(fueword32) /* * fuswintr() and suswintr() are specialized variants of fuword16() and * suword16(), respectively. They are called from the profiling code, * potentially at interrupt time. If they fail, that's okay; good things * will happen later. They always fail for now, until the trap code is * able to deal with this. */ ALTENTRY(suswintr) ENTRY(fuswintr) movq $-1,%rax ret END(suswintr) END(fuswintr) ENTRY(fuword16) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-2,%rax cmpq %rax,%rdi ja fusufault movzwl (%rdi),%eax movq $0,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(fuword16) ENTRY(fubyte) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-1,%rax cmpq %rax,%rdi ja fusufault movzbl (%rdi),%eax movq $0,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(fubyte) ALIGN_TEXT fusufault: movq PCPU(CURPCB),%rcx xorl %eax,%eax movq %rax,PCB_ONFAULT(%rcx) decq %rax POP_FRAME_POINTER ret /* * Store a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit byte to * user memory. * addr = %rdi, value = %rsi */ ALTENTRY(suword64) ENTRY(suword) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-8,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movq %rsi,(%rdi) xorl %eax,%eax movq PCPU(CURPCB),%rcx movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword64) END(suword) ENTRY(suword32) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-4,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movl %esi,(%rdi) xorl %eax,%eax movq PCPU(CURPCB),%rcx movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword32) ENTRY(suword16) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-2,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movw %si,(%rdi) xorl %eax,%eax movq PCPU(CURPCB),%rcx /* restore trashed register */ movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(suword16) ENTRY(subyte) PUSH_FRAME_POINTER movq PCPU(CURPCB),%rcx movq $fusufault,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS-1,%rax cmpq %rax,%rdi /* verify address validity */ ja fusufault movl %esi,%eax movb %al,(%rdi) xorl %eax,%eax movq PCPU(CURPCB),%rcx /* restore trashed register */ movq %rax,PCB_ONFAULT(%rcx) POP_FRAME_POINTER ret END(subyte) /* * copyinstr(from, to, maxlen, int *lencopied) * %rdi, %rsi, %rdx, %rcx * * copy a string from 'from' to 'to', stop when a 0 character is reached. * return ENAMETOOLONG if string is longer than maxlen, and * EFAULT on protection violations. If lencopied is non-zero, * return the actual length in *lencopied. */ ENTRY(copyinstr) PUSH_FRAME_POINTER movq %rdx,%r8 /* %r8 = maxlen */ movq %rcx,%r9 /* %r9 = *len */ xchgq %rdi,%rsi /* %rdi = from, %rsi = to */ movq PCPU(CURPCB),%rcx movq $cpystrflt,PCB_ONFAULT(%rcx) movq $VM_MAXUSER_ADDRESS,%rax /* make sure 'from' is within bounds */ subq %rsi,%rax jbe cpystrflt /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpq %rdx,%rax jae 1f movq %rax,%rdx movq %rax,%r8 1: incq %rdx 2: decq %rdx jz 3f lodsb stosb orb %al,%al jnz 2b /* Success -- 0 byte reached */ decq %rdx xorl %eax,%eax jmp cpystrflt_x 3: /* rdx is zero - return ENAMETOOLONG or EFAULT */ movq $VM_MAXUSER_ADDRESS,%rax cmpq %rax,%rsi jae cpystrflt 4: movq $ENAMETOOLONG,%rax jmp cpystrflt_x cpystrflt: movq $EFAULT,%rax cpystrflt_x: /* set *lencopied and return %eax */ movq PCPU(CURPCB),%rcx movq $0,PCB_ONFAULT(%rcx) testq %r9,%r9 jz 1f subq %rdx,%r8 movq %r8,(%r9) 1: POP_FRAME_POINTER ret END(copyinstr) /* * copystr(from, to, maxlen, int *lencopied) * %rdi, %rsi, %rdx, %rcx */ ENTRY(copystr) PUSH_FRAME_POINTER movq %rdx,%r8 /* %r8 = maxlen */ xchgq %rdi,%rsi incq %rdx 1: decq %rdx jz 4f lodsb stosb orb %al,%al jnz 1b /* Success -- 0 byte reached */ decq %rdx xorl %eax,%eax jmp 6f 4: /* rdx is zero -- return ENAMETOOLONG */ movq $ENAMETOOLONG,%rax 6: testq %rcx,%rcx jz 7f /* set *lencopied and return %rax */ subq %rdx,%r8 movq %r8,(%rcx) 7: POP_FRAME_POINTER ret END(copystr) /* * Handling of special amd64 registers and descriptor tables etc */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) /* reload the descriptor table */ lgdt (%rdi) /* flush the prefetch q */ jmp 1f nop 1: movl $KDSEL,%eax movl %eax,%ds movl %eax,%es movl %eax,%fs /* Beware, use wrmsr to set 64 bit base */ movl %eax,%gs movl %eax,%ss /* reload code selector by turning return into intersegmental return */ popq %rax pushq $KCSEL pushq %rax MEXITCOUNT lretq END(lgdt) /*****************************************************************************/ /* setjump, longjump */ /*****************************************************************************/ ENTRY(setjmp) movq %rbx,0(%rdi) /* save rbx */ movq %rsp,8(%rdi) /* save rsp */ movq %rbp,16(%rdi) /* save rbp */ movq %r12,24(%rdi) /* save r12 */ movq %r13,32(%rdi) /* save r13 */ movq %r14,40(%rdi) /* save r14 */ movq %r15,48(%rdi) /* save r15 */ movq 0(%rsp),%rdx /* get rta */ movq %rdx,56(%rdi) /* save rip */ xorl %eax,%eax /* return(0); */ ret END(setjmp) ENTRY(longjmp) movq 0(%rdi),%rbx /* restore rbx */ movq 8(%rdi),%rsp /* restore rsp */ movq 16(%rdi),%rbp /* restore rbp */ movq 24(%rdi),%r12 /* restore r12 */ movq 32(%rdi),%r13 /* restore r13 */ movq 40(%rdi),%r14 /* restore r14 */ movq 48(%rdi),%r15 /* restore r15 */ movq 56(%rdi),%rdx /* get rta */ movq %rdx,0(%rsp) /* put in return frame */ xorl %eax,%eax /* return(1); */ incl %eax ret END(longjmp) /* * Support for reading MSRs in the safe manner. */ ENTRY(rdmsr_safe) /* int rdmsr_safe(u_int msr, uint64_t *data) */ PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $msr_onfault,PCB_ONFAULT(%r8) movl %edi,%ecx rdmsr /* Read MSR pointed by %ecx. Returns hi byte in edx, lo in %eax */ salq $32,%rdx /* sign-shift %rdx left */ movl %eax,%eax /* zero-extend %eax -> %rax */ orq %rdx,%rax movq %rax,(%rsi) xorq %rax,%rax movq %rax,PCB_ONFAULT(%r8) POP_FRAME_POINTER ret /* * Support for writing MSRs in the safe manner. */ ENTRY(wrmsr_safe) /* int wrmsr_safe(u_int msr, uint64_t data) */ PUSH_FRAME_POINTER movq PCPU(CURPCB),%r8 movq $msr_onfault,PCB_ONFAULT(%r8) movl %edi,%ecx movl %esi,%eax sarq $32,%rsi movl %esi,%edx wrmsr /* Write MSR pointed by %ecx. Accepts hi byte in edx, lo in %eax. */ xorq %rax,%rax movq %rax,PCB_ONFAULT(%r8) POP_FRAME_POINTER ret /* * MSR operations fault handler */ ALIGN_TEXT msr_onfault: movq $0,PCB_ONFAULT(%r8) movl $EFAULT,%eax POP_FRAME_POINTER ret /* * void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3); * Invalidates address space addressed by ucr3, then returns to kcr3. * Done in assembler to ensure no other memory accesses happen while * on ucr3. */ ALIGN_TEXT ENTRY(pmap_pti_pcid_invalidate) pushfq cli movq %rdi,%cr3 /* to user page table */ movq %rsi,%cr3 /* back to kernel */ popfq retq /* * void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va); * Invalidates virtual address va in address space ucr3, then returns to kcr3. */ ALIGN_TEXT ENTRY(pmap_pti_pcid_invlpg) pushfq cli movq %rdi,%cr3 /* to user page table */ invlpg (%rdx) movq %rsi,%cr3 /* back to kernel */ popfq retq /* * void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva, * vm_offset_t eva); * Invalidates virtual addresses between sva and eva in address space ucr3, * then returns to kcr3. */ ALIGN_TEXT ENTRY(pmap_pti_pcid_invlrng) pushfq cli movq %rdi,%cr3 /* to user page table */ 1: invlpg (%rdx) addq $PAGE_SIZE,%rdx cmpq %rdx,%rcx ja 1b movq %rsi,%cr3 /* back to kernel */ popfq retq .altmacro - .macro ibrs_seq_label l -handle_ibrs_\l: + .macro rsb_seq_label l +rsb_seq_\l: .endm - .macro ibrs_call_label l - call handle_ibrs_\l + .macro rsb_call_label l + call rsb_seq_\l .endm - .macro ibrs_seq count + .macro rsb_seq count ll=1 .rept \count - ibrs_call_label %(ll) + rsb_call_label %(ll) nop - ibrs_seq_label %(ll) + rsb_seq_label %(ll) addq $8,%rsp ll=ll+1 .endr .endm +ENTRY(rsb_flush) + rsb_seq 32 + ret + /* all callers already saved %rax, %rdx, and %rcx */ ENTRY(handle_ibrs_entry) cmpb $0,hw_ibrs_ibpb_active(%rip) je 1f movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr orl $(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax orl $(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32,%edx wrmsr movb $1,PCPU(IBPB_SET) testl $CPUID_STDEXT_SMEP,cpu_stdext_feature(%rip) - jne 1f - ibrs_seq 32 + je rsb_flush 1: ret END(handle_ibrs_entry) ENTRY(handle_ibrs_exit) cmpb $0,PCPU(IBPB_SET) je 1f movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr andl $~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax andl $~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx wrmsr movb $0,PCPU(IBPB_SET) 1: ret END(handle_ibrs_exit) /* registers-neutral version, but needs stack */ ENTRY(handle_ibrs_exit_rs) cmpb $0,PCPU(IBPB_SET) je 1f pushq %rax pushq %rdx pushq %rcx movl $MSR_IA32_SPEC_CTRL,%ecx rdmsr andl $~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax andl $~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx wrmsr popq %rcx popq %rdx popq %rax movb $0,PCPU(IBPB_SET) 1: ret END(handle_ibrs_exit_rs) .noaltmacro /* * Flush L1D cache. Load enough of the data from the kernel text * to flush existing L1D content. * * N.B. The function does not follow ABI calling conventions, it corrupts %rbx. * The vmm.ko caller expects that only %rax, %rdx, %rbx, %rcx, %r9, and %rflags * registers are clobbered. The NMI handler caller only needs %r13 preserved. */ ENTRY(flush_l1d_sw) #define L1D_FLUSH_SIZE (64 * 1024) movq $KERNBASE, %r9 movq $-L1D_FLUSH_SIZE, %rcx /* * pass 1: Preload TLB. * Kernel text is mapped using superpages. TLB preload is * done for the benefit of older CPUs which split 2M page * into 4k TLB entries. */ 1: movb L1D_FLUSH_SIZE(%r9, %rcx), %al addq $PAGE_SIZE, %rcx jne 1b xorl %eax, %eax cpuid movq $-L1D_FLUSH_SIZE, %rcx /* pass 2: Read each cache line. */ 2: movb L1D_FLUSH_SIZE(%r9, %rcx), %al addq $64, %rcx jne 2b lfence ret #undef L1D_FLUSH_SIZE END(flush_l1d_sw) ENTRY(mds_handler_void) retq END(mds_handler_void) ENTRY(mds_handler_verw) subq $8, %rsp movw %ds, (%rsp) verw (%rsp) addq $8, %rsp retq END(mds_handler_verw) ENTRY(mds_handler_ivb) pushq %rax pushq %rdx pushq %rcx movq %cr0, %rax testb $CR0_TS, %al je 1f clts 1: movq PCPU(MDS_BUF), %rdx movdqa %xmm0, PCPU(MDS_TMP) pxor %xmm0, %xmm0 lfence orpd (%rdx), %xmm0 orpd (%rdx), %xmm0 mfence movl $40, %ecx addq $16, %rdx 2: movntdq %xmm0, (%rdx) addq $16, %rdx decl %ecx jnz 2b mfence movdqa PCPU(MDS_TMP),%xmm0 testb $CR0_TS, %al je 3f movq %rax, %cr0 3: popq %rcx popq %rdx popq %rax retq END(mds_handler_ivb) ENTRY(mds_handler_bdw) pushq %rax pushq %rbx pushq %rcx pushq %rdi pushq %rsi movq %cr0, %rax testb $CR0_TS, %al je 1f clts 1: movq PCPU(MDS_BUF), %rbx movdqa %xmm0, PCPU(MDS_TMP) pxor %xmm0, %xmm0 movq %rbx, %rdi movq %rbx, %rsi movl $40, %ecx 2: movntdq %xmm0, (%rbx) addq $16, %rbx decl %ecx jnz 2b mfence movl $1536, %ecx rep; movsb lfence movdqa PCPU(MDS_TMP),%xmm0 testb $CR0_TS, %al je 3f movq %rax, %cr0 3: popq %rsi popq %rdi popq %rcx popq %rbx popq %rax retq END(mds_handler_bdw) ENTRY(mds_handler_skl_sse) pushq %rax pushq %rdx pushq %rcx pushq %rdi movq %cr0, %rax testb $CR0_TS, %al je 1f clts 1: movq PCPU(MDS_BUF), %rdi movq PCPU(MDS_BUF64), %rdx movdqa %xmm0, PCPU(MDS_TMP) pxor %xmm0, %xmm0 lfence orpd (%rdx), %xmm0 orpd (%rdx), %xmm0 xorl %eax, %eax 2: clflushopt 5376(%rdi, %rax, 8) addl $8, %eax cmpl $8 * 12, %eax jb 2b sfence movl $6144, %ecx xorl %eax, %eax rep; stosb mfence movdqa PCPU(MDS_TMP), %xmm0 testb $CR0_TS, %al je 3f movq %rax, %cr0 3: popq %rdi popq %rcx popq %rdx popq %rax retq END(mds_handler_skl_sse) ENTRY(mds_handler_skl_avx) pushq %rax pushq %rdx pushq %rcx pushq %rdi movq %cr0, %rax testb $CR0_TS, %al je 1f clts 1: movq PCPU(MDS_BUF), %rdi movq PCPU(MDS_BUF64), %rdx vmovdqa %ymm0, PCPU(MDS_TMP) vpxor %ymm0, %ymm0, %ymm0 lfence vorpd (%rdx), %ymm0, %ymm0 vorpd (%rdx), %ymm0, %ymm0 xorl %eax, %eax 2: clflushopt 5376(%rdi, %rax, 8) addl $8, %eax cmpl $8 * 12, %eax jb 2b sfence movl $6144, %ecx xorl %eax, %eax rep; stosb mfence vmovdqa PCPU(MDS_TMP), %ymm0 testb $CR0_TS, %al je 3f movq %rax, %cr0 3: popq %rdi popq %rcx popq %rdx popq %rax retq END(mds_handler_skl_avx) ENTRY(mds_handler_skl_avx512) pushq %rax pushq %rdx pushq %rcx pushq %rdi movq %cr0, %rax testb $CR0_TS, %al je 1f clts 1: movq PCPU(MDS_BUF), %rdi movq PCPU(MDS_BUF64), %rdx vmovdqa64 %zmm0, PCPU(MDS_TMP) vpxor %zmm0, %zmm0, %zmm0 lfence vorpd (%rdx), %zmm0, %zmm0 vorpd (%rdx), %zmm0, %zmm0 xorl %eax, %eax 2: clflushopt 5376(%rdi, %rax, 8) addl $8, %eax cmpl $8 * 12, %eax jb 2b sfence movl $6144, %ecx xorl %eax, %eax rep; stosb mfence vmovdqa64 PCPU(MDS_TMP), %zmm0 testb $CR0_TS, %al je 3f movq %rax, %cr0 3: popq %rdi popq %rcx popq %rdx popq %rax retq END(mds_handler_skl_avx512) ENTRY(mds_handler_silvermont) pushq %rax pushq %rdx pushq %rcx movq %cr0, %rax testb $CR0_TS, %al je 1f clts 1: movq PCPU(MDS_BUF), %rdx movdqa %xmm0, PCPU(MDS_TMP) pxor %xmm0, %xmm0 movl $16, %ecx 2: movntdq %xmm0, (%rdx) addq $16, %rdx decl %ecx jnz 2b mfence movdqa PCPU(MDS_TMP),%xmm0 testb $CR0_TS, %al je 3f movq %rax, %cr0 3: popq %rcx popq %rdx popq %rax retq END(mds_handler_silvermont) Index: releng/11.4/sys/i386/i386/support.s =================================================================== --- releng/11.4/sys/i386/i386/support.s (revision 361587) +++ releng/11.4/sys/i386/i386/support.s (revision 361588) @@ -1,1012 +1,1034 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include "assym.s" #define IDXSHIFT 10 .text /* * bcopy family * void bzero(void *buf, u_int len) */ ENTRY(bzero) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx xorl %eax,%eax shrl $2,%ecx rep stosl movl 12(%esp),%ecx andl $3,%ecx rep stosb popl %edi ret END(bzero) ENTRY(sse2_pagezero) pushl %ebx movl 8(%esp),%ecx movl %ecx,%eax addl $4096,%eax xor %ebx,%ebx 1: movnti %ebx,(%ecx) addl $4,%ecx cmpl %ecx,%eax jne 1b sfence popl %ebx ret END(sse2_pagezero) ENTRY(i686_pagezero) pushl %edi pushl %ebx movl 12(%esp),%edi movl $1024,%ecx ALIGN_TEXT 1: xorl %eax,%eax repe scasl jnz 2f popl %ebx popl %edi ret ALIGN_TEXT 2: incl %ecx subl $4,%edi movl %ecx,%edx cmpl $16,%ecx jge 3f movl %edi,%ebx andl $0x3f,%ebx shrl %ebx shrl %ebx movl $16,%ecx subl %ebx,%ecx 3: subl %ecx,%edx rep stosl movl %edx,%ecx testl %edx,%edx jnz 1b popl %ebx popl %edi ret END(i686_pagezero) /* fillw(pat, base, cnt) */ ENTRY(fillw) pushl %edi movl 8(%esp),%eax movl 12(%esp),%edi movl 16(%esp),%ecx rep stosw popl %edi ret END(fillw) ENTRY(bcopyb) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards. */ addl %ecx,%esi decl %edi decl %esi std rep movsb popl %edi popl %esi cld ret END(bcopyb) /* * bcopy(src, dst, cnt) * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ENTRY(bcopy) pushl %ebp movl %esp,%ebp pushl %esi pushl %edi movl 8(%ebp),%esi movl 12(%ebp),%edi movl 16(%ebp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f shrl $2,%ecx /* copy by 32-bit words */ rep movsl movl 16(%ebp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %edi popl %esi popl %ebp ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards */ addl %ecx,%esi decl %edi decl %esi andl $3,%ecx /* any fractional bytes? */ std rep movsb movl 16(%ebp),%ecx /* copy remainder by 32-bit words */ shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld popl %ebp ret END(bcopy) /* * Note: memcpy does not support overlapping copies */ ENTRY(memcpy) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%ecx movl %edi,%eax shrl $2,%ecx /* copy by 32-bit words */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %esi popl %edi ret END(memcpy) /*****************************************************************************/ /* copyout and fubyte family */ /*****************************************************************************/ /* * Access user memory from inside the kernel. These routines and possibly * the math- and DOS emulators should be the only places that do this. * * We have to access the memory with user's permissions, so use a segment * selector with RPL 3. For writes to user space we have to additionally * check the PTE for write permission, because the 386 does not check * write permissions when we are executing with EPL 0. The 486 does check * this if the WP bit is set in CR0, so we can use a simpler version here. * * These routines set curpcb->pcb_onfault for the time they execute. When a * protection violation occurs inside the functions, the trap handler * returns to *curpcb->pcb_onfault instead of the function. */ /* * copyout(from_kernel, to_user, len) - MP SAFE */ ENTRY(copyout) movl PCPU(CURPCB),%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. This check is essential * because it prevents usermode from writing into the kernel. We do * not verify anywhere else that the user did not specify a rogue * address. */ /* * First, prevent address wrapping. */ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault /* bcopy(%esi, %edi, %ebx) */ movl %ebx,%ecx shrl $2,%ecx rep movsl movb %bl,%cl andb $3,%cl rep movsb done_copyout: popl %ebx popl %edi popl %esi xorl %eax,%eax movl PCPU(CURPCB),%edx movl %eax,PCB_ONFAULT(%edx) ret END(copyout) ALIGN_TEXT copyout_fault: popl %ebx popl %edi popl %esi movl PCPU(CURPCB),%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret /* * copyin(from_user, to_kernel, len) - MP SAFE */ ENTRY(copyin) movl PCPU(CURPCB),%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault movb %cl,%al shrl $2,%ecx /* copy longword-wise */ rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb popl %edi popl %esi xorl %eax,%eax movl PCPU(CURPCB),%edx movl %eax,PCB_ONFAULT(%edx) ret END(copyin) ALIGN_TEXT copyin_fault: popl %edi popl %esi movl PCPU(CURPCB),%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret /* * casueword. Compare and set user word. Returns -1 on fault, * 0 on non-faulting access. The current value is in *oldp. */ ALTENTRY(casueword32) ENTRY(casueword) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx /* dst */ movl 8(%esp),%eax /* old */ movl 16(%esp),%ecx /* new */ cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ ja fusufault #ifdef SMP lock #endif cmpxchgl %ecx,(%edx) /* Compare and set. */ /* * The old value is in %eax. If the store succeeded it will be the * value we expected (old) from before the store, otherwise it will * be the current value. */ movl PCPU(CURPCB),%ecx movl $0,PCB_ONFAULT(%ecx) movl 12(%esp),%edx /* oldp */ movl %eax,(%edx) xorl %eax,%eax ret END(casueword32) END(casueword) /* * Fetch (load) a 32-bit word, a 16-bit word, or an 8-bit byte from user * memory. */ ALTENTRY(fueword32) ENTRY(fueword) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx /* from */ cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ ja fusufault movl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) movl 8(%esp),%edx movl %eax,(%edx) xorl %eax,%eax ret END(fueword32) END(fueword) /* * fuswintr() and suswintr() are specialized variants of fuword16() and * suword16(), respectively. They are called from the profiling code, * potentially at interrupt time. If they fail, that's okay; good things * will happen later. They always fail for now, until the trap code is * able to deal with this. */ ALTENTRY(suswintr) ENTRY(fuswintr) movl $-1,%eax ret END(suswintr) END(fuswintr) ENTRY(fuword16) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-2,%edx ja fusufault movzwl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret END(fuword16) ENTRY(fubyte) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-1,%edx ja fusufault movzbl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret END(fubyte) ALIGN_TEXT fusufault: movl PCPU(CURPCB),%ecx xorl %eax,%eax movl %eax,PCB_ONFAULT(%ecx) decl %eax ret /* * Store a 32-bit word, a 16-bit word, or an 8-bit byte to user memory. * All these functions are MPSAFE. */ ALTENTRY(suword32) ENTRY(suword) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address validity */ ja fusufault movl 8(%esp),%eax movl %eax,(%edx) xorl %eax,%eax movl PCPU(CURPCB),%ecx movl %eax,PCB_ONFAULT(%ecx) ret END(suword32) END(suword) ENTRY(suword16) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-2,%edx /* verify address validity */ ja fusufault movw 8(%esp),%ax movw %ax,(%edx) xorl %eax,%eax movl PCPU(CURPCB),%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret END(suword16) ENTRY(subyte) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-1,%edx /* verify address validity */ ja fusufault movb 8(%esp),%al movb %al,(%edx) xorl %eax,%eax movl PCPU(CURPCB),%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret END(subyte) /* * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE * * copy a string from 'from' to 'to', stop when a 0 character is reached. * return ENAMETOOLONG if string is longer than maxlen, and * EFAULT on protection violations. If lencopied is non-zero, * return the actual length in *lencopied. */ ENTRY(copyinstr) pushl %esi pushl %edi movl PCPU(CURPCB),%ecx movl $cpystrflt,PCB_ONFAULT(%ecx) movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ movl $VM_MAXUSER_ADDRESS,%eax /* make sure 'from' is within bounds */ subl %esi,%eax jbe cpystrflt /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpl %edx,%eax jae 1f movl %eax,%edx movl %eax,20(%esp) 1: incl %edx 2: decl %edx jz 3f lodsb stosb orb %al,%al jnz 2b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp cpystrflt_x 3: /* edx is zero - return ENAMETOOLONG or EFAULT */ cmpl $VM_MAXUSER_ADDRESS,%esi jae cpystrflt 4: movl $ENAMETOOLONG,%eax jmp cpystrflt_x cpystrflt: movl $EFAULT,%eax cpystrflt_x: /* set *lencopied and return %eax */ movl PCPU(CURPCB),%ecx movl $0,PCB_ONFAULT(%ecx) movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 1f movl %ecx,(%edx) 1: popl %edi popl %esi ret END(copyinstr) /* * copystr(from, to, maxlen, int *lencopied) - MP SAFE */ ENTRY(copystr) pushl %esi pushl %edi movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ incl %edx 1: decl %edx jz 4f lodsb stosb orb %al,%al jnz 1b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp 6f 4: /* edx is zero -- return ENAMETOOLONG */ movl $ENAMETOOLONG,%eax 6: /* set *lencopied and return %eax */ movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 7f movl %ecx,(%edx) 7: popl %edi popl %esi ret END(copystr) ENTRY(bcmp) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%edx movl %edx,%ecx shrl $2,%ecx repe cmpsl jne 1f movl %edx,%ecx andl $3,%ecx repe cmpsb 1: setne %al movsbl %al,%eax popl %esi popl %edi ret END(bcmp) /* * Handling of special 386 registers and descriptor tables etc */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) /* reload the descriptor table */ movl 4(%esp),%eax lgdt (%eax) /* flush the prefetch q */ jmp 1f nop 1: /* reload "stale" selectors */ movl $KDSEL,%eax movl %eax,%ds movl %eax,%es movl %eax,%gs movl %eax,%ss movl $KPSEL,%eax movl %eax,%fs /* reload code selector by turning return into intersegmental return */ movl (%esp),%eax pushl %eax movl $KCSEL,4(%esp) MEXITCOUNT lret END(lgdt) /* ssdtosd(*ssdp,*sdp) */ ENTRY(ssdtosd) pushl %ebx movl 8(%esp),%ecx movl 8(%ecx),%ebx shll $16,%ebx movl (%ecx),%edx roll $16,%edx movb %dh,%bl movb %dl,%bh rorl $8,%ebx movl 4(%ecx),%eax movw %ax,%dx andl $0xf0000,%eax orl %eax,%ebx movl 12(%esp),%ecx movl %edx,(%ecx) movl %ebx,4(%ecx) popl %ebx ret END(ssdtosd) /* void reset_dbregs() */ ENTRY(reset_dbregs) movl $0,%eax movl %eax,%dr7 /* disable all breakpoints first */ movl %eax,%dr0 movl %eax,%dr1 movl %eax,%dr2 movl %eax,%dr3 movl %eax,%dr6 ret END(reset_dbregs) /*****************************************************************************/ /* setjump, longjump */ /*****************************************************************************/ ENTRY(setjmp) movl 4(%esp),%eax movl %ebx,(%eax) /* save ebx */ movl %esp,4(%eax) /* save esp */ movl %ebp,8(%eax) /* save ebp */ movl %esi,12(%eax) /* save esi */ movl %edi,16(%eax) /* save edi */ movl (%esp),%edx /* get rta */ movl %edx,20(%eax) /* save eip */ xorl %eax,%eax /* return(0); */ ret END(setjmp) ENTRY(longjmp) movl 4(%esp),%eax movl (%eax),%ebx /* restore ebx */ movl 4(%eax),%esp /* restore esp */ movl 8(%eax),%ebp /* restore ebp */ movl 12(%eax),%esi /* restore esi */ movl 16(%eax),%edi /* restore edi */ movl 20(%eax),%edx /* get rta */ movl %edx,(%esp) /* put in return frame */ xorl %eax,%eax /* return(1); */ incl %eax ret END(longjmp) /* * Support for reading MSRs in the safe manner. */ ENTRY(rdmsr_safe) /* int rdmsr_safe(u_int msr, uint64_t *data) */ movl PCPU(CURPCB),%ecx movl $msr_onfault,PCB_ONFAULT(%ecx) movl 4(%esp),%ecx rdmsr movl 8(%esp),%ecx movl %eax,(%ecx) movl %edx,4(%ecx) xorl %eax,%eax movl PCPU(CURPCB),%ecx movl %eax,PCB_ONFAULT(%ecx) ret /* * Support for writing MSRs in the safe manner. */ ENTRY(wrmsr_safe) /* int wrmsr_safe(u_int msr, uint64_t data) */ movl PCPU(CURPCB),%ecx movl $msr_onfault,PCB_ONFAULT(%ecx) movl 4(%esp),%ecx movl 8(%esp),%eax movl 12(%esp),%edx wrmsr xorl %eax,%eax movl PCPU(CURPCB),%ecx movl %eax,PCB_ONFAULT(%ecx) ret /* * MSR operations fault handler */ ALIGN_TEXT msr_onfault: movl PCPU(CURPCB),%ecx movl $0,PCB_ONFAULT(%ecx) movl $EFAULT,%eax ret -ENTRY(handle_ibrs_entry) + .altmacro + .macro rsb_seq_label l +rsb_seq_\l: + .endm + .macro rsb_call_label l + call rsb_seq_\l + .endm + .macro rsb_seq count + ll=1 + .rept \count + rsb_call_label %(ll) + nop + rsb_seq_label %(ll) + addl $4,%esp + ll=ll+1 + .endr + .endm + +ENTRY(rsb_flush) + rsb_seq 32 ret + +ENTRY(handle_ibrs_entry) + jmp rsb_flush END(handle_ibrs_entry) ENTRY(handle_ibrs_exit) ret END(handle_ibrs_exit) ENTRY(mds_handler_void) ret END(mds_handler_void) ENTRY(mds_handler_verw) subl $4, %esp movw %ds, (%esp) verw (%esp) addl $4, %esp ret END(mds_handler_verw) ENTRY(mds_handler_ivb) movl %cr0, %eax testb $CR0_TS, %al je 1f clts 1: movl PCPU(MDS_BUF), %edx movdqa %xmm0, PCPU(MDS_TMP) pxor %xmm0, %xmm0 lfence orpd (%edx), %xmm0 orpd (%edx), %xmm0 mfence movl $40, %ecx addl $16, %edx 2: movntdq %xmm0, (%edx) addl $16, %edx decl %ecx jnz 2b mfence movdqa PCPU(MDS_TMP),%xmm0 testb $CR0_TS, %al je 3f movl %eax, %cr0 3: ret END(mds_handler_ivb) ENTRY(mds_handler_bdw) movl %cr0, %eax testb $CR0_TS, %al je 1f clts 1: movl PCPU(MDS_BUF), %ebx movdqa %xmm0, PCPU(MDS_TMP) pxor %xmm0, %xmm0 movl %ebx, %edi movl %ebx, %esi movl $40, %ecx 2: movntdq %xmm0, (%ebx) addl $16, %ebx decl %ecx jnz 2b mfence movl $1536, %ecx rep; movsb lfence movdqa PCPU(MDS_TMP),%xmm0 testb $CR0_TS, %al je 3f movl %eax, %cr0 3: ret END(mds_handler_bdw) ENTRY(mds_handler_skl_sse) movl %cr0, %eax testb $CR0_TS, %al je 1f clts 1: movl PCPU(MDS_BUF), %edi movl PCPU(MDS_BUF64), %edx movdqa %xmm0, PCPU(MDS_TMP) pxor %xmm0, %xmm0 lfence orpd (%edx), %xmm0 orpd (%edx), %xmm0 xorl %eax, %eax 2: clflushopt 5376(%edi, %eax, 8) addl $8, %eax cmpl $8 * 12, %eax jb 2b sfence movl $6144, %ecx xorl %eax, %eax rep; stosb mfence movdqa PCPU(MDS_TMP), %xmm0 testb $CR0_TS, %al je 3f movl %eax, %cr0 3: ret END(mds_handler_skl_sse) ENTRY(mds_handler_skl_avx) movl %cr0, %eax testb $CR0_TS, %al je 1f clts 1: movl PCPU(MDS_BUF), %edi movl PCPU(MDS_BUF64), %edx vmovdqa %ymm0, PCPU(MDS_TMP) vpxor %ymm0, %ymm0, %ymm0 lfence vorpd (%edx), %ymm0, %ymm0 vorpd (%edx), %ymm0, %ymm0 xorl %eax, %eax 2: clflushopt 5376(%edi, %eax, 8) addl $8, %eax cmpl $8 * 12, %eax jb 2b sfence movl $6144, %ecx xorl %eax, %eax rep; stosb mfence vmovdqa PCPU(MDS_TMP), %ymm0 testb $CR0_TS, %al je 3f movl %eax, %cr0 3: ret END(mds_handler_skl_avx) ENTRY(mds_handler_skl_avx512) movl %cr0, %eax testb $CR0_TS, %al je 1f clts 1: movl PCPU(MDS_BUF), %edi movl PCPU(MDS_BUF64), %edx vmovdqa64 %zmm0, PCPU(MDS_TMP) vpxor %zmm0, %zmm0, %zmm0 lfence vorpd (%edx), %zmm0, %zmm0 vorpd (%edx), %zmm0, %zmm0 xorl %eax, %eax 2: clflushopt 5376(%edi, %eax, 8) addl $8, %eax cmpl $8 * 12, %eax jb 2b sfence movl $6144, %ecx xorl %eax, %eax rep; stosb mfence vmovdqa64 PCPU(MDS_TMP), %zmm0 testb $CR0_TS, %al je 3f movl %eax, %cr0 3: ret END(mds_handler_skl_avx512) ENTRY(mds_handler_silvermont) movl %cr0, %eax testb $CR0_TS, %al je 1f clts 1: movl PCPU(MDS_BUF), %edx movdqa %xmm0, PCPU(MDS_TMP) pxor %xmm0, %xmm0 movl $16, %ecx 2: movntdq %xmm0, (%edx) addl $16, %edx decl %ecx jnz 2b mfence movdqa PCPU(MDS_TMP),%xmm0 testb $CR0_TS, %al je 3f movl %eax, %cr0 3: ret END(mds_handler_silvermont) Index: releng/11.4/sys/x86/include/x86_var.h =================================================================== --- releng/11.4/sys/x86/include/x86_var.h (revision 361587) +++ releng/11.4/sys/x86/include/x86_var.h (revision 361588) @@ -1,160 +1,161 @@ /*- * Copyright (c) 1995 Bruce D. Evans. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _X86_X86_VAR_H_ #define _X86_X86_VAR_H_ /* * Miscellaneous machine-dependent declarations. */ extern long Maxmem; extern u_int basemem; extern int busdma_swi_pending; extern u_int cpu_exthigh; extern u_int cpu_feature; extern u_int cpu_feature2; extern u_int amd_feature; extern u_int amd_feature2; extern u_int amd_pminfo; extern u_int amd_extended_feature_extensions; extern u_int via_feature_rng; extern u_int via_feature_xcrypt; extern u_int cpu_clflush_line_size; extern u_int cpu_stdext_feature; extern u_int cpu_stdext_feature2; extern u_int cpu_stdext_feature3; extern uint64_t cpu_ia32_arch_caps; extern u_int cpu_fxsr; extern u_int cpu_high; extern u_int cpu_id; extern u_int cpu_max_ext_state_size; extern u_int cpu_mxcsr_mask; extern u_int cpu_procinfo; extern u_int cpu_procinfo2; extern char cpu_vendor[]; extern u_int cpu_vendor_id; extern u_int cpu_mon_mwait_flags; extern u_int cpu_mon_min_size; extern u_int cpu_mon_max_size; extern u_int cpu_maxphyaddr; extern char ctx_switch_xsave[]; extern u_int hv_high; extern char hv_vendor[]; extern char kstack[]; extern char sigcode[]; extern int szsigcode; extern int vm_page_dump_size; extern int workaround_erratum383; extern int _udatasel; extern int _ucodesel; extern int _ucode32sel; extern int _ufssel; extern int _ugssel; extern int use_xsave; extern uint64_t xsave_mask; extern int pti; extern int hw_ibrs_ibpb_active; extern int hw_mds_disable; extern int hw_ssb_active; extern int x86_taa_enable; +extern int cpu_flush_rsb_ctxsw; struct pcb; struct thread; struct reg; struct fpreg; struct dbreg; struct dumperinfo; struct trapframe; /* * The interface type of the interrupt handler entry point cannot be * expressed in C. Use simplest non-variadic function type as an * approximation. */ typedef void alias_for_inthand_t(void); /* * Returns the maximum physical address that can be used with the * current system. */ static __inline vm_paddr_t cpu_getmaxphyaddr(void) { #if defined(__i386__) && !defined(PAE) return (0xffffffff); #else return ((1ULL << cpu_maxphyaddr) - 1); #endif } void *alloc_fpusave(int flags); void busdma_swi(void); bool cpu_mwait_usable(void); void cpu_probe_amdc1e(void); void cpu_setregs(void); void dump_add_page(vm_paddr_t); void dump_drop_page(vm_paddr_t); void finishidentcpu(void); void identify_cpu1(void); void identify_cpu2(void); void identify_hypervisor(void); void initializecpu(void); void initializecpucache(void); bool fix_cpuid(void); void fillw(int /*u_short*/ pat, void *base, size_t cnt); int is_physical_memory(vm_paddr_t addr); int isa_nmi(int cd); void handle_ibrs_entry(void); void handle_ibrs_exit(void); void hw_ibrs_recalculate(bool all_cpus); void hw_mds_recalculate(void); void hw_ssb_recalculate(bool all_cpus); void x86_taa_recalculate(void); void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame); void nmi_call_kdb_smp(u_int type, struct trapframe *frame); void nmi_handle_intr(u_int type, struct trapframe *frame); void pagecopy(void *from, void *to); void printcpuinfo(void); int pti_get_default(void); int user_dbreg_trap(register_t dr6); int minidumpsys(struct dumperinfo *); struct pcb *get_pcb_td(struct thread *td); #define MSR_OP_ANDNOT 0x00000001 #define MSR_OP_OR 0x00000002 #define MSR_OP_WRITE 0x00000003 #define MSR_OP_LOCAL 0x10000000 #define MSR_OP_SCHED 0x20000000 #define MSR_OP_RENDEZVOUS 0x30000000 void x86_msr_op(u_int msr, u_int op, uint64_t arg1); #endif Index: releng/11.4/sys/x86/x86/cpu_machdep.c =================================================================== --- releng/11.4/sys/x86/x86/cpu_machdep.c (revision 361587) +++ releng/11.4/sys/x86/x86/cpu_machdep.c (revision 361588) @@ -1,1362 +1,1366 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_perfmon.h" #include "opt_platform.h" #ifdef __i386__ #include "opt_apic.h" #include "opt_xbox.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #include #ifdef SMP #include #endif #ifdef CPU_ELAN #include #endif #include #include #include #include #include #include #include #include #include #ifndef PC98 #include #endif #define STATE_RUNNING 0x0 #define STATE_MWAIT 0x1 #define STATE_SLEEPING 0x2 #ifdef SMP static u_int cpu_reset_proxyid; static volatile u_int cpu_reset_proxy_active; #endif struct msr_op_arg { u_int msr; int op; uint64_t arg1; }; static void x86_msr_op_one(void *argp) { struct msr_op_arg *a; uint64_t v; a = argp; switch (a->op) { case MSR_OP_ANDNOT: v = rdmsr(a->msr); v &= ~a->arg1; wrmsr(a->msr, v); break; case MSR_OP_OR: v = rdmsr(a->msr); v |= a->arg1; wrmsr(a->msr, v); break; case MSR_OP_WRITE: wrmsr(a->msr, a->arg1); break; } } #define MSR_OP_EXMODE_MASK 0xf0000000 #define MSR_OP_OP_MASK 0x000000ff void x86_msr_op(u_int msr, u_int op, uint64_t arg1) { struct thread *td; struct msr_op_arg a; u_int exmode; int bound_cpu, i, is_bound; a.op = op & MSR_OP_OP_MASK; MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR || a.op == MSR_OP_WRITE); exmode = op & MSR_OP_EXMODE_MASK; MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED || exmode == MSR_OP_RENDEZVOUS); a.msr = msr; a.arg1 = arg1; switch (exmode) { case MSR_OP_LOCAL: x86_msr_op_one(&a); break; case MSR_OP_SCHED: td = curthread; thread_lock(td); is_bound = sched_is_bound(td); bound_cpu = td->td_oncpu; CPU_FOREACH(i) { sched_bind(td, i); x86_msr_op_one(&a); } if (is_bound) sched_bind(td, bound_cpu); else sched_unbind(td); thread_unlock(td); break; case MSR_OP_RENDEZVOUS: smp_rendezvous(NULL, x86_msr_op_one, NULL, &a); break; } } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* Not applicable */ } void acpi_cpu_c1(void) { __asm __volatile("sti; hlt"); } /* * Use mwait to pause execution while waiting for an interrupt or * another thread to signal that there is more work. * * NOTE: Interrupts will cause a wakeup; however, this function does * not enable interrupt handling. The caller is responsible to enable * interrupts. */ void acpi_cpu_idle_mwait(uint32_t mwait_hint) { int *state; uint64_t v; /* * A comment in Linux patch claims that 'CPUs run faster with * speculation protection disabled. All CPU threads in a core * must disable speculation protection for it to be * disabled. Disable it while we are idle so the other * hyperthread can run fast.' * * XXXKIB. Software coordination mode should be supported, * but all Intel CPUs provide hardware coordination. */ state = (int *)PCPU_PTR(monitorbuf); KASSERT(atomic_load_int(state) == STATE_SLEEPING, ("cpu_mwait_cx: wrong monitorbuf state")); atomic_store_int(state, STATE_MWAIT); if (PCPU_GET(ibpb_set) || hw_ssb_active) { v = rdmsr(MSR_IA32_SPEC_CTRL); wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS | IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD)); } else { v = 0; } cpu_monitor(state, 0, 0); if (atomic_load_int(state) == STATE_MWAIT) cpu_mwait(MWAIT_INTRBREAK, mwait_hint); /* * SSB cannot be disabled while we sleep, or rather, if it was * disabled, the sysctl thread will bind to our cpu to tweak * MSR. */ if (v != 0) wrmsr(MSR_IA32_SPEC_CTRL, v); /* * We should exit on any event that interrupts mwait, because * that event might be a wanted interrupt. */ atomic_store_int(state, STATE_RUNNING); } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { uint64_t tsc1, tsc2; uint64_t acnt, mcnt, perf; register_t reg; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); #ifdef __i386__ if ((cpu_feature & CPUID_TSC) == 0) return (EOPNOTSUPP); #endif /* * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, * DELAY(9) based logic fails. */ if (tsc_is_invariant && !tsc_perf_stat) return (EOPNOTSUPP); #ifdef SMP if (smp_cpus > 1) { /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); } #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); if (tsc_is_invariant) { wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); tsc1 = rdtsc(); DELAY(1000); mcnt = rdmsr(MSR_MPERF); acnt = rdmsr(MSR_APERF); tsc2 = rdtsc(); intr_restore(reg); perf = 1000 * acnt / mcnt; *rate = (tsc2 - tsc1) * perf; } else { tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); *rate = (tsc2 - tsc1) * 1000; } #ifdef SMP if (smp_cpus > 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) halt(); } static void cpu_reset_real(void) { struct region_descriptor null_idt; #ifndef PC98 int b; #endif disable_intr(); #ifdef CPU_ELAN if (elan_mmcr != NULL) elan_mmcr->RESCFG = 1; #endif #ifdef __i386__ if (cpu == CPU_GEODE1100) { /* Attempt Geode's own reset */ outl(0xcf8, 0x80009044ul); outl(0xcfc, 0xf); } #endif #ifdef PC98 /* * Attempt to do a CPU reset via CPU reset port. */ if ((inb(0x35) & 0xa0) != 0xa0) { outb(0x37, 0x0f); /* SHUT0 = 0. */ outb(0x37, 0x0b); /* SHUT1 = 0. */ } outb(0xf0, 0x00); /* Reset. */ #else #if !defined(BROKEN_KEYBOARD_RESET) /* * Attempt to do a CPU reset via the keyboard controller, * do not turn off GateA20, as any machine that fails * to do the reset here would then end up in no man's land. */ outb(IO_KBD + 4, 0xFE); DELAY(500000); /* wait 0.5 sec to see if that did it */ #endif /* * Attempt to force a reset via the Reset Control register at * I/O port 0xcf9. Bit 2 forces a system reset when it * transitions from 0 to 1. Bit 1 selects the type of reset * to attempt: 0 selects a "soft" reset, and 1 selects a * "hard" reset. We try a "hard" reset. The first write sets * bit 1 to select a "hard" reset and clears bit 2. The * second write forces a 0 -> 1 transition in bit 2 to trigger * a reset. */ outb(0xcf9, 0x2); outb(0xcf9, 0x6); DELAY(500000); /* wait 0.5 sec to see if that did it */ /* * Attempt to force a reset via the Fast A20 and Init register * at I/O port 0x92. Bit 1 serves as an alternate A20 gate. * Bit 0 asserts INIT# when set to 1. We are careful to only * preserve bit 1 while setting bit 0. We also must clear bit * 0 before setting it if it isn't already clear. */ b = inb(0x92); if (b != 0xff) { if ((b & 0x1) != 0) outb(0x92, b & 0xfe); outb(0x92, b | 0x1); DELAY(500000); /* wait 0.5 sec to see if that did it */ } #endif /* PC98 */ printf("No known reset method worked, attempting CPU shutdown\n"); DELAY(1000000); /* wait 1 sec for printf to complete */ /* Wipe the IDT. */ null_idt.rd_limit = 0; null_idt.rd_base = 0; lidt(&null_idt); /* "good night, sweet prince .... " */ breakpoint(); /* NOTREACHED */ while(1); } #ifdef SMP static void cpu_reset_proxy(void) { cpu_reset_proxy_active = 1; while (cpu_reset_proxy_active == 1) ia32_pause(); /* Wait for other cpu to see that we've started */ printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid); DELAY(1000000); cpu_reset_real(); } #endif void cpu_reset(void) { #ifdef SMP cpuset_t map; u_int cnt; if (smp_started) { map = all_cpus; CPU_CLR(PCPU_GET(cpuid), &map); CPU_NAND(&map, &stopped_cpus); if (!CPU_EMPTY(&map)) { printf("cpu_reset: Stopping other CPUs\n"); stop_cpus(map); } if (PCPU_GET(cpuid) != 0) { cpu_reset_proxyid = PCPU_GET(cpuid); cpustop_restartfunc = cpu_reset_proxy; cpu_reset_proxy_active = 0; printf("cpu_reset: Restarting BSP\n"); /* Restart CPU #0. */ CPU_SETOF(0, &started_cpus); wmb(); cnt = 0; while (cpu_reset_proxy_active == 0 && cnt < 10000000) { ia32_pause(); cnt++; /* Wait for BSP to announce restart */ } if (cpu_reset_proxy_active == 0) { printf("cpu_reset: Failed to restart BSP\n"); } else { cpu_reset_proxy_active = 2; while (1) ia32_pause(); /* NOTREACHED */ } } DELAY(1000000); } #endif cpu_reset_real(); /* NOTREACHED */ } bool cpu_mwait_usable(void) { return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags & (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) == (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK))); } void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait, 0, "Use MONITOR/MWAIT for short idle"); #ifndef PC98 static void cpu_idle_acpi(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); atomic_store_int(state, STATE_SLEEPING); /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) enable_intr(); else if (cpu_idle_hook) cpu_idle_hook(sbt); else acpi_cpu_c1(); atomic_store_int(state, STATE_RUNNING); } #endif /* !PC98 */ static void cpu_idle_hlt(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); atomic_store_int(state, STATE_SLEEPING); /* * Since we may be in a critical section from cpu_idle(), if * an interrupt fires during that critical section we may have * a pending preemption. If the CPU halts, then that thread * may not execute until a later interrupt awakens the CPU. * To handle this race, check for a runnable thread after * disabling interrupts and immediately return if one is * found. Also, we must absolutely guarentee that hlt is * the next instruction after sti. This ensures that any * interrupt that fires after the call to disable_intr() will * immediately awaken the CPU from hlt. Finally, please note * that on x86 this works fine because of interrupts enabled only * after the instruction following sti takes place, while IF is set * to 1 immediately, allowing hlt instruction to acknowledge the * interrupt. */ disable_intr(); if (sched_runnable()) enable_intr(); else acpi_cpu_c1(); atomic_store_int(state, STATE_RUNNING); } static void cpu_idle_mwait(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); atomic_store_int(state, STATE_MWAIT); /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) { atomic_store_int(state, STATE_RUNNING); enable_intr(); return; } cpu_monitor(state, 0, 0); if (atomic_load_int(state) == STATE_MWAIT) __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); else enable_intr(); atomic_store_int(state, STATE_RUNNING); } static void cpu_idle_spin(sbintime_t sbt) { int *state; int i; state = (int *)PCPU_PTR(monitorbuf); atomic_store_int(state, STATE_RUNNING); /* * The sched_runnable() call is racy but as long as there is * a loop missing it one time will have just a little impact if any * (and it is much better than missing the check at all). */ for (i = 0; i < 1000; i++) { if (sched_runnable()) return; cpu_spinwait(); } } /* * C1E renders the local APIC timer dead, so we disable it by * reading the Interrupt Pending Message register and clearing * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). * * Reference: * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" * #32559 revision 3.00+ */ #define MSR_AMDK8_IPM 0xc0010055 #define AMDK8_SMIONCMPHALT (1ULL << 27) #define AMDK8_C1EONCMPHALT (1ULL << 28) #define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) void cpu_probe_amdc1e(void) { /* * Detect the presence of C1E capability mostly on latest * dual-cores (or future) k8 family. */ if (cpu_vendor_id == CPU_VENDOR_AMD && (cpu_id & 0x00000f00) == 0x00000f00 && (cpu_id & 0x0fff0000) >= 0x00040000) { cpu_ident_amdc1e = 1; } } #if defined(__i386__) && defined(PC98) void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt; #else void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; #endif void cpu_idle(int busy) { uint64_t msr; sbintime_t sbt = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); #ifdef MP_WATCHDOG ap_watchdog(PCPU_GET(cpuid)); #endif /* If we are busy - try to use fast methods. */ if (busy) { if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { cpu_idle_mwait(busy); goto out; } } /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); sbt = cpu_idleclock(); } /* Apply AMD APIC timer C1E workaround. */ if (cpu_ident_amdc1e && cpu_disable_c3_sleep) { msr = rdmsr(MSR_AMDK8_IPM); if (msr & AMDK8_CMPHALT) wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); } /* Call main idle method. */ cpu_idle_fn(sbt); /* Switch timers back into active mode. */ if (!busy) { cpu_activeclock(); critical_exit(); } out: CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } static int cpu_idle_apl31_workaround; SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW, &cpu_idle_apl31_workaround, 0, "Apollo Lake APL31 MWAIT bug workaround"); int cpu_idle_wakeup(int cpu) { int *state; state = (int *)pcpu_find(cpu)->pc_monitorbuf; switch (atomic_load_int(state)) { case STATE_SLEEPING: return (0); case STATE_MWAIT: atomic_store_int(state, STATE_RUNNING); return (cpu_idle_apl31_workaround ? 0 : 1); case STATE_RUNNING: return (1); default: panic("bad monitor state"); return (1); } } /* * Ordered by speed/power consumption. */ static struct { void *id_fn; char *id_name; int id_cpuid2_flag; } idle_tbl[] = { { .id_fn = cpu_idle_spin, .id_name = "spin" }, { .id_fn = cpu_idle_mwait, .id_name = "mwait", .id_cpuid2_flag = CPUID2_MON }, { .id_fn = cpu_idle_hlt, .id_name = "hlt" }, #if !defined(__i386__) || !defined(PC98) { .id_fn = cpu_idle_acpi, .id_name = "acpi" }, #endif }; static int idle_sysctl_available(SYSCTL_HANDLER_ARGS) { char *avail, *p; int error; int i; avail = malloc(256, M_TEMP, M_WAITOK); p = avail; for (i = 0; i < nitems(idle_tbl); i++) { if (idle_tbl[i].id_cpuid2_flag != 0 && (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0) continue; #if !defined(__i386__) || !defined(PC98) if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; #endif p += sprintf(p, "%s%s", p != avail ? ", " : "", idle_tbl[i].id_name); } error = sysctl_handle_string(oidp, avail, 0, req); free(avail, M_TEMP); return (error); } SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, idle_sysctl_available, "A", "list of available idle functions"); static bool cpu_idle_selector(const char *new_idle_name) { int i; for (i = 0; i < nitems(idle_tbl); i++) { if (idle_tbl[i].id_cpuid2_flag != 0 && (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0) continue; #if !defined(__i386__) || !defined(PC98) if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; #endif if (strcmp(idle_tbl[i].id_name, new_idle_name)) continue; cpu_idle_fn = idle_tbl[i].id_fn; if (bootverbose) printf("CPU idle set to %s\n", idle_tbl[i].id_name); return (true); } return (false); } static int cpu_idle_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16], *p; int error, i; p = "unknown"; for (i = 0; i < nitems(idle_tbl); i++) { if (idle_tbl[i].id_fn == cpu_idle_fn) { p = idle_tbl[i].id_name; break; } } strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); return (cpu_idle_selector(buf) ? 0 : EINVAL); } SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, cpu_idle_sysctl, "A", "currently selected idle function"); static void cpu_idle_tun(void *unused __unused) { char tunvar[16]; if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar))) cpu_idle_selector(tunvar); else if (cpu_vendor_id == CPU_VENDOR_AMD && CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) { /* Ryzen erratas 1057, 1109. */ cpu_idle_selector("hlt"); idle_mwait = 0; } if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) { /* * Apollo Lake errata APL31 (public errata APL30). * Stores to the armed address range may not trigger * MWAIT to resume execution. OS needs to use * interrupts to wake processors from MWAIT-induced * sleep states. */ cpu_idle_apl31_workaround = 1; } TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround); } SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL); static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN, &panic_on_nmi, 0, "Panic on NMI raised by hardware failure"); int nmi_is_broadcast = 1; SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN, &nmi_is_broadcast, 0, "Chipset NMI is broadcast"); #ifdef KDB int kdb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN, &kdb_on_nmi, 0, "Go to KDB on NMI with unknown source"); #endif void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame) { bool claimed = false; #ifdef DEV_ISA /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(frame->tf_err)) { claimed = true; if (panic_on_nmi) panic("NMI indicates hardware failure"); } #endif /* DEV_ISA */ #ifdef KDB if (!claimed && kdb_on_nmi) { /* * NMI can be hooked up to a pushbutton for debugging. */ printf("NMI/cpu%d ... going to debugger\n", cpu); kdb_trap(type, 0, frame); } #endif /* KDB */ } void nmi_handle_intr(u_int type, struct trapframe *frame) { #ifdef SMP if (nmi_is_broadcast) { nmi_call_kdb_smp(type, frame); return; } #endif nmi_call_kdb(PCPU_GET(cpuid), type, frame); } static int hw_ibrs_active; int hw_ibrs_ibpb_active; int hw_ibrs_disable = 1; SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0, "Indirect Branch Restricted Speculation active"); void hw_ibrs_recalculate(bool for_all_cpus) { if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) { x86_msr_op(MSR_IA32_SPEC_CTRL, (for_all_cpus ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL) | (hw_ibrs_disable != 0 ? MSR_OP_ANDNOT : MSR_OP_OR), IA32_SPEC_CTRL_IBRS); hw_ibrs_active = hw_ibrs_disable == 0; hw_ibrs_ibpb_active = 0; } else { hw_ibrs_active = hw_ibrs_ibpb_active = (cpu_stdext_feature3 & CPUID_STDEXT3_IBPB) != 0 && !hw_ibrs_disable; } } static int hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = hw_ibrs_disable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); hw_ibrs_disable = val != 0; hw_ibrs_recalculate(true); return (0); } SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I", "Disable Indirect Branch Restricted Speculation"); int hw_ssb_active; int hw_ssb_disable; SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD, &hw_ssb_active, 0, "Speculative Store Bypass Disable active"); static void hw_ssb_set(bool enable, bool for_all_cpus) { if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) { hw_ssb_active = 0; return; } hw_ssb_active = enable; x86_msr_op(MSR_IA32_SPEC_CTRL, (enable ? MSR_OP_OR : MSR_OP_ANDNOT) | (for_all_cpus ? MSR_OP_SCHED : MSR_OP_LOCAL), IA32_SPEC_CTRL_SSBD); } void hw_ssb_recalculate(bool all_cpus) { switch (hw_ssb_disable) { default: hw_ssb_disable = 0; /* FALLTHROUGH */ case 0: /* off */ hw_ssb_set(false, all_cpus); break; case 1: /* on */ hw_ssb_set(true, all_cpus); break; case 2: /* auto */ hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ? false : true, all_cpus); break; } } static int hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = hw_ssb_disable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); hw_ssb_disable = val; hw_ssb_recalculate(true); return (0); } SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ssb_disable_handler, "I", "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto"); int hw_mds_disable; /* * Handler for Microarchitectural Data Sampling issues. Really not a * pointer to C function: on amd64 the code must not change any CPU * architectural state except possibly %rflags. Also, it is always * called with interrupts disabled. */ void mds_handler_void(void); void mds_handler_verw(void); void mds_handler_ivb(void); void mds_handler_bdw(void); void mds_handler_skl_sse(void); void mds_handler_skl_avx(void); void mds_handler_skl_avx512(void); void mds_handler_silvermont(void); void (*mds_handler)(void) = mds_handler_void; static int sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS) { const char *state; if (mds_handler == mds_handler_void) state = "inactive"; else if (mds_handler == mds_handler_verw) state = "VERW"; else if (mds_handler == mds_handler_ivb) state = "software IvyBridge"; else if (mds_handler == mds_handler_bdw) state = "software Broadwell"; else if (mds_handler == mds_handler_skl_sse) state = "software Skylake SSE"; else if (mds_handler == mds_handler_skl_avx) state = "software Skylake AVX"; else if (mds_handler == mds_handler_skl_avx512) state = "software Skylake AVX512"; else if (mds_handler == mds_handler_silvermont) state = "software Silvermont"; else state = "unknown"; return (SYSCTL_OUT(req, state, strlen(state))); } SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_mds_disable_state_handler, "A", "Microarchitectural Data Sampling Mitigation state"); _Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512"); void hw_mds_recalculate(void) { struct pcpu *pc; vm_offset_t b64; u_long xcr0; int i; /* * Allow user to force VERW variant even if MD_CLEAR is not * reported. For instance, hypervisor might unknowingly * filter the cap out. * For the similar reasons, and for testing, allow to enable - * mitigation even for RDCL_NO or MDS_NO caps. + * mitigation even when MDS_NO cap is set. */ if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 || - ((cpu_ia32_arch_caps & (IA32_ARCH_CAP_RDCL_NO | - IA32_ARCH_CAP_MDS_NO)) != 0 && hw_mds_disable == 3)) { + ((cpu_ia32_arch_caps & IA32_ARCH_CAP_MDS_NO) != 0 && + hw_mds_disable == 3)) { mds_handler = mds_handler_void; } else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 && hw_mds_disable == 3) || hw_mds_disable == 1) { mds_handler = mds_handler_verw; } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e || CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a || CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 || CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d || CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e || CPUID_TO_MODEL(cpu_id) == 0x3a) && (hw_mds_disable == 2 || hw_mds_disable == 3)) { /* * Nehalem, SandyBridge, IvyBridge */ CPU_FOREACH(i) { pc = pcpu_find(i); if (pc->pc_mds_buf == NULL) { pc->pc_mds_buf = malloc(672, M_TEMP, M_WAITOK); bzero(pc->pc_mds_buf, 16); } } mds_handler = mds_handler_ivb; } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c || CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 || CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f || CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) && (hw_mds_disable == 2 || hw_mds_disable == 3)) { /* * Haswell, Broadwell */ CPU_FOREACH(i) { pc = pcpu_find(i); if (pc->pc_mds_buf == NULL) { pc->pc_mds_buf = malloc(1536, M_TEMP, M_WAITOK); bzero(pc->pc_mds_buf, 16); } } mds_handler = mds_handler_bdw; } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id & CPUID_STEPPING) <= 5) || CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e || (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id & CPUID_STEPPING) <= 0xb) || (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id & CPUID_STEPPING) <= 0xc)) && (hw_mds_disable == 2 || hw_mds_disable == 3)) { /* * Skylake, KabyLake, CoffeeLake, WhiskeyLake, * CascadeLake */ CPU_FOREACH(i) { pc = pcpu_find(i); if (pc->pc_mds_buf == NULL) { pc->pc_mds_buf = malloc(6 * 1024, M_TEMP, M_WAITOK); b64 = (vm_offset_t)malloc(64 + 63, M_TEMP, M_WAITOK); pc->pc_mds_buf64 = (void *)roundup2(b64, 64); bzero(pc->pc_mds_buf64, 64); } } xcr0 = rxcr(0); if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 && (cpu_stdext_feature2 & CPUID_STDEXT_AVX512DQ) != 0) mds_handler = mds_handler_skl_avx512; else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 && (cpu_feature2 & CPUID2_AVX) != 0) mds_handler = mds_handler_skl_avx; else mds_handler = mds_handler_skl_sse; } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && ((CPUID_TO_MODEL(cpu_id) == 0x37 || CPUID_TO_MODEL(cpu_id) == 0x4a || CPUID_TO_MODEL(cpu_id) == 0x4c || CPUID_TO_MODEL(cpu_id) == 0x4d || CPUID_TO_MODEL(cpu_id) == 0x5a || CPUID_TO_MODEL(cpu_id) == 0x5d || CPUID_TO_MODEL(cpu_id) == 0x6e || CPUID_TO_MODEL(cpu_id) == 0x65 || CPUID_TO_MODEL(cpu_id) == 0x75 || CPUID_TO_MODEL(cpu_id) == 0x1c || CPUID_TO_MODEL(cpu_id) == 0x26 || CPUID_TO_MODEL(cpu_id) == 0x27 || CPUID_TO_MODEL(cpu_id) == 0x35 || CPUID_TO_MODEL(cpu_id) == 0x36 || CPUID_TO_MODEL(cpu_id) == 0x7a))) { /* Silvermont, Airmont */ CPU_FOREACH(i) { pc = pcpu_find(i); if (pc->pc_mds_buf == NULL) pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK); } mds_handler = mds_handler_silvermont; } else { hw_mds_disable = 0; mds_handler = mds_handler_void; } } static void hw_mds_recalculate_boot(void *arg __unused) { hw_mds_recalculate(); } SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL); static int sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = hw_mds_disable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val < 0 || val > 3) return (EINVAL); hw_mds_disable = val; hw_mds_recalculate(); return (0); } SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, sysctl_mds_disable_handler, "I", "Microarchitectural Data Sampling Mitigation " "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO"); /* * Intel Transactional Memory Asynchronous Abort Mitigation * CVE-2019-11135 */ int x86_taa_enable; int x86_taa_state; enum { TAA_NONE = 0, /* No mitigation enabled */ TAA_TSX_DISABLE = 1, /* Disable TSX via MSR */ TAA_VERW = 2, /* Use VERW mitigation */ TAA_AUTO = 3, /* Automatically select the mitigation */ /* The states below are not selectable by the operator */ TAA_TAA_UC = 4, /* Mitigation present in microcode */ TAA_NOT_PRESENT = 5 /* TSX is not present */ }; static void taa_set(bool enable, bool all) { x86_msr_op(MSR_IA32_TSX_CTRL, (enable ? MSR_OP_OR : MSR_OP_ANDNOT) | (all ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL), IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR); } void x86_taa_recalculate(void) { static int taa_saved_mds_disable = 0; int taa_need = 0, taa_state = 0; int mds_disable = 0, need_mds_recalc = 0; /* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */ if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 || (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) { /* TSX is not present */ x86_taa_state = TAA_NOT_PRESENT; return; } /* Check to see what mitigation options the CPU gives us */ if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) { /* CPU is not suseptible to TAA */ taa_need = TAA_TAA_UC; } else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) { /* * CPU can turn off TSX. This is the next best option * if TAA_NO hardware mitigation isn't present */ taa_need = TAA_TSX_DISABLE; } else { /* No TSX/TAA specific remedies are available. */ if (x86_taa_enable == TAA_TSX_DISABLE) { if (bootverbose) printf("TSX control not available\n"); return; } else taa_need = TAA_VERW; } /* Can we automatically take action, or are we being forced? */ if (x86_taa_enable == TAA_AUTO) taa_state = taa_need; else taa_state = x86_taa_enable; /* No state change, nothing to do */ if (taa_state == x86_taa_state) { if (bootverbose) printf("No TSX change made\n"); return; } /* Does the MSR need to be turned on or off? */ if (taa_state == TAA_TSX_DISABLE) taa_set(true, true); else if (x86_taa_state == TAA_TSX_DISABLE) taa_set(false, true); /* Does MDS need to be set to turn on VERW? */ if (taa_state == TAA_VERW) { taa_saved_mds_disable = hw_mds_disable; mds_disable = hw_mds_disable = 1; need_mds_recalc = 1; } else if (x86_taa_state == TAA_VERW) { mds_disable = hw_mds_disable = taa_saved_mds_disable; need_mds_recalc = 1; } if (need_mds_recalc) { hw_mds_recalculate(); if (mds_disable != hw_mds_disable) { if (bootverbose) printf("Cannot change MDS state for TAA\n"); /* Don't update our state */ return; } } x86_taa_state = taa_state; return; } static void taa_recalculate_boot(void * arg __unused) { x86_taa_recalculate(); } SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL); SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa, CTLFLAG_RW, 0, "TSX Asynchronous Abort Mitigation"); static int sysctl_taa_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = x86_taa_enable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val < TAA_NONE || val > TAA_AUTO) return (EINVAL); x86_taa_enable = val; x86_taa_recalculate(); return (0); } SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, sysctl_taa_handler, "I", "TAA Mitigation enablement control " "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO"); static int sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS) { const char *state; switch (x86_taa_state) { case TAA_NONE: state = "inactive"; break; case TAA_TSX_DISABLE: state = "TSX disabled"; break; case TAA_VERW: state = "VERW"; break; case TAA_TAA_UC: state = "Mitigated in microcode"; break; case TAA_NOT_PRESENT: state = "TSX not present"; break; default: state = "unknown"; } return (SYSCTL_OUT(req, state, strlen(state))); } SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_taa_state_handler, "A", "TAA Mitigation state"); +int __read_frequently cpu_flush_rsb_ctxsw; +SYSCTL_INT(_machdep_mitigations, OID_AUTO, flush_rsb_ctxsw, + CTLFLAG_RW | CTLFLAG_NOFETCH, &cpu_flush_rsb_ctxsw, 0, + "Flush Return Stack Buffer on context switch"); Index: releng/11.4 =================================================================== --- releng/11.4 (revision 361587) +++ releng/11.4 (revision 361588) Property changes on: releng/11.4 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,2 ## Merged /head:r361299,361302 Merged /stable/11:r361558,361561