diff --git a/sys/amd64/acpica/acpi_wakeup.c b/sys/amd64/acpica/acpi_wakeup.c index 3b21935255cb..0f9f987b6590 100644 --- a/sys/amd64/acpica/acpi_wakeup.c +++ b/sys/amd64/acpica/acpi_wakeup.c @@ -1,463 +1,464 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2001 Takanori Watanabe * Copyright (c) 2001-2012 Mitsuru IWASAKI * Copyright (c) 2003 Peter Wemm * Copyright (c) 2008-2012 Jung-uk Kim * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #include #endif #include #include #include "acpi_wakecode.h" #include "acpi_wakedata.h" /* Make sure the code is less than a page and leave room for the stack. */ CTASSERT(sizeof(wakecode) < PAGE_SIZE - 1024); extern int acpi_resume_beep; extern int acpi_reset_video; extern int acpi_susp_bounce; #ifdef SMP extern struct susppcb **susppcbs; static cpuset_t suspcpus; #else static struct susppcb **susppcbs; #endif static void acpi_stop_beep(void *); #ifdef SMP static int acpi_wakeup_ap(struct acpi_softc *, int); static void acpi_wakeup_cpus(struct acpi_softc *); #endif #define ACPI_WAKEPT_PAGES 7 #define WAKECODE_FIXUP(offset, type, val) do { \ type *addr; \ addr = (type *)(sc->acpi_wakeaddr + (offset)); \ *addr = val; \ } while (0) static void acpi_stop_beep(void *arg) { if (acpi_resume_beep != 0) timer_spkr_release(); } #ifdef SMP static int acpi_wakeup_ap(struct acpi_softc *sc, int cpu) { struct pcb *pcb; int vector = (sc->acpi_wakephys >> 12) & 0xff; int apic_id = cpu_apic_ids[cpu]; int ms; pcb = &susppcbs[cpu]->sp_pcb; WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb); WAKECODE_FIXUP(wakeup_gdt, uint16_t, pcb->pcb_gdt.rd_limit); WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, pcb->pcb_gdt.rd_base); ipi_startup(apic_id, vector); /* Wait up to 5 seconds for it to resume. */ for (ms = 0; ms < 5000; ms++) { if (!CPU_ISSET(cpu, &suspended_cpus)) return (1); /* return SUCCESS */ DELAY(1000); } return (0); /* return FAILURE */ } #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) static void acpi_wakeup_cpus(struct acpi_softc *sc) { uint32_t mpbioswarmvec; int cpu; u_char mpbiosreason; if (!efi_boot) { /* save the current value of the warm-start vector */ mpbioswarmvec = *((uint32_t *)WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); /* setup a vector to our boot code */ *((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *)WARMBOOT_SEG) = sc->acpi_wakephys >> 4; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ } /* Wake up each AP. */ for (cpu = 1; cpu < mp_ncpus; cpu++) { if (!CPU_ISSET(cpu, &suspcpus)) continue; if (acpi_wakeup_ap(sc, cpu) == 0) { /* restore the warmstart vector */ *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec; panic("acpi_wakeup: failed to resume AP #%d (PHY #%d)", cpu, cpu_apic_ids[cpu]); } } if (!efi_boot) { /* restore the warmstart vector */ *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); } } #endif int acpi_sleep_machdep(struct acpi_softc *sc, int state) { ACPI_STATUS status; struct pcb *pcb; struct pcpu *pc; int i; if (sc->acpi_wakeaddr == 0ul) return (-1); /* couldn't alloc wake memory */ #ifdef SMP suspcpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &suspcpus); #endif if (acpi_resume_beep != 0) timer_spkr_acquire(); AcpiSetFirmwareWakingVector(sc->acpi_wakephys, 0); intr_suspend(); pcb = &susppcbs[0]->sp_pcb; if (savectx(pcb)) { fpususpend(susppcbs[0]->sp_fpususpend); #ifdef SMP if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0) { device_printf(sc->acpi_dev, "Failed to suspend APs\n"); return (0); /* couldn't sleep */ } #endif hw_ibrs_ibpb_active = 0; hw_ssb_active = 0; cpu_stdext_feature3 = 0; CPU_FOREACH(i) { pc = pcpu_find(i); pc->pc_ibpb_set = 0; } WAKECODE_FIXUP(resume_beep, uint8_t, (acpi_resume_beep != 0)); WAKECODE_FIXUP(reset_video, uint8_t, (acpi_reset_video != 0)); WAKECODE_FIXUP(wakeup_efer, uint64_t, rdmsr(MSR_EFER) & ~(EFER_LMA)); WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb); WAKECODE_FIXUP(wakeup_gdt, uint16_t, pcb->pcb_gdt.rd_limit); WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, pcb->pcb_gdt.rd_base); /* Call ACPICA to enter the desired sleep state */ if (state == ACPI_STATE_S4 && sc->acpi_s4bios) status = AcpiEnterSleepStateS4bios(); else status = AcpiEnterSleepState(state); if (ACPI_FAILURE(status)) { device_printf(sc->acpi_dev, "AcpiEnterSleepState failed - %s\n", AcpiFormatException(status)); return (0); /* couldn't sleep */ } if (acpi_susp_bounce) resumectx(pcb); for (;;) ia32_pause(); } else { /* * Re-initialize console hardware as soon as possible. * No console output (e.g. printf) is allowed before * this point. */ cnresume(); fpuresume(susppcbs[0]->sp_fpususpend); } return (1); /* wakeup successfully */ } int acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result, int intr_enabled) { if (sleep_result == -1) return (sleep_result); if (!intr_enabled) { /* Wakeup MD procedures in interrupt disabled context */ if (sleep_result == 1) { ucode_reload(); pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); lapic_xapic_mode(); #ifdef SMP if (!CPU_EMPTY(&suspcpus)) acpi_wakeup_cpus(sc); #endif } #ifdef SMP if (!CPU_EMPTY(&suspcpus)) resume_cpus(suspcpus); #endif /* * Re-read cpu_stdext_feature3, which was zeroed-out * in acpi_sleep_machdep(), after the microcode was * reloaded. Then recalculate the active mitigation * knobs that depend on the microcode and * cpu_stdext_feature3. Do it after LAPICs are woken, * so that IPIs work. */ identify_cpu_ext_features(); mca_resume(); if (vmm_resume_p != NULL) vmm_resume_p(); intr_resume(/*suspend_cancelled*/false); hw_ibrs_recalculate(true); amd64_syscall_ret_flush_l1d_recalc(); hw_ssb_recalculate(true); x86_rngds_mitg_recalculate(true); + zenbleed_check_and_apply(true); AcpiSetFirmwareWakingVector(0, 0); } else { /* Wakeup MD procedures in interrupt enabled context */ if (sleep_result == 1 && mem_range_softc.mr_op != NULL && mem_range_softc.mr_op->reinit != NULL) mem_range_softc.mr_op->reinit(&mem_range_softc); } return (sleep_result); } static void acpi_alloc_wakeup_handler(void **wakeaddr, void *wakept_pages[ACPI_WAKEPT_PAGES]) { vm_page_t wakept_m[ACPI_WAKEPT_PAGES]; int i; *wakeaddr = NULL; memset(wakept_pages, 0, ACPI_WAKEPT_PAGES * sizeof(*wakept_pages)); memset(wakept_m, 0, ACPI_WAKEPT_PAGES * sizeof(*wakept_m)); /* * Specify the region for our wakeup code. We want it in the * low 1 MB region, excluding real mode IVT (0-0x3ff), BDA * (0x400-0x4ff), EBDA (less than 128KB, below 0xa0000, must * be excluded by SMAP and DSDT), and ROM area (0xa0000 and * above). */ *wakeaddr = contigmalloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT, 0x500, 0xa0000, PAGE_SIZE, 0ul); if (*wakeaddr == NULL) { printf("%s: can't alloc wake memory\n", __func__); goto freepages; } for (i = 0; i < ACPI_WAKEPT_PAGES - (la57 ? 0 : 1); i++) { wakept_m[i] = pmap_page_alloc_below_4g(true); wakept_pages[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS( wakept_m[i])); } if (EVENTHANDLER_REGISTER(power_resume, acpi_stop_beep, NULL, EVENTHANDLER_PRI_LAST) == NULL) { printf("%s: can't register event handler\n", __func__); goto freepages; } susppcbs = malloc(mp_ncpus * sizeof(*susppcbs), M_DEVBUF, M_WAITOK); for (i = 0; i < mp_ncpus; i++) { susppcbs[i] = malloc(sizeof(**susppcbs), M_DEVBUF, M_WAITOK); susppcbs[i]->sp_fpususpend = alloc_fpusave(M_WAITOK); } return; freepages: if (*wakeaddr != NULL) contigfree(*wakeaddr, PAGE_SIZE, M_DEVBUF); for (i = 0; i < ACPI_WAKEPT_PAGES; i++) { if (wakept_m[i] != NULL) vm_page_free(wakept_m[i]); } *wakeaddr = NULL; } void acpi_install_wakeup_handler(struct acpi_softc *sc) { static void *wakeaddr; void *wakept_pages[ACPI_WAKEPT_PAGES]; uint64_t *pt5, *pt4, *pt3, *pt2_0, *pt2_1, *pt2_2, *pt2_3; vm_paddr_t pt5pa, pt4pa, pt3pa, pt2_0pa, pt2_1pa, pt2_2pa, pt2_3pa; int i; if (wakeaddr != NULL) return; acpi_alloc_wakeup_handler(&wakeaddr, wakept_pages); if (wakeaddr == NULL) return; sc->acpi_wakeaddr = (vm_offset_t)wakeaddr; sc->acpi_wakephys = vtophys(wakeaddr); if (la57) { pt5 = wakept_pages[6]; pt5pa = vtophys(pt5); } pt4 = wakept_pages[0]; pt3 = wakept_pages[1]; pt2_0 = wakept_pages[2]; pt2_1 = wakept_pages[3]; pt2_2 = wakept_pages[4]; pt2_3 = wakept_pages[5]; pt4pa = vtophys(pt4); pt3pa = vtophys(pt3); pt2_0pa = vtophys(pt2_0); pt2_1pa = vtophys(pt2_1); pt2_2pa = vtophys(pt2_2); pt2_3pa = vtophys(pt2_3); bcopy(wakecode, (void *)sc->acpi_wakeaddr, sizeof(wakecode)); /* Patch GDT base address, ljmp targets. */ WAKECODE_FIXUP((bootgdtdesc + 2), uint32_t, sc->acpi_wakephys + bootgdt); WAKECODE_FIXUP((wakeup_sw32 + 2), uint32_t, sc->acpi_wakephys + wakeup_32); WAKECODE_FIXUP((wakeup_sw64 + 1), uint32_t, sc->acpi_wakephys + wakeup_64); WAKECODE_FIXUP(wakeup_pagetables, uint32_t, la57 ? (pt5pa | 0x1) : pt4pa); /* Save pointers to some global data. */ WAKECODE_FIXUP(wakeup_ret, void *, resumectx); /* Create 1:1 mapping for the low 4G */ if (la57) { bcopy(kernel_pmap->pm_pmltop, pt5, PAGE_SIZE); pt5[0] = (uint64_t)pt4pa; pt5[0] |= PG_V | PG_RW | PG_U; } else { bcopy(kernel_pmap->pm_pmltop, pt4, PAGE_SIZE); } pt4[0] = (uint64_t)pt3pa; pt4[0] |= PG_V | PG_RW | PG_U; pt3[0] = (uint64_t)pt2_0pa; pt3[0] |= PG_V | PG_RW | PG_U; pt3[1] = (uint64_t)pt2_1pa; pt3[1] |= PG_V | PG_RW | PG_U; pt3[2] = (uint64_t)pt2_2pa; pt3[2] |= PG_V | PG_RW | PG_U; pt3[3] = (uint64_t)pt2_3pa; pt3[3] |= PG_V | PG_RW | PG_U; for (i = 0; i < NPDEPG; i++) { pt2_0[i] = (pd_entry_t)i * NBPDR; pt2_0[i] |= PG_V | PG_RW | PG_PS | PG_U; } for (i = 0; i < NPDEPG; i++) { pt2_1[i] = (pd_entry_t)NBPDP + i * NBPDR; pt2_1[i] |= PG_V | PG_RW | PG_PS | PG_U; } for (i = 0; i < NPDEPG; i++) { pt2_2[i] = (pd_entry_t)2 * NBPDP + i * NBPDR; pt2_2[i] |= PG_V | PG_RW | PG_PS | PG_U; } for (i = 0; i < NPDEPG; i++) { pt2_3[i] = (pd_entry_t)3 * NBPDP + i * NBPDR; pt2_3[i] |= PG_V | PG_RW | PG_PS | PG_U; } if (bootverbose) device_printf(sc->acpi_dev, "wakeup code va %#jx pa %#jx\n", (uintmax_t)sc->acpi_wakeaddr, (uintmax_t)sc->acpi_wakephys); } diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c index a048c08fc9ae..c5266ffcc235 100644 --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -1,379 +1,382 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) KATO Takenori, 1997, 1998. * * All rights reserved. Unpublished rights reserved under the copyright * laws of Japan. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer as * the first lines of this file unmodified. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include "opt_cpu.h" #include #include #include #include #include #include #include #include #include #include #include static int hw_instruction_sse; SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD, &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU"); static int lower_sharedpage_init; int hw_lower_amd64_sharedpage; SYSCTL_INT(_hw, OID_AUTO, lower_amd64_sharedpage, CTLFLAG_RDTUN, &hw_lower_amd64_sharedpage, 0, "Lower sharedpage to work around Ryzen issue with executing code near the top of user memory"); /* * -1: automatic (default) * 0: keep enable CLFLUSH * 1: force disable CLFLUSH */ static int hw_clflush_disable = -1; static void init_amd(void) { uint64_t msr; /* * C1E renders the local APIC timer dead, so we disable it by * reading the Interrupt Pending Message register and clearing * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). * * Reference: * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" * #32559 revision 3.00+ * * Detect the presence of C1E capability mostly on latest * dual-cores (or future) k8 family. Affected models range is * taken from Linux sources. */ if ((CPUID_TO_FAMILY(cpu_id) == 0xf || CPUID_TO_FAMILY(cpu_id) == 0x10) && (cpu_feature2 & CPUID2_HV) == 0) cpu_amdc1e_bug = 1; /* * Work around Erratum 721 for Family 10h and 12h processors. * These processors may incorrectly update the stack pointer * after a long series of push and/or near-call instructions, * or a long series of pop and/or near-return instructions. * * http://support.amd.com/us/Processor_TechDocs/41322_10h_Rev_Gd.pdf * http://support.amd.com/us/Processor_TechDocs/44739_12h_Rev_Gd.pdf * * Hypervisors do not provide access to the errata MSR, * causing #GP exception on attempt to apply the errata. The * MSR write shall be done on host and persist globally * anyway, so do not try to do it when under virtualization. */ switch (CPUID_TO_FAMILY(cpu_id)) { case 0x10: case 0x12: if ((cpu_feature2 & CPUID2_HV) == 0) wrmsr(MSR_DE_CFG, rdmsr(MSR_DE_CFG) | DE_CFG_10H_12H_STACK_POINTER_JUMP_FIX_BIT); break; } /* * BIOS may fail to set InitApicIdCpuIdLo to 1 as it should per BKDG. * So, do it here or otherwise some tools could be confused by * Initial Local APIC ID reported with CPUID Function 1 in EBX. */ if (CPUID_TO_FAMILY(cpu_id) == 0x10) { if ((cpu_feature2 & CPUID2_HV) == 0) { msr = rdmsr(MSR_NB_CFG1); msr |= (uint64_t)1 << 54; wrmsr(MSR_NB_CFG1, msr); } } /* * BIOS may configure Family 10h processors to convert WC+ cache type * to CD. That can hurt performance of guest VMs using nested paging. * The relevant MSR bit is not documented in the BKDG, * the fix is borrowed from Linux. */ if (CPUID_TO_FAMILY(cpu_id) == 0x10) { if ((cpu_feature2 & CPUID2_HV) == 0) { msr = rdmsr(0xc001102a); msr &= ~((uint64_t)1 << 24); wrmsr(0xc001102a, msr); } } /* * Work around Erratum 793: Specific Combination of Writes to Write * Combined Memory Types and Locked Instructions May Cause Core Hang. * See Revision Guide for AMD Family 16h Models 00h-0Fh Processors, * revision 3.04 or later, publication 51810. */ if (CPUID_TO_FAMILY(cpu_id) == 0x16 && CPUID_TO_MODEL(cpu_id) <= 0xf) { if ((cpu_feature2 & CPUID2_HV) == 0) { msr = rdmsr(MSR_LS_CFG); msr |= (uint64_t)1 << 15; wrmsr(MSR_LS_CFG, msr); } } /* Ryzen erratas. */ if (CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1 && (cpu_feature2 & CPUID2_HV) == 0) { /* 1021 */ msr = rdmsr(MSR_DE_CFG); msr |= DE_CFG_ZEN_LOAD_STALE_DATA_FIX_BIT; wrmsr(MSR_DE_CFG, msr); /* 1033 */ msr = rdmsr(MSR_LS_CFG); msr |= 0x10; wrmsr(MSR_LS_CFG, msr); /* 1049 */ msr = rdmsr(0xc0011028); msr |= 0x10; wrmsr(0xc0011028, msr); /* 1095 */ msr = rdmsr(MSR_LS_CFG); msr |= 0x200000000000000; wrmsr(MSR_LS_CFG, msr); } /* * Work around a problem on Ryzen that is triggered by executing * code near the top of user memory, in our case the signal * trampoline code in the shared page on amd64. * * This function is executed once for the BSP before tunables take * effect so the value determined here can be overridden by the * tunable. This function is then executed again for each AP and * also on resume. Set a flag the first time so that value set by * the tunable is not overwritten. * * The stepping and/or microcode versions should be checked after * this issue is fixed by AMD so that we don't use this mode if not * needed. */ if (lower_sharedpage_init == 0) { lower_sharedpage_init = 1; if (CPUID_TO_FAMILY(cpu_id) == 0x17 || CPUID_TO_FAMILY(cpu_id) == 0x18) { hw_lower_amd64_sharedpage = 1; } } + + /* Zenbleed. See the comments in 'cpu_machdep.c'. */ + zenbleed_check_and_apply(false); } /* * Initialize special VIA features */ static void init_via(void) { u_int regs[4], val; /* * Check extended CPUID for PadLock features. * * http://www.via.com.tw/en/downloads/whitepapers/initiatives/padlock/programming_guide.pdf */ do_cpuid(0xc0000000, regs); if (regs[0] >= 0xc0000001) { do_cpuid(0xc0000001, regs); val = regs[3]; } else return; /* Enable RNG if present. */ if ((val & VIA_CPUID_HAS_RNG) != 0) { via_feature_rng = VIA_HAS_RNG; wrmsr(0x110B, rdmsr(0x110B) | VIA_CPUID_DO_RNG); } /* Enable PadLock if present. */ if ((val & VIA_CPUID_HAS_ACE) != 0) via_feature_xcrypt |= VIA_HAS_AES; if ((val & VIA_CPUID_HAS_ACE2) != 0) via_feature_xcrypt |= VIA_HAS_AESCTR; if ((val & VIA_CPUID_HAS_PHE) != 0) via_feature_xcrypt |= VIA_HAS_SHA; if ((val & VIA_CPUID_HAS_PMM) != 0) via_feature_xcrypt |= VIA_HAS_MM; if (via_feature_xcrypt != 0) wrmsr(0x1107, rdmsr(0x1107) | (1 << 28)); } /* * The value for the TSC_AUX MSR and rdtscp/rdpid on the invoking CPU. * * Caller should prevent CPU migration. */ u_int cpu_auxmsr(void) { KASSERT((read_rflags() & PSL_I) == 0, ("context switch possible")); return (PCPU_GET(cpuid)); } void cpu_init_small_core(void) { u_int r[4]; if (cpu_high < 0x1a) return; cpuid_count(0x1a, 0, r); if ((r[0] & CPUID_HYBRID_CORE_MASK) != CPUID_HYBRID_SMALL_CORE) return; PCPU_SET(small_core, 1); if (pmap_pcid_enabled && invpcid_works && pmap_pcid_invlpg_workaround_uena) { PCPU_SET(pcid_invlpg_workaround, 1); pmap_pcid_invlpg_workaround = 1; } } /* * Initialize CPU control registers */ void initializecpu(void) { uint64_t msr; uint32_t cr4; TSENTER(); cr4 = rcr4(); if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) { cr4 |= CR4_FXSR | CR4_XMM; hw_instruction_sse = 1; } if (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) cr4 |= CR4_FSGSBASE; if (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) cr4 |= CR4_PKE; /* * If SMEP is present, we only need to flush RSB (by default) * on context switches, to prevent cross-process ret2spec * attacks. Do it automatically if ibrs_disable is set, to * complete the mitigation. * * Postpone enabling the SMEP on the boot CPU until the page * tables are switched from the boot loader identity mapping * to the kernel tables. The boot loader enables the U bit in * its tables. */ if (IS_BSP()) { if (cpu_stdext_feature & CPUID_STDEXT_SMEP && !TUNABLE_INT_FETCH( "machdep.mitigations.cpu_flush_rsb_ctxsw", &cpu_flush_rsb_ctxsw) && hw_ibrs_disable) cpu_flush_rsb_ctxsw = 1; } else { if (cpu_stdext_feature & CPUID_STDEXT_SMEP) cr4 |= CR4_SMEP; if (cpu_stdext_feature & CPUID_STDEXT_SMAP) cr4 |= CR4_SMAP; } TSENTER2("load_cr4"); load_cr4(cr4); TSEXIT2("load_cr4"); /* Reload cpu ext features to reflect cr4 changes */ if (IS_BSP() && cold) identify_cpu_ext_features(); if (IS_BSP() && (amd_feature & AMDID_NX) != 0) { msr = rdmsr(MSR_EFER) | EFER_NXE; wrmsr(MSR_EFER, msr); pg_nx = PG_NX; } hw_ibrs_recalculate(false); hw_ssb_recalculate(false); amd64_syscall_ret_flush_l1d_recalc(); x86_rngds_mitg_recalculate(false); switch (cpu_vendor_id) { case CPU_VENDOR_AMD: case CPU_VENDOR_HYGON: init_amd(); break; case CPU_VENDOR_CENTAUR: init_via(); break; } if ((amd_feature & AMDID_RDTSCP) != 0 || (cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0) wrmsr(MSR_TSC_AUX, cpu_auxmsr()); if (!IS_BSP()) cpu_init_small_core(); TSEXIT(); } void initializecpucache(void) { /* * CPUID with %eax = 1, %ebx returns * Bits 15-8: CLFLUSH line size * (Value * 8 = cache line size in bytes) */ if ((cpu_feature & CPUID_CLFSH) != 0) cpu_clflush_line_size = ((cpu_procinfo >> 8) & 0xff) * 8; /* * XXXKIB: (temporary) hack to work around traps generated * when CLFLUSHing APIC register window under virtualization * environments. These environments tend to disable the * CPUID_SS feature even though the native CPU supports it. */ TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable); if (vm_guest != VM_GUEST_NO && hw_clflush_disable == -1) { cpu_feature &= ~CPUID_CLFSH; cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT; } /* * The kernel's use of CLFLUSH{,OPT} can be disabled manually * by setting the hw.clflush_disable tunable. */ if (hw_clflush_disable == 1) { cpu_feature &= ~CPUID_CLFSH; cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT; } } diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index f39678d1f4e5..4b81f5b9671e 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1,1914 +1,1918 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include #include "opt_atpic.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_pci.h" #include "opt_platform.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef FDT #include #endif #ifdef DEV_ATPIC #include #else #include #endif #include #include #include /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); /* * The PTI trampoline stack needs enough space for a hardware trapframe and a * couple of scratch registers, as well as the trapframe left behind after an * iret fault. */ CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) - offsetof(struct pti_frame, pti_rip)); extern u_int64_t hammer_time(u_int64_t, u_int64_t); static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* Probe 8254 PIT and TSC. */ static void native_clock_source_init(void); /* Preload data parse function */ static caddr_t native_parse_preload_data(u_int64_t); /* Native function to fetch and parse the e820 map */ static void native_parse_memmap(caddr_t, vm_paddr_t *, int *); /* Default init_ops implementation. */ struct init_ops init_ops = { .parse_preload_data = native_parse_preload_data, .early_clock_source_init = native_clock_source_init, .early_delay = i8254_delay, .parse_memmap = native_parse_memmap, }; /* * Physical address of the EFI System Table. Stashed from the metadata hints * passed into the kernel and used by the EFI code to call runtime services. */ vm_paddr_t efi_systbl_phys; /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; int cold = 1; long Maxmem = 0; long realmem = 0; int late_console = 1; struct kva_md_info kmi; struct region_descriptor r_idt; struct pcpu *__pcpu; struct pcpu temp_bsp_pcpu; struct mtx icu_lock; struct mem_range_softc mem_range_softc; struct mtx dt_lock; /* lock for GDT and LDT */ void (*vmm_resume_p)(void); bool efi_boot; static void cpu_startup(void *dummy) { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. * We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. */ sysenv = kern_getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBook4,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "MacBookPro4,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; sysenv = kern_getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)vm_free_count())) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_free_count()), ptoa((uintmax_t)vm_free_count()) / 1048576); #ifdef DEV_PCI if (bootverbose && intel_graphics_stolen_base != 0) printf("intel stolen mem: base %#jx size %ju MB\n", (uintmax_t)intel_graphics_stolen_base, (uintmax_t)intel_graphics_stolen_size / 1024 / 1024); #endif /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } static void late_ifunc_resolve(void *dummy __unused) { link_elf_late_ireloc(); } SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL); void cpu_setregs(void) { register_t cr0; TSENTER(); cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the * BSP. See the comments there about why we set them. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; TSENTER2("load_cr0"); load_cr0(cr0); TSEXIT2("load_cr0"); TSEXIT(); } /* * Initialize amd64 and configure to run kernel */ /* * Initialize segments & interrupt table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16); static char mce0_stack[MCE_STACK_SIZE] __aligned(16); static char nmi0_stack[NMI_STACK_SIZE] __aligned(16); static char dbg0_stack[DBG_STACK_SIZE] __aligned(16); CTASSERT(sizeof(struct nmi_pcpu) == 16); /* * Software prototypes -- in more palatable form. * * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same * slots as corresponding segments for i386 kernel. */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GNULL2_SEL 1 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUFS32_SEL 2 32 bit %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS32_SEL 3 32 bit %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GUCODE32_SEL 6 32 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 32/64 bit Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 8 64 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1, .ssd_type = SDT_SYSTSS, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Actually, the TSS is a system descriptor which is double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 LDT Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 12 LDT Descriptor, double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT"); void setidt(int idx, inthand_t *func, int typ, int dpl, int ist) { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (uintptr_t)func; ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); ip->gd_ist = ist; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((uintptr_t)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(dblfault), IDTVEC(div_pti), IDTVEC(bpt_pti), IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti), IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti), IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti), IDTVEC(xmm_pti), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti), #endif #ifdef XENHVM IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti), #endif IDTVEC(fast_syscall), IDTVEC(fast_syscall32), IDTVEC(fast_syscall_pti); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. */ DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } /* Show privileged registers. */ DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE) { struct { uint16_t limit; uint64_t base; } __packed idtr, gdtr; uint16_t ldt, tr; __asm __volatile("sidt %0" : "=m" (idtr)); db_printf("idtr\t0x%016lx/%04x\n", (u_long)idtr.base, (u_int)idtr.limit); __asm __volatile("sgdt %0" : "=m" (gdtr)); db_printf("gdtr\t0x%016lx/%04x\n", (u_long)gdtr.base, (u_int)gdtr.limit); __asm __volatile("sldt %0" : "=r" (ldt)); db_printf("ldtr\t0x%04x\n", ldt); __asm __volatile("str %0" : "=r" (tr)); db_printf("tr\t0x%04x\n", tr); db_printf("cr0\t0x%016lx\n", rcr0()); db_printf("cr2\t0x%016lx\n", rcr2()); db_printf("cr3\t0x%016lx\n", rcr3()); db_printf("cr4\t0x%016lx\n", rcr4()); if (rcr4() & CR4_XSAVE) db_printf("xcr0\t0x%016lx\n", rxcr(0)); db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER)); if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) db_printf("FEATURES_CTL\t%016lx\n", rdmsr(MSR_IA32_FEATURE_CONTROL)); db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR)); db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT)); db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE)); } DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE) { db_printf("dr0\t0x%016lx\n", rdr0()); db_printf("dr1\t0x%016lx\n", rdr1()); db_printf("dr2\t0x%016lx\n", rdr2()); db_printf("dr3\t0x%016lx\n", rdr3()); db_printf("dr6\t0x%016lx\n", rdr6()); db_printf("dr7\t0x%016lx\n", rdr7()); } #endif void sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd) { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_long = sd->sd_long; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd) { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_long = ssd->ssd_long; sd->sd_def32 = ssd->ssd_def32; sd->sd_gran = ssd->ssd_gran; } void ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd) { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_gran = ssd->ssd_gran; } u_int basemem; static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) { int i, insert_idx, physmap_idx; physmap_idx = *physmap_idxp; if (length == 0) return (1); /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. * * NB: physmap_idx points to the next free slot. */ insert_idx = physmap_idx; for (i = 0; i <= physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYS_AVAIL_ENTRIES) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = (physmap_idx - 2); i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } void bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap, *smapend; smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016lx len=%016lx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) continue; if (!add_physmap_entry(smap->base, smap->length, physmap, physmap_idx)) break; } } static void add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap, int *physmap_idx) { struct efi_md *map, *p; const char *type; size_t efisz; int ndesc, i; static const char *types[] = { "Reserved", "LoaderCode", "LoaderData", "BootServicesCode", "BootServicesData", "RuntimeServicesCode", "RuntimeServicesData", "ConventionalMemory", "UnusableMemory", "ACPIReclaimMemory", "ACPIMemoryNVS", "MemoryMappedIO", "MemoryMappedIOPortSpace", "PalCode", "PersistentMemory" }; /* * Memory map data provided by UEFI via the GetMemoryMap * Boot Services API. */ efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; map = (struct efi_md *)((uint8_t *)efihdr + efisz); if (efihdr->descriptor_size == 0) return; ndesc = efihdr->memory_size / efihdr->descriptor_size; if (boothowto & RB_VERBOSE) printf("%23s %12s %12s %8s %4s\n", "Type", "Physical", "Virtual", "#Pages", "Attr"); for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p, efihdr->descriptor_size)) { if (boothowto & RB_VERBOSE) { if (p->md_type < nitems(types)) type = types[p->md_type]; else type = ""; printf("%23s %012lx %012lx %08lx ", type, p->md_phys, p->md_virt, p->md_pages); if (p->md_attr & EFI_MD_ATTR_UC) printf("UC "); if (p->md_attr & EFI_MD_ATTR_WC) printf("WC "); if (p->md_attr & EFI_MD_ATTR_WT) printf("WT "); if (p->md_attr & EFI_MD_ATTR_WB) printf("WB "); if (p->md_attr & EFI_MD_ATTR_UCE) printf("UCE "); if (p->md_attr & EFI_MD_ATTR_WP) printf("WP "); if (p->md_attr & EFI_MD_ATTR_RP) printf("RP "); if (p->md_attr & EFI_MD_ATTR_XP) printf("XP "); if (p->md_attr & EFI_MD_ATTR_NV) printf("NV "); if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE) printf("MORE_RELIABLE "); if (p->md_attr & EFI_MD_ATTR_RO) printf("RO "); if (p->md_attr & EFI_MD_ATTR_RT) printf("RUNTIME"); printf("\n"); } switch (p->md_type) { case EFI_MD_TYPE_CODE: case EFI_MD_TYPE_DATA: case EFI_MD_TYPE_BS_CODE: case EFI_MD_TYPE_BS_DATA: case EFI_MD_TYPE_FREE: /* * We're allowed to use any entry with these types. */ break; default: continue; } if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE, physmap, physmap_idx)) break; } } static void native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap; struct efi_map_header *efihdr; u_int32_t size; /* * Memory map from INT 15:E820. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes smap. */ efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); smap = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (efihdr == NULL && smap == NULL) panic("No BIOS smap or EFI map info from loader!"); if (efihdr != NULL) { add_efi_map_entries(efihdr, physmap, physmap_idx); strlcpy(bootmethod, "UEFI", sizeof(bootmethod)); } else { size = *((u_int32_t *)smap - 1); bios_add_smap_entries(smap, size, physmap, physmap_idx); strlcpy(bootmethod, "BIOS", sizeof(bootmethod)); } } #define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE) /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(caddr_t kmdp, u_int64_t first) { int i, physmap_idx, pa_indx, da_indx; vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES]; u_long physmem_start, physmem_tunable, memtest; pt_entry_t *pte; quad_t dcons_addr, dcons_size; int page_counter; TSENTER(); /* * Tell the physical memory allocator about pages used to store * the kernel and preloaded data. See kmem_bootstrap_free(). */ vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first)); bzero(physmap, sizeof(physmap)); physmap_idx = 0; init_ops.parse_memmap(kmdp, physmap, &physmap_idx); physmap_idx -= 2; /* * Find the 'base memory' segment for SMP */ basemem = 0; for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] <= 0xA0000) { basemem = physmap[i + 1] / 1024; break; } } if (basemem == 0 || basemem > 640) { if (bootverbose) printf( "Memory map doesn't contain a basemem segment, faking it"); basemem = 640; } /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * The boot memory test is disabled by default, as it takes a * significant amount of time on large-memory systems, and is * unfriendly to virtual machines as it unnecessarily touches all * pages. * * A general name is used as the code may be extended to support * additional tests beyond the current "page present" test. */ memtest = 0; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); /* * Don't allow MAXMEM or hw.physmem to extend the amount of memory * in the system. */ if (Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(&first); /* * Size up each available chunk of physical memory. * * XXX Some BIOSes corrupt low 64KB between suspend and resume. * By default, mask off the first 16 pages unless we appear to be * running in a VM. */ physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT; TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start); if (physmap[0] < physmem_start) { if (physmem_start < PAGE_SIZE) physmap[0] = PAGE_SIZE; else if (physmem_start >= physmap[1]) physmap[0] = round_page(physmap[1] - PAGE_SIZE); else physmap[0] = round_page(physmem_start); } pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ page_counter = 0; if (memtest != 0) printf("Testing system memory"); for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= (vm_paddr_t)kernphys && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; if (memtest == 0) goto skip_memtest; /* * Print a "." every GB to show we're making * progress. */ page_counter++; if ((page_counter % PAGES_PER_GB) == 0) printf("."); /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; skip_memtest: /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ENTRIES) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == PHYS_AVAIL_ENTRIES) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); if (memtest != 0) printf("\n"); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(msgbufsize) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(msgbufsize); /* Map the message buffer. */ msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]); TSEXIT(); } static caddr_t native_parse_preload_data(u_int64_t modulep) { caddr_t kmdp; char *envp; #ifdef DDB vm_offset_t ksym_start; vm_offset_t ksym_end; #endif preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE); preload_bootstrap_relocate(KERNBASE); kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); if (envp != NULL) envp += KERNBASE; init_static_kenv(envp, 0); #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); db_fetch_ksymtab(ksym_start, ksym_end, 0); #endif efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t); return (kmdp); } static void native_clock_source_init(void) { i8254_init(); } static void amd64_kdb_init(void) { kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } /* Set up the fast syscall stuff */ void amd64_conf_fast_syscall(void) { uint64_t msr; msr = rdmsr(MSR_EFER) | EFER_SCE; wrmsr(MSR_EFER, msr); wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) : (u_int64_t)IDTVEC(fast_syscall)); wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC); } void amd64_bsp_pcpu_init1(struct pcpu *pc) { struct user_segment_descriptor *gdt; PCPU_SET(prvspace, pc); gdt = *PCPU_PTR(gdt); PCPU_SET(curthread, &thread0); PCPU_SET(tssp, PCPU_PTR(common_tss)); PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]); PCPU_SET(fs32p, &gdt[GUFS32_SEL]); PCPU_SET(gs32p, &gdt[GUGS32_SEL]); PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); PCPU_SET(smp_tlb_gen, 1); } void amd64_bsp_pcpu_init2(uint64_t rsp0) { PCPU_SET(rsp0, rsp0); PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) + PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful); PCPU_SET(curpcb, thread0.td_pcb); } void amd64_bsp_ist_init(struct pcpu *pc) { struct nmi_pcpu *np; struct amd64tss *tssp; tssp = &pc->pc_common_tss; /* doublefault stack space, runs on ist1 */ np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist1 = (long)np; /* * NMI stack, runs on ist2. The pcpu pointer is stored just * above the start of the ist2 stack. */ np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist2 = (long)np; /* * MC# stack, runs on ist3. The pcpu pointer is stored just * above the start of the ist3 stack. */ np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist3 = (long)np; /* * DB# stack, runs on ist4. */ np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist4 = (long)np; } /* * Calculate the kernel load address by inspecting page table created by loader. * The assumptions: * - kernel is mapped at KERNBASE, backed by contiguous phys memory * aligned at 2M, below 4G (the latter is important for AP startup) * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M) * - kernel is mapped with 2M superpages * - all participating memory, i.e. kernel, modules, metadata, * page table is accessible by pre-created 1:1 mapping * (right now loader creates 1:1 mapping for lower 4G, and all * memory is from there) * - there is a usable memory block right after the end of the * mapped kernel and all modules/metadata, pointed to by * physfree, for early allocations */ vm_paddr_t __nosanitizeaddress __nosanitizememory amd64_loadaddr(void) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; uint64_t cr3; cr3 = rcr3(); pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART); pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART); pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART); return (*pde & PG_FRAME); } u_int64_t hammer_time(u_int64_t modulep, u_int64_t physfree) { caddr_t kmdp; int gsel_tss, x; struct pcpu *pc; uint64_t rsp0; char *env; struct user_segment_descriptor *gdt; struct region_descriptor r_gdt; size_t kstack0_sz; TSRAW(&thread0, TS_ENTER, __func__, NULL); kernphys = amd64_loadaddr(); physfree += kernphys; kmdp = init_ops.parse_preload_data(modulep); efi_boot = preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP) != NULL; if (!efi_boot) { /* Tell the bios to warmboot next time */ atomic_store_short((u_short *)0x472, 0x1234); } physfree += ucode_load_bsp(physfree - kernphys + KERNSTART); physfree = roundup2(physfree, PAGE_SIZE); identify_cpu1(); identify_hypervisor(); identify_hypervisor_smbios(); identify_cpu_fixup_bsp(); identify_cpu2(); initializecpucache(); /* * Check for pti, pcid, and invpcid before ifuncs are * resolved, to correctly select the implementation for * pmap_activate_sw_mode(). */ pti = pti_get_default(); TUNABLE_INT_FETCH("vm.pmap.pti", &pti); TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0; } else { pmap_pcid_enabled = 0; } /* * Now we can do small core initialization, after the PCID * CPU features and user knobs are evaluated. */ TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround", &pmap_pcid_invlpg_workaround_uena); cpu_init_small_core(); if ((cpu_feature2 & CPUID2_XSAVE) != 0) { use_xsave = 1; TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave); } link_elf_ireloc(kmdp); /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ proc_linkup0(&proc0, &thread0); /* Init basic tunables, hz etc */ init_param1(); thread0.td_kstack = physfree - kernphys + KERNSTART; thread0.td_kstack_pages = kstack_pages; kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; bzero((void *)thread0.td_kstack, kstack0_sz); physfree += kstack0_sz; /* * Initialize enough of thread0 for delayed invalidation to * work very early. Rely on thread0.td_base_pri * zero-initialization, it is reset to PVM at proc0_init(). */ pmap_thread_init_invl_gen(&thread0); pc = &temp_bsp_pcpu; pcpu_init(pc, 0, sizeof(struct pcpu)); gdt = &temp_bsp_pcpu.pc_gdt[0]; /* * make gdt memory segments */ for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1) ssdtosd(&gdt_segs[x], &gdt[x]); } gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss; ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (long)gdt; lgdt(&r_gdt); wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0); physfree += DPCPU_SIZE; amd64_bsp_pcpu_init1(pc); /* Non-late cninit() and printf() can be moved up to here. */ /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2); setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3); setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); #ifdef KDTRACE_HOOKS setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) : &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); #endif #ifdef XENHVM setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) : &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0); #endif r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (long) idt; lidt(&r_idt); /* * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4) * transition). * Once bootblocks have updated, we can test directly for * efi_systbl != NULL here... */ if (efi_boot) vty_set_preferred(VTY_VT); TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable); TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable); TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable); TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable); TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush", &syscall_ret_l1d_flush_mode); TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable); TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable); TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable); TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable", &x86_rngds_mitg_enable); + TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable", + &zenbleed_enable); + zenbleed_sanitize_enable(); + finishidentcpu(); /* Final stage of CPU initialization */ /* * Initialize the clock before the console so that console * initialization can use DELAY(). */ clock_init(); initializecpu(); /* Initialize CPU registers */ amd64_bsp_ist_init(pc); /* Set the IO permission bitmap (empty due to tss seg limit) */ pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); amd64_conf_fast_syscall(); /* * We initialize the PCB pointer early so that exception * handlers will work. Also set up td_critnest to short-cut * the page fault handler. */ cpu_max_ext_state_size = sizeof(struct savefpu); set_top_of_stack_td(&thread0); thread0.td_pcb = get_pcb_td(&thread0); thread0.td_critnest = 1; /* * The console and kdb should be initialized even earlier than here, * but some console drivers don't work until after getmemsize(). * Default to late console initialization to support these drivers. * This loses mainly printf()s in getmemsize() and early debugging. */ TUNABLE_INT_FETCH("debug.late_console", &late_console); if (!late_console) { cninit(); amd64_kdb_init(); } getmemsize(kmdp, physfree); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ #ifdef DEV_PCI /* This call might adjust phys_avail[]. */ pci_early_quirks(); #endif if (late_console) cninit(); /* * Dump the boot metadata. We have to wait for cninit() since console * output is required. If it's grossly incorrect the kernel will never * make it this far. */ if (getenv_is_true("debug.dump_modinfo_at_boot")) preload_dump(); #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); #endif #else #error "have you forgotten the isa device?" #endif if (late_console) amd64_kdb_init(); msgbufinit(msgbufp, msgbufsize); fpuinit(); /* make an initial tss so cpu can get interrupt stack on syscall! */ rsp0 = thread0.td_md.md_stack_base; /* Ensure the stack is aligned to 16 bytes */ rsp0 &= ~0xFul; PCPU_PTR(common_tss)->tss_rsp0 = rsp0; amd64_bsp_pcpu_init2(rsp0); /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); _ufssel = GSEL(GUFS32_SEL, SEL_UPL); _ugssel = GSEL(GUGS32_SEL, SEL_UPL); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; env = kern_getenv("kernelname"); if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); kcsan_cpu_init(0); #ifdef FDT x86_init_fdt(); #endif thread0.td_critnest = 0; kasan_init(); kmsan_init(); TSEXIT(); /* Location of kernel stack for locore */ return (thread0.td_md.md_stack_base); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } static int smap_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct bios_smap *smapbase; struct bios_smap_xattr smap; caddr_t kmdp; uint32_t *smapattr; int count, error, i; /* Retrieve the system memory map from the loader. */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) return (0); smapattr = (uint32_t *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP_XATTR); count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase); error = 0; for (i = 0; i < count; i++) { smap.base = smapbase[i].base; smap.length = smapbase[i].length; smap.type = smapbase[i].type; if (smapattr != NULL) smap.xattr = smapattr[i]; else smap.xattr = 0; error = SYSCTL_OUT(req, &smap, sizeof(smap)); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); static int efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct efi_map_header *efihdr; caddr_t kmdp; uint32_t efisize; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); if (efihdr == NULL) return (0); efisize = *((uint32_t *)efihdr - 1); return (SYSCTL_OUT(req, efihdr, efisize)); } SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map"); void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; critical_enter(); } else td->td_md.md_spinlock_count++; } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) { critical_exit(); intr_restore(flags); } } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_r12 = tf->tf_r12; pcb->pcb_r13 = tf->tf_r13; pcb->pcb_r14 = tf->tf_r14; pcb->pcb_r15 = tf->tf_r15; pcb->pcb_rbp = tf->tf_rbp; pcb->pcb_rbx = tf->tf_rbx; pcb->pcb_rip = tf->tf_rip; pcb->pcb_rsp = tf->tf_rsp; } /* * The pcb_flags is only modified by current thread, or by other threads * when current thread is stopped. However, current thread may change it * from the interrupt context in cpu_switch(), or in the trap handler. * When we read-modify-write pcb_flags from C sources, compiler may generate * code that is not atomic regarding the interrupt handler. If a trap or * interrupt happens and any flag is modified from the handler, it can be * clobbered with the cached value later. Therefore, we implement setting * and clearing flags with single-instruction functions, which do not race * with possible modification of the flags from the trap or interrupt context, * because traps and interrupts are executed only on instruction boundary. */ void set_pcb_flags_raw(struct pcb *pcb, const u_int flags) { __asm __volatile("orl %1,%0" : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) : "cc", "memory"); } /* * The support for RDFSBASE, WRFSBASE and similar instructions for %gs * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into * pcb if user space modified the bases. We must save on the context * switch or if the return to usermode happens through the doreti. * * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, * which have a consequence that the base MSRs must be saved each time * the PCB_FULL_IRET flag is set. We disable interrupts to sync with * context switches. */ static void set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags) { register_t r; if (curpcb == pcb && (flags & PCB_FULL_IRET) != 0 && (pcb->pcb_flags & PCB_FULL_IRET) == 0) { r = intr_disable(); if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { if (rfs() == _ufssel) pcb->pcb_fsbase = rdfsbase(); if (rgs() == _ugssel) pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); } set_pcb_flags_raw(pcb, flags); intr_restore(r); } else { set_pcb_flags_raw(pcb, flags); } } DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int)) { return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ? set_pcb_flags_fsgsbase : set_pcb_flags_raw); } void clear_pcb_flags(struct pcb *pcb, const u_int flags) { __asm __volatile("andl %1,%0" : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) : "cc", "memory"); } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ #undef memset #undef memmove #undef memcpy void *memset_std(void *buf, int c, size_t len); void *memset_erms(void *buf, int c, size_t len); void *memmove_std(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src, size_t len); #ifdef KCSAN /* * These fail to build as ifuncs when used with KCSAN. */ void * memset(void *buf, int c, size_t len) { return (memset_std(buf, c, len)); } void * memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len) { return (memmove_std(dst, src, len)); } void * memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len) { return (memcpy_std(dst, src, len)); } #else DEFINE_IFUNC(, void *, memset, (void *, int, size_t)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memset_erms : memset_std); } DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull, size_t)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memmove_erms : memmove_std); } DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memcpy_erms : memcpy_std); } #endif void pagezero_std(void *addr); void pagezero_erms(void *addr); DEFINE_IFUNC(, void , pagezero, (void *)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? pagezero_erms : pagezero_std); } diff --git a/sys/dev/cpuctl/cpuctl.c b/sys/dev/cpuctl/cpuctl.c index 9c56db1ad19a..1fa655342121 100644 --- a/sys/dev/cpuctl/cpuctl.c +++ b/sys/dev/cpuctl/cpuctl.c @@ -1,602 +1,603 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2006-2008 Stanislav Sedov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static d_open_t cpuctl_open; static d_ioctl_t cpuctl_ioctl; #define CPUCTL_VERSION 1 #ifdef CPUCTL_DEBUG # define DPRINTF(format,...) printf(format, __VA_ARGS__); #else # define DPRINTF(...) #endif #define UCODE_SIZE_MAX (4 * 1024 * 1024) static int cpuctl_do_msr(int cpu, cpuctl_msr_args_t *data, u_long cmd, struct thread *td); static int cpuctl_do_cpuid(int cpu, cpuctl_cpuid_args_t *data, struct thread *td); static int cpuctl_do_cpuid_count(int cpu, cpuctl_cpuid_count_args_t *data, struct thread *td); static int cpuctl_do_eval_cpu_features(int cpu, struct thread *td); static int cpuctl_do_update(int cpu, cpuctl_update_args_t *data, struct thread *td); static int update_intel(int cpu, cpuctl_update_args_t *args, struct thread *td); static int update_amd(int cpu, cpuctl_update_args_t *args, struct thread *td); static int update_via(int cpu, cpuctl_update_args_t *args, struct thread *td); static struct cdev **cpuctl_devs; static MALLOC_DEFINE(M_CPUCTL, "cpuctl", "CPUCTL buffer"); static struct cdevsw cpuctl_cdevsw = { .d_version = D_VERSION, .d_open = cpuctl_open, .d_ioctl = cpuctl_ioctl, .d_name = "cpuctl", }; /* * This function checks if specified cpu enabled or not. */ static int cpu_enabled(int cpu) { return (pmc_cpu_is_disabled(cpu) == 0); } /* * Check if the current thread is bound to a specific cpu. */ static int cpu_sched_is_bound(struct thread *td) { int ret; thread_lock(td); ret = sched_is_bound(td); thread_unlock(td); return (ret); } /* * Switch to target cpu to run. */ static void set_cpu(int cpu, struct thread *td) { KASSERT(cpu >= 0 && cpu <= mp_maxid && cpu_enabled(cpu), ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu)); thread_lock(td); sched_bind(td, cpu); thread_unlock(td); KASSERT(td->td_oncpu == cpu, ("[cpuctl,%d]: cannot bind to target cpu %d on cpu %d", __LINE__, cpu, td->td_oncpu)); } static void restore_cpu(int oldcpu, int is_bound, struct thread *td) { KASSERT(oldcpu >= 0 && oldcpu <= mp_maxid && cpu_enabled(oldcpu), ("[cpuctl,%d]: bad cpu number %d", __LINE__, oldcpu)); thread_lock(td); if (is_bound == 0) sched_unbind(td); else sched_bind(td, oldcpu); thread_unlock(td); } int cpuctl_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flags, struct thread *td) { int cpu, ret; cpu = dev2unit(dev); if (cpu > mp_maxid || !cpu_enabled(cpu)) { DPRINTF("[cpuctl,%d]: bad cpu number %d\n", __LINE__, cpu); return (ENXIO); } /* Require write flag for "write" requests. */ if ((cmd == CPUCTL_MSRCBIT || cmd == CPUCTL_MSRSBIT || cmd == CPUCTL_UPDATE || cmd == CPUCTL_WRMSR || cmd == CPUCTL_EVAL_CPU_FEATURES) && (flags & FWRITE) == 0) return (EPERM); switch (cmd) { case CPUCTL_RDMSR: ret = cpuctl_do_msr(cpu, (cpuctl_msr_args_t *)data, cmd, td); break; case CPUCTL_MSRSBIT: case CPUCTL_MSRCBIT: case CPUCTL_WRMSR: ret = priv_check(td, PRIV_CPUCTL_WRMSR); if (ret != 0) goto fail; ret = cpuctl_do_msr(cpu, (cpuctl_msr_args_t *)data, cmd, td); break; case CPUCTL_CPUID: ret = cpuctl_do_cpuid(cpu, (cpuctl_cpuid_args_t *)data, td); break; case CPUCTL_UPDATE: ret = priv_check(td, PRIV_CPUCTL_UPDATE); if (ret != 0) goto fail; ret = cpuctl_do_update(cpu, (cpuctl_update_args_t *)data, td); break; case CPUCTL_CPUID_COUNT: ret = cpuctl_do_cpuid_count(cpu, (cpuctl_cpuid_count_args_t *)data, td); break; case CPUCTL_EVAL_CPU_FEATURES: ret = cpuctl_do_eval_cpu_features(cpu, td); break; default: ret = EINVAL; break; } fail: return (ret); } /* * Actually perform cpuid operation. */ static int cpuctl_do_cpuid_count(int cpu, cpuctl_cpuid_count_args_t *data, struct thread *td) { int is_bound = 0; int oldcpu; KASSERT(cpu >= 0 && cpu <= mp_maxid, ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu)); /* Explicitly clear cpuid data to avoid returning stale info. */ bzero(data->data, sizeof(data->data)); DPRINTF("[cpuctl,%d]: retrieving cpuid lev %#0x type %#0x for %d cpu\n", __LINE__, data->level, data->level_type, cpu); #ifdef __i386__ if (cpu_id == 0) return (ENODEV); #endif oldcpu = td->td_oncpu; is_bound = cpu_sched_is_bound(td); set_cpu(cpu, td); cpuid_count(data->level, data->level_type, data->data); restore_cpu(oldcpu, is_bound, td); return (0); } static int cpuctl_do_cpuid(int cpu, cpuctl_cpuid_args_t *data, struct thread *td) { cpuctl_cpuid_count_args_t cdata; int error; cdata.level = data->level; /* Override the level type. */ cdata.level_type = 0; error = cpuctl_do_cpuid_count(cpu, &cdata, td); bcopy(cdata.data, data->data, sizeof(data->data)); /* Ignore error */ return (error); } /* * Actually perform MSR operations. */ static int cpuctl_do_msr(int cpu, cpuctl_msr_args_t *data, u_long cmd, struct thread *td) { uint64_t reg; int is_bound = 0; int oldcpu; int ret; KASSERT(cpu >= 0 && cpu <= mp_maxid, ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu)); /* * Explicitly clear cpuid data to avoid returning stale * info */ DPRINTF("[cpuctl,%d]: operating on MSR %#0x for %d cpu\n", __LINE__, data->msr, cpu); #ifdef __i386__ if ((cpu_feature & CPUID_MSR) == 0) return (ENODEV); #endif oldcpu = td->td_oncpu; is_bound = cpu_sched_is_bound(td); set_cpu(cpu, td); if (cmd == CPUCTL_RDMSR) { data->data = 0; ret = rdmsr_safe(data->msr, &data->data); } else if (cmd == CPUCTL_WRMSR) { ret = wrmsr_safe(data->msr, data->data); } else if (cmd == CPUCTL_MSRSBIT) { critical_enter(); ret = rdmsr_safe(data->msr, ®); if (ret == 0) ret = wrmsr_safe(data->msr, reg | data->data); critical_exit(); } else if (cmd == CPUCTL_MSRCBIT) { critical_enter(); ret = rdmsr_safe(data->msr, ®); if (ret == 0) ret = wrmsr_safe(data->msr, reg & ~data->data); critical_exit(); } else panic("[cpuctl,%d]: unknown operation requested: %lu", __LINE__, cmd); restore_cpu(oldcpu, is_bound, td); return (ret); } /* * Actually perform microcode update. */ static int cpuctl_do_update(int cpu, cpuctl_update_args_t *data, struct thread *td) { cpuctl_cpuid_args_t args = { .level = 0, }; char vendor[13]; int ret; KASSERT(cpu >= 0 && cpu <= mp_maxid, ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu)); DPRINTF("[cpuctl,%d]: XXX %d", __LINE__, cpu); ret = cpuctl_do_cpuid(cpu, &args, td); if (ret != 0) return (ret); ((uint32_t *)vendor)[0] = args.data[1]; ((uint32_t *)vendor)[1] = args.data[3]; ((uint32_t *)vendor)[2] = args.data[2]; vendor[12] = '\0'; if (strncmp(vendor, INTEL_VENDOR_ID, sizeof(INTEL_VENDOR_ID)) == 0) ret = update_intel(cpu, data, td); else if(strncmp(vendor, AMD_VENDOR_ID, sizeof(AMD_VENDOR_ID)) == 0) ret = update_amd(cpu, data, td); else if(strncmp(vendor, CENTAUR_VENDOR_ID, sizeof(CENTAUR_VENDOR_ID)) == 0) ret = update_via(cpu, data, td); else ret = ENXIO; return (ret); } struct ucode_update_data { void *ptr; int cpu; int ret; }; static void ucode_intel_load_rv(void *arg) { struct ucode_update_data *d; d = arg; if (PCPU_GET(cpuid) == d->cpu) d->ret = ucode_intel_load(d->ptr, true, NULL, NULL); } static int update_intel(int cpu, cpuctl_update_args_t *args, struct thread *td) { struct ucode_update_data d; void *ptr; int is_bound, oldcpu, ret; if (args->size == 0 || args->data == NULL) { DPRINTF("[cpuctl,%d]: zero-sized firmware image", __LINE__); return (EINVAL); } if (args->size > UCODE_SIZE_MAX) { DPRINTF("[cpuctl,%d]: firmware image too large", __LINE__); return (EINVAL); } /* * 16 byte alignment required. Rely on the fact that * malloc(9) always returns the pointer aligned at least on * the size of the allocation. */ ptr = malloc(args->size + 16, M_CPUCTL, M_WAITOK); if (copyin(args->data, ptr, args->size) != 0) { DPRINTF("[cpuctl,%d]: copyin %p->%p of %zd bytes failed", __LINE__, args->data, ptr, args->size); ret = EFAULT; goto out; } oldcpu = td->td_oncpu; is_bound = cpu_sched_is_bound(td); set_cpu(cpu, td); d.ptr = ptr; d.cpu = cpu; smp_rendezvous(NULL, ucode_intel_load_rv, NULL, &d); restore_cpu(oldcpu, is_bound, td); ret = d.ret; /* * Replace any existing update. This ensures that the new update * will be reloaded automatically during ACPI resume. */ if (ret == 0) ptr = ucode_update(ptr); out: free(ptr, M_CPUCTL); return (ret); } /* * NB: MSR 0xc0010020, MSR_K8_UCODE_UPDATE, is not documented by AMD. * Coreboot, illumos and Linux source code was used to understand * its workings. */ static void amd_ucode_wrmsr(void *ucode_ptr) { uint32_t tmp[4]; wrmsr_safe(MSR_K8_UCODE_UPDATE, (uintptr_t)ucode_ptr); do_cpuid(0, tmp); } static int update_amd(int cpu, cpuctl_update_args_t *args, struct thread *td) { void *ptr; int ret; if (args->size == 0 || args->data == NULL) { DPRINTF("[cpuctl,%d]: zero-sized firmware image", __LINE__); return (EINVAL); } if (args->size > UCODE_SIZE_MAX) { DPRINTF("[cpuctl,%d]: firmware image too large", __LINE__); return (EINVAL); } /* * 16 byte alignment required. Rely on the fact that * malloc(9) always returns the pointer aligned at least on * the size of the allocation. */ ptr = malloc(args->size + 16, M_CPUCTL, M_ZERO | M_WAITOK); if (copyin(args->data, ptr, args->size) != 0) { DPRINTF("[cpuctl,%d]: copyin %p->%p of %zd bytes failed", __LINE__, args->data, ptr, args->size); ret = EFAULT; goto fail; } smp_rendezvous(NULL, amd_ucode_wrmsr, NULL, ptr); ret = 0; fail: free(ptr, M_CPUCTL); return (ret); } static int update_via(int cpu, cpuctl_update_args_t *args, struct thread *td) { void *ptr; uint64_t rev0, rev1, res; uint32_t tmp[4]; int is_bound; int oldcpu; int ret; if (args->size == 0 || args->data == NULL) { DPRINTF("[cpuctl,%d]: zero-sized firmware image", __LINE__); return (EINVAL); } if (args->size > UCODE_SIZE_MAX) { DPRINTF("[cpuctl,%d]: firmware image too large", __LINE__); return (EINVAL); } /* * 4 byte alignment required. */ ptr = malloc(args->size, M_CPUCTL, M_WAITOK); if (copyin(args->data, ptr, args->size) != 0) { DPRINTF("[cpuctl,%d]: copyin %p->%p of %zd bytes failed", __LINE__, args->data, ptr, args->size); ret = EFAULT; goto fail; } oldcpu = td->td_oncpu; is_bound = cpu_sched_is_bound(td); set_cpu(cpu, td); critical_enter(); rdmsr_safe(MSR_BIOS_SIGN, &rev0); /* Get current microcode revision. */ /* * Perform update. */ wrmsr_safe(MSR_BIOS_UPDT_TRIG, (uintptr_t)(ptr)); do_cpuid(1, tmp); /* * Result are in low byte of MSR FCR5: * 0x00: No update has been attempted since RESET. * 0x01: The last attempted update was successful. * 0x02: The last attempted update was unsuccessful due to a bad * environment. No update was loaded and any preexisting * patches are still active. * 0x03: The last attempted update was not applicable to this processor. * No update was loaded and any preexisting patches are still * active. * 0x04: The last attempted update was not successful due to an invalid * update data block. No update was loaded and any preexisting * patches are still active */ rdmsr_safe(0x1205, &res); res &= 0xff; critical_exit(); rdmsr_safe(MSR_BIOS_SIGN, &rev1); /* Get new microcode revision. */ restore_cpu(oldcpu, is_bound, td); DPRINTF("[cpu,%d]: rev0=%x rev1=%x res=%x\n", __LINE__, (unsigned)(rev0 >> 32), (unsigned)(rev1 >> 32), (unsigned)res); if (res != 0x01) ret = EINVAL; else ret = 0; fail: free(ptr, M_CPUCTL); return (ret); } static int cpuctl_do_eval_cpu_features(int cpu, struct thread *td) { int is_bound = 0; int oldcpu; KASSERT(cpu >= 0 && cpu <= mp_maxid, ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu)); #ifdef __i386__ if (cpu_id == 0) return (ENODEV); #endif oldcpu = td->td_oncpu; is_bound = cpu_sched_is_bound(td); set_cpu(cpu, td); identify_cpu1(); identify_cpu2(); restore_cpu(oldcpu, is_bound, td); hw_ibrs_recalculate(true); hw_ssb_recalculate(true); #ifdef __amd64__ amd64_syscall_ret_flush_l1d_recalc(); pmap_allow_2m_x_ept_recalculate(); #endif hw_mds_recalculate(); x86_taa_recalculate(); x86_rngds_mitg_recalculate(true); + zenbleed_check_and_apply(true); printcpuinfo(); return (0); } int cpuctl_open(struct cdev *dev, int flags, int fmt __unused, struct thread *td) { int ret = 0; int cpu; cpu = dev2unit(dev); if (cpu > mp_maxid || !cpu_enabled(cpu)) { DPRINTF("[cpuctl,%d]: incorrect cpu number %d\n", __LINE__, cpu); return (ENXIO); } if (flags & FWRITE) ret = securelevel_gt(td->td_ucred, 0); return (ret); } static int cpuctl_modevent(module_t mod __unused, int type, void *data __unused) { int cpu; switch(type) { case MOD_LOAD: if (bootverbose) printf("cpuctl: access to MSR registers/cpuid info.\n"); cpuctl_devs = malloc(sizeof(*cpuctl_devs) * (mp_maxid + 1), M_CPUCTL, M_WAITOK | M_ZERO); CPU_FOREACH(cpu) if (cpu_enabled(cpu)) cpuctl_devs[cpu] = make_dev(&cpuctl_cdevsw, cpu, UID_ROOT, GID_KMEM, 0640, "cpuctl%d", cpu); break; case MOD_UNLOAD: CPU_FOREACH(cpu) { if (cpuctl_devs[cpu] != NULL) destroy_dev(cpuctl_devs[cpu]); } free(cpuctl_devs, M_CPUCTL); break; case MOD_SHUTDOWN: break; default: return (EOPNOTSUPP); } return (0); } DEV_MODULE(cpuctl, cpuctl_modevent, NULL); MODULE_VERSION(cpuctl, CPUCTL_VERSION); diff --git a/sys/x86/include/specialreg.h b/sys/x86/include/specialreg.h index f45990a056c8..4886430c84b6 100644 --- a/sys/x86/include/specialreg.h +++ b/sys/x86/include/specialreg.h @@ -1,1212 +1,1213 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)specialreg.h 7.1 (Berkeley) 5/9/91 */ #ifndef _MACHINE_SPECIALREG_H_ #define _MACHINE_SPECIALREG_H_ /* * Bits in 386 special registers: */ #define CR0_PE 0x00000001 /* Protected mode Enable */ #define CR0_MP 0x00000002 /* "Math" (fpu) Present */ #define CR0_EM 0x00000004 /* EMulate FPU instructions. (trap ESC only) */ #define CR0_TS 0x00000008 /* Task Switched (if MP, trap ESC and WAIT) */ #define CR0_PG 0x80000000 /* PaGing enable */ /* * Bits in 486 special registers: */ #define CR0_NE 0x00000020 /* Numeric Error enable (EX16 vs IRQ13) */ #define CR0_WP 0x00010000 /* Write Protect (honor page protect in all modes) */ #define CR0_AM 0x00040000 /* Alignment Mask (set to enable AC flag) */ #define CR0_NW 0x20000000 /* Not Write-through */ #define CR0_CD 0x40000000 /* Cache Disable */ #define CR3_PCID_SAVE 0x8000000000000000 #define CR3_PCID_MASK 0xfff /* * Bits in PPro special registers */ #define CR4_VME 0x00000001 /* Virtual 8086 mode extensions */ #define CR4_PVI 0x00000002 /* Protected-mode virtual interrupts */ #define CR4_TSD 0x00000004 /* Time stamp disable */ #define CR4_DE 0x00000008 /* Debugging extensions */ #define CR4_PSE 0x00000010 /* Page size extensions */ #define CR4_PAE 0x00000020 /* Physical address extension */ #define CR4_MCE 0x00000040 /* Machine check enable */ #define CR4_PGE 0x00000080 /* Page global enable */ #define CR4_PCE 0x00000100 /* Performance monitoring counter enable */ #define CR4_FXSR 0x00000200 /* Fast FPU save/restore used by OS */ #define CR4_XMM 0x00000400 /* enable SIMD/MMX2 to use except 16 */ #define CR4_UMIP 0x00000800 /* User Mode Instruction Prevention */ #define CR4_LA57 0x00001000 /* Enable 5-level paging */ #define CR4_VMXE 0x00002000 /* enable VMX operation (Intel-specific) */ #define CR4_FSGSBASE 0x00010000 /* Enable FS/GS BASE accessing instructions */ #define CR4_PCIDE 0x00020000 /* Enable Context ID */ #define CR4_XSAVE 0x00040000 /* XSETBV/XGETBV */ #define CR4_SMEP 0x00100000 /* Supervisor-Mode Execution Prevention */ #define CR4_SMAP 0x00200000 /* Supervisor-Mode Access Prevention */ #define CR4_PKE 0x00400000 /* Protection Keys Enable */ /* * Bits in AMD64 special registers. EFER is 64 bits wide. */ #define EFER_SCE 0x000000001 /* System Call Extensions (R/W) */ #define EFER_LME 0x000000100 /* Long mode enable (R/W) */ #define EFER_LMA 0x000000400 /* Long mode active (R) */ #define EFER_NXE 0x000000800 /* PTE No-Execute bit enable (R/W) */ #define EFER_SVM 0x000001000 /* SVM enable bit for AMD, reserved for Intel */ #define EFER_LMSLE 0x000002000 /* Long Mode Segment Limit Enable */ #define EFER_FFXSR 0x000004000 /* Fast FXSAVE/FSRSTOR */ #define EFER_TCE 0x000008000 /* Translation Cache Extension */ #define EFER_MCOMMIT 0x00020000 /* Enable MCOMMIT (AMD) */ /* * Intel Extended Features registers */ #define XCR0 0 /* XFEATURE_ENABLED_MASK register */ #define XFEATURE_ENABLED_X87 0x00000001 #define XFEATURE_ENABLED_SSE 0x00000002 #define XFEATURE_ENABLED_YMM_HI128 0x00000004 #define XFEATURE_ENABLED_AVX XFEATURE_ENABLED_YMM_HI128 #define XFEATURE_ENABLED_BNDREGS 0x00000008 #define XFEATURE_ENABLED_BNDCSR 0x00000010 #define XFEATURE_ENABLED_OPMASK 0x00000020 #define XFEATURE_ENABLED_ZMM_HI256 0x00000040 #define XFEATURE_ENABLED_HI16_ZMM 0x00000080 #define XFEATURE_ENABLED_PKRU 0x00000200 #define XFEATURE_ENABLED_TILECONFIG 0x00020000 #define XFEATURE_ENABLED_TILEDATA 0x00040000 #define XFEATURE_AVX \ (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX) #define XFEATURE_AVX512 \ (XFEATURE_ENABLED_OPMASK | XFEATURE_ENABLED_ZMM_HI256 | \ XFEATURE_ENABLED_HI16_ZMM) #define XFEATURE_MPX \ (XFEATURE_ENABLED_BNDREGS | XFEATURE_ENABLED_BNDCSR) /* * CPUID instruction features register */ #define CPUID_FPU 0x00000001 #define CPUID_VME 0x00000002 #define CPUID_DE 0x00000004 #define CPUID_PSE 0x00000008 #define CPUID_TSC 0x00000010 #define CPUID_MSR 0x00000020 #define CPUID_PAE 0x00000040 #define CPUID_MCE 0x00000080 #define CPUID_CX8 0x00000100 #define CPUID_APIC 0x00000200 #define CPUID_B10 0x00000400 #define CPUID_SEP 0x00000800 #define CPUID_MTRR 0x00001000 #define CPUID_PGE 0x00002000 #define CPUID_MCA 0x00004000 #define CPUID_CMOV 0x00008000 #define CPUID_PAT 0x00010000 #define CPUID_PSE36 0x00020000 #define CPUID_PSN 0x00040000 #define CPUID_CLFSH 0x00080000 #define CPUID_B20 0x00100000 #define CPUID_DS 0x00200000 #define CPUID_ACPI 0x00400000 #define CPUID_MMX 0x00800000 #define CPUID_FXSR 0x01000000 #define CPUID_SSE 0x02000000 #define CPUID_XMM 0x02000000 #define CPUID_SSE2 0x04000000 #define CPUID_SS 0x08000000 #define CPUID_HTT 0x10000000 #define CPUID_TM 0x20000000 #define CPUID_IA64 0x40000000 #define CPUID_PBE 0x80000000 #define CPUID2_SSE3 0x00000001 #define CPUID2_PCLMULQDQ 0x00000002 #define CPUID2_DTES64 0x00000004 #define CPUID2_MON 0x00000008 #define CPUID2_DS_CPL 0x00000010 #define CPUID2_VMX 0x00000020 #define CPUID2_SMX 0x00000040 #define CPUID2_EST 0x00000080 #define CPUID2_TM2 0x00000100 #define CPUID2_SSSE3 0x00000200 #define CPUID2_CNXTID 0x00000400 #define CPUID2_SDBG 0x00000800 #define CPUID2_FMA 0x00001000 #define CPUID2_CX16 0x00002000 #define CPUID2_XTPR 0x00004000 #define CPUID2_PDCM 0x00008000 #define CPUID2_PCID 0x00020000 #define CPUID2_DCA 0x00040000 #define CPUID2_SSE41 0x00080000 #define CPUID2_SSE42 0x00100000 #define CPUID2_X2APIC 0x00200000 #define CPUID2_MOVBE 0x00400000 #define CPUID2_POPCNT 0x00800000 #define CPUID2_TSCDLT 0x01000000 #define CPUID2_AESNI 0x02000000 #define CPUID2_XSAVE 0x04000000 #define CPUID2_OSXSAVE 0x08000000 #define CPUID2_AVX 0x10000000 #define CPUID2_F16C 0x20000000 #define CPUID2_RDRAND 0x40000000 #define CPUID2_HV 0x80000000 /* Intel Processor Trace CPUID. */ /* Leaf 0 ebx. */ #define CPUPT_CR3 (1 << 0) /* CR3 Filtering Support */ #define CPUPT_PSB (1 << 1) /* Configurable PSB and Cycle-Accurate Mode Supported */ #define CPUPT_IPF (1 << 2) /* IP Filtering and TraceStop supported */ #define CPUPT_MTC (1 << 3) /* MTC Supported */ #define CPUPT_PRW (1 << 4) /* PTWRITE Supported */ #define CPUPT_PWR (1 << 5) /* Power Event Trace Supported */ /* Leaf 0 ecx. */ #define CPUPT_TOPA (1 << 0) /* ToPA Output Supported */ #define CPUPT_TOPA_MULTI (1 << 1) /* ToPA Tables Allow Multiple Output Entries */ #define CPUPT_SINGLE (1 << 2) /* Single-Range Output Supported */ #define CPUPT_TT_OUT (1 << 3) /* Output to Trace Transport Subsystem Supported */ #define CPUPT_LINEAR_IP (1 << 31) /* IP Payloads are Linear IP, otherwise IP is effective */ /* Leaf 1 eax. */ #define CPUPT_NADDR_S 0 /* Number of Address Ranges */ #define CPUPT_NADDR_M (0x7 << CPUPT_NADDR_S) #define CPUPT_MTC_BITMAP_S 16 /* Bitmap of supported MTC Period Encodings */ #define CPUPT_MTC_BITMAP_M (0xffff << CPUPT_MTC_BITMAP_S) /* Leaf 1 ebx. */ #define CPUPT_CT_BITMAP_S 0 /* Bitmap of supported Cycle Threshold values */ #define CPUPT_CT_BITMAP_M (0xffff << CPUPT_CT_BITMAP_S) #define CPUPT_PFE_BITMAP_S 16 /* Bitmap of supported Configurable PSB Frequency encoding */ #define CPUPT_PFE_BITMAP_M (0xffff << CPUPT_PFE_BITMAP_S) /* * Important bits in the AMD extended cpuid flags */ #define AMDID_SYSCALL 0x00000800 #define AMDID_MP 0x00080000 #define AMDID_NX 0x00100000 #define AMDID_EXT_MMX 0x00400000 #define AMDID_FFXSR 0x02000000 #define AMDID_PAGE1GB 0x04000000 #define AMDID_RDTSCP 0x08000000 #define AMDID_LM 0x20000000 #define AMDID_EXT_3DNOW 0x40000000 #define AMDID_3DNOW 0x80000000 #define AMDID2_LAHF 0x00000001 #define AMDID2_CMP 0x00000002 #define AMDID2_SVM 0x00000004 #define AMDID2_EXT_APIC 0x00000008 #define AMDID2_CR8 0x00000010 #define AMDID2_ABM 0x00000020 #define AMDID2_SSE4A 0x00000040 #define AMDID2_MAS 0x00000080 #define AMDID2_PREFETCH 0x00000100 #define AMDID2_OSVW 0x00000200 #define AMDID2_IBS 0x00000400 #define AMDID2_XOP 0x00000800 #define AMDID2_SKINIT 0x00001000 #define AMDID2_WDT 0x00002000 #define AMDID2_LWP 0x00008000 #define AMDID2_FMA4 0x00010000 #define AMDID2_TCE 0x00020000 #define AMDID2_NODE_ID 0x00080000 #define AMDID2_TBM 0x00200000 #define AMDID2_TOPOLOGY 0x00400000 #define AMDID2_PCXC 0x00800000 #define AMDID2_PNXC 0x01000000 #define AMDID2_DBE 0x04000000 #define AMDID2_PTSC 0x08000000 #define AMDID2_PTSCEL2I 0x10000000 #define AMDID2_MWAITX 0x20000000 /* * CPUID instruction 1 eax info */ #define CPUID_STEPPING 0x0000000f #define CPUID_MODEL 0x000000f0 #define CPUID_FAMILY 0x00000f00 #define CPUID_EXT_MODEL 0x000f0000 #define CPUID_EXT_FAMILY 0x0ff00000 #ifdef __i386__ #define CPUID_TO_MODEL(id) \ ((((id) & CPUID_MODEL) >> 4) | \ ((((id) & CPUID_FAMILY) >= 0x600) ? \ (((id) & CPUID_EXT_MODEL) >> 12) : 0)) #define CPUID_TO_FAMILY(id) \ ((((id) & CPUID_FAMILY) >> 8) + \ ((((id) & CPUID_FAMILY) == 0xf00) ? \ (((id) & CPUID_EXT_FAMILY) >> 20) : 0)) #else #define CPUID_TO_MODEL(id) \ ((((id) & CPUID_MODEL) >> 4) | \ (((id) & CPUID_EXT_MODEL) >> 12)) #define CPUID_TO_FAMILY(id) \ ((((id) & CPUID_FAMILY) >> 8) + \ (((id) & CPUID_EXT_FAMILY) >> 20)) #endif #define CPUID_TO_STEPPING(id) ((id) & CPUID_STEPPING) /* * CPUID instruction 1 ebx info */ #define CPUID_BRAND_INDEX 0x000000ff #define CPUID_CLFUSH_SIZE 0x0000ff00 #define CPUID_HTT_CORES 0x00ff0000 #define CPUID_LOCAL_APIC_ID 0xff000000 /* * CPUID instruction 5 info */ #define CPUID5_MON_MIN_SIZE 0x0000ffff /* eax */ #define CPUID5_MON_MAX_SIZE 0x0000ffff /* ebx */ #define CPUID5_MON_MWAIT_EXT 0x00000001 /* ecx */ #define CPUID5_MWAIT_INTRBREAK 0x00000002 /* ecx */ /* * MWAIT cpu power states. Lower 4 bits are sub-states. */ #define MWAIT_C0 0xf0 #define MWAIT_C1 0x00 #define MWAIT_C2 0x10 #define MWAIT_C3 0x20 #define MWAIT_C4 0x30 /* * MWAIT extensions. */ /* Interrupt breaks MWAIT even when masked. */ #define MWAIT_INTRBREAK 0x00000001 /* * CPUID leaf 6: Thermal and Power management. */ /* Eax. */ #define CPUTPM1_SENSOR 0x00000001 #define CPUTPM1_TURBO 0x00000002 #define CPUTPM1_ARAT 0x00000004 #define CPUTPM1_PLN 0x00000010 #define CPUTPM1_ECMD 0x00000020 #define CPUTPM1_PTM 0x00000040 #define CPUTPM1_HWP 0x00000080 #define CPUTPM1_HWP_NOTIFICATION 0x00000100 #define CPUTPM1_HWP_ACTIVITY_WINDOW 0x00000200 #define CPUTPM1_HWP_PERF_PREF 0x00000400 #define CPUTPM1_HWP_PKG 0x00000800 #define CPUTPM1_HDC 0x00002000 #define CPUTPM1_TURBO30 0x00004000 #define CPUTPM1_HWP_CAPABILITIES 0x00008000 #define CPUTPM1_HWP_PECI_OVR 0x00010000 #define CPUTPM1_HWP_FLEXIBLE 0x00020000 #define CPUTPM1_HWP_FAST_MSR 0x00040000 #define CPUTPM1_HW_FEEDBACK 0x00080000 #define CPUTPM1_HWP_IGN_IDLE 0x00100000 #define CPUTPM1_THREAD_DIRECTOR 0x00800000 /* Ebx. */ #define CPUTPM_B_NSENSINTTHRESH 0x0000000f /* Ecx. */ #define CPUID_PERF_STAT 0x00000001 #define CPUID_PERF_BIAS 0x00000008 /* * CPUID instruction 0xb ebx info. */ #define CPUID_TYPE_INVAL 0 #define CPUID_TYPE_SMT 1 #define CPUID_TYPE_CORE 2 /* * CPUID instruction 0xd Processor Extended State Enumeration Sub-leaf 1 */ #define CPUID_EXTSTATE_XSAVEOPT 0x00000001 #define CPUID_EXTSTATE_XSAVEC 0x00000002 #define CPUID_EXTSTATE_XINUSE 0x00000004 #define CPUID_EXTSTATE_XSAVES 0x00000008 /* * AMD extended function 8000_0007h ebx info */ #define AMDRAS_MCA_OF_RECOV 0x00000001 #define AMDRAS_SUCCOR 0x00000002 #define AMDRAS_HW_ASSERT 0x00000004 #define AMDRAS_SCALABLE_MCA 0x00000008 #define AMDRAS_PFEH_SUPPORT 0x00000010 /* * AMD extended function 8000_0007h edx info */ #define AMDPM_TS 0x00000001 #define AMDPM_FID 0x00000002 #define AMDPM_VID 0x00000004 #define AMDPM_TTP 0x00000008 #define AMDPM_TM 0x00000010 #define AMDPM_STC 0x00000020 #define AMDPM_100MHZ_STEPS 0x00000040 #define AMDPM_HW_PSTATE 0x00000080 #define AMDPM_TSC_INVARIANT 0x00000100 #define AMDPM_CPB 0x00000200 /* * AMD extended function 8000_0008h ebx info (amd_extended_feature_extensions) */ #define AMDFEID_CLZERO 0x00000001 #define AMDFEID_IRPERF 0x00000002 #define AMDFEID_XSAVEERPTR 0x00000004 #define AMDFEID_RDPRU 0x00000010 #define AMDFEID_MCOMMIT 0x00000100 #define AMDFEID_WBNOINVD 0x00000200 #define AMDFEID_IBPB 0x00001000 #define AMDFEID_IBRS 0x00004000 #define AMDFEID_STIBP 0x00008000 /* The below are only defined if the corresponding base feature above exists. */ #define AMDFEID_IBRS_ALWAYSON 0x00010000 #define AMDFEID_STIBP_ALWAYSON 0x00020000 #define AMDFEID_PREFER_IBRS 0x00040000 #define AMDFEID_PPIN 0x00800000 #define AMDFEID_SSBD 0x01000000 /* SSBD via MSRC001_011F instead of MSR 0x48: */ #define AMDFEID_VIRT_SSBD 0x02000000 #define AMDFEID_SSB_NO 0x04000000 /* * AMD extended function 8000_0008h ecx info */ #define AMDID_CMP_CORES 0x000000ff #define AMDID_COREID_SIZE 0x0000f000 #define AMDID_COREID_SIZE_SHIFT 12 /* * CPUID instruction 7 Structured Extended Features, leaf 0 ebx info */ #define CPUID_STDEXT_FSGSBASE 0x00000001 #define CPUID_STDEXT_TSC_ADJUST 0x00000002 #define CPUID_STDEXT_SGX 0x00000004 #define CPUID_STDEXT_BMI1 0x00000008 #define CPUID_STDEXT_HLE 0x00000010 #define CPUID_STDEXT_AVX2 0x00000020 #define CPUID_STDEXT_FDP_EXC 0x00000040 #define CPUID_STDEXT_SMEP 0x00000080 #define CPUID_STDEXT_BMI2 0x00000100 #define CPUID_STDEXT_ERMS 0x00000200 #define CPUID_STDEXT_INVPCID 0x00000400 #define CPUID_STDEXT_RTM 0x00000800 #define CPUID_STDEXT_PQM 0x00001000 #define CPUID_STDEXT_NFPUSG 0x00002000 #define CPUID_STDEXT_MPX 0x00004000 #define CPUID_STDEXT_PQE 0x00008000 #define CPUID_STDEXT_AVX512F 0x00010000 #define CPUID_STDEXT_AVX512DQ 0x00020000 #define CPUID_STDEXT_RDSEED 0x00040000 #define CPUID_STDEXT_ADX 0x00080000 #define CPUID_STDEXT_SMAP 0x00100000 #define CPUID_STDEXT_AVX512IFMA 0x00200000 /* Formerly PCOMMIT */ #define CPUID_STDEXT_CLFLUSHOPT 0x00800000 #define CPUID_STDEXT_CLWB 0x01000000 #define CPUID_STDEXT_PROCTRACE 0x02000000 #define CPUID_STDEXT_AVX512PF 0x04000000 #define CPUID_STDEXT_AVX512ER 0x08000000 #define CPUID_STDEXT_AVX512CD 0x10000000 #define CPUID_STDEXT_SHA 0x20000000 #define CPUID_STDEXT_AVX512BW 0x40000000 #define CPUID_STDEXT_AVX512VL 0x80000000 /* * CPUID instruction 7 Structured Extended Features, leaf 0 ecx info */ #define CPUID_STDEXT2_PREFETCHWT1 0x00000001 #define CPUID_STDEXT2_AVX512VBMI 0x00000002 #define CPUID_STDEXT2_UMIP 0x00000004 #define CPUID_STDEXT2_PKU 0x00000008 #define CPUID_STDEXT2_OSPKE 0x00000010 #define CPUID_STDEXT2_WAITPKG 0x00000020 #define CPUID_STDEXT2_AVX512VBMI2 0x00000040 #define CPUID_STDEXT2_GFNI 0x00000100 #define CPUID_STDEXT2_VAES 0x00000200 #define CPUID_STDEXT2_VPCLMULQDQ 0x00000400 #define CPUID_STDEXT2_AVX512VNNI 0x00000800 #define CPUID_STDEXT2_AVX512BITALG 0x00001000 #define CPUID_STDEXT2_TME 0x00002000 #define CPUID_STDEXT2_AVX512VPOPCNTDQ 0x00004000 #define CPUID_STDEXT2_LA57 0x00010000 #define CPUID_STDEXT2_RDPID 0x00400000 #define CPUID_STDEXT2_CLDEMOTE 0x02000000 #define CPUID_STDEXT2_MOVDIRI 0x08000000 #define CPUID_STDEXT2_MOVDIR64B 0x10000000 #define CPUID_STDEXT2_ENQCMD 0x20000000 #define CPUID_STDEXT2_SGXLC 0x40000000 /* * CPUID instruction 7 Structured Extended Features, leaf 0 edx info */ #define CPUID_STDEXT3_AVX5124VNNIW 0x00000004 #define CPUID_STDEXT3_AVX5124FMAPS 0x00000008 #define CPUID_STDEXT3_FSRM 0x00000010 #define CPUID_STDEXT3_AVX512VP2INTERSECT 0x00000100 #define CPUID_STDEXT3_MCUOPT 0x00000200 #define CPUID_STDEXT3_MD_CLEAR 0x00000400 #define CPUID_STDEXT3_TSXFA 0x00002000 #define CPUID_STDEXT3_PCONFIG 0x00040000 #define CPUID_STDEXT3_IBPB 0x04000000 #define CPUID_STDEXT3_STIBP 0x08000000 #define CPUID_STDEXT3_L1D_FLUSH 0x10000000 #define CPUID_STDEXT3_ARCH_CAP 0x20000000 #define CPUID_STDEXT3_CORE_CAP 0x40000000 #define CPUID_STDEXT3_SSBD 0x80000000 /* CPUID_HYBRID_ID leaf 0x1a */ #define CPUID_HYBRID_CORE_MASK 0xff000000 #define CPUID_HYBRID_SMALL_CORE 0x20000000 #define CPUID_HYBRID_LARGE_CORE 0x40000000 /* MSR IA32_ARCH_CAP(ABILITIES) bits */ #define IA32_ARCH_CAP_RDCL_NO 0x00000001 #define IA32_ARCH_CAP_IBRS_ALL 0x00000002 #define IA32_ARCH_CAP_RSBA 0x00000004 #define IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY 0x00000008 #define IA32_ARCH_CAP_SSB_NO 0x00000010 #define IA32_ARCH_CAP_MDS_NO 0x00000020 #define IA32_ARCH_CAP_IF_PSCHANGE_MC_NO 0x00000040 #define IA32_ARCH_CAP_TSX_CTRL 0x00000080 #define IA32_ARCH_CAP_TAA_NO 0x00000100 /* MSR IA32_TSX_CTRL bits */ #define IA32_TSX_CTRL_RTM_DISABLE 0x00000001 #define IA32_TSX_CTRL_TSX_CPUID_CLEAR 0x00000002 /* * CPUID manufacturers identifiers */ #define AMD_VENDOR_ID "AuthenticAMD" #define CENTAUR_VENDOR_ID "CentaurHauls" #define CYRIX_VENDOR_ID "CyrixInstead" #define INTEL_VENDOR_ID "GenuineIntel" #define NEXGEN_VENDOR_ID "NexGenDriven" #define NSC_VENDOR_ID "Geode by NSC" #define RISE_VENDOR_ID "RiseRiseRise" #define SIS_VENDOR_ID "SiS SiS SiS " #define TRANSMETA_VENDOR_ID "GenuineTMx86" #define UMC_VENDOR_ID "UMC UMC UMC " #define HYGON_VENDOR_ID "HygonGenuine" /* * Model-specific registers for the i386 family */ #define MSR_P5_MC_ADDR 0x000 #define MSR_P5_MC_TYPE 0x001 #define MSR_TSC 0x010 #define MSR_P5_CESR 0x011 #define MSR_P5_CTR0 0x012 #define MSR_P5_CTR1 0x013 #define MSR_IA32_PLATFORM_ID 0x017 #define MSR_APICBASE 0x01b #define MSR_EBL_CR_POWERON 0x02a #define MSR_TEST_CTL 0x033 #define MSR_IA32_FEATURE_CONTROL 0x03a #define MSR_IA32_SPEC_CTRL 0x048 #define MSR_IA32_PRED_CMD 0x049 #define MSR_BIOS_UPDT_TRIG 0x079 #define MSR_BBL_CR_D0 0x088 #define MSR_BBL_CR_D1 0x089 #define MSR_BBL_CR_D2 0x08a #define MSR_BIOS_SIGN 0x08b #define MSR_PERFCTR0 0x0c1 #define MSR_PERFCTR1 0x0c2 #define MSR_PLATFORM_INFO 0x0ce #define MSR_MPERF 0x0e7 #define MSR_APERF 0x0e8 #define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */ #define MSR_MTRRcap 0x0fe #define MSR_IA32_ARCH_CAP 0x10a #define MSR_IA32_FLUSH_CMD 0x10b #define MSR_TSX_FORCE_ABORT 0x10f #define MSR_BBL_CR_ADDR 0x116 #define MSR_BBL_CR_DECC 0x118 #define MSR_BBL_CR_CTL 0x119 #define MSR_BBL_CR_TRIG 0x11a #define MSR_BBL_CR_BUSY 0x11b #define MSR_BBL_CR_CTL3 0x11e #define MSR_IA32_TSX_CTRL 0x122 #define MSR_IA32_MCU_OPT_CTRL 0x123 #define MSR_MISC_FEATURE_ENABLES 0x140 #define MSR_SYSENTER_CS_MSR 0x174 #define MSR_SYSENTER_ESP_MSR 0x175 #define MSR_SYSENTER_EIP_MSR 0x176 #define MSR_MCG_CAP 0x179 #define MSR_MCG_STATUS 0x17a #define MSR_MCG_CTL 0x17b #define MSR_EVNTSEL0 0x186 #define MSR_EVNTSEL1 0x187 #define MSR_THERM_CONTROL 0x19a #define MSR_THERM_INTERRUPT 0x19b #define MSR_THERM_STATUS 0x19c #define MSR_IA32_MISC_ENABLE 0x1a0 #define MSR_IA32_TEMPERATURE_TARGET 0x1a2 #define MSR_TURBO_RATIO_LIMIT 0x1ad #define MSR_TURBO_RATIO_LIMIT1 0x1ae #define MSR_IA32_ENERGY_PERF_BIAS 0x1b0 #define MSR_DEBUGCTLMSR 0x1d9 #define MSR_LASTBRANCHFROMIP 0x1db #define MSR_LASTBRANCHTOIP 0x1dc #define MSR_LASTINTFROMIP 0x1dd #define MSR_LASTINTTOIP 0x1de #define MSR_ROB_CR_BKUPTMPDR6 0x1e0 #define MSR_MTRRVarBase 0x200 #define MSR_MTRR64kBase 0x250 #define MSR_MTRR16kBase 0x258 #define MSR_MTRR4kBase 0x268 #define MSR_PAT 0x277 #define MSR_MC0_CTL2 0x280 #define MSR_MTRRdefType 0x2ff #define MSR_MC0_CTL 0x400 #define MSR_MC0_STATUS 0x401 #define MSR_MC0_ADDR 0x402 #define MSR_MC0_MISC 0x403 #define MSR_MC1_CTL 0x404 #define MSR_MC1_STATUS 0x405 #define MSR_MC1_ADDR 0x406 #define MSR_MC1_MISC 0x407 #define MSR_MC2_CTL 0x408 #define MSR_MC2_STATUS 0x409 #define MSR_MC2_ADDR 0x40a #define MSR_MC2_MISC 0x40b #define MSR_MC3_CTL 0x40c #define MSR_MC3_STATUS 0x40d #define MSR_MC3_ADDR 0x40e #define MSR_MC3_MISC 0x40f #define MSR_MC4_CTL 0x410 #define MSR_MC4_STATUS 0x411 #define MSR_MC4_ADDR 0x412 #define MSR_MC4_MISC 0x413 #define MSR_MCG_EXT_CTL 0x4d0 #define MSR_RAPL_POWER_UNIT 0x606 #define MSR_PKG_ENERGY_STATUS 0x611 #define MSR_DRAM_ENERGY_STATUS 0x619 #define MSR_PP0_ENERGY_STATUS 0x639 #define MSR_PP1_ENERGY_STATUS 0x641 #define MSR_PPERF 0x64e #define MSR_TSC_DEADLINE 0x6e0 /* Writes are not serializing */ #define MSR_IA32_PM_ENABLE 0x770 #define MSR_IA32_HWP_CAPABILITIES 0x771 #define MSR_IA32_HWP_REQUEST_PKG 0x772 #define MSR_IA32_HWP_INTERRUPT 0x773 #define MSR_IA32_HWP_REQUEST 0x774 #define MSR_IA32_HWP_STATUS 0x777 /* * VMX MSRs */ #define MSR_VMX_BASIC 0x480 #define MSR_VMX_PINBASED_CTLS 0x481 #define MSR_VMX_PROCBASED_CTLS 0x482 #define MSR_VMX_EXIT_CTLS 0x483 #define MSR_VMX_ENTRY_CTLS 0x484 #define MSR_VMX_CR0_FIXED0 0x486 #define MSR_VMX_CR0_FIXED1 0x487 #define MSR_VMX_CR4_FIXED0 0x488 #define MSR_VMX_CR4_FIXED1 0x489 #define MSR_VMX_PROCBASED_CTLS2 0x48b #define MSR_VMX_EPT_VPID_CAP 0x48c #define MSR_VMX_TRUE_PINBASED_CTLS 0x48d #define MSR_VMX_TRUE_PROCBASED_CTLS 0x48e #define MSR_VMX_TRUE_EXIT_CTLS 0x48f #define MSR_VMX_TRUE_ENTRY_CTLS 0x490 /* * X2APIC MSRs. * Writes are not serializing. */ #define MSR_APIC_000 0x800 #define MSR_APIC_ID 0x802 #define MSR_APIC_VERSION 0x803 #define MSR_APIC_TPR 0x808 #define MSR_APIC_EOI 0x80b #define MSR_APIC_LDR 0x80d #define MSR_APIC_SVR 0x80f #define MSR_APIC_ISR0 0x810 #define MSR_APIC_ISR1 0x811 #define MSR_APIC_ISR2 0x812 #define MSR_APIC_ISR3 0x813 #define MSR_APIC_ISR4 0x814 #define MSR_APIC_ISR5 0x815 #define MSR_APIC_ISR6 0x816 #define MSR_APIC_ISR7 0x817 #define MSR_APIC_TMR0 0x818 #define MSR_APIC_IRR0 0x820 #define MSR_APIC_ESR 0x828 #define MSR_APIC_LVT_CMCI 0x82F #define MSR_APIC_ICR 0x830 #define MSR_APIC_LVT_TIMER 0x832 #define MSR_APIC_LVT_THERMAL 0x833 #define MSR_APIC_LVT_PCINT 0x834 #define MSR_APIC_LVT_LINT0 0x835 #define MSR_APIC_LVT_LINT1 0x836 #define MSR_APIC_LVT_ERROR 0x837 #define MSR_APIC_ICR_TIMER 0x838 #define MSR_APIC_CCR_TIMER 0x839 #define MSR_APIC_DCR_TIMER 0x83e #define MSR_APIC_SELF_IPI 0x83f #define MSR_IA32_XSS 0xda0 /* * Intel Processor Trace (PT) MSRs. */ #define MSR_IA32_RTIT_OUTPUT_BASE 0x560 /* Trace Output Base Register (R/W) */ #define MSR_IA32_RTIT_OUTPUT_MASK_PTRS 0x561 /* Trace Output Mask Pointers Register (R/W) */ #define MSR_IA32_RTIT_CTL 0x570 /* Trace Control Register (R/W) */ #define RTIT_CTL_TRACEEN (1 << 0) #define RTIT_CTL_CYCEN (1 << 1) #define RTIT_CTL_OS (1 << 2) #define RTIT_CTL_USER (1 << 3) #define RTIT_CTL_PWREVTEN (1 << 4) #define RTIT_CTL_FUPONPTW (1 << 5) #define RTIT_CTL_FABRICEN (1 << 6) #define RTIT_CTL_CR3FILTER (1 << 7) #define RTIT_CTL_TOPA (1 << 8) #define RTIT_CTL_MTCEN (1 << 9) #define RTIT_CTL_TSCEN (1 << 10) #define RTIT_CTL_DISRETC (1 << 11) #define RTIT_CTL_PTWEN (1 << 12) #define RTIT_CTL_BRANCHEN (1 << 13) #define RTIT_CTL_MTC_FREQ_S 14 #define RTIT_CTL_MTC_FREQ(n) ((n) << RTIT_CTL_MTC_FREQ_S) #define RTIT_CTL_MTC_FREQ_M (0xf << RTIT_CTL_MTC_FREQ_S) #define RTIT_CTL_CYC_THRESH_S 19 #define RTIT_CTL_CYC_THRESH_M (0xf << RTIT_CTL_CYC_THRESH_S) #define RTIT_CTL_PSB_FREQ_S 24 #define RTIT_CTL_PSB_FREQ_M (0xf << RTIT_CTL_PSB_FREQ_S) #define RTIT_CTL_ADDR_CFG_S(n) (32 + (n) * 4) #define RTIT_CTL_ADDR0_CFG_S 32 #define RTIT_CTL_ADDR0_CFG_M (0xfULL << RTIT_CTL_ADDR0_CFG_S) #define RTIT_CTL_ADDR1_CFG_S 36 #define RTIT_CTL_ADDR1_CFG_M (0xfULL << RTIT_CTL_ADDR1_CFG_S) #define RTIT_CTL_ADDR2_CFG_S 40 #define RTIT_CTL_ADDR2_CFG_M (0xfULL << RTIT_CTL_ADDR2_CFG_S) #define RTIT_CTL_ADDR3_CFG_S 44 #define RTIT_CTL_ADDR3_CFG_M (0xfULL << RTIT_CTL_ADDR3_CFG_S) #define MSR_IA32_RTIT_STATUS 0x571 /* Tracing Status Register (R/W) */ #define RTIT_STATUS_FILTEREN (1 << 0) #define RTIT_STATUS_CONTEXTEN (1 << 1) #define RTIT_STATUS_TRIGGEREN (1 << 2) #define RTIT_STATUS_ERROR (1 << 4) #define RTIT_STATUS_STOPPED (1 << 5) #define RTIT_STATUS_PACKETBYTECNT_S 32 #define RTIT_STATUS_PACKETBYTECNT_M (0x1ffffULL << RTIT_STATUS_PACKETBYTECNT_S) #define MSR_IA32_RTIT_CR3_MATCH 0x572 /* Trace Filter CR3 Match Register (R/W) */ #define MSR_IA32_RTIT_ADDR_A(n) (0x580 + (n) * 2) #define MSR_IA32_RTIT_ADDR_B(n) (0x581 + (n) * 2) #define MSR_IA32_RTIT_ADDR0_A 0x580 /* Region 0 Start Address (R/W) */ #define MSR_IA32_RTIT_ADDR0_B 0x581 /* Region 0 End Address (R/W) */ #define MSR_IA32_RTIT_ADDR1_A 0x582 /* Region 1 Start Address (R/W) */ #define MSR_IA32_RTIT_ADDR1_B 0x583 /* Region 1 End Address (R/W) */ #define MSR_IA32_RTIT_ADDR2_A 0x584 /* Region 2 Start Address (R/W) */ #define MSR_IA32_RTIT_ADDR2_B 0x585 /* Region 2 End Address (R/W) */ #define MSR_IA32_RTIT_ADDR3_A 0x586 /* Region 3 Start Address (R/W) */ #define MSR_IA32_RTIT_ADDR3_B 0x587 /* Region 3 End Address (R/W) */ /* Intel Processor Trace Table of Physical Addresses (ToPA). */ #define TOPA_SIZE_S 6 #define TOPA_SIZE_M (0xf << TOPA_SIZE_S) #define TOPA_SIZE_4K (0 << TOPA_SIZE_S) #define TOPA_SIZE_8K (1 << TOPA_SIZE_S) #define TOPA_SIZE_16K (2 << TOPA_SIZE_S) #define TOPA_SIZE_32K (3 << TOPA_SIZE_S) #define TOPA_SIZE_64K (4 << TOPA_SIZE_S) #define TOPA_SIZE_128K (5 << TOPA_SIZE_S) #define TOPA_SIZE_256K (6 << TOPA_SIZE_S) #define TOPA_SIZE_512K (7 << TOPA_SIZE_S) #define TOPA_SIZE_1M (8 << TOPA_SIZE_S) #define TOPA_SIZE_2M (9 << TOPA_SIZE_S) #define TOPA_SIZE_4M (10 << TOPA_SIZE_S) #define TOPA_SIZE_8M (11 << TOPA_SIZE_S) #define TOPA_SIZE_16M (12 << TOPA_SIZE_S) #define TOPA_SIZE_32M (13 << TOPA_SIZE_S) #define TOPA_SIZE_64M (14 << TOPA_SIZE_S) #define TOPA_SIZE_128M (15 << TOPA_SIZE_S) #define TOPA_STOP (1 << 4) #define TOPA_INT (1 << 2) #define TOPA_END (1 << 0) /* * Constants related to MSR's. */ #define APICBASE_RESERVED 0x000002ff #define APICBASE_BSP 0x00000100 #define APICBASE_X2APIC 0x00000400 #define APICBASE_ENABLED 0x00000800 #define APICBASE_ADDRESS 0xfffff000 /* MSR_IA32_FEATURE_CONTROL related */ #define IA32_FEATURE_CONTROL_LOCK 0x01 /* lock bit */ #define IA32_FEATURE_CONTROL_SMX_EN 0x02 /* enable VMX inside SMX */ #define IA32_FEATURE_CONTROL_VMX_EN 0x04 /* enable VMX outside SMX */ #define IA32_FEATURE_CONTROL_LMCE_EN 0x100000 /* enable local MCE */ /* MSR IA32_MISC_ENABLE */ #define IA32_MISC_EN_FASTSTR 0x0000000000000001ULL #define IA32_MISC_EN_ATCCE 0x0000000000000008ULL #define IA32_MISC_EN_PERFMON 0x0000000000000080ULL #define IA32_MISC_EN_PEBSU 0x0000000000001000ULL #define IA32_MISC_EN_ESSTE 0x0000000000010000ULL #define IA32_MISC_EN_MONE 0x0000000000040000ULL #define IA32_MISC_EN_LIMCPUID 0x0000000000400000ULL #define IA32_MISC_EN_xTPRD 0x0000000000800000ULL #define IA32_MISC_EN_XDD 0x0000000400000000ULL /* * IA32_SPEC_CTRL and IA32_PRED_CMD MSRs are described in the Intel' * document 336996-001 Speculative Execution Side Channel Mitigations. * * AMD uses the same MSRs and bit definitions, as described in 111006-B * "Indirect Branch Control Extension" and 124441 "Speculative Store Bypass * Disable." */ /* MSR IA32_SPEC_CTRL */ #define IA32_SPEC_CTRL_IBRS 0x00000001 #define IA32_SPEC_CTRL_STIBP 0x00000002 #define IA32_SPEC_CTRL_SSBD 0x00000004 /* MSR IA32_PRED_CMD */ #define IA32_PRED_CMD_IBPB_BARRIER 0x0000000000000001ULL /* MSR IA32_FLUSH_CMD */ #define IA32_FLUSH_CMD_L1D 0x00000001 /* MSR IA32_MCU_OPT_CTRL */ #define IA32_RNGDS_MITG_DIS 0x00000001 /* MSR IA32_HWP_CAPABILITIES */ #define IA32_HWP_CAPABILITIES_HIGHEST_PERFORMANCE(x) (((x) >> 0) & 0xff) #define IA32_HWP_CAPABILITIES_GUARANTEED_PERFORMANCE(x) (((x) >> 8) & 0xff) #define IA32_HWP_CAPABILITIES_EFFICIENT_PERFORMANCE(x) (((x) >> 16) & 0xff) #define IA32_HWP_CAPABILITIES_LOWEST_PERFORMANCE(x) (((x) >> 24) & 0xff) /* MSR IA32_HWP_REQUEST */ #define IA32_HWP_REQUEST_MINIMUM_VALID (1ULL << 63) #define IA32_HWP_REQUEST_MAXIMUM_VALID (1ULL << 62) #define IA32_HWP_REQUEST_DESIRED_VALID (1ULL << 61) #define IA32_HWP_REQUEST_EPP_VALID (1ULL << 60) #define IA32_HWP_REQUEST_ACTIVITY_WINDOW_VALID (1ULL << 59) #define IA32_HWP_REQUEST_PACKAGE_CONTROL (1ULL << 42) #define IA32_HWP_ACTIVITY_WINDOW (0x3ffULL << 32) #define IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE (0xffULL << 24) #define IA32_HWP_DESIRED_PERFORMANCE (0xffULL << 16) #define IA32_HWP_REQUEST_MAXIMUM_PERFORMANCE (0xffULL << 8) #define IA32_HWP_MINIMUM_PERFORMANCE (0xffULL << 0) /* MSR IA32_ENERGY_PERF_BIAS */ #define IA32_ENERGY_PERF_BIAS_POLICY_HINT_MASK (0xfULL << 0) /* * PAT modes. */ #define PAT_UNCACHEABLE 0x00 #define PAT_WRITE_COMBINING 0x01 #define PAT_WRITE_THROUGH 0x04 #define PAT_WRITE_PROTECTED 0x05 #define PAT_WRITE_BACK 0x06 #define PAT_UNCACHED 0x07 #define PAT_VALUE(i, m) ((long long)(m) << (8 * (i))) #define PAT_MASK(i) PAT_VALUE(i, 0xff) /* * Constants related to MTRRs */ #define MTRR_UNCACHEABLE 0x00 #define MTRR_WRITE_COMBINING 0x01 #define MTRR_WRITE_THROUGH 0x04 #define MTRR_WRITE_PROTECTED 0x05 #define MTRR_WRITE_BACK 0x06 #define MTRR_N64K 8 /* numbers of fixed-size entries */ #define MTRR_N16K 16 #define MTRR_N4K 64 #define MTRR_CAP_WC 0x0000000000000400 #define MTRR_CAP_FIXED 0x0000000000000100 #define MTRR_CAP_VCNT 0x00000000000000ff #define MTRR_DEF_ENABLE 0x0000000000000800 #define MTRR_DEF_FIXED_ENABLE 0x0000000000000400 #define MTRR_DEF_TYPE 0x00000000000000ff #define MTRR_PHYSBASE_PHYSBASE 0x000ffffffffff000 #define MTRR_PHYSBASE_TYPE 0x00000000000000ff #define MTRR_PHYSMASK_PHYSMASK 0x000ffffffffff000 #define MTRR_PHYSMASK_VALID 0x0000000000000800 /* * Cyrix configuration registers, accessible as IO ports. */ #define CCR0 0xc0 /* Configuration control register 0 */ #define CCR0_NC0 0x01 /* First 64K of each 1M memory region is non-cacheable */ #define CCR0_NC1 0x02 /* 640K-1M region is non-cacheable */ #define CCR0_A20M 0x04 /* Enables A20M# input pin */ #define CCR0_KEN 0x08 /* Enables KEN# input pin */ #define CCR0_FLUSH 0x10 /* Enables FLUSH# input pin */ #define CCR0_BARB 0x20 /* Flushes internal cache when entering hold state */ #define CCR0_CO 0x40 /* Cache org: 1=direct mapped, 0=2x set assoc */ #define CCR0_SUSPEND 0x80 /* Enables SUSP# and SUSPA# pins */ #define CCR1 0xc1 /* Configuration control register 1 */ #define CCR1_RPL 0x01 /* Enables RPLSET and RPLVAL# pins */ #define CCR1_SMI 0x02 /* Enables SMM pins */ #define CCR1_SMAC 0x04 /* System management memory access */ #define CCR1_MMAC 0x08 /* Main memory access */ #define CCR1_NO_LOCK 0x10 /* Negate LOCK# */ #define CCR1_SM3 0x80 /* SMM address space address region 3 */ #define CCR2 0xc2 #define CCR2_WB 0x02 /* Enables WB cache interface pins */ #define CCR2_SADS 0x02 /* Slow ADS */ #define CCR2_LOCK_NW 0x04 /* LOCK NW Bit */ #define CCR2_SUSP_HLT 0x08 /* Suspend on HALT */ #define CCR2_WT1 0x10 /* WT region 1 */ #define CCR2_WPR1 0x10 /* Write-protect region 1 */ #define CCR2_BARB 0x20 /* Flushes write-back cache when entering hold state. */ #define CCR2_BWRT 0x40 /* Enables burst write cycles */ #define CCR2_USE_SUSP 0x80 /* Enables suspend pins */ #define CCR3 0xc3 #define CCR3_SMILOCK 0x01 /* SMM register lock */ #define CCR3_NMI 0x02 /* Enables NMI during SMM */ #define CCR3_LINBRST 0x04 /* Linear address burst cycles */ #define CCR3_SMMMODE 0x08 /* SMM Mode */ #define CCR3_MAPEN0 0x10 /* Enables Map0 */ #define CCR3_MAPEN1 0x20 /* Enables Map1 */ #define CCR3_MAPEN2 0x40 /* Enables Map2 */ #define CCR3_MAPEN3 0x80 /* Enables Map3 */ #define CCR4 0xe8 #define CCR4_IOMASK 0x07 #define CCR4_MEM 0x08 /* Enables momory bypassing */ #define CCR4_DTE 0x10 /* Enables directory table entry cache */ #define CCR4_FASTFPE 0x20 /* Fast FPU exception */ #define CCR4_CPUID 0x80 /* Enables CPUID instruction */ #define CCR5 0xe9 #define CCR5_WT_ALLOC 0x01 /* Write-through allocate */ #define CCR5_SLOP 0x02 /* LOOP instruction slowed down */ #define CCR5_LBR1 0x10 /* Local bus region 1 */ #define CCR5_ARREN 0x20 /* Enables ARR region */ #define CCR6 0xea #define CCR7 0xeb /* Performance Control Register (5x86 only). */ #define PCR0 0x20 #define PCR0_RSTK 0x01 /* Enables return stack */ #define PCR0_BTB 0x02 /* Enables branch target buffer */ #define PCR0_LOOP 0x04 /* Enables loop */ #define PCR0_AIS 0x08 /* Enables all instrcutions stalled to serialize pipe. */ #define PCR0_MLR 0x10 /* Enables reordering of misaligned loads */ #define PCR0_BTBRT 0x40 /* Enables BTB test register. */ #define PCR0_LSSER 0x80 /* Disable reorder */ /* Device Identification Registers */ #define DIR0 0xfe #define DIR1 0xff /* * Machine Check register constants. */ #define MCG_CAP_COUNT 0x000000ff #define MCG_CAP_CTL_P 0x00000100 #define MCG_CAP_EXT_P 0x00000200 #define MCG_CAP_CMCI_P 0x00000400 #define MCG_CAP_TES_P 0x00000800 #define MCG_CAP_EXT_CNT 0x00ff0000 #define MCG_CAP_SER_P 0x01000000 #define MCG_CAP_EMC_P 0x02000000 #define MCG_CAP_ELOG_P 0x04000000 #define MCG_CAP_LMCE_P 0x08000000 #define MCG_STATUS_RIPV 0x00000001 #define MCG_STATUS_EIPV 0x00000002 #define MCG_STATUS_MCIP 0x00000004 #define MCG_STATUS_LMCS 0x00000008 /* if MCG_CAP_LMCE_P */ #define MCG_CTL_ENABLE 0xffffffffffffffff #define MCG_CTL_DISABLE 0x0000000000000000 #define MSR_MC_CTL(x) (MSR_MC0_CTL + (x) * 4) #define MSR_MC_STATUS(x) (MSR_MC0_STATUS + (x) * 4) #define MSR_MC_ADDR(x) (MSR_MC0_ADDR + (x) * 4) #define MSR_MC_MISC(x) (MSR_MC0_MISC + (x) * 4) #define MSR_MC_CTL2(x) (MSR_MC0_CTL2 + (x)) /* If MCG_CAP_CMCI_P */ #define MC_STATUS_MCA_ERROR 0x000000000000ffff #define MC_STATUS_MODEL_ERROR 0x00000000ffff0000 #define MC_STATUS_OTHER_INFO 0x01ffffff00000000 #define MC_STATUS_COR_COUNT 0x001fffc000000000 /* If MCG_CAP_CMCI_P */ #define MC_STATUS_TES_STATUS 0x0060000000000000 /* If MCG_CAP_TES_P */ #define MC_STATUS_AR 0x0080000000000000 /* If MCG_CAP_TES_P */ #define MC_STATUS_S 0x0100000000000000 /* If MCG_CAP_TES_P */ #define MC_STATUS_PCC 0x0200000000000000 #define MC_STATUS_ADDRV 0x0400000000000000 #define MC_STATUS_MISCV 0x0800000000000000 #define MC_STATUS_EN 0x1000000000000000 #define MC_STATUS_UC 0x2000000000000000 #define MC_STATUS_OVER 0x4000000000000000 #define MC_STATUS_VAL 0x8000000000000000 #define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */ #define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */ #define MC_MISC_PCIE_RID 0x00000000ffff0000 #define MC_MISC_PCIE_FUNC 0x0000000000070000 #define MC_MISC_PCIE_SLOT 0x0000000000f80000 #define MC_MISC_PCIE_BUS 0x00000000ff000000 #define MC_MISC_PCIE_SEG 0x000000ff00000000 #define MC_CTL2_THRESHOLD 0x0000000000007fff #define MC_CTL2_CMCI_EN 0x0000000040000000 #define MC_AMDNB_BANK 4 #define MC_MISC_AMD_VAL 0x8000000000000000 /* Counter presence valid */ #define MC_MISC_AMD_CNTP 0x4000000000000000 /* Counter present */ #define MC_MISC_AMD_LOCK 0x2000000000000000 /* Register locked */ #define MC_MISC_AMD_INTP 0x1000000000000000 /* Int. type can generate interrupts */ #define MC_MISC_AMD_LVT_MASK 0x00f0000000000000 /* Extended LVT offset */ #define MC_MISC_AMD_LVT_SHIFT 52 #define MC_MISC_AMD_CNTEN 0x0008000000000000 /* Counter enabled */ #define MC_MISC_AMD_INT_MASK 0x0006000000000000 /* Interrupt type */ #define MC_MISC_AMD_INT_LVT 0x0002000000000000 /* Interrupt via Extended LVT */ #define MC_MISC_AMD_INT_SMI 0x0004000000000000 /* SMI */ #define MC_MISC_AMD_OVERFLOW 0x0001000000000000 /* Counter overflow */ #define MC_MISC_AMD_CNT_MASK 0x00000fff00000000 /* Counter value */ #define MC_MISC_AMD_CNT_SHIFT 32 #define MC_MISC_AMD_CNT_MAX 0xfff #define MC_MISC_AMD_PTR_MASK 0x00000000ff000000 /* Pointer to additional registers */ #define MC_MISC_AMD_PTR_SHIFT 24 /* AMD Scalable MCA */ #define MSR_SMCA_MC0_CTL 0xc0002000 #define MSR_SMCA_MC0_STATUS 0xc0002001 #define MSR_SMCA_MC0_ADDR 0xc0002002 #define MSR_SMCA_MC0_MISC0 0xc0002003 #define MSR_SMCA_MC_CTL(x) (MSR_SMCA_MC0_CTL + 0x10 * (x)) #define MSR_SMCA_MC_STATUS(x) (MSR_SMCA_MC0_STATUS + 0x10 * (x)) #define MSR_SMCA_MC_ADDR(x) (MSR_SMCA_MC0_ADDR + 0x10 * (x)) #define MSR_SMCA_MC_MISC(x) (MSR_SMCA_MC0_MISC0 + 0x10 * (x)) /* * The following four 3-byte registers control the non-cacheable regions. * These registers must be written as three separate bytes. * * NCRx+0: A31-A24 of starting address * NCRx+1: A23-A16 of starting address * NCRx+2: A15-A12 of starting address | NCR_SIZE_xx. * * The non-cacheable region's starting address must be aligned to the * size indicated by the NCR_SIZE_xx field. */ #define NCR1 0xc4 #define NCR2 0xc7 #define NCR3 0xca #define NCR4 0xcd #define NCR_SIZE_0K 0 #define NCR_SIZE_4K 1 #define NCR_SIZE_8K 2 #define NCR_SIZE_16K 3 #define NCR_SIZE_32K 4 #define NCR_SIZE_64K 5 #define NCR_SIZE_128K 6 #define NCR_SIZE_256K 7 #define NCR_SIZE_512K 8 #define NCR_SIZE_1M 9 #define NCR_SIZE_2M 10 #define NCR_SIZE_4M 11 #define NCR_SIZE_8M 12 #define NCR_SIZE_16M 13 #define NCR_SIZE_32M 14 #define NCR_SIZE_4G 15 /* * The address region registers are used to specify the location and * size for the eight address regions. * * ARRx + 0: A31-A24 of start address * ARRx + 1: A23-A16 of start address * ARRx + 2: A15-A12 of start address | ARR_SIZE_xx */ #define ARR0 0xc4 #define ARR1 0xc7 #define ARR2 0xca #define ARR3 0xcd #define ARR4 0xd0 #define ARR5 0xd3 #define ARR6 0xd6 #define ARR7 0xd9 #define ARR_SIZE_0K 0 #define ARR_SIZE_4K 1 #define ARR_SIZE_8K 2 #define ARR_SIZE_16K 3 #define ARR_SIZE_32K 4 #define ARR_SIZE_64K 5 #define ARR_SIZE_128K 6 #define ARR_SIZE_256K 7 #define ARR_SIZE_512K 8 #define ARR_SIZE_1M 9 #define ARR_SIZE_2M 10 #define ARR_SIZE_4M 11 #define ARR_SIZE_8M 12 #define ARR_SIZE_16M 13 #define ARR_SIZE_32M 14 #define ARR_SIZE_4G 15 /* * The region control registers specify the attributes associated with * the ARRx addres regions. */ #define RCR0 0xdc #define RCR1 0xdd #define RCR2 0xde #define RCR3 0xdf #define RCR4 0xe0 #define RCR5 0xe1 #define RCR6 0xe2 #define RCR7 0xe3 #define RCR_RCD 0x01 /* Disables caching for ARRx (x = 0-6). */ #define RCR_RCE 0x01 /* Enables caching for ARR7. */ #define RCR_WWO 0x02 /* Weak write ordering. */ #define RCR_WL 0x04 /* Weak locking. */ #define RCR_WG 0x08 /* Write gathering. */ #define RCR_WT 0x10 /* Write-through. */ #define RCR_NLB 0x20 /* LBA# pin is not asserted. */ /* AMD Write Allocate Top-Of-Memory and Control Register */ #define AMD_WT_ALLOC_TME 0x40000 /* top-of-memory enable */ #define AMD_WT_ALLOC_PRE 0x20000 /* programmable range enable */ #define AMD_WT_ALLOC_FRE 0x10000 /* fixed (A0000-FFFFF) range enable */ /* AMD64 MSR's */ #define MSR_EFER 0xc0000080 /* extended features */ #define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target/cs/ss */ #define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target rip */ #define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target rip */ #define MSR_SF_MASK 0xc0000084 /* syscall flags mask */ #define MSR_FSBASE 0xc0000100 /* base address of the %fs "segment" */ #define MSR_GSBASE 0xc0000101 /* base address of the %gs "segment" */ #define MSR_KGSBASE 0xc0000102 /* base address of the kernel %gs */ #define MSR_TSC_AUX 0xc0000103 #define MSR_PERFEVSEL0 0xc0010000 #define MSR_PERFEVSEL1 0xc0010001 #define MSR_PERFEVSEL2 0xc0010002 #define MSR_PERFEVSEL3 0xc0010003 #define MSR_K7_PERFCTR0 0xc0010004 #define MSR_K7_PERFCTR1 0xc0010005 #define MSR_K7_PERFCTR2 0xc0010006 #define MSR_K7_PERFCTR3 0xc0010007 #define MSR_SYSCFG 0xc0010010 #define MSR_HWCR 0xc0010015 #define MSR_IORRBASE0 0xc0010016 #define MSR_IORRMASK0 0xc0010017 #define MSR_IORRBASE1 0xc0010018 #define MSR_IORRMASK1 0xc0010019 #define MSR_TOP_MEM 0xc001001a /* boundary for ram below 4G */ #define MSR_TOP_MEM2 0xc001001d /* boundary for ram above 4G */ #define MSR_NB_CFG1 0xc001001f /* NB configuration 1 */ #define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ #define MSR_MC0_CTL_MASK 0xc0010044 #define MSR_AMDK8_IPM 0xc0010055 #define MSR_P_STATE_LIMIT 0xc0010061 /* P-state Current Limit Register */ #define MSR_P_STATE_CONTROL 0xc0010062 /* P-state Control Register */ #define MSR_P_STATE_STATUS 0xc0010063 /* P-state Status Register */ #define MSR_P_STATE_CONFIG(n) (0xc0010064 + (n)) /* P-state Config */ #define MSR_SMM_ADDR 0xc0010112 /* SMM TSEG base address */ #define MSR_SMM_MASK 0xc0010113 /* SMM TSEG address mask */ #define MSR_VM_CR 0xc0010114 /* SVM: feature control */ #define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */ #define MSR_AMD_CPUID07 0xc0011002 /* CPUID 07 %ebx override */ #define MSR_EXTFEATURES 0xc0011005 /* Extended CPUID Features override */ #define MSR_LS_CFG 0xc0011020 #define MSR_IC_CFG 0xc0011021 /* Instruction Cache Configuration */ #define MSR_DE_CFG 0xc0011029 /* Decode Configuration */ /* MSR_AMDK8_IPM */ #define AMDK8_SMIONCMPHALT (1ULL << 27) #define AMDK8_C1EONCMPHALT (1ULL << 28) /* MSR_VM_CR related */ #define VM_CR_SVMDIS 0x10 /* SVM: disabled by BIOS */ /* MSR_DE_CFG */ #define DE_CFG_10H_12H_STACK_POINTER_JUMP_FIX_BIT 0x1 #define DE_CFG_ZEN_LOAD_STALE_DATA_FIX_BIT 0x2000 +#define DE_CFG_ZEN2_FP_BACKUP_FIX_BIT 0x200 /* VIA ACE crypto featureset: for via_feature_rng */ #define VIA_HAS_RNG 1 /* cpu has RNG */ /* VIA ACE crypto featureset: for via_feature_xcrypt */ #define VIA_HAS_AES 1 /* cpu has AES */ #define VIA_HAS_SHA 2 /* cpu has SHA1 & SHA256 */ #define VIA_HAS_MM 4 /* cpu has RSA instructions */ #define VIA_HAS_AESCTR 8 /* cpu has AES-CTR instructions */ /* Centaur Extended Feature flags */ #define VIA_CPUID_HAS_RNG 0x000004 #define VIA_CPUID_DO_RNG 0x000008 #define VIA_CPUID_HAS_ACE 0x000040 #define VIA_CPUID_DO_ACE 0x000080 #define VIA_CPUID_HAS_ACE2 0x000100 #define VIA_CPUID_DO_ACE2 0x000200 #define VIA_CPUID_HAS_PHE 0x000400 #define VIA_CPUID_DO_PHE 0x000800 #define VIA_CPUID_HAS_PMM 0x001000 #define VIA_CPUID_DO_PMM 0x002000 /* VIA ACE xcrypt-* instruction context control options */ #define VIA_CRYPT_CWLO_ROUND_M 0x0000000f #define VIA_CRYPT_CWLO_ALG_M 0x00000070 #define VIA_CRYPT_CWLO_ALG_AES 0x00000000 #define VIA_CRYPT_CWLO_KEYGEN_M 0x00000080 #define VIA_CRYPT_CWLO_KEYGEN_HW 0x00000000 #define VIA_CRYPT_CWLO_KEYGEN_SW 0x00000080 #define VIA_CRYPT_CWLO_NORMAL 0x00000000 #define VIA_CRYPT_CWLO_INTERMEDIATE 0x00000100 #define VIA_CRYPT_CWLO_ENCRYPT 0x00000000 #define VIA_CRYPT_CWLO_DECRYPT 0x00000200 #define VIA_CRYPT_CWLO_KEY128 0x0000000a /* 128bit, 10 rds */ #define VIA_CRYPT_CWLO_KEY192 0x0000040c /* 192bit, 12 rds */ #define VIA_CRYPT_CWLO_KEY256 0x0000080e /* 256bit, 15 rds */ #endif /* !_MACHINE_SPECIALREG_H_ */ diff --git a/sys/x86/include/x86_var.h b/sys/x86/include/x86_var.h index 1629236c5928..44314456d8bf 100644 --- a/sys/x86/include/x86_var.h +++ b/sys/x86/include/x86_var.h @@ -1,181 +1,184 @@ /*- * Copyright (c) 1995 Bruce D. Evans. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _X86_X86_VAR_H_ #define _X86_X86_VAR_H_ /* * Miscellaneous machine-dependent declarations. */ extern long Maxmem; extern u_int basemem; extern u_int cpu_exthigh; extern u_int cpu_feature; extern u_int cpu_feature2; extern u_int amd_feature; extern u_int amd_feature2; extern u_int amd_rascap; extern u_int amd_pminfo; extern u_int amd_extended_feature_extensions; extern u_int via_feature_rng; extern u_int via_feature_xcrypt; extern u_int cpu_clflush_line_size; extern u_int cpu_stdext_feature; extern u_int cpu_stdext_feature2; extern u_int cpu_stdext_feature3; extern uint64_t cpu_ia32_arch_caps; extern u_int cpu_high; extern u_int cpu_id; extern u_int cpu_max_ext_state_size; extern u_int cpu_mxcsr_mask; extern u_int cpu_procinfo; extern u_int cpu_procinfo2; extern char cpu_vendor[]; extern u_int cpu_vendor_id; extern u_int cpu_mon_mwait_flags; extern u_int cpu_mon_min_size; extern u_int cpu_mon_max_size; extern u_int cpu_maxphyaddr; extern u_int cpu_power_eax; extern u_int cpu_power_ebx; extern u_int cpu_power_ecx; extern u_int cpu_power_edx; extern u_int hv_base; extern u_int hv_high; extern char hv_vendor[]; extern char kstack[]; extern char sigcode[]; extern int szsigcode; extern int workaround_erratum383; extern int _udatasel; extern int _ucodesel; extern int _ucode32sel; extern int _ufssel; extern int _ugssel; extern int use_xsave; extern uint64_t xsave_mask; extern u_int max_apic_id; extern int i386_read_exec; extern int pti; extern int hw_ibrs_ibpb_active; extern int hw_mds_disable; extern int hw_ssb_active; extern int x86_taa_enable; extern int cpu_flush_rsb_ctxsw; extern int x86_rngds_mitg_enable; +extern int zenbleed_enable; extern int cpu_amdc1e_bug; extern char bootmethod[16]; struct pcb; struct thread; struct reg; struct fpreg; struct dbreg; struct dumperinfo; struct trapframe; struct minidumpstate; /* * The interface type of the interrupt handler entry point cannot be * expressed in C. Use simplest non-variadic function type as an * approximation. */ typedef void alias_for_inthand_t(void); bool acpi_get_fadt_bootflags(uint16_t *flagsp); void *alloc_fpusave(int flags); u_int cpu_auxmsr(void); vm_paddr_t cpu_getmaxphyaddr(void); bool cpu_mwait_usable(void); void cpu_probe_amdc1e(void); void cpu_setregs(void); int dbreg_set_watchpoint(vm_offset_t addr, vm_size_t size, int access); int dbreg_clr_watchpoint(vm_offset_t addr, vm_size_t size); void dbreg_list_watchpoints(void); void x86_clear_dbregs(struct pcb *pcb); bool disable_wp(void); void restore_wp(bool old_wp); void finishidentcpu(void); void identify_cpu1(void); void identify_cpu2(void); void identify_cpu_ext_features(void); void identify_cpu_fixup_bsp(void); void identify_hypervisor(void); void initializecpu(void); void initializecpucache(void); bool fix_cpuid(void); void fillw(int /*u_short*/ pat, void *base, size_t cnt); int isa_nmi(int cd); void handle_ibrs_entry(void); void handle_ibrs_exit(void); void hw_ibrs_recalculate(bool all_cpus); void hw_mds_recalculate(void); void hw_ssb_recalculate(bool all_cpus); void x86_taa_recalculate(void); void x86_rngds_mitg_recalculate(bool all_cpus); +void zenbleed_sanitize_enable(void); +void zenbleed_check_and_apply(bool all_cpus); void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame); void nmi_call_kdb_smp(u_int type, struct trapframe *frame); void nmi_handle_intr(u_int type, struct trapframe *frame); void pagecopy(void *from, void *to); void printcpuinfo(void); int pti_get_default(void); int user_dbreg_trap(register_t dr6); int cpu_minidumpsys(struct dumperinfo *, const struct minidumpstate *); struct pcb *get_pcb_td(struct thread *td); void x86_set_fork_retval(struct thread *td); uint64_t rdtsc_ordered(void); /* * MSR ops for x86_msr_op() */ #define MSR_OP_ANDNOT 0x00000001 #define MSR_OP_OR 0x00000002 #define MSR_OP_WRITE 0x00000003 #define MSR_OP_READ 0x00000004 /* * Where and which execution mode */ #define MSR_OP_LOCAL 0x10000000 #define MSR_OP_SCHED_ALL 0x20000000 #define MSR_OP_SCHED_ONE 0x30000000 #define MSR_OP_RENDEZVOUS_ALL 0x40000000 #define MSR_OP_RENDEZVOUS_ONE 0x50000000 #define MSR_OP_CPUID(id) ((id) << 8) void x86_msr_op(u_int msr, u_int op, uint64_t arg1, uint64_t *res); #if defined(__i386__) && defined(INVARIANTS) void trap_check_kstack(void); #else #define trap_check_kstack() #endif #endif diff --git a/sys/x86/x86/cpu_machdep.c b/sys/x86/x86/cpu_machdep.c index c80284cf2987..868e6e2bc365 100644 --- a/sys/x86/x86/cpu_machdep.c +++ b/sys/x86/x86/cpu_machdep.c @@ -1,1538 +1,1661 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include #include "opt_acpi.h" #include "opt_atpic.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_platform.h" #include "opt_sched.h" #ifdef __i386__ #include "opt_apic.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef CPU_ELAN #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #define STATE_RUNNING 0x0 #define STATE_MWAIT 0x1 #define STATE_SLEEPING 0x2 #ifdef SMP static u_int cpu_reset_proxyid; static volatile u_int cpu_reset_proxy_active; #endif char bootmethod[16]; SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, "System firmware boot method"); struct msr_op_arg { u_int msr; int op; uint64_t arg1; uint64_t *res; }; static void x86_msr_op_one(void *argp) { struct msr_op_arg *a; uint64_t v; a = argp; switch (a->op) { case MSR_OP_ANDNOT: v = rdmsr(a->msr); v &= ~a->arg1; wrmsr(a->msr, v); break; case MSR_OP_OR: v = rdmsr(a->msr); v |= a->arg1; wrmsr(a->msr, v); break; case MSR_OP_WRITE: wrmsr(a->msr, a->arg1); break; case MSR_OP_READ: v = rdmsr(a->msr); *a->res = v; break; } } #define MSR_OP_EXMODE_MASK 0xf0000000 #define MSR_OP_OP_MASK 0x000000ff #define MSR_OP_GET_CPUID(x) (((x) & ~MSR_OP_EXMODE_MASK) >> 8) void x86_msr_op(u_int msr, u_int op, uint64_t arg1, uint64_t *res) { struct thread *td; struct msr_op_arg a; cpuset_t set; u_int exmode; int bound_cpu, cpu, i, is_bound; a.op = op & MSR_OP_OP_MASK; MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR || a.op == MSR_OP_WRITE || a.op == MSR_OP_READ); exmode = op & MSR_OP_EXMODE_MASK; MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED_ALL || exmode == MSR_OP_SCHED_ONE || exmode == MSR_OP_RENDEZVOUS_ALL || exmode == MSR_OP_RENDEZVOUS_ONE); a.msr = msr; a.arg1 = arg1; a.res = res; switch (exmode) { case MSR_OP_LOCAL: x86_msr_op_one(&a); break; case MSR_OP_SCHED_ALL: td = curthread; thread_lock(td); is_bound = sched_is_bound(td); bound_cpu = td->td_oncpu; CPU_FOREACH(i) { sched_bind(td, i); x86_msr_op_one(&a); } if (is_bound) sched_bind(td, bound_cpu); else sched_unbind(td); thread_unlock(td); break; case MSR_OP_SCHED_ONE: td = curthread; cpu = MSR_OP_GET_CPUID(op); thread_lock(td); is_bound = sched_is_bound(td); bound_cpu = td->td_oncpu; if (!is_bound || bound_cpu != cpu) sched_bind(td, cpu); x86_msr_op_one(&a); if (is_bound) { if (bound_cpu != cpu) sched_bind(td, bound_cpu); } else { sched_unbind(td); } thread_unlock(td); break; case MSR_OP_RENDEZVOUS_ALL: smp_rendezvous(smp_no_rendezvous_barrier, x86_msr_op_one, smp_no_rendezvous_barrier, &a); break; case MSR_OP_RENDEZVOUS_ONE: cpu = MSR_OP_GET_CPUID(op); CPU_SETOF(cpu, &set); smp_rendezvous_cpus(set, smp_no_rendezvous_barrier, x86_msr_op_one, smp_no_rendezvous_barrier, &a); break; } } /* * Automatically initialized per CPU errata in cpu_idle_tun below. */ bool mwait_cpustop_broken = false; SYSCTL_BOOL(_machdep, OID_AUTO, mwait_cpustop_broken, CTLFLAG_RDTUN, &mwait_cpustop_broken, 0, "Can not reliably wake MONITOR/MWAIT cpus without interrupts"); /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* Not applicable */ } void acpi_cpu_c1(void) { __asm __volatile("sti; hlt"); } /* * Use mwait to pause execution while waiting for an interrupt or * another thread to signal that there is more work. * * NOTE: Interrupts will cause a wakeup; however, this function does * not enable interrupt handling. The caller is responsible to enable * interrupts. */ void acpi_cpu_idle_mwait(uint32_t mwait_hint) { int *state; uint64_t v; /* * A comment in Linux patch claims that 'CPUs run faster with * speculation protection disabled. All CPU threads in a core * must disable speculation protection for it to be * disabled. Disable it while we are idle so the other * hyperthread can run fast.' * * XXXKIB. Software coordination mode should be supported, * but all Intel CPUs provide hardware coordination. */ state = &PCPU_PTR(monitorbuf)->idle_state; KASSERT(atomic_load_int(state) == STATE_SLEEPING, ("cpu_mwait_cx: wrong monitorbuf state")); atomic_store_int(state, STATE_MWAIT); if (PCPU_GET(ibpb_set) || hw_ssb_active) { v = rdmsr(MSR_IA32_SPEC_CTRL); wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS | IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD)); } else { v = 0; } cpu_monitor(state, 0, 0); if (atomic_load_int(state) == STATE_MWAIT) cpu_mwait(MWAIT_INTRBREAK, mwait_hint); /* * SSB cannot be disabled while we sleep, or rather, if it was * disabled, the sysctl thread will bind to our cpu to tweak * MSR. */ if (v != 0) wrmsr(MSR_IA32_SPEC_CTRL, v); /* * We should exit on any event that interrupts mwait, because * that event might be a wanted interrupt. */ atomic_store_int(state, STATE_RUNNING); } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { uint64_t tsc1, tsc2; uint64_t acnt, mcnt, perf; register_t reg; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); #ifdef __i386__ if ((cpu_feature & CPUID_TSC) == 0) return (EOPNOTSUPP); #endif /* * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, * DELAY(9) based logic fails. */ if (tsc_is_invariant && !tsc_perf_stat) return (EOPNOTSUPP); #ifdef SMP if (smp_cpus > 1) { /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); } #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); if (tsc_is_invariant) { wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); tsc1 = rdtsc(); DELAY(1000); mcnt = rdmsr(MSR_MPERF); acnt = rdmsr(MSR_APERF); tsc2 = rdtsc(); intr_restore(reg); perf = 1000 * acnt / mcnt; *rate = (tsc2 - tsc1) * perf; } else { tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); *rate = (tsc2 - tsc1) * 1000; } #ifdef SMP if (smp_cpus > 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) halt(); } static void cpu_reset_real(void) { struct region_descriptor null_idt; int b; disable_intr(); #ifdef CPU_ELAN if (elan_mmcr != NULL) elan_mmcr->RESCFG = 1; #endif #ifdef __i386__ if (cpu == CPU_GEODE1100) { /* Attempt Geode's own reset */ outl(0xcf8, 0x80009044ul); outl(0xcfc, 0xf); } #endif #if !defined(BROKEN_KEYBOARD_RESET) /* * Attempt to do a CPU reset via the keyboard controller, * do not turn off GateA20, as any machine that fails * to do the reset here would then end up in no man's land. */ outb(IO_KBD + 4, 0xFE); DELAY(500000); /* wait 0.5 sec to see if that did it */ #endif /* * Attempt to force a reset via the Reset Control register at * I/O port 0xcf9. Bit 2 forces a system reset when it * transitions from 0 to 1. Bit 1 selects the type of reset * to attempt: 0 selects a "soft" reset, and 1 selects a * "hard" reset. We try a "hard" reset. The first write sets * bit 1 to select a "hard" reset and clears bit 2. The * second write forces a 0 -> 1 transition in bit 2 to trigger * a reset. */ outb(0xcf9, 0x2); outb(0xcf9, 0x6); DELAY(500000); /* wait 0.5 sec to see if that did it */ /* * Attempt to force a reset via the Fast A20 and Init register * at I/O port 0x92. Bit 1 serves as an alternate A20 gate. * Bit 0 asserts INIT# when set to 1. We are careful to only * preserve bit 1 while setting bit 0. We also must clear bit * 0 before setting it if it isn't already clear. */ b = inb(0x92); if (b != 0xff) { if ((b & 0x1) != 0) outb(0x92, b & 0xfe); outb(0x92, b | 0x1); DELAY(500000); /* wait 0.5 sec to see if that did it */ } printf("No known reset method worked, attempting CPU shutdown\n"); DELAY(1000000); /* wait 1 sec for printf to complete */ /* Wipe the IDT. */ null_idt.rd_limit = 0; null_idt.rd_base = 0; lidt(&null_idt); /* "good night, sweet prince .... " */ breakpoint(); /* NOTREACHED */ while(1); } #ifdef SMP static void cpu_reset_proxy(void) { cpu_reset_proxy_active = 1; while (cpu_reset_proxy_active == 1) ia32_pause(); /* Wait for other cpu to see that we've started */ printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid); DELAY(1000000); cpu_reset_real(); } #endif void cpu_reset(void) { #ifdef SMP struct monitorbuf *mb; cpuset_t map; u_int cnt; if (smp_started) { map = all_cpus; CPU_CLR(PCPU_GET(cpuid), &map); CPU_ANDNOT(&map, &map, &stopped_cpus); if (!CPU_EMPTY(&map)) { printf("cpu_reset: Stopping other CPUs\n"); stop_cpus(map); } if (PCPU_GET(cpuid) != 0) { cpu_reset_proxyid = PCPU_GET(cpuid); cpustop_restartfunc = cpu_reset_proxy; cpu_reset_proxy_active = 0; printf("cpu_reset: Restarting BSP\n"); /* Restart CPU #0. */ CPU_SETOF(0, &started_cpus); mb = &pcpu_find(0)->pc_monitorbuf; atomic_store_int(&mb->stop_state, MONITOR_STOPSTATE_RUNNING); cnt = 0; while (cpu_reset_proxy_active == 0 && cnt < 10000000) { ia32_pause(); cnt++; /* Wait for BSP to announce restart */ } if (cpu_reset_proxy_active == 0) { printf("cpu_reset: Failed to restart BSP\n"); } else { cpu_reset_proxy_active = 2; while (1) ia32_pause(); /* NOTREACHED */ } } } #endif cpu_reset_real(); /* NOTREACHED */ } bool cpu_mwait_usable(void) { return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags & (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) == (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK))); } void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ int cpu_amdc1e_bug = 0; /* AMD C1E APIC workaround required. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait, 0, "Use MONITOR/MWAIT for short idle"); static bool cpu_idle_enter(int *statep, int newstate) { KASSERT(atomic_load_int(statep) == STATE_RUNNING, ("%s: state %d", __func__, atomic_load_int(statep))); /* * A fence is needed to prevent reordering of the load in * sched_runnable() with this store to the idle state word. Without it, * cpu_idle_wakeup() can observe the state as STATE_RUNNING after having * added load to the queue, and elide an IPI. Then, sched_runnable() * can observe tdq_load == 0, so the CPU ends up idling with pending * work. tdq_notify() similarly ensures that a prior update to tdq_load * is visible before calling cpu_idle_wakeup(). */ atomic_store_int(statep, newstate); #if defined(SCHED_ULE) && defined(SMP) atomic_thread_fence_seq_cst(); #endif /* * Since we may be in a critical section from cpu_idle(), if * an interrupt fires during that critical section we may have * a pending preemption. If the CPU halts, then that thread * may not execute until a later interrupt awakens the CPU. * To handle this race, check for a runnable thread after * disabling interrupts and immediately return if one is * found. Also, we must absolutely guarentee that hlt is * the next instruction after sti. This ensures that any * interrupt that fires after the call to disable_intr() will * immediately awaken the CPU from hlt. Finally, please note * that on x86 this works fine because of interrupts enabled only * after the instruction following sti takes place, while IF is set * to 1 immediately, allowing hlt instruction to acknowledge the * interrupt. */ disable_intr(); if (sched_runnable()) { enable_intr(); atomic_store_int(statep, STATE_RUNNING); return (false); } else { return (true); } } static void cpu_idle_exit(int *statep) { atomic_store_int(statep, STATE_RUNNING); } static void cpu_idle_acpi(sbintime_t sbt) { int *state; state = &PCPU_PTR(monitorbuf)->idle_state; if (cpu_idle_enter(state, STATE_SLEEPING)) { if (cpu_idle_hook) cpu_idle_hook(sbt); else acpi_cpu_c1(); cpu_idle_exit(state); } } static void cpu_idle_hlt(sbintime_t sbt) { int *state; state = &PCPU_PTR(monitorbuf)->idle_state; if (cpu_idle_enter(state, STATE_SLEEPING)) { acpi_cpu_c1(); atomic_store_int(state, STATE_RUNNING); } } static void cpu_idle_mwait(sbintime_t sbt) { int *state; state = &PCPU_PTR(monitorbuf)->idle_state; if (cpu_idle_enter(state, STATE_MWAIT)) { cpu_monitor(state, 0, 0); if (atomic_load_int(state) == STATE_MWAIT) __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); else enable_intr(); cpu_idle_exit(state); } } static void cpu_idle_spin(sbintime_t sbt) { int *state; int i; state = &PCPU_PTR(monitorbuf)->idle_state; atomic_store_int(state, STATE_RUNNING); /* * The sched_runnable() call is racy but as long as there is * a loop missing it one time will have just a little impact if any * (and it is much better than missing the check at all). */ for (i = 0; i < 1000; i++) { if (sched_runnable()) return; cpu_spinwait(); } } void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; void cpu_idle(int busy) { uint64_t msr; sbintime_t sbt = -1; CTR1(KTR_SPARE2, "cpu_idle(%d)", busy); /* If we are busy - try to use fast methods. */ if (busy) { if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { cpu_idle_mwait(busy); goto out; } } /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); sbt = cpu_idleclock(); } /* Apply AMD APIC timer C1E workaround. */ if (cpu_amdc1e_bug && cpu_disable_c3_sleep) { msr = rdmsr(MSR_AMDK8_IPM); if ((msr & (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)) != 0) wrmsr(MSR_AMDK8_IPM, msr & ~(AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)); } /* Call main idle method. */ cpu_idle_fn(sbt); /* Switch timers back into active mode. */ if (!busy) { cpu_activeclock(); critical_exit(); } out: CTR1(KTR_SPARE2, "cpu_idle(%d) done", busy); } static int cpu_idle_apl31_workaround; SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW, &cpu_idle_apl31_workaround, 0, "Apollo Lake APL31 MWAIT bug workaround"); int cpu_idle_wakeup(int cpu) { struct monitorbuf *mb; int *state; mb = &pcpu_find(cpu)->pc_monitorbuf; state = &mb->idle_state; switch (atomic_load_int(state)) { case STATE_SLEEPING: return (0); case STATE_MWAIT: atomic_store_int(state, STATE_RUNNING); return (cpu_idle_apl31_workaround ? 0 : 1); case STATE_RUNNING: return (1); default: panic("bad monitor state"); return (1); } } /* * Ordered by speed/power consumption. */ static const struct { void *id_fn; const char *id_name; int id_cpuid2_flag; } idle_tbl[] = { { .id_fn = cpu_idle_spin, .id_name = "spin" }, { .id_fn = cpu_idle_mwait, .id_name = "mwait", .id_cpuid2_flag = CPUID2_MON }, { .id_fn = cpu_idle_hlt, .id_name = "hlt" }, { .id_fn = cpu_idle_acpi, .id_name = "acpi" }, }; static int idle_sysctl_available(SYSCTL_HANDLER_ARGS) { char *avail, *p; int error; int i; avail = malloc(256, M_TEMP, M_WAITOK); p = avail; for (i = 0; i < nitems(idle_tbl); i++) { if (idle_tbl[i].id_cpuid2_flag != 0 && (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0) continue; if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; p += sprintf(p, "%s%s", p != avail ? ", " : "", idle_tbl[i].id_name); } error = sysctl_handle_string(oidp, avail, 0, req); free(avail, M_TEMP); return (error); } SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, idle_sysctl_available, "A", "list of available idle functions"); static bool cpu_idle_selector(const char *new_idle_name) { int i; for (i = 0; i < nitems(idle_tbl); i++) { if (idle_tbl[i].id_cpuid2_flag != 0 && (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0) continue; if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; if (strcmp(idle_tbl[i].id_name, new_idle_name)) continue; cpu_idle_fn = idle_tbl[i].id_fn; if (bootverbose) printf("CPU idle set to %s\n", idle_tbl[i].id_name); return (true); } return (false); } static int cpu_idle_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; const char *p; int error, i; p = "unknown"; for (i = 0; i < nitems(idle_tbl); i++) { if (idle_tbl[i].id_fn == cpu_idle_fn) { p = idle_tbl[i].id_name; break; } } strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); return (cpu_idle_selector(buf) ? 0 : EINVAL); } SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, cpu_idle_sysctl, "A", "currently selected idle function"); static void cpu_idle_tun(void *unused __unused) { char tunvar[16]; if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar))) cpu_idle_selector(tunvar); else if (cpu_vendor_id == CPU_VENDOR_AMD && CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) { /* Ryzen erratas 1057, 1109. */ cpu_idle_selector("hlt"); idle_mwait = 0; mwait_cpustop_broken = true; } if (cpu_vendor_id == CPU_VENDOR_INTEL && CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x5c) { /* * Apollo Lake errata APL31 (public errata APL30). * Stores to the armed address range may not trigger * MWAIT to resume execution. OS needs to use * interrupts to wake processors from MWAIT-induced * sleep states. */ cpu_idle_apl31_workaround = 1; mwait_cpustop_broken = true; } TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround); } SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL); static int panic_on_nmi = 0xff; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN, &panic_on_nmi, 0, "Panic on NMI: 1 = H/W failure; 2 = unknown; 0xff = all"); int nmi_is_broadcast = 1; SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN, &nmi_is_broadcast, 0, "Chipset NMI is broadcast"); int (*apei_nmi)(void); void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame) { bool claimed = false; #ifdef DEV_ISA /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(frame->tf_err)) { claimed = true; if ((panic_on_nmi & 1) != 0) panic("NMI indicates hardware failure"); } #endif /* DEV_ISA */ /* ACPI Platform Error Interfaces callback. */ if (apei_nmi != NULL && (*apei_nmi)()) claimed = true; /* * NMIs can be useful for debugging. They can be hooked up to a * pushbutton, usually on an ISA, PCI, or PCIe card. They can also be * generated by an IPMI BMC, either manually or in response to a * watchdog timeout. For example, see the "power diag" command in * ports/sysutils/ipmitool. They can also be generated by a * hypervisor; see "bhyvectl --inject-nmi". */ #ifdef KDB if (!claimed && (panic_on_nmi & 2) != 0) { if (debugger_on_panic) { printf("NMI/cpu%d ... going to debugger\n", cpu); claimed = kdb_trap(type, 0, frame); } } #endif /* KDB */ if (!claimed && panic_on_nmi != 0) panic("NMI"); } void nmi_handle_intr(u_int type, struct trapframe *frame) { #ifdef SMP if (nmi_is_broadcast) { nmi_call_kdb_smp(type, frame); return; } #endif nmi_call_kdb(PCPU_GET(cpuid), type, frame); } static int hw_ibrs_active; int hw_ibrs_ibpb_active; int hw_ibrs_disable = 1; SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0, "Indirect Branch Restricted Speculation active"); SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ibrs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Indirect Branch Restricted Speculation active"); SYSCTL_INT(_machdep_mitigations_ibrs, OID_AUTO, active, CTLFLAG_RD, &hw_ibrs_active, 0, "Indirect Branch Restricted Speculation active"); void hw_ibrs_recalculate(bool for_all_cpus) { if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) { x86_msr_op(MSR_IA32_SPEC_CTRL, (for_all_cpus ? MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL) | (hw_ibrs_disable != 0 ? MSR_OP_ANDNOT : MSR_OP_OR), IA32_SPEC_CTRL_IBRS, NULL); hw_ibrs_active = hw_ibrs_disable == 0; hw_ibrs_ibpb_active = 0; } else { hw_ibrs_active = hw_ibrs_ibpb_active = (cpu_stdext_feature3 & CPUID_STDEXT3_IBPB) != 0 && !hw_ibrs_disable; } } static int hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = hw_ibrs_disable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); hw_ibrs_disable = val != 0; hw_ibrs_recalculate(true); return (0); } SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I", "Disable Indirect Branch Restricted Speculation"); SYSCTL_PROC(_machdep_mitigations_ibrs, OID_AUTO, disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I", "Disable Indirect Branch Restricted Speculation"); int hw_ssb_active; int hw_ssb_disable; SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD, &hw_ssb_active, 0, "Speculative Store Bypass Disable active"); SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ssb, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Speculative Store Bypass Disable active"); SYSCTL_INT(_machdep_mitigations_ssb, OID_AUTO, active, CTLFLAG_RD, &hw_ssb_active, 0, "Speculative Store Bypass Disable active"); static void hw_ssb_set(bool enable, bool for_all_cpus) { if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) { hw_ssb_active = 0; return; } hw_ssb_active = enable; x86_msr_op(MSR_IA32_SPEC_CTRL, (enable ? MSR_OP_OR : MSR_OP_ANDNOT) | (for_all_cpus ? MSR_OP_SCHED_ALL : MSR_OP_LOCAL), IA32_SPEC_CTRL_SSBD, NULL); } void hw_ssb_recalculate(bool all_cpus) { switch (hw_ssb_disable) { default: hw_ssb_disable = 0; /* FALLTHROUGH */ case 0: /* off */ hw_ssb_set(false, all_cpus); break; case 1: /* on */ hw_ssb_set(true, all_cpus); break; case 2: /* auto */ hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ? false : true, all_cpus); break; } } static int hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = hw_ssb_disable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); hw_ssb_disable = val; hw_ssb_recalculate(true); return (0); } SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ssb_disable_handler, "I", "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)"); SYSCTL_PROC(_machdep_mitigations_ssb, OID_AUTO, disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ssb_disable_handler, "I", "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)"); int hw_mds_disable; /* * Handler for Microarchitectural Data Sampling issues. Really not a * pointer to C function: on amd64 the code must not change any CPU * architectural state except possibly %rflags. Also, it is always * called with interrupts disabled. */ void mds_handler_void(void); void mds_handler_verw(void); void mds_handler_ivb(void); void mds_handler_bdw(void); void mds_handler_skl_sse(void); void mds_handler_skl_avx(void); void mds_handler_skl_avx512(void); void mds_handler_silvermont(void); void (*mds_handler)(void) = mds_handler_void; static int sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS) { const char *state; if (mds_handler == mds_handler_void) state = "inactive"; else if (mds_handler == mds_handler_verw) state = "VERW"; else if (mds_handler == mds_handler_ivb) state = "software IvyBridge"; else if (mds_handler == mds_handler_bdw) state = "software Broadwell"; else if (mds_handler == mds_handler_skl_sse) state = "software Skylake SSE"; else if (mds_handler == mds_handler_skl_avx) state = "software Skylake AVX"; else if (mds_handler == mds_handler_skl_avx512) state = "software Skylake AVX512"; else if (mds_handler == mds_handler_silvermont) state = "software Silvermont"; else state = "unknown"; return (SYSCTL_OUT(req, state, strlen(state))); } SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_mds_disable_state_handler, "A", "Microarchitectural Data Sampling Mitigation state"); SYSCTL_NODE(_machdep_mitigations, OID_AUTO, mds, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Microarchitectural Data Sampling Mitigation state"); SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, state, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_mds_disable_state_handler, "A", "Microarchitectural Data Sampling Mitigation state"); _Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512"); void hw_mds_recalculate(void) { struct pcpu *pc; vm_offset_t b64; u_long xcr0; int i; /* * Allow user to force VERW variant even if MD_CLEAR is not * reported. For instance, hypervisor might unknowingly * filter the cap out. * For the similar reasons, and for testing, allow to enable * mitigation even when MDS_NO cap is set. */ if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 || ((cpu_ia32_arch_caps & IA32_ARCH_CAP_MDS_NO) != 0 && hw_mds_disable == 3)) { mds_handler = mds_handler_void; } else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 && hw_mds_disable == 3) || hw_mds_disable == 1) { mds_handler = mds_handler_verw; } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e || CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a || CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 || CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d || CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e || CPUID_TO_MODEL(cpu_id) == 0x3a) && (hw_mds_disable == 2 || hw_mds_disable == 3)) { /* * Nehalem, SandyBridge, IvyBridge */ CPU_FOREACH(i) { pc = pcpu_find(i); if (pc->pc_mds_buf == NULL) { pc->pc_mds_buf = malloc_domainset(672, M_TEMP, DOMAINSET_PREF(pc->pc_domain), M_WAITOK); bzero(pc->pc_mds_buf, 16); } } mds_handler = mds_handler_ivb; } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c || CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 || CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f || CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) && (hw_mds_disable == 2 || hw_mds_disable == 3)) { /* * Haswell, Broadwell */ CPU_FOREACH(i) { pc = pcpu_find(i); if (pc->pc_mds_buf == NULL) { pc->pc_mds_buf = malloc_domainset(1536, M_TEMP, DOMAINSET_PREF(pc->pc_domain), M_WAITOK); bzero(pc->pc_mds_buf, 16); } } mds_handler = mds_handler_bdw; } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id & CPUID_STEPPING) <= 5) || CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e || (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id & CPUID_STEPPING) <= 0xb) || (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id & CPUID_STEPPING) <= 0xc)) && (hw_mds_disable == 2 || hw_mds_disable == 3)) { /* * Skylake, KabyLake, CoffeeLake, WhiskeyLake, * CascadeLake */ CPU_FOREACH(i) { pc = pcpu_find(i); if (pc->pc_mds_buf == NULL) { pc->pc_mds_buf = malloc_domainset(6 * 1024, M_TEMP, DOMAINSET_PREF(pc->pc_domain), M_WAITOK); b64 = (vm_offset_t)malloc_domainset(64 + 63, M_TEMP, DOMAINSET_PREF(pc->pc_domain), M_WAITOK); pc->pc_mds_buf64 = (void *)roundup2(b64, 64); bzero(pc->pc_mds_buf64, 64); } } xcr0 = rxcr(0); if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 && (cpu_stdext_feature & CPUID_STDEXT_AVX512DQ) != 0) mds_handler = mds_handler_skl_avx512; else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 && (cpu_feature2 & CPUID2_AVX) != 0) mds_handler = mds_handler_skl_avx; else mds_handler = mds_handler_skl_sse; } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && ((CPUID_TO_MODEL(cpu_id) == 0x37 || CPUID_TO_MODEL(cpu_id) == 0x4a || CPUID_TO_MODEL(cpu_id) == 0x4c || CPUID_TO_MODEL(cpu_id) == 0x4d || CPUID_TO_MODEL(cpu_id) == 0x5a || CPUID_TO_MODEL(cpu_id) == 0x5d || CPUID_TO_MODEL(cpu_id) == 0x6e || CPUID_TO_MODEL(cpu_id) == 0x65 || CPUID_TO_MODEL(cpu_id) == 0x75 || CPUID_TO_MODEL(cpu_id) == 0x1c || CPUID_TO_MODEL(cpu_id) == 0x26 || CPUID_TO_MODEL(cpu_id) == 0x27 || CPUID_TO_MODEL(cpu_id) == 0x35 || CPUID_TO_MODEL(cpu_id) == 0x36 || CPUID_TO_MODEL(cpu_id) == 0x7a))) { /* Silvermont, Airmont */ CPU_FOREACH(i) { pc = pcpu_find(i); if (pc->pc_mds_buf == NULL) pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK); } mds_handler = mds_handler_silvermont; } else { hw_mds_disable = 0; mds_handler = mds_handler_void; } } static void hw_mds_recalculate_boot(void *arg __unused) { hw_mds_recalculate(); } SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL); static int sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = hw_mds_disable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val < 0 || val > 3) return (EINVAL); hw_mds_disable = val; hw_mds_recalculate(); return (0); } SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, sysctl_mds_disable_handler, "I", "Microarchitectural Data Sampling Mitigation " "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)"); SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, sysctl_mds_disable_handler, "I", "Microarchitectural Data Sampling Mitigation " "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)"); /* * Intel Transactional Memory Asynchronous Abort Mitigation * CVE-2019-11135 */ int x86_taa_enable; int x86_taa_state; enum { TAA_NONE = 0, /* No mitigation enabled */ TAA_TSX_DISABLE = 1, /* Disable TSX via MSR */ TAA_VERW = 2, /* Use VERW mitigation */ TAA_AUTO = 3, /* Automatically select the mitigation */ /* The states below are not selectable by the operator */ TAA_TAA_UC = 4, /* Mitigation present in microcode */ TAA_NOT_PRESENT = 5 /* TSX is not present */ }; static void taa_set(bool enable, bool all) { x86_msr_op(MSR_IA32_TSX_CTRL, (enable ? MSR_OP_OR : MSR_OP_ANDNOT) | (all ? MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL), IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR, NULL); } void x86_taa_recalculate(void) { static int taa_saved_mds_disable = 0; int taa_need = 0, taa_state = 0; int mds_disable = 0, need_mds_recalc = 0; /* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */ if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 || (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) { /* TSX is not present */ x86_taa_state = TAA_NOT_PRESENT; return; } /* Check to see what mitigation options the CPU gives us */ if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) { /* CPU is not suseptible to TAA */ taa_need = TAA_TAA_UC; } else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) { /* * CPU can turn off TSX. This is the next best option * if TAA_NO hardware mitigation isn't present */ taa_need = TAA_TSX_DISABLE; } else { /* No TSX/TAA specific remedies are available. */ if (x86_taa_enable == TAA_TSX_DISABLE) { if (bootverbose) printf("TSX control not available\n"); return; } else taa_need = TAA_VERW; } /* Can we automatically take action, or are we being forced? */ if (x86_taa_enable == TAA_AUTO) taa_state = taa_need; else taa_state = x86_taa_enable; /* No state change, nothing to do */ if (taa_state == x86_taa_state) { if (bootverbose) printf("No TSX change made\n"); return; } /* Does the MSR need to be turned on or off? */ if (taa_state == TAA_TSX_DISABLE) taa_set(true, true); else if (x86_taa_state == TAA_TSX_DISABLE) taa_set(false, true); /* Does MDS need to be set to turn on VERW? */ if (taa_state == TAA_VERW) { taa_saved_mds_disable = hw_mds_disable; mds_disable = hw_mds_disable = 1; need_mds_recalc = 1; } else if (x86_taa_state == TAA_VERW) { mds_disable = hw_mds_disable = taa_saved_mds_disable; need_mds_recalc = 1; } if (need_mds_recalc) { hw_mds_recalculate(); if (mds_disable != hw_mds_disable) { if (bootverbose) printf("Cannot change MDS state for TAA\n"); /* Don't update our state */ return; } } x86_taa_state = taa_state; return; } static void taa_recalculate_boot(void * arg __unused) { x86_taa_recalculate(); } SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL); SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TSX Asynchronous Abort Mitigation"); static int sysctl_taa_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = x86_taa_enable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val < TAA_NONE || val > TAA_AUTO) return (EINVAL); x86_taa_enable = val; x86_taa_recalculate(); return (0); } SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, sysctl_taa_handler, "I", "TAA Mitigation enablement control " "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO)"); static int sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS) { const char *state; switch (x86_taa_state) { case TAA_NONE: state = "inactive"; break; case TAA_TSX_DISABLE: state = "TSX disabled"; break; case TAA_VERW: state = "VERW"; break; case TAA_TAA_UC: state = "Mitigated in microcode"; break; case TAA_NOT_PRESENT: state = "TSX not present"; break; default: state = "unknown"; } return (SYSCTL_OUT(req, state, strlen(state))); } SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_taa_state_handler, "A", "TAA Mitigation state"); int __read_frequently cpu_flush_rsb_ctxsw; SYSCTL_INT(_machdep_mitigations, OID_AUTO, flush_rsb_ctxsw, CTLFLAG_RW | CTLFLAG_NOFETCH, &cpu_flush_rsb_ctxsw, 0, "Flush Return Stack Buffer on context switch"); SYSCTL_NODE(_machdep_mitigations, OID_AUTO, rngds, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "MCU Optimization, disable RDSEED mitigation"); int x86_rngds_mitg_enable = 1; void x86_rngds_mitg_recalculate(bool all_cpus) { if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0) return; x86_msr_op(MSR_IA32_MCU_OPT_CTRL, (x86_rngds_mitg_enable ? MSR_OP_OR : MSR_OP_ANDNOT) | (all_cpus ? MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL), IA32_RNGDS_MITG_DIS, NULL); } static int sysctl_rngds_mitg_enable_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = x86_rngds_mitg_enable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); x86_rngds_mitg_enable = val; x86_rngds_mitg_recalculate(true); return (0); } SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, sysctl_rngds_mitg_enable_handler, "I", "MCU Optimization, disabling RDSEED mitigation control " "(0 - mitigation disabled (RDSEED optimized), 1 - mitigation enabled)"); static int sysctl_rngds_state_handler(SYSCTL_HANDLER_ARGS) { const char *state; if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0) { state = "Not applicable"; } else if (x86_rngds_mitg_enable == 0) { state = "RDSEED not serialized"; } else { state = "Mitigated"; } return (SYSCTL_OUT(req, state, strlen(state))); } SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, state, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rngds_state_handler, "A", "MCU Optimization state"); + +/* + * Zenbleed. + * + * No corresponding errata is publicly listed. AMD has issued a security + * bulletin (AMD-SB-7008), entitled "Cross-Process Information Leak". This + * document lists (as of August 2023) platform firmware's availability target + * dates, with most being November/December 2023. It will then be up to + * motherboard manufacturers to produce corresponding BIOS updates, which will + * happen with an inevitable lag. Additionally, for a variety of reasons, + * operators might not be able to apply them everywhere due. On the side of + * standalone CPU microcodes, no plans for availability have been published so + * far. However, a developer appearing to be an AMD employee has hardcoded in + * Linux revision numbers of future microcodes that are presumed to fix the + * vulnerability. + * + * Given the stability issues encountered with early microcode releases for Rome + * (the only microcode publicly released so far) and the absence of official + * communication on standalone CPU microcodes, we have opted instead for + * matching by default all AMD Zen2 processors which, according to the + * vulnerability's discoverer, are all affected (see + * https://lock.cmpxchg8b.com/zenbleed.html). This policy, also adopted by + * OpenBSD, may be overriden using the tunable/sysctl + * 'machdep.mitigations.zenbleed.enable'. We might revise it later depending on + * official statements, microcode updates' public availability and community + * assessment that they actually fix the vulnerability without any instability + * side effects. + */ + +SYSCTL_NODE(_machdep_mitigations, OID_AUTO, zenbleed, + CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "Zenbleed OS-triggered prevention (via chicken bit)"); + +/* 2 is auto, see below. */ +int zenbleed_enable = 2; + +void +zenbleed_sanitize_enable(void) +{ + /* Default to auto (2). */ + if (zenbleed_enable < 0 || zenbleed_enable > 2) + zenbleed_enable = 2; +} + +static bool +zenbleed_chicken_bit_applicable(void) +{ + /* Concerns only bare-metal AMD Zen2 processors. */ + return (cpu_vendor_id == CPU_VENDOR_AMD && + CPUID_TO_FAMILY(cpu_id) == 0x17 && + CPUID_TO_MODEL(cpu_id) >= 0x30 && + vm_guest == VM_GUEST_NO); +} + +static bool +zenbleed_chicken_bit_should_enable(void) +{ + /* + * Obey tunable/sysctl. + * + * As explained above, currently, the automatic setting (2) and the "on" + * one (1) have the same effect. In the future, we might additionally + * check for specific microcode revisions as part of the automatic + * determination. + */ + return (zenbleed_enable != 0); +} + +void +zenbleed_check_and_apply(bool all_cpus) +{ + bool set; + + if (!zenbleed_chicken_bit_applicable()) + return; + + set = zenbleed_chicken_bit_should_enable(); + + x86_msr_op(MSR_DE_CFG, + (set ? MSR_OP_OR : MSR_OP_ANDNOT) | + (all_cpus ? MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL), + DE_CFG_ZEN2_FP_BACKUP_FIX_BIT, NULL); +} + +static int +sysctl_zenbleed_enable_handler(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = zenbleed_enable; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + zenbleed_enable = val; + zenbleed_sanitize_enable(); + zenbleed_check_and_apply(true); + return (0); +} +SYSCTL_PROC(_machdep_mitigations_zenbleed, OID_AUTO, enable, CTLTYPE_INT | + CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, + sysctl_zenbleed_enable_handler, "I", + "Enable Zenbleed OS-triggered mitigation (chicken bit) " + "(0: Force disable, 1: Force enable, 2: Automatic determination)"); + +static int +sysctl_zenbleed_state_handler(SYSCTL_HANDLER_ARGS) +{ + const char *state; + + if (!zenbleed_chicken_bit_applicable()) + state = "Not applicable"; + else if (zenbleed_chicken_bit_should_enable()) + state = "Mitigation enabled"; + else + state = "Mitigation disabled"; + return (SYSCTL_OUT(req, state, strlen(state))); +} +SYSCTL_PROC(_machdep_mitigations_zenbleed, OID_AUTO, state, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + sysctl_zenbleed_state_handler, "A", + "Zenbleed OS-triggered mitigation (chicken bit) state"); + + /* * Enable and restore kernel text write permissions. * Callers must ensure that disable_wp()/restore_wp() are executed * without rescheduling on the same core. */ bool disable_wp(void) { u_int cr0; cr0 = rcr0(); if ((cr0 & CR0_WP) == 0) return (false); load_cr0(cr0 & ~CR0_WP); return (true); } void restore_wp(bool old_wp) { if (old_wp) load_cr0(rcr0() | CR0_WP); } bool acpi_get_fadt_bootflags(uint16_t *flagsp) { #ifdef DEV_ACPI ACPI_TABLE_FADT *fadt; vm_paddr_t physaddr; physaddr = acpi_find_table(ACPI_SIG_FADT); if (physaddr == 0) return (false); fadt = acpi_map_table(physaddr, ACPI_SIG_FADT); if (fadt == NULL) return (false); *flagsp = fadt->BootFlags; acpi_unmap_table(fadt); return (true); #else return (false); #endif } DEFINE_IFUNC(, uint64_t, rdtsc_ordered, (void)) { bool cpu_is_amd = cpu_vendor_id == CPU_VENDOR_AMD || cpu_vendor_id == CPU_VENDOR_HYGON; if ((amd_feature & AMDID_RDTSCP) != 0) return (rdtscp); else if ((cpu_feature & CPUID_SSE2) != 0) return (cpu_is_amd ? rdtsc_ordered_mfence : rdtsc_ordered_lfence); else return (rdtsc); }