diff --git a/sys/x86/include/mca.h b/sys/x86/include/mca.h
index 183480625f6d..d8b8da86523d 100644
--- a/sys/x86/include/mca.h
+++ b/sys/x86/include/mca.h
@@ -1,56 +1,61 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2009 Hudson River Trading LLC
  * Written by: John H. Baldwin
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef __X86_MCA_H__
 #define __X86_MCA_H__
 
 struct mca_record {
 	uint64_t	mr_status;
 	uint64_t	mr_addr;
 	uint64_t	mr_misc;
 	uint64_t	mr_tsc;
 	int		mr_apic_id;
 	int		mr_bank;
 	uint64_t	mr_mcg_cap;
 	uint64_t	mr_mcg_status;
 	int		mr_cpu_id;
 	int		mr_cpu_vendor_id;
 	int		mr_cpu;
 };
 
 #ifdef _KERNEL
+typedef void (*mca_decode_func)(void *, const struct mca_record *);
+
+int mca_register_decode_func(mca_decode_func func, void *aux);
+int mca_deregister_decode_func(mca_decode_func func);
+
 void	cmc_intr(void);
 void	mca_init(void);
 void	mca_intr(void);
 void	mca_resume(void);
 #endif
 
 #endif /* !__X86_MCA_H__ */
diff --git a/sys/x86/x86/mca.c b/sys/x86/x86/mca.c
index b293fcedbd84..89ab59639882 100644
--- a/sys/x86/x86/mca.c
+++ b/sys/x86/x86/mca.c
@@ -1,1561 +1,1609 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2009 Hudson River Trading LLC
  * Written by: John H. Baldwin
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for x86 machine check architecture. */ #include #ifdef __amd64__ #define DEV_APIC #else #include "opt_apic.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Modes for mca_scan() */ enum scan_mode { POLLED, MCE, CMCI, }; #ifdef DEV_APIC /* * State maintained for each monitored MCx bank to control the * corrected machine check interrupt threshold. */ struct cmc_state { int max_threshold; time_t last_intr; }; struct amd_et_state { int cur_threshold; time_t last_intr; }; #endif struct mca_internal { struct mca_record rec; STAILQ_ENTRY(mca_internal) link; }; struct mca_enumerator_ops { unsigned int (*ctl)(int); unsigned int (*status)(int); unsigned int (*addr)(int); unsigned int (*misc)(int); }; static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture"); static volatile int mca_count; /* Number of records stored. */ static int mca_banks; /* Number of per-CPU register banks. */ static int mca_maxcount = -1; /* Limit on records stored. (-1 = unlimited) */ +static mca_decode_func mca_func; +static void *mca_decode_arg; static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Machine Check Architecture"); static int mca_enabled = 1; SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0, "Administrative toggle for machine check support"); static int log_corrected = 1; SYSCTL_INT(_hw_mca, OID_AUTO, log_corrected, CTLFLAG_RWTUN, &log_corrected, 0, "Log corrected errors to the console"); static int amd10h_L1TP = 1; SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0, "Administrative toggle for logging of level one TLB parity (L1TP) errors"); static int intel6h_HSD131; SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0, "Administrative toggle for logging of spurious corrected errors"); int workaround_erratum383; SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN, &workaround_erratum383, 0, "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?"); static STAILQ_HEAD(, mca_internal) mca_freelist; static int mca_freecount; static STAILQ_HEAD(, mca_internal) mca_records; static STAILQ_HEAD(, mca_internal) mca_pending; static int mca_ticks = 300; static struct taskqueue *mca_tq; static struct task mca_resize_task; static struct timeout_task mca_scan_task; static struct mtx mca_lock; static unsigned int mca_ia32_ctl_reg(int bank) { return (MSR_MC_CTL(bank)); } static unsigned int mca_ia32_status_reg(int bank) { return (MSR_MC_STATUS(bank)); } static unsigned int mca_ia32_addr_reg(int bank) { return (MSR_MC_ADDR(bank)); } static unsigned int mca_ia32_misc_reg(int bank) { return (MSR_MC_MISC(bank)); } static unsigned int mca_smca_ctl_reg(int bank) { return (MSR_SMCA_MC_CTL(bank)); } static unsigned int mca_smca_status_reg(int bank) { return (MSR_SMCA_MC_STATUS(bank)); } static unsigned int mca_smca_addr_reg(int bank) { return 
(MSR_SMCA_MC_ADDR(bank)); } static unsigned int mca_smca_misc_reg(int bank) { return (MSR_SMCA_MC_MISC(bank)); } static struct mca_enumerator_ops mca_msr_ops = { .ctl = mca_ia32_ctl_reg, .status = mca_ia32_status_reg, .addr = mca_ia32_addr_reg, .misc = mca_ia32_misc_reg }; #ifdef DEV_APIC static struct cmc_state **cmc_state; /* Indexed by cpuid, bank. */ static struct amd_et_state **amd_et_state; /* Indexed by cpuid, bank. */ static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */ static int amd_elvt = -1; static inline bool amd_thresholding_supported(void) { if (cpu_vendor_id != CPU_VENDOR_AMD && cpu_vendor_id != CPU_VENDOR_HYGON) return (false); /* * The RASCap register is wholly reserved in families 0x10-0x15 (through model 1F). * * It begins to be documented in family 0x15 model 30 and family 0x16, * but neither of these families documents the ScalableMca bit, which * supposedly defines the presence of this feature on family 0x17. */ if (CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16) return (true); if (CPUID_TO_FAMILY(cpu_id) >= 0x17) return ((amd_rascap & AMDRAS_SCALABLE_MCA) != 0); return (false); } #endif static inline bool cmci_supported(uint64_t mcg_cap) { /* * MCG_CAP_CMCI_P bit is reserved in AMD documentation. Until * it is defined, do not use it to check for CMCI support. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) return (false); return ((mcg_cap & MCG_CAP_CMCI_P) != 0); } static inline bool tes_supported(uint64_t mcg_cap) { /* * MCG_CAP_TES_P bit is reserved in AMD documentation. Until * it is defined, do not use it to check for TES support. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) return (false); return ((mcg_cap & MCG_CAP_TES_P) != 0); } static inline bool ser_supported(uint64_t mcg_cap) { return (tes_supported(mcg_cap) && (mcg_cap & MCG_CAP_SER_P) != 0); } static int sysctl_positive_int(SYSCTL_HANDLER_ARGS) { int error, value; value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if (error || req->newptr == NULL) return (error); if (value <= 0) return (EINVAL); *(int *)arg1 = value; return (0); } static int sysctl_mca_records(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct mca_record record; struct mca_internal *rec; int i; if (namelen != 1) return (EINVAL); if (name[0] < 0 || name[0] >= mca_count) return (EINVAL); mtx_lock_spin(&mca_lock); if (name[0] >= mca_count) { mtx_unlock_spin(&mca_lock); return (EINVAL); } i = 0; STAILQ_FOREACH(rec, &mca_records, link) { if (i == name[0]) { record = rec->rec; break; } i++; } mtx_unlock_spin(&mca_lock); return (SYSCTL_OUT(req, &record, sizeof(record))); } static const char * mca_error_ttype(uint16_t mca_error) { switch ((mca_error & 0x000c) >> 2) { case 0: return ("I"); case 1: return ("D"); case 2: return ("G"); } return ("?"); } static const char * mca_error_level(uint16_t mca_error) { switch (mca_error & 0x0003) { case 0: return ("L0"); case 1: return ("L1"); case 2: return ("L2"); case 3: return ("LG"); } return ("L?"); } static const char * mca_error_request(uint16_t mca_error) { switch ((mca_error & 0x00f0) >> 4) { case 0x0: return ("ERR"); case 0x1: return ("RD"); case 0x2: return ("WR"); case 0x3: return ("DRD"); case 0x4: return ("DWR"); case 0x5: return ("IRD"); case 0x6: return ("PREFETCH"); case 0x7: return ("EVICT"); case 0x8: return ("SNOOP"); } return ("???"); } static const char * mca_error_mmtype(uint16_t mca_error) { switch ((mca_error & 0x70) >> 4) { case 0x0: return ("GEN"); case 0x1: return ("RD"); case 0x2: return ("WR"); case 0x3: 
 		return ("AC");
 	case 0x4:
 		return ("MS");
 	}
 	return ("???");
 }
 
 static const char *
 mca_addres_mode(uint64_t mca_misc)
 {
 
 	switch ((mca_misc & MC_MISC_ADDRESS_MODE) >> 6) {
 	case 0x0:
 		return ("Segment Offset");
 	case 0x1:
 		return ("Linear Address");
 	case 0x2:
 		return ("Physical Address");
 	case 0x3:
 		return ("Memory Address");
 	case 0x7:
 		return ("Generic");
 	}
 	return ("???");
 }
 
 static int
 mca_mute(const struct mca_record *rec)
 {
 
 	/*
 	 * Skip spurious corrected parity errors generated by Intel Haswell-
 	 * and Broadwell-based CPUs (see HSD131, HSM142, HSW131 and BDM48
 	 * erratum respectively), unless reporting is enabled.
 	 * Note that these errors also have been observed with the D0-stepping
 	 * of Haswell, while at least initially the CPU specification updates
 	 * suggested only the C0-stepping to be affected. Similarly, Celeron
 	 * 2955U with a CPU ID of 0x45 apparently are also concerned with the
 	 * same problem, with HSM142 only referring to 0x3c and 0x46.
 	 */
 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
 	    CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 	    (CPUID_TO_MODEL(cpu_id) == 0x3c ||	/* HSD131, HSM142, HSW131 */
 	    CPUID_TO_MODEL(cpu_id) == 0x3d ||	/* BDM48 */
 	    CPUID_TO_MODEL(cpu_id) == 0x45 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x46) &&	/* HSM142 */
 	    rec->mr_bank == 0 &&
 	    (rec->mr_status & 0xa0000000ffffffff) == 0x80000000000f0005 &&
 	    !intel6h_HSD131)
 		return (1);
 
 	return (0);
 }
 
+/*
+ * Pass mca_record to a memory controller driver so that it can decode
+ * the address and possibly do some more work.
+ */
+static void
+mca_decode_record(const struct mca_record *rec)
+{
+
+	if (mca_func != NULL)
+		mca_func(mca_decode_arg, rec);
+}
+
+int
+mca_register_decode_func(mca_decode_func func, void *aux)
+{
+	int error;
+
+	if (func == NULL)
+		return (EINVAL);
+
+	error = 0;
+	mtx_lock_spin(&mca_lock);
+	if (mca_func != NULL)
+		error = EEXIST;
+
+	if (error == 0) {
+		mca_func = func;
+		mca_decode_arg = aux;
+	}
+
+	mtx_unlock_spin(&mca_lock);
+	return (error);
+}
+
+int
+mca_deregister_decode_func(mca_decode_func func)
+{
+	if (func == NULL || func != mca_func)
+		return (EINVAL);
+
+	mca_func = NULL;
+	return (0);
+}
+
 /* Dump details about a single machine check. */
 static void
 mca_log(const struct mca_record *rec)
 {
 	uint16_t mca_error;
 
 	if (mca_mute(rec))
 		return;
 
 	if (!log_corrected && (rec->mr_status & MC_STATUS_UC) == 0 &&
 	    (!tes_supported(rec->mr_mcg_cap) ||
 	    ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) != 0x2))
 		return;
 
 	printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
 	    (long long)rec->mr_status);
 	printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
 	    (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
 	printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor,
 	    rec->mr_cpu_id, rec->mr_apic_id);
 	printf("MCA: CPU %d ", rec->mr_cpu);
 	if (rec->mr_status & MC_STATUS_UC)
 		printf("UNCOR ");
 	else {
 		printf("COR ");
 		if (cmci_supported(rec->mr_mcg_cap))
 			printf("(%lld) ", ((long long)rec->mr_status &
 			    MC_STATUS_COR_COUNT) >> 38);
 		if (tes_supported(rec->mr_mcg_cap)) {
 			switch ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) {
 			case 0x1:
 				printf("(Green) ");
 			case 0x2:
 				printf("(Yellow) ");
 			}
 		}
 	}
 	if (rec->mr_status & MC_STATUS_EN)
 		printf("EN ");
 	if (rec->mr_status & MC_STATUS_PCC)
 		printf("PCC ");
 	if (ser_supported(rec->mr_mcg_cap)) {
 		if (rec->mr_status & MC_STATUS_S)
 			printf("S ");
 		if (rec->mr_status & MC_STATUS_AR)
 			printf("AR ");
 	}
 	if (rec->mr_status & MC_STATUS_OVER)
 		printf("OVER ");
 	mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
 	switch (mca_error) {
 	/* Simple error codes.
*/ case 0x0000: printf("no error"); break; case 0x0001: printf("unclassified error"); break; case 0x0002: printf("ucode ROM parity error"); break; case 0x0003: printf("external error"); break; case 0x0004: printf("FRC error"); break; case 0x0005: printf("internal parity error"); break; case 0x0006: printf("SMM handler code access violation"); break; case 0x0400: printf("internal timer error"); break; case 0x0e0b: printf("generic I/O error"); if (rec->mr_cpu_vendor_id == CPU_VENDOR_INTEL && (rec->mr_status & MC_STATUS_MISCV)) { printf(" (pci%d:%d:%d:%d)", (int)((rec->mr_misc & MC_MISC_PCIE_SEG) >> 32), (int)((rec->mr_misc & MC_MISC_PCIE_BUS) >> 24), (int)((rec->mr_misc & MC_MISC_PCIE_SLOT) >> 19), (int)((rec->mr_misc & MC_MISC_PCIE_FUNC) >> 16)); } break; default: if ((mca_error & 0xfc00) == 0x0400) { printf("internal error %x", mca_error & 0x03ff); break; } /* Compound error codes. */ /* Memory hierarchy error. */ if ((mca_error & 0xeffc) == 0x000c) { printf("%s memory error", mca_error_level(mca_error)); break; } /* TLB error. */ if ((mca_error & 0xeff0) == 0x0010) { printf("%sTLB %s error", mca_error_ttype(mca_error), mca_error_level(mca_error)); break; } /* Memory controller error. */ if ((mca_error & 0xef80) == 0x0080) { printf("%s channel ", mca_error_mmtype(mca_error)); if ((mca_error & 0x000f) != 0x000f) printf("%d", mca_error & 0x000f); else printf("??"); printf(" memory error"); break; } /* Cache error. */ if ((mca_error & 0xef00) == 0x0100) { printf("%sCACHE %s %s error", mca_error_ttype(mca_error), mca_error_level(mca_error), mca_error_request(mca_error)); break; } /* Extended memory error. */ if ((mca_error & 0xef80) == 0x0280) { printf("%s channel ", mca_error_mmtype(mca_error)); if ((mca_error & 0x000f) != 0x000f) printf("%d", mca_error & 0x000f); else printf("??"); printf(" extended memory error"); break; } /* Bus and/or Interconnect error. */ if ((mca_error & 0xe800) == 0x0800) { printf("BUS%s ", mca_error_level(mca_error)); switch ((mca_error & 0x0600) >> 9) { case 0: printf("Source"); break; case 1: printf("Responder"); break; case 2: printf("Observer"); break; default: printf("???"); break; } printf(" %s ", mca_error_request(mca_error)); switch ((mca_error & 0x000c) >> 2) { case 0: printf("Memory"); break; case 2: printf("I/O"); break; case 3: printf("Other"); break; default: printf("???"); break; } if (mca_error & 0x0100) printf(" timed out"); break; } printf("unknown error %x", mca_error); break; } printf("\n"); if (rec->mr_status & MC_STATUS_ADDRV) { printf("MCA: Address 0x%llx", (long long)rec->mr_addr); if (ser_supported(rec->mr_mcg_cap) && (rec->mr_status & MC_STATUS_MISCV)) { printf(" (Mode: %s, LSB: %d)", mca_addres_mode(rec->mr_misc), (int)(rec->mr_misc & MC_MISC_RA_LSB)); } printf("\n"); } if (rec->mr_status & MC_STATUS_MISCV) printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc); + + mca_decode_record(rec); } static bool mca_is_mce(uint64_t mcg_cap, uint64_t status, bool *recoverablep) { /* Corrected error. */ if ((status & MC_STATUS_UC) == 0) return (0); /* Spurious MCA error. */ if ((status & MC_STATUS_EN) == 0) return (0); /* The processor does not support software error recovery. */ if (!ser_supported(mcg_cap)) { *recoverablep = false; return (1); } /* Context might have been corrupted. */ if (status & MC_STATUS_PCC) { *recoverablep = false; return (1); } /* Uncorrected software recoverable. */ if (status & MC_STATUS_S) { /* Action required vs optional. */ if (status & MC_STATUS_AR) *recoverablep = false; return (1); } /* Uncorrected no action required. 
*/ return (0); } static int mca_check_status(enum scan_mode mode, uint64_t mcg_cap, int bank, struct mca_record *rec, bool *recoverablep) { uint64_t status; u_int p[4]; bool mce, recover; status = rdmsr(mca_msr_ops.status(bank)); if (!(status & MC_STATUS_VAL)) return (0); recover = *recoverablep; mce = mca_is_mce(mcg_cap, status, &recover); if (mce != (mode == MCE)) return (0); *recoverablep = recover; /* Save exception information. */ rec->mr_status = status; rec->mr_bank = bank; rec->mr_addr = 0; if (status & MC_STATUS_ADDRV) rec->mr_addr = rdmsr(mca_msr_ops.addr(bank)); rec->mr_misc = 0; if (status & MC_STATUS_MISCV) rec->mr_misc = rdmsr(mca_msr_ops.misc(bank)); rec->mr_tsc = rdtsc(); rec->mr_apic_id = PCPU_GET(apic_id); rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP); rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS); rec->mr_cpu_id = cpu_id; rec->mr_cpu_vendor_id = cpu_vendor_id; rec->mr_cpu = PCPU_GET(cpuid); /* * Clear machine check. Don't do this for uncorrectable * errors so that the BIOS can see them. */ if (!mce || recover) { wrmsr(mca_msr_ops.status(bank), 0); do_cpuid(0, p); } return (1); } static void mca_resize_freelist(void) { struct mca_internal *next, *rec; STAILQ_HEAD(, mca_internal) tmplist; int count, i, desired_max, desired_min; /* * Ensure we have at least one record for each bank and one * record per CPU, but no more than twice that amount. */ desired_min = imax(mp_ncpus, mca_banks); desired_max = imax(mp_ncpus, mca_banks) * 2; STAILQ_INIT(&tmplist); mtx_lock_spin(&mca_lock); while (mca_freecount > desired_max) { rec = STAILQ_FIRST(&mca_freelist); KASSERT(rec != NULL, ("mca_freecount is %d, but list is empty", mca_freecount)); STAILQ_REMOVE_HEAD(&mca_freelist, link); mca_freecount--; STAILQ_INSERT_TAIL(&tmplist, rec, link); } while (mca_freecount < desired_min) { count = desired_min - mca_freecount; mtx_unlock_spin(&mca_lock); for (i = 0; i < count; i++) { rec = malloc(sizeof(*rec), M_MCA, M_WAITOK); STAILQ_INSERT_TAIL(&tmplist, rec, link); } mtx_lock_spin(&mca_lock); STAILQ_CONCAT(&mca_freelist, &tmplist); mca_freecount += count; } mtx_unlock_spin(&mca_lock); STAILQ_FOREACH_SAFE(rec, &tmplist, link, next) free(rec, M_MCA); } static void mca_resize(void *context, int pending) { mca_resize_freelist(); } static void mca_record_entry(enum scan_mode mode, const struct mca_record *record) { struct mca_internal *rec; if (mode == POLLED) { rec = malloc(sizeof(*rec), M_MCA, M_WAITOK); mtx_lock_spin(&mca_lock); } else { mtx_lock_spin(&mca_lock); rec = STAILQ_FIRST(&mca_freelist); if (rec == NULL) { printf("MCA: Unable to allocate space for an event.\n"); mca_log(record); mtx_unlock_spin(&mca_lock); return; } STAILQ_REMOVE_HEAD(&mca_freelist, link); mca_freecount--; } rec->rec = *record; STAILQ_INSERT_TAIL(&mca_pending, rec, link); mtx_unlock_spin(&mca_lock); } #ifdef DEV_APIC /* * Update the interrupt threshold for a CMCI. The strategy is to use * a low trigger that interrupts as soon as the first event occurs. * However, if a steady stream of events arrive, the threshold is * increased until the interrupts are throttled to once every * cmc_throttle seconds or the periodic scan. If a periodic scan * finds that the threshold is too high, it is lowered. 
*/ static int update_threshold(enum scan_mode mode, int valid, int last_intr, int count, int cur_threshold, int max_threshold) { u_int delta; int limit; delta = (u_int)(time_uptime - last_intr); limit = cur_threshold; /* * If an interrupt was received less than cmc_throttle seconds * since the previous interrupt and the count from the current * event is greater than or equal to the current threshold, * double the threshold up to the max. */ if (mode == CMCI && valid) { if (delta < cmc_throttle && count >= limit && limit < max_threshold) { limit = min(limit << 1, max_threshold); } return (limit); } /* * When the banks are polled, check to see if the threshold * should be lowered. */ if (mode != POLLED) return (limit); /* If a CMCI occurred recently, do nothing for now. */ if (delta < cmc_throttle) return (limit); /* * Compute a new limit based on the average rate of events per * cmc_throttle seconds since the last interrupt. */ if (valid) { limit = count * cmc_throttle / delta; if (limit <= 0) limit = 1; else if (limit > max_threshold) limit = max_threshold; } else { limit = 1; } return (limit); } static void cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec) { struct cmc_state *cc; uint64_t ctl; int cur_threshold, new_threshold; int count; /* Fetch the current limit for this bank. */ cc = &cmc_state[PCPU_GET(cpuid)][bank]; ctl = rdmsr(MSR_MC_CTL2(bank)); count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; cur_threshold = ctl & MC_CTL2_THRESHOLD; new_threshold = update_threshold(mode, valid, cc->last_intr, count, cur_threshold, cc->max_threshold); if (mode == CMCI && valid) cc->last_intr = time_uptime; if (new_threshold != cur_threshold) { ctl &= ~MC_CTL2_THRESHOLD; ctl |= new_threshold; wrmsr(MSR_MC_CTL2(bank), ctl); } } static void amd_thresholding_update(enum scan_mode mode, int bank, int valid) { struct amd_et_state *cc; uint64_t misc; int new_threshold; int count; cc = &amd_et_state[PCPU_GET(cpuid)][bank]; misc = rdmsr(mca_msr_ops.misc(bank)); count = (misc & MC_MISC_AMD_CNT_MASK) >> MC_MISC_AMD_CNT_SHIFT; count = count - (MC_MISC_AMD_CNT_MAX - cc->cur_threshold); new_threshold = update_threshold(mode, valid, cc->last_intr, count, cc->cur_threshold, MC_MISC_AMD_CNT_MAX); cc->cur_threshold = new_threshold; misc &= ~MC_MISC_AMD_CNT_MASK; misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold) << MC_MISC_AMD_CNT_SHIFT; misc &= ~MC_MISC_AMD_OVERFLOW; wrmsr(mca_msr_ops.misc(bank), misc); if (mode == CMCI && valid) cc->last_intr = time_uptime; } #endif /* * This scans all the machine check banks of the current CPU to see if * there are any machine checks. Any non-recoverable errors are * reported immediately via mca_log(). The current thread must be * pinned when this is called. The 'mode' parameter indicates if we * are being called from the MC exception handler, the CMCI handler, * or the periodic poller. */ static int mca_scan(enum scan_mode mode, bool *recoverablep) { struct mca_record rec; uint64_t mcg_cap; int count = 0, i, valid; mcg_cap = rdmsr(MSR_MCG_CAP); for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { #ifdef DEV_APIC /* * For a CMCI, only check banks this CPU is * responsible for. */ if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i)) continue; #endif valid = mca_check_status(mode, mcg_cap, i, &rec, recoverablep); if (valid) { count++; if (*recoverablep) mca_record_entry(mode, &rec); else mca_log(&rec); } #ifdef DEV_APIC /* * If this is a bank this CPU monitors via CMCI, * update the threshold. 
*/ if (PCPU_GET(cmci_mask) & 1 << i) { if (cmc_state != NULL) cmci_update(mode, i, valid, &rec); else amd_thresholding_update(mode, i, valid); } #endif } return (count); } /* * Store a new record on the mca_records list while enforcing * mca_maxcount. */ static void mca_store_record(struct mca_internal *mca) { /* * If we are storing no records (mca_maxcount == 0), * we just free this record. * * If we are storing records (mca_maxcount != 0) and * we have free space on the list, store the record * and increment mca_count. * * If we are storing records and we do not have free * space on the list, store the new record at the * tail and free the oldest one from the head. */ if (mca_maxcount != 0) STAILQ_INSERT_TAIL(&mca_records, mca, link); if (mca_maxcount < 0 || mca_count < mca_maxcount) mca_count++; else { if (mca_maxcount != 0) { mca = STAILQ_FIRST(&mca_records); STAILQ_REMOVE_HEAD(&mca_records, link); } STAILQ_INSERT_TAIL(&mca_freelist, mca, link); mca_freecount++; } } /* * Do the work to process machine check records which have just been * gathered. Print any pending logs to the console. Queue them for storage. * Trigger a resizing of the free list. */ static void mca_process_records(enum scan_mode mode) { struct mca_internal *mca; mtx_lock_spin(&mca_lock); while ((mca = STAILQ_FIRST(&mca_pending)) != NULL) { STAILQ_REMOVE_HEAD(&mca_pending, link); mca_log(&mca->rec); mca_store_record(mca); } mtx_unlock_spin(&mca_lock); if (mode == POLLED) mca_resize_freelist(); else if (!cold) taskqueue_enqueue(mca_tq, &mca_resize_task); } /* * Scan the machine check banks on all CPUs by binding to each CPU in * turn. If any of the CPUs contained new machine check records, log * them to the console. */ static void mca_scan_cpus(void *context, int pending) { struct thread *td; int cpu; bool recoverable = true; mca_resize_freelist(); td = curthread; thread_lock(td); CPU_FOREACH(cpu) { sched_bind(td, cpu); thread_unlock(td); mca_scan(POLLED, &recoverable); thread_lock(td); sched_unbind(td); } thread_unlock(td); if (!STAILQ_EMPTY(&mca_pending)) mca_process_records(POLLED); taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task, mca_ticks * SBT_1S, 0, C_PREL(1)); } static int sysctl_mca_scan(SYSCTL_HANDLER_ARGS) { int error, i; i = 0; error = sysctl_handle_int(oidp, &i, 0, req); if (error) return (error); if (i) taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task, 0, 0, 0); return (0); } static int sysctl_mca_maxcount(SYSCTL_HANDLER_ARGS) { struct mca_internal *mca; int error, i; bool doresize; i = mca_maxcount; error = sysctl_handle_int(oidp, &i, 0, req); if (error || req->newptr == NULL) return (error); mtx_lock_spin(&mca_lock); mca_maxcount = i; doresize = false; if (mca_maxcount >= 0) while (mca_count > mca_maxcount) { mca = STAILQ_FIRST(&mca_records); STAILQ_REMOVE_HEAD(&mca_records, link); mca_count--; STAILQ_INSERT_TAIL(&mca_freelist, mca, link); mca_freecount++; doresize = true; } mtx_unlock_spin(&mca_lock); if (doresize && !cold) taskqueue_enqueue(mca_tq, &mca_resize_task); return (error); } static void mca_startup(void *dummy) { if (mca_banks <= 0) return; /* CMCIs during boot may have claimed items from the freelist. 
*/ mca_resize_freelist(); taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq"); taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task, mca_ticks * SBT_1S, 0, C_PREL(1)); } SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL); #ifdef DEV_APIC static void cmci_setup(void) { int i; cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *), M_MCA, M_WAITOK); for (i = 0; i <= mp_maxid; i++) cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks, M_MCA, M_WAITOK | M_ZERO); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &cmc_throttle, 0, sysctl_positive_int, "I", "Interval in seconds to throttle corrected MC interrupts"); } static void amd_thresholding_setup(void) { u_int i; amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state *), M_MCA, M_WAITOK); for (i = 0; i <= mp_maxid; i++) amd_et_state[i] = malloc(sizeof(struct amd_et_state) * mca_banks, M_MCA, M_WAITOK | M_ZERO); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &cmc_throttle, 0, sysctl_positive_int, "I", "Interval in seconds to throttle corrected MC interrupts"); } #endif static void mca_setup(uint64_t mcg_cap) { /* * On AMD Family 10h processors, unless logging of level one TLB * parity (L1TP) errors is disabled, enable the recommended workaround * for Erratum 383. */ if (cpu_vendor_id == CPU_VENDOR_AMD && CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP) workaround_erratum383 = 1; mca_banks = mcg_cap & MCG_CAP_COUNT; mtx_init(&mca_lock, "mca", NULL, MTX_SPIN); STAILQ_INIT(&mca_records); STAILQ_INIT(&mca_pending); mca_tq = taskqueue_create_fast("mca", M_WAITOK, taskqueue_thread_enqueue, &mca_tq); TIMEOUT_TASK_INIT(mca_tq, &mca_scan_task, 0, mca_scan_cpus, NULL); STAILQ_INIT(&mca_freelist); TASK_INIT(&mca_resize_task, 0, mca_resize, NULL); mca_resize_freelist(); SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0, "Record count"); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "maxcount", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &mca_maxcount, 0, sysctl_mca_maxcount, "I", "Maximum record count (-1 is unlimited)"); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "interval", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &mca_ticks, 0, sysctl_positive_int, "I", "Periodic interval in seconds to scan for machine checks"); SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "records", CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mca_records, "Machine check records"); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_mca_scan, "I", "Force an immediate scan for machine checks"); #ifdef DEV_APIC if (cmci_supported(mcg_cap)) cmci_setup(); else if (amd_thresholding_supported()) amd_thresholding_setup(); #endif } #ifdef DEV_APIC /* * See if we should monitor CMCI for this bank. If CMCI_EN is already * set in MC_CTL2, then another CPU is responsible for this bank, so * ignore it. If CMCI_EN returns zero after being set, then this bank * does not support CMCI_EN. If this CPU sets CMCI_EN, then it should * now monitor this bank. 
*/ static void cmci_monitor(int i) { struct cmc_state *cc; uint64_t ctl; KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid))); /* * It is possible for some APs to report CMCI support even if the BSP * does not, apparently due to a BIOS bug. */ if (cmc_state == NULL) { if (bootverbose) { printf( "AP %d (%d,%d) reports CMCI support but the BSP does not\n", PCPU_GET(cpuid), PCPU_GET(apic_id), PCPU_GET(acpi_id)); } return; } ctl = rdmsr(MSR_MC_CTL2(i)); if (ctl & MC_CTL2_CMCI_EN) /* Already monitored by another CPU. */ return; /* Set the threshold to one event for now. */ ctl &= ~MC_CTL2_THRESHOLD; ctl |= MC_CTL2_CMCI_EN | 1; wrmsr(MSR_MC_CTL2(i), ctl); ctl = rdmsr(MSR_MC_CTL2(i)); if (!(ctl & MC_CTL2_CMCI_EN)) /* This bank does not support CMCI. */ return; cc = &cmc_state[PCPU_GET(cpuid)][i]; /* Determine maximum threshold. */ ctl &= ~MC_CTL2_THRESHOLD; ctl |= 0x7fff; wrmsr(MSR_MC_CTL2(i), ctl); ctl = rdmsr(MSR_MC_CTL2(i)); cc->max_threshold = ctl & MC_CTL2_THRESHOLD; /* Start off with a threshold of 1. */ ctl &= ~MC_CTL2_THRESHOLD; ctl |= 1; wrmsr(MSR_MC_CTL2(i), ctl); /* Mark this bank as monitored. */ PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i); } /* * For resume, reset the threshold for any banks we monitor back to * one and throw away the timestamp of the last interrupt. */ static void cmci_resume(int i) { struct cmc_state *cc; uint64_t ctl; KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid))); /* See cmci_monitor(). */ if (cmc_state == NULL) return; /* Ignore banks not monitored by this CPU. */ if (!(PCPU_GET(cmci_mask) & 1 << i)) return; cc = &cmc_state[PCPU_GET(cpuid)][i]; cc->last_intr = 0; ctl = rdmsr(MSR_MC_CTL2(i)); ctl &= ~MC_CTL2_THRESHOLD; ctl |= MC_CTL2_CMCI_EN | 1; wrmsr(MSR_MC_CTL2(i), ctl); } /* * Apply an AMD ET configuration to the corresponding MSR. */ static void amd_thresholding_start(struct amd_et_state *cc, int bank) { uint64_t misc; KASSERT(amd_elvt >= 0, ("ELVT offset is not set")); misc = rdmsr(mca_msr_ops.misc(bank)); misc &= ~MC_MISC_AMD_INT_MASK; misc |= MC_MISC_AMD_INT_LVT; misc &= ~MC_MISC_AMD_LVT_MASK; misc |= (uint64_t)amd_elvt << MC_MISC_AMD_LVT_SHIFT; misc &= ~MC_MISC_AMD_CNT_MASK; misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold) << MC_MISC_AMD_CNT_SHIFT; misc &= ~MC_MISC_AMD_OVERFLOW; misc |= MC_MISC_AMD_CNTEN; wrmsr(mca_msr_ops.misc(bank), misc); } static void amd_thresholding_monitor(int i) { struct amd_et_state *cc; uint64_t misc; /* * Kludge: On 10h, banks after 4 are not thresholding but also may have * bogus Valid bits. Skip them. This is definitely fixed in 15h, but * I have not investigated whether it is fixed in earlier models. */ if (CPUID_TO_FAMILY(cpu_id) < 0x15 && i >= 5) return; /* The counter must be valid and present. */ misc = rdmsr(mca_msr_ops.misc(i)); if ((misc & (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP)) != (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP)) return; /* The register should not be locked. */ if ((misc & MC_MISC_AMD_LOCK) != 0) { if (bootverbose) printf("%s: 0x%jx: Bank %d: locked\n", __func__, (uintmax_t)misc, i); return; } /* * If counter is enabled then either the firmware or another CPU * has already claimed it. */ if ((misc & MC_MISC_AMD_CNTEN) != 0) { if (bootverbose) printf("%s: 0x%jx: Bank %d: already enabled\n", __func__, (uintmax_t)misc, i); return; } /* * Configure an Extended Interrupt LVT register for reporting * counter overflows if that feature is supported and the first * extended register is available. 
*/ amd_elvt = lapic_enable_mca_elvt(); if (amd_elvt < 0) { printf("%s: Bank %d: lapic enable mca elvt failed: %d\n", __func__, i, amd_elvt); return; } cc = &amd_et_state[PCPU_GET(cpuid)][i]; cc->cur_threshold = 1; amd_thresholding_start(cc, i); /* Mark this bank as monitored. */ PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i); } static void amd_thresholding_resume(int i) { struct amd_et_state *cc; KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid))); /* Ignore banks not monitored by this CPU. */ if (!(PCPU_GET(cmci_mask) & 1 << i)) return; cc = &amd_et_state[PCPU_GET(cpuid)][i]; cc->last_intr = 0; cc->cur_threshold = 1; amd_thresholding_start(cc, i); } #endif /* * Initializes per-CPU machine check registers and enables corrected * machine check interrupts. */ static void _mca_init(int boot) { uint64_t mcg_cap; uint64_t ctl, mask; int i, skip, family; family = CPUID_TO_FAMILY(cpu_id); /* MCE is required. */ if (!mca_enabled || !(cpu_feature & CPUID_MCE)) return; if (cpu_feature & CPUID_MCA) { if (boot) PCPU_SET(cmci_mask, 0); mcg_cap = rdmsr(MSR_MCG_CAP); if (mcg_cap & MCG_CAP_CTL_P) /* Enable MCA features. */ wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE); if (IS_BSP() && boot) mca_setup(mcg_cap); /* * Disable logging of level one TLB parity (L1TP) errors by * the data cache as an alternative workaround for AMD Family * 10h Erratum 383. Unlike the recommended workaround, there * is no performance penalty to this workaround. However, * L1TP errors will go unreported. */ if (cpu_vendor_id == CPU_VENDOR_AMD && family == 0x10 && !amd10h_L1TP) { mask = rdmsr(MSR_MC0_CTL_MASK); if ((mask & (1UL << 5)) == 0) wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5)); } if (amd_rascap & AMDRAS_SCALABLE_MCA) { mca_msr_ops.ctl = mca_smca_ctl_reg; mca_msr_ops.status = mca_smca_status_reg; mca_msr_ops.addr = mca_smca_addr_reg; mca_msr_ops.misc = mca_smca_misc_reg; } /* Enable local MCE if supported. */ if (cpu_vendor_id == CPU_VENDOR_INTEL && (mcg_cap & MCG_CAP_LMCE_P) && (rdmsr(MSR_IA32_FEATURE_CONTROL) & IA32_FEATURE_CONTROL_LMCE_EN)) wrmsr(MSR_MCG_EXT_CTL, rdmsr(MSR_MCG_EXT_CTL) | 1); /* * The cmci_monitor() must not be executed * simultaneously by several CPUs. */ if (boot) mtx_lock_spin(&mca_lock); for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { /* By default enable logging of all errors. */ ctl = 0xffffffffffffffffUL; skip = 0; if (cpu_vendor_id == CPU_VENDOR_INTEL) { /* * For P6 models before Nehalem MC0_CTL is * always enabled and reserved. */ if (i == 0 && family == 0x6 && CPUID_TO_MODEL(cpu_id) < 0x1a) skip = 1; } else if (cpu_vendor_id == CPU_VENDOR_AMD) { /* BKDG for Family 10h: unset GartTblWkEn. */ if (i == MC_AMDNB_BANK && family >= 0xf && family < 0x17) ctl &= ~(1UL << 10); } if (!skip) wrmsr(mca_msr_ops.ctl(i), ctl); #ifdef DEV_APIC if (cmci_supported(mcg_cap)) { if (boot) cmci_monitor(i); else cmci_resume(i); } else if (amd_thresholding_supported()) { if (boot) amd_thresholding_monitor(i); else amd_thresholding_resume(i); } #endif /* Clear all errors. */ wrmsr(mca_msr_ops.status(i), 0); } if (boot) mtx_unlock_spin(&mca_lock); #ifdef DEV_APIC if (cmci_supported(mcg_cap) && PCPU_GET(cmci_mask) != 0 && boot) lapic_enable_cmc(); #endif } load_cr4(rcr4() | CR4_MCE); } /* Must be executed on each CPU during boot. */ void mca_init(void) { _mca_init(1); } /* Must be executed on each CPU during resume. */ void mca_resume(void) { _mca_init(0); } /* * The machine check registers for the BSP cannot be initialized until * the local APIC is initialized. This happens at SI_SUB_CPU, * SI_ORDER_SECOND. 
*/ static void mca_init_bsp(void *arg __unused) { mca_init(); } SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL); /* Called when a machine check exception fires. */ void mca_intr(void) { uint64_t mcg_status; int count; bool lmcs, recoverable; if (!(cpu_feature & CPUID_MCA)) { /* * Just print the values of the old Pentium registers * and panic. */ printf("MC Type: 0x%jx Address: 0x%jx\n", (uintmax_t)rdmsr(MSR_P5_MC_TYPE), (uintmax_t)rdmsr(MSR_P5_MC_ADDR)); panic("Machine check exception"); } /* Scan the banks and check for any non-recoverable errors. */ mcg_status = rdmsr(MSR_MCG_STATUS); recoverable = (mcg_status & MCG_STATUS_RIPV) != 0; lmcs = (cpu_vendor_id != CPU_VENDOR_INTEL || (mcg_status & MCG_STATUS_LMCS)); count = mca_scan(MCE, &recoverable); if (!recoverable) { /* * Only panic if the error was detected local to this CPU. * Some errors will assert a machine check on all CPUs, but * only certain CPUs will find a valid bank to log. */ while (!lmcs && count == 0) cpu_spinwait(); panic("Unrecoverable machine check exception"); } /* Clear MCIP. */ wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP); } #ifdef DEV_APIC /* Called for a CMCI (correctable machine check interrupt). */ void cmc_intr(void) { bool recoverable = true; /* * Serialize MCA bank scanning to prevent collisions from * sibling threads. * * If we found anything, log them to the console. */ if (mca_scan(CMCI, &recoverable) != 0) mca_process_records(CMCI); } #endif
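Illustrative only, not part of the patch: a minimal sketch of how a memory-controller driver might consume the new hook. The driver name, softc layout, and the DIMM-lookup helper are assumptions; only mca_register_decode_func(), mca_deregister_decode_func(), struct mca_record, and MC_STATUS_ADDRV come from the interfaces in the diff. Note that mca_log() can run from the machine-check path and is called with the mca_lock spin mutex held in several places, so the callback must not sleep.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <machine/mca.h>
#include <machine/specialreg.h>

struct examplemc_softc {
	device_t	dev;
};

/* Hypothetical helper: map a physical address to a DIMM label. */
static void	examplemc_report_dimm(struct examplemc_softc *sc, uint64_t pa);

static void
examplemc_decode(void *arg, const struct mca_record *rec)
{
	struct examplemc_softc *sc = arg;

	/* Only records with a valid address can be mapped to a DIMM. */
	if ((rec->mr_status & MC_STATUS_ADDRV) == 0)
		return;
	examplemc_report_dimm(sc, rec->mr_addr);
}

static int
examplemc_attach(device_t dev)
{
	struct examplemc_softc *sc = device_get_softc(dev);

	sc->dev = dev;
	/* Fails with EEXIST if another driver already owns the hook. */
	return (mca_register_decode_func(examplemc_decode, sc));
}

static int
examplemc_detach(device_t dev)
{

	(void)mca_deregister_decode_func(examplemc_decode);
	return (0);
}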
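The records gathered by mca_record_entry() and mca_store_record() are exported through the hw.mca.records sysctl node handled by sysctl_mca_records(). A small userland sketch (again illustrative, not part of the patch) that walks that node; it assumes the machine supports MCA so that mca_setup() has created the hw.mca tree:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <machine/mca.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	struct mca_record rec;
	size_t len, miblen;
	int count, i, mib[8];

	len = sizeof(count);
	if (sysctlbyname("hw.mca.count", &count, &len, NULL, 0) != 0)
		err(1, "hw.mca.count");

	miblen = sizeof(mib) / sizeof(mib[0]);
	if (sysctlnametomib("hw.mca.records", mib, &miblen) != 0)
		err(1, "hw.mca.records");

	/* Each stored record is a child of hw.mca.records indexed by number. */
	for (i = 0; i < count; i++) {
		mib[miblen] = i;
		len = sizeof(rec);
		if (sysctl(mib, miblen + 1, &rec, &len, NULL, 0) != 0)
			err(1, "record %d", i);
		printf("bank %d status 0x%016jx addr 0x%016jx cpu %d\n",
		    rec.mr_bank, (uintmax_t)rec.mr_status,
		    (uintmax_t)rec.mr_addr, rec.mr_cpu);
	}
	return (0);
}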
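The throttling policy documented above update_threshold() can also be shown with a self-contained sketch (not kernel code; cmc_throttle is hard-wired to its 60 second default and the MCE mode is not modelled): CMCIs that arrive faster than cmc_throttle double the bank threshold up to its maximum, and a later polled scan sizes it back down from the observed event rate.

#include <stdio.h>

/* Mirror of the documented policy with cmc_throttle fixed at 60 seconds. */
static int
threshold_step(int cmci, int delta, int count, int cur, int max)
{
	int limit;

	if (cmci) {
		/* Events arriving faster than cmc_throttle: double up to max. */
		if (delta < 60 && count >= cur && cur < max)
			return (cur * 2 > max ? max : cur * 2);
		return (cur);
	}
	if (delta < 60)		/* A CMCI fired recently; leave it alone. */
		return (cur);
	/* Polled path: derive the threshold from the average event rate. */
	limit = count * 60 / delta;
	if (limit <= 0)
		limit = 1;
	else if (limit > max)
		limit = max;
	return (limit);
}

int
main(void)
{
	int i, threshold = 1;

	/* Five CMCIs 10 seconds apart: 1 -> 2 -> 4 -> 8 -> 16 -> 32. */
	for (i = 0; i < 5; i++)
		threshold = threshold_step(1, 10, threshold, threshold, 0x7fff);
	printf("threshold after burst: %d\n", threshold);

	/* A polled scan that sees 2 events in 300 seconds drops it back to 1. */
	threshold = threshold_step(0, 300, 2, threshold, 0x7fff);
	printf("threshold after polling: %d\n", threshold);
	return (0);
}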