Index: sys/conf/options.amd64 =================================================================== --- sys/conf/options.amd64 +++ sys/conf/options.amd64 @@ -9,6 +9,7 @@ MPTABLE_FORCE_HTT MP_WATCHDOG NKPT opt_pmap.h +NMI_WATCHDOG opt_watchdog.h PV_STATS opt_pmap.h # Options for emulators. These should only be used at config time, so Index: sys/conf/options.i386 =================================================================== --- sys/conf/options.i386 +++ sys/conf/options.i386 @@ -13,6 +13,7 @@ MPTABLE_FORCE_HTT MP_WATCHDOG NKPT opt_pmap.h +NMI_WATCHDOG opt_watchdog.h PERFMON PMAP_SHPGPERPROC opt_pmap.h POWERFAIL_NMI opt_trap.h Index: sys/dev/acpica/acpi_hpet.c =================================================================== --- sys/dev/acpica/acpi_hpet.c +++ sys/dev/acpica/acpi_hpet.c @@ -57,8 +57,13 @@ #include #ifdef DEV_APIC -#include "pcib_if.h" +#include +#ifdef SMP +#include #endif +#include +#include "pcib_if.h" +#endif /* DEV_APIC */ #define HPET_VENDID_AMD 0x4353 #define HPET_VENDID_AMD2 0x1022 @@ -112,6 +117,7 @@ uint32_t div; uint32_t next; char name[8]; + bool nmi; } t[32]; int num_timers; struct cdev *pdev; @@ -215,10 +221,16 @@ t->mode = TIMER_ONESHOT; t->div = 0; } - if (first != 0) + KASSERT(!t->nmi || t->mode == TIMER_ONESHOT, + ("NMI timer started in periodic mode")); + if (first != 0) { fdiv = (sc->freq * first) >> 32; - else + /* Save calculated timeout for the sake of resume. */ + if (t->nmi) + t->div = fdiv; + } else { fdiv = t->div; + } if (t->irq < 0) bus_write_4(sc->mem_res, HPET_ISR, 1 << t->num); t->caps |= HPET_TCNF_INT_ENB; @@ -262,7 +274,100 @@ return (0); } +#if defined(DEV_APIC) static int +hpet_fsb_setup(struct hpet_timer *t) +{ + struct hpet_softc *sc = t->sc; + uint64_t addr; + uint32_t data; + int err; + + err = PCIB_MAP_MSI(device_get_parent(device_get_parent(sc->dev)), + sc->dev, t->irq, &addr, &data); + if (err != 0) + return (err); + /* + * If NMI mode is disabled, then use the returned values as is. 
+ * Otherwise, directly configure NMI delivery mode. + * Destination ID is either set to BSP or to broadcast. + * This code has some direct knowledge of MSI internals. + */ + if (t->nmi) { + data = IOART_TRGREDG; + data |= amd_intr_delmode_bug ? IOART_DELRSV1 : IOART_DELNMI; +#ifdef SMP + addr &= ~0x000ff000u; + addr |= (nmi_is_broadcast ? 0xff : boot_cpu_id) << 12; +#endif + } + bus_write_4(sc->mem_res, HPET_TIMER_FSB_ADDR(t->num), addr); + bus_write_4(sc->mem_res, HPET_TIMER_FSB_VAL(t->num), data); + return (0); +} +#endif + +static int +hpet_set_nmi_mode(struct eventtimer *et, boolean_t enable) +{ +#if defined(DEV_APIC) + struct hpet_timer *mt = (struct hpet_timer *)et->et_priv; + struct hpet_softc *sc = mt->sc; + struct hpet_timer *t; + int err; + + t = (mt->pcpu_master < 0) ? mt : &sc->t[mt->pcpu_slaves[curcpu]]; + if ((t->caps & HPET_TCNF_FSB_EN) == 0) + return (ENOTSUP); + + if (t->nmi == enable) + return (0); + t->nmi = enable; + err = hpet_fsb_setup(t); + if (err != 0) + t->nmi = !enable; + return (err); +#else /* DEV_APIC */ + return (ENOTSUP); +#endif /* DEV_APIC */ +} + +static int +hpet_check_nmi(struct eventtimer *et) +{ +#if defined(DEV_APIC) + struct hpet_timer *mt = (struct hpet_timer *)et->et_priv; + struct hpet_softc *sc = mt->sc; + struct hpet_timer *t; + uint32_t val; + + t = (mt->pcpu_master < 0) ? mt : &sc->t[mt->pcpu_slaves[curcpu]]; + if ((t->caps & HPET_TCNF_FSB_EN) == 0) + return (ENOTSUP); + + if (!t->nmi) + return (ENOENT); + + val = bus_read_4(sc->mem_res, HPET_ISR); + if ((val & (1 << t->num)) == 0) + return (ENOENT); + + /* + * Clear the interrupt status bit as well as interrupt enable bit + * to avoid another NMI after the counter wrap-around. + * With a 32-bit counter and a typical frequency the wrap-around + * happens in less than 5 minutes.
+ */ + bus_write_4(sc->mem_res, HPET_ISR, 1 << t->num); + t->caps &= ~HPET_TCNF_INT_ENB; + bus_write_4(sc->mem_res, HPET_TIMER_CAP_CNF(t->num), t->caps); + return (0); +#else /* DEV_APIC */ + return (ENOTSUP); +#endif /* DEV_APIC */ +} + +static int hpet_intr_single(void *arg) { struct hpet_timer *t = (struct hpet_timer *)arg; @@ -743,18 +848,9 @@ } else #ifdef DEV_APIC if ((t->caps & HPET_TCAP_FSB_INT_DEL) && t->irq >= 0) { - uint64_t addr; - uint32_t data; - - if (PCIB_MAP_MSI( - device_get_parent(device_get_parent(dev)), dev, - t->irq, &addr, &data) == 0) { - bus_write_4(sc->mem_res, - HPET_TIMER_FSB_ADDR(i), addr); - bus_write_4(sc->mem_res, - HPET_TIMER_FSB_VAL(i), data); + if (hpet_fsb_setup(t) == 0) t->caps |= HPET_TCNF_FSB_EN; - } else + else t->irq = -2; } else #endif @@ -775,6 +871,10 @@ t->et.et_name = t->name; } t->et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT; +#if defined(DEV_APIC) + if ((t->caps & HPET_TCNF_FSB_EN) != 0) + t->et.et_flags |= ET_FLAGS_NMI; +#endif t->et.et_quality = 450; if (t->pcpu_master >= 0) { t->et.et_flags |= ET_FLAGS_PERCPU; @@ -789,6 +889,8 @@ t->et.et_max_period = (0xfffffffeLLU << 32) / sc->freq; t->et.et_start = hpet_start; t->et.et_stop = hpet_stop; + t->et.et_set_nmi_mode = hpet_set_nmi_mode; + t->et.et_check_nmi = hpet_check_nmi; t->et.et_priv = &sc->t[i]; if (t->pcpu_master < 0 || t->pcpu_master == i) { et_register(&t->et); @@ -868,19 +970,8 @@ for (i = 0; i < sc->num_timers; i++) { t = &sc->t[i]; #ifdef DEV_APIC - if (t->irq >= 0 && (sc->legacy_route == 0 || i >= 2)) { - uint64_t addr; - uint32_t data; - - if (PCIB_MAP_MSI( - device_get_parent(device_get_parent(dev)), dev, - t->irq, &addr, &data) == 0) { - bus_write_4(sc->mem_res, - HPET_TIMER_FSB_ADDR(i), addr); - bus_write_4(sc->mem_res, - HPET_TIMER_FSB_VAL(i), data); - } - } + if ((t->caps & HPET_TCNF_FSB_EN) != 0) + (void)hpet_fsb_setup(t); #endif if (t->mode == TIMER_STOPPED) continue; @@ -897,7 +988,7 @@ bus_write_4(sc->mem_res, 
HPET_TIMER_COMPARATOR(t->num), t->div); } else { - t->next += sc->freq / 1024; + t->next += t->nmi ? t->div : sc->freq / 1024; bus_write_4(sc->mem_res, HPET_TIMER_COMPARATOR(t->num), t->next); } @@ -942,24 +1033,16 @@ { struct hpet_softc *sc = device_get_softc(dev); struct hpet_timer *t; - uint64_t addr; - uint32_t data; int error, i; for (i = 0; i < sc->num_timers; i++) { t = &sc->t[i]; if (t->irq != irq) continue; - error = PCIB_MAP_MSI( - device_get_parent(device_get_parent(dev)), dev, - irq, &addr, &data); - if (error) - return (error); hpet_disable(sc); /* Stop timer to avoid interrupt loss. */ - bus_write_4(sc->mem_res, HPET_TIMER_FSB_ADDR(i), addr); - bus_write_4(sc->mem_res, HPET_TIMER_FSB_VAL(i), data); + error = hpet_fsb_setup(t); hpet_enable(sc); - return (0); + return (error); } return (ENOENT); } Index: sys/kern/kern_clocksource.c =================================================================== --- sys/kern/kern_clocksource.c +++ sys/kern/kern_clocksource.c @@ -34,6 +34,7 @@ */ #include "opt_device_polling.h" +#include "opt_watchdog.h" #include #include @@ -50,6 +51,9 @@ #include #include #include +#if defined(NMI_WATCHDOG) +#include +#endif #include #include @@ -586,6 +590,162 @@ return (freq); } +#if defined(NMI_WATCHDOG) +static int nmi_wd_enable; +static struct eventtimer *nmi_wd_et; +static eventhandler_tag nmi_wd_event; +static sbintime_t nmi_wd_period; + +static void +nmicb(struct eventtimer *et, void *arg) +{ + + printf("bug: timer callback invoked while in NMI mode\n"); +} + +int +nmi_watchdog_check(void) +{ + int err; + + if (nmi_wd_et == NULL) + return (0); + err = et_check_nmi(nmi_wd_et); + return (err == 0); +} + +static void +nmi_watchdog_config(struct eventtimer *et, u_int cmd, int *error) +{ + sbintime_t when; + uint64_t ns; + uint64_t hi; + uint64_t lo; + int ret; + + if (cmd != 0) { + /* + * Nanoseconds are multiplied by 2^32 and divided by 10^9. 
+ * To avoid overflow and loss of precision the above order + * is used for the lower 32 bits, while the upper 32 bits + * are first divided and then multiplied. + */ + cmd &= WD_INTERVAL; + ns = (uint64_t)1 << cmd; + hi = (ns >> 32) << 32; + lo = ns & (((uint64_t)1 << 32) - 1); + when = ((hi / 1000000000) << 32) + (lo << 32) / 1000000000; + + if (when < et->et_min_period) + when = et->et_min_period; + if (when <= et->et_max_period) { + nmi_wd_period = when; + ret = et_start(et, when, 0); + + /* On success, report it. */ + if (ret == 0) + *error = 0; + return; + } + } + + /* + * The timer is stopped if the requested timeout is too large + * or we are asked to stop the timer. + * If we were asked to stop and failed, then report the error. + */ + nmi_wd_period = 0; + ret = et_stop(et); + if (cmd == 0 && ret != 0) + *error = EOPNOTSUPP; +} + +static int +init_nmi_watchdog(void) +{ + int error; + + /* + * For now do not use per-cpu timers, so that only one NMI is delivered. + * Also, it would be nice to be able to specify a minimum requirement + * on et_max_period.
+ */ + nmi_wd_et = et_find(NULL, + ET_FLAGS_ONESHOT | ET_FLAGS_NMI | ET_FLAGS_PERCPU, + ET_FLAGS_ONESHOT | ET_FLAGS_NMI); + if (nmi_wd_et == NULL) { + printf("NMI watchdog: failed to find suitable timer\n"); + return (ENXIO); + } + + printf("NMI watchdog: found timer %s\n", nmi_wd_et->et_name); + error = et_init(nmi_wd_et, nmicb, NULL, NULL); + if (error != 0) { + printf("NMI watchdog: failed to claim the timer\n"); + nmi_wd_et = NULL; + return (error); + } + error = et_set_nmi_mode(nmi_wd_et, true); + if (error == 0) { + printf("NMI watchdog: using timer %s\n", nmi_wd_et->et_name); + printf("maximum supported timeout is %ju seconds\n", + (uintmax_t)nmi_wd_et->et_max_period / SBT_1S); + nmi_wd_event = EVENTHANDLER_REGISTER(watchdog_list, + nmi_watchdog_config, nmi_wd_et, 0); + } else { + printf("NMI watchdog: failed to set NMI mode\n"); + (void)et_free(nmi_wd_et); + nmi_wd_et = NULL; + } + return (error); +} + +static int +disable_nmi_watchdog(void) +{ + int error = 0; + + /* First, disengage from consumers. */ + EVENTHANDLER_DEREGISTER(watchdog_list, nmi_wd_event); + + /* Stop the timer. 
*/ + nmi_watchdog_config(nmi_wd_et, 0, &error); + if (error == 0) { + error = et_set_nmi_mode(nmi_wd_et, false); + if (error == 0) { + (void)et_free(nmi_wd_et); + nmi_wd_et = NULL; + printf("NMI watchdog: disabled\n"); + } + } + if (error != 0) + nmi_wd_event = EVENTHANDLER_REGISTER(watchdog_list, + nmi_watchdog_config, nmi_wd_et, 0); + return (error); +} + +/* Sysctl handler: report or toggle the event timer based NMI watchdog. */ +static int +nmi_wd_sysctl(SYSCTL_HANDLER_ARGS) +{ + int enable; + int error; + + enable = nmi_wd_et != NULL; + error = sysctl_handle_int(oidp, &enable, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + /* Nothing to do if the watchdog is already in the requested state. */ + if ((enable != 0) == (nmi_wd_et != NULL)) + return (0); + return (enable != 0 ? init_nmi_watchdog() : disable_nmi_watchdog()); +} + +SYSCTL_PROC(_kern, OID_AUTO, nmi_watchdog_enable, + CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH, + NULL, 0, nmi_wd_sysctl, "I", "Enable NMI watchdog based on event timers"); +#endif + /* * Configure and start event timers (BSP part). */ @@ -672,6 +832,12 @@ ET_LOCK(); configtimer(1); ET_UNLOCK(); + +#if defined(NMI_WATCHDOG) + TUNABLE_INT_FETCH("kern.nmi_watchdog_enable", &nmi_wd_enable); + if (nmi_wd_enable) + init_nmi_watchdog(); +#endif } /* Index: sys/kern/kern_et.c =================================================================== --- sys/kern/kern_et.c +++ sys/kern/kern_et.c @@ -68,6 +68,10 @@ } } KASSERT(et->et_start, ("et_register: timer has no start function")); + if ((et->et_flags & ET_FLAGS_NMI) != 0) { + KASSERT(et->et_set_nmi_mode != NULL && et->et_check_nmi != NULL, + ("timer claims to support NMI but does not provide hooks")); + } et->et_sysctl = SYSCTL_ADD_NODE_WITH_LABEL(NULL, SYSCTL_STATIC_CHILDREN(_kern_eventtimer_et), OID_AUTO, et->et_name, CTLFLAG_RW, 0, "event timer description", "eventtimer"); @@ -235,6 +239,29 @@ et->et_active = 0; return (0); +} + +/* Enable or disable Non-Maskable mode of timer interrupt delivery.
*/ +int +et_set_nmi_mode(struct eventtimer *et, boolean_t enable) +{ + + if (!et->et_active) + return (ENXIO); + if ((et->et_flags & ET_FLAGS_NMI) == 0) + return (ENOTSUP); + return (et->et_set_nmi_mode(et, enable)); +} + +int +et_check_nmi(struct eventtimer *et) +{ + + if (!et->et_active) + return (ENXIO); + if ((et->et_flags & ET_FLAGS_NMI) == 0) + return (ENOTSUP); + return (et->et_check_nmi(et)); } /* Report list of supported event timer hardware via sysctl. */ Index: sys/sys/timeet.h =================================================================== --- sys/sys/timeet.h +++ sys/sys/timeet.h @@ -51,6 +51,8 @@ typedef int et_stop_t(struct eventtimer *et); typedef void et_event_cb_t(struct eventtimer *et, void *arg); typedef int et_deregister_cb_t(struct eventtimer *et, void *arg); +typedef int et_set_nmi_mode_t(struct eventtimer *et, boolean_t enable); +typedef int et_check_nmi_t(struct eventtimer *et); struct eventtimer { SLIST_ENTRY(eventtimer) et_all; @@ -59,11 +61,12 @@ /* Name of the event timer. 
*/ int et_flags; /* Set of capabilities flags: */ -#define ET_FLAGS_PERIODIC 1 -#define ET_FLAGS_ONESHOT 2 -#define ET_FLAGS_PERCPU 4 -#define ET_FLAGS_C3STOP 8 -#define ET_FLAGS_POW2DIV 16 +#define ET_FLAGS_PERIODIC 0x01 +#define ET_FLAGS_ONESHOT 0x02 +#define ET_FLAGS_PERCPU 0x04 +#define ET_FLAGS_C3STOP 0x08 +#define ET_FLAGS_POW2DIV 0x10 +#define ET_FLAGS_NMI 0x20 /* timer can deliver NMI */ int et_quality; /* * Used to determine if this timecounter is better than @@ -78,6 +81,8 @@ et_stop_t *et_stop; et_event_cb_t *et_event_cb; et_deregister_cb_t *et_deregister_cb; + et_set_nmi_mode_t *et_set_nmi_mode; + et_check_nmi_t *et_check_nmi; void *et_arg; void *et_priv; struct sysctl_oid *et_sysctl; @@ -100,6 +105,8 @@ int et_stop(struct eventtimer *et); int et_ban(struct eventtimer *et); int et_free(struct eventtimer *et); +int et_set_nmi_mode(struct eventtimer *et, boolean_t enable); +int et_check_nmi(struct eventtimer *et); #ifdef SYSCTL_DECL SYSCTL_DECL(_kern_eventtimer); Index: sys/sys/watchdog.h =================================================================== --- sys/sys/watchdog.h +++ sys/sys/watchdog.h @@ -118,8 +118,11 @@ * if no hardware watchdog has been attached, and if the software module * has initialized the function pointer. 
*/ - extern void (*wdog_software_attach)(void); -#endif + +#ifdef NMI_WATCHDOG +int nmi_watchdog_check(void); +#endif /* NMI_WATCHDOG */ +#endif /* _KERNEL */ #endif /* _SYS_WATCHDOG_H */ Index: sys/x86/include/x86_var.h =================================================================== --- sys/x86/include/x86_var.h +++ sys/x86/include/x86_var.h @@ -86,6 +86,7 @@ extern int pti; extern int hw_ibrs_active; extern int hw_ssb_active; +extern int amd_intr_delmode_bug; struct pcb; struct thread; Index: sys/x86/x86/cpu_machdep.c =================================================================== --- sys/x86/x86/cpu_machdep.c +++ sys/x86/x86/cpu_machdep.c @@ -52,6 +52,7 @@ #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_platform.h" +#include "opt_watchdog.h" #ifdef __i386__ #include "opt_apic.h" #endif @@ -72,6 +73,9 @@ #include #include #include +#ifdef NMI_WATCHDOG +#include +#endif #include #include @@ -743,6 +747,12 @@ { bool claimed = false; +#ifdef NMI_WATCHDOG + if (nmi_watchdog_check()) { + claimed = true; + panic("NMI watchdog"); + } +#endif /* NMI_WATCHDOG */ #ifdef DEV_ISA /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(frame->tf_err)) { Index: sys/x86/x86/io_apic.c =================================================================== --- sys/x86/x86/io_apic.c +++ sys/x86/x86/io_apic.c @@ -149,6 +149,12 @@ SYSCTL_INT(_hw_apic, OID_AUTO, enable_extint, CTLFLAG_RDTUN, &enable_extint, 0, "Enable the ExtINT pin in the first I/O APIC"); +int amd_intr_delmode_bug; +SYSCTL_INT(_hw_apic, OID_AUTO, amd_intr_delmode_bug, CTLFLAG_RDTUN, + &amd_intr_delmode_bug, 0, + "IO-APIC and MSI Interrupt delivery mode is interpreted according to " + "HyperTransport specification"); + static void _ioapic_eoi_source(struct intsrc *isrc, int locked) {