Index: sys/amd64/amd64/apic_vector.S
===================================================================
--- sys/amd64/amd64/apic_vector.S
+++ sys/amd64/amd64/apic_vector.S
@@ -297,6 +297,18 @@
 	jmp	doreti
 
 /*
+ * Executed by a CPU when it receives an IPI_KLOAD from another CPU.
+ */
+	.text
+	SUPERALIGN_TEXT
+IDTVEC(cpukload)
+	PUSH_FRAME
+
+	call	as_lapic_eoi
+	call	cpukload_handler
+	jmp	doreti
+
+/*
  * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU.
  *
  * - Calls the generic rendezvous action function.
Index: sys/amd64/amd64/mp_machdep.c
===================================================================
--- sys/amd64/amd64/mp_machdep.c
+++ sys/amd64/amd64/mp_machdep.c
@@ -160,6 +160,9 @@
 	/* Install an inter-CPU IPI for CPU suspend/resume */
 	setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0);
 
+	/* Install an inter-CPU IPI for kload shutdown */
+	setidt(IPI_KLOAD, IDTVEC(cpukload), SDT_SYSIGT, SEL_KPL, 0);
+
 	/* Set boot_cpu_id if needed. */
 	if (boot_cpu_id == -1) {
 		boot_cpu_id = PCPU_GET(apic_id);
Index: sys/amd64/include/intr_machdep.h
===================================================================
--- sys/amd64/include/intr_machdep.h
+++ sys/amd64/include/intr_machdep.h
@@ -173,6 +173,7 @@
 int	intr_register_source(struct intsrc *isrc);
 int	intr_remove_handler(void *cookie);
 void	intr_resume(bool suspend_cancelled);
+int	intr_clear_all_handlers(void);
 void	intr_suspend(void);
 void	intr_reprogram(void);
 void	intrcnt_add(const char *name, u_long **countp);
Index: sys/kern/kern_module.c
===================================================================
--- sys/kern/kern_module.c
+++ sys/kern/kern_module.c
@@ -63,7 +63,6 @@
 static TAILQ_HEAD(modulelist, module) modules;
 struct sx modules_sx;
 static int nextid = 1;
-static void module_shutdown(void *, int);
 
 static int
 modevent_nop(module_t mod, int what, void *arg)
@@ -91,7 +90,7 @@
 
 SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0);
 
-static void
+void
 module_shutdown(void *arg1, int arg2)
 {
 	module_t mod;
Index: sys/kern/kern_shutdown.c
===================================================================
--- sys/kern/kern_shutdown.c
+++ sys/kern/kern_shutdown.c
@@ -318,6 +318,12 @@
 {
 	static int once = 0;
 
+	/*
+	 * Do not use kload if we're coming to this code via panic
+	 */
+	if (panicstr != NULL)
+		howto |= RB_ABORT_KLOAD;
+
 #if defined(SMP)
 	/*
 	 * Bind us to CPU 0 so that all shutdown code runs there.  Some
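The hunks above install the IPI_KLOAD vector and keep kern_reboot() from using kload during a panic. For orientation, a minimal sketch of what delivering the new IPI amounts to is shown below; kload_kick_cpu() is a hypothetical helper and not part of this patch, and IPI_KLOAD itself is defined in the apicvar.h hunk further down. In practice the IPI is sent through generic_stop_cpus(), as the subr_smp.c hunk that follows shows.

#include <sys/param.h>
#include <sys/systm.h>
#include <x86/apicvar.h>

/*
 * Hypothetical helper, sketch only: deliver IPI_KLOAD to a single CPU.
 * The target enters IDTVEC(cpukload), acknowledges the interrupt via
 * as_lapic_eoi and then runs cpukload_handler().
 */
static void
kload_kick_cpu(int apic_id)
{

	lapic_ipi_vectored(IPI_KLOAD, apic_id);
}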
Index: sys/kern/subr_smp.c
===================================================================
--- sys/kern/subr_smp.c
+++ sys/kern/subr_smp.c
@@ -218,7 +218,8 @@
 
 	KASSERT(
 #if defined(__amd64__) || defined(__i386__)
-	    type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND,
+	    type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND ||
+	    type == IPI_KLOAD,
 #else
 	    type == IPI_STOP || type == IPI_STOP_HARD,
 #endif
@@ -238,7 +239,7 @@
 	 * will be lost, violating FreeBSD's assumption of reliable
 	 * IPI delivery.
 	 */
-	if (type == IPI_SUSPEND)
+	if (type == IPI_SUSPEND || type == IPI_KLOAD)
 		mtx_lock_spin(&smp_ipi_mtx);
 #endif
 
@@ -252,7 +253,7 @@
 	ipi_selected(map, type);
 
 #if defined(__amd64__) || defined(__i386__)
-	if (type == IPI_SUSPEND)
+	if (type == IPI_SUSPEND || type == IPI_KLOAD)
 		cpus = &suspended_cpus;
 	else
 #endif
@@ -270,7 +271,7 @@
 	}
 
 #if defined(__amd64__) || defined(__i386__)
-	if (type == IPI_SUSPEND)
+	if (type == IPI_SUSPEND || type == IPI_KLOAD)
 		mtx_unlock_spin(&smp_ipi_mtx);
 #endif
 
@@ -299,6 +300,13 @@
 
 	return (generic_stop_cpus(map, IPI_SUSPEND));
 }
+
+int
+kload_suspend_cpus(cpuset_t map)
+{
+
+	return (generic_stop_cpus(map, IPI_KLOAD));
+}
 #endif
 
 /*
Index: sys/sys/eventhandler.h
===================================================================
--- sys/sys/eventhandler.h
+++ sys/sys/eventhandler.h
@@ -173,6 +173,7 @@
 #define	SHUTDOWN_PRI_FIRST	EVENTHANDLER_PRI_FIRST
 #define	SHUTDOWN_PRI_DEFAULT	EVENTHANDLER_PRI_ANY
 #define	SHUTDOWN_PRI_LAST	EVENTHANDLER_PRI_LAST
+#define	SHUTDOWN_PRI_KLOAD	(EVENTHANDLER_PRI_LAST - 100)
 
 EVENTHANDLER_DECLARE(shutdown_pre_sync, shutdown_fn);	/* before fs sync */
 EVENTHANDLER_DECLARE(shutdown_post_sync, shutdown_fn);	/* after fs sync */
Index: sys/sys/module.h
===================================================================
--- sys/sys/module.h
+++ sys/sys/module.h
@@ -226,6 +226,7 @@
 const char *	module_getname(module_t);
 void		module_setspecific(module_t, modspecific_t *);
 struct linker_file *module_file(module_t);
+void		module_shutdown(void *arg1, int arg2);
 
 #ifdef	MOD_DEBUG
 extern int mod_debug;
Index: sys/sys/reboot.h
===================================================================
--- sys/sys/reboot.h
+++ sys/sys/reboot.h
@@ -60,6 +60,7 @@
 #define	RB_RESERVED2	0x80000	/* reserved for internal use of boot blocks */
 #define	RB_PAUSE	0x100000 /* pause after each output line during probe */
 #define	RB_REROOT	0x200000 /* unmount the rootfs and mount it again */
+#define	RB_ABORT_KLOAD	0x400000 /* abort kload reboot if system has panicked */
 #define	RB_MULTIPLE	0x20000000	/* use multiple consoles */
 
 #define	RB_BOOTINFO	0x80000000	/* have `struct bootinfo *' arg */
Index: sys/sys/smp.h
===================================================================
--- sys/sys/smp.h
+++ sys/sys/smp.h
@@ -166,6 +166,7 @@
 int	stop_cpus_hard(cpuset_t);
 #if defined(__amd64__) || defined(__i386__)
 int	suspend_cpus(cpuset_t);
+int	kload_suspend_cpus(cpuset_t);
 int	resume_cpus(cpuset_t);
 #endif
 
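Several of the pieces above only make sense together: module_shutdown() is exported, RB_ABORT_KLOAD is set when a panic is in progress, and SHUTDOWN_PRI_KLOAD slots in just before SHUTDOWN_PRI_LAST. A sketch of how a kload shutdown hook might tie them together is shown below; kload_shutdown_final() is illustrative only and not part of this patch.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/reboot.h>

/*
 * Sketch only: a shutdown_final handler registered at SHUTDOWN_PRI_KLOAD
 * runs after the ordinary shutdown handlers but just before the
 * SHUTDOWN_PRI_LAST ones, and can call the now-exported
 * module_shutdown() itself to give drivers a final chance to quiesce.
 */
static void
kload_shutdown_final(void *arg __unused, int howto)
{

	if ((howto & RB_ABORT_KLOAD) != 0)
		return;			/* Panic in progress; skip kload. */

	module_shutdown(NULL, howto);
	/* ... continue with the kload-specific teardown here ... */
}
EVENTHANDLER_DEFINE(shutdown_final, kload_shutdown_final, NULL,
    SHUTDOWN_PRI_KLOAD);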
Index: sys/x86/include/apicvar.h
===================================================================
--- sys/x86/include/apicvar.h
+++ sys/x86/include/apicvar.h
@@ -123,11 +123,12 @@
 
 #define	IPI_STOP	(APIC_IPI_INTS + 6)	/* Stop CPU until restarted. */
 #define	IPI_SUSPEND	(APIC_IPI_INTS + 7)	/* Suspend CPU until restarted. */
+#define	IPI_KLOAD	(APIC_IPI_INTS + 8)	/* Suspend CPU for kload reboot. */
 #ifdef __i386__
-#define	IPI_LAZYPMAP	(APIC_IPI_INTS + 8)	/* Lazy pmap release. */
-#define	IPI_DYN_FIRST	(APIC_IPI_INTS + 9)
+#define	IPI_LAZYPMAP	(APIC_IPI_INTS + 9)	/* Lazy pmap release. */
+#define	IPI_DYN_FIRST	(APIC_IPI_INTS + 10)
 #else
-#define	IPI_DYN_FIRST	(APIC_IPI_INTS + 8)
+#define	IPI_DYN_FIRST	(APIC_IPI_INTS + 9)
 #endif
 #define	IPI_DYN_LAST	(253)	/* IPIs allocated at runtime */
 
@@ -213,6 +214,7 @@
 	int	(*id)(void);
 	int	(*intr_pending)(u_int);
 	void	(*set_logical_id)(u_int, u_int, u_int);
+	void	(*clear_lapic)(u_int);
 	u_int	(*cpuid)(u_int);
 
 	/* Vectors */
@@ -318,6 +320,13 @@
 	apic_ops.set_logical_id(apic_id, cluster, cluster_id);
 }
 
+static inline void
+lapic_clear_lapic(u_int disable)
+{
+
+	apic_ops.clear_lapic(disable);
+}
+
 static inline u_int
 apic_cpuid(u_int apic_id)
 {
@@ -456,6 +465,7 @@
 void	lapic_handle_intr(int vector, struct trapframe *frame);
 void	lapic_handle_timer(struct trapframe *frame);
 void	hv_vector_handler(struct trapframe *frame);
+void	lapic_clear_lapic(u_int);
 
 extern int x2apic_mode;
 extern int lapic_eoi_suppression;
Index: sys/x86/include/x86_smp.h
===================================================================
--- sys/x86/include/x86_smp.h
+++ sys/x86/include/x86_smp.h
@@ -70,13 +70,15 @@
 	IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */
 	IDTVEC(cpustop),	/* CPU stops & waits to be restarted */
 	IDTVEC(cpususpend),	/* CPU suspends & waits to be resumed */
-	IDTVEC(rendezvous);	/* handle CPU rendezvous */
+	IDTVEC(rendezvous),	/* handle CPU rendezvous */
+	IDTVEC(cpukload);	/* CPU suspends for kload reboots */
 
 /* functions in x86_mp.c */
 void	assign_cpu_ids(void);
 void	cpu_add(u_int apic_id, char boot_cpu);
 void	cpustop_handler(void);
 void	cpususpend_handler(void);
+void	cpukload_handler(void);
 void	init_secondary_tail(void);
 void	invltlb_handler(void);
 void	invlpg_handler(void);
Index: sys/x86/x86/intr_machdep.c
===================================================================
--- sys/x86/x86/intr_machdep.c
+++ sys/x86/x86/intr_machdep.c
@@ -194,6 +194,24 @@
 }
 
 int
+intr_clear_all_handlers(void)
+{
+	struct intsrc *isrc;
+	int i;
+
+	mtx_lock(&intr_table_lock);
+	for (i = 0; i < NUM_IO_INTS; i++) {
+		isrc = interrupt_sources[i];
+		if (isrc != NULL) {
+			isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
+			isrc->is_pic->pic_disable_intr(isrc);
+		}
+	}
+	mtx_unlock(&intr_table_lock);
+	return (0);
+}
+
+int
 intr_remove_handler(void *cookie)
 {
 	struct intsrc *isrc;
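Despite its name, intr_clear_all_handlers() does not detach any handlers: it walks interrupt_sources[] and masks every registered source at its PIC so the staged kernel does not see stale device interrupts. Together with kload_suspend_cpus() and lapic_clear_lapic() (declared in the apicvar.h hunk above and implemented in local_apic.c below), the intended teardown order appears to be roughly the following; kload_quiesce() is a sketch only, not part of the patch.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/pcpu.h>
#include <sys/smp.h>
#include <machine/intr_machdep.h>
#include <x86/apicvar.h>

/*
 * Sketch only: park the other CPUs first, then mask all I/O interrupt
 * sources, and finally clear the calling CPU's LVT entries (passing 1
 * instead of 0 would also disable its local APIC via lapic_disable()).
 */
static void
kload_quiesce(void)
{
	cpuset_t map;

	map = all_cpus;
	CPU_CLR(PCPU_GET(cpuid), &map);
	kload_suspend_cpus(map);

	(void)intr_clear_all_handlers();
	lapic_clear_lapic(0);
}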
Index: sys/x86/x86/local_apic.c
===================================================================
--- sys/x86/x86/local_apic.c
+++ sys/x86/x86/local_apic.c
@@ -290,6 +290,7 @@
 static	void	native_lapic_setup(int boot);
 static	void	native_lapic_dump(const char *str);
 static	void	native_lapic_disable(void);
+static	void	native_lapic_clear_lapic(u_int);
 static	void	native_lapic_eoi(void);
 static	int	native_lapic_id(void);
 static	int	native_lapic_intr_pending(u_int vector);
@@ -333,6 +334,7 @@
 	.id = native_lapic_id,
 	.intr_pending = native_lapic_intr_pending,
 	.set_logical_id = native_lapic_set_logical_id,
+	.clear_lapic = native_lapic_clear_lapic,
 	.cpuid = native_apic_cpuid,
 	.alloc_vector = native_apic_alloc_vector,
 	.alloc_vectors = native_apic_alloc_vectors,
@@ -590,6 +592,68 @@
 	intr_restore(saveintr);
 }
 
+static void
+native_lapic_clear_lapic(u_int disable)
+{
+	struct lapic *la;
+	uint32_t value;
+	uint32_t maxlvt;
+
+	la = &lapics[lapic_id()];
+	maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT;
+
+	if (bootverbose)
+		printf("%s lapic_id(%d) cpu(%d) la %p lapic_map %p maxlvt %u\n",
+		    __func__, lapic_id(), PCPU_GET(cpuid), la,
+		    lapic_map, maxlvt);
+
+	/*
+	 * First set the mask bit on each LVT entry to keep new
+	 * interrupts from arriving while letting pending interrupts
+	 * finish, and only *then* reset the registers to their default
+	 * values.  If pending interrupts are not allowed to drain, the
+	 * kload'ed kernel will see them before the appropriate handlers
+	 * are in place and trigger a panic.
+	 */
+	if (maxlvt >= APIC_LVT_ERROR) {			/* aka 3 */
+		value = lapic_read32(LAPIC_LVT_ERROR);
+		lapic_write32(LAPIC_LVT_ERROR, value | APIC_LVT_M);
+	}
+
+	value = lapic_read32(LAPIC_LVT_TIMER);
+	lapic_write32(LAPIC_LVT_TIMER, value | APIC_LVT_M);
+
+	value = lapic_read32(LAPIC_LVT_LINT0);
+	lapic_write32(LAPIC_LVT_LINT0, value | APIC_LVT_M);
+
+	value = lapic_read32(LAPIC_LVT_LINT1);
+	lapic_write32(LAPIC_LVT_LINT1, value | APIC_LVT_M);
+
+	if (maxlvt >= APIC_LVT_PMC) {			/* aka 4 */
+		value = lapic_read32(LAPIC_LVT_PCINT);
+		lapic_write32(LAPIC_LVT_PCINT, value | APIC_LVT_M);
+	}
+	if (maxlvt >= APIC_LVT_THERMAL)			/* aka 5 */
+		printf("%s Therm Vector\n", __func__);
+	if (maxlvt >= APIC_LVT_CMCI)			/* aka 6 */
+		printf("%s Intel MCE\n", __func__);
+
+	/* Reset the LVT entries to their masked defaults. */
+	lapic_write32(LAPIC_LVT_TIMER, APIC_LVTT_M);	/* masked */
+	lapic_write32(LAPIC_LVT_LINT0, APIC_LVT_M);	/* masked */
+	lapic_write32(LAPIC_LVT_LINT1, APIC_LVT_M);	/* masked */
+	if (maxlvt >= APIC_LVT_ERROR)			/* aka 3 */
+		lapic_write32(LAPIC_LVT_ERROR, APIC_LVT_M);
+	if (maxlvt >= APIC_LVT_PMC)			/* aka 4 */
+		lapic_write32(LAPIC_LVT_PCINT, APIC_LVT_M); /* masked */
+
+	if (disable) {
+		if (bootverbose)
+			printf("lapic disable\n");
+		lapic_disable();
+	}
+}
+
 static void
 native_lapic_setup(int boot)
 {
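native_lapic_clear_lapic() repeats the same read/OR/write sequence for each local vector table entry. If it sat next to that function in local_apic.c, a small helper along the following lines (not part of the patch; the register argument is typed loosely as u_int here) would shrink it and make the mask-first, reset-second ordering easier to follow. lapic_read32(), lapic_write32() and APIC_LVT_M are the accessors and mask bit already used above.

/* Sketch only, intended to live inside local_apic.c next to the code above. */
static void
lapic_mask_lvt(u_int lvt_reg)
{

	lapic_write32(lvt_reg, lapic_read32(lvt_reg) | APIC_LVT_M);
}

native_lapic_clear_lapic() could then simply call lapic_mask_lvt(LAPIC_LVT_TIMER), lapic_mask_lvt(LAPIC_LVT_LINT0), and so on before resetting the entries to their masked defaults.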
Index: sys/x86/x86/mp_x86.c
===================================================================
--- sys/x86/x86/mp_x86.c
+++ sys/x86/x86/mp_x86.c
@@ -35,6 +35,7 @@
 #include "opt_pmap.h"
 #include "opt_sched.h"
 #include "opt_smp.h"
+#include "opt_kload.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -90,6 +91,13 @@
 extern struct pcpu __pcpu[];
 
+#ifdef KLOAD
+/* Page table set up by kload so the APs can be put on a known page table. */
+extern pt_entry_t kload_pgtbl;
+#else
+static pt_entry_t kload_pgtbl = 0;
+#endif
+
 /* AP uses this during bootstrap.  Do not staticize. */
 char *bootSTK;
 int bootAP;
 
@@ -962,6 +970,73 @@
 	}
 }
 
+
+/*
+ * Handle an IPI_KLOAD by saving our current context, switching to the
+ * kload page table and halting until the new kernel takes over.
+ */
+void
+cpukload_handler(void)
+{
+	u_int cpu;
+
+	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
+
+	/*
+	 * Mask and disable the local APIC so that no further
+	 * interrupts are delivered to this CPU.
+	 */
+	lapic_clear_lapic(1 /* disable lapic */);
+
+	cpu = PCPU_GET(cpuid);
+	printf("%s cpu %d\n", __func__, cpu);
+	if (savectx(&susppcbs[cpu]->sp_pcb)) {
+#ifdef __amd64__
+		fpususpend(susppcbs[cpu]->sp_fpususpend);
+#else
+		npxsuspend(susppcbs[cpu]->sp_fpususpend);
+#endif
+		wbinvd();
+		CPU_SET_ATOMIC(cpu, &suspended_cpus);
+	} else {
+#ifdef __amd64__
+		fpuresume(susppcbs[cpu]->sp_fpususpend);
+#else
+		npxresume(susppcbs[cpu]->sp_fpususpend);
+#endif
+		pmap_init_pat();
+		initializecpu();
+		PCPU_SET(switchtime, 0);
+		PCPU_SET(switchticks, ticks);
+
+		/* Indicate that we are resumed */
+		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
+	}
+
+	if (kload_pgtbl) {
+		/*
+		 * Switch to the boot-capable page table set up by kload
+		 * so this AP halts on a known page table.  A normal
+		 * suspend/resume restores the original page table instead.
+		 */
+		(void)intr_disable();
+		load_cr3(kload_pgtbl);
+
+		/* Disable PGE. */
+		load_cr4(rcr4() & ~CR4_PGE);
+
+		/* Disable caches (CD = 1, NW = 0). */
+		load_cr0((rcr0() & ~CR0_NW) | CR0_CD | CR0_PG);
+
+		/* Flush caches and TLBs. */
+		wbinvd();
+		invltlb();
+
+		halt();
+
+	}
+}
+
 /*
  * Handle an IPI_SUSPEND by saving our current context and spinning until we
  * are resumed.
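The #include "opt_kload.h" and the extern declaration of kload_pgtbl imply a KLOAD kernel option and a kload component that publishes a boot-capable page table; neither is part of this diff. Once kload_suspend_cpus() returns, every other CPU has run cpukload_handler(), flagged itself in suspended_cpus and halted. A caller that wants to double-check that handshake before jumping into the staged kernel could do something along the lines of the sketch below; kload_assert_cpus_parked() is hypothetical.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/pcpu.h>
#include <sys/smp.h>

/* Sketch only: verify that every other CPU parked itself in suspended_cpus. */
static void
kload_assert_cpus_parked(void)
{
	u_int cpu;

	CPU_FOREACH(cpu) {
		if (cpu == PCPU_GET(cpuid))
			continue;
		if (!CPU_ISSET(cpu, &suspended_cpus))
			panic("kload: CPU %u did not park", cpu);
	}
}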