diff --git a/sys/i386/include/xen/evtchn.h b/sys/i386/include/xen/evtchn.h index 1124105573b8..92ee578601e7 100644 --- a/sys/i386/include/xen/evtchn.h +++ b/sys/i386/include/xen/evtchn.h @@ -1,82 +1,83 @@ /****************************************************************************** * evtchn.h * * Communication via Xen event channels. * Also definitions for the device that demuxes notifications to userspace. * * Copyright (c) 2004, K A Fraser * * $FreeBSD$ */ #ifndef __ASM_EVTCHN_H__ #define __ASM_EVTCHN_H__ #include #include #include #include /* * LOW-LEVEL DEFINITIONS */ /* * Unlike notify_remote_via_evtchn(), this is safe to use across * save/restore. Notifications on a broken connection are silently dropped. */ void notify_remote_via_irq(int irq); /* Entry point for notifications into Linux subsystems. */ void evtchn_do_upcall(struct trapframe *frame); /* Entry point for notifications into the userland character device. */ void evtchn_device_upcall(int port); void mask_evtchn(int port); void unmask_evtchn(int port); static inline void clear_evtchn(int port) { shared_info_t *s = HYPERVISOR_shared_info; synch_clear_bit(port, &s->evtchn_pending[0]); } static inline void notify_remote_via_evtchn(int port) { struct evtchn_send send = { .port = port }; (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send); } /* * Use these to access the event channel underlying the IRQ handle returned * by bind_*_to_irqhandler(). */ int irq_to_evtchn_port(int irq); +void ipi_pcpu(unsigned int cpu, int vector); /* * CHARACTER-DEVICE DEFINITIONS */ #define PORT_NORMAL 0x0000 #define PORT_EXCEPTION 0x8000 #define PORTIDX_MASK 0x7fff /* /dev/xen/evtchn resides at device number major=10, minor=200 */ #define EVTCHN_MINOR 200 /* /dev/xen/evtchn ioctls: */ /* EVTCHN_RESET: Clear and reinit the event buffer. Clear error condition. */ #define EVTCHN_RESET _IO('E', 1) /* EVTCHN_BIND: Bind to the specified event-channel port. 
*/ #define EVTCHN_BIND _IO('E', 2) /* EVTCHN_UNBIND: Unbind from the specified event-channel port. */ #define EVTCHN_UNBIND _IO('E', 3) #endif /* __ASM_EVTCHN_H__ */ diff --git a/sys/i386/xen/mp_machdep.c b/sys/i386/xen/mp_machdep.c index 651d14d3e6dc..c1af917a8a1f 100644 --- a/sys/i386/xen/mp_machdep.c +++ b/sys/i386/xen/mp_machdep.c @@ -1,1120 +1,1110 @@ /*- * Copyright (c) 1996, by Steve Passe * Copyright (c) 2008, by Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_cpu.h" #include "opt_kstack_pages.h" #include "opt_mp_watchdog.h" #include "opt_sched.h" #include "opt_smp.h" #if !defined(lint) #if !defined(SMP) #error How did you get here? 
#endif #ifndef DEV_APIC #error The apic device is required for SMP, add "device apic" to your config file. #endif #if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) #error SMP not supported with CPU_DISABLE_CMPXCHG #endif #endif /* not lint */ #include #include #include #include /* cngetc() */ #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define stop_cpus_with_nmi 0 int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ extern struct pcpu __pcpu[]; static int bootAP; static union descriptor *bootAPgdt; /* Free these after use */ void *bootstacks[MAXCPU]; /* Hotwire a 0->4MB V==P mapping */ extern pt_entry_t *KPTphys; struct pcb stoppcbs[MAXCPU]; /* Variables needed for SMP tlb shootdown. */ vm_offset_t smp_tlb_addr1; vm_offset_t smp_tlb_addr2; volatile int smp_tlb_wait; static u_int logical_cpus; /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ static volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. 
*/
struct cpu_info {
	int	cpu_present:1;	/* set once cpu_add() has seen this APIC ID */
	int	cpu_bsp:1;	/* this APIC ID is the boot processor */
	int	cpu_disabled:1;	/* excluded from use by assign_cpu_ids() */
} static cpu_info[MAX_APIC_ID + 1];	/* indexed by APIC ID */

/* Logical CPU ID -> APIC ID table; initialized to -1 by cpu_mp_start(). */
int cpu_apic_ids[MAXCPU];

/* Holds pending bitmap based IPIs per CPU */
static volatile u_int cpu_ipi_pending[MAXCPU];

static u_int boot_address;

static void	assign_cpu_ids(void);
static void	set_interrupt_apic_ids(void);
int	start_all_aps(void);
static int	start_ap(int apic_id);
static void	release_aps(void *dummy);

static u_int	hyperthreading_cpus;
static cpumask_t		hyperthreading_cpus_mask;

/* Callback entry points handed to Xen in cpu_initialize_context(). */
extern void Xhypervisor_callback(void);
extern void failsafe_callback(void);

/*
 * Build the CPU topology description from the cpu_cores and cpu_logical
 * counts.
 *
 * A count of zero is treated as one; the globals themselves are updated
 * in place.  If mp_ncpus is not a multiple of cpu_cores * cpu_logical a
 * warning is printed and the flat topology from smp_topo_none() is
 * returned.
 */
struct cpu_group *
cpu_topo(void)
{
	if (cpu_cores == 0)
		cpu_cores = 1;
	if (cpu_logical == 0)
		cpu_logical = 1;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}
	/*
	 * No multi-core or hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());
	/*
	 * Only HTT no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
	/*
	 * Only multi-core no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0));
	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_NONE, cpu_cores,
	    CG_SHARE_L1, cpu_logical, CG_FLAG_HTT));
}

/*
 * Calculate usable address in base memory for AP trampoline code.
*/ u_int mp_bootaddress(u_int basemem) { return (basemem); } void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > MAX_APIC_ID) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %d claims to be BSP, but CPU %d already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (mp_ncpus < MAXCPU) mp_ncpus++; if (bootverbose) printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { mp_maxid = MAXCPU - 1; } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ all_cpus = 1; if (mp_ncpus == 0) { /* * No CPUs were found, so this must be a UP system. Setup * the variables to represent a system with a single CPU * with an id of 0. */ mp_ncpus = 1; return (0); } /* At least one CPU was found. */ if (mp_ncpus == 1) { /* * One CPU was found, so this must be a UP system with * an I/O APIC. */ return (0); } /* At least two CPUs were found. */ return (1); } /* * Initialize the IPI handlers and start up the AP's. */ void cpu_mp_start(void) { int i; /* Initialize the logical ID to APIC ID table. */ for (i = 0; i < MAXCPU; i++) { cpu_apic_ids[i] = -1; cpu_ipi_pending[i] = 0; } /* Set boot_cpu_id if needed. */ if (boot_cpu_id == -1) { boot_cpu_id = PCPU_GET(apic_id); cpu_info[boot_cpu_id].cpu_bsp = 1; } else KASSERT(boot_cpu_id == PCPU_GET(apic_id), ("BSP's APIC ID doesn't match boot_cpu_id")); cpu_apic_ids[0] = boot_cpu_id; assign_cpu_ids(); /* Start each Application Processor */ start_all_aps(); /* Setup the initial logical CPUs info. */ logical_cpus = logical_cpus_mask = 0; if (cpu_feature & CPUID_HTT) logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; set_interrupt_apic_ids(); } /* * Print various information about the SMP system hardware and setup. 
*/
void
cpu_mp_announce(void)
{
	int i, x;

	/* List CPUs */
	printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id);
	/* x walks every APIC ID; i numbers the enabled APs in that order. */
	for (i = 1, x = 0; x <= MAX_APIC_ID; x++) {
		if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp)
			continue;
		if (cpu_info[x].cpu_disabled)
			printf(" cpu (AP): APIC ID: %2d (disabled)\n", x);
		else {
			KASSERT(i < mp_ncpus,
			    ("mp_ncpus and actual cpus are out of whack"));
			printf(" cpu%d (AP): APIC ID: %2d\n", i++, x);
		}
	}
}

#define MTOPSIZE (1<<(14 + PAGE_SHIFT))

/*
 * AP CPU's call this to initialize themselves.
 *
 * This is the address cpu_initialize_context() installs as the new
 * vCPU's initial eip.  The AP fills in its per-CPU TSS fields, bumps
 * mp_naps so that start_ap() sees it come up, and then spins until
 * release_aps() sets aps_ready.  After that it flushes its TLB, sets up
 * the FPU, accounts for itself under ap_boot_mtx (the last AP to arrive
 * sets smp_started), waits for smp_started and enters the scheduler.
 * It is not expected to return.
 */
void
init_secondary(void)
{
	vm_offset_t addr;
	int	gsel_tss;


	/* bootAP is set in start_all_aps() to our ID. */
	PCPU_SET(currentldt, _default_ldt);

	/* NOTE(review): gsel_tss is assigned but not used in the live code. */
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
#if 0
	gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
#endif
	PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
#if 0
	PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd);

	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
#endif
	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);

	/*
	 * Set to a known state:
	 * Set by mpboot.s: CR0_PG, CR0_PE
	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
	 */
	/*
	 * signal our startup to the BSP.
	 */
	mp_naps++;

	/* Spin until the BSP releases the AP's. */
	while (!aps_ready)
		ia32_pause();

	/* BSP may have changed PTD while we were waiting */
	invltlb();
	for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE)
		invlpg(addr);

	/* set up FPU state on the AP */
	npxinit(__INITIAL_NPXCW__);
#if 0

	/* set up SSE registers */
	enable_sse();
#endif
#if 0 && defined(PAE)
	/* Enable the PTE no-execute bit. */
	if ((amd_feature & AMDID_NX) != 0) {
		uint64_t msr;

		msr = rdmsr(MSR_EFER) | EFER_NXE;
		wrmsr(MSR_EFER, msr);
	}
#endif
#if 0
	/* A quick check from sanity claus */
	if (PCPU_GET(apic_id) != lapic_id()) {
		printf("SMP: cpuid = %d\n", PCPU_GET(cpuid));
		printf("SMP: actual apic_id = %d\n", lapic_id());
		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
		panic("cpuid mismatch! boom!!");
	}
#endif

	/* Initialize curthread. */
	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
	PCPU_SET(curthread, PCPU_GET(idlethread));

	/* ap_boot_mtx serializes the shared counters and masks below. */
	mtx_lock_spin(&ap_boot_mtx);
#if 0

	/* Init local apic for irq's */
	lapic_setup(1);
#endif
	smp_cpus++;

	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid));
	printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid));

	/* Determine if we are a logical CPU. */
	if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0)
		logical_cpus_mask |= PCPU_GET(cpumask);

	/* Determine if we are a hyperthread. */
	if (hyperthreading_cpus > 1 &&
	    PCPU_GET(apic_id) % hyperthreading_cpus != 0)
		hyperthreading_cpus_mask |= PCPU_GET(cpumask);

	/* Build our map of 'other' CPUs. */
	PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask));
#if 0
	if (bootverbose)
		lapic_dump("AP");
#endif
	/* The last AP to get here flips smp_started for everyone. */
	if (smp_cpus == mp_ncpus) {
		/* enable IPI's, tlb shootdown, freezes etc */
		atomic_store_rel_int(&smp_started, 1);
		smp_active = 1;	 /* historic */
	}

	mtx_unlock_spin(&ap_boot_mtx);

	/* wait until all the AP's are up */
	while (smp_started == 0)
		ia32_pause();

	PCPU_SET(curthread, PCPU_GET(idlethread));
	/* enter the scheduler */
	sched_throw(NULL);

	panic("scheduler returned us to %s", __func__);
	/* NOTREACHED */
}

/*******************************************************************
 * local functions and data
 */

/*
 * We tell the I/O APIC code about all the CPUs we want to receive
 * interrupts.  If we don't want certain CPUs to receive IRQs we
 * can simply not tell the I/O APIC code about them in this function.
* We also do not tell it about the BSP since it tells itself about
 * the BSP internally to work with UP kernels and on UP machines.
 */
static void
set_interrupt_apic_ids(void)
{
	u_int i, apic_id;

	for (i = 0; i < MAXCPU; i++) {
		apic_id = cpu_apic_ids[i];
		/* Unused slots hold -1, which converts to the same unsigned value. */
		if (apic_id == -1)
			continue;
		if (cpu_info[apic_id].cpu_bsp)
			continue;
		if (cpu_info[apic_id].cpu_disabled)
			continue;

		/* Don't let hyperthreads service interrupts. */
		if (hyperthreading_cpus > 1 &&
		    apic_id % hyperthreading_cpus != 0)
			continue;

		intr_add_cpu(i);
	}
}

/*
 * Assign logical CPU IDs to local APICs.
 *
 * On return mp_ncpus is the number of usable CPUs (BSP included) and
 * cpu_apic_ids[1 .. mp_ncpus - 1] hold the APIC IDs of the usable APs in
 * ascending APIC-ID order.
 */
static void
assign_cpu_ids(void)
{
	u_int i;

	/* Check for explicitly disabled CPUs. */
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp)
			continue;

		/* Don't use this CPU if it has been disabled by a tunable. */
		if (resource_disabled("lapic", i)) {
			cpu_info[i].cpu_disabled = 1;
			continue;
		}
	}

	/*
	 * Assign CPU IDs to local APIC IDs and disable any CPUs
	 * beyond MAXCPU.  CPU 0 has already been assigned to the BSP,
	 * so we only have to assign IDs for APs.
	 */
	mp_ncpus = 1;
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp ||
		    cpu_info[i].cpu_disabled)
			continue;

		if (mp_ncpus < MAXCPU) {
			cpu_apic_ids[mp_ncpus] = i;
			mp_ncpus++;
		} else
			cpu_info[i].cpu_disabled = 1;
	}
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
	    mp_ncpus));
}

/*
 * start each AP in our list
 *
 * For every logical CPU from 1 to mp_ncpus - 1 this prepares the per-CPU
 * data and a private GDT page, then asks start_ap() to bring the CPU up.
 * Returns mp_naps, the number of APs that reported in.
 */
/* Lowest 1MB is already mapped: don't touch*/
#define TMPMAP_START 1
int
start_all_aps(void)
{
	u_int32_t mpbioswarmvec;
	int x,apic_id, cpu;
	struct pcpu *pc;

	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);

	/* save the current value of the warm-start vector */
	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);

	/* set up temporary P==V mapping for AP boot */
	/* XXX this is a hack, we should boot the AP on its own stack/PTD */

	/* start each AP */
	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		apic_id = cpu_apic_ids[cpu];

		/* setup a vector to our boot code */
		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
		*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);

		/* bootAP and bootAPgdt are read later by the AP bring-up code. */
		bootAP = cpu;
		bootAPgdt = gdt + (512*cpu);

		/* Get per-cpu data */
		pc = &__pcpu[bootAP];
		pcpu_init(pc, bootAP, sizeof(struct pcpu));
		pc->pc_apic_id = cpu_apic_ids[bootAP];
		pc->pc_prvspace = pc;
		pc->pc_curthread = 0;

		gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
		gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

		/*
		 * The GDT page is mapped writable while it is filled in and
		 * remapped without PG_RW afterwards.
		 */
		PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW);
		bzero(bootAPgdt, PAGE_SIZE);
		for (x = 0; x < NGDT; x++)
			ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd);
		PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V);
#ifdef notyet

		if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) {
			apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
			acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id);
#ifdef CONFIG_ACPI
			if (acpiid != 0xff)
				x86_acpiid_to_apicid[acpiid] = apicid;
#endif
		}
#endif

		/* attempt to start the Application Processor */
		if (!start_ap(cpu)) {
			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
/* better panic as the AP may be running loose */ printf("panic y/n? [y] "); if (cngetc() != 'n') panic("bye-bye"); } all_cpus |= (1 << cpu); /* record AP in CPU map */ } /* build our map of 'other' CPUs */ PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); /* restore the warmstart vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); /* number of APs actually started */ return mp_naps; } extern uint8_t *pcpu_boot_stack; extern trap_info_t trap_table[]; static void smp_trap_init(trap_info_t *trap_ctxt) { const trap_info_t *t = trap_table; for (t = trap_table; t->address; t++) { trap_ctxt[t->vector].flags = t->flags; trap_ctxt[t->vector].cs = t->cs; trap_ctxt[t->vector].address = t->address; } } void cpu_initialize_context(unsigned int cpu); extern int nkpt; void cpu_initialize_context(unsigned int cpu) { /* vcpu_guest_context_t is too large to allocate on the stack. * Hence we allocate statically and protect it with a lock */ vm_page_t m[4]; static vcpu_guest_context_t ctxt; vm_offset_t boot_stack; vm_offset_t newPTD; vm_paddr_t ma[NPGPTD]; static int color; int i; /* * Page 0,[0-3] PTD * Page 1, [4] boot stack * Page [5] PDPT * */ for (i = 0; i < NPGPTD + 2; i++) { m[i] = vm_page_alloc(NULL, color++, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); pmap_zero_page(m[i]); } boot_stack = kmem_alloc_nofault(kernel_map, 1); newPTD = kmem_alloc_nofault(kernel_map, NPGPTD); ma[0] = xpmap_ptom(VM_PAGE_TO_PHYS(m[0]))|PG_V; #ifdef PAE pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1])); for (i = 0; i < NPGPTD; i++) { ((vm_paddr_t *)boot_stack)[i] = ma[i] = xpmap_ptom(VM_PAGE_TO_PHYS(m[i]))|PG_V; } #endif /* * Copy cpu0 IdlePTD to new IdlePTD - copying only * kernel mappings */ pmap_qenter(newPTD, m, 4); memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t), (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t), nkpt*sizeof(vm_paddr_t)); pmap_qremove(newPTD, 4); kmem_free(kernel_map, newPTD, 4); 
/* * map actual idle stack to boot_stack */ pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD])); xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1]))); vm_page_lock_queues(); for (i = 0; i < 4; i++) { int pdir = (PTDPTDI + i) / NPDEPG; int curoffset = (PTDPTDI + i) % NPDEPG; xen_queue_pt_update((vm_paddr_t) ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))), ma[i]); } PT_UPDATES_FLUSH(); vm_page_unlock_queues(); memset(&ctxt, 0, sizeof(ctxt)); ctxt.flags = VGCF_IN_KERNEL; ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL); ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL); ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL); ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL); ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL); ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL); ctxt.user_regs.eip = (unsigned long)init_secondary; ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */ memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); smp_trap_init(ctxt.trap_ctxt); ctxt.ldt_ents = 0; ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT); ctxt.gdt_ents = 512; #ifdef __i386__ ctxt.user_regs.esp = boot_stack + PAGE_SIZE; ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); ctxt.kernel_sp = boot_stack + PAGE_SIZE; ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL); ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback; ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL); ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; ctxt.ctrlreg[3] = xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1])); #else /* __x86_64__ */ ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); ctxt.kernel_sp = idle->thread.rsp0; ctxt.event_callback_eip = (unsigned long)hypervisor_callback; ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; ctxt.syscall_callback_eip = (unsigned long)system_call; ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu)); #endif 
printf("gdtpfn=%lx pdptpfn=%lx\n", ctxt.gdt_frames[0], ctxt.ctrlreg[3] >> PAGE_SHIFT); PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt)); DELAY(3000); PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)); } /* * This function starts the AP (application processor) identified * by the APIC ID 'physicalCpu'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It isn't pretty, * but it seems to work. */ int cpus; static int start_ap(int apic_id) { int ms; /* used as a watchpoint to signal AP startup */ cpus = mp_naps; cpu_initialize_context(apic_id); /* Wait up to 5 seconds for it to start. */ for (ms = 0; ms < 5000; ms++) { if (mp_naps > cpus) return 1; /* return SUCCESS */ DELAY(1000); } return 0; /* return FAILURE */ } /* * Flush the TLB on all other CPU's */ static void smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) { u_int ncpu; ncpu = mp_ncpus - 1; /* does not shootdown self */ if (ncpu < 1) return; /* no other cpus */ if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; atomic_store_rel_int(&smp_tlb_wait, 0); ipi_all_but_self(vector); while (smp_tlb_wait < ncpu) ia32_pause(); mtx_unlock_spin(&smp_ipi_mtx); } static void smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) { int ncpu, othercpus; othercpus = mp_ncpus - 1; if (mask == (u_int)-1) { ncpu = othercpus; if (ncpu < 1) return; } else { mask &= ~PCPU_GET(cpumask); if (mask == 0) return; ncpu = bitcount32(mask); if (ncpu > othercpus) { /* XXX this should be a panic offence */ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", ncpu, othercpus); ncpu = othercpus; } /* XXX should be a panic, implied by mask == 0 above */ if (ncpu < 1) return; } if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); 
smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; atomic_store_rel_int(&smp_tlb_wait, 0); if (mask == (u_int)-1) ipi_all_but_self(vector); else ipi_selected(mask, vector); while (smp_tlb_wait < ncpu) ia32_pause(); mtx_unlock_spin(&smp_ipi_mtx); } void smp_cache_flush(void) { if (smp_started) smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); } void smp_invltlb(void) { if (smp_started) { smp_tlb_shootdown(IPI_INVLTLB, 0, 0); } } void smp_invlpg(vm_offset_t addr) { if (smp_started) { smp_tlb_shootdown(IPI_INVLPG, addr, 0); } } void smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); } } void smp_masked_invltlb(u_int mask) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); } } void smp_masked_invlpg(u_int mask, vm_offset_t addr) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); } } void smp_masked_invlpg_range(u_int mask, vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); } } -static __inline void -ipi_pcpu(unsigned int cpu, int vector) -{ -#ifdef notyet - int irq = per_cpu(ipi_to_irq, cpu)[vector]; - - notify_remote_via_irq(irq); -#endif -} - - void ipi_bitmap_handler(struct trapframe frame) { int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); if (ipi_bitmap & (1 << IPI_PREEMPT)) { sched_preempt(curthread); } } /* * send an IPI to a set of cpus. 
*/ void ipi_selected(u_int32_t cpus, u_int ipi) { int cpu; u_int bitmap = 0; u_int old_pending; u_int new_pending; if (IPI_IS_BITMAPED(ipi)) { bitmap = 1 << ipi; ipi = IPI_BITMAP_VECTOR; } #ifdef STOP_NMI if (ipi == IPI_STOP && stop_cpus_with_nmi) { ipi_nmi_selected(cpus); return; } #endif CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi); while ((cpu = ffs(cpus)) != 0) { cpu--; cpus &= ~(1 << cpu); KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); if (bitmap) { do { old_pending = cpu_ipi_pending[cpu]; new_pending = old_pending | bitmap; } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],old_pending, new_pending)); if (old_pending) continue; } ipi_pcpu(cpu, ipi); } } /* * send an IPI INTerrupt containing 'vector' to all CPUs, including myself */ void ipi_all(u_int ipi) { if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) { ipi_selected(all_cpus, ipi); return; } CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); /* * */ ipi_selected(-1, ipi); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) { ipi_selected(PCPU_GET(other_cpus), ipi); return; } CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); ipi_selected(((int)-1 & ~(1 << curcpu)), ipi); } /* * send an IPI to myself */ void ipi_self(u_int ipi) { if (IPI_IS_BITMAPED(ipi) || (ipi == IPI_STOP && stop_cpus_with_nmi)) { ipi_selected(PCPU_GET(cpumask), ipi); return; } CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); ipi_pcpu(curcpu, ipi); } #ifdef STOP_NMI /* * send NMI IPI to selected CPUs */ #define BEFORE_SPIN 1000000 void ipi_nmi_selected(u_int32_t cpus) { int cpu; register_t icrlo; icrlo = APIC_DELMODE_NMI | APIC_DESTMODE_PHY | APIC_LEVEL_ASSERT | APIC_TRIGMOD_EDGE; CTR2(KTR_SMP, "%s: cpus: %x nmi", __func__, cpus); atomic_set_int(&ipi_nmi_pending, cpus); while ((cpu = ffs(cpus)) != 0) { cpu--; cpus &= ~(1 << cpu); KASSERT(cpu_apic_ids[cpu] != -1, ("IPI NMI to non-existent CPU %d", 
cpu)); /* Wait for an earlier IPI to finish. */ if (!lapic_ipi_wait(BEFORE_SPIN)) panic("ipi_nmi_selected: previous IPI has not cleared"); lapic_ipi_raw(icrlo, cpu_apic_ids[cpu]); } } int ipi_nmi_handler(void) { int cpumask = PCPU_GET(cpumask); if (!(ipi_nmi_pending & cpumask)) return 1; atomic_clear_int(&ipi_nmi_pending, cpumask); cpustop_handler(); return 0; } #endif /* STOP_NMI */ /* * Handle an IPI_STOP by saving our current context and spinning until we * are resumed. */ void cpustop_handler(void) { int cpu = PCPU_GET(cpuid); int cpumask = PCPU_GET(cpumask); savectx(&stoppcbs[cpu]); /* Indicate that we are stopped */ atomic_set_int(&stopped_cpus, cpumask); /* Wait for restart */ while (!(started_cpus & cpumask)) ia32_pause(); atomic_clear_int(&started_cpus, cpumask); atomic_clear_int(&stopped_cpus, cpumask); if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ static void release_aps(void *dummy __unused) { if (mp_ncpus == 1) return; atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); diff --git a/sys/xen/evtchn/evtchn.c b/sys/xen/evtchn/evtchn.c index de3cf0c05628..53b41accb427 100644 --- a/sys/xen/evtchn/evtchn.c +++ b/sys/xen/evtchn/evtchn.c @@ -1,1107 +1,1115 @@ /****************************************************************************** * evtchn.c * * Communication via Xen event channels. 
* * Copyright (c) 2002-2005, K A Fraser * Copyright (c) 2005-2006 Kip Macy */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* linux helper functions that got sucked in * rename and move XXX */ static inline int find_first_bit(const unsigned long *addr, unsigned size) { int d0, d1; int res; /* This looks at memory. Mark it volatile to tell gcc not to move it around */ __asm__ __volatile__( "xorl %%eax,%%eax\n\t" "repe; scasl\n\t" "jz 1f\n\t" "leal -4(%%edi),%%edi\n\t" "bsfl (%%edi),%%eax\n" "1:\tsubl %%ebx,%%edi\n\t" "shll $3,%%edi\n\t" "addl %%edi,%%eax" :"=a" (res), "=&c" (d0), "=&D" (d1) :"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory"); return res; } #define min_t(type,x,y) \ ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) #define first_cpu(src) __first_cpu(&(src), NR_CPUS) static inline int __first_cpu(const xen_cpumask_t *srcp, int nbits) { return min_t(int, nbits, find_first_bit(srcp->bits, nbits)); } static inline unsigned long __ffs(unsigned long word) { __asm__("bsfl %1,%0" :"=r" (word) :"rm" (word)); return word; } static struct mtx irq_mapping_update_lock; static struct xenpic *xp; struct xenpic_intsrc { struct intsrc xp_intsrc; uint8_t xp_vector; boolean_t xp_masked; }; struct xenpic { struct pic *xp_dynirq_pic; struct pic *xp_pirq_pic; uint16_t xp_numintr; struct xenpic_intsrc xp_pins[0]; }; #define TODO printf("%s: not implemented!\n", __func__) /* IRQ <-> event-channel mappings. */ static int evtchn_to_irq[NR_EVENT_CHANNELS]; /* Packed IRQ information: binding type, sub-type index, and event channel. */ static uint32_t irq_info[NR_IRQS]; /* Binding types. */ enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_LOCAL_PORT, IRQT_CALLER_PORT }; /* Constructor for packed IRQ information. 
*/ #define mk_irq_info(type, index, evtchn) \ (((uint32_t)(type) << 24) | ((uint32_t)(index) << 16) | (uint32_t)(evtchn)) /* Convenient shorthand for packed representation of an unbound IRQ. */ #define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) /* Accessor macros for packed IRQ information. */ #define evtchn_from_irq(irq) ((uint16_t)(irq_info[irq])) #define index_from_irq(irq) ((uint8_t)(irq_info[irq] >> 16)) #define type_from_irq(irq) ((uint8_t)(irq_info[irq] >> 24)) -/* IRQ <-> VIRQ mapping. */ -DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]); - -/* IRQ <-> IPI mapping. */ -#ifndef NR_IPIS +/* IRQ <-> VIRQ mapping. */ +DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; + +/* IRQ <-> IPI mapping. */ +#ifndef NR_IPIS #define NR_IPIS 1 -#endif -DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]); +#endif +DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1}; /* Bitmap indicating which PIRQs require Xen to be notified on unmask. */ static unsigned long pirq_needs_unmask_notify[NR_PIRQS/sizeof(unsigned long)]; /* Reference counts for bindings to IRQs. */ static int irq_bindcount[NR_IRQS]; #define VALID_EVTCHN(_chn) ((_chn) != 0) #ifdef CONFIG_SMP static u8 cpu_evtchn[NR_EVENT_CHANNELS]; static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; #define active_evtchns(cpu,sh,idx) \ ((sh)->evtchn_pending[idx] & \ cpu_evtchn_mask[cpu][idx] & \ ~(sh)->evtchn_mask[idx]) static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) { clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]); set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]); cpu_evtchn[chn] = cpu; } static void init_evtchn_cpu_bindings(void) { /* By default all event channels notify CPU#0. 
*/ memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); } #define cpu_from_evtchn(evtchn) (cpu_evtchn[evtchn]) #else #define active_evtchns(cpu,sh,idx) \ ((sh)->evtchn_pending[idx] & \ ~(sh)->evtchn_mask[idx]) #define bind_evtchn_to_cpu(chn,cpu) ((void)0) #define init_evtchn_cpu_bindings() ((void)0) #define cpu_from_evtchn(evtchn) (0) #endif /* * Force a proper event-channel callback from Xen after clearing the * callback mask. We do this in a very simple manner, by making a call * down into Xen. The pending flag will be checked by Xen on return. */ void force_evtchn_callback(void) { (void)HYPERVISOR_xen_version(0, NULL); } void evtchn_do_upcall(struct trapframe *frame) { unsigned long l1, l2; unsigned int l1i, l2i, port; int irq, cpu; shared_info_t *s; vcpu_info_t *vcpu_info; cpu = smp_processor_id(); s = HYPERVISOR_shared_info; vcpu_info = &s->vcpu_info[cpu]; vcpu_info->evtchn_upcall_pending = 0; /* NB. No need for a barrier here -- XCHG is a barrier on x86. 
*/ l1 = xen_xchg(&vcpu_info->evtchn_pending_sel, 0); while (l1 != 0) { l1i = __ffs(l1); l1 &= ~(1 << l1i); while ((l2 = active_evtchns(cpu, s, l1i)) != 0) { l2i = __ffs(l2); port = (l1i * BITS_PER_LONG) + l2i; if ((irq = evtchn_to_irq[port]) != -1) { struct intsrc *isrc = intr_lookup_source(irq); /* * ack */ mask_evtchn(port); clear_evtchn(port); intr_execute_handlers(isrc, frame); } else { evtchn_device_upcall(port); } } } } +void +ipi_pcpu(unsigned int cpu, int vector) +{ + int irq = per_cpu(ipi_to_irq, cpu)[vector]; + + notify_remote_via_irq(irq); +} + static int find_unbound_irq(void) { int dynirq, irq; for (dynirq = 0; dynirq < NR_IRQS; dynirq++) { irq = dynirq_to_irq(dynirq); if (irq_bindcount[irq] == 0) break; } if (irq == NR_IRQS) panic("No available IRQ to bind to: increase NR_IRQS!\n"); return (irq); } static int bind_caller_port_to_irq(unsigned int caller_port) { int irq; mtx_lock_spin(&irq_mapping_update_lock); if ((irq = evtchn_to_irq[caller_port]) == -1) { if ((irq = find_unbound_irq()) < 0) goto out; evtchn_to_irq[caller_port] = irq; irq_info[irq] = mk_irq_info(IRQT_CALLER_PORT, 0, caller_port); } irq_bindcount[irq]++; out: mtx_unlock_spin(&irq_mapping_update_lock); return irq; } static int bind_local_port_to_irq(unsigned int local_port) { int irq; mtx_lock_spin(&irq_mapping_update_lock); PANIC_IF(evtchn_to_irq[local_port] != -1); if ((irq = find_unbound_irq()) < 0) { struct evtchn_close close = { .port = local_port }; PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)); goto out; } evtchn_to_irq[local_port] = irq; irq_info[irq] = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port); irq_bindcount[irq]++; out: mtx_unlock_spin(&irq_mapping_update_lock); return irq; } static int bind_listening_port_to_irq(unsigned int remote_domain) { struct evtchn_alloc_unbound alloc_unbound; int err; alloc_unbound.dom = DOMID_SELF; alloc_unbound.remote_dom = remote_domain; err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &alloc_unbound); return err ? 
: bind_local_port_to_irq(alloc_unbound.port); } static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, unsigned int remote_port) { struct evtchn_bind_interdomain bind_interdomain; int err; bind_interdomain.remote_dom = remote_domain; bind_interdomain.remote_port = remote_port; err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &bind_interdomain); return err ? : bind_local_port_to_irq(bind_interdomain.local_port); } static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { struct evtchn_bind_virq bind_virq; int evtchn, irq; mtx_lock_spin(&irq_mapping_update_lock); if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) { bind_virq.virq = virq; bind_virq.vcpu = cpu; PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq) != 0); evtchn = bind_virq.port; irq = find_unbound_irq(); evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); per_cpu(virq_to_irq, cpu)[virq] = irq; bind_evtchn_to_cpu(evtchn, cpu); } irq_bindcount[irq]++; mtx_unlock_spin(&irq_mapping_update_lock); return irq; } static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; int evtchn, irq; mtx_lock_spin(&irq_mapping_update_lock); if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) { if ((irq = find_unbound_irq()) < 0) goto out; bind_ipi.vcpu = cpu; PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi) != 0); evtchn = bind_ipi.port; irq = find_unbound_irq(); evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); per_cpu(ipi_to_irq, cpu)[ipi] = irq; bind_evtchn_to_cpu(evtchn, cpu); } irq_bindcount[irq]++; out: mtx_unlock_spin(&irq_mapping_update_lock); return irq; } void unbind_from_irq(int irq) { struct evtchn_close close; int evtchn = evtchn_from_irq(irq); mtx_lock_spin(&irq_mapping_update_lock); if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) { close.port = evtchn; PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0); switch 
(type_from_irq(irq)) { case IRQT_VIRQ: per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))[index_from_irq(irq)] = -1; break; case IRQT_IPI: per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))[index_from_irq(irq)] = -1; break; default: break; } /* Closed ports are implicitly re-bound to VCPU0. */ bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = -1; irq_info[irq] = IRQ_UNBOUND; } mtx_unlock_spin(&irq_mapping_update_lock); } int bind_caller_port_to_irqhandler(unsigned int caller_port, const char *devname, driver_intr_t handler, void *arg, unsigned long irqflags, void **cookiep) { unsigned int irq; int retval; irq = bind_caller_port_to_irq(caller_port); intr_register_source(&xp->xp_pins[irq].xp_intsrc); retval = intr_add_handler(devname, irq, NULL, handler, arg, irqflags, cookiep); if (retval != 0) { unbind_from_irq(irq); return -retval; } return irq; } int bind_listening_port_to_irqhandler( unsigned int remote_domain, const char *devname, driver_intr_t handler, void *arg, unsigned long irqflags, void **cookiep) { unsigned int irq; int retval; irq = bind_listening_port_to_irq(remote_domain); intr_register_source(&xp->xp_pins[irq].xp_intsrc); retval = intr_add_handler(devname, irq, NULL, handler, arg, irqflags, cookiep); if (retval != 0) { unbind_from_irq(irq); return -retval; } return irq; } int bind_interdomain_evtchn_to_irqhandler( unsigned int remote_domain, unsigned int remote_port, const char *devname, driver_filter_t filter, driver_intr_t handler, unsigned long irqflags) { unsigned int irq; int retval; irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); intr_register_source(&xp->xp_pins[irq].xp_intsrc); retval = intr_add_handler(devname, irq, filter, handler, NULL, irqflags, NULL); if (retval != 0) { unbind_from_irq(irq); return -retval; } return irq; } int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, const char *devname, driver_filter_t filter, driver_intr_t handler, unsigned long irqflags) { unsigned int irq; int retval; irq = 
bind_virq_to_irq(virq, cpu); intr_register_source(&xp->xp_pins[irq].xp_intsrc); retval = intr_add_handler(devname, irq, filter, handler, NULL, irqflags, NULL); if (retval != 0) { unbind_from_irq(irq); return -retval; } return irq; } int bind_ipi_to_irqhandler(unsigned int ipi, unsigned int cpu, const char *devname, driver_intr_t handler, unsigned long irqflags) { unsigned int irq; int retval; irq = bind_ipi_to_irq(ipi, cpu); intr_register_source(&xp->xp_pins[irq].xp_intsrc); retval = intr_add_handler(devname, irq, NULL, handler, NULL, irqflags, NULL); if (retval != 0) { unbind_from_irq(irq); return -retval; } return irq; } void unbind_from_irqhandler(unsigned int irq, void *dev_id) { if (dev_id) intr_remove_handler(dev_id); /* XXX */ unbind_from_irq(irq); } #if 0 /* Rebind an evtchn so that it gets delivered to a specific cpu */ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) { evtchn_op_t op = { .cmd = EVTCHNOP_bind_vcpu }; int evtchn; mtx_lock_spin(&irq_mapping_update_lock); evtchn = evtchn_from_irq(irq); if (!VALID_EVTCHN(evtchn)) { mtx_unlock_spin(&irq_mapping_update_lock); return; } /* Send future instances of this interrupt to other vcpu. */ bind_vcpu.port = evtchn; bind_vcpu.vcpu = tcpu; /* * If this fails, it usually just indicates that we're dealing with a * virq or IPI channel, which don't actually need to be rebound. Ignore * it, but don't do the xenlinux-level rebind in that case. 
*/ if (HYPERVISOR_event_channel_op(&op) >= 0) bind_evtchn_to_cpu(evtchn, tcpu); mtx_unlock_spin(&irq_mapping_update_lock); } static void set_affinity_irq(unsigned irq, xen_cpumask_t dest) { unsigned tcpu = first_cpu(dest); rebind_irq_to_cpu(irq, tcpu); } #endif /* * Interface to generic handling in intr_machdep.c */ /*------------ interrupt handling --------------------------------------*/ #define TODO printf("%s: not implemented!\n", __func__) static void xenpic_dynirq_enable_source(struct intsrc *isrc); static void xenpic_dynirq_disable_source(struct intsrc *isrc, int); static void xenpic_dynirq_eoi_source(struct intsrc *isrc); static void xenpic_dynirq_enable_intr(struct intsrc *isrc); static void xenpic_dynirq_disable_intr(struct intsrc *isrc); static void xenpic_pirq_enable_source(struct intsrc *isrc); static void xenpic_pirq_disable_source(struct intsrc *isrc, int); static void xenpic_pirq_eoi_source(struct intsrc *isrc); static void xenpic_pirq_enable_intr(struct intsrc *isrc); static void xenpic_pirq_disable_intr(struct intsrc *isrc); static int xenpic_vector(struct intsrc *isrc); static int xenpic_source_pending(struct intsrc *isrc); static void xenpic_suspend(struct pic* pic); static void xenpic_resume(struct pic* pic); static void xenpic_assign_cpu(struct intsrc *, u_int apic_id); struct pic xenpic_dynirq_template = { .pic_enable_source = xenpic_dynirq_enable_source, .pic_disable_source = xenpic_dynirq_disable_source, .pic_eoi_source = xenpic_dynirq_eoi_source, .pic_enable_intr = xenpic_dynirq_enable_intr, .pic_disable_intr = xenpic_dynirq_disable_intr, .pic_vector = xenpic_vector, .pic_source_pending = xenpic_source_pending, .pic_suspend = xenpic_suspend, .pic_resume = xenpic_resume }; struct pic xenpic_pirq_template = { .pic_enable_source = xenpic_pirq_enable_source, .pic_disable_source = xenpic_pirq_disable_source, .pic_eoi_source = xenpic_pirq_eoi_source, .pic_enable_intr = xenpic_pirq_enable_intr, .pic_disable_intr = xenpic_pirq_disable_intr, 
.pic_vector = xenpic_vector, .pic_source_pending = xenpic_source_pending, .pic_suspend = xenpic_suspend, .pic_resume = xenpic_resume, .pic_assign_cpu = xenpic_assign_cpu }; void xenpic_dynirq_enable_source(struct intsrc *isrc) { unsigned int irq; struct xenpic_intsrc *xp; xp = (struct xenpic_intsrc *)isrc; mtx_lock_spin(&irq_mapping_update_lock); if (xp->xp_masked) { irq = xenpic_vector(isrc); unmask_evtchn(evtchn_from_irq(irq)); xp->xp_masked = FALSE; } mtx_unlock_spin(&irq_mapping_update_lock); } static void xenpic_dynirq_disable_source(struct intsrc *isrc, int foo) { unsigned int irq; struct xenpic_intsrc *xp; xp = (struct xenpic_intsrc *)isrc; mtx_lock_spin(&irq_mapping_update_lock); if (!xp->xp_masked) { irq = xenpic_vector(isrc); mask_evtchn(evtchn_from_irq(irq)); xp->xp_masked = TRUE; } mtx_unlock_spin(&irq_mapping_update_lock); } static void xenpic_dynirq_enable_intr(struct intsrc *isrc) { unsigned int irq; struct xenpic_intsrc *xp; xp = (struct xenpic_intsrc *)isrc; mtx_lock_spin(&irq_mapping_update_lock); xp->xp_masked = 0; irq = xenpic_vector(isrc); unmask_evtchn(evtchn_from_irq(irq)); mtx_unlock_spin(&irq_mapping_update_lock); } static void xenpic_dynirq_disable_intr(struct intsrc *isrc) { unsigned int irq; struct xenpic_intsrc *xp; xp = (struct xenpic_intsrc *)isrc; mtx_lock_spin(&irq_mapping_update_lock); xp->xp_masked = 1; irq = xenpic_vector(isrc); mask_evtchn(evtchn_from_irq(irq)); mtx_unlock_spin(&irq_mapping_update_lock); } static void xenpic_dynirq_eoi_source(struct intsrc *isrc) { unsigned int irq; struct xenpic_intsrc *xp; xp = (struct xenpic_intsrc *)isrc; mtx_lock_spin(&irq_mapping_update_lock); xp->xp_masked = 0; irq = xenpic_vector(isrc); unmask_evtchn(evtchn_from_irq(irq)); mtx_unlock_spin(&irq_mapping_update_lock); } static int xenpic_vector(struct intsrc *isrc) { struct xenpic_intsrc *pin; pin = (struct xenpic_intsrc *)isrc; //printf("xenpic_vector(): isrc=%p,vector=%u\n", pin, pin->xp_vector); return (pin->xp_vector); } static int 
xenpic_source_pending(struct intsrc *isrc) { struct xenpic_intsrc *pin = (struct xenpic_intsrc *)isrc; /* XXXEN: TODO */ printf("xenpic_source_pending(): vector=%x,masked=%x\n", pin->xp_vector, pin->xp_masked); /* notify_remote_via_evtchn(pin->xp_vector); // XXX RS: Is this correct? */ return 0; } static void xenpic_suspend(struct pic* pic) { TODO; } static void xenpic_resume(struct pic* pic) { TODO; } static void xenpic_assign_cpu(struct intsrc *isrc, u_int apic_id) { TODO; } void notify_remote_via_irq(int irq) { int evtchn = evtchn_from_irq(irq); if (VALID_EVTCHN(evtchn)) notify_remote_via_evtchn(evtchn); } /* required for support of physical devices */ static inline void pirq_unmask_notify(int pirq) { struct physdev_eoi eoi = { .irq = pirq }; if (unlikely(test_bit(pirq, &pirq_needs_unmask_notify[0]))) { (void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); } } static inline void pirq_query_unmask(int pirq) { struct physdev_irq_status_query irq_status_query; irq_status_query.irq = pirq; (void)HYPERVISOR_physdev_op(PHYSDEVOP_IRQ_STATUS_QUERY, &irq_status_query); clear_bit(pirq, &pirq_needs_unmask_notify[0]); if ( irq_status_query.flags & PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY ) set_bit(pirq, &pirq_needs_unmask_notify[0]); } /* * On startup, if there is no action associated with the IRQ then we are * probing. In this case we should not share with others as it will confuse us. */ #define probing_irq(_irq) (intr_lookup_source(irq) == NULL) static void xenpic_pirq_enable_intr(struct intsrc *isrc) { struct evtchn_bind_pirq bind_pirq; int evtchn; unsigned int irq; mtx_lock_spin(&irq_mapping_update_lock); irq = xenpic_vector(isrc); evtchn = evtchn_from_irq(irq); if (VALID_EVTCHN(evtchn)) goto out; bind_pirq.pirq = irq; /* NB. We are happy to share unless we are probing. */ bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE; if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) { if (!probing_irq(irq)) /* Some failures are expected when probing. 
*/ printf("Failed to obtain physical IRQ %d\n", irq); mtx_unlock_spin(&irq_mapping_update_lock); return; } evtchn = bind_pirq.port; pirq_query_unmask(irq_to_pirq(irq)); bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, evtchn); out: unmask_evtchn(evtchn); pirq_unmask_notify(irq_to_pirq(irq)); mtx_unlock_spin(&irq_mapping_update_lock); } static void xenpic_pirq_disable_intr(struct intsrc *isrc) { unsigned int irq; int evtchn; struct evtchn_close close; mtx_lock_spin(&irq_mapping_update_lock); irq = xenpic_vector(isrc); evtchn = evtchn_from_irq(irq); if (!VALID_EVTCHN(evtchn)) goto done; mask_evtchn(evtchn); close.port = evtchn; PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0); bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = -1; irq_info[irq] = IRQ_UNBOUND; done: mtx_unlock_spin(&irq_mapping_update_lock); } static void xenpic_pirq_enable_source(struct intsrc *isrc) { int evtchn; unsigned int irq; mtx_lock_spin(&irq_mapping_update_lock); irq = xenpic_vector(isrc); evtchn = evtchn_from_irq(irq); if (!VALID_EVTCHN(evtchn)) goto done; unmask_evtchn(evtchn); pirq_unmask_notify(irq_to_pirq(irq)); done: mtx_unlock_spin(&irq_mapping_update_lock); } static void xenpic_pirq_disable_source(struct intsrc *isrc, int eoi) { int evtchn; unsigned int irq; mtx_lock_spin(&irq_mapping_update_lock); irq = xenpic_vector(isrc); evtchn = evtchn_from_irq(irq); if (!VALID_EVTCHN(evtchn)) goto done; mask_evtchn(evtchn); done: mtx_unlock_spin(&irq_mapping_update_lock); } static void xenpic_pirq_eoi_source(struct intsrc *isrc) { int evtchn; unsigned int irq; mtx_lock_spin(&irq_mapping_update_lock); irq = xenpic_vector(isrc); evtchn = evtchn_from_irq(irq); if (!VALID_EVTCHN(evtchn)) goto done; unmask_evtchn(evtchn); pirq_unmask_notify(irq_to_pirq(irq)); done: mtx_unlock_spin(&irq_mapping_update_lock); } int irq_to_evtchn_port(int irq) { return evtchn_from_irq(irq); } void mask_evtchn(int port) { shared_info_t *s = 
HYPERVISOR_shared_info; synch_set_bit(port, &s->evtchn_mask[0]); } void unmask_evtchn(int port) { shared_info_t *s = HYPERVISOR_shared_info; unsigned int cpu = smp_processor_id(); vcpu_info_t *vcpu_info = &s->vcpu_info[cpu]; /* Slow path (hypercall) if this is a non-local port. */ if (unlikely(cpu != cpu_from_evtchn(port))) { struct evtchn_unmask unmask = { .port = port }; (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); return; } synch_clear_bit(port, &s->evtchn_mask); /* * The following is basically the equivalent of 'hw_resend_irq'. Just * like a real IO-APIC we 'lose the interrupt edge' if the channel is * masked. */ if (synch_test_bit(port, &s->evtchn_pending) && !synch_test_and_set_bit(port / BITS_PER_LONG, &vcpu_info->evtchn_pending_sel)) { vcpu_info->evtchn_upcall_pending = 1; if (!vcpu_info->evtchn_upcall_mask) force_evtchn_callback(); } } void irq_resume(void) { evtchn_op_t op; int cpu, pirq, virq, ipi, irq, evtchn; struct evtchn_bind_virq bind_virq; struct evtchn_bind_ipi bind_ipi; init_evtchn_cpu_bindings(); /* New event-channel space is not 'live' yet. */ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) mask_evtchn(evtchn); /* Check that no PIRQs are still bound. */ for (pirq = 0; pirq < NR_PIRQS; pirq++) PANIC_IF(irq_info[pirq_to_irq(pirq)] != IRQ_UNBOUND); /* Secondary CPUs must have no VIRQ or IPI bindings. */ for (cpu = 1; cpu < NR_CPUS; cpu++) { for (virq = 0; virq < NR_VIRQS; virq++) PANIC_IF(per_cpu(virq_to_irq, cpu)[virq] != -1); for (ipi = 0; ipi < NR_IPIS; ipi++) PANIC_IF(per_cpu(ipi_to_irq, cpu)[ipi] != -1); } /* No IRQ <-> event-channel mappings. */ for (irq = 0; irq < NR_IRQS; irq++) irq_info[irq] &= ~0xFFFF; /* zap event-channel binding */ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) evtchn_to_irq[evtchn] = -1; /* Primary CPU: rebind VIRQs automatically. 
*/ for (virq = 0; virq < NR_VIRQS; virq++) { if ((irq = per_cpu(virq_to_irq, 0)[virq]) == -1) continue; PANIC_IF(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0)); /* Get a new binding from Xen. */ bind_virq.virq = virq; bind_virq.vcpu = 0; PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq) != 0); evtchn = bind_virq.port; /* Record the new mapping. */ evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); /* Ready for use. */ unmask_evtchn(evtchn); } /* Primary CPU: rebind IPIs automatically. */ for (ipi = 0; ipi < NR_IPIS; ipi++) { if ((irq = per_cpu(ipi_to_irq, 0)[ipi]) == -1) continue; PANIC_IF(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0)); /* Get a new binding from Xen. */ memset(&op, 0, sizeof(op)); bind_ipi.vcpu = 0; PANIC_IF(HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi) != 0); evtchn = bind_ipi.port; /* Record the new mapping. */ evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); /* Ready for use. */ unmask_evtchn(evtchn); } } static void evtchn_init(void *dummy __unused) { int i, cpu; struct xenpic_intsrc *pin, *tpin; /* No VIRQ or IPI bindings. */ for (cpu = 0; cpu < NR_CPUS; cpu++) { for (i = 0; i < NR_VIRQS; i++) per_cpu(virq_to_irq, cpu)[i] = -1; for (i = 0; i < NR_IPIS; i++) per_cpu(ipi_to_irq, cpu)[i] = -1; } /* No event-channel -> IRQ mappings. */ for (i = 0; i < NR_EVENT_CHANNELS; i++) { evtchn_to_irq[i] = -1; mask_evtchn(i); /* No event channels are 'live' right now. */ } /* No IRQ -> event-channel mappings. 
*/ for (i = 0; i < NR_IRQS; i++) irq_info[i] = IRQ_UNBOUND; xp = malloc(sizeof(struct xenpic) + NR_IRQS*sizeof(struct xenpic_intsrc), M_DEVBUF, M_WAITOK); xp->xp_dynirq_pic = &xenpic_dynirq_template; xp->xp_pirq_pic = &xenpic_pirq_template; xp->xp_numintr = NR_IRQS; bzero(xp->xp_pins, sizeof(struct xenpic_intsrc) * NR_IRQS); /* We need to register our PIC's beforehand */ if (intr_register_pic(&xenpic_pirq_template)) panic("XEN: intr_register_pic() failure"); if (intr_register_pic(&xenpic_dynirq_template)) panic("XEN: intr_register_pic() failure"); /* * Initialize the dynamic IRQ's - we initialize the structures, but * we do not bind them (bind_evtchn_to_irqhandle() does this) */ pin = xp->xp_pins; for (i = 0; i < NR_DYNIRQS; i++) { /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ irq_bindcount[dynirq_to_irq(i)] = 0; tpin = &pin[dynirq_to_irq(i)]; tpin->xp_intsrc.is_pic = xp->xp_dynirq_pic; tpin->xp_vector = dynirq_to_irq(i); } /* * Now, we go ahead and claim every PIRQ there is. */ pin = xp->xp_pins; for (i = 0; i < NR_PIRQS; i++) { /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ irq_bindcount[pirq_to_irq(i)] = 0; #ifdef RTC_IRQ /* If not domain 0, force our RTC driver to fail its probe. */ if ((i == RTC_IRQ) && !(xen_start_info->flags & SIF_INITDOMAIN)) continue; #endif tpin = &pin[pirq_to_irq(i)]; tpin->xp_intsrc.is_pic = xp->xp_pirq_pic; tpin->xp_vector = pirq_to_irq(i); } } SYSINIT(evtchn_init, SI_SUB_INTR, SI_ORDER_ANY, evtchn_init, NULL); /* * irq_mapping_update_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ MTX_SYSINIT(irq_mapping_update_lock, &irq_mapping_update_lock, "xp", MTX_SPIN);