diff --git a/usr.sbin/bhyve/amd64/bhyverun_machdep.c b/usr.sbin/bhyve/amd64/bhyverun_machdep.c index 40325a7f52c9..4f482586f900 100644 --- a/usr.sbin/bhyve/amd64/bhyverun_machdep.c +++ b/usr.sbin/bhyve/amd64/bhyverun_machdep.c @@ -1,187 +1,187 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include "acpi.h" #include "atkbdc.h" #include "bhyverun.h" #include "config.h" +#include "debug.h" #include "e820.h" #include "fwctl.h" #include "ioapic.h" #include "inout.h" #include "kernemu_dev.h" #include "mptbl.h" #include "pci_irq.h" #include "pci_lpc.h" #include "rtc.h" #include "smbiostbl.h" #include "xmsr.h" void bhyve_init_config(void) { init_config(); /* Set default values prior to option parsing. 
*/ set_config_bool("acpi_tables", true); set_config_bool("acpi_tables_in_memory", true); set_config_value("memory.size", "256M"); set_config_bool("x86.strictmsr", true); set_config_value("lpc.fwcfg", "bhyve"); } void bhyve_init_vcpu(struct vcpu *vcpu) { int err, tmp; if (get_config_bool_default("x86.vmexit_on_hlt", false)) { err = vm_get_capability(vcpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { - fprintf(stderr, "VM exit on HLT not supported\n"); + EPRINTLN("VM exit on HLT not supported"); exit(4); } vm_set_capability(vcpu, VM_CAP_HALT_EXIT, 1); } if (get_config_bool_default("x86.vmexit_on_pause", false)) { /* * pause exit support required for this mode */ err = vm_get_capability(vcpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { - fprintf(stderr, - "SMP mux requested, no pause support\n"); + EPRINTLN("SMP mux requested, no pause support"); exit(4); } vm_set_capability(vcpu, VM_CAP_PAUSE_EXIT, 1); } if (get_config_bool_default("x86.x2apic", false)) err = vm_set_x2apic_state(vcpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(vcpu, X2APIC_DISABLED); if (err) { - fprintf(stderr, "Unable to set x2apic state (%d)\n", err); + EPRINTLN("Unable to set x2apic state (%d)", err); exit(4); } vm_set_capability(vcpu, VM_CAP_ENABLE_INVPCID, 1); err = vm_set_capability(vcpu, VM_CAP_IPI_EXIT, 1); assert(err == 0); } void bhyve_start_vcpu(struct vcpu *vcpu, bool bsp) { int error; if (bsp) { if (lpc_bootrom()) { error = vm_set_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); if (error != 0) { err(4, "ROM boot failed: unrestricted guest " "capability not available"); } error = vcpu_reset(vcpu); assert(error == 0); } } else { bhyve_init_vcpu(vcpu); /* * Enable the 'unrestricted guest' mode for APs. * * APs startup in power-on 16-bit mode. */ error = vm_set_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); } fbsdrun_addcpu(vcpu_id(vcpu)); } int bhyve_init_platform(struct vmctx *ctx, struct vcpu *bsp __unused) { int error; error = init_msr(); if (error != 0) return (error); init_inout(); kernemu_dev_init(); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx); sci_init(ctx); error = e820_init(ctx); if (error != 0) return (error); return (0); } int bhyve_init_platform_late(struct vmctx *ctx, struct vcpu *bsp __unused) { int error; if (get_config_bool_default("x86.mptable", true)) { error = mptable_build(ctx, guest_ncpus); if (error != 0) return (error); } error = smbios_build(ctx); if (error != 0) return (error); error = e820_finalize(); if (error != 0) return (error); if (lpc_bootrom() && strcmp(lpc_fwcfg(), "bhyve") == 0) fwctl_init(); if (get_config_bool("acpi_tables")) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } return (0); } diff --git a/usr.sbin/bhyve/amd64/e820.c b/usr.sbin/bhyve/amd64/e820.c index 545878aad39f..456ce0330b50 100644 --- a/usr.sbin/bhyve/amd64/e820.c +++ b/usr.sbin/bhyve/amd64/e820.c @@ -1,490 +1,491 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG * Author: Corvin Köhne */ #include #include #include #include #include #include #include #include #include +#include "debug.h" #include "e820.h" #include "qemu_fwcfg.h" /* * E820 always uses 64 bit entries. Emulation code will use vm_paddr_t since it * works on physical addresses. If vm_paddr_t is larger than uint64_t E820 can't * hold all possible physical addresses and we can get into trouble. 
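The hunks above show the pattern repeated throughout this diff: fprintf(stderr, "...\n", ...) becomes EPRINTLN("...", ...), with the trailing newline dropped from the format string because the macro supplies it. A minimal sketch of the kind of helpers debug.h is assumed to provide (not copied verbatim from the tree); the key point is that the line terminator becomes CRLF when the stdio console has been put into raw mode (the raw_stdio flag defined in bhyverun.c):

/*
 * Hypothetical sketch of the debug.h print helpers this diff relies on.
 * When raw_stdio is set, a bare "\n" is not enough and the macro emits
 * "\r\n" instead; call sites therefore omit the newline entirely.
 */
#include <stdio.h>

extern int raw_stdio;

#define	FPRINTLN(filep, fmt, arg...)				\
	do {							\
		if (raw_stdio)					\
			fprintf((filep), fmt "\r\n", ##arg);	\
		else						\
			fprintf((filep), fmt "\n", ##arg);	\
	} while (0)

#define	PRINTLN(fmt, arg...)	FPRINTLN(stdout, fmt, ##arg)
#define	EPRINTLN(fmt, arg...)	FPRINTLN(stderr, fmt, ##arg)

This is also why the snapshot-restore hunks in bhyverun.c later in the diff can drop their explicit "\r\n": once fprintf(stdout, "...\r\n") becomes FPRINTLN(stdout, "..."), the macro picks the appropriate terminator.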
*/ static_assert(sizeof(vm_paddr_t) <= sizeof(uint64_t), "Unable to represent physical memory by E820 table"); #define E820_FWCFG_FILE_NAME "etc/e820" #define KB (1024UL) #define MB (1024 * KB) #define GB (1024 * MB) /* * Fix E820 memory holes: * [ A0000, C0000) VGA * [ C0000, 100000) ROM */ #define E820_VGA_MEM_BASE 0xA0000 #define E820_VGA_MEM_END 0xC0000 #define E820_ROM_MEM_BASE 0xC0000 #define E820_ROM_MEM_END 0x100000 struct e820_element { TAILQ_ENTRY(e820_element) chain; uint64_t base; uint64_t end; enum e820_memory_type type; }; static TAILQ_HEAD(e820_table, e820_element) e820_table = TAILQ_HEAD_INITIALIZER( e820_table); static struct e820_element * e820_element_alloc(uint64_t base, uint64_t end, enum e820_memory_type type) { struct e820_element *element; element = calloc(1, sizeof(*element)); if (element == NULL) { return (NULL); } element->base = base; element->end = end; element->type = type; return (element); } static const char * e820_get_type_name(const enum e820_memory_type type) { switch (type) { case E820_TYPE_MEMORY: return ("RAM"); case E820_TYPE_RESERVED: return ("Reserved"); case E820_TYPE_ACPI: return ("ACPI"); case E820_TYPE_NVS: return ("NVS"); default: return ("Unknown"); } } void e820_dump_table(void) { struct e820_element *element; uint64_t i; - fprintf(stderr, "E820 map:\n"); + EPRINTLN("E820 map:"); i = 0; TAILQ_FOREACH(element, &e820_table, chain) { - fprintf(stderr, " (%4lu) [%16lx, %16lx] %s\n", i, + EPRINTLN(" (%4lu) [%16lx, %16lx] %s", i, element->base, element->end, e820_get_type_name(element->type)); ++i; } } static struct qemu_fwcfg_item * e820_get_fwcfg_item(void) { struct qemu_fwcfg_item *fwcfg_item; struct e820_element *element; struct e820_entry *entries; int count, i; count = 0; TAILQ_FOREACH(element, &e820_table, chain) { ++count; } if (count == 0) { warnx("%s: E820 table empty", __func__); return (NULL); } fwcfg_item = calloc(1, sizeof(struct qemu_fwcfg_item)); if (fwcfg_item == NULL) { return (NULL); } fwcfg_item->size = count * sizeof(struct e820_entry); fwcfg_item->data = calloc(count, sizeof(struct e820_entry)); if (fwcfg_item->data == NULL) { free(fwcfg_item); return (NULL); } i = 0; entries = (struct e820_entry *)fwcfg_item->data; TAILQ_FOREACH(element, &e820_table, chain) { struct e820_entry *entry = &entries[i]; entry->base = element->base; entry->length = element->end - element->base; entry->type = element->type; ++i; } return (fwcfg_item); } static int e820_add_entry(const uint64_t base, const uint64_t end, const enum e820_memory_type type) { struct e820_element *new_element; struct e820_element *element; struct e820_element *ram_element; assert(end >= base); new_element = e820_element_alloc(base, end, type); if (new_element == NULL) { return (ENOMEM); } /* * E820 table should always be sorted in ascending order. Therefore, * search for a range whose end is larger than the base parameter. */ TAILQ_FOREACH(element, &e820_table, chain) { if (element->end > base) { break; } } /* * System memory requires special handling. */ if (type == E820_TYPE_MEMORY) { /* * base is larger than of any existing element. Add new system * memory at the end of the table. */ if (element == NULL) { TAILQ_INSERT_TAIL(&e820_table, new_element, chain); return (0); } /* * System memory shouldn't overlap with any existing element. */ assert(end >= element->base); TAILQ_INSERT_BEFORE(element, new_element, chain); return (0); } /* * If some one tries to allocate a specific address, it could happen, that * this address is not allocatable. Therefore, do some checks. 
If the * address is not allocatable, don't panic. The user may have a fallback and * tries to allocate another address. This is true for the GVT-d emulation * which tries to reuse the host address of the graphics stolen memory and * falls back to allocating the highest address below 4 GB. */ if (element == NULL || element->type != E820_TYPE_MEMORY || (base < element->base || end > element->end)) return (ENOMEM); if (base == element->base) { /* * New element at system memory base boundary. Add new * element before current and adjust the base of the old * element. * * Old table: * [ 0x1000, 0x4000] RAM <-- element * New table: * [ 0x1000, 0x2000] Reserved * [ 0x2000, 0x4000] RAM <-- element */ TAILQ_INSERT_BEFORE(element, new_element, chain); element->base = end; } else if (end == element->end) { /* * New element at system memory end boundary. Add new * element after current and adjust the end of the * current element. * * Old table: * [ 0x1000, 0x4000] RAM <-- element * New table: * [ 0x1000, 0x3000] RAM <-- element * [ 0x3000, 0x4000] Reserved */ TAILQ_INSERT_AFTER(&e820_table, element, new_element, chain); element->end = base; } else { /* * New element inside system memory entry. Split it by * adding a system memory element and the new element * before current. * * Old table: * [ 0x1000, 0x4000] RAM <-- element * New table: * [ 0x1000, 0x2000] RAM * [ 0x2000, 0x3000] Reserved * [ 0x3000, 0x4000] RAM <-- element */ ram_element = e820_element_alloc(element->base, base, E820_TYPE_MEMORY); if (ram_element == NULL) { return (ENOMEM); } TAILQ_INSERT_BEFORE(element, ram_element, chain); TAILQ_INSERT_BEFORE(element, new_element, chain); element->base = end; } return (0); } static int e820_add_memory_hole(const uint64_t base, const uint64_t end) { struct e820_element *element; struct e820_element *ram_element; assert(end >= base); /* * E820 table should be always sorted in ascending order. Therefore, * search for an element which end is larger than the base parameter. */ TAILQ_FOREACH(element, &e820_table, chain) { if (element->end > base) { break; } } if (element == NULL || end <= element->base) { /* Nothing to do. Hole already exists */ return (0); } /* Memory holes are only allowed in system memory */ assert(element->type == E820_TYPE_MEMORY); if (base == element->base) { /* * New hole at system memory base boundary. * * Old table: * [ 0x1000, 0x4000] RAM * New table: * [ 0x2000, 0x4000] RAM */ element->base = end; } else if (end == element->end) { /* * New hole at system memory end boundary. * * Old table: * [ 0x1000, 0x4000] RAM * New table: * [ 0x1000, 0x3000] RAM */ element->end = base; } else { /* * New hole inside system memory entry. Split the system memory. * * Old table: * [ 0x1000, 0x4000] RAM <-- element * New table: * [ 0x1000, 0x2000] RAM * [ 0x3000, 0x4000] RAM <-- element */ ram_element = e820_element_alloc(element->base, base, E820_TYPE_MEMORY); if (ram_element == NULL) { return (ENOMEM); } TAILQ_INSERT_BEFORE(element, ram_element, chain); element->base = end; } return (0); } static uint64_t e820_alloc_highest(const uint64_t max_address, const uint64_t length, const uint64_t alignment, const enum e820_memory_type type) { struct e820_element *element; TAILQ_FOREACH_REVERSE(element, &e820_table, e820_table, chain) { uint64_t address, base, end; end = MIN(max_address, element->end); base = roundup2(element->base, alignment); /* * If end - length == 0, we would allocate memory at address 0. This * address is mostly unusable and we should avoid allocating it. 
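The GVT-d scenario described in the comment above maps onto the allocation strategies defined further down in this file: try the exact host address first, then retry asking for the highest free range below 4 GB. A hedged illustration of such a caller, assuming a page-aligned host_gsm_base/gsm_size pair (both names invented for this sketch, not taken from the diff):

#include "e820.h"

/*
 * Illustrative only: reserve guest-physical space for a pass-through
 * region, preferring the host's own address and falling back to the
 * highest available slot below 4 GB.  Returns 0 if neither works.
 */
static uint64_t
reserve_passthru_region(uint64_t host_gsm_base, uint64_t gsm_size)
{
	uint64_t gpa;

	/* First choice: reuse the host address (assumed 4 KB aligned). */
	gpa = e820_alloc(host_gsm_base, gsm_size, 4096,
	    E820_TYPE_RESERVED, E820_ALLOCATE_SPECIFIC);
	if (gpa == 0) {
		/*
		 * Fall back: let the allocator pick the highest free
		 * address below 4 GB; for E820_ALLOCATE_HIGHEST the
		 * address argument acts as the upper bound.
		 */
		gpa = e820_alloc(4ULL * 1024 * 1024 * 1024, gsm_size, 4096,
		    E820_TYPE_RESERVED, E820_ALLOCATE_HIGHEST);
	}
	return (gpa);
}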
* Therefore, search for another block in that case. */ if (element->type != E820_TYPE_MEMORY || end < base || end - base < length || end - length == 0) { continue; } address = rounddown2(end - length, alignment); if (e820_add_entry(address, address + length, type) != 0) { return (0); } return (address); } return (0); } static uint64_t e820_alloc_lowest(const uint64_t min_address, const uint64_t length, const uint64_t alignment, const enum e820_memory_type type) { struct e820_element *element; TAILQ_FOREACH(element, &e820_table, chain) { uint64_t base, end; end = element->end; base = MAX(min_address, roundup2(element->base, alignment)); /* * If base == 0, we would allocate memory at address 0. This * address is mostly unusable and we should avoid allocating it. * Therefore, search for another block in that case. */ if (element->type != E820_TYPE_MEMORY || end < base || end - base < length || base == 0) { continue; } if (e820_add_entry(base, base + length, type) != 0) { return (0); } return (base); } return (0); } uint64_t e820_alloc(const uint64_t address, const uint64_t length, const uint64_t alignment, const enum e820_memory_type type, const enum e820_allocation_strategy strategy) { assert(powerof2(alignment)); assert((address & (alignment - 1)) == 0); switch (strategy) { case E820_ALLOCATE_ANY: /* * Allocate any address. Therefore, ignore the address parameter * and reuse the code path for allocating the lowest address. */ return (e820_alloc_lowest(0, length, alignment, type)); case E820_ALLOCATE_LOWEST: return (e820_alloc_lowest(address, length, alignment, type)); case E820_ALLOCATE_HIGHEST: return (e820_alloc_highest(address, length, alignment, type)); case E820_ALLOCATE_SPECIFIC: if (e820_add_entry(address, address + length, type) != 0) { return (0); } return (address); } return (0); } int e820_init(struct vmctx *const ctx) { uint64_t lowmem_size, highmem_size; int error; TAILQ_INIT(&e820_table); lowmem_size = vm_get_lowmem_size(ctx); error = e820_add_entry(0, lowmem_size, E820_TYPE_MEMORY); if (error) { warnx("%s: Could not add lowmem", __func__); return (error); } highmem_size = vm_get_highmem_size(ctx); if (highmem_size != 0) { error = e820_add_entry(4 * GB, 4 * GB + highmem_size, E820_TYPE_MEMORY); if (error) { warnx("%s: Could not add highmem", __func__); return (error); } } error = e820_add_memory_hole(E820_VGA_MEM_BASE, E820_VGA_MEM_END); if (error) { warnx("%s: Could not add VGA memory", __func__); return (error); } error = e820_add_memory_hole(E820_ROM_MEM_BASE, E820_ROM_MEM_END); if (error) { warnx("%s: Could not add ROM area", __func__); return (error); } return (0); } int e820_finalize(void) { struct qemu_fwcfg_item *e820_fwcfg_item; int error; e820_fwcfg_item = e820_get_fwcfg_item(); if (e820_fwcfg_item == NULL) { warnx("invalid e820 table"); return (ENOMEM); } error = qemu_fwcfg_add_file("etc/e820", e820_fwcfg_item->size, e820_fwcfg_item->data); if (error != 0) { warnx("could not add qemu fwcfg etc/e820"); return (error); } free(e820_fwcfg_item); return (0); } diff --git a/usr.sbin/bhyve/amd64/vmexit.c b/usr.sbin/bhyve/amd64/vmexit.c index 5d903d98f277..2c01c63f6454 100644 --- a/usr.sbin/bhyve/amd64/vmexit.c +++ b/usr.sbin/bhyve/amd64/vmexit.c @@ -1,506 +1,506 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "gdb.h" #include "inout.h" #include "mem.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "spinup_ap.h" #include "vmexit.h" #include "xmsr.h" void vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode) { int error, restart_instruction; restart_instruction = 1; error = vm_inject_exception(vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } static int vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; int error; int bytes, port, in; vme = vmrun->vm_exit; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; error = emulate_inout(ctx, vcpu, vme); if (error) { - fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", + EPRINTLN("Unhandled %s%c 0x%04x at 0x%lx", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 
'w' : 'l'), port, vme->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; uint64_t val; uint32_t eax, edx; int error; vme = vmrun->vm_exit; val = 0; error = emulate_rdmsr(vcpu, vme->u.msr.code, &val); if (error != 0) { - fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", + EPRINTLN("rdmsr to register %#x on vcpu %d", vme->u.msr.code, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(vcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; int error; vme = vmrun->vm_exit; error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { - fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", + EPRINTLN("wrmsr to register %#x(%#lx) on vcpu %d", vme->u.msr.code, vme->u.msr.wval, vcpu_id(vcpu)); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(vcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = "VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", [EXIT_REASON_APIC_WRITE] = "APIC 
write", [EXIT_REASON_RDRAND] = "RDRAND", [EXIT_REASON_INVPCID] = "INVPCID", [EXIT_REASON_VMFUNC] = "VMFUNC", [EXIT_REASON_ENCLS] = "ENCLS", [EXIT_REASON_RDSEED] = "RDSEED", [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", [EXIT_REASON_XSAVES] = "XSAVES", [EXIT_REASON_XRSTORS] = "XRSTORS" }; static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static int vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; vme = vmrun->vm_exit; - fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); - fprintf(stderr, "\treason\t\tVMX\n"); - fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); - fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); - fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status); - fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason, + EPRINTLN("vm exit[%d]", vcpu_id(vcpu)); + EPRINTLN("\treason\t\tVMX"); + EPRINTLN("\trip\t\t0x%016lx", vme->rip); + EPRINTLN("\tinst_length\t%d", vme->inst_length); + EPRINTLN("\tstatus\t\t%d", vme->u.vmx.status); + EPRINTLN("\texit_reason\t%u (%s)", vme->u.vmx.exit_reason, vmexit_vmx_desc(vme->u.vmx.exit_reason)); - fprintf(stderr, "\tqualification\t0x%016lx\n", + EPRINTLN("\tqualification\t0x%016lx", vme->u.vmx.exit_qualification); - fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type); - fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error); + EPRINTLN("\tinst_type\t\t%d", vme->u.vmx.inst_type); + EPRINTLN("\tinst_error\t\t%d", vme->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(vcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); - fprintf(stderr, "\tEPT misconfiguration:\n"); - fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); - fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", + EPRINTLN("\tEPT misconfiguration:"); + EPRINTLN("\t\tGPA: %#lx", ept_misconfig_gpa); + EPRINTLN("\t\tPTE(%d): %#lx %#lx %#lx %#lx", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; vme = vmrun->vm_exit; - fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu)); - fprintf(stderr, "\treason\t\tSVM\n"); - fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); - fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); - fprintf(stderr, "\texitcode\t%#lx\n", vme->u.svm.exitcode); - fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1); - fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2); + EPRINTLN("vm exit[%d]", vcpu_id(vcpu)); + EPRINTLN("\treason\t\tSVM"); + EPRINTLN("\trip\t\t0x%016lx", vme->rip); + EPRINTLN("\tinst_length\t%d", vme->inst_length); + EPRINTLN("\texitcode\t%#lx", vme->u.svm.exitcode); + EPRINTLN("\texitinfo1\t%#lx", vme->u.svm.exitinfo1); + EPRINTLN("\texitinfo2\t%#lx", vme->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct 
vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun __unused) { /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. */ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun __unused) { return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { assert(vmrun->vm_exit->inst_length == 0); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_mtrap(vcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; struct vie *vie; int err, i, cs_d; enum vm_cpu_mode mode; vme = vmrun->vm_exit; vie = &vme->u.inst_emul.vie; if (!vie->decoded) { /* * Attempt to decode in userspace as a fallback. This allows * updating instruction decode in bhyve without rebooting the * kernel (rapid prototyping), albeit with much slower * emulation. */ vie_restart(vie); mode = vme->u.inst_emul.paging.cpu_mode; cs_d = vme->u.inst_emul.cs_d; if (vmm_decode_instruction(mode, cs_d, vie) != 0) goto fail; if (vm_set_register(vcpu, VM_REG_GUEST_RIP, vme->rip + vie->num_processed) != 0) goto fail; } err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie, &vme->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vme->u.inst_emul.gpa); } goto fail; } return (VMEXIT_CONTINUE); fail: fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vme->rip); return (VMEXIT_ABORT); } static int vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) { struct vm_exit *vme; enum vm_suspend_how how; int vcpuid = vcpu_id(vcpu); vme = vmrun->vm_exit; how = vme->u.suspended.how; fbsdrun_deletecpu(vcpuid); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: if (get_config_bool_default("destroy_on_poweroff", false)) vm_destroy(ctx); exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: - fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); + EPRINTLN("vmexit_suspend: invalid reason %d", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun __unused) { #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(vcpu_id(vcpu)); #endif gdb_cpu_suspend(vcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(vcpu_id(vcpu)); #endif /* * XXX-MJ sleep for a short period to avoid chewing up the CPU in the * window between activation of the vCPU thread and the STARTUP IPI. 
*/ usleep(1000); return (VMEXIT_CONTINUE); } static int vmexit_breakpoint(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun) { gdb_cpu_breakpoint(vcpu, vmrun->vm_exit); return (VMEXIT_CONTINUE); } static int vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, struct vm_run *vmrun) { struct vm_exit *vme; cpuset_t *dmask; int error = -1; int i; dmask = vmrun->cpuset; vme = vmrun->vm_exit; switch (vme->u.ipi.mode) { case APIC_DELMODE_INIT: CPU_FOREACH_ISSET(i, dmask) { error = fbsdrun_suspendcpu(i); if (error) { warnx("failed to suspend cpu %d", i); break; } } break; case APIC_DELMODE_STARTUP: CPU_FOREACH_ISSET(i, dmask) { spinup_ap(fbsdrun_vcpu(i), vme->u.ipi.vector << PAGE_SHIFT); } error = 0; break; default: break; } return (error); } int vmexit_task_switch(struct vmctx *, struct vcpu *, struct vm_run *); const vmexit_handler_t vmexit_handlers[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, [VM_EXITCODE_IPI] = vmexit_ipi, [VM_EXITCODE_HLT] = vmexit_hlt, [VM_EXITCODE_PAUSE] = vmexit_pause, }; diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index 027042cf0c14..0d7f58509244 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,1061 +1,1062 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #ifdef BHYVE_SNAPSHOT #include #include #endif #include #ifdef BHYVE_SNAPSHOT #include #endif #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #include #include #endif #include #include "acpi.h" #include "bhyverun.h" #include "bootrom.h" #include "config.h" #include "debug.h" #ifdef BHYVE_GDB #include "gdb.h" #endif #include "mem.h" #include "mevent.h" #include "pci_emul.h" #ifdef __amd64__ #include "amd64/pci_lpc.h" #endif #include "qemu_fwcfg.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "tpm_device.h" #include "vmgenc.h" #include "vmexit.h" #define MB (1024UL * 1024) #define GB (1024UL * MB) int guest_ncpus; uint16_t cpu_cores, cpu_sockets, cpu_threads; int raw_stdio = 0; static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu); static struct vcpu_info { struct vmctx *ctx; struct vcpu *vcpu; int vcpuid; } *vcpu_info; static cpuset_t **vcpumap; static void usage(int code) { fprintf(stderr, "Usage: %s [-AaCDeHhPSuWwxY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n" " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n" " -A: create ACPI tables\n" " -a: local apic is in xAPIC mode (deprecated)\n" " -C: include guest memory in core file\n" " -c: number of CPUs and/or topology specification\n" " -D: destroy on power-off\n" " -e: exit on unhandled I/O access\n" " -G: start a debug server\n" " -H: vmexit from the guest on HLT\n" " -h: help\n" " -k: key=value flat config file\n" " -K: PS2 keyboard layout\n" " -l: LPC device configuration\n" " -m: memory size\n" " -o: set config 'var' to 'value'\n" " -P: vmexit from the guest on pause\n" " -p: pin 'vcpu' to 'hostcpu'\n" #ifdef BHYVE_SNAPSHOT " -r: path to checkpoint file\n" #endif " -S: guest memory cannot be swapped\n" " -s: PCI slot config\n" " -U: UUID\n" " -u: RTC keeps UTC time\n" " -W: force virtio to use single-vector MSI\n" " -w: ignore unimplemented MSRs\n" " -x: local APIC is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",," as setting "cpus" to an * empty string. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. 
*/ static int topology_parse(const char *opt) { char *cp, *str, *tofree; if (*opt == '\0') { set_config_value("sockets", "1"); set_config_value("cores", "1"); set_config_value("threads", "1"); set_config_value("cpus", "1"); return (0); } tofree = str = strdup(opt); if (str == NULL) errx(4, "Failed to allocate memory"); while ((cp = strsep(&str, ",")) != NULL) { if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) set_config_value("cpus", cp + strlen("cpus=")); else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) set_config_value("sockets", cp + strlen("sockets=")); else if (strncmp(cp, "cores=", strlen("cores=")) == 0) set_config_value("cores", cp + strlen("cores=")); else if (strncmp(cp, "threads=", strlen("threads=")) == 0) set_config_value("threads", cp + strlen("threads=")); else if (strchr(cp, '=') != NULL) goto out; else set_config_value("cpus", cp); } free(tofree); return (0); out: free(tofree); return (-1); } static int parse_int_value(const char *key, const char *value, int minval, int maxval) { char *cp; long lval; errno = 0; lval = strtol(value, &cp, 0); if (errno != 0 || *cp != '\0' || cp == value || lval < minval || lval > maxval) errx(4, "Invalid value for %s: '%s'", key, value); return (lval); } /* * Set the sockets, cores, threads, and guest_cpus variables based on * the configured topology. * * The limits of UINT16_MAX are due to the types passed to * vm_set_topology(). vmm.ko may enforce tighter limits. */ static void calc_topology(void) { const char *value; bool explicit_cpus; uint64_t ncpus; value = get_config_value("cpus"); if (value != NULL) { guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); explicit_cpus = true; } else { guest_ncpus = 1; explicit_cpus = false; } value = get_config_value("cores"); if (value != NULL) cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); else cpu_cores = 1; value = get_config_value("threads"); if (value != NULL) cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); else cpu_threads = 1; value = get_config_value("sockets"); if (value != NULL) cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); else cpu_sockets = guest_ncpus; /* * Compute sockets * cores * threads avoiding overflow. The * range check above insures these are 16 bit values. */ ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; if (ncpus > UINT16_MAX) errx(4, "Computed number of vCPUs too high: %ju", (uintmax_t)ncpus); if (explicit_cpus) { if (guest_ncpus != (int)ncpus) errx(4, "Topology (%d sockets, %d cores, %d threads) " "does not match %d vCPUs", cpu_sockets, cpu_cores, cpu_threads, guest_ncpus); } else guest_ncpus = ncpus; } static int pincpu_parse(const char *opt) { const char *value; char *newval; char key[16]; int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0) { fprintf(stderr, "invalid vcpu '%d'\n", vcpu); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", value != NULL ? 
"," : "", pcpu) == -1) { perror("failed to build new cpuset string"); return (-1); } set_config_value(key, newval); free(newval); return (0); } static void parse_cpuset(int vcpu, const char *list, cpuset_t *set) { char *cp, *token; int pcpu, start; CPU_ZERO(set); start = -1; token = __DECONST(char *, list); for (;;) { pcpu = strtoul(token, &cp, 0); if (cp == token) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); if (pcpu < 0 || pcpu >= CPU_SETSIZE) errx(4, "hostcpu '%d' outside valid range from 0 to %d", pcpu, CPU_SETSIZE - 1); switch (*cp) { case ',': case '\0': if (start >= 0) { if (start > pcpu) errx(4, "Invalid hostcpu range %d-%d", start, pcpu); while (start < pcpu) { CPU_SET(start, set); start++; } start = -1; } CPU_SET(pcpu, set); break; case '-': if (start >= 0) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); start = pcpu; break; default: errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); } if (*cp == '\0') break; token = cp + 1; } } static void build_vcpumaps(void) { char key[16]; const char *value; int vcpu; vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (value == NULL) continue; vcpumap[vcpu] = malloc(sizeof(cpuset_t)); if (vcpumap[vcpu] == NULL) err(4, "Failed to allocate cpuset for vcpu %d", vcpu); parse_cpuset(vcpu, value, vcpumap[vcpu]); } } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr) { return (vm_rev_map_gpa(ctx, addr)); } #endif int fbsdrun_virtio_msix(void) { return (get_config_bool_default("virtio_msix", true)); } struct vcpu * fbsdrun_vcpu(int vcpuid) { return (vcpu_info[vcpuid].vcpu); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct vcpu_info *vi = param; int error; snprintf(tname, sizeof(tname), "vcpu %d", vi->vcpuid); pthread_set_name_np(pthread_self(), tname); if (vcpumap[vi->vcpuid] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vi->vcpuid]); assert(error == 0); } #ifdef BHYVE_SNAPSHOT checkpoint_cpu_add(vi->vcpuid); #endif #ifdef BHYVE_GDB gdb_cpu_add(vi->vcpu); #endif vm_loop(vi->ctx, vi->vcpu); /* not reached */ exit(1); return (NULL); } void fbsdrun_addcpu(int vcpuid) { struct vcpu_info *vi; pthread_t thr; int error; vi = &vcpu_info[vcpuid]; error = vm_activate_cpu(vi->vcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", vi->vcpuid); CPU_SET_ATOMIC(vcpuid, &cpumask); vm_suspend_cpu(vi->vcpu); error = pthread_create(&thr, NULL, fbsdrun_start_thread, vi); assert(error == 0); } void fbsdrun_deletecpu(int vcpu) { static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; pthread_mutex_lock(&resetcpu_mtx); if (!CPU_ISSET(vcpu, &cpumask)) { - fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); + EPRINTLN("Attempting to delete unknown cpu %d", vcpu); exit(4); } CPU_CLR(vcpu, &cpumask); if (vcpu != BSP) { pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); /* NOTREACHED */ } while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); } int fbsdrun_suspendcpu(int vcpuid) { return (vm_suspend_cpu(vcpu_info[vcpuid].vcpu)); } static void vm_loop(struct vmctx *ctx, struct vcpu *vcpu) { struct vm_exit vme; 
struct vm_run vmrun; int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus, dmask; error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu_id(vcpu), &active_cpus)); vmrun.vm_exit = &vme; vmrun.cpuset = &dmask; vmrun.cpusetsize = sizeof(dmask); while (1) { error = vm_run(vcpu, &vmrun); if (error != 0) break; exitcode = vme.exitcode; if (exitcode >= VM_EXITCODE_MAX || vmexit_handlers[exitcode] == NULL) { warnx("vm_loop: unexpected exitcode 0x%x", exitcode); exit(4); } rc = (*vmexit_handlers[exitcode])(ctx, vcpu, &vmrun); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } - fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); + EPRINTLN("vm_run error %d, errno %d", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx, struct vcpu *vcpu) { uint16_t sockets, cores, threads, maxcpus; int tmp, error; /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ error = vm_get_capability(vcpu, VM_CAP_UNRESTRICTED_GUEST, &tmp); if (error != 0) return (1); error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); if (error == 0) return (maxcpus); else return (1); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; reinit = romboot = false; #ifdef __amd64__ if (lpc_bootrom()) romboot = true; #endif error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. */ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM if (vm_limit_rights(ctx) != 0) err(EX_OSERR, "vm_limit_rights"); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } static bool parse_config_option(const char *option) { const char *value; char *path; value = strchr(option, '='); if (value == NULL || value[1] == '\0') return (false); path = strndup(option, value - option); if (path == NULL) err(4, "Failed to allocate memory"); set_config_value(path, value + 1); return (true); } static void parse_simple_config_file(const char *path) { FILE *fp; char *line, *cp; size_t linecap; unsigned int lineno; fp = fopen(path, "r"); if (fp == NULL) err(4, "Failed to open configuration file %s", path); line = NULL; linecap = 0; lineno = 1; for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { if (*line == '#' || *line == '\n') continue; cp = strchr(line, '\n'); if (cp != NULL) *cp = '\0'; if (!parse_config_option(line)) errx(4, "%s line %u: invalid config option '%s'", path, lineno, line); } free(line); fclose(fp); } #ifdef BHYVE_GDB static void parse_gdb_options(const char *opt) { const char *sport; char *colon; if (opt[0] == 'w') { set_config_bool("gdb.wait", true); opt++; } colon = strrchr(opt, ':'); if (colon == NULL) { sport = opt; } else { *colon = '\0'; colon++; sport = colon; set_config_value("gdb.address", opt); } set_config_value("gdb.port", sport); } #endif int main(int argc, char *argv[]) { int c, error; int max_vcpus, memflags; struct vcpu *bsp; struct vmctx *ctx; size_t memsize; const 
char *optstr, *value, *vmname; #ifdef BHYVE_SNAPSHOT char *restore_file; struct restore_state rstate; restore_file = NULL; #endif bhyve_init_config(); progname = basename(argv[0]); #ifdef BHYVE_SNAPSHOT optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:r:"; #else optstr = "aehuwxACDHIPSWYk:f:o:p:G:c:s:m:l:K:U:"; #endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { #ifdef __amd64__ case 'a': set_config_bool("x86.x2apic", false); break; #endif case 'A': /* * NOP. For backward compatibility. Most systems don't * work properly without sane ACPI tables. Therefore, * we're always generating them. */ break; case 'D': set_config_bool("destroy_on_poweroff", true); break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': set_config_bool("memory.guest_in_core", true); break; case 'f': if (qemu_fwcfg_parse_cmdline_arg(optarg) != 0) { errx(EX_USAGE, "invalid fwcfg item '%s'", optarg); } break; #ifdef BHYVE_GDB case 'G': parse_gdb_options(optarg); break; #endif case 'k': parse_simple_config_file(optarg); break; case 'K': set_config_value("keyboard.layout", optarg); break; #ifdef __amd64__ case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; #endif #ifdef BHYVE_SNAPSHOT case 'r': restore_file = optarg; break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': set_config_bool("memory.wired", true); break; case 'm': set_config_value("memory.size", optarg); break; case 'o': if (!parse_config_option(optarg)) errx(EX_USAGE, "invalid configuration option '%s'", optarg); break; #ifdef __amd64__ case 'H': set_config_bool("x86.vmexit_on_hlt", true); break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. 
*/ break; case 'P': set_config_bool("x86.vmexit_on_pause", true); break; case 'e': set_config_bool("x86.strictio", true); break; case 'u': set_config_bool("rtc.use_localtime", false); break; #endif case 'U': set_config_value("uuid", optarg); break; #ifdef __amd64__ case 'w': set_config_bool("x86.strictmsr", false); break; #endif case 'W': set_config_bool("virtio_msix", false); break; #ifdef __amd64__ case 'x': set_config_bool("x86.x2apic", true); break; case 'Y': set_config_bool("x86.mptable", false); break; #endif case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc > 1) usage(1); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { error = load_restore_file(restore_file, &rstate); if (error) { fprintf(stderr, "Failed to read checkpoint info from " "file: '%s'.\n", restore_file); exit(1); } vmname = lookup_vmname(&rstate); if (vmname != NULL) set_config_value("name", vmname); } #endif if (argc == 1) set_config_value("name", argv[0]); vmname = get_config_value("name"); if (vmname == NULL) usage(1); if (get_config_bool_default("config.dump", false)) { dump_config(); exit(1); } calc_topology(); build_vcpumaps(); value = get_config_value("memory.size"); error = vm_parse_memsize(value, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", value); ctx = do_open(vmname); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { guest_ncpus = lookup_guest_ncpus(&rstate); memflags = lookup_memflags(&rstate); memsize = lookup_memsize(&rstate); } if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } #endif bsp = vm_vcpu_open(ctx, BSP); max_vcpus = num_vcpus_allowed(ctx, bsp); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } bhyve_init_vcpu(bsp); /* Allocate per-VCPU resources. */ vcpu_info = calloc(guest_ncpus, sizeof(*vcpu_info)); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) { vcpu_info[vcpuid].ctx = ctx; vcpu_info[vcpuid].vcpuid = vcpuid; if (vcpuid == BSP) vcpu_info[vcpuid].vcpu = bsp; else vcpu_info[vcpuid].vcpu = vm_vcpu_open(ctx, vcpuid); } memflags = 0; if (get_config_bool_default("memory.wired", false)) memflags |= VM_MEM_F_WIRED; if (get_config_bool_default("memory.guest_in_core", false)) memflags |= VM_MEM_F_INCORE; vm_set_memflags(ctx, memflags); error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (error) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } init_mem(guest_ncpus); init_bootrom(ctx); if (bhyve_init_platform(ctx, bsp) != 0) exit(4); if (qemu_fwcfg_init(ctx) != 0) { - fprintf(stderr, "qemu fwcfg initialization error"); + fprintf(stderr, "qemu fwcfg initialization error\n"); exit(4); } if (qemu_fwcfg_add_file("opt/bhyve/hw.ncpu", sizeof(guest_ncpus), &guest_ncpus) != 0) { - fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu"); + fprintf(stderr, "Could not add qemu fwcfg opt/bhyve/hw.ncpu\n"); exit(4); } /* * Exit if a device emulation finds an error in its initialization */ if (init_pci(ctx) != 0) { - perror("device emulation initialization error"); + EPRINTLN("Device emulation initialization error: %s", + strerror(errno)); exit(4); } if (init_tpm(ctx) != 0) { - fprintf(stderr, "Failed to init TPM device"); + EPRINTLN("Failed to init TPM device"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. */ if (get_config_bool("acpi_tables")) vmgenc_init(ctx); #ifdef BHYVE_GDB init_gdb(ctx); #endif /* * Add all vCPUs. 
*/ for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) bhyve_start_vcpu(vcpu_info[vcpuid].vcpu, vcpuid == BSP); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { - fprintf(stdout, "Pausing pci devs...\r\n"); + FPRINTLN(stdout, "Pausing pci devs..."); if (vm_pause_devices() != 0) { - fprintf(stderr, "Failed to pause PCI device state.\n"); + EPRINTLN("Failed to pause PCI device state."); exit(1); } - fprintf(stdout, "Restoring vm mem...\r\n"); + FPRINTLN(stdout, "Restoring vm mem..."); if (restore_vm_mem(ctx, &rstate) != 0) { - fprintf(stderr, "Failed to restore VM memory.\n"); + EPRINTLN("Failed to restore VM memory."); exit(1); } - fprintf(stdout, "Restoring pci devs...\r\n"); + FPRINTLN(stdout, "Restoring pci devs..."); if (vm_restore_devices(&rstate) != 0) { - fprintf(stderr, "Failed to restore PCI device state.\n"); + EPRINTLN("Failed to restore PCI device state."); exit(1); } - fprintf(stdout, "Restoring kernel structs...\r\n"); + FPRINTLN(stdout, "Restoring kernel structs..."); if (vm_restore_kern_structs(ctx, &rstate) != 0) { - fprintf(stderr, "Failed to restore kernel structs.\n"); + EPRINTLN("Failed to restore kernel structs."); exit(1); } - fprintf(stdout, "Resuming pci devs...\r\n"); + FPRINTLN(stdout, "Resuming pci devs..."); if (vm_resume_devices() != 0) { - fprintf(stderr, "Failed to resume PCI device state.\n"); + EPRINTLN("Failed to resume PCI device state."); exit(1); } } #endif if (bhyve_init_platform_late(ctx, bsp) != 0) exit(4); /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); #ifdef BHYVE_SNAPSHOT /* initialize mutex/cond variables */ init_snapshot(); /* * checkpointing thread for communication with bhyvectl */ if (init_checkpoint_thread(ctx) != 0) errx(EX_OSERR, "Failed to start checkpoint thread"); #endif #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { destroy_restore_state(&rstate); if (vm_restore_time(ctx) < 0) err(EX_OSERR, "Unable to restore time"); for (int vcpuid = 0; vcpuid < guest_ncpus; vcpuid++) vm_resume_cpu(vcpu_info[vcpuid].vcpu); } else #endif vm_resume_cpu(bsp); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c index 93ddf6d8e0c6..29b8910101f6 100644 --- a/usr.sbin/bhyve/block_if.c +++ b/usr.sbin/bhyve/block_if.c @@ -1,1042 +1,1042 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 Peter Grehan * All rights reserved. * Copyright 2020 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "mevent.h" #include "pci_emul.h" #include "block_if.h" #define BLOCKIF_SIG 0xb109b109 #define BLOCKIF_NUMTHR 8 #define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) enum blockop { BOP_READ, BOP_WRITE, BOP_FLUSH, BOP_DELETE }; enum blockstat { BST_FREE, BST_BLOCK, BST_PEND, BST_BUSY, BST_DONE }; struct blockif_elem { TAILQ_ENTRY(blockif_elem) be_link; struct blockif_req *be_req; enum blockop be_op; enum blockstat be_status; pthread_t be_tid; off_t be_block; }; struct blockif_ctxt { unsigned int bc_magic; int bc_fd; int bc_ischr; int bc_isgeom; int bc_candelete; int bc_rdonly; off_t bc_size; int bc_sectsz; int bc_psectsz; int bc_psectoff; int bc_closing; int bc_paused; pthread_t bc_btid[BLOCKIF_NUMTHR]; pthread_mutex_t bc_mtx; pthread_cond_t bc_cond; pthread_cond_t bc_work_done_cond; blockif_resize_cb *bc_resize_cb; void *bc_resize_cb_arg; struct mevent *bc_resize_event; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; TAILQ_HEAD(, blockif_elem) bc_pendq; TAILQ_HEAD(, blockif_elem) bc_busyq; struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; int bc_bootindex; }; static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; struct blockif_sig_elem { pthread_mutex_t bse_mtx; pthread_cond_t bse_cond; int bse_pending; struct blockif_sig_elem *bse_next; }; static struct blockif_sig_elem *blockif_bse_head; static int blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { struct blockif_elem *be, *tbe; off_t off; int i; be = TAILQ_FIRST(&bc->bc_freeq); assert(be != NULL); assert(be->be_status == BST_FREE); TAILQ_REMOVE(&bc->bc_freeq, be, be_link); be->be_req = breq; be->be_op = op; switch (op) { case BOP_READ: case BOP_WRITE: case BOP_DELETE: off = breq->br_offset; for (i = 0; i < breq->br_iovcnt; i++) off += breq->br_iov[i].iov_len; break; default: off = OFF_MAX; } be->be_block = off; TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_block == breq->br_offset) break; } if (tbe == NULL) { TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { if (tbe->be_block == breq->br_offset) break; } } if (tbe == NULL) be->be_status = BST_PEND; else be->be_status = BST_BLOCK; TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); return (be->be_status == BST_PEND); } static int blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) { struct blockif_elem *be; TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_status == BST_PEND) break; assert(be->be_status == BST_BLOCK); } if (be == NULL) return (0); TAILQ_REMOVE(&bc->bc_pendq, be, be_link); be->be_status = BST_BUSY; be->be_tid = t; TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); *bep = be; return (1); } static void blockif_complete(struct 
blockif_ctxt *bc, struct blockif_elem *be) { struct blockif_elem *tbe; if (be->be_status == BST_DONE || be->be_status == BST_BUSY) TAILQ_REMOVE(&bc->bc_busyq, be, be_link); else TAILQ_REMOVE(&bc->bc_pendq, be, be_link); TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { if (tbe->be_req->br_offset == be->be_block) tbe->be_status = BST_PEND; } be->be_tid = 0; be->be_status = BST_FREE; be->be_req = NULL; TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); } static int blockif_flush_bc(struct blockif_ctxt *bc) { if (bc->bc_ischr) { if (ioctl(bc->bc_fd, DIOCGFLUSH)) return (errno); } else if (fsync(bc->bc_fd)) return (errno); return (0); } static void blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { struct spacectl_range range; struct blockif_req *br; off_t arg[2]; ssize_t n; size_t clen, len, off, boff, voff; int i, err; br = be->be_req; assert(br->br_resid >= 0); if (br->br_iovcnt <= 1) buf = NULL; err = 0; switch (be->be_op) { case BOP_READ: if (buf == NULL) { if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= n; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); n = pread(bc->bc_fd, buf, len, br->br_offset + off); if (n < 0) { err = errno; break; } len = (size_t)n; boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy((uint8_t *)br->br_iov[i].iov_base + voff, buf + boff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); off += len; br->br_resid -= len; } break; case BOP_WRITE: if (bc->bc_rdonly) { err = EROFS; break; } if (buf == NULL) { if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, br->br_offset)) < 0) err = errno; else br->br_resid -= n; break; } i = 0; off = voff = 0; while (br->br_resid > 0) { len = MIN(br->br_resid, MAXPHYS); boff = 0; do { clen = MIN(len - boff, br->br_iov[i].iov_len - voff); memcpy(buf + boff, (uint8_t *)br->br_iov[i].iov_base + voff, clen); if (clen < br->br_iov[i].iov_len - voff) voff += clen; else { i++; voff = 0; } boff += clen; } while (boff < len); n = pwrite(bc->bc_fd, buf, len, br->br_offset + off); if (n < 0) { err = errno; break; } off += n; br->br_resid -= n; } break; case BOP_FLUSH: err = blockif_flush_bc(bc); break; case BOP_DELETE: if (!bc->bc_candelete) err = EOPNOTSUPP; else if (bc->bc_rdonly) err = EROFS; else if (bc->bc_ischr) { arg[0] = br->br_offset; arg[1] = br->br_resid; if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) err = errno; else br->br_resid = 0; } else { range.r_offset = br->br_offset; range.r_len = br->br_resid; while (range.r_len > 0) { if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC, &range, 0, &range) != 0) { err = errno; break; } } if (err == 0) br->br_resid = 0; } break; default: err = EINVAL; break; } be->be_status = BST_DONE; (*br->br_callback)(br, err); } static inline bool blockif_empty(const struct blockif_ctxt *bc) { return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq)); } static void * blockif_thr(void *arg) { struct blockif_ctxt *bc; struct blockif_elem *be; pthread_t t; uint8_t *buf; bc = arg; if (bc->bc_isgeom) buf = malloc(MAXPHYS); else buf = NULL; t = pthread_self(); pthread_mutex_lock(&bc->bc_mtx); for (;;) { while (blockif_dequeue(bc, t, &be)) { pthread_mutex_unlock(&bc->bc_mtx); blockif_proc(bc, be, buf); pthread_mutex_lock(&bc->bc_mtx); blockif_complete(bc, be); } /* If none to work, notify the main thread */ if (blockif_empty(bc)) pthread_cond_broadcast(&bc->bc_work_done_cond); /* Check ctxt status 
here to see if exit requested */ if (bc->bc_closing) break; pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); } pthread_mutex_unlock(&bc->bc_mtx); if (buf) free(buf); pthread_exit(NULL); return (NULL); } static void blockif_sigcont_handler(int signal __unused, enum ev_type type __unused, void *arg __unused) { struct blockif_sig_elem *bse; for (;;) { /* * Process the entire list even if not intended for * this thread. */ do { bse = blockif_bse_head; if (bse == NULL) return; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)bse, (uintptr_t)bse->bse_next)); pthread_mutex_lock(&bse->bse_mtx); bse->bse_pending = 0; pthread_cond_signal(&bse->bse_cond); pthread_mutex_unlock(&bse->bse_mtx); } } static void blockif_init(void) { mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); (void) signal(SIGCONT, SIG_IGN); } int blockif_legacy_config(nvlist_t *nvl, const char *opts) { char *cp, *path; if (opts == NULL) return (0); cp = strchr(opts, ','); if (cp == NULL) { set_config_value_node(nvl, "path", opts); return (0); } path = strndup(opts, cp - opts); set_config_value_node(nvl, "path", path); free(path); return (pci_parse_legacy_config(nvl, cp + 1)); } int blockif_add_boot_device(struct pci_devinst *const pi, struct blockif_ctxt *const bc) { if (bc->bc_bootindex < 0) return (0); return (pci_emul_add_boot_device(pi, bc->bc_bootindex)); } struct blockif_ctxt * blockif_open(nvlist_t *nvl, const char *ident) { char tname[MAXCOMLEN + 1]; char name[MAXPATHLEN]; const char *path, *pssval, *ssval, *bootindex_val; char *cp; struct blockif_ctxt *bc; struct stat sbuf; struct diocgattr_arg arg; off_t size, psectsz, psectoff; int extra, fd, i, sectsz; int ro, candelete, geom, ssopt, pssopt; int nodelete; int bootindex; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE }; #endif pthread_once(&blockif_once, blockif_init); fd = -1; extra = 0; ssopt = 0; ro = 0; nodelete = 0; bootindex = -1; if (get_config_bool_node_default(nvl, "nocache", false)) extra |= O_DIRECT; if (get_config_bool_node_default(nvl, "nodelete", false)) nodelete = 1; if (get_config_bool_node_default(nvl, "sync", false) || get_config_bool_node_default(nvl, "direct", false)) extra |= O_SYNC; if (get_config_bool_node_default(nvl, "ro", false)) ro = 1; ssval = get_config_value_node(nvl, "sectorsize"); if (ssval != NULL) { ssopt = strtol(ssval, &cp, 10); if (cp == ssval) { EPRINTLN("Invalid sector size \"%s\"", ssval); goto err; } if (*cp == '\0') { pssopt = ssopt; } else if (*cp == '/') { pssval = cp + 1; pssopt = strtol(pssval, &cp, 10); if (cp == pssval || *cp != '\0') { EPRINTLN("Invalid sector size \"%s\"", ssval); goto err; } } else { EPRINTLN("Invalid sector size \"%s\"", ssval); goto err; } } bootindex_val = get_config_value_node(nvl, "bootindex"); if (bootindex_val != NULL) { bootindex = atoi(bootindex_val); } path = get_config_value_node(nvl, "path"); if (path == NULL) { EPRINTLN("Missing \"path\" for block device."); goto err; } fd = open(path, (ro ? 
O_RDONLY : O_RDWR) | extra); if (fd < 0 && !ro) { /* Attempt a r/w fail with a r/o open */ fd = open(path, O_RDONLY | extra); ro = 1; } if (fd < 0) { warn("Could not open backing file: %s", path); goto err; } if (fstat(fd, &sbuf) < 0) { warn("Could not stat backing file %s", path); goto err; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF); if (ro) cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); if (caph_rights_limit(fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Deal with raw devices */ size = sbuf.st_size; sectsz = DEV_BSIZE; psectsz = psectoff = 0; candelete = geom = 0; if (S_ISCHR(sbuf.st_mode)) { if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || ioctl(fd, DIOCGSECTORSIZE, §sz)) { perror("Could not fetch dev blk/sector size"); goto err; } assert(size != 0); assert(sectsz != 0); if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); arg.len = sizeof(arg.value.i); if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0) candelete = arg.value.i; if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) geom = 1; } else { psectsz = sbuf.st_blksize; /* Avoid fallback implementation */ candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1; } #ifndef WITHOUT_CAPSICUM if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif if (ssopt != 0) { if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || ssopt > pssopt) { EPRINTLN("Invalid sector size %d/%d", ssopt, pssopt); goto err; } /* * Some backend drivers (e.g. cd0, ada0) require that the I/O * size be a multiple of the device's sector size. * * Validate that the emulated sector size complies with this * requirement. 
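 * For example, requesting a 512-byte emulated sector on a character
 * device that reports 4096-byte sectors (ssopt=512, sectsz=4096) is
 * rejected by the check below, while a 4096-byte emulated sector
 * would pass.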
*/ if (S_ISCHR(sbuf.st_mode)) { if (ssopt < sectsz || (ssopt % sectsz) != 0) { EPRINTLN("Sector size %d incompatible " "with underlying device sector size %d", ssopt, sectsz); goto err; } } sectsz = ssopt; psectsz = pssopt; psectoff = 0; } bc = calloc(1, sizeof(struct blockif_ctxt)); if (bc == NULL) { perror("calloc"); goto err; } bc->bc_magic = BLOCKIF_SIG; bc->bc_fd = fd; bc->bc_ischr = S_ISCHR(sbuf.st_mode); bc->bc_isgeom = geom; bc->bc_candelete = candelete; bc->bc_rdonly = ro; bc->bc_size = size; bc->bc_sectsz = sectsz; bc->bc_psectsz = psectsz; bc->bc_psectoff = psectoff; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); bc->bc_paused = 0; pthread_cond_init(&bc->bc_work_done_cond, NULL); TAILQ_INIT(&bc->bc_freeq); TAILQ_INIT(&bc->bc_pendq); TAILQ_INIT(&bc->bc_busyq); bc->bc_bootindex = bootindex; for (i = 0; i < BLOCKIF_MAXREQ; i++) { bc->bc_reqs[i].be_status = BST_FREE; TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); } for (i = 0; i < BLOCKIF_NUMTHR; i++) { pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); pthread_set_name_np(bc->bc_btid[i], tname); } return (bc); err: if (fd >= 0) close(fd); return (NULL); } static void blockif_resized(int fd, enum ev_type type __unused, void *arg) { struct blockif_ctxt *bc; struct stat sb; off_t mediasize; if (fstat(fd, &sb) != 0) return; if (S_ISCHR(sb.st_mode)) { if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) { EPRINTLN("blockif_resized: get mediasize failed: %s", strerror(errno)); return; } } else mediasize = sb.st_size; bc = arg; pthread_mutex_lock(&bc->bc_mtx); if (mediasize != bc->bc_size) { bc->bc_size = mediasize; bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size); } pthread_mutex_unlock(&bc->bc_mtx); } int blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb, void *cb_arg) { struct stat sb; int err; if (cb == NULL) return (EINVAL); err = 0; pthread_mutex_lock(&bc->bc_mtx); if (bc->bc_resize_cb != NULL) { err = EBUSY; goto out; } assert(bc->bc_closing == 0); if (fstat(bc->bc_fd, &sb) != 0) { err = errno; goto out; } bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE, EVFF_ATTRIB, blockif_resized, bc); if (bc->bc_resize_event == NULL) { err = ENXIO; goto out; } bc->bc_resize_cb = cb; bc->bc_resize_cb_arg = cb_arg; out: pthread_mutex_unlock(&bc->bc_mtx); return (err); } static int blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { int err; err = 0; pthread_mutex_lock(&bc->bc_mtx); assert(!bc->bc_paused); if (!TAILQ_EMPTY(&bc->bc_freeq)) { /* * Enqueue and inform the block i/o thread * that there is work available */ if (blockif_enqueue(bc, breq, op)) pthread_cond_signal(&bc->bc_cond); } else { /* * Callers are not allowed to enqueue more than * the specified blockif queue limit. Return an * error to indicate that the queue length has been * exceeded. 
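 * (blockif_queuesz() advertises BLOCKIF_MAXREQ - 1 to device models;
 * the E2BIG below signals that this limit has been exceeded.)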
*/ err = E2BIG; } pthread_mutex_unlock(&bc->bc_mtx); return (err); } int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_READ)); } int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_WRITE)); } int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_FLUSH)); } int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) { assert(bc->bc_magic == BLOCKIF_SIG); return (blockif_request(bc, breq, BOP_DELETE)); } int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { struct blockif_elem *be; assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); /* XXX: not waiting while paused */ /* * Check pending requests. */ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { if (be->be_req == breq) break; } if (be != NULL) { /* * Found it. */ blockif_complete(bc, be); pthread_mutex_unlock(&bc->bc_mtx); return (0); } /* * Check in-flight requests. */ TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { if (be->be_req == breq) break; } if (be == NULL) { /* * Didn't find it. */ pthread_mutex_unlock(&bc->bc_mtx); return (EINVAL); } /* * Interrupt the processing thread to force it return * prematurely via it's normal callback path. */ while (be->be_status == BST_BUSY) { struct blockif_sig_elem bse, *old_head; pthread_mutex_init(&bse.bse_mtx, NULL); pthread_cond_init(&bse.bse_cond, NULL); bse.bse_pending = 1; do { old_head = blockif_bse_head; bse.bse_next = old_head; } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, (uintptr_t)old_head, (uintptr_t)&bse)); pthread_kill(be->be_tid, SIGCONT); pthread_mutex_lock(&bse.bse_mtx); while (bse.bse_pending) pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); pthread_mutex_unlock(&bse.bse_mtx); } pthread_mutex_unlock(&bc->bc_mtx); /* * The processing thread has been interrupted. Since it's not * clear if the callback has been invoked yet, return EBUSY. */ return (EBUSY); } int blockif_close(struct blockif_ctxt *bc) { void *jval; int i; assert(bc->bc_magic == BLOCKIF_SIG); /* * Stop the block i/o thread */ pthread_mutex_lock(&bc->bc_mtx); bc->bc_closing = 1; if (bc->bc_resize_event != NULL) mevent_disable(bc->bc_resize_event); pthread_mutex_unlock(&bc->bc_mtx); pthread_cond_broadcast(&bc->bc_cond); for (i = 0; i < BLOCKIF_NUMTHR; i++) pthread_join(bc->bc_btid[i], &jval); /* XXX Cancel queued i/o's ??? */ /* * Release resources */ bc->bc_magic = 0; close(bc->bc_fd); free(bc); return (0); } /* * Return virtual C/H/S values for a given block. Use the algorithm * outlined in the VHD specification to calculate values. 
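 * The geometry is clamped to 65535 cylinders, 16 heads and 255 sectors
 * per track, so capacity beyond roughly 127 GiB (with 512-byte sectors)
 * is not addressable through CHS.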
*/ void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) { off_t sectors; /* total sectors of the block dev */ off_t hcyl; /* cylinders times heads */ uint16_t secpt; /* sectors per track */ uint8_t heads; assert(bc->bc_magic == BLOCKIF_SIG); sectors = bc->bc_size / bc->bc_sectsz; /* Clamp the size to the largest possible with CHS */ if (sectors > 65535L * 16 * 255) sectors = 65535L * 16 * 255; if (sectors >= 65536L * 16 * 63) { secpt = 255; heads = 16; hcyl = sectors / secpt; } else { secpt = 17; hcyl = sectors / secpt; heads = (hcyl + 1023) / 1024; if (heads < 4) heads = 4; if (hcyl >= (heads * 1024) || heads > 16) { secpt = 31; heads = 16; hcyl = sectors / secpt; } if (hcyl >= (heads * 1024)) { secpt = 63; heads = 16; hcyl = sectors / secpt; } } *c = hcyl / heads; *h = heads; *s = secpt; } /* * Accessors */ off_t blockif_size(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_size); } int blockif_sectsz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_sectsz); } void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) { assert(bc->bc_magic == BLOCKIF_SIG); *size = bc->bc_psectsz; *off = bc->bc_psectoff; } int blockif_queuesz(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (BLOCKIF_MAXREQ - 1); } int blockif_is_ro(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_rdonly); } int blockif_candelete(struct blockif_ctxt *bc) { assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_candelete); } #ifdef BHYVE_SNAPSHOT void blockif_pause(struct blockif_ctxt *bc) { assert(bc != NULL); assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); bc->bc_paused = 1; /* The interface is paused. Wait for workers to finish their work */ while (!blockif_empty(bc)) pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx); pthread_mutex_unlock(&bc->bc_mtx); if (!bc->bc_rdonly && blockif_flush_bc(bc)) - fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n", + EPRINTLN("%s: [WARN] failed to flush backing file.", __func__); } void blockif_resume(struct blockif_ctxt *bc) { assert(bc != NULL); assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); bc->bc_paused = 0; pthread_mutex_unlock(&bc->bc_mtx); } #endif /* BHYVE_SNAPSHOT */ diff --git a/usr.sbin/bhyve/bootrom.c b/usr.sbin/bhyve/bootrom.c index cf527b98af89..64f1921f901e 100644 --- a/usr.sbin/bhyve/bootrom.c +++ b/usr.sbin/bhyve/bootrom.c @@ -1,313 +1,313 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "bootrom.h" #include "debug.h" #include "mem.h" #define BOOTROM_SIZE (16 * 1024 * 1024) /* 16 MB */ /* * ROM region is 16 MB at the top of 4GB ("low") memory. * * The size is limited so it doesn't encroach into reserved MMIO space (e.g., * APIC, HPET, MSI). * * It is allocated in page-multiple blocks on a first-come first-serve basis, * from high to low, during initialization, and does not change at runtime. */ static char *romptr; /* Pointer to userspace-mapped bootrom region. */ static vm_paddr_t gpa_base; /* GPA of low end of region. */ static vm_paddr_t gpa_allocbot; /* Low GPA of free region. */ static vm_paddr_t gpa_alloctop; /* High GPA, minus 1, of free region. */ #define CFI_BCS_WRITE_BYTE 0x10 #define CFI_BCS_CLEAR_STATUS 0x50 #define CFI_BCS_READ_STATUS 0x70 #define CFI_BCS_READ_ARRAY 0xff static struct bootrom_var_state { uint8_t *mmap; uint64_t gpa; off_t size; uint8_t cmd; } var = { NULL, 0, 0, CFI_BCS_READ_ARRAY }; /* * Emulate just those CFI basic commands that will convince EDK II * that the Firmware Volume area is writable and persistent. */ static int bootrom_var_mem_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, int size, uint64_t *val, void *arg1 __unused, long arg2 __unused) { off_t offset; offset = addr - var.gpa; if (offset + size > var.size || offset < 0 || offset + size <= offset) return (EINVAL); if (dir == MEM_F_WRITE) { switch (var.cmd) { case CFI_BCS_WRITE_BYTE: memcpy(var.mmap + offset, val, size); var.cmd = CFI_BCS_READ_ARRAY; break; default: var.cmd = *(uint8_t *)val; } } else { switch (var.cmd) { case CFI_BCS_CLEAR_STATUS: case CFI_BCS_READ_STATUS: memset(val, 0, size); var.cmd = CFI_BCS_READ_ARRAY; break; default: memcpy(val, var.mmap + offset, size); break; } } return (0); } void init_bootrom(struct vmctx *ctx) { romptr = vm_create_devmem(ctx, VM_BOOTROM, "bootrom", BOOTROM_SIZE); if (romptr == MAP_FAILED) err(4, "%s: vm_create_devmem", __func__); gpa_base = (1ULL << 32) - BOOTROM_SIZE; gpa_allocbot = gpa_base; gpa_alloctop = (1ULL << 32) - 1; } int bootrom_alloc(struct vmctx *ctx, size_t len, int prot, int flags, char **region_out, uint64_t *gpa_out) { static const int bootrom_valid_flags = BOOTROM_ALLOC_TOP; vm_paddr_t gpa; vm_ooffset_t segoff; if (flags & ~bootrom_valid_flags) { warnx("%s: Invalid flags: %x", __func__, flags & ~bootrom_valid_flags); return (EINVAL); } if (prot & ~_PROT_ALL) { warnx("%s: Invalid protection: %x", __func__, prot & ~_PROT_ALL); return (EINVAL); } if (len == 0 || len > BOOTROM_SIZE) { warnx("ROM size %zu is invalid", len); return (EINVAL); } if (len & PAGE_MASK) { warnx("ROM size %zu is not a multiple of the page size", len); return (EINVAL); } if (flags & BOOTROM_ALLOC_TOP) { gpa = (gpa_alloctop - len) + 1; if (gpa < gpa_allocbot) { warnx("No room for %zu ROM in bootrom region", len); return (ENOMEM); } } else { gpa = gpa_allocbot; if (gpa > 
(gpa_alloctop - len) + 1) { warnx("No room for %zu ROM in bootrom region", len); return (ENOMEM); } } segoff = gpa - gpa_base; if (vm_mmap_memseg(ctx, gpa, VM_BOOTROM, segoff, len, prot) != 0) { int serrno = errno; warn("%s: vm_mmap_mapseg", __func__); return (serrno); } if (flags & BOOTROM_ALLOC_TOP) gpa_alloctop = gpa - 1; else gpa_allocbot = gpa + len; *region_out = romptr + segoff; if (gpa_out != NULL) *gpa_out = gpa; return (0); } int bootrom_loadrom(struct vmctx *ctx, const nvlist_t *nvl) { struct stat sbuf; ssize_t rlen; off_t rom_size, var_size, total_size; char *ptr, *romfile; int fd, varfd, i, rv; const char *bootrom, *varfile; rv = -1; varfd = -1; bootrom = get_config_value_node(nvl, "bootrom"); if (bootrom == NULL) { return (-1); } /* * get_config_value_node may use a thread local buffer to return * variables. So, when we query the second variable, the first variable * might get overwritten. For that reason, the bootrom should be * duplicated. */ romfile = strdup(bootrom); if (romfile == NULL) { return (-1); } fd = open(romfile, O_RDONLY); if (fd < 0) { EPRINTLN("Error opening bootrom \"%s\": %s", romfile, strerror(errno)); goto done; } if (fstat(fd, &sbuf) < 0) { EPRINTLN("Could not fstat bootrom file \"%s\": %s", romfile, strerror(errno)); goto done; } rom_size = sbuf.st_size; varfile = get_config_value_node(nvl, "bootvars"); var_size = 0; if (varfile != NULL) { varfd = open(varfile, O_RDWR); if (varfd < 0) { - fprintf(stderr, "Error opening bootrom variable file " - "\"%s\": %s\n", varfile, strerror(errno)); + EPRINTLN("Error opening bootrom variable file " + "\"%s\": %s", varfile, strerror(errno)); goto done; } if (fstat(varfd, &sbuf) < 0) { - fprintf(stderr, - "Could not fstat bootrom variable file \"%s\": %s\n", + EPRINTLN( + "Could not fstat bootrom variable file \"%s\": %s", varfile, strerror(errno)); goto done; } var_size = sbuf.st_size; } if (var_size > BOOTROM_SIZE || (var_size != 0 && var_size < PAGE_SIZE)) { - fprintf(stderr, "Invalid bootrom variable size %ld\n", + EPRINTLN("Invalid bootrom variable size %ld", var_size); goto done; } total_size = rom_size + var_size; if (total_size > BOOTROM_SIZE) { - fprintf(stderr, "Invalid bootrom and variable aggregate size " - "%ld\n", total_size); + EPRINTLN("Invalid bootrom and variable aggregate size %ld", + total_size); goto done; } /* Map the bootrom into the guest address space */ if (bootrom_alloc(ctx, rom_size, PROT_READ | PROT_EXEC, BOOTROM_ALLOC_TOP, &ptr, NULL) != 0) { goto done; } /* Read 'romfile' into the guest address space */ for (i = 0; i < rom_size / PAGE_SIZE; i++) { rlen = read(fd, ptr + i * PAGE_SIZE, PAGE_SIZE); if (rlen != PAGE_SIZE) { EPRINTLN("Incomplete read of page %d of bootrom " "file %s: %ld bytes", i, romfile, rlen); goto done; } } if (varfd >= 0) { var.mmap = mmap(NULL, var_size, PROT_READ | PROT_WRITE, MAP_SHARED, varfd, 0); if (var.mmap == MAP_FAILED) goto done; var.size = var_size; var.gpa = (gpa_alloctop - var_size) + 1; gpa_alloctop = var.gpa - 1; rv = register_mem(&(struct mem_range){ .name = "bootrom variable", .flags = MEM_F_RW, .handler = bootrom_var_mem_handler, .base = var.gpa, .size = var.size, }); if (rv != 0) goto done; } rv = 0; done: if (varfd >= 0) close(varfd); if (fd >= 0) close(fd); free(romfile); return (rv); } diff --git a/usr.sbin/bhyve/gdb.c b/usr.sbin/bhyve/gdb.c index dcd733c484ca..55178677f837 100644 --- a/usr.sbin/bhyve/gdb.c +++ b/usr.sbin/bhyve/gdb.c @@ -1,1920 +1,1921 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2017-2018 John H. 
Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" +#include "debug.h" #include "gdb.h" #include "mem.h" #include "mevent.h" /* * GDB_SIGNAL_* numbers are part of the GDB remote protocol. Most stops * use SIGTRAP. */ #define GDB_SIGNAL_TRAP 5 static void gdb_resume_vcpus(void); static void check_command(int fd); static struct mevent *read_event, *write_event; static cpuset_t vcpus_active, vcpus_suspended, vcpus_waiting; static pthread_mutex_t gdb_lock; static pthread_cond_t idle_vcpus; static bool first_stop, report_next_stop, swbreak_enabled; /* * An I/O buffer contains 'capacity' bytes of room at 'data'. For a * read buffer, 'start' is unused and 'len' contains the number of * valid bytes in the buffer. For a write buffer, 'start' is set to * the index of the next byte in 'data' to send, and 'len' contains * the remaining number of valid bytes to send. */ struct io_buffer { uint8_t *data; size_t capacity; size_t start; size_t len; }; struct breakpoint { uint64_t gpa; uint8_t shadow_inst; TAILQ_ENTRY(breakpoint) link; }; /* * When a vCPU stops to due to an event that should be reported to the * debugger, information about the event is stored in this structure. * The vCPU thread then sets 'stopped_vcpu' if it is not already set * and stops other vCPUs so the event can be reported. The * report_stop() function reports the event for the 'stopped_vcpu' * vCPU. When the debugger resumes execution via continue or step, * the event for 'stopped_vcpu' is cleared. vCPUs will loop in their * event handlers until the associated event is reported or disabled. * * An idle vCPU will have all of the boolean fields set to false. * * When a vCPU is stepped, 'stepping' is set to true when the vCPU is * released to execute the stepped instruction. When the vCPU reports * the stepping trap, 'stepped' is set. * * When a vCPU hits a breakpoint set by the debug server, * 'hit_swbreak' is set to true. 
*/ struct vcpu_state { bool stepping; bool stepped; bool hit_swbreak; }; static struct io_buffer cur_comm, cur_resp; static uint8_t cur_csum; static struct vmctx *ctx; static int cur_fd = -1; static TAILQ_HEAD(, breakpoint) breakpoints; static struct vcpu_state *vcpu_state; static struct vcpu **vcpus; static int cur_vcpu, stopped_vcpu; static bool gdb_active = false; static const int gdb_regset[] = { VM_REG_GUEST_RAX, VM_REG_GUEST_RBX, VM_REG_GUEST_RCX, VM_REG_GUEST_RDX, VM_REG_GUEST_RSI, VM_REG_GUEST_RDI, VM_REG_GUEST_RBP, VM_REG_GUEST_RSP, VM_REG_GUEST_R8, VM_REG_GUEST_R9, VM_REG_GUEST_R10, VM_REG_GUEST_R11, VM_REG_GUEST_R12, VM_REG_GUEST_R13, VM_REG_GUEST_R14, VM_REG_GUEST_R15, VM_REG_GUEST_RIP, VM_REG_GUEST_RFLAGS, VM_REG_GUEST_CS, VM_REG_GUEST_SS, VM_REG_GUEST_DS, VM_REG_GUEST_ES, VM_REG_GUEST_FS, VM_REG_GUEST_GS }; static const int gdb_regsize[] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4 }; #ifdef GDB_LOG #include #include static void __printflike(1, 2) debug(const char *fmt, ...) { static FILE *logfile; va_list ap; if (logfile == NULL) { logfile = fopen("/tmp/bhyve_gdb.log", "w"); if (logfile == NULL) return; #ifndef WITHOUT_CAPSICUM if (caph_limit_stream(fileno(logfile), CAPH_WRITE) == -1) { fclose(logfile); logfile = NULL; return; } #endif setlinebuf(logfile); } va_start(ap, fmt); vfprintf(logfile, fmt, ap); va_end(ap); } #else #define debug(...) #endif static void remove_all_sw_breakpoints(void); static int guest_paging_info(struct vcpu *vcpu, struct vm_guest_paging *paging) { uint64_t regs[4]; const int regset[4] = { VM_REG_GUEST_CR0, VM_REG_GUEST_CR3, VM_REG_GUEST_CR4, VM_REG_GUEST_EFER }; if (vm_get_register_set(vcpu, nitems(regset), regset, regs) == -1) return (-1); /* * For the debugger, always pretend to be the kernel (CPL 0), * and if long-mode is enabled, always parse addresses as if * in 64-bit mode. */ paging->cr3 = regs[1]; paging->cpl = 0; if (regs[3] & EFER_LMA) paging->cpu_mode = CPU_MODE_64BIT; else if (regs[0] & CR0_PE) paging->cpu_mode = CPU_MODE_PROTECTED; else paging->cpu_mode = CPU_MODE_REAL; if (!(regs[0] & CR0_PG)) paging->paging_mode = PAGING_MODE_FLAT; else if (!(regs[2] & CR4_PAE)) paging->paging_mode = PAGING_MODE_32; else if (regs[3] & EFER_LME) paging->paging_mode = (regs[2] & CR4_LA57) ? PAGING_MODE_64_LA57 : PAGING_MODE_64; else paging->paging_mode = PAGING_MODE_PAE; return (0); } /* * Map a guest virtual address to a physical address (for a given vcpu). * If a guest virtual address is valid, return 1. If the address is * not valid, return 0. If an error occurs obtaining the mapping, * return -1. */ static int guest_vaddr2paddr(struct vcpu *vcpu, uint64_t vaddr, uint64_t *paddr) { struct vm_guest_paging paging; int fault; if (guest_paging_info(vcpu, &paging) == -1) return (-1); /* * Always use PROT_READ. We really care if the VA is * accessible, not if the current vCPU can write. */ if (vm_gla2gpa_nofault(vcpu, &paging, vaddr, PROT_READ, paddr, &fault) == -1) return (-1); if (fault) return (0); return (1); } static void io_buffer_reset(struct io_buffer *io) { io->start = 0; io->len = 0; } /* Available room for adding data. 
*/ static size_t io_buffer_avail(struct io_buffer *io) { return (io->capacity - (io->start + io->len)); } static uint8_t * io_buffer_head(struct io_buffer *io) { return (io->data + io->start); } static uint8_t * io_buffer_tail(struct io_buffer *io) { return (io->data + io->start + io->len); } static void io_buffer_advance(struct io_buffer *io, size_t amount) { assert(amount <= io->len); io->start += amount; io->len -= amount; } static void io_buffer_consume(struct io_buffer *io, size_t amount) { io_buffer_advance(io, amount); if (io->len == 0) { io->start = 0; return; } /* * XXX: Consider making this move optional and compacting on a * future read() before realloc(). */ memmove(io->data, io_buffer_head(io), io->len); io->start = 0; } static void io_buffer_grow(struct io_buffer *io, size_t newsize) { uint8_t *new_data; size_t avail, new_cap; avail = io_buffer_avail(io); if (newsize <= avail) return; new_cap = io->capacity + (newsize - avail); new_data = realloc(io->data, new_cap); if (new_data == NULL) err(1, "Failed to grow GDB I/O buffer"); io->data = new_data; io->capacity = new_cap; } static bool response_pending(void) { if (cur_resp.start == 0 && cur_resp.len == 0) return (false); if (cur_resp.start + cur_resp.len == 1 && cur_resp.data[0] == '+') return (false); return (true); } static void close_connection(void) { /* * XXX: This triggers a warning because mevent does the close * before the EV_DELETE. */ pthread_mutex_lock(&gdb_lock); mevent_delete(write_event); mevent_delete_close(read_event); write_event = NULL; read_event = NULL; io_buffer_reset(&cur_comm); io_buffer_reset(&cur_resp); cur_fd = -1; remove_all_sw_breakpoints(); /* Clear any pending events. */ memset(vcpu_state, 0, guest_ncpus * sizeof(*vcpu_state)); /* Resume any stopped vCPUs. */ gdb_resume_vcpus(); pthread_mutex_unlock(&gdb_lock); } static uint8_t hex_digit(uint8_t nibble) { if (nibble <= 9) return (nibble + '0'); else return (nibble + 'a' - 10); } static uint8_t parse_digit(uint8_t v) { if (v >= '0' && v <= '9') return (v - '0'); if (v >= 'a' && v <= 'f') return (v - 'a' + 10); if (v >= 'A' && v <= 'F') return (v - 'A' + 10); return (0xF); } /* Parses big-endian hexadecimal. */ static uintmax_t parse_integer(const uint8_t *p, size_t len) { uintmax_t v; v = 0; while (len > 0) { v <<= 4; v |= parse_digit(*p); p++; len--; } return (v); } static uint8_t parse_byte(const uint8_t *p) { return (parse_digit(p[0]) << 4 | parse_digit(p[1])); } static void send_pending_data(int fd) { ssize_t nwritten; if (cur_resp.len == 0) { mevent_disable(write_event); return; } nwritten = write(fd, io_buffer_head(&cur_resp), cur_resp.len); if (nwritten == -1) { warn("Write to GDB socket failed"); close_connection(); } else { io_buffer_advance(&cur_resp, nwritten); if (cur_resp.len == 0) mevent_disable(write_event); else mevent_enable(write_event); } } /* Append a single character to the output buffer. */ static void send_char(uint8_t data) { io_buffer_grow(&cur_resp, 1); *io_buffer_tail(&cur_resp) = data; cur_resp.len++; } /* Append an array of bytes to the output buffer. */ static void send_data(const uint8_t *data, size_t len) { io_buffer_grow(&cur_resp, len); memcpy(io_buffer_tail(&cur_resp), data, len); cur_resp.len += len; } static void format_byte(uint8_t v, uint8_t *buf) { buf[0] = hex_digit(v >> 4); buf[1] = hex_digit(v & 0xf); } /* * Append a single byte (formatted as two hex characters) to the * output buffer. 
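 * e.g. send_byte(0x2a) appends the two characters '2' and 'a'.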
*/ static void send_byte(uint8_t v) { uint8_t buf[2]; format_byte(v, buf); send_data(buf, sizeof(buf)); } static void start_packet(void) { send_char('$'); cur_csum = 0; } static void finish_packet(void) { send_char('#'); send_byte(cur_csum); debug("-> %.*s\n", (int)cur_resp.len, io_buffer_head(&cur_resp)); } /* * Append a single character (for the packet payload) and update the * checksum. */ static void append_char(uint8_t v) { send_char(v); cur_csum += v; } /* * Append an array of bytes (for the packet payload) and update the * checksum. */ static void append_packet_data(const uint8_t *data, size_t len) { send_data(data, len); while (len > 0) { cur_csum += *data; data++; len--; } } static void append_string(const char *str) { append_packet_data(str, strlen(str)); } static void append_byte(uint8_t v) { uint8_t buf[2]; format_byte(v, buf); append_packet_data(buf, sizeof(buf)); } static void append_unsigned_native(uintmax_t value, size_t len) { size_t i; for (i = 0; i < len; i++) { append_byte(value); value >>= 8; } } static void append_unsigned_be(uintmax_t value, size_t len) { char buf[len * 2]; size_t i; for (i = 0; i < len; i++) { format_byte(value, buf + (len - i - 1) * 2); value >>= 8; } append_packet_data(buf, sizeof(buf)); } static void append_integer(unsigned int value) { if (value == 0) append_char('0'); else append_unsigned_be(value, (fls(value) + 7) / 8); } static void append_asciihex(const char *str) { while (*str != '\0') { append_byte(*str); str++; } } static void send_empty_response(void) { start_packet(); finish_packet(); } static void send_error(int error) { start_packet(); append_char('E'); append_byte(error); finish_packet(); } static void send_ok(void) { start_packet(); append_string("OK"); finish_packet(); } static int parse_threadid(const uint8_t *data, size_t len) { if (len == 1 && *data == '0') return (0); if (len == 2 && memcmp(data, "-1", 2) == 0) return (-1); if (len == 0) return (-2); return (parse_integer(data, len)); } /* * Report the current stop event to the debugger. If the stop is due * to an event triggered on a specific vCPU such as a breakpoint or * stepping trap, stopped_vcpu will be set to the vCPU triggering the * stop. If 'set_cur_vcpu' is true, then cur_vcpu will be updated to * the reporting vCPU for vCPU events. */ static void report_stop(bool set_cur_vcpu) { struct vcpu_state *vs; start_packet(); if (stopped_vcpu == -1) { append_char('S'); append_byte(GDB_SIGNAL_TRAP); } else { vs = &vcpu_state[stopped_vcpu]; if (set_cur_vcpu) cur_vcpu = stopped_vcpu; append_char('T'); append_byte(GDB_SIGNAL_TRAP); append_string("thread:"); append_integer(stopped_vcpu + 1); append_char(';'); if (vs->hit_swbreak) { debug("$vCPU %d reporting swbreak\n", stopped_vcpu); if (swbreak_enabled) append_string("swbreak:;"); } else if (vs->stepped) debug("$vCPU %d reporting step\n", stopped_vcpu); else debug("$vCPU %d reporting ???\n", stopped_vcpu); } finish_packet(); report_next_stop = false; } /* * If this stop is due to a vCPU event, clear that event to mark it as * acknowledged. 
*/ static void discard_stop(void) { struct vcpu_state *vs; if (stopped_vcpu != -1) { vs = &vcpu_state[stopped_vcpu]; vs->hit_swbreak = false; vs->stepped = false; stopped_vcpu = -1; } report_next_stop = true; } static void gdb_finish_suspend_vcpus(void) { if (first_stop) { first_stop = false; stopped_vcpu = -1; } else if (report_next_stop) { assert(!response_pending()); report_stop(true); send_pending_data(cur_fd); } } /* * vCPU threads invoke this function whenever the vCPU enters the * debug server to pause or report an event. vCPU threads wait here * as long as the debug server keeps them suspended. */ static void _gdb_cpu_suspend(struct vcpu *vcpu, bool report_stop) { int vcpuid = vcpu_id(vcpu); debug("$vCPU %d suspending\n", vcpuid); CPU_SET(vcpuid, &vcpus_waiting); if (report_stop && CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) gdb_finish_suspend_vcpus(); while (CPU_ISSET(vcpuid, &vcpus_suspended)) pthread_cond_wait(&idle_vcpus, &gdb_lock); CPU_CLR(vcpuid, &vcpus_waiting); debug("$vCPU %d resuming\n", vcpuid); } /* * Invoked at the start of a vCPU thread's execution to inform the * debug server about the new thread. */ void gdb_cpu_add(struct vcpu *vcpu) { int vcpuid; if (!gdb_active) return; vcpuid = vcpu_id(vcpu); debug("$vCPU %d starting\n", vcpuid); pthread_mutex_lock(&gdb_lock); assert(vcpuid < guest_ncpus); assert(vcpus[vcpuid] == NULL); vcpus[vcpuid] = vcpu; CPU_SET(vcpuid, &vcpus_active); if (!TAILQ_EMPTY(&breakpoints)) { vm_set_capability(vcpu, VM_CAP_BPT_EXIT, 1); debug("$vCPU %d enabled breakpoint exits\n", vcpu); } /* * If a vcpu is added while vcpus are stopped, suspend the new * vcpu so that it will pop back out with a debug exit before * executing the first instruction. */ if (!CPU_EMPTY(&vcpus_suspended)) { CPU_SET(vcpuid, &vcpus_suspended); _gdb_cpu_suspend(vcpu, false); } pthread_mutex_unlock(&gdb_lock); } /* * Invoked by vCPU before resuming execution. This enables stepping * if the vCPU is marked as stepping. */ static void gdb_cpu_resume(struct vcpu *vcpu) { struct vcpu_state *vs; int error; vs = &vcpu_state[vcpu_id(vcpu)]; /* * Any pending event should already be reported before * resuming. */ assert(vs->hit_swbreak == false); assert(vs->stepped == false); if (vs->stepping) { error = vm_set_capability(vcpu, VM_CAP_MTRAP_EXIT, 1); assert(error == 0); error = vm_set_capability(vcpu, VM_CAP_MASK_HWINTR, 1); assert(error == 0); } } /* * Handler for VM_EXITCODE_DEBUG used to suspend a vCPU when the guest * has been suspended due to an event on different vCPU or in response * to a guest-wide suspend such as Ctrl-C or the stop on attach. */ void gdb_cpu_suspend(struct vcpu *vcpu) { if (!gdb_active) return; pthread_mutex_lock(&gdb_lock); _gdb_cpu_suspend(vcpu, true); gdb_cpu_resume(vcpu); pthread_mutex_unlock(&gdb_lock); } static void gdb_suspend_vcpus(void) { assert(pthread_mutex_isowned_np(&gdb_lock)); debug("suspending all CPUs\n"); vcpus_suspended = vcpus_active; vm_suspend_all_cpus(ctx); if (CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) gdb_finish_suspend_vcpus(); } /* * Handler for VM_EXITCODE_MTRAP reported when a vCPU single-steps via * the VT-x-specific MTRAP exit. 
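 * (Stepping is armed in gdb_cpu_resume() via VM_CAP_MTRAP_EXIT and
 * VM_CAP_MASK_HWINTR; both capabilities are cleared again here once
 * the stepping trap has been taken.)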
*/ void gdb_cpu_mtrap(struct vcpu *vcpu) { struct vcpu_state *vs; int vcpuid; if (!gdb_active) return; vcpuid = vcpu_id(vcpu); debug("$vCPU %d MTRAP\n", vcpuid); pthread_mutex_lock(&gdb_lock); vs = &vcpu_state[vcpuid]; if (vs->stepping) { vs->stepping = false; vs->stepped = true; vm_set_capability(vcpu, VM_CAP_MTRAP_EXIT, 0); vm_set_capability(vcpu, VM_CAP_MASK_HWINTR, 0); while (vs->stepped) { if (stopped_vcpu == -1) { debug("$vCPU %d reporting step\n", vcpuid); stopped_vcpu = vcpuid; gdb_suspend_vcpus(); } _gdb_cpu_suspend(vcpu, true); } gdb_cpu_resume(vcpu); } pthread_mutex_unlock(&gdb_lock); } static struct breakpoint * find_breakpoint(uint64_t gpa) { struct breakpoint *bp; TAILQ_FOREACH(bp, &breakpoints, link) { if (bp->gpa == gpa) return (bp); } return (NULL); } void gdb_cpu_breakpoint(struct vcpu *vcpu, struct vm_exit *vmexit) { struct breakpoint *bp; struct vcpu_state *vs; uint64_t gpa; int error, vcpuid; if (!gdb_active) { - fprintf(stderr, "vm_loop: unexpected VMEXIT_DEBUG\n"); + EPRINTLN("vm_loop: unexpected VMEXIT_DEBUG"); exit(4); } vcpuid = vcpu_id(vcpu); pthread_mutex_lock(&gdb_lock); error = guest_vaddr2paddr(vcpu, vmexit->rip, &gpa); assert(error == 1); bp = find_breakpoint(gpa); if (bp != NULL) { vs = &vcpu_state[vcpuid]; assert(vs->stepping == false); assert(vs->stepped == false); assert(vs->hit_swbreak == false); vs->hit_swbreak = true; vm_set_register(vcpu, VM_REG_GUEST_RIP, vmexit->rip); for (;;) { if (stopped_vcpu == -1) { debug("$vCPU %d reporting breakpoint at rip %#lx\n", vcpuid, vmexit->rip); stopped_vcpu = vcpuid; gdb_suspend_vcpus(); } _gdb_cpu_suspend(vcpu, true); if (!vs->hit_swbreak) { /* Breakpoint reported. */ break; } bp = find_breakpoint(gpa); if (bp == NULL) { /* Breakpoint was removed. */ vs->hit_swbreak = false; break; } } gdb_cpu_resume(vcpu); } else { debug("$vCPU %d injecting breakpoint at rip %#lx\n", vcpuid, vmexit->rip); error = vm_set_register(vcpu, VM_REG_GUEST_ENTRY_INST_LENGTH, vmexit->u.bpt.inst_length); assert(error == 0); error = vm_inject_exception(vcpu, IDT_BP, 0, 0, 0); assert(error == 0); } pthread_mutex_unlock(&gdb_lock); } static bool gdb_step_vcpu(struct vcpu *vcpu) { int error, val, vcpuid; vcpuid = vcpu_id(vcpu); debug("$vCPU %d step\n", vcpuid); error = vm_get_capability(vcpu, VM_CAP_MTRAP_EXIT, &val); if (error < 0) return (false); discard_stop(); vcpu_state[vcpuid].stepping = true; vm_resume_cpu(vcpu); CPU_CLR(vcpuid, &vcpus_suspended); pthread_cond_broadcast(&idle_vcpus); return (true); } static void gdb_resume_vcpus(void) { assert(pthread_mutex_isowned_np(&gdb_lock)); vm_resume_all_cpus(ctx); debug("resuming all CPUs\n"); CPU_ZERO(&vcpus_suspended); pthread_cond_broadcast(&idle_vcpus); } static void gdb_read_regs(void) { uint64_t regvals[nitems(gdb_regset)]; if (vm_get_register_set(vcpus[cur_vcpu], nitems(gdb_regset), gdb_regset, regvals) == -1) { send_error(errno); return; } start_packet(); for (size_t i = 0; i < nitems(regvals); i++) append_unsigned_native(regvals[i], gdb_regsize[i]); finish_packet(); } static void gdb_read_mem(const uint8_t *data, size_t len) { uint64_t gpa, gva, val; uint8_t *cp; size_t resid, todo, bytes; bool started; int error; /* Skip 'm' */ data += 1; len -= 1; /* Parse and consume address. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } gva = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse length. 
*/ resid = parse_integer(data, len); started = false; while (resid > 0) { error = guest_vaddr2paddr(vcpus[cur_vcpu], gva, &gpa); if (error == -1) { if (started) finish_packet(); else send_error(errno); return; } if (error == 0) { if (started) finish_packet(); else send_error(EFAULT); return; } /* Read bytes from current page. */ todo = getpagesize() - gpa % getpagesize(); if (todo > resid) todo = resid; cp = paddr_guest2host(ctx, gpa, todo); if (cp != NULL) { /* * If this page is guest RAM, read it a byte * at a time. */ if (!started) { start_packet(); started = true; } while (todo > 0) { append_byte(*cp); cp++; gpa++; gva++; resid--; todo--; } } else { /* * If this page isn't guest RAM, try to handle * it via MMIO. For MMIO requests, use * aligned reads of words when possible. */ while (todo > 0) { if (gpa & 1 || todo == 1) bytes = 1; else if (gpa & 2 || todo == 2) bytes = 2; else bytes = 4; error = read_mem(vcpus[cur_vcpu], gpa, &val, bytes); if (error == 0) { if (!started) { start_packet(); started = true; } gpa += bytes; gva += bytes; resid -= bytes; todo -= bytes; while (bytes > 0) { append_byte(val); val >>= 8; bytes--; } } else { if (started) finish_packet(); else send_error(EFAULT); return; } } } assert(resid == 0 || gpa % getpagesize() == 0); } if (!started) start_packet(); finish_packet(); } static void gdb_write_mem(const uint8_t *data, size_t len) { uint64_t gpa, gva, val; uint8_t *cp; size_t resid, todo, bytes; int error; /* Skip 'M' */ data += 1; len -= 1; /* Parse and consume address. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } gva = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse and consume length. */ cp = memchr(data, ':', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } resid = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Verify the available bytes match the length. */ if (len != resid * 2) { send_error(EINVAL); return; } while (resid > 0) { error = guest_vaddr2paddr(vcpus[cur_vcpu], gva, &gpa); if (error == -1) { send_error(errno); return; } if (error == 0) { send_error(EFAULT); return; } /* Write bytes to current page. */ todo = getpagesize() - gpa % getpagesize(); if (todo > resid) todo = resid; cp = paddr_guest2host(ctx, gpa, todo); if (cp != NULL) { /* * If this page is guest RAM, write it a byte * at a time. */ while (todo > 0) { assert(len >= 2); *cp = parse_byte(data); data += 2; len -= 2; cp++; gpa++; gva++; resid--; todo--; } } else { /* * If this page isn't guest RAM, try to handle * it via MMIO. For MMIO requests, use * aligned writes of words when possible. */ while (todo > 0) { if (gpa & 1 || todo == 1) { bytes = 1; val = parse_byte(data); } else if (gpa & 2 || todo == 2) { bytes = 2; val = be16toh(parse_integer(data, 4)); } else { bytes = 4; val = be32toh(parse_integer(data, 8)); } error = write_mem(vcpus[cur_vcpu], gpa, val, bytes); if (error == 0) { gpa += bytes; gva += bytes; resid -= bytes; todo -= bytes; data += 2 * bytes; len -= 2 * bytes; } else { send_error(EFAULT); return; } } } assert(resid == 0 || gpa % getpagesize() == 0); } assert(len == 0); send_ok(); } static bool set_breakpoint_caps(bool enable) { cpuset_t mask; int vcpu; mask = vcpus_active; while (!CPU_EMPTY(&mask)) { vcpu = CPU_FFS(&mask) - 1; CPU_CLR(vcpu, &mask); if (vm_set_capability(vcpus[vcpu], VM_CAP_BPT_EXIT, enable ? 1 : 0) < 0) return (false); debug("$vCPU %d %sabled breakpoint exits\n", vcpu, enable ? 
"en" : "dis"); } return (true); } static void remove_all_sw_breakpoints(void) { struct breakpoint *bp, *nbp; uint8_t *cp; if (TAILQ_EMPTY(&breakpoints)) return; TAILQ_FOREACH_SAFE(bp, &breakpoints, link, nbp) { debug("remove breakpoint at %#lx\n", bp->gpa); cp = paddr_guest2host(ctx, bp->gpa, 1); *cp = bp->shadow_inst; TAILQ_REMOVE(&breakpoints, bp, link); free(bp); } TAILQ_INIT(&breakpoints); set_breakpoint_caps(false); } static void update_sw_breakpoint(uint64_t gva, int kind, bool insert) { struct breakpoint *bp; uint64_t gpa; uint8_t *cp; int error; if (kind != 1) { send_error(EINVAL); return; } error = guest_vaddr2paddr(vcpus[cur_vcpu], gva, &gpa); if (error == -1) { send_error(errno); return; } if (error == 0) { send_error(EFAULT); return; } cp = paddr_guest2host(ctx, gpa, 1); /* Only permit breakpoints in guest RAM. */ if (cp == NULL) { send_error(EFAULT); return; } /* Find any existing breakpoint. */ bp = find_breakpoint(gpa); /* * Silently ignore duplicate commands since the protocol * requires these packets to be idempotent. */ if (insert) { if (bp == NULL) { if (TAILQ_EMPTY(&breakpoints) && !set_breakpoint_caps(true)) { send_empty_response(); return; } bp = malloc(sizeof(*bp)); bp->gpa = gpa; bp->shadow_inst = *cp; *cp = 0xcc; /* INT 3 */ TAILQ_INSERT_TAIL(&breakpoints, bp, link); debug("new breakpoint at %#lx\n", gpa); } } else { if (bp != NULL) { debug("remove breakpoint at %#lx\n", gpa); *cp = bp->shadow_inst; TAILQ_REMOVE(&breakpoints, bp, link); free(bp); if (TAILQ_EMPTY(&breakpoints)) set_breakpoint_caps(false); } } send_ok(); } static void parse_breakpoint(const uint8_t *data, size_t len) { uint64_t gva; uint8_t *cp; bool insert; int kind, type; insert = data[0] == 'Z'; /* Skip 'Z/z' */ data += 1; len -= 1; /* Parse and consume type. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } type = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse and consume address. */ cp = memchr(data, ',', len); if (cp == NULL || cp == data) { send_error(EINVAL); return; } gva = parse_integer(data, cp - data); len -= (cp - data) + 1; data += (cp - data) + 1; /* Parse and consume kind. */ cp = memchr(data, ';', len); if (cp == data) { send_error(EINVAL); return; } if (cp != NULL) { /* * We do not advertise support for either the * ConditionalBreakpoints or BreakpointCommands * features, so we should not be getting conditions or * commands from the remote end. */ send_empty_response(); return; } kind = parse_integer(data, len); data += len; len = 0; switch (type) { case 0: update_sw_breakpoint(gva, kind, insert); break; default: send_empty_response(); break; } } static bool command_equals(const uint8_t *data, size_t len, const char *cmd) { if (strlen(cmd) > len) return (false); return (memcmp(data, cmd, strlen(cmd)) == 0); } static void check_features(const uint8_t *data, size_t len) { char *feature, *next_feature, *str, *value; bool supported; str = malloc(len + 1); memcpy(str, data, len); str[len] = '\0'; next_feature = str; while ((feature = strsep(&next_feature, ";")) != NULL) { /* * Null features shouldn't exist, but skip if they * do. */ if (strcmp(feature, "") == 0) continue; /* * Look for the value or supported / not supported * flag. 
*/ value = strchr(feature, '='); if (value != NULL) { *value = '\0'; value++; supported = true; } else { value = feature + strlen(feature) - 1; switch (*value) { case '+': supported = true; break; case '-': supported = false; break; default: /* * This is really a protocol error, * but we just ignore malformed * features for ease of * implementation. */ continue; } value = NULL; } if (strcmp(feature, "swbreak") == 0) swbreak_enabled = supported; } free(str); start_packet(); /* This is an arbitrary limit. */ append_string("PacketSize=4096"); append_string(";swbreak+"); finish_packet(); } static void gdb_query(const uint8_t *data, size_t len) { /* * TODO: * - qSearch */ if (command_equals(data, len, "qAttached")) { start_packet(); append_char('1'); finish_packet(); } else if (command_equals(data, len, "qC")) { start_packet(); append_string("QC"); append_integer(cur_vcpu + 1); finish_packet(); } else if (command_equals(data, len, "qfThreadInfo")) { cpuset_t mask; bool first; int vcpu; if (CPU_EMPTY(&vcpus_active)) { send_error(EINVAL); return; } mask = vcpus_active; start_packet(); append_char('m'); first = true; while (!CPU_EMPTY(&mask)) { vcpu = CPU_FFS(&mask) - 1; CPU_CLR(vcpu, &mask); if (first) first = false; else append_char(','); append_integer(vcpu + 1); } finish_packet(); } else if (command_equals(data, len, "qsThreadInfo")) { start_packet(); append_char('l'); finish_packet(); } else if (command_equals(data, len, "qSupported")) { data += strlen("qSupported"); len -= strlen("qSupported"); check_features(data, len); } else if (command_equals(data, len, "qThreadExtraInfo")) { char buf[16]; int tid; data += strlen("qThreadExtraInfo"); len -= strlen("qThreadExtraInfo"); if (*data != ',') { send_error(EINVAL); return; } tid = parse_threadid(data + 1, len - 1); if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { send_error(EINVAL); return; } snprintf(buf, sizeof(buf), "vCPU %d", tid - 1); start_packet(); append_asciihex(buf); finish_packet(); } else send_empty_response(); } static void handle_command(const uint8_t *data, size_t len) { /* Reject packets with a sequence-id. */ if (len >= 3 && data[0] >= '0' && data[0] <= '9' && data[0] >= '0' && data[0] <= '9' && data[2] == ':') { send_empty_response(); return; } switch (*data) { case 'c': if (len != 1) { send_error(EINVAL); break; } discard_stop(); gdb_resume_vcpus(); break; case 'D': send_ok(); /* TODO: Resume any stopped CPUs. */ break; case 'g': { gdb_read_regs(); break; } case 'H': { int tid; if (data[1] != 'g' && data[1] != 'c') { send_error(EINVAL); break; } tid = parse_threadid(data + 2, len - 2); if (tid == -2) { send_error(EINVAL); break; } if (CPU_EMPTY(&vcpus_active)) { send_error(EINVAL); break; } if (tid == -1 || tid == 0) cur_vcpu = CPU_FFS(&vcpus_active) - 1; else if (CPU_ISSET(tid - 1, &vcpus_active)) cur_vcpu = tid - 1; else { send_error(EINVAL); break; } send_ok(); break; } case 'm': gdb_read_mem(data, len); break; case 'M': gdb_write_mem(data, len); break; case 'T': { int tid; tid = parse_threadid(data + 1, len - 1); if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { send_error(EINVAL); return; } send_ok(); break; } case 'q': gdb_query(data, len); break; case 's': if (len != 1) { send_error(EINVAL); break; } /* Don't send a reply until a stop occurs. 
*/ if (!gdb_step_vcpu(vcpus[cur_vcpu])) { send_error(EOPNOTSUPP); break; } break; case 'z': case 'Z': parse_breakpoint(data, len); break; case '?': report_stop(false); break; case 'G': /* TODO */ case 'v': /* Handle 'vCont' */ /* 'vCtrlC' */ case 'p': /* TODO */ case 'P': /* TODO */ case 'Q': /* TODO */ case 't': /* TODO */ case 'X': /* TODO */ default: send_empty_response(); } } /* Check for a valid packet in the command buffer. */ static void check_command(int fd) { uint8_t *head, *hash, *p, sum; size_t avail, plen; for (;;) { avail = cur_comm.len; if (avail == 0) return; head = io_buffer_head(&cur_comm); switch (*head) { case 0x03: debug("<- Ctrl-C\n"); io_buffer_consume(&cur_comm, 1); gdb_suspend_vcpus(); break; case '+': /* ACK of previous response. */ debug("<- +\n"); if (response_pending()) io_buffer_reset(&cur_resp); io_buffer_consume(&cur_comm, 1); if (stopped_vcpu != -1 && report_next_stop) { report_stop(true); send_pending_data(fd); } break; case '-': /* NACK of previous response. */ debug("<- -\n"); if (response_pending()) { cur_resp.len += cur_resp.start; cur_resp.start = 0; if (cur_resp.data[0] == '+') io_buffer_advance(&cur_resp, 1); debug("-> %.*s\n", (int)cur_resp.len, io_buffer_head(&cur_resp)); } io_buffer_consume(&cur_comm, 1); send_pending_data(fd); break; case '$': /* Packet. */ if (response_pending()) { warnx("New GDB command while response in " "progress"); io_buffer_reset(&cur_resp); } /* Is packet complete? */ hash = memchr(head, '#', avail); if (hash == NULL) return; plen = (hash - head + 1) + 2; if (avail < plen) return; debug("<- %.*s\n", (int)plen, head); /* Verify checksum. */ for (sum = 0, p = head + 1; p < hash; p++) sum += *p; if (sum != parse_byte(hash + 1)) { io_buffer_consume(&cur_comm, plen); debug("-> -\n"); send_char('-'); send_pending_data(fd); break; } send_char('+'); handle_command(head + 1, hash - (head + 1)); io_buffer_consume(&cur_comm, plen); if (!response_pending()) debug("-> +\n"); send_pending_data(fd); break; default: /* XXX: Possibly drop connection instead. */ debug("-> %02x\n", *head); io_buffer_consume(&cur_comm, 1); break; } } } static void gdb_readable(int fd, enum ev_type event __unused, void *arg __unused) { size_t pending; ssize_t nread; int n; if (ioctl(fd, FIONREAD, &n) == -1) { warn("FIONREAD on GDB socket"); return; } assert(n >= 0); pending = n; /* * 'pending' might be zero due to EOF. We need to call read * with a non-zero length to detect EOF. */ if (pending == 0) pending = 1; /* Ensure there is room in the command buffer. */ io_buffer_grow(&cur_comm, pending); assert(io_buffer_avail(&cur_comm) >= pending); nread = read(fd, io_buffer_tail(&cur_comm), io_buffer_avail(&cur_comm)); if (nread == 0) { close_connection(); } else if (nread == -1) { if (errno == EAGAIN) return; warn("Read from GDB socket"); close_connection(); } else { cur_comm.len += nread; pthread_mutex_lock(&gdb_lock); check_command(fd); pthread_mutex_unlock(&gdb_lock); } } static void gdb_writable(int fd, enum ev_type event __unused, void *arg __unused) { send_pending_data(fd); } static void new_connection(int fd, enum ev_type event __unused, void *arg) { int optval, s; s = accept4(fd, NULL, NULL, SOCK_NONBLOCK); if (s == -1) { if (arg != NULL) err(1, "Failed accepting initial GDB connection"); /* Silently ignore errors post-startup. 
*/ return; } optval = 1; if (setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, &optval, sizeof(optval)) == -1) { warn("Failed to disable SIGPIPE for GDB connection"); close(s); return; } pthread_mutex_lock(&gdb_lock); if (cur_fd != -1) { close(s); warnx("Ignoring additional GDB connection."); } read_event = mevent_add(s, EVF_READ, gdb_readable, NULL); if (read_event == NULL) { if (arg != NULL) err(1, "Failed to setup initial GDB connection"); pthread_mutex_unlock(&gdb_lock); return; } write_event = mevent_add(s, EVF_WRITE, gdb_writable, NULL); if (write_event == NULL) { if (arg != NULL) err(1, "Failed to setup initial GDB connection"); mevent_delete_close(read_event); read_event = NULL; } cur_fd = s; cur_vcpu = 0; stopped_vcpu = -1; /* Break on attach. */ first_stop = true; report_next_stop = false; gdb_suspend_vcpus(); pthread_mutex_unlock(&gdb_lock); } #ifndef WITHOUT_CAPSICUM static void limit_gdb_socket(int s) { cap_rights_t rights; unsigned long ioctls[] = { FIONREAD }; cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE, CAP_SETSOCKOPT, CAP_IOCTL); if (caph_rights_limit(s, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_ioctls_limit(s, ioctls, nitems(ioctls)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); } #endif void init_gdb(struct vmctx *_ctx) { int error, flags, optval, s; struct addrinfo hints; struct addrinfo *gdbaddr; const char *saddr, *value; char *sport; bool wait; value = get_config_value("gdb.port"); if (value == NULL) return; sport = strdup(value); if (sport == NULL) errx(4, "Failed to allocate memory"); wait = get_config_bool_default("gdb.wait", false); saddr = get_config_value("gdb.address"); if (saddr == NULL) { saddr = "localhost"; } debug("==> starting on %s:%s, %swaiting\n", saddr, sport, wait ? "" : "not "); error = pthread_mutex_init(&gdb_lock, NULL); if (error != 0) errc(1, error, "gdb mutex init"); error = pthread_cond_init(&idle_vcpus, NULL); if (error != 0) errc(1, error, "gdb cv init"); memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_flags = AI_NUMERICSERV | AI_PASSIVE; error = getaddrinfo(saddr, sport, &hints, &gdbaddr); if (error != 0) errx(1, "gdb address resolution: %s", gai_strerror(error)); ctx = _ctx; s = socket(gdbaddr->ai_family, gdbaddr->ai_socktype, 0); if (s < 0) err(1, "gdb socket create"); optval = 1; (void)setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)); if (bind(s, gdbaddr->ai_addr, gdbaddr->ai_addrlen) < 0) err(1, "gdb socket bind"); if (listen(s, 1) < 0) err(1, "gdb socket listen"); stopped_vcpu = -1; TAILQ_INIT(&breakpoints); vcpus = calloc(guest_ncpus, sizeof(*vcpus)); vcpu_state = calloc(guest_ncpus, sizeof(*vcpu_state)); if (wait) { /* * Set vcpu 0 in vcpus_suspended. This will trigger the * logic in gdb_cpu_add() to suspend the first vcpu before * it starts execution. The vcpu will remain suspended * until a debugger connects. 
*/ CPU_SET(0, &vcpus_suspended); stopped_vcpu = 0; } flags = fcntl(s, F_GETFL); if (fcntl(s, F_SETFL, flags | O_NONBLOCK) == -1) err(1, "Failed to mark gdb socket non-blocking"); #ifndef WITHOUT_CAPSICUM limit_gdb_socket(s); #endif mevent_add(s, EVF_READ, new_connection, NULL); gdb_active = true; freeaddrinfo(gdbaddr); free(sport); } diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c index b69c099225db..1eef285a871c 100644 --- a/usr.sbin/bhyve/pci_ahci.c +++ b/usr.sbin/bhyve/pci_ahci.c @@ -1,2741 +1,2740 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 Zhixiang Yu * Copyright (c) 2015-2016 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "ahci.h" #include "block_if.h" #define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ #define MAX_PORTS 32 /* AHCI supports 32 ports */ #define PxSIG_ATA 0x00000101 /* ATA drive */ #define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ enum sata_fis_type { FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ }; /* * SCSI opcodes */ #define TEST_UNIT_READY 0x00 #define REQUEST_SENSE 0x03 #define INQUIRY 0x12 #define START_STOP_UNIT 0x1B #define PREVENT_ALLOW 0x1E #define READ_CAPACITY 0x25 #define READ_10 0x28 #define POSITION_TO_ELEMENT 0x2B #define READ_TOC 0x43 #define GET_EVENT_STATUS_NOTIFICATION 0x4A #define MODE_SENSE_10 0x5A #define REPORT_LUNS 0xA0 #define READ_12 0xA8 #define READ_CD 0xBE /* * SCSI mode page codes */ #define MODEPAGE_RW_ERROR_RECOVERY 0x01 #define MODEPAGE_CD_CAPABILITIES 0x2A /* * ATA commands */ #define ATA_SF_ENAB_SATA_SF 0x10 #define ATA_SATA_SF_AN 0x05 #define ATA_SF_DIS_SATA_SF 0x90 /* * Debug printf */ #ifdef AHCI_DEBUG static FILE *dbg; #define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) #else #define DPRINTF(format, arg...) 
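/* Without AHCI_DEBUG the DPRINTF() macro expands to nothing, so debug output is compiled out entirely. */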
#endif #define AHCI_PORT_IDENT 20 + 1 struct ahci_ioreq { struct blockif_req io_req; struct ahci_port *io_pr; STAILQ_ENTRY(ahci_ioreq) io_flist; TAILQ_ENTRY(ahci_ioreq) io_blist; uint8_t *cfis; uint32_t len; uint32_t done; int slot; int more; int readop; }; struct ahci_port { struct blockif_ctxt *bctx; struct pci_ahci_softc *pr_sc; struct ata_params ata_ident; uint8_t *cmd_lst; uint8_t *rfis; int port; int atapi; int reset; int waitforclear; int mult_sectors; uint8_t xfermode; uint8_t err_cfis[20]; uint8_t sense_key; uint8_t asc; u_int ccs; uint32_t pending; uint32_t clb; uint32_t clbu; uint32_t fb; uint32_t fbu; uint32_t is; uint32_t ie; uint32_t cmd; uint32_t unused0; uint32_t tfd; uint32_t sig; uint32_t ssts; uint32_t sctl; uint32_t serr; uint32_t sact; uint32_t ci; uint32_t sntf; uint32_t fbs; /* * i/o request info */ struct ahci_ioreq *ioreq; int ioqsz; STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; }; struct ahci_cmd_hdr { uint16_t flags; uint16_t prdtl; uint32_t prdbc; uint64_t ctba; uint32_t reserved[4]; }; struct ahci_prdt_entry { uint64_t dba; uint32_t reserved; #define DBCMASK 0x3fffff uint32_t dbc; }; struct pci_ahci_softc { struct pci_devinst *asc_pi; pthread_mutex_t mtx; int ports; uint32_t cap; uint32_t ghc; uint32_t is; uint32_t pi; uint32_t vs; uint32_t ccc_ctl; uint32_t ccc_pts; uint32_t em_loc; uint32_t em_ctl; uint32_t cap2; uint32_t bohc; uint32_t lintr; struct ahci_port port[MAX_PORTS]; }; #define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) static void ahci_handle_port(struct ahci_port *p); static inline void lba_to_msf(uint8_t *buf, int lba) { lba += 150; buf[0] = (lba / 75) / 60; buf[1] = (lba / 75) % 60; buf[2] = lba % 75; } /* * Generate HBA interrupts on global IS register write. */ static void ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask) { struct pci_devinst *pi = sc->asc_pi; struct ahci_port *p; int i, nmsg; uint32_t mmask; /* Update global IS from PxIS/PxIE. */ for (i = 0; i < sc->ports; i++) { p = &sc->port[i]; if (p->is & p->ie) sc->is |= (1 << i); } DPRINTF("%s(%08x) %08x", __func__, mask, sc->is); /* If there is nothing enabled -- clear legacy interrupt and exit. */ if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) { if (sc->lintr) { pci_lintr_deassert(pi); sc->lintr = 0; } return; } /* If there is anything and no MSI -- assert legacy interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (nmsg == 0) { if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } return; } /* Assert respective MSIs for ports that were touched. */ for (i = 0; i < nmsg; i++) { if (sc->ports <= nmsg || i < nmsg - 1) mmask = 1 << i; else mmask = 0xffffffff << i; if (sc->is & mask && mmask & mask) pci_generate_msi(pi, i); } } /* * Generate HBA interrupt on specific port event. */ static void ahci_port_intr(struct ahci_port *p) { struct pci_ahci_softc *sc = p->pr_sc; struct pci_devinst *pi = sc->asc_pi; int nmsg; DPRINTF("%s(%d) %08x/%08x %08x", __func__, p->port, p->is, p->ie, sc->is); /* If there is nothing enabled -- we are done. */ if ((p->is & p->ie) == 0) return; /* In case of non-shared MSI always generate interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (sc->ports <= nmsg || p->port < nmsg - 1) { sc->is |= (1 << p->port); if ((sc->ghc & AHCI_GHC_IE) == 0) return; pci_generate_msi(pi, p->port); return; } /* If IS for this port is already set -- do nothing. */ if (sc->is & (1 << p->port)) return; sc->is |= (1 << p->port); /* If interrupts are enabled -- generate one. 
*/ if ((sc->ghc & AHCI_GHC_IE) == 0) return; if (nmsg > 0) { pci_generate_msi(pi, nmsg - 1); } else if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } } static void ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) { int offset, len, irq; if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) return; switch (ft) { case FIS_TYPE_REGD2H: offset = 0x40; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; break; case FIS_TYPE_SETDEVBITS: offset = 0x58; len = 8; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; break; case FIS_TYPE_PIOSETUP: offset = 0x20; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_PS : 0; break; default: EPRINTLN("unsupported fis type %d", ft); return; } if (fis[2] & ATA_S_ERROR) { p->waitforclear = 1; irq |= AHCI_P_IX_TFE; } memcpy(p->rfis + offset, fis, len); if (irq) { if (~p->is & irq) { p->is |= irq; ahci_port_intr(p); } } } static void ahci_write_fis_piosetup(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_PIOSETUP; ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); } static void ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[8]; uint8_t error; error = (tfd >> 8) & 0xff; tfd &= 0x77; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_SETDEVBITS; fis[1] = (1 << 6); fis[2] = tfd; fis[3] = error; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = slot; p->err_cfis[2] = tfd; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else { *(uint32_t *)(fis + 4) = (1 << slot); p->sact &= ~(1 << slot); } p->tfd &= ~0x77; p->tfd |= tfd; ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); } static void ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[20]; uint8_t error; error = (tfd >> 8) & 0xff; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = (1 << 6); fis[2] = tfd & 0xff; fis[3] = error; fis[4] = cfis[4]; fis[5] = cfis[5]; fis[6] = cfis[6]; fis[7] = cfis[7]; fis[8] = cfis[8]; fis[9] = cfis[9]; fis[10] = cfis[10]; fis[11] = cfis[11]; fis[12] = cfis[12]; fis[13] = cfis[13]; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = 0x80; p->err_cfis[2] = tfd & 0xff; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else p->ci &= ~(1 << slot); p->tfd = tfd; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) { uint8_t fis[20]; p->tfd = ATA_S_READY | ATA_S_DSC; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = 0; /* No interrupt */ fis[2] = p->tfd; /* Status */ fis[3] = 0; /* No error */ p->ci &= ~(1 << slot); ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_reset_fis_d2h(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[3] = 1; fis[4] = 1; if (p->atapi) { fis[5] = 0x14; fis[6] = 0xeb; } fis[12] = 1; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_check_stopped(struct ahci_port *p) { /* * If we are no longer processing the command list and nothing * is in-flight, clear the running bit, the current command * slot, the command issue and active bits. */ if (!(p->cmd & AHCI_P_CMD_ST)) { if (p->pending == 0) { p->ccs = 0; p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); p->ci = 0; p->sact = 0; p->waitforclear = 0; } } } static void ahci_port_stop(struct ahci_port *p) { struct ahci_ioreq *aior; uint8_t *cfis; int slot; int error; assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); TAILQ_FOREACH(aior, &p->iobhd, io_blist) { /* * Try to cancel the outstanding blockif request. 
*/ error = blockif_cancel(p->bctx, &aior->io_req); if (error != 0) continue; slot = aior->slot; cfis = aior->cfis; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) p->sact &= ~(1 << slot); /* NCQ */ else p->ci &= ~(1 << slot); /* * This command is now done. */ p->pending &= ~(1 << slot); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); } ahci_check_stopped(p); } static void ahci_port_reset(struct ahci_port *pr) { pr->serr = 0; pr->sact = 0; pr->xfermode = ATA_UDMA6; pr->mult_sectors = 128; if (!pr->bctx) { pr->ssts = ATA_SS_DET_NO_DEVICE; pr->sig = 0xFFFFFFFF; pr->tfd = 0x7F; return; } pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; if (pr->sctl & ATA_SC_SPD_MASK) pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); else pr->ssts |= ATA_SS_SPD_GEN3; pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; if (!pr->atapi) { pr->sig = PxSIG_ATA; pr->tfd |= ATA_S_READY; } else pr->sig = PxSIG_ATAPI; ahci_write_reset_fis_d2h(pr); } static void ahci_reset(struct pci_ahci_softc *sc) { int i; sc->ghc = AHCI_GHC_AE; sc->is = 0; if (sc->lintr) { pci_lintr_deassert(sc->asc_pi); sc->lintr = 0; } for (i = 0; i < sc->ports; i++) { sc->port[i].ie = 0; sc->port[i].is = 0; sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); if (sc->port[i].bctx) sc->port[i].cmd |= AHCI_P_CMD_CPS; sc->port[i].sctl = 0; ahci_port_reset(&sc->port[i]); } } static void ata_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i ^ 1] = *src++; else dest[i ^ 1] = ' '; } } static void atapi_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i] = *src++; else dest[i] = ' '; } } /* * Build up the iovec based on the PRDT, 'done' and 'len'. */ static void ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, struct ahci_prdt_entry *prdt, uint16_t prdtl) { struct blockif_req *breq = &aior->io_req; uint32_t dbcsz, extra, left, skip, todo; int i, j; assert(aior->len >= aior->done); /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ skip = aior->done; left = aior->len - aior->done; todo = 0; for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; i++, prdt++) { dbcsz = (prdt->dbc & DBCMASK) + 1; /* Skip already done part of the PRDT */ if (dbcsz <= skip) { skip -= dbcsz; continue; } dbcsz -= skip; if (dbcsz > left) dbcsz = left; breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba + skip, dbcsz); breq->br_iov[j].iov_len = dbcsz; todo += dbcsz; left -= dbcsz; skip = 0; j++; } /* If we got limited by IOV length, round I/O down to sector size. 
*/ if (j == BLOCKIF_IOV_MAX) { extra = todo % blockif_sectsz(p->bctx); todo -= extra; assert(todo > 0); while (extra > 0) { if (breq->br_iov[j - 1].iov_len > extra) { breq->br_iov[j - 1].iov_len -= extra; break; } extra -= breq->br_iov[j - 1].iov_len; j--; } } breq->br_iovcnt = j; breq->br_resid = todo; aior->done += todo; aior->more = (aior->done < aior->len && i < prdtl); } static void ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; struct ahci_prdt_entry *prdt; struct ahci_cmd_hdr *hdr; uint64_t lba; uint32_t len; int err, first, ncq, readop; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); ncq = 0; readop = 1; first = (done == 0); if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || cfis[2] == ATA_WRITE_FPDMA_QUEUED) readop = 0; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[11] << 8 | cfis[3]; if (!len) len = 65536; ncq = 1; } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[13] << 8 | cfis[12]; if (!len) len = 65536; } else { lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | (cfis[5] << 8) | cfis[4]; len = cfis[12]; if (!len) len = 256; } lba *= blockif_sectsz(p->bctx); len *= blockif_sectsz(p->bctx); /* Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->readop = readop; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); if (readop) err = blockif_read(p->bctx, breq); else err = blockif_write(p->bctx, breq); assert(err == 0); } static void ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_ioreq *aior; struct blockif_req *breq; int err; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = 0; aior->done = 0; aior->more = 0; breq = &aior->io_req; /* * Mark this command in-flight. 
*/ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_flush(p->bctx, breq); assert(err == 0); } static inline void read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, unsigned int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; uint8_t *to; unsigned int len; int i; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; to = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; unsigned int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(to, ptr, sublen); len -= sublen; to += sublen; prdt++; } } static void ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; uint8_t *entry; uint64_t elba; uint32_t len, elen; int err, first, ncq; uint8_t buf[512]; first = (done == 0); if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { len = (uint16_t)cfis[13] << 8 | cfis[12]; len *= 512; ncq = 0; } else { /* ATA_SEND_FPDMA_QUEUED */ len = (uint16_t)cfis[11] << 8 | cfis[3]; len *= 512; ncq = 1; } read_prdt(p, slot, cfis, buf, sizeof(buf)); next: entry = &buf[done]; elba = ((uint64_t)entry[5] << 40) | ((uint64_t)entry[4] << 32) | ((uint64_t)entry[3] << 24) | ((uint64_t)entry[2] << 16) | ((uint64_t)entry[1] << 8) | entry[0]; elen = (uint16_t)entry[7] << 8 | entry[6]; done += 8; if (elen == 0) { if (done >= len) { if (ncq) { if (first) ahci_write_fis_d2h_ncq(p, slot); ahci_write_fis_sdb(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } else { ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } p->pending &= ~(1 << slot); ahci_check_stopped(p); if (!first) ahci_handle_port(p); return; } goto next; } /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->more = (len != done); breq = &aior->io_req; breq->br_offset = elba * blockif_sectsz(p->bctx); breq->br_resid = elen * blockif_sectsz(p->bctx); /* * Mark this command in-flight. 
*/ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); err = blockif_delete(p->bctx, breq); assert(err == 0); } static inline void write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, unsigned int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; uint8_t *from; unsigned int len; int i; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; from = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(ptr, from, sublen); len -= sublen; from += sublen; prdt++; } hdr->prdbc = size - len; } static void ahci_checksum(uint8_t *buf, int size) { int i; uint8_t sum = 0; for (i = 0; i < size - 1; i++) sum += buf[i]; buf[size - 1] = 0x100 - sum; } static void ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; uint32_t buf[128]; uint8_t *buf8 = (uint8_t *)buf; uint16_t *buf16 = (uint16_t *)buf; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 || cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } memset(buf, 0, sizeof(buf)); if (cfis[4] == 0x00) { /* Log directory */ buf16[0x00] = 1; /* Version -- 1 */ buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */ buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */ } else if (cfis[4] == 0x10) { /* NCQ Command Error Log */ memcpy(buf8, p->err_cfis, sizeof(p->err_cfis)); ahci_checksum(buf8, sizeof(buf)); } else if (cfis[4] == 0x13) { /* SATA NCQ Send and Receive Log */ if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) { buf[0x00] = 1; /* SFQ DSM supported */ buf[0x01] = 1; /* SFQ DSM TRIM supported */ } } else { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } if (cfis[2] == ATA_READ_LOG_EXT) ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } static void handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void*)&p->ata_ident, sizeof(struct ata_params)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void ata_identify_init(struct ahci_port* p, int atapi) { struct ata_params* ata_ident = &p->ata_ident; if (atapi) { ata_ident->config = ATA_PROTO_ATAPI | ATA_ATAPI_TYPE_CDROM | ATA_ATAPI_REMOVABLE | ATA_DRQ_FAST; ata_ident->capabilities1 = ATA_SUPPORT_LBA | ATA_SUPPORT_DMA; ata_ident->capabilities2 = (1 << 14 | 1); ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88; ata_ident->obsolete62 = 0x3f; ata_ident->mwdmamodes = 7; if (p->xfermode & ATA_WDMA0) ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->apiomodes = 3; ata_ident->mwdmamin = 0x0078; ata_ident->mwdmarec = 0x0078; ata_ident->pioblind = 0x0078; ata_ident->pioiordy = 0x0078; ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); 
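	/*
	 * Report the current negotiated SATA link speed: PxSSTS.SPD
	 * (bits 7:4) is shifted down into the negotiated-speed field of
	 * IDENTIFY word 77 (satacapabilities2).
	 */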
ata_ident->satacapabilities2 = ((p->ssts & ATA_SS_SPD_MASK) >> 3); ata_ident->satasupport = ATA_SUPPORT_NCQ_STREAM; ata_ident->version_major = 0x3f0; ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); ata_ident->support.command2 = (1 << 14); ata_ident->support.extension = (1 << 14); ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); ata_ident->enabled.extension = (1 << 14); ata_ident->udmamodes = 0x7f; if (p->xfermode & ATA_UDMA0) ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->transport_major = 0x1020; ata_ident->integrity = 0x00a5; } else { uint64_t sectors; int sectsz, psectsz, psectoff, candelete, ro; uint16_t cyl; uint8_t sech, heads; ro = blockif_is_ro(p->bctx); candelete = blockif_candelete(p->bctx); sectsz = blockif_sectsz(p->bctx); sectors = blockif_size(p->bctx) / sectsz; blockif_chs(p->bctx, &cyl, &heads, &sech); blockif_psectsz(p->bctx, &psectsz, &psectoff); ata_ident->config = ATA_DRQ_FAST; ata_ident->cylinders = cyl; ata_ident->heads = heads; ata_ident->sectors = sech; ata_ident->sectors_intr = (0x8000 | 128); ata_ident->tcg = 0; ata_ident->capabilities1 = ATA_SUPPORT_DMA | ATA_SUPPORT_LBA | ATA_SUPPORT_IORDY; ata_ident->capabilities2 = (1 << 14); ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88; if (p->mult_sectors) ata_ident->multi = (ATA_MULTI_VALID | p->mult_sectors); if (sectors <= 0x0fffffff) { ata_ident->lba_size_1 = sectors; ata_ident->lba_size_2 = (sectors >> 16); } else { ata_ident->lba_size_1 = 0xffff; ata_ident->lba_size_2 = 0x0fff; } ata_ident->mwdmamodes = 0x7; if (p->xfermode & ATA_WDMA0) ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->apiomodes = 0x3; ata_ident->mwdmamin = 0x0078; ata_ident->mwdmarec = 0x0078; ata_ident->pioblind = 0x0078; ata_ident->pioiordy = 0x0078; ata_ident->support3 = 0; ata_ident->queue = 31; ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | ATA_SUPPORT_NCQ); ata_ident->satacapabilities2 = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | (p->ssts & ATA_SS_SPD_MASK) >> 3); ata_ident->version_major = 0x3f0; ata_ident->version_minor = 0x28; ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); ata_ident->support.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); ata_ident->support.extension = (1 << 14); ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); ata_ident->enabled.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); ata_ident->enabled.extension = (1 << 14); ata_ident->udmamodes = 0x7f; if (p->xfermode & ATA_UDMA0) ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->lba_size48_1 = sectors; ata_ident->lba_size48_2 = (sectors >> 16); ata_ident->lba_size48_3 = (sectors >> 32); ata_ident->lba_size48_4 = (sectors >> 48); if (candelete && !ro) { ata_ident->support3 |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; ata_ident->max_dsm_blocks = 1; ata_ident->support_dsm = ATA_SUPPORT_DSM_TRIM; } ata_ident->pss = ATA_PSS_VALID_VALUE; ata_ident->lsalign = 0x4000; if (psectsz > sectsz) { ata_ident->pss |= ATA_PSS_MULTLS; ata_ident->pss |= ffsl(psectsz / sectsz) - 1; ata_ident->lsalign |= (psectoff / sectsz); } if (sectsz > 512) { ata_ident->pss |= ATA_PSS_LSSABOVE512; ata_ident->lss_1 = sectsz / 2; ata_ident->lss_2 = ((sectsz / 2) >> 16); 
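			/*
			 * Words 117-118 (lss_1/lss_2) carry the logical
			 * sector size counted in 16-bit words, hence the
			 * division by two above.
			 */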
} ata_ident->support2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); ata_ident->enabled2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); ata_ident->transport_major = 0x1020; ata_ident->integrity = 0x00a5; } ahci_checksum((uint8_t*)ata_ident, sizeof(struct ata_params)); } static void handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) { if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)&p->ata_ident, sizeof(struct ata_params)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[36]; uint8_t *acmd; unsigned int len; uint32_t tfd; acmd = cfis + 0x40; if (acmd[1] & 1) { /* VPD */ if (acmd[2] == 0) { /* Supported VPD pages */ buf[0] = 0x05; buf[1] = 0; buf[2] = 0; buf[3] = 1; buf[4] = 0; len = 4 + buf[3]; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } } else { buf[0] = 0x05; buf[1] = 0x80; buf[2] = 0x00; buf[3] = 0x21; buf[4] = 31; buf[5] = 0; buf[6] = 0; buf[7] = 0; atapi_string(buf + 8, "BHYVE", 8); atapi_string(buf + 16, "BHYVE DVD-ROM", 16); atapi_string(buf + 32, "001", 4); len = sizeof(buf); } if (len > acmd[4]) len = acmd[4]; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, len); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[8]; uint64_t sectors; sectors = blockif_size(p->bctx) / 2048; be32enc(buf, sectors - 1); be32enc(buf + 4, 2048); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint8_t format; unsigned int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); format = acmd[9] >> 6; switch (format) { case 0: { size_t size; int msf; uint64_t sectors; uint8_t start_track, buf[20], *bp; msf = (acmd[1] >> 1) & 1; start_track = acmd[6]; if (start_track > 1 && start_track != 0xaa) { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } bp = buf + 2; *bp++ = 1; *bp++ = 1; if (start_track <= 1) { *bp++ = 0; *bp++ = 0x14; *bp++ = 1; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } } *bp++ = 0; *bp++ = 0x14; *bp++ = 0xaa; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 1: { uint8_t buf[12]; memset(buf, 0, sizeof(buf)); buf[1] = 0xa; buf[2] = 0x1; buf[3] = 0x1; if (len > sizeof(buf)) len = sizeof(buf); write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 2: { size_t size; int msf; 
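		/*
		 * Full TOC (format 2): emit the session lead-in entries
		 * A0 (first track), A1 (last track) and A2 (lead-out
		 * address), followed by the single data track.
		 */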
uint64_t sectors; uint8_t *bp, buf[50]; msf = (acmd[1] >> 1) & 1; bp = buf + 2; *bp++ = 1; *bp++ = 1; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa1; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa2; *bp++ = 0; *bp++ = 0; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } default: { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); break; } } } static void atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[16]; memset(buf, 0, sizeof(buf)); buf[3] = 8; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; struct blockif_req *breq; uint8_t *acmd; uint64_t lba; uint32_t len; int err; acmd = cfis + 0x40; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); prdt = (struct ahci_prdt_entry *)(cfis + 0x80); lba = be32dec(acmd + 2); if (acmd[0] == READ_10) len = be16dec(acmd + 7); else len = be32dec(acmd + 6); if (len == 0) { cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } lba *= 2048; len *= 2048; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->readop = 1; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. 
*/ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_read(p->bctx, breq); assert(err == 0); } static void atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[64]; uint8_t *acmd; unsigned int len; acmd = cfis + 0x40; len = acmd[4]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, len); buf[0] = 0x70 | (1 << 7); buf[2] = p->sense_key; buf[7] = 10; buf[12] = p->asc; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd = cfis + 0x40; uint32_t tfd; switch (acmd[4] & 3) { case 0: case 1: case 3: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; tfd = ATA_S_READY | ATA_S_DSC; break; case 2: /* TODO eject media */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x53; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; uint8_t pc, code; unsigned int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); pc = acmd[2] >> 6; code = acmd[2] & 0x3f; switch (pc) { case 0: switch (code) { case MODEPAGE_RW_ERROR_RECOVERY: { uint8_t buf[16]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 16 - 2); buf[2] = 0x70; buf[8] = 0x01; buf[9] = 16 - 10; buf[11] = 0x05; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } case MODEPAGE_CD_CAPABILITIES: { uint8_t buf[30]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 30 - 2); buf[2] = 0x70; buf[8] = 0x2A; buf[9] = 30 - 10; buf[10] = 0x08; buf[12] = 0x71; be16enc(&buf[18], 2); be16enc(&buf[20], 512); write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } default: goto error; break; } break; case 3: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x39; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; error: case 1: case 2: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_get_event_status_notification(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; acmd = cfis + 0x40; /* we don't support asynchronous operation */ if (!(acmd[1] & 1)) { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } else { uint8_t buf[8]; unsigned int len; len = be16dec(acmd + 7); if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 8 - 2); buf[2] = 0x04; buf[3] = 0x10; buf[5] = 0x02; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; acmd = cfis + 0x40; #ifdef AHCI_DEBUG { int i; DPRINTF("ACMD:"); for (i = 0; i < 16; i++) DPRINTF("%02x ", acmd[i]); DPRINTF(""); } #endif switch (acmd[0]) { case TEST_UNIT_READY: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case INQUIRY: atapi_inquiry(p, slot, cfis); break; case READ_CAPACITY: atapi_read_capacity(p, slot, cfis); 
break; case PREVENT_ALLOW: /* TODO */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case READ_TOC: atapi_read_toc(p, slot, cfis); break; case REPORT_LUNS: atapi_report_luns(p, slot, cfis); break; case READ_10: case READ_12: atapi_read(p, slot, cfis, 0); break; case REQUEST_SENSE: atapi_request_sense(p, slot, cfis); break; case START_STOP_UNIT: atapi_start_stop_unit(p, slot, cfis); break; case MODE_SENSE_10: atapi_mode_sense(p, slot, cfis); break; case GET_EVENT_STATUS_NOTIFICATION: atapi_get_event_status_notification(p, slot, cfis); break; default: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x20; ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { p->tfd |= ATA_S_BUSY; switch (cfis[2]) { case ATA_ATA_IDENTIFY: handle_identify(p, slot, cfis); break; case ATA_SETFEATURES: { switch (cfis[3]) { case ATA_SF_ENAB_SATA_SF: switch (cfis[12]) { case ATA_SATA_SF_AN: p->tfd = ATA_S_DSC | ATA_S_READY; break; default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } break; case ATA_SF_ENAB_WCACHE: case ATA_SF_DIS_WCACHE: case ATA_SF_ENAB_RCACHE: case ATA_SF_DIS_RCACHE: p->tfd = ATA_S_DSC | ATA_S_READY; break; case ATA_SF_SETXFER: { switch (cfis[12] & 0xf8) { case ATA_PIO: case ATA_PIO0: break; case ATA_WDMA0: case ATA_UDMA0: p->xfermode = (cfis[12] & 0x7); break; } p->tfd = ATA_S_DSC | ATA_S_READY; break; } default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; } case ATA_SET_MULTI: if (cfis[12] != 0 && (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) { p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); } else { p->mult_sectors = cfis[12]; p->tfd = ATA_S_DSC | ATA_S_READY; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; case ATA_READ: case ATA_WRITE: case ATA_READ48: case ATA_WRITE48: case ATA_READ_MUL: case ATA_WRITE_MUL: case ATA_READ_MUL48: case ATA_WRITE_MUL48: case ATA_READ_DMA: case ATA_WRITE_DMA: case ATA_READ_DMA48: case ATA_WRITE_DMA48: case ATA_READ_FPDMA_QUEUED: case ATA_WRITE_FPDMA_QUEUED: ahci_handle_rw(p, slot, cfis, 0); break; case ATA_FLUSHCACHE: case ATA_FLUSHCACHE48: ahci_handle_flush(p, slot, cfis); break; case ATA_DATA_SET_MANAGEMENT: if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && cfis[13] == 0 && cfis[12] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_SEND_FPDMA_QUEUED: if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && cfis[11] == 0 && cfis[3] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_READ_LOG_EXT: case ATA_READ_LOG_DMA_EXT: ahci_handle_read_log(p, slot, cfis); break; case ATA_SECURITY_FREEZE_LOCK: case ATA_SMART_CMD: case ATA_NOP: ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_CHECK_POWER_MODE: cfis[12] = 0xff; /* always on */ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_STANDBY_CMD: case ATA_STANDBY_IMMEDIATE: case ATA_IDLE_CMD: case ATA_IDLE_IMMEDIATE: case ATA_SLEEP: case ATA_READ_VERIFY: case ATA_READ_VERIFY48: ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | 
ATA_S_DSC); break; case ATA_ATAPI_IDENTIFY: handle_atapi_identify(p, slot, cfis); break; case ATA_PACKET_CMD: if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else handle_packet_cmd(p, slot, cfis); break; default: EPRINTLN("Unsupported cmd:%02x", cfis[2]); ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_slot(struct ahci_port *p, int slot) { struct ahci_cmd_hdr *hdr; #ifdef AHCI_DEBUG struct ahci_prdt_entry *prdt; #endif struct pci_ahci_softc *sc; uint8_t *cfis; #ifdef AHCI_DEBUG int cfl, i; #endif sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); #ifdef AHCI_DEBUG cfl = (hdr->flags & 0x1f) * 4; #endif cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); #ifdef AHCI_DEBUG prdt = (struct ahci_prdt_entry *)(cfis + 0x80); DPRINTF("cfis:"); for (i = 0; i < cfl; i++) { if (i % 10 == 0) DPRINTF(""); DPRINTF("%02x ", cfis[i]); } DPRINTF(""); for (i = 0; i < hdr->prdtl; i++) { DPRINTF("%d@%08"PRIx64"", prdt->dbc & 0x3fffff, prdt->dba); prdt++; } #endif if (cfis[0] != FIS_TYPE_REGH2D) { EPRINTLN("Not a H2D FIS:%02x", cfis[0]); return; } if (cfis[1] & 0x80) { ahci_handle_cmd(p, slot, cfis); } else { if (cfis[15] & (1 << 2)) p->reset = 1; else if (p->reset) { p->reset = 0; ahci_port_reset(p); } p->ci &= ~(1 << slot); } } static void ahci_handle_port(struct ahci_port *p) { if (!(p->cmd & AHCI_P_CMD_ST)) return; /* * Search for any new commands to issue ignoring those that * are already in-flight. Stop if device is busy or in error. */ for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) break; if (p->waitforclear) break; if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { p->cmd &= ~AHCI_P_CMD_CCS_MASK; p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; ahci_handle_slot(p, p->ccs); } } } /* * blockif callback routine - this runs in the context of the blockif * i/o thread, so the mutex needs to be acquired. */ static void ata_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint32_t tfd; uint8_t *cfis; int slot, ncq, dsm; DPRINTF("%s %d", __func__, err); ncq = dsm = 0; aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) ncq = 1; if (cfis[2] == ATA_DATA_SET_MANAGEMENT || (cfis[2] == ATA_SEND_FPDMA_QUEUED && (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) dsm = 1; pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { if (dsm) ahci_handle_dsm_trim(p, slot, cfis, aior->done); else ahci_handle_rw(p, slot, cfis, aior->done); goto out; } if (!err) tfd = ATA_S_READY | ATA_S_DSC; else tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; if (ncq) ahci_write_fis_sdb(p, slot, cfis, tfd); else ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. 
*/ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void atapi_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint8_t *cfis; uint32_t tfd; int slot; DPRINTF("%s %d", __func__, err); aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { atapi_read(p, slot, cfis, aior->done); goto out; } if (!err) { tfd = ATA_S_READY | ATA_S_DSC; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x21; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. */ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void pci_ahci_ioreq_init(struct ahci_port *pr) { struct ahci_ioreq *vr; int i; pr->ioqsz = blockif_queuesz(pr->bctx); pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq)); STAILQ_INIT(&pr->iofhd); /* * Add all i/o request entries to the free queue */ for (i = 0; i < pr->ioqsz; i++) { vr = &pr->ioreq[i]; vr->io_pr = pr; if (!pr->atapi) vr->io_req.br_callback = ata_ioreq_cb; else vr->io_req.br_callback = atapi_ioreq_cb; vr->io_req.br_param = vr; STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist); } TAILQ_INIT(&pr->iobhd); } static void pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; struct ahci_port *p = &sc->port[port]; DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"", port, offset, value); switch (offset) { case AHCI_P_CLB: p->clb = value; break; case AHCI_P_CLBU: p->clbu = value; break; case AHCI_P_FB: p->fb = value; break; case AHCI_P_FBU: p->fbu = value; break; case AHCI_P_IS: p->is &= ~value; ahci_port_intr(p); break; case AHCI_P_IE: p->ie = value & 0xFDC000FF; ahci_port_intr(p); break; case AHCI_P_CMD: { p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; if (!(value & AHCI_P_CMD_ST)) { ahci_port_stop(p); } else { uint64_t clb; p->cmd |= AHCI_P_CMD_CR; clb = (uint64_t)p->clbu << 32 | p->clb; p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb, AHCI_CL_SIZE * AHCI_MAX_SLOTS); } if (value & AHCI_P_CMD_FRE) { uint64_t fb; p->cmd |= AHCI_P_CMD_FR; fb = (uint64_t)p->fbu << 32 | p->fb; /* we don't support FBSCP, so rfis size is 256Bytes */ p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256); } else { p->cmd &= ~AHCI_P_CMD_FR; } if (value & AHCI_P_CMD_CLO) { p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ); p->cmd &= ~AHCI_P_CMD_CLO; } if (value & AHCI_P_CMD_ICC_MASK) { p->cmd &= ~AHCI_P_CMD_ICC_MASK; } 
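		/*
		 * Interface Communication Control (ICC) transitions are not
		 * emulated; the request is completed immediately by clearing
		 * the field above.
		 */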
ahci_handle_port(p); break; } case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: EPRINTLN("pci_ahci_port: read only registers 0x%"PRIx64"", offset); break; case AHCI_P_SCTL: p->sctl = value; if (!(p->cmd & AHCI_P_CMD_ST)) { if (value & ATA_SC_DET_RESET) ahci_port_reset(p); } break; case AHCI_P_SERR: p->serr &= ~value; break; case AHCI_P_SACT: p->sact |= value; break; case AHCI_P_CI: p->ci |= value; ahci_handle_port(p); break; case AHCI_P_SNTF: case AHCI_P_FBS: default: break; } } static void pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"", offset, value); switch (offset) { case AHCI_CAP: case AHCI_PI: case AHCI_VS: case AHCI_CAP2: DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"", offset); break; case AHCI_GHC: if (value & AHCI_GHC_HR) { ahci_reset(sc); break; } if (value & AHCI_GHC_IE) sc->ghc |= AHCI_GHC_IE; else sc->ghc &= ~AHCI_GHC_IE; ahci_generate_intr(sc, 0xffffffff); break; case AHCI_IS: sc->is &= ~value; ahci_generate_intr(sc, value); break; default: break; } } static void pci_ahci_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_ahci_softc *sc = pi->pi_arg; assert(baridx == 5); assert((offset % 4) == 0 && size == 4); pthread_mutex_lock(&sc->mtx); if (offset < AHCI_OFFSET) pci_ahci_host_write(sc, offset, value); else if (offset < (uint64_t)AHCI_OFFSET + sc->ports * AHCI_STEP) pci_ahci_port_write(sc, offset, value); else EPRINTLN("pci_ahci: unknown i/o write offset 0x%"PRIx64"", offset); pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; switch (offset) { case AHCI_CAP: case AHCI_GHC: case AHCI_IS: case AHCI_PI: case AHCI_VS: case AHCI_CCCC: case AHCI_CCCP: case AHCI_EM_LOC: case AHCI_EM_CTL: case AHCI_CAP2: { uint32_t *p = &sc->cap; p += (offset - AHCI_CAP) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x", offset, value); return (value); } static uint64_t pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; switch (offset) { case AHCI_P_CLB: case AHCI_P_CLBU: case AHCI_P_FB: case AHCI_P_FBU: case AHCI_P_IS: case AHCI_P_IE: case AHCI_P_CMD: case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: case AHCI_P_SCTL: case AHCI_P_SERR: case AHCI_P_SACT: case AHCI_P_CI: case AHCI_P_SNTF: case AHCI_P_FBS: { uint32_t *p= &sc->port[port].clb; p += (offset - AHCI_P_CLB) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x", port, offset, value); return value; } static uint64_t pci_ahci_read(struct pci_devinst *pi, int baridx, uint64_t regoff, int size) { struct pci_ahci_softc *sc = pi->pi_arg; uint64_t offset; uint32_t value; assert(baridx == 5); assert(size == 1 || size == 2 || size == 4); assert((regoff & (size - 1)) == 0); pthread_mutex_lock(&sc->mtx); offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */ if (offset < AHCI_OFFSET) value = pci_ahci_host_read(sc, offset); else if (offset < (uint64_t)AHCI_OFFSET + sc->ports * AHCI_STEP) value = pci_ahci_port_read(sc, offset); else { value = 0; EPRINTLN("pci_ahci: unknown i/o read offset 0x%"PRIx64"", regoff); } value >>= 8 * (regoff & 0x3); pthread_mutex_unlock(&sc->mtx); return (value); } /* * Each AHCI controller has a 
"port" node which contains nodes for * each port named after the decimal number of the port (no leading * zeroes). Port nodes contain a "type" ("hd" or "cd"), as well as * options for blockif. For example: * * pci.0.1.0 * .device="ahci" * .port * .0 * .type="hd" * .path="/path/to/image" */ static int pci_ahci_legacy_config_port(nvlist_t *nvl, int port, const char *type, const char *opts) { char node_name[sizeof("XX")]; nvlist_t *port_nvl; snprintf(node_name, sizeof(node_name), "%d", port); port_nvl = create_relative_config_node(nvl, node_name); set_config_value_node(port_nvl, "type", type); return (blockif_legacy_config(port_nvl, opts)); } static int pci_ahci_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; const char *type; char *next, *next2, *str, *tofree; int p, ret; if (opts == NULL) return (0); ports_nvl = create_relative_config_node(nvl, "port"); ret = 1; tofree = str = strdup(opts); for (p = 0; p < MAX_PORTS && str != NULL; p++, str = next) { /* Identify and cut off type of present port. */ if (strncmp(str, "hd:", 3) == 0) { type = "hd"; str += 3; } else if (strncmp(str, "cd:", 3) == 0) { type = "cd"; str += 3; } else type = NULL; /* Find and cut off the next port options. */ next = strstr(str, ",hd:"); next2 = strstr(str, ",cd:"); if (next == NULL || (next2 != NULL && next2 < next)) next = next2; if (next != NULL) { next[0] = 0; next++; } if (str[0] == 0) continue; if (type == NULL) { EPRINTLN("Missing or invalid type for port %d: \"%s\"", p, str); goto out; } if (pci_ahci_legacy_config_port(ports_nvl, p, type, str) != 0) goto out; } ret = 0; out: free(tofree); return (ret); } static int pci_ahci_cd_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; ports_nvl = create_relative_config_node(nvl, "port"); return (pci_ahci_legacy_config_port(ports_nvl, 0, "cd", opts)); } static int pci_ahci_hd_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; ports_nvl = create_relative_config_node(nvl, "port"); return (pci_ahci_legacy_config_port(ports_nvl, 0, "hd", opts)); } static int pci_ahci_init(struct pci_devinst *pi, nvlist_t *nvl) { char bident[sizeof("XXX:XXX:XXX")]; char node_name[sizeof("XX")]; struct blockif_ctxt *bctxt; struct pci_ahci_softc *sc; int atapi, ret, slots, p; MD5_CTX mdctx; u_char digest[16]; const char *path, *type, *value; nvlist_t *ports_nvl, *port_nvl; ret = 0; #ifdef AHCI_DEBUG dbg = fopen("/tmp/log", "w+"); #endif sc = calloc(1, sizeof(struct pci_ahci_softc)); pi->pi_arg = sc; sc->asc_pi = pi; pthread_mutex_init(&sc->mtx, NULL); sc->ports = 0; sc->pi = 0; slots = 32; ports_nvl = find_relative_config_node(nvl, "port"); for (p = 0; ports_nvl != NULL && p < MAX_PORTS; p++) { struct ata_params *ata_ident = &sc->port[p].ata_ident; char ident[AHCI_PORT_IDENT]; snprintf(node_name, sizeof(node_name), "%d", p); port_nvl = find_relative_config_node(ports_nvl, node_name); if (port_nvl == NULL) continue; type = get_config_value_node(port_nvl, "type"); if (type == NULL) continue; if (strcmp(type, "hd") == 0) atapi = 0; else atapi = 1; /* * Attempt to open the backing image. Use the PCI slot/func * and the port number for the identifier string. 
*/ snprintf(bident, sizeof(bident), "%u:%u:%u", pi->pi_slot, pi->pi_func, p); bctxt = blockif_open(port_nvl, bident); if (bctxt == NULL) { sc->ports = p; ret = 1; goto open_fail; } ret = blockif_add_boot_device(pi, bctxt); if (ret) { sc->ports = p; goto open_fail; } sc->port[p].bctx = bctxt; sc->port[p].pr_sc = sc; sc->port[p].port = p; sc->port[p].atapi = atapi; /* * Create an identifier for the backing file. * Use parts of the md5 sum of the filename */ path = get_config_value_node(port_nvl, "path"); MD5Init(&mdctx); MD5Update(&mdctx, path, strlen(path)); MD5Final(digest, &mdctx); snprintf(ident, AHCI_PORT_IDENT, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); memset(ata_ident, 0, sizeof(struct ata_params)); ata_string((uint8_t*)&ata_ident->serial, ident, 20); ata_string((uint8_t*)&ata_ident->revision, "001", 8); if (atapi) ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DVD ROM", 40); else ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DISK", 40); value = get_config_value_node(port_nvl, "nmrr"); if (value != NULL) ata_ident->media_rotation_rate = atoi(value); value = get_config_value_node(port_nvl, "ser"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->serial), value, 20); value = get_config_value_node(port_nvl, "rev"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->revision), value, 8); value = get_config_value_node(port_nvl, "model"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->model), value, 40); ata_identify_init(&sc->port[p], atapi); /* * Allocate blockif request structures and add them * to the free list */ pci_ahci_ioreq_init(&sc->port[p]); sc->pi |= (1 << p); if (sc->port[p].ioqsz < slots) slots = sc->port[p].ioqsz; } sc->ports = p; /* Intel ICH8 AHCI */ --slots; if (sc->ports < DEF_PORTS) sc->ports = DEF_PORTS; sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); sc->vs = 0x10300; sc->cap2 = AHCI_CAP2_APST; ahci_reset(sc); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); p = MIN(sc->ports, 16); p = flsl(p) - ((p & (p - 1)) ? 
0 : 1); pci_emul_add_msicap(pi, 1 << p); pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, AHCI_OFFSET + sc->ports * AHCI_STEP); pci_lintr_request(pi); open_fail: if (ret) { for (p = 0; p < sc->ports; p++) { if (sc->port[p].bctx != NULL) blockif_close(sc->port[p].bctx); } free(sc); } return (ret); } #ifdef BHYVE_SNAPSHOT static int pci_ahci_snapshot(struct vm_snapshot_meta *meta) { int i, ret; void *bctx; struct pci_devinst *pi; struct pci_ahci_softc *sc; struct ahci_port *port; pi = meta->dev_data; sc = pi->pi_arg; /* TODO: add mtx lock/unlock */ SNAPSHOT_VAR_OR_LEAVE(sc->ports, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->cap, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ghc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->is, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->pi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->vs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ccc_ctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ccc_pts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->em_loc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->em_ctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->cap2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->bohc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->lintr, meta, ret, done); for (i = 0; i < MAX_PORTS; i++) { port = &sc->port[i]; if (meta->op == VM_SNAPSHOT_SAVE) bctx = port->bctx; SNAPSHOT_VAR_OR_LEAVE(bctx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->port, meta, ret, done); /* Mostly for restore; save is ensured by the lines above. */ if (((bctx == NULL) && (port->bctx != NULL)) || ((bctx != NULL) && (port->bctx == NULL))) { - fprintf(stderr, "%s: ports not matching\r\n", __func__); + EPRINTLN("%s: ports not matching", __func__); ret = EINVAL; goto done; } if (port->bctx == NULL) continue; if (port->port != i) { - fprintf(stderr, "%s: ports not matching: " - "actual: %d expected: %d\r\n", - __func__, port->port, i); + EPRINTLN("%s: ports not matching: " + "actual: %d expected: %d", __func__, port->port, i); ret = EINVAL; goto done; } SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, port->cmd_lst, AHCI_CL_SIZE * AHCI_MAX_SLOTS, false, meta, ret, done); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, port->rfis, 256, false, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ata_ident, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->atapi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->reset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->waitforclear, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->mult_sectors, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->xfermode, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->err_cfis, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sense_key, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->asc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ccs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->pending, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->clb, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->clbu, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fb, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fbu, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ie, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->cmd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->unused0, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->tfd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sig, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ssts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->serr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sact, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ci, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sntf, meta, 
ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done); assert(TAILQ_EMPTY(&port->iobhd)); } done: return (ret); } static int pci_ahci_pause(struct pci_devinst *pi) { struct pci_ahci_softc *sc; struct blockif_ctxt *bctxt; int i; sc = pi->pi_arg; for (i = 0; i < MAX_PORTS; i++) { bctxt = sc->port[i].bctx; if (bctxt == NULL) continue; blockif_pause(bctxt); } return (0); } static int pci_ahci_resume(struct pci_devinst *pi) { struct pci_ahci_softc *sc; struct blockif_ctxt *bctxt; int i; sc = pi->pi_arg; for (i = 0; i < MAX_PORTS; i++) { bctxt = sc->port[i].bctx; if (bctxt == NULL) continue; blockif_resume(bctxt); } return (0); } #endif /* BHYVE_SNAPSHOT */ /* * Use separate emulation names to distinguish drive and atapi devices */ static const struct pci_devemu pci_de_ahci = { .pe_emu = "ahci", .pe_init = pci_ahci_init, .pe_legacy_config = pci_ahci_legacy_config, .pe_barwrite = pci_ahci_write, .pe_barread = pci_ahci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_ahci_snapshot, .pe_pause = pci_ahci_pause, .pe_resume = pci_ahci_resume, #endif }; PCI_EMUL_SET(pci_de_ahci); static const struct pci_devemu pci_de_ahci_hd = { .pe_emu = "ahci-hd", .pe_legacy_config = pci_ahci_hd_legacy_config, .pe_alias = "ahci", }; PCI_EMUL_SET(pci_de_ahci_hd); static const struct pci_devemu pci_de_ahci_cd = { .pe_emu = "ahci-cd", .pe_legacy_config = pci_ahci_cd_legacy_config, .pe_alias = "ahci", }; PCI_EMUL_SET(pci_de_ahci_cd); diff --git a/usr.sbin/bhyve/pci_xhci.c b/usr.sbin/bhyve/pci_xhci.c index 2d37cb0f08fc..99c1d20b3378 100644 --- a/usr.sbin/bhyve/pci_xhci.c +++ b/usr.sbin/bhyve/pci_xhci.c @@ -1,3209 +1,3209 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014 Leon Dang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
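An illustrative aside, not part of the patch, referring back to pci_ahci_init() in the hunk above: a minimal sketch of how the flsl()-based expression there rounds the implemented port count (capped at 16) up to a power of two when sizing the MSI capability.

#include <stdio.h>
#include <strings.h>	/* flsl() */

static int
ahci_msi_count(int ports)
{
	int p = (ports < 16) ? ports : 16;

	/* flsl(p) - 1 when p is already a power of two, flsl(p) otherwise. */
	p = flsl(p) - ((p & (p - 1)) ? 0 : 1);
	return (1 << p);
}

int
main(void)
{
	printf("%d %d %d\n", ahci_msi_count(1), ahci_msi_count(4),
	    ahci_msi_count(6));	/* prints: 1 4 8 */
	return (0);
}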
*/ /* XHCI options: -s ,xhci,{devices} devices: tablet USB tablet mouse */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "pci_xhci.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "usb_emul.h" static int xhci_debug = 0; #define DPRINTF(params) if (xhci_debug) PRINTLN params #define WPRINTF(params) PRINTLN params #define XHCI_NAME "xhci" #define XHCI_MAX_DEVS 8 /* 4 USB3 + 4 USB2 devs */ #define XHCI_MAX_SLOTS 64 /* min allowed by Windows drivers */ /* * XHCI data structures can be up to 64k, but limit paddr_guest2host mapping * to 4k to avoid going over the guest physical memory barrier. */ #define XHCI_PADDR_SZ 4096 /* paddr_guest2host max size */ #define XHCI_ERST_MAX 0 /* max 2^entries event ring seg tbl */ #define XHCI_CAPLEN (4*8) /* offset of op register space */ #define XHCI_HCCPRAMS2 0x1C /* offset of HCCPARAMS2 register */ #define XHCI_PORTREGS_START 0x400 #define XHCI_DOORBELL_MAX 256 #define XHCI_STREAMS_MAX 1 /* 4-15 in XHCI spec */ /* caplength and hci-version registers */ #define XHCI_SET_CAPLEN(x) ((x) & 0xFF) #define XHCI_SET_HCIVERSION(x) (((x) & 0xFFFF) << 16) #define XHCI_GET_HCIVERSION(x) (((x) >> 16) & 0xFFFF) /* hcsparams1 register */ #define XHCI_SET_HCSP1_MAXSLOTS(x) ((x) & 0xFF) #define XHCI_SET_HCSP1_MAXINTR(x) (((x) & 0x7FF) << 8) #define XHCI_SET_HCSP1_MAXPORTS(x) (((x) & 0xFF) << 24) /* hcsparams2 register */ #define XHCI_SET_HCSP2_IST(x) ((x) & 0x0F) #define XHCI_SET_HCSP2_ERSTMAX(x) (((x) & 0x0F) << 4) #define XHCI_SET_HCSP2_MAXSCRATCH_HI(x) (((x) & 0x1F) << 21) #define XHCI_SET_HCSP2_MAXSCRATCH_LO(x) (((x) & 0x1F) << 27) /* hcsparams3 register */ #define XHCI_SET_HCSP3_U1EXITLATENCY(x) ((x) & 0xFF) #define XHCI_SET_HCSP3_U2EXITLATENCY(x) (((x) & 0xFFFF) << 16) /* hccparams1 register */ #define XHCI_SET_HCCP1_AC64(x) ((x) & 0x01) #define XHCI_SET_HCCP1_BNC(x) (((x) & 0x01) << 1) #define XHCI_SET_HCCP1_CSZ(x) (((x) & 0x01) << 2) #define XHCI_SET_HCCP1_PPC(x) (((x) & 0x01) << 3) #define XHCI_SET_HCCP1_PIND(x) (((x) & 0x01) << 4) #define XHCI_SET_HCCP1_LHRC(x) (((x) & 0x01) << 5) #define XHCI_SET_HCCP1_LTC(x) (((x) & 0x01) << 6) #define XHCI_SET_HCCP1_NSS(x) (((x) & 0x01) << 7) #define XHCI_SET_HCCP1_PAE(x) (((x) & 0x01) << 8) #define XHCI_SET_HCCP1_SPC(x) (((x) & 0x01) << 9) #define XHCI_SET_HCCP1_SEC(x) (((x) & 0x01) << 10) #define XHCI_SET_HCCP1_CFC(x) (((x) & 0x01) << 11) #define XHCI_SET_HCCP1_MAXPSA(x) (((x) & 0x0F) << 12) #define XHCI_SET_HCCP1_XECP(x) (((x) & 0xFFFF) << 16) /* hccparams2 register */ #define XHCI_SET_HCCP2_U3C(x) ((x) & 0x01) #define XHCI_SET_HCCP2_CMC(x) (((x) & 0x01) << 1) #define XHCI_SET_HCCP2_FSC(x) (((x) & 0x01) << 2) #define XHCI_SET_HCCP2_CTC(x) (((x) & 0x01) << 3) #define XHCI_SET_HCCP2_LEC(x) (((x) & 0x01) << 4) #define XHCI_SET_HCCP2_CIC(x) (((x) & 0x01) << 5) /* other registers */ #define XHCI_SET_DOORBELL(x) ((x) & ~0x03) #define XHCI_SET_RTSOFFSET(x) ((x) & ~0x0F) /* register masks */ #define XHCI_PS_PLS_MASK (0xF << 5) /* port link state */ #define XHCI_PS_SPEED_MASK (0xF << 10) /* port speed */ #define XHCI_PS_PIC_MASK (0x3 << 14) /* port indicator */ /* port register set */ #define XHCI_PORTREGS_BASE 0x400 /* base offset */ #define XHCI_PORTREGS_PORT0 0x3F0 #define XHCI_PORTREGS_SETSZ 0x10 /* size of a set */ #define MASK_64_HI(x) ((x) & ~0xFFFFFFFFULL) #define MASK_64_LO(x) ((x) & 0xFFFFFFFFULL) #define 
FIELD_REPLACE(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & (m)) << (s))) #define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & ((m) << (s))))) #define SNAP_DEV_NAME_LEN 128 struct pci_xhci_trb_ring { uint64_t ringaddr; /* current dequeue guest address */ uint32_t ccs; /* consumer cycle state */ }; /* device endpoint transfer/stream rings */ struct pci_xhci_dev_ep { union { struct xhci_trb *_epu_tr; struct xhci_stream_ctx *_epu_sctx; } _ep_trbsctx; #define ep_tr _ep_trbsctx._epu_tr #define ep_sctx _ep_trbsctx._epu_sctx /* * Caches the value of MaxPStreams from the endpoint context * when an endpoint is initialized and is used to validate the * use of ep_ringaddr vs ep_sctx_trbs[] as well as the length * of ep_sctx_trbs[]. */ uint32_t ep_MaxPStreams; union { struct pci_xhci_trb_ring _epu_trb; struct pci_xhci_trb_ring *_epu_sctx_trbs; } _ep_trb_rings; #define ep_ringaddr _ep_trb_rings._epu_trb.ringaddr #define ep_ccs _ep_trb_rings._epu_trb.ccs #define ep_sctx_trbs _ep_trb_rings._epu_sctx_trbs struct usb_data_xfer *ep_xfer; /* transfer chain */ }; /* device context base address array: maps slot->device context */ struct xhci_dcbaa { uint64_t dcba[USB_MAX_DEVICES+1]; /* xhci_dev_ctx ptrs */ }; /* port status registers */ struct pci_xhci_portregs { uint32_t portsc; /* port status and control */ uint32_t portpmsc; /* port pwr mgmt status & control */ uint32_t portli; /* port link info */ uint32_t porthlpmc; /* port hardware LPM control */ } __packed; #define XHCI_PS_SPEED_SET(x) (((x) & 0xF) << 10) /* xHC operational registers */ struct pci_xhci_opregs { uint32_t usbcmd; /* usb command */ uint32_t usbsts; /* usb status */ uint32_t pgsz; /* page size */ uint32_t dnctrl; /* device notification control */ uint64_t crcr; /* command ring control */ uint64_t dcbaap; /* device ctx base addr array ptr */ uint32_t config; /* configure */ /* guest mapped addresses: */ struct xhci_trb *cr_p; /* crcr dequeue */ struct xhci_dcbaa *dcbaa_p; /* dev ctx array ptr */ }; /* xHC runtime registers */ struct pci_xhci_rtsregs { uint32_t mfindex; /* microframe index */ struct { /* interrupter register set */ uint32_t iman; /* interrupter management */ uint32_t imod; /* interrupter moderation */ uint32_t erstsz; /* event ring segment table size */ uint32_t rsvd; uint64_t erstba; /* event ring seg-tbl base addr */ uint64_t erdp; /* event ring dequeue ptr */ } intrreg __packed; /* guest mapped addresses */ struct xhci_event_ring_seg *erstba_p; struct xhci_trb *erst_p; /* event ring segment tbl */ int er_deq_seg; /* event ring dequeue segment */ int er_enq_idx; /* event ring enqueue index - xHCI */ int er_enq_seg; /* event ring enqueue segment */ uint32_t er_events_cnt; /* number of events in ER */ uint32_t event_pcs; /* producer cycle state flag */ }; struct pci_xhci_softc; /* * USB device emulation container. * This is referenced from usb_hci->hci_sc; 1 pci_xhci_dev_emu for each * emulated device instance. 
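An illustrative aside, not part of the patch: what the FIELD_REPLACE()/FIELD_COPY() macros defined above do. FIELD_REPLACE() takes the new field value right-aligned, FIELD_COPY() takes it already shifted into position; the slot-state and max-packet-size fields are used here purely as examples.

#include <assert.h>
#include <stdint.h>

#define FIELD_REPLACE(a,b,m,s)	(((a) & ~((m) << (s))) | (((b) & (m)) << (s)))
#define FIELD_COPY(a,b,m,s)	(((a) & ~((m) << (s))) | (((b) & ((m) << (s)))))

int
main(void)
{
	uint32_t sctx3 = 0xdeadbeef;
	uint32_t src = 0x12345678, dst = 0;

	/* Put the 5-bit value 3 into bits 31:27 (slot state), keep the rest. */
	sctx3 = FIELD_REPLACE(sctx3, 3, 0x1FU, 27);
	assert((sctx3 >> 27) == 3);
	assert((sctx3 & 0x07ffffffU) == (0xdeadbeefU & 0x07ffffffU));

	/* FIELD_COPY takes the field from 'b' already in position. */
	dst = FIELD_COPY(dst, src, 0xFFFFU, 16);
	assert(dst == 0x12340000);
	return (0);
}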
*/ struct pci_xhci_dev_emu { struct pci_xhci_softc *xsc; /* XHCI contexts */ struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep eps[XHCI_MAX_ENDPOINTS]; int dev_slotstate; struct usb_devemu *dev_ue; /* USB emulated dev */ void *dev_sc; /* device's softc */ struct usb_hci hci; }; struct pci_xhci_softc { struct pci_devinst *xsc_pi; pthread_mutex_t mtx; uint32_t caplength; /* caplen & hciversion */ uint32_t hcsparams1; /* structural parameters 1 */ uint32_t hcsparams2; /* structural parameters 2 */ uint32_t hcsparams3; /* structural parameters 3 */ uint32_t hccparams1; /* capability parameters 1 */ uint32_t dboff; /* doorbell offset */ uint32_t rtsoff; /* runtime register space offset */ uint32_t hccparams2; /* capability parameters 2 */ uint32_t regsend; /* end of configuration registers */ struct pci_xhci_opregs opregs; struct pci_xhci_rtsregs rtsregs; struct pci_xhci_portregs *portregs; struct pci_xhci_dev_emu **devices; /* XHCI[port] = device */ struct pci_xhci_dev_emu **slots; /* slots assigned from 1 */ int usb2_port_start; int usb3_port_start; }; /* port and slot numbering start from 1 */ #define XHCI_PORTREG_PTR(x,n) &((x)->portregs[(n) - 1]) #define XHCI_DEVINST_PTR(x,n) ((x)->devices[(n) - 1]) #define XHCI_SLOTDEV_PTR(x,n) ((x)->slots[(n) - 1]) #define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH) #define XHCI_GADDR_SIZE(a) (XHCI_PADDR_SZ - \ (((uint64_t) (a)) & (XHCI_PADDR_SZ - 1))) #define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \ (a), XHCI_GADDR_SIZE(a)) static int xhci_in_use; /* map USB errors to XHCI */ static const int xhci_usb_errors[USB_ERR_MAX] = { [USB_ERR_NORMAL_COMPLETION] = XHCI_TRB_ERROR_SUCCESS, [USB_ERR_PENDING_REQUESTS] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NOT_STARTED] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_INVAL] = XHCI_TRB_ERROR_INVALID, [USB_ERR_NOMEM] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_CANCELLED] = XHCI_TRB_ERROR_STOPPED, [USB_ERR_BAD_ADDRESS] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_BAD_BUFSIZE] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_BAD_FLAG] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_NO_CALLBACK] = XHCI_TRB_ERROR_STALL, [USB_ERR_IN_USE] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_ADDR] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_PIPE] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_ZERO_NFRAMES] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_ZERO_MAXP] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_SET_ADDR_FAILED] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_POWER] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_TOO_DEEP] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_IOERROR] = XHCI_TRB_ERROR_TRB, [USB_ERR_NOT_CONFIGURED] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_TIMEOUT] = XHCI_TRB_ERROR_CMD_ABORTED, [USB_ERR_SHORT_XFER] = XHCI_TRB_ERROR_SHORT_PKT, [USB_ERR_STALLED] = XHCI_TRB_ERROR_STALL, [USB_ERR_INTERRUPTED] = XHCI_TRB_ERROR_CMD_ABORTED, [USB_ERR_DMA_LOAD_FAILED] = XHCI_TRB_ERROR_DATA_BUF, [USB_ERR_BAD_CONTEXT] = XHCI_TRB_ERROR_TRB, [USB_ERR_NO_ROOT_HUB] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_NO_INTR_THREAD] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_NOT_LOCKED] = XHCI_TRB_ERROR_UNDEFINED, }; #define USB_TO_XHCI_ERR(e) ((e) < USB_ERR_MAX ? 
xhci_usb_errors[(e)] : \ XHCI_TRB_ERROR_INVALID) static int pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, int do_intr); static void pci_xhci_dump_trb(struct xhci_trb *trb); static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc); static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot); static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm); static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t streamid, uint64_t ringaddr, int ccs); static void pci_xhci_set_evtrb(struct xhci_trb *evtrb, uint64_t port, uint32_t errcode, uint32_t evtype) { evtrb->qwTrb0 = port << 24; evtrb->dwTrb2 = XHCI_TRB_2_ERROR_SET(errcode); evtrb->dwTrb3 = XHCI_TRB_3_TYPE_SET(evtype); } /* controller reset */ static void pci_xhci_reset(struct pci_xhci_softc *sc) { int i; sc->rtsregs.er_enq_idx = 0; sc->rtsregs.er_events_cnt = 0; sc->rtsregs.event_pcs = 1; for (i = 1; i <= XHCI_MAX_SLOTS; i++) { pci_xhci_reset_slot(sc, i); } } static uint32_t pci_xhci_usbcmd_write(struct pci_xhci_softc *sc, uint32_t cmd) { int do_intr = 0; int i; if (cmd & XHCI_CMD_RS) { do_intr = (sc->opregs.usbcmd & XHCI_CMD_RS) == 0; sc->opregs.usbcmd |= XHCI_CMD_RS; sc->opregs.usbsts &= ~XHCI_STS_HCH; sc->opregs.usbsts |= XHCI_STS_PCD; /* Queue port change event on controller run from stop */ if (do_intr) for (i = 1; i <= XHCI_MAX_DEVS; i++) { struct pci_xhci_dev_emu *dev; struct pci_xhci_portregs *port; struct xhci_trb evtrb; if ((dev = XHCI_DEVINST_PTR(sc, i)) == NULL) continue; port = XHCI_PORTREG_PTR(sc, i); port->portsc |= XHCI_PS_CSC | XHCI_PS_CCS; port->portsc &= ~XHCI_PS_PLS_MASK; /* * XHCI 4.19.3 USB2 RxDetect->Polling, * USB3 Polling->U0 */ if (dev->dev_ue->ue_usbver == 2) port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL); else port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0); pci_xhci_set_evtrb(&evtrb, i, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); if (pci_xhci_insert_event(sc, &evtrb, 0) != XHCI_TRB_ERROR_SUCCESS) break; } } else { sc->opregs.usbcmd &= ~XHCI_CMD_RS; sc->opregs.usbsts |= XHCI_STS_HCH; sc->opregs.usbsts &= ~XHCI_STS_PCD; } /* start execution of schedule; stop when set to 0 */ cmd |= sc->opregs.usbcmd & XHCI_CMD_RS; if (cmd & XHCI_CMD_HCRST) { /* reset controller */ pci_xhci_reset(sc); cmd &= ~XHCI_CMD_HCRST; } cmd &= ~(XHCI_CMD_CSS | XHCI_CMD_CRS); if (do_intr) pci_xhci_assert_interrupt(sc); return (cmd); } static void pci_xhci_portregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { struct xhci_trb evtrb; struct pci_xhci_portregs *p; int port; uint32_t oldpls, newpls; if (sc->portregs == NULL) return; port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ; offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ; DPRINTF(("pci_xhci: portregs wr offset 0x%lx, port %u: 0x%lx", offset, port, value)); assert(port >= 0); if (port > XHCI_MAX_DEVS) { DPRINTF(("pci_xhci: portregs_write port %d > ndevices", port)); return; } if (XHCI_DEVINST_PTR(sc, port) == NULL) { DPRINTF(("pci_xhci: portregs_write to unattached port %d", port)); } p = XHCI_PORTREG_PTR(sc, port); switch (offset) { case 0: /* port reset or warm reset */ if (value & (XHCI_PS_PR | XHCI_PS_WPR)) { pci_xhci_reset_port(sc, port, value & XHCI_PS_WPR); break; } if ((p->portsc & XHCI_PS_PP) == 0) { WPRINTF(("pci_xhci: portregs_write to unpowered " "port %d", port)); break; } /* Port status and control register */ oldpls = XHCI_PS_PLS_GET(p->portsc); 
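An illustrative aside, not part of the patch: how pci_xhci_portregs_write() above (and pci_xhci_portregs_read() later) turn a BAR offset into a port number and one of the four per-port registers, using the XHCI_PORTREGS_PORT0/XHCI_PORTREGS_SETSZ constants defined earlier. Ports are numbered from 1, so the first register set sits at offset 0x400.

#include <stdint.h>
#include <stdio.h>

#define XHCI_PORTREGS_PORT0	0x3F0
#define XHCI_PORTREGS_SETSZ	0x10

int
main(void)
{
	static const char *regnames[] = {
		"portsc", "portpmsc", "portli", "porthlpmc"
	};
	uint64_t offset;

	/* Walk the first two port register sets. */
	for (offset = 0x400; offset < 0x420; offset += 4) {
		int port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ;
		int reg = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ;

		printf("offset 0x%lx -> port %d %s\n", offset, port,
		    regnames[reg / 4]);
	}
	return (0);
}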
newpls = XHCI_PS_PLS_GET(value); p->portsc &= XHCI_PS_PED | XHCI_PS_PLS_MASK | XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK; if (XHCI_DEVINST_PTR(sc, port)) p->portsc |= XHCI_PS_CCS; p->portsc |= (value & ~(XHCI_PS_OCA | XHCI_PS_PR | XHCI_PS_PED | XHCI_PS_PLS_MASK | /* link state */ XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK | /* port indicator */ XHCI_PS_LWS | XHCI_PS_DR | XHCI_PS_WPR)); /* clear control bits */ p->portsc &= ~(value & (XHCI_PS_CSC | XHCI_PS_PEC | XHCI_PS_WRC | XHCI_PS_OCC | XHCI_PS_PRC | XHCI_PS_PLC | XHCI_PS_CEC | XHCI_PS_CAS)); /* port disable request; for USB3, don't care */ if (value & XHCI_PS_PED) DPRINTF(("Disable port %d request", port)); if (!(value & XHCI_PS_LWS)) break; DPRINTF(("Port new PLS: %d", newpls)); switch (newpls) { case 0: /* U0 */ case 3: /* U3 */ if (oldpls != newpls) { p->portsc &= ~XHCI_PS_PLS_MASK; p->portsc |= XHCI_PS_PLS_SET(newpls) | XHCI_PS_PLC; if (oldpls != 0 && newpls == 0) { pci_xhci_set_evtrb(&evtrb, port, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); pci_xhci_insert_event(sc, &evtrb, 1); } } break; default: DPRINTF(("Unhandled change port %d PLS %u", port, newpls)); break; } break; case 4: /* Port power management status and control register */ p->portpmsc = value; break; case 8: /* Port link information register */ DPRINTF(("pci_xhci attempted write to PORTLI, port %d", port)); break; case 12: /* * Port hardware LPM control register. * For USB3, this register is reserved. */ p->porthlpmc = value; break; default: DPRINTF(("pci_xhci: unaligned portreg write offset %#lx", offset)); break; } } static struct xhci_dev_ctx * pci_xhci_get_dev_ctx(struct pci_xhci_softc *sc, uint32_t slot) { uint64_t devctx_addr; struct xhci_dev_ctx *devctx; assert(slot > 0 && slot <= XHCI_MAX_DEVS); assert(XHCI_SLOTDEV_PTR(sc, slot) != NULL); assert(sc->opregs.dcbaa_p != NULL); devctx_addr = sc->opregs.dcbaa_p->dcba[slot]; if (devctx_addr == 0) { DPRINTF(("get_dev_ctx devctx_addr == 0")); return (NULL); } DPRINTF(("pci_xhci: get dev ctx, slot %u devctx addr %016lx", slot, devctx_addr)); devctx = XHCI_GADDR(sc, devctx_addr & ~0x3FUL); return (devctx); } static struct xhci_trb * pci_xhci_trb_next(struct pci_xhci_softc *sc, struct xhci_trb *curtrb, uint64_t *guestaddr) { struct xhci_trb *next; assert(curtrb != NULL); if (XHCI_TRB_3_TYPE_GET(curtrb->dwTrb3) == XHCI_TRB_TYPE_LINK) { if (guestaddr) *guestaddr = curtrb->qwTrb0 & ~0xFUL; next = XHCI_GADDR(sc, curtrb->qwTrb0 & ~0xFUL); } else { if (guestaddr) *guestaddr += sizeof(struct xhci_trb) & ~0xFUL; next = curtrb + 1; } return (next); } static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc) { sc->rtsregs.intrreg.erdp |= XHCI_ERDP_LO_BUSY; sc->rtsregs.intrreg.iman |= XHCI_IMAN_INTR_PEND; sc->opregs.usbsts |= XHCI_STS_EINT; /* only trigger interrupt if permitted */ if ((sc->opregs.usbcmd & XHCI_CMD_INTE) && (sc->rtsregs.intrreg.iman & XHCI_IMAN_INTR_ENA)) { if (pci_msi_enabled(sc->xsc_pi)) pci_generate_msi(sc->xsc_pi, 0); else pci_lintr_assert(sc->xsc_pi); } } static void pci_xhci_deassert_interrupt(struct pci_xhci_softc *sc) { if (!pci_msi_enabled(sc->xsc_pi)) pci_lintr_assert(sc->xsc_pi); } static void pci_xhci_init_ep(struct pci_xhci_dev_emu *dev, int epid) { struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep *devep; struct xhci_endp_ctx *ep_ctx; uint32_t i, pstreams; dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; devep = &dev->eps[epid]; pstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0); if (pstreams > 0) { DPRINTF(("init_ep %d with pstreams %d", epid, pstreams)); 
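An illustrative aside, not part of the patch: the ring-walking rule implemented by pci_xhci_trb_next() above, reduced to array indices instead of guest physical addresses. A Link TRB redirects the walk back to the head of the ring (and, in the real code, its toggle-cycle bit flips the consumer cycle state); any other TRB just advances to the next slot. The struct and type values below are simplified stand-ins, not the real xhci_trb layout.

#include <stdio.h>

#define TRB_TYPE_NORMAL	1
#define TRB_TYPE_LINK	6

struct trb {
	int	type;
	int	link_target;	/* index of the ring head for LINK TRBs */
};

static int
trb_next(const struct trb *ring, int cur)
{
	if (ring[cur].type == TRB_TYPE_LINK)
		return (ring[cur].link_target);
	return (cur + 1);
}

int
main(void)
{
	struct trb ring[4] = {
		{ TRB_TYPE_NORMAL, 0 }, { TRB_TYPE_NORMAL, 0 },
		{ TRB_TYPE_NORMAL, 0 }, { TRB_TYPE_LINK, 0 },
	};
	int i, cur = 0;

	for (i = 0; i < 6; i++) {	/* visits 0,1,2,3,0,1 */
		printf("trb %d (type %d)\n", cur, ring[cur].type);
		cur = trb_next(ring, cur);
	}
	return (0);
}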
assert(devep->ep_sctx_trbs == NULL); devep->ep_sctx = XHCI_GADDR(dev->xsc, ep_ctx->qwEpCtx2 & XHCI_EPCTX_2_TR_DQ_PTR_MASK); devep->ep_sctx_trbs = calloc(pstreams, sizeof(struct pci_xhci_trb_ring)); for (i = 0; i < pstreams; i++) { devep->ep_sctx_trbs[i].ringaddr = devep->ep_sctx[i].qwSctx0 & XHCI_SCTX_0_TR_DQ_PTR_MASK; devep->ep_sctx_trbs[i].ccs = XHCI_SCTX_0_DCS_GET(devep->ep_sctx[i].qwSctx0); } } else { DPRINTF(("init_ep %d with no pstreams", epid)); devep->ep_ringaddr = ep_ctx->qwEpCtx2 & XHCI_EPCTX_2_TR_DQ_PTR_MASK; devep->ep_ccs = XHCI_EPCTX_2_DCS_GET(ep_ctx->qwEpCtx2); devep->ep_tr = XHCI_GADDR(dev->xsc, devep->ep_ringaddr); DPRINTF(("init_ep tr DCS %x", devep->ep_ccs)); } devep->ep_MaxPStreams = pstreams; if (devep->ep_xfer == NULL) { devep->ep_xfer = malloc(sizeof(struct usb_data_xfer)); USB_DATA_XFER_INIT(devep->ep_xfer); } } static void pci_xhci_disable_ep(struct pci_xhci_dev_emu *dev, int epid) { struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep *devep; struct xhci_endp_ctx *ep_ctx; DPRINTF(("pci_xhci disable_ep %d", epid)); dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_DISABLED; devep = &dev->eps[epid]; if (devep->ep_MaxPStreams > 0) free(devep->ep_sctx_trbs); if (devep->ep_xfer != NULL) { free(devep->ep_xfer); devep->ep_xfer = NULL; } memset(devep, 0, sizeof(struct pci_xhci_dev_ep)); } /* reset device at slot and data structures related to it */ static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot) { struct pci_xhci_dev_emu *dev; dev = XHCI_SLOTDEV_PTR(sc, slot); if (!dev) { DPRINTF(("xhci reset unassigned slot (%d)?", slot)); } else { dev->dev_slotstate = XHCI_ST_DISABLED; } /* TODO: reset ring buffer pointers */ } static int pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, int do_intr) { struct pci_xhci_rtsregs *rts; uint64_t erdp; int erdp_idx; int err; struct xhci_trb *evtrbptr; err = XHCI_TRB_ERROR_SUCCESS; rts = &sc->rtsregs; erdp = rts->intrreg.erdp & ~0xF; erdp_idx = (erdp - rts->erstba_p[rts->er_deq_seg].qwEvrsTablePtr) / sizeof(struct xhci_trb); DPRINTF(("pci_xhci: insert event 0[%lx] 2[%x] 3[%x]", evtrb->qwTrb0, evtrb->dwTrb2, evtrb->dwTrb3)); DPRINTF(("\terdp idx %d/seg %d, enq idx %d/seg %d, pcs %u", erdp_idx, rts->er_deq_seg, rts->er_enq_idx, rts->er_enq_seg, rts->event_pcs)); DPRINTF(("\t(erdp=0x%lx, erst=0x%lx, tblsz=%u, do_intr %d)", erdp, rts->erstba_p->qwEvrsTablePtr, rts->erstba_p->dwEvrsTableSize, do_intr)); evtrbptr = &rts->erst_p[rts->er_enq_idx]; /* TODO: multi-segment table */ if (rts->er_events_cnt >= rts->erstba_p->dwEvrsTableSize) { DPRINTF(("pci_xhci[%d] cannot insert event; ring full", __LINE__)); err = XHCI_TRB_ERROR_EV_RING_FULL; goto done; } if (rts->er_events_cnt == rts->erstba_p->dwEvrsTableSize - 1) { struct xhci_trb errev; if ((evtrbptr->dwTrb3 & 0x1) == (rts->event_pcs & 0x1)) { DPRINTF(("pci_xhci[%d] insert evt err: ring full", __LINE__)); errev.qwTrb0 = 0; errev.dwTrb2 = XHCI_TRB_2_ERROR_SET( XHCI_TRB_ERROR_EV_RING_FULL); errev.dwTrb3 = XHCI_TRB_3_TYPE_SET( XHCI_TRB_EVENT_HOST_CTRL) | rts->event_pcs; rts->er_events_cnt++; memcpy(&rts->erst_p[rts->er_enq_idx], &errev, sizeof(struct xhci_trb)); rts->er_enq_idx = (rts->er_enq_idx + 1) % rts->erstba_p->dwEvrsTableSize; err = XHCI_TRB_ERROR_EV_RING_FULL; do_intr = 1; goto done; } } else { rts->er_events_cnt++; } evtrb->dwTrb3 &= ~XHCI_TRB_3_CYCLE_BIT; evtrb->dwTrb3 |= rts->event_pcs; memcpy(&rts->erst_p[rts->er_enq_idx], evtrb, sizeof(struct xhci_trb)); rts->er_enq_idx = 
(rts->er_enq_idx + 1) % rts->erstba_p->dwEvrsTableSize; if (rts->er_enq_idx == 0) rts->event_pcs ^= 1; done: if (do_intr) pci_xhci_assert_interrupt(sc); return (err); } static uint32_t pci_xhci_cmd_enable_slot(struct pci_xhci_softc *sc, uint32_t *slot) { struct pci_xhci_dev_emu *dev; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs != NULL) for (i = 1; i <= XHCI_MAX_SLOTS; i++) { dev = XHCI_SLOTDEV_PTR(sc, i); if (dev && dev->dev_slotstate == XHCI_ST_DISABLED) { *slot = i; dev->dev_slotstate = XHCI_ST_ENABLED; cmderr = XHCI_TRB_ERROR_SUCCESS; dev->hci.hci_address = i; break; } } DPRINTF(("pci_xhci enable slot (error=%d) slot %u", cmderr != XHCI_TRB_ERROR_SUCCESS, *slot)); return (cmderr); } static uint32_t pci_xhci_cmd_disable_slot(struct pci_xhci_softc *sc, uint32_t slot) { struct pci_xhci_dev_emu *dev; uint32_t cmderr; DPRINTF(("pci_xhci disable slot %u", slot)); cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs == NULL) goto done; if (slot > XHCI_MAX_SLOTS) { cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; goto done; } dev = XHCI_SLOTDEV_PTR(sc, slot); if (dev) { if (dev->dev_slotstate == XHCI_ST_DISABLED) { cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; } else { dev->dev_slotstate = XHCI_ST_DISABLED; cmderr = XHCI_TRB_ERROR_SUCCESS; /* TODO: reset events and endpoints */ } } else cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; done: return (cmderr); } static uint32_t pci_xhci_cmd_reset_device(struct pci_xhci_softc *sc, uint32_t slot) { struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs == NULL) goto done; DPRINTF(("pci_xhci reset device slot %u", slot)); dev = XHCI_SLOTDEV_PTR(sc, slot); if (!dev || dev->dev_slotstate == XHCI_ST_DISABLED) cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; else { dev->dev_slotstate = XHCI_ST_DEFAULT; dev->hci.hci_address = 0; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); /* slot state */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_DEFAULT, 0x1F, 27); /* number of contexts */ dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); /* reset all eps other than ep-0 */ for (i = 2; i <= 31; i++) { ep_ctx = &dev_ctx->ctx_ep[i]; ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_DISABLED, 0x7, 0); } cmderr = XHCI_TRB_ERROR_SUCCESS; } pci_xhci_reset_slot(sc, slot); done: return (cmderr); } static uint32_t pci_xhci_cmd_address_device(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct xhci_input_dev_ctx *input_ctx; struct xhci_slot_ctx *islot_ctx; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep0_ctx; uint32_t cmderr; input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); islot_ctx = &input_ctx->ctx_slot; ep0_ctx = &input_ctx->ctx_ep[1]; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci: address device, input ctl: D 0x%08x A 0x%08x,", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1)); DPRINTF((" slot %08x %08x %08x %08x", islot_ctx->dwSctx0, islot_ctx->dwSctx1, islot_ctx->dwSctx2, islot_ctx->dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); /* when setting address: drop-ctx=0, add-ctx=slot+ep0 */ if ((input_ctx->ctx_input.dwInCtx0 != 0) || (input_ctx->ctx_input.dwInCtx1 & 0x03) != 0x03) { DPRINTF(("pci_xhci: address device, input ctl invalid")); cmderr = XHCI_TRB_ERROR_TRB; goto done; } /* assign address to slot */ dev_ctx = pci_xhci_get_dev_ctx(sc, 
slot); DPRINTF(("pci_xhci: address device, dev ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); dev->hci.hci_address = slot; dev->dev_ctx = dev_ctx; if (dev->dev_ue->ue_reset == NULL || dev->dev_ue->ue_reset(dev->dev_sc) < 0) { cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; goto done; } memcpy(&dev_ctx->ctx_slot, islot_ctx, sizeof(struct xhci_slot_ctx)); dev_ctx->ctx_slot.dwSctx3 = XHCI_SCTX_3_SLOT_STATE_SET(XHCI_ST_SLCTX_ADDRESSED) | XHCI_SCTX_3_DEV_ADDR_SET(slot); memcpy(&dev_ctx->ctx_ep[1], ep0_ctx, sizeof(struct xhci_endp_ctx)); ep0_ctx = &dev_ctx->ctx_ep[1]; ep0_ctx->dwEpCtx0 = (ep0_ctx->dwEpCtx0 & ~0x7) | XHCI_EPCTX_0_EPSTATE_SET(XHCI_ST_EPCTX_RUNNING); pci_xhci_init_ep(dev, 1); dev->dev_slotstate = XHCI_ST_ADDRESSED; DPRINTF(("pci_xhci: address device, output ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); done: return (cmderr); } static uint32_t pci_xhci_cmd_config_ep(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct xhci_input_dev_ctx *input_ctx; struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx, *iep_ctx; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci config_ep slot %u", slot)); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); if ((trb->dwTrb3 & XHCI_TRB_3_DCEP_BIT) != 0) { DPRINTF(("pci_xhci config_ep - deconfigure ep slot %u", slot)); if (dev->dev_ue->ue_stop != NULL) dev->dev_ue->ue_stop(dev->dev_sc); dev->dev_slotstate = XHCI_ST_ADDRESSED; dev->hci.hci_address = 0; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); /* number of contexts */ dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); /* slot state */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_ADDRESSED, 0x1F, 27); /* disable endpoints */ for (i = 2; i < 32; i++) pci_xhci_disable_ep(dev, i); cmderr = XHCI_TRB_ERROR_SUCCESS; goto done; } if (dev->dev_slotstate < XHCI_ST_ADDRESSED) { DPRINTF(("pci_xhci: config_ep slotstate x%x != addressed", dev->dev_slotstate)); cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; goto done; } /* In addressed/configured state; * for each drop endpoint ctx flag: * ep->state = DISABLED * for each add endpoint ctx flag: * cp(ep-in, ep-out) * ep->state = RUNNING * for each drop+add endpoint flag: * reset ep resources * cp(ep-in, ep-out) * ep->state = RUNNING * if input->DisabledCtx[2-31] < 30: (at least 1 ep not disabled) * slot->state = configured */ input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); dev_ctx = dev->dev_ctx; DPRINTF(("pci_xhci: config_ep inputctx: D:x%08x A:x%08x 7:x%08x", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, input_ctx->ctx_input.dwInCtx7)); for (i = 2; i <= 31; i++) { ep_ctx = &dev_ctx->ctx_ep[i]; if (input_ctx->ctx_input.dwInCtx0 & XHCI_INCTX_0_DROP_MASK(i)) { DPRINTF((" config ep - dropping ep %d", i)); pci_xhci_disable_ep(dev, i); } if (input_ctx->ctx_input.dwInCtx1 & XHCI_INCTX_1_ADD_MASK(i)) { iep_ctx = &input_ctx->ctx_ep[i]; DPRINTF((" enable ep[%d] %08x %08x %016lx %08x", i, iep_ctx->dwEpCtx0, iep_ctx->dwEpCtx1, iep_ctx->qwEpCtx2, iep_ctx->dwEpCtx4)); memcpy(ep_ctx, iep_ctx, sizeof(struct xhci_endp_ctx)); pci_xhci_init_ep(dev, i); 
/* ep state */ ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); } } /* slot state to configured */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_CONFIGURED, 0x1F, 27); dev_ctx->ctx_slot.dwSctx0 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx0, input_ctx->ctx_slot.dwSctx0, 0x1F, 27); dev->dev_slotstate = XHCI_ST_CONFIGURED; DPRINTF(("EP configured; slot %u [0]=0x%08x [1]=0x%08x [2]=0x%08x " "[3]=0x%08x", slot, dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); done: return (cmderr); } static uint32_t pci_xhci_cmd_reset_ep(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr, epid; uint32_t type; epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); DPRINTF(("pci_xhci: reset ep %u: slot %u", epid, slot)); cmderr = XHCI_TRB_ERROR_SUCCESS; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); if (type == XHCI_TRB_TYPE_STOP_EP && (trb->dwTrb3 & XHCI_TRB_3_SUSP_EP_BIT) != 0) { /* XXX suspend endpoint for 10ms */ } if (epid < 1 || epid > 31) { DPRINTF(("pci_xhci: reset ep: invalid epid %u", epid)); cmderr = XHCI_TRB_ERROR_TRB; goto done; } devep = &dev->eps[epid]; if (devep->ep_xfer != NULL) USB_DATA_XFER_RESET(devep->ep_xfer); dev_ctx = dev->dev_ctx; assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; if (devep->ep_MaxPStreams == 0) ep_ctx->qwEpCtx2 = devep->ep_ringaddr | devep->ep_ccs; DPRINTF(("pci_xhci: reset ep[%u] %08x %08x %016lx %08x", epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, ep_ctx->dwEpCtx4)); if (type == XHCI_TRB_TYPE_RESET_EP && (dev->dev_ue->ue_reset == NULL || dev->dev_ue->ue_reset(dev->dev_sc) < 0)) { cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; goto done; } done: return (cmderr); } static uint32_t pci_xhci_find_stream(struct pci_xhci_softc *sc, struct xhci_endp_ctx *ep, struct pci_xhci_dev_ep *devep, uint32_t streamid) { struct xhci_stream_ctx *sctx; if (devep->ep_MaxPStreams == 0) return (XHCI_TRB_ERROR_TRB); if (devep->ep_MaxPStreams > XHCI_STREAMS_MAX) return (XHCI_TRB_ERROR_INVALID_SID); if (XHCI_EPCTX_0_LSA_GET(ep->dwEpCtx0) == 0) { DPRINTF(("pci_xhci: find_stream; LSA bit not set")); return (XHCI_TRB_ERROR_INVALID_SID); } /* only support primary stream */ if (streamid > devep->ep_MaxPStreams) return (XHCI_TRB_ERROR_STREAM_TYPE); sctx = (struct xhci_stream_ctx *)XHCI_GADDR(sc, ep->qwEpCtx2 & ~0xFUL) + streamid; if (!XHCI_SCTX_0_SCT_GET(sctx->qwSctx0)) return (XHCI_TRB_ERROR_STREAM_TYPE); return (XHCI_TRB_ERROR_SUCCESS); } static uint32_t pci_xhci_cmd_set_tr(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr, epid; uint32_t streamid; cmderr = XHCI_TRB_ERROR_SUCCESS; dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); DPRINTF(("pci_xhci set_tr: new-tr x%016lx, SCT %u DCS %u", (trb->qwTrb0 & ~0xF), (uint32_t)((trb->qwTrb0 >> 1) & 0x7), (uint32_t)(trb->qwTrb0 & 0x1))); DPRINTF((" stream-id %u, slot %u, epid %u, C %u", (trb->dwTrb2 >> 16) & 0xFFFF, XHCI_TRB_3_SLOT_GET(trb->dwTrb3), XHCI_TRB_3_EP_GET(trb->dwTrb3), trb->dwTrb3 & 0x1)); epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); if (epid < 1 || epid > 31) { DPRINTF(("pci_xhci: set_tr_deq: invalid epid %u", 
epid)); cmderr = XHCI_TRB_ERROR_TRB; goto done; } dev_ctx = dev->dev_ctx; assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; devep = &dev->eps[epid]; switch (XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0)) { case XHCI_ST_EPCTX_STOPPED: case XHCI_ST_EPCTX_ERROR: break; default: DPRINTF(("pci_xhci cmd set_tr invalid state %x", XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0))); cmderr = XHCI_TRB_ERROR_CONTEXT_STATE; goto done; } streamid = XHCI_TRB_2_STREAM_GET(trb->dwTrb2); if (devep->ep_MaxPStreams > 0) { cmderr = pci_xhci_find_stream(sc, ep_ctx, devep, streamid); if (cmderr == XHCI_TRB_ERROR_SUCCESS) { assert(devep->ep_sctx != NULL); devep->ep_sctx[streamid].qwSctx0 = trb->qwTrb0; devep->ep_sctx_trbs[streamid].ringaddr = trb->qwTrb0 & ~0xF; devep->ep_sctx_trbs[streamid].ccs = XHCI_EPCTX_2_DCS_GET(trb->qwTrb0); } } else { if (streamid != 0) { DPRINTF(("pci_xhci cmd set_tr streamid %x != 0", streamid)); } ep_ctx->qwEpCtx2 = trb->qwTrb0 & ~0xFUL; devep->ep_ringaddr = ep_ctx->qwEpCtx2 & ~0xFUL; devep->ep_ccs = trb->qwTrb0 & 0x1; devep->ep_tr = XHCI_GADDR(sc, devep->ep_ringaddr); DPRINTF(("pci_xhci set_tr first TRB:")); pci_xhci_dump_trb(devep->ep_tr); } ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; done: return (cmderr); } static uint32_t pci_xhci_cmd_eval_ctx(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct xhci_input_dev_ctx *input_ctx; struct xhci_slot_ctx *islot_ctx; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep0_ctx; uint32_t cmderr; input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); islot_ctx = &input_ctx->ctx_slot; ep0_ctx = &input_ctx->ctx_ep[1]; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci: eval ctx, input ctl: D 0x%08x A 0x%08x,", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1)); DPRINTF((" slot %08x %08x %08x %08x", islot_ctx->dwSctx0, islot_ctx->dwSctx1, islot_ctx->dwSctx2, islot_ctx->dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); /* this command expects drop-ctx=0 & add-ctx=slot+ep0 */ if ((input_ctx->ctx_input.dwInCtx0 != 0) || (input_ctx->ctx_input.dwInCtx1 & 0x03) == 0) { DPRINTF(("pci_xhci: eval ctx, input ctl invalid")); cmderr = XHCI_TRB_ERROR_TRB; goto done; } /* assign address to slot; in this emulation, slot_id = address */ dev_ctx = pci_xhci_get_dev_ctx(sc, slot); DPRINTF(("pci_xhci: eval ctx, dev ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); if (input_ctx->ctx_input.dwInCtx1 & 0x01) { /* slot ctx */ /* set max exit latency */ dev_ctx->ctx_slot.dwSctx1 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx1, input_ctx->ctx_slot.dwSctx1, 0xFFFF, 0); /* set interrupter target */ dev_ctx->ctx_slot.dwSctx2 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx2, input_ctx->ctx_slot.dwSctx2, 0x3FF, 22); } if (input_ctx->ctx_input.dwInCtx1 & 0x02) { /* control ctx */ /* set max packet size */ dev_ctx->ctx_ep[1].dwEpCtx1 = FIELD_COPY( dev_ctx->ctx_ep[1].dwEpCtx1, ep0_ctx->dwEpCtx1, 0xFFFF, 16); ep0_ctx = &dev_ctx->ctx_ep[1]; } DPRINTF(("pci_xhci: eval ctx, output ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); done: return (cmderr); } static int pci_xhci_complete_commands(struct pci_xhci_softc *sc) { struct xhci_trb evtrb; 
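An illustrative aside, not part of the patch: the packing used by the Set TR Dequeue handling above, where a 16-byte aligned ring address and the dequeue cycle state (DCS) share one 64-bit word (qwTrb0 going in, qwEpCtx2/qwSctx0 going out). The address value is made up.

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint64_t ringaddr = 0x12345670;	/* 16-byte aligned */
	int dcs = 1;
	uint64_t qw = (ringaddr & ~0xFUL) | (dcs & 0x1);

	assert((qw & ~0xFUL) == ringaddr);	/* dequeue pointer back out */
	assert((qw & 0x1) == 1);		/* DCS back out */
	return (0);
}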
struct xhci_trb *trb; uint64_t crcr; uint32_t ccs; /* cycle state (XHCI 4.9.2) */ uint32_t type; uint32_t slot; uint32_t cmderr; int error; error = 0; sc->opregs.crcr |= XHCI_CRCR_LO_CRR; trb = sc->opregs.cr_p; ccs = sc->opregs.crcr & XHCI_CRCR_LO_RCS; crcr = sc->opregs.crcr & ~0xF; while (1) { sc->opregs.cr_p = trb; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); if ((trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT) != (ccs & XHCI_TRB_3_CYCLE_BIT)) break; DPRINTF(("pci_xhci: cmd type 0x%x, Trb0 x%016lx dwTrb2 x%08x" " dwTrb3 x%08x, TRB_CYCLE %u/ccs %u", type, trb->qwTrb0, trb->dwTrb2, trb->dwTrb3, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT, ccs)); cmderr = XHCI_TRB_ERROR_SUCCESS; evtrb.dwTrb2 = 0; evtrb.dwTrb3 = (ccs & XHCI_TRB_3_CYCLE_BIT) | XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_CMD_COMPLETE); slot = 0; switch (type) { case XHCI_TRB_TYPE_LINK: /* 0x06 */ if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) ccs ^= XHCI_CRCR_LO_RCS; break; case XHCI_TRB_TYPE_ENABLE_SLOT: /* 0x09 */ cmderr = pci_xhci_cmd_enable_slot(sc, &slot); break; case XHCI_TRB_TYPE_DISABLE_SLOT: /* 0x0A */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_disable_slot(sc, slot); break; case XHCI_TRB_TYPE_ADDRESS_DEVICE: /* 0x0B */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_address_device(sc, slot, trb); break; case XHCI_TRB_TYPE_CONFIGURE_EP: /* 0x0C */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_config_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_EVALUATE_CTX: /* 0x0D */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_eval_ctx(sc, slot, trb); break; case XHCI_TRB_TYPE_RESET_EP: /* 0x0E */ DPRINTF(("Reset Endpoint on slot %d", slot)); slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_STOP_EP: /* 0x0F */ DPRINTF(("Stop Endpoint on slot %d", slot)); slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_SET_TR_DEQUEUE: /* 0x10 */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_set_tr(sc, slot, trb); break; case XHCI_TRB_TYPE_RESET_DEVICE: /* 0x11 */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_device(sc, slot); break; case XHCI_TRB_TYPE_FORCE_EVENT: /* 0x12 */ /* TODO: */ break; case XHCI_TRB_TYPE_NEGOTIATE_BW: /* 0x13 */ break; case XHCI_TRB_TYPE_SET_LATENCY_TOL: /* 0x14 */ break; case XHCI_TRB_TYPE_GET_PORT_BW: /* 0x15 */ break; case XHCI_TRB_TYPE_FORCE_HEADER: /* 0x16 */ break; case XHCI_TRB_TYPE_NOOP_CMD: /* 0x17 */ break; default: DPRINTF(("pci_xhci: unsupported cmd %x", type)); break; } if (type != XHCI_TRB_TYPE_LINK) { /* * insert command completion event and assert intr */ evtrb.qwTrb0 = crcr; evtrb.dwTrb2 |= XHCI_TRB_2_ERROR_SET(cmderr); evtrb.dwTrb3 |= XHCI_TRB_3_SLOT_SET(slot); DPRINTF(("pci_xhci: command 0x%x result: 0x%x", type, cmderr)); pci_xhci_insert_event(sc, &evtrb, 1); } trb = pci_xhci_trb_next(sc, trb, &crcr); } sc->opregs.crcr = crcr | (sc->opregs.crcr & XHCI_CRCR_LO_CA) | ccs; sc->opregs.crcr &= ~XHCI_CRCR_LO_CRR; return (error); } static void pci_xhci_dump_trb(struct xhci_trb *trb) { static const char *trbtypes[] = { "RESERVED", "NORMAL", "SETUP_STAGE", "DATA_STAGE", "STATUS_STAGE", "ISOCH", "LINK", "EVENT_DATA", "NOOP", "ENABLE_SLOT", "DISABLE_SLOT", "ADDRESS_DEVICE", "CONFIGURE_EP", "EVALUATE_CTX", "RESET_EP", "STOP_EP", "SET_TR_DEQUEUE", "RESET_DEVICE", "FORCE_EVENT", "NEGOTIATE_BW", "SET_LATENCY_TOL", "GET_PORT_BW", "FORCE_HEADER", "NOOP_CMD" }; uint32_t type; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); 
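An illustrative aside, not part of the patch: the cycle-bit ownership rule the command loop above relies on when it compares each TRB's cycle bit against the consumer cycle state (ccs). The consumer stops at the first mismatch, which marks the end of what the producer has posted.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t cycle_bits[] = { 1, 1, 1, 0, 0, 0 };	/* producer posted 3 TRBs */
	uint32_t ccs = 1;
	int i;

	for (i = 0; i < 6; i++) {
		if ((cycle_bits[i] & 1) != (ccs & 1))
			break;
		printf("consume trb %d\n", i);
	}
	printf("stopped at %d\n", i);	/* prints: stopped at 3 */
	return (0);
}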
DPRINTF(("pci_xhci: trb[@%p] type x%02x %s 0:x%016lx 2:x%08x 3:x%08x", trb, type, type <= XHCI_TRB_TYPE_NOOP_CMD ? trbtypes[type] : "INVALID", trb->qwTrb0, trb->dwTrb2, trb->dwTrb3)); } static int pci_xhci_xfer_complete(struct pci_xhci_softc *sc, struct usb_data_xfer *xfer, uint32_t slot, uint32_t epid, int *do_intr) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; struct xhci_trb *trb; struct xhci_trb evtrb; uint32_t trbflags; uint32_t edtla; int i, err; dev = XHCI_SLOTDEV_PTR(sc, slot); devep = &dev->eps[epid]; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; err = XHCI_TRB_ERROR_SUCCESS; *do_intr = 0; edtla = 0; /* go through list of TRBs and insert event(s) */ for (i = xfer->head; xfer->ndata > 0; ) { evtrb.qwTrb0 = (uint64_t)xfer->data[i].hci_data; trb = XHCI_GADDR(sc, evtrb.qwTrb0); trbflags = trb->dwTrb3; DPRINTF(("pci_xhci: xfer[%d] done?%u:%d trb %x %016lx %x " "(err %d) IOC?%d", i, xfer->data[i].processed, xfer->data[i].blen, XHCI_TRB_3_TYPE_GET(trbflags), evtrb.qwTrb0, trbflags, err, trb->dwTrb3 & XHCI_TRB_3_IOC_BIT ? 1 : 0)); if (!xfer->data[i].processed) { xfer->head = i; break; } xfer->ndata--; edtla += xfer->data[i].bdone; trb->dwTrb3 = (trb->dwTrb3 & ~0x1) | (xfer->data[i].ccs); pci_xhci_update_ep_ring(sc, dev, devep, ep_ctx, xfer->data[i].streamid, xfer->data[i].trbnext, xfer->data[i].ccs); /* Only interrupt if IOC or short packet */ if (!(trb->dwTrb3 & XHCI_TRB_3_IOC_BIT) && !((err == XHCI_TRB_ERROR_SHORT_PKT) && (trb->dwTrb3 & XHCI_TRB_3_ISP_BIT))) { i = (i + 1) % USB_MAX_XFER_BLOCKS; continue; } evtrb.dwTrb2 = XHCI_TRB_2_ERROR_SET(err) | XHCI_TRB_2_REM_SET(xfer->data[i].blen); evtrb.dwTrb3 = XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_TRANSFER) | XHCI_TRB_3_SLOT_SET(slot) | XHCI_TRB_3_EP_SET(epid); if (XHCI_TRB_3_TYPE_GET(trbflags) == XHCI_TRB_TYPE_EVENT_DATA) { DPRINTF(("pci_xhci EVENT_DATA edtla %u", edtla)); evtrb.qwTrb0 = trb->qwTrb0; evtrb.dwTrb2 = (edtla & 0xFFFFF) | XHCI_TRB_2_ERROR_SET(err); evtrb.dwTrb3 |= XHCI_TRB_3_ED_BIT; edtla = 0; } *do_intr = 1; err = pci_xhci_insert_event(sc, &evtrb, 0); if (err != XHCI_TRB_ERROR_SUCCESS) { break; } i = (i + 1) % USB_MAX_XFER_BLOCKS; } return (err); } static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev __unused, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t streamid, uint64_t ringaddr, int ccs) { if (devep->ep_MaxPStreams != 0) { devep->ep_sctx[streamid].qwSctx0 = (ringaddr & ~0xFUL) | (ccs & 0x1); devep->ep_sctx_trbs[streamid].ringaddr = ringaddr & ~0xFUL; devep->ep_sctx_trbs[streamid].ccs = ccs & 0x1; ep_ctx->qwEpCtx2 = (ep_ctx->qwEpCtx2 & ~0x1) | (ccs & 0x1); DPRINTF(("xhci update ep-ring stream %d, addr %lx", streamid, devep->ep_sctx[streamid].qwSctx0)); } else { devep->ep_ringaddr = ringaddr & ~0xFUL; devep->ep_ccs = ccs & 0x1; devep->ep_tr = XHCI_GADDR(sc, ringaddr & ~0xFUL); ep_ctx->qwEpCtx2 = (ringaddr & ~0xFUL) | (ccs & 0x1); DPRINTF(("xhci update ep-ring, addr %lx", (devep->ep_ringaddr | devep->ep_ccs))); } } /* * Outstanding transfer still in progress (device NAK'd earlier) so retry * the transfer again to see if it succeeds. 
*/ static int pci_xhci_try_usb_xfer(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t slot, uint32_t epid) { struct usb_data_xfer *xfer; int err; int do_intr; ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); err = 0; do_intr = 0; xfer = devep->ep_xfer; USB_DATA_XFER_LOCK(xfer); /* outstanding requests queued up */ if (dev->dev_ue->ue_data != NULL) { err = dev->dev_ue->ue_data(dev->dev_sc, xfer, epid & 0x1 ? USB_XFER_IN : USB_XFER_OUT, epid/2); if (err == USB_ERR_CANCELLED) { if (USB_DATA_GET_ERRCODE(&xfer->data[xfer->head]) == USB_NAK) err = XHCI_TRB_ERROR_SUCCESS; } else { err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); if (err == XHCI_TRB_ERROR_SUCCESS && do_intr) { pci_xhci_assert_interrupt(sc); } /* XXX should not do it if error? */ USB_DATA_XFER_RESET(xfer); } } USB_DATA_XFER_UNLOCK(xfer); return (err); } static int pci_xhci_handle_transfer(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, struct xhci_trb *trb, uint32_t slot, uint32_t epid, uint64_t addr, uint32_t ccs, uint32_t streamid) { struct xhci_trb *setup_trb; struct usb_data_xfer *xfer; struct usb_data_xfer_block *xfer_block; uint64_t val; uint32_t trbflags; int do_intr, err; int do_retry; ep_ctx->dwEpCtx0 = FIELD_REPLACE(ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); xfer = devep->ep_xfer; USB_DATA_XFER_LOCK(xfer); DPRINTF(("pci_xhci handle_transfer slot %u", slot)); retry: err = XHCI_TRB_ERROR_INVALID; do_retry = 0; do_intr = 0; setup_trb = NULL; while (1) { pci_xhci_dump_trb(trb); trbflags = trb->dwTrb3; if (XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK && (trbflags & XHCI_TRB_3_CYCLE_BIT) != (ccs & XHCI_TRB_3_CYCLE_BIT)) { DPRINTF(("Cycle-bit changed trbflags %x, ccs %x", trbflags & XHCI_TRB_3_CYCLE_BIT, ccs)); break; } xfer_block = NULL; switch (XHCI_TRB_3_TYPE_GET(trbflags)) { case XHCI_TRB_TYPE_LINK: if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) ccs ^= 0x1; xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_SETUP_STAGE: if ((trbflags & XHCI_TRB_3_IDT_BIT) == 0 || XHCI_TRB_2_BYTES_GET(trb->dwTrb2) != 8) { DPRINTF(("pci_xhci: invalid setup trb")); err = XHCI_TRB_ERROR_TRB; goto errout; } setup_trb = trb; val = trb->qwTrb0; if (!xfer->ureq) xfer->ureq = malloc( sizeof(struct usb_device_request)); memcpy(xfer->ureq, &val, sizeof(struct usb_device_request)); xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_NORMAL: case XHCI_TRB_TYPE_ISOCH: if (setup_trb != NULL) { DPRINTF(("pci_xhci: trb not supposed to be in " "ctl scope")); err = XHCI_TRB_ERROR_TRB; goto errout; } /* fall through */ case XHCI_TRB_TYPE_DATA_STAGE: xfer_block = usb_data_xfer_append(xfer, (void *)(trbflags & XHCI_TRB_3_IDT_BIT ? 
&trb->qwTrb0 : XHCI_GADDR(sc, trb->qwTrb0)), trb->dwTrb2 & 0x1FFFF, (void *)addr, ccs); break; case XHCI_TRB_TYPE_STATUS_STAGE: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); break; case XHCI_TRB_TYPE_NOOP: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_EVENT_DATA: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); if ((epid > 1) && (trbflags & XHCI_TRB_3_IOC_BIT)) { xfer_block->processed = 1; } break; default: DPRINTF(("pci_xhci: handle xfer unexpected trb type " "0x%x", XHCI_TRB_3_TYPE_GET(trbflags))); err = XHCI_TRB_ERROR_TRB; goto errout; } trb = pci_xhci_trb_next(sc, trb, &addr); DPRINTF(("pci_xhci: next trb: 0x%lx", (uint64_t)trb)); if (xfer_block) { xfer_block->trbnext = addr; xfer_block->streamid = streamid; } if (!setup_trb && !(trbflags & XHCI_TRB_3_CHAIN_BIT) && XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK) { break; } /* handle current batch that requires interrupt on complete */ if (trbflags & XHCI_TRB_3_IOC_BIT) { DPRINTF(("pci_xhci: trb IOC bit set")); if (epid == 1) do_retry = 1; break; } } DPRINTF(("pci_xhci[%d]: xfer->ndata %u", __LINE__, xfer->ndata)); if (xfer->ndata <= 0) goto errout; if (epid == 1) { int usberr; if (dev->dev_ue->ue_request != NULL) usberr = dev->dev_ue->ue_request(dev->dev_sc, xfer); else usberr = USB_ERR_NOT_STARTED; err = USB_TO_XHCI_ERR(usberr); if (err == XHCI_TRB_ERROR_SUCCESS || err == XHCI_TRB_ERROR_STALL || err == XHCI_TRB_ERROR_SHORT_PKT) { err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); if (err != XHCI_TRB_ERROR_SUCCESS) do_retry = 0; } } else { /* handle data transfer */ pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); err = XHCI_TRB_ERROR_SUCCESS; } errout: if (err == XHCI_TRB_ERROR_EV_RING_FULL) DPRINTF(("pci_xhci[%d]: event ring full", __LINE__)); if (!do_retry) USB_DATA_XFER_UNLOCK(xfer); if (do_intr) pci_xhci_assert_interrupt(sc); if (do_retry) { USB_DATA_XFER_RESET(xfer); DPRINTF(("pci_xhci[%d]: retry:continuing with next TRBs", __LINE__)); goto retry; } if (epid == 1) USB_DATA_XFER_RESET(xfer); return (err); } static void pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot, uint32_t epid, uint32_t streamid) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; struct pci_xhci_trb_ring *sctx_tr; struct xhci_trb *trb; uint64_t ringaddr; uint32_t ccs; int error; DPRINTF(("pci_xhci doorbell slot %u epid %u stream %u", slot, epid, streamid)); if (slot == 0 || slot > XHCI_MAX_SLOTS) { DPRINTF(("pci_xhci: invalid doorbell slot %u", slot)); return; } if (epid == 0 || epid >= XHCI_MAX_ENDPOINTS) { DPRINTF(("pci_xhci: invalid endpoint %u", epid)); return; } dev = XHCI_SLOTDEV_PTR(sc, slot); devep = &dev->eps[epid]; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); if (!dev_ctx) { return; } ep_ctx = &dev_ctx->ctx_ep[epid]; sctx_tr = NULL; DPRINTF(("pci_xhci: device doorbell ep[%u] %08x %08x %016lx %08x", epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, ep_ctx->dwEpCtx4)); if (ep_ctx->qwEpCtx2 == 0) return; /* handle pending transfers */ if (devep->ep_xfer->ndata > 0) { pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); return; } /* get next trb work item */ if (devep->ep_MaxPStreams != 0) { /* * Stream IDs of 0, 65535 (any stream), and 65534 * (prime) are invalid. 
*/ if (streamid == 0 || streamid == 65534 || streamid == 65535) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } error = pci_xhci_find_stream(sc, ep_ctx, devep, streamid); if (error != XHCI_TRB_ERROR_SUCCESS) { DPRINTF(("pci_xhci: invalid stream %u: %d", streamid, error)); return; } sctx_tr = &devep->ep_sctx_trbs[streamid]; ringaddr = sctx_tr->ringaddr; ccs = sctx_tr->ccs; trb = XHCI_GADDR(sc, sctx_tr->ringaddr & ~0xFUL); DPRINTF(("doorbell, stream %u, ccs %lx, trb ccs %x", streamid, ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); } else { if (streamid != 0) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } ringaddr = devep->ep_ringaddr; ccs = devep->ep_ccs; trb = devep->ep_tr; DPRINTF(("doorbell, ccs %lx, trb ccs %x", ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); } if (XHCI_TRB_3_TYPE_GET(trb->dwTrb3) == 0) { DPRINTF(("pci_xhci: ring %lx trb[%lx] EP %u is RESERVED?", ep_ctx->qwEpCtx2, devep->ep_ringaddr, epid)); return; } pci_xhci_handle_transfer(sc, dev, devep, ep_ctx, trb, slot, epid, ringaddr, ccs, streamid); } static void pci_xhci_dbregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { offset = (offset - sc->dboff) / sizeof(uint32_t); DPRINTF(("pci_xhci: doorbell write offset 0x%lx: 0x%lx", offset, value)); if (XHCI_HALTED(sc)) { DPRINTF(("pci_xhci: controller halted")); return; } if (offset == 0) pci_xhci_complete_commands(sc); else if (sc->portregs != NULL) pci_xhci_device_doorbell(sc, offset, XHCI_DB_TARGET_GET(value), XHCI_DB_SID_GET(value)); } static void pci_xhci_rtsregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { struct pci_xhci_rtsregs *rts; offset -= sc->rtsoff; if (offset == 0) { DPRINTF(("pci_xhci attempted write to MFINDEX")); return; } DPRINTF(("pci_xhci: runtime regs write offset 0x%lx: 0x%lx", offset, value)); offset -= 0x20; /* start of intrreg */ rts = &sc->rtsregs; switch (offset) { case 0x00: if (value & XHCI_IMAN_INTR_PEND) rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; rts->intrreg.iman = (value & XHCI_IMAN_INTR_ENA) | (rts->intrreg.iman & XHCI_IMAN_INTR_PEND); if (!(value & XHCI_IMAN_INTR_ENA)) pci_xhci_deassert_interrupt(sc); break; case 0x04: rts->intrreg.imod = value; break; case 0x08: rts->intrreg.erstsz = value & 0xFFFF; break; case 0x10: /* ERSTBA low bits */ rts->intrreg.erstba = MASK_64_HI(sc->rtsregs.intrreg.erstba) | (value & ~0x3F); break; case 0x14: /* ERSTBA high bits */ rts->intrreg.erstba = (value << 32) | MASK_64_LO(sc->rtsregs.intrreg.erstba); rts->erstba_p = XHCI_GADDR(sc, sc->rtsregs.intrreg.erstba & ~0x3FUL); rts->erst_p = XHCI_GADDR(sc, sc->rtsregs.erstba_p->qwEvrsTablePtr & ~0x3FUL); rts->er_enq_idx = 0; rts->er_events_cnt = 0; DPRINTF(("pci_xhci: wr erstba erst (%p) ptr 0x%lx, sz %u", rts->erstba_p, rts->erstba_p->qwEvrsTablePtr, rts->erstba_p->dwEvrsTableSize)); break; case 0x18: /* ERDP low bits */ rts->intrreg.erdp = MASK_64_HI(sc->rtsregs.intrreg.erdp) | (rts->intrreg.erdp & XHCI_ERDP_LO_BUSY) | (value & ~0xF); if (value & XHCI_ERDP_LO_BUSY) { rts->intrreg.erdp &= ~XHCI_ERDP_LO_BUSY; rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; } rts->er_deq_seg = XHCI_ERDP_LO_SINDEX(value); break; case 0x1C: /* ERDP high bits */ rts->intrreg.erdp = (value << 32) | MASK_64_LO(sc->rtsregs.intrreg.erdp); if (rts->er_events_cnt > 0) { uint64_t erdp; int erdp_i; erdp = rts->intrreg.erdp & ~0xF; erdp_i = (erdp - rts->erstba_p->qwEvrsTablePtr) / sizeof(struct xhci_trb); if (erdp_i <= rts->er_enq_idx) rts->er_events_cnt = 
rts->er_enq_idx - erdp_i; else rts->er_events_cnt = rts->erstba_p->dwEvrsTableSize - (erdp_i - rts->er_enq_idx); DPRINTF(("pci_xhci: erdp 0x%lx, events cnt %u", erdp, rts->er_events_cnt)); } break; default: DPRINTF(("pci_xhci attempted write to RTS offset 0x%lx", offset)); break; } } static uint64_t pci_xhci_portregs_read(struct pci_xhci_softc *sc, uint64_t offset) { struct pci_xhci_portregs *portregs; int port; uint32_t reg; if (sc->portregs == NULL) return (0); port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ; offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ; if (port > XHCI_MAX_DEVS) { DPRINTF(("pci_xhci: portregs_read port %d >= XHCI_MAX_DEVS", port)); /* return default value for unused port */ return (XHCI_PS_SPEED_SET(3)); } portregs = XHCI_PORTREG_PTR(sc, port); switch (offset) { case 0: reg = portregs->portsc; break; case 4: reg = portregs->portpmsc; break; case 8: reg = portregs->portli; break; case 12: reg = portregs->porthlpmc; break; default: DPRINTF(("pci_xhci: unaligned portregs read offset %#lx", offset)); reg = 0xffffffff; break; } DPRINTF(("pci_xhci: portregs read offset 0x%lx port %u -> 0x%x", offset, port, reg)); return (reg); } static void pci_xhci_hostop_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { offset -= XHCI_CAPLEN; if (offset < 0x400) DPRINTF(("pci_xhci: hostop write offset 0x%lx: 0x%lx", offset, value)); switch (offset) { case XHCI_USBCMD: sc->opregs.usbcmd = pci_xhci_usbcmd_write(sc, value & 0x3F0F); break; case XHCI_USBSTS: /* clear bits on write */ sc->opregs.usbsts &= ~(value & (XHCI_STS_HSE|XHCI_STS_EINT|XHCI_STS_PCD|XHCI_STS_SSS| XHCI_STS_RSS|XHCI_STS_SRE|XHCI_STS_CNR)); break; case XHCI_PAGESIZE: /* read only */ break; case XHCI_DNCTRL: sc->opregs.dnctrl = value & 0xFFFF; break; case XHCI_CRCR_LO: if (sc->opregs.crcr & XHCI_CRCR_LO_CRR) { sc->opregs.crcr &= ~(XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); sc->opregs.crcr |= value & (XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); } else { sc->opregs.crcr = MASK_64_HI(sc->opregs.crcr) | (value & (0xFFFFFFC0 | XHCI_CRCR_LO_RCS)); } break; case XHCI_CRCR_HI: if (!(sc->opregs.crcr & XHCI_CRCR_LO_CRR)) { sc->opregs.crcr = MASK_64_LO(sc->opregs.crcr) | (value << 32); sc->opregs.cr_p = XHCI_GADDR(sc, sc->opregs.crcr & ~0xF); } if (sc->opregs.crcr & XHCI_CRCR_LO_CS) { /* Stop operation of Command Ring */ } if (sc->opregs.crcr & XHCI_CRCR_LO_CA) { /* Abort command */ } break; case XHCI_DCBAAP_LO: sc->opregs.dcbaap = MASK_64_HI(sc->opregs.dcbaap) | (value & 0xFFFFFFC0); break; case XHCI_DCBAAP_HI: sc->opregs.dcbaap = MASK_64_LO(sc->opregs.dcbaap) | (value << 32); sc->opregs.dcbaa_p = XHCI_GADDR(sc, sc->opregs.dcbaap & ~0x3FUL); DPRINTF(("pci_xhci: opregs dcbaap = 0x%lx (vaddr 0x%lx)", sc->opregs.dcbaap, (uint64_t)sc->opregs.dcbaa_p)); break; case XHCI_CONFIG: sc->opregs.config = value & 0x03FF; break; default: if (offset >= 0x400) pci_xhci_portregs_write(sc, offset, value); break; } } static void pci_xhci_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size __unused, uint64_t value) { struct pci_xhci_softc *sc; sc = pi->pi_arg; assert(baridx == 0); pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) /* read only registers */ WPRINTF(("pci_xhci: write RO-CAPs offset %ld", offset)); else if (offset < sc->dboff) pci_xhci_hostop_write(sc, offset, value); else if (offset < sc->rtsoff) pci_xhci_dbregs_write(sc, offset, value); else if (offset < sc->regsend) pci_xhci_rtsregs_write(sc, offset, value); else WPRINTF(("pci_xhci: write invalid offset %ld", offset)); 
pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_xhci_hostcap_read(struct pci_xhci_softc *sc, uint64_t offset) { uint64_t value; switch (offset) { case XHCI_CAPLENGTH: /* 0x00 */ value = sc->caplength; break; case XHCI_HCSPARAMS1: /* 0x04 */ value = sc->hcsparams1; break; case XHCI_HCSPARAMS2: /* 0x08 */ value = sc->hcsparams2; break; case XHCI_HCSPARAMS3: /* 0x0C */ value = sc->hcsparams3; break; case XHCI_HCSPARAMS0: /* 0x10 */ value = sc->hccparams1; break; case XHCI_DBOFF: /* 0x14 */ value = sc->dboff; break; case XHCI_RTSOFF: /* 0x18 */ value = sc->rtsoff; break; case XHCI_HCCPRAMS2: /* 0x1C */ value = sc->hccparams2; break; default: value = 0; break; } DPRINTF(("pci_xhci: hostcap read offset 0x%lx -> 0x%lx", offset, value)); return (value); } static uint64_t pci_xhci_hostop_read(struct pci_xhci_softc *sc, uint64_t offset) { uint64_t value; offset = (offset - XHCI_CAPLEN); switch (offset) { case XHCI_USBCMD: /* 0x00 */ value = sc->opregs.usbcmd; break; case XHCI_USBSTS: /* 0x04 */ value = sc->opregs.usbsts; break; case XHCI_PAGESIZE: /* 0x08 */ value = sc->opregs.pgsz; break; case XHCI_DNCTRL: /* 0x14 */ value = sc->opregs.dnctrl; break; case XHCI_CRCR_LO: /* 0x18 */ value = sc->opregs.crcr & XHCI_CRCR_LO_CRR; break; case XHCI_CRCR_HI: /* 0x1C */ value = 0; break; case XHCI_DCBAAP_LO: /* 0x30 */ value = sc->opregs.dcbaap & 0xFFFFFFFF; break; case XHCI_DCBAAP_HI: /* 0x34 */ value = (sc->opregs.dcbaap >> 32) & 0xFFFFFFFF; break; case XHCI_CONFIG: /* 0x38 */ value = sc->opregs.config; break; default: if (offset >= 0x400) value = pci_xhci_portregs_read(sc, offset); else value = 0; break; } if (offset < 0x400) DPRINTF(("pci_xhci: hostop read offset 0x%lx -> 0x%lx", offset, value)); return (value); } static uint64_t pci_xhci_dbregs_read(struct pci_xhci_softc *sc __unused, uint64_t offset __unused) { /* read doorbell always returns 0 */ return (0); } static uint64_t pci_xhci_rtsregs_read(struct pci_xhci_softc *sc, uint64_t offset) { uint32_t value; offset -= sc->rtsoff; value = 0; if (offset == XHCI_MFINDEX) { value = sc->rtsregs.mfindex; } else if (offset >= 0x20) { int item; uint32_t *p; offset -= 0x20; item = offset % 32; assert(offset < sizeof(sc->rtsregs.intrreg)); p = &sc->rtsregs.intrreg.iman; p += item / sizeof(uint32_t); value = *p; } DPRINTF(("pci_xhci: rtsregs read offset 0x%lx -> 0x%x", offset, value)); return (value); } static uint64_t pci_xhci_xecp_read(struct pci_xhci_softc *sc, uint64_t offset) { uint32_t value; offset -= sc->regsend; value = 0; switch (offset) { case 0: /* rev major | rev minor | next-cap | cap-id */ value = (0x02 << 24) | (4 << 8) | XHCI_ID_PROTOCOLS; break; case 4: /* name string = "USB" */ value = 0x20425355; break; case 8: /* psic | proto-defined | compat # | compat offset */ value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb2_port_start; break; case 12: break; case 16: /* rev major | rev minor | next-cap | cap-id */ value = (0x03 << 24) | XHCI_ID_PROTOCOLS; break; case 20: /* name string = "USB" */ value = 0x20425355; break; case 24: /* psic | proto-defined | compat # | compat offset */ value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb3_port_start; break; case 28: break; default: DPRINTF(("pci_xhci: xecp invalid offset 0x%lx", offset)); break; } DPRINTF(("pci_xhci: xecp read offset 0x%lx -> 0x%x", offset, value)); return (value); } static uint64_t pci_xhci_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_xhci_softc *sc; uint32_t value; sc = pi->pi_arg; assert(baridx == 0); pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) 
value = pci_xhci_hostcap_read(sc, offset); else if (offset < sc->dboff) value = pci_xhci_hostop_read(sc, offset); else if (offset < sc->rtsoff) value = pci_xhci_dbregs_read(sc, offset); else if (offset < sc->regsend) value = pci_xhci_rtsregs_read(sc, offset); else if (offset < (sc->regsend + 4*32)) value = pci_xhci_xecp_read(sc, offset); else { value = 0; WPRINTF(("pci_xhci: read invalid offset %ld", offset)); } pthread_mutex_unlock(&sc->mtx); switch (size) { case 1: value &= 0xFF; break; case 2: value &= 0xFFFF; break; case 4: value &= 0xFFFFFFFF; break; } return (value); } static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm) { struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; struct xhci_trb evtrb; int error; assert(portn <= XHCI_MAX_DEVS); DPRINTF(("xhci reset port %d", portn)); port = XHCI_PORTREG_PTR(sc, portn); dev = XHCI_DEVINST_PTR(sc, portn); if (dev) { port->portsc &= ~(XHCI_PS_PLS_MASK | XHCI_PS_PR | XHCI_PS_PRC); port->portsc |= XHCI_PS_PED | XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); if (warm && dev->dev_ue->ue_usbver == 3) { port->portsc |= XHCI_PS_WRC; } if ((port->portsc & XHCI_PS_PRC) == 0) { port->portsc |= XHCI_PS_PRC; pci_xhci_set_evtrb(&evtrb, portn, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); error = pci_xhci_insert_event(sc, &evtrb, 1); if (error != XHCI_TRB_ERROR_SUCCESS) DPRINTF(("xhci reset port insert event " "failed")); } } } static void pci_xhci_init_port(struct pci_xhci_softc *sc, int portn) { struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; port = XHCI_PORTREG_PTR(sc, portn); dev = XHCI_DEVINST_PTR(sc, portn); if (dev) { port->portsc = XHCI_PS_CCS | /* connected */ XHCI_PS_PP; /* port power */ if (dev->dev_ue->ue_usbver == 2) { port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL) | XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); } else { port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0) | XHCI_PS_PED | /* enabled */ XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); } DPRINTF(("Init port %d 0x%x", portn, port->portsc)); } else { port->portsc = XHCI_PS_PLS_SET(UPS_PORT_LS_RX_DET) | XHCI_PS_PP; DPRINTF(("Init empty port %d 0x%x", portn, port->portsc)); } } static int pci_xhci_dev_intr(struct usb_hci *hci, int epctx) { struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_trb evtrb; struct pci_xhci_softc *sc; struct pci_xhci_portregs *p; struct xhci_endp_ctx *ep_ctx; int error = 0; int dir_in; int epid; dir_in = epctx & 0x80; epid = epctx & ~0x80; /* HW endpoint contexts are 0-15; convert to epid based on dir */ epid = (epid * 2) + (dir_in ? 
1 : 0); assert(epid >= 1 && epid <= 31); dev = hci->hci_sc; sc = dev->xsc; /* check if device is ready; OS has to initialise it */ if (sc->rtsregs.erstba_p == NULL || (sc->opregs.usbcmd & XHCI_CMD_RS) == 0 || dev->dev_ctx == NULL) return (0); p = XHCI_PORTREG_PTR(sc, hci->hci_port); /* raise event if link U3 (suspended) state */ if (XHCI_PS_PLS_GET(p->portsc) == 3) { p->portsc &= ~XHCI_PS_PLS_MASK; p->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_RESUME); if ((p->portsc & XHCI_PS_PLC) != 0) return (0); p->portsc |= XHCI_PS_PLC; pci_xhci_set_evtrb(&evtrb, hci->hci_port, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); error = pci_xhci_insert_event(sc, &evtrb, 0); if (error != XHCI_TRB_ERROR_SUCCESS) goto done; } dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; if ((ep_ctx->dwEpCtx0 & 0x7) == XHCI_ST_EPCTX_DISABLED) { DPRINTF(("xhci device interrupt on disabled endpoint %d", epid)); return (0); } DPRINTF(("xhci device interrupt on endpoint %d", epid)); pci_xhci_device_doorbell(sc, hci->hci_port, epid, 0); done: return (error); } static int pci_xhci_dev_event(struct usb_hci *hci, enum hci_usbev evid __unused, void *param __unused) { DPRINTF(("xhci device event port %d", hci->hci_port)); return (0); } /* * Each controller contains a "slot" node which contains a list of * child nodes each of which is a device. Each slot node's name * corresponds to a specific controller slot. These nodes * contain a "device" variable identifying the device model of the * USB device. For example: * * pci.0.1.0 * .device="xhci" * .slot * .1 * .device="tablet" */ static int pci_xhci_legacy_config(nvlist_t *nvl, const char *opts) { char node_name[16]; nvlist_t *slots_nvl, *slot_nvl; char *cp, *opt, *str, *tofree; int slot; if (opts == NULL) return (0); slots_nvl = create_relative_config_node(nvl, "slot"); slot = 1; tofree = str = strdup(opts); while ((opt = strsep(&str, ",")) != NULL) { /* device[=] */ cp = strchr(opt, '='); if (cp != NULL) { *cp = '\0'; cp++; } snprintf(node_name, sizeof(node_name), "%d", slot); slot++; slot_nvl = create_relative_config_node(slots_nvl, node_name); set_config_value_node(slot_nvl, "device", opt); /* * NB: Given that we split on commas above, the legacy * format only supports a single option. 
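 *
 * For example, a legacy command line such as "-s 1,xhci,tablet" reaches this
 * function with opts == "tablet" and is expanded into the slot.1.device node
 * shown in the block comment above; any text following an '=' in an option
 * is forwarded to pci_parse_legacy_config() below.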
*/ if (cp != NULL && *cp != '\0') pci_parse_legacy_config(slot_nvl, cp); } free(tofree); return (0); } static int pci_xhci_parse_devices(struct pci_xhci_softc *sc, nvlist_t *nvl) { struct pci_xhci_dev_emu *dev; struct usb_devemu *ue; const nvlist_t *slots_nvl, *slot_nvl; const char *name, *device; char *cp; void *devsc, *cookie; long slot; int type, usb3_port, usb2_port, i, ndevices; usb3_port = sc->usb3_port_start; usb2_port = sc->usb2_port_start; sc->devices = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_dev_emu *)); sc->slots = calloc(XHCI_MAX_SLOTS, sizeof(struct pci_xhci_dev_emu *)); ndevices = 0; slots_nvl = find_relative_config_node(nvl, "slot"); if (slots_nvl == NULL) goto portsfinal; cookie = NULL; while ((name = nvlist_next(slots_nvl, &type, &cookie)) != NULL) { if (usb2_port == ((sc->usb2_port_start) + XHCI_MAX_DEVS/2) || usb3_port == ((sc->usb3_port_start) + XHCI_MAX_DEVS/2)) { WPRINTF(("pci_xhci max number of USB 2 or 3 " "devices reached, max %d", XHCI_MAX_DEVS/2)); goto bad; } if (type != NV_TYPE_NVLIST) { EPRINTLN( "pci_xhci: config variable '%s' under slot node", name); goto bad; } slot = strtol(name, &cp, 0); if (*cp != '\0' || slot <= 0 || slot > XHCI_MAX_SLOTS) { EPRINTLN("pci_xhci: invalid slot '%s'", name); goto bad; } if (XHCI_SLOTDEV_PTR(sc, slot) != NULL) { EPRINTLN("pci_xhci: duplicate slot '%s'", name); goto bad; } slot_nvl = nvlist_get_nvlist(slots_nvl, name); device = get_config_value_node(slot_nvl, "device"); if (device == NULL) { EPRINTLN( "pci_xhci: missing \"device\" value for slot '%s'", name); goto bad; } ue = usb_emu_finddev(device); if (ue == NULL) { EPRINTLN("pci_xhci: unknown device model \"%s\"", device); goto bad; } DPRINTF(("pci_xhci adding device %s", device)); dev = calloc(1, sizeof(struct pci_xhci_dev_emu)); dev->xsc = sc; dev->hci.hci_sc = dev; dev->hci.hci_intr = pci_xhci_dev_intr; dev->hci.hci_event = pci_xhci_dev_event; if (ue->ue_usbver == 2) { if (usb2_port == sc->usb2_port_start + XHCI_MAX_DEVS / 2) { WPRINTF(("pci_xhci max number of USB 2 devices " "reached, max %d", XHCI_MAX_DEVS / 2)); goto bad; } dev->hci.hci_port = usb2_port; usb2_port++; } else { if (usb3_port == sc->usb3_port_start + XHCI_MAX_DEVS / 2) { WPRINTF(("pci_xhci max number of USB 3 devices " "reached, max %d", XHCI_MAX_DEVS / 2)); goto bad; } dev->hci.hci_port = usb3_port; usb3_port++; } XHCI_DEVINST_PTR(sc, dev->hci.hci_port) = dev; dev->hci.hci_address = 0; devsc = ue->ue_init(&dev->hci, nvl); if (devsc == NULL) { goto bad; } dev->dev_ue = ue; dev->dev_sc = devsc; XHCI_SLOTDEV_PTR(sc, slot) = dev; ndevices++; } portsfinal: sc->portregs = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_portregs)); if (ndevices > 0) { for (i = 1; i <= XHCI_MAX_DEVS; i++) { pci_xhci_init_port(sc, i); } } else { WPRINTF(("pci_xhci no USB devices configured")); } return (0); bad: for (i = 1; i <= XHCI_MAX_DEVS; i++) { free(XHCI_DEVINST_PTR(sc, i)); } free(sc->devices); free(sc->slots); return (-1); } static int pci_xhci_init(struct pci_devinst *pi, nvlist_t *nvl) { struct pci_xhci_softc *sc; int error; if (xhci_in_use) { WPRINTF(("pci_xhci controller already defined")); return (-1); } xhci_in_use = 1; sc = calloc(1, sizeof(struct pci_xhci_softc)); pi->pi_arg = sc; sc->xsc_pi = pi; sc->usb2_port_start = (XHCI_MAX_DEVS/2) + 1; sc->usb3_port_start = 1; /* discover devices */ error = pci_xhci_parse_devices(sc, nvl); if (error < 0) goto done; else error = 0; sc->caplength = XHCI_SET_CAPLEN(XHCI_CAPLEN) | XHCI_SET_HCIVERSION(0x0100); sc->hcsparams1 = XHCI_SET_HCSP1_MAXPORTS(XHCI_MAX_DEVS) | 
XHCI_SET_HCSP1_MAXINTR(1) | /* interrupters */ XHCI_SET_HCSP1_MAXSLOTS(XHCI_MAX_SLOTS); sc->hcsparams2 = XHCI_SET_HCSP2_ERSTMAX(XHCI_ERST_MAX) | XHCI_SET_HCSP2_IST(0x04); sc->hcsparams3 = 0; /* no latency */ sc->hccparams1 = XHCI_SET_HCCP1_AC64(1) | /* 64-bit addrs */ XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */ XHCI_SET_HCCP1_SPC(1) | /* short packet */ XHCI_SET_HCCP1_MAXPSA(XHCI_STREAMS_MAX); sc->hccparams2 = XHCI_SET_HCCP2_LEC(1) | XHCI_SET_HCCP2_U3C(1); sc->dboff = XHCI_SET_DOORBELL(XHCI_CAPLEN + XHCI_PORTREGS_START + XHCI_MAX_DEVS * sizeof(struct pci_xhci_portregs)); /* dboff must be 32-bit aligned */ if (sc->dboff & 0x3) sc->dboff = (sc->dboff + 0x3) & ~0x3; /* rtsoff must be 32-bytes aligned */ sc->rtsoff = XHCI_SET_RTSOFFSET(sc->dboff + (XHCI_MAX_SLOTS+1) * 32); if (sc->rtsoff & 0x1F) sc->rtsoff = (sc->rtsoff + 0x1F) & ~0x1F; DPRINTF(("pci_xhci dboff: 0x%x, rtsoff: 0x%x", sc->dboff, sc->rtsoff)); sc->opregs.usbsts = XHCI_STS_HCH; sc->opregs.pgsz = XHCI_PAGESIZE_4K; pci_xhci_reset(sc); sc->regsend = sc->rtsoff + 0x20 + 32; /* only 1 intrpter */ /* * Set extended capabilities pointer to be after regsend; * value of xecp field is 32-bit offset. */ sc->hccparams1 |= XHCI_SET_HCCP1_XECP(sc->regsend/4); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1E31); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SERIALBUS); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_SERIALBUS_USB); pci_set_cfgdata8(pi, PCIR_PROGIF,PCIP_SERIALBUS_USB_XHCI); pci_set_cfgdata8(pi, PCI_USBREV, PCI_USB_REV_3_0); pci_emul_add_msicap(pi, 1); /* regsend + xecp registers */ pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, sc->regsend + 4*32); DPRINTF(("pci_xhci pci_emu_alloc: %d", sc->regsend + 4*32)); pci_lintr_request(pi); pthread_mutex_init(&sc->mtx, NULL); done: if (error) { free(sc); } return (error); } #ifdef BHYVE_SNAPSHOT static void pci_xhci_map_devs_slots(struct pci_xhci_softc *sc, int maps[]) { int i, j; struct pci_xhci_dev_emu *dev, *slot; memset(maps, 0, sizeof(maps[0]) * XHCI_MAX_SLOTS); for (i = 1; i <= XHCI_MAX_SLOTS; i++) { for (j = 1; j <= XHCI_MAX_DEVS; j++) { slot = XHCI_SLOTDEV_PTR(sc, i); dev = XHCI_DEVINST_PTR(sc, j); if (slot == dev) maps[i] = j; } } } static int pci_xhci_snapshot_ep(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, int idx, struct vm_snapshot_meta *meta) { int k; int ret; struct usb_data_xfer *xfer; struct usb_data_xfer_block *xfer_block; /* some sanity checks */ if (meta->op == VM_SNAPSHOT_SAVE) xfer = dev->eps[idx].ep_xfer; SNAPSHOT_VAR_OR_LEAVE(xfer, meta, ret, done); if (xfer == NULL) { ret = 0; goto done; } if (meta->op == VM_SNAPSHOT_RESTORE) { pci_xhci_init_ep(dev, idx); xfer = dev->eps[idx].ep_xfer; } /* save / restore proper */ for (k = 0; k < USB_MAX_XFER_BLOCKS; k++) { xfer_block = &xfer->data[k]; SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->xsc_pi->pi_vmctx, xfer_block->buf, XHCI_GADDR_SIZE(xfer_block->buf), true, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->blen, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->bdone, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->processed, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->hci_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->ccs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->streamid, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->trbnext, meta, ret, done); } SNAPSHOT_VAR_OR_LEAVE(xfer->ureq, meta, ret, done); if (xfer->ureq) { /* xfer->ureq is not allocated at restore time */ if (meta->op == VM_SNAPSHOT_RESTORE) xfer->ureq = malloc(sizeof(struct usb_device_request)); 
SNAPSHOT_BUF_OR_LEAVE(xfer->ureq, sizeof(struct usb_device_request), meta, ret, done); } SNAPSHOT_VAR_OR_LEAVE(xfer->ndata, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer->head, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer->tail, meta, ret, done); done: return (ret); } static int pci_xhci_snapshot(struct vm_snapshot_meta *meta) { int i, j; int ret; int restore_idx; struct pci_devinst *pi; struct pci_xhci_softc *sc; struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; char dname[SNAP_DEV_NAME_LEN]; int maps[XHCI_MAX_SLOTS + 1]; pi = meta->dev_data; sc = pi->pi_arg; SNAPSHOT_VAR_OR_LEAVE(sc->caplength, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams1, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams3, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hccparams1, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->dboff, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsoff, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hccparams2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->regsend, meta, ret, done); /* opregs */ SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbcmd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbsts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.pgsz, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dnctrl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.crcr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dcbaap, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.config, meta, ret, done); /* opregs.cr_p */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, sc->opregs.cr_p, XHCI_GADDR_SIZE(sc->opregs.cr_p), true, meta, ret, done); /* opregs.dcbaa_p */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, sc->opregs.dcbaa_p, XHCI_GADDR_SIZE(sc->opregs.dcbaa_p), true, meta, ret, done); /* rtsregs */ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.mfindex, meta, ret, done); /* rtsregs.intrreg */ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.iman, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.imod, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstsz, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.rsvd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstba, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erdp, meta, ret, done); /* rtsregs.erstba_p */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, sc->rtsregs.erstba_p, XHCI_GADDR_SIZE(sc->rtsregs.erstba_p), true, meta, ret, done); /* rtsregs.erst_p */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, sc->rtsregs.erst_p, XHCI_GADDR_SIZE(sc->rtsregs.erst_p), true, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_deq_seg, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_idx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_seg, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_events_cnt, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.event_pcs, meta, ret, done); /* sanity checking */ for (i = 1; i <= XHCI_MAX_DEVS; i++) { dev = XHCI_DEVINST_PTR(sc, i); if (dev == NULL) continue; if (meta->op == VM_SNAPSHOT_SAVE) restore_idx = i; SNAPSHOT_VAR_OR_LEAVE(restore_idx, meta, ret, done); /* check if the restored device (when restoring) is sane */ if (restore_idx != i) { - fprintf(stderr, "%s: idx not matching: actual: %d, " - "expected: %d\r\n", __func__, restore_idx, i); + EPRINTLN("%s: idx not matching: actual: %d, " + "expected: %d", __func__, restore_idx, i); ret = EINVAL; goto done; } if (meta->op == VM_SNAPSHOT_SAVE) { memset(dname, 0, sizeof(dname)); strncpy(dname, dev->dev_ue->ue_emu, 
sizeof(dname) - 1); } SNAPSHOT_BUF_OR_LEAVE(dname, sizeof(dname), meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) { dname[sizeof(dname) - 1] = '\0'; if (strcmp(dev->dev_ue->ue_emu, dname)) { - fprintf(stderr, "%s: device names mismatch: " - "actual: %s, expected: %s\r\n", - __func__, dname, dev->dev_ue->ue_emu); + EPRINTLN("%s: device names mismatch: " + "actual: %s, expected: %s", + __func__, dname, dev->dev_ue->ue_emu); ret = EINVAL; goto done; } } } /* portregs */ for (i = 1; i <= XHCI_MAX_DEVS; i++) { port = XHCI_PORTREG_PTR(sc, i); dev = XHCI_DEVINST_PTR(sc, i); if (dev == NULL) continue; SNAPSHOT_VAR_OR_LEAVE(port->portsc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->portpmsc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->portli, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->porthlpmc, meta, ret, done); } /* slots */ if (meta->op == VM_SNAPSHOT_SAVE) pci_xhci_map_devs_slots(sc, maps); for (i = 1; i <= XHCI_MAX_SLOTS; i++) { SNAPSHOT_VAR_OR_LEAVE(maps[i], meta, ret, done); if (meta->op == VM_SNAPSHOT_SAVE) { dev = XHCI_SLOTDEV_PTR(sc, i); } else if (meta->op == VM_SNAPSHOT_RESTORE) { if (maps[i] != 0) dev = XHCI_DEVINST_PTR(sc, maps[i]); else dev = NULL; XHCI_SLOTDEV_PTR(sc, i) = dev; } else { /* error */ ret = EINVAL; goto done; } if (dev == NULL) continue; SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(pi->pi_vmctx, dev->dev_ctx, XHCI_GADDR_SIZE(dev->dev_ctx), true, meta, ret, done); if (dev->dev_ctx != NULL) { for (j = 1; j < XHCI_MAX_ENDPOINTS; j++) { ret = pci_xhci_snapshot_ep(sc, dev, j, meta); if (ret != 0) goto done; } } SNAPSHOT_VAR_OR_LEAVE(dev->dev_slotstate, meta, ret, done); /* devices[i]->dev_sc */ dev->dev_ue->ue_snapshot(dev->dev_sc, meta); /* devices[i]->hci */ SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_address, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_port, meta, ret, done); } SNAPSHOT_VAR_OR_LEAVE(sc->usb2_port_start, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->usb3_port_start, meta, ret, done); done: return (ret); } #endif static const struct pci_devemu pci_de_xhci = { .pe_emu = "xhci", .pe_init = pci_xhci_init, .pe_legacy_config = pci_xhci_legacy_config, .pe_barwrite = pci_xhci_write, .pe_barread = pci_xhci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_xhci_snapshot, #endif }; PCI_EMUL_SET(pci_de_xhci); diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c index 86c76e679e53..5f643c9ceb50 100644 --- a/usr.sbin/bhyve/snapshot.c +++ b/usr.sbin/bhyve/snapshot.c @@ -1,1613 +1,1613 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2016 Flavius Anton * Copyright (c) 2016 Mihai Tiganus * Copyright (c) 2016-2019 Mihai Carabas * Copyright (c) 2017-2019 Darius Mihai * Copyright (c) 2017-2019 Elena Mihailescu * Copyright (c) 2018-2019 Sergiu Weisz * All rights reserved. * The bhyve-snapshot feature was developed under sponsorships * from Matthew Grooms. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #ifdef __amd64__ #include "amd64/atkbdc.h" #endif #include "debug.h" #include "ipc.h" #include "mem.h" #include "pci_emul.h" #include "snapshot.h" #include #include struct spinner_info { const size_t *crtval; const size_t maxval; const size_t total; }; extern int guest_ncpus; static struct winsize winsize; static sig_t old_winch_handler; #define KB (1024UL) #define MB (1024UL * KB) #define GB (1024UL * MB) #define SNAPSHOT_CHUNK (4 * MB) #define PROG_BUF_SZ (8192) #define SNAPSHOT_BUFFER_SIZE (20 * MB) #define JSON_KERNEL_ARR_KEY "kern_structs" #define JSON_DEV_ARR_KEY "devices" #define JSON_BASIC_METADATA_KEY "basic metadata" #define JSON_SNAPSHOT_REQ_KEY "device" #define JSON_SIZE_KEY "size" #define JSON_FILE_OFFSET_KEY "file_offset" #define JSON_NCPUS_KEY "ncpus" #define JSON_VMNAME_KEY "vmname" #define JSON_MEMSIZE_KEY "memsize" #define JSON_MEMFLAGS_KEY "memflags" #define min(a,b) \ ({ \ __typeof__ (a) _a = (a); \ __typeof__ (b) _b = (b); \ _a < _b ? _a : _b; \ }) static const struct vm_snapshot_kern_info snapshot_kern_structs[] = { { "vhpet", STRUCT_VHPET }, { "vm", STRUCT_VM }, { "vioapic", STRUCT_VIOAPIC }, { "vlapic", STRUCT_VLAPIC }, { "vmcx", STRUCT_VMCX }, { "vatpit", STRUCT_VATPIT }, { "vatpic", STRUCT_VATPIC }, { "vpmtmr", STRUCT_VPMTMR }, { "vrtc", STRUCT_VRTC }, }; static cpuset_t vcpus_active, vcpus_suspended; static pthread_mutex_t vcpu_lock; static pthread_cond_t vcpus_idle, vcpus_can_run; static bool checkpoint_active; /* * TODO: Harden this function and all of its callers since 'base_str' is a user * provided string. 
*/ static char * strcat_extension(const char *base_str, const char *ext) { char *res; size_t base_len, ext_len; base_len = strnlen(base_str, NAME_MAX); ext_len = strnlen(ext, NAME_MAX); if (base_len + ext_len > NAME_MAX) { - fprintf(stderr, "Filename exceeds maximum length.\n"); + EPRINTLN("Filename exceeds maximum length."); return (NULL); } res = malloc(base_len + ext_len + 1); if (res == NULL) { - perror("Failed to allocate memory."); + EPRINTLN("Failed to allocate memory: %s", strerror(errno)); return (NULL); } memcpy(res, base_str, base_len); memcpy(res + base_len, ext, ext_len); res[base_len + ext_len] = 0; return (res); } void destroy_restore_state(struct restore_state *rstate) { if (rstate == NULL) { - fprintf(stderr, "Attempting to destroy NULL restore struct.\n"); + EPRINTLN("Attempting to destroy NULL restore struct."); return; } if (rstate->kdata_map != MAP_FAILED) munmap(rstate->kdata_map, rstate->kdata_len); if (rstate->kdata_fd > 0) close(rstate->kdata_fd); if (rstate->vmmem_fd > 0) close(rstate->vmmem_fd); if (rstate->meta_root_obj != NULL) ucl_object_unref(rstate->meta_root_obj); if (rstate->meta_parser != NULL) ucl_parser_free(rstate->meta_parser); } static int load_vmmem_file(const char *filename, struct restore_state *rstate) { struct stat sb; int err; rstate->vmmem_fd = open(filename, O_RDONLY); if (rstate->vmmem_fd < 0) { perror("Failed to open restore file"); return (-1); } err = fstat(rstate->vmmem_fd, &sb); if (err < 0) { perror("Failed to stat restore file"); goto err_load_vmmem; } if (sb.st_size == 0) { fprintf(stderr, "Restore file is empty.\n"); goto err_load_vmmem; } rstate->vmmem_len = sb.st_size; return (0); err_load_vmmem: if (rstate->vmmem_fd > 0) close(rstate->vmmem_fd); return (-1); } static int load_kdata_file(const char *filename, struct restore_state *rstate) { struct stat sb; int err; rstate->kdata_fd = open(filename, O_RDONLY); if (rstate->kdata_fd < 0) { perror("Failed to open kernel data file"); return (-1); } err = fstat(rstate->kdata_fd, &sb); if (err < 0) { perror("Failed to stat kernel data file"); goto err_load_kdata; } if (sb.st_size == 0) { fprintf(stderr, "Kernel data file is empty.\n"); goto err_load_kdata; } rstate->kdata_len = sb.st_size; rstate->kdata_map = mmap(NULL, rstate->kdata_len, PROT_READ, MAP_SHARED, rstate->kdata_fd, 0); if (rstate->kdata_map == MAP_FAILED) { perror("Failed to map restore file"); goto err_load_kdata; } return (0); err_load_kdata: if (rstate->kdata_fd > 0) close(rstate->kdata_fd); return (-1); } static int load_metadata_file(const char *filename, struct restore_state *rstate) { ucl_object_t *obj; struct ucl_parser *parser; int err; parser = ucl_parser_new(UCL_PARSER_DEFAULT); if (parser == NULL) { fprintf(stderr, "Failed to initialize UCL parser.\n"); err = -1; goto err_load_metadata; } err = ucl_parser_add_file(parser, filename); if (err == 0) { fprintf(stderr, "Failed to parse metadata file: '%s'\n", filename); err = -1; goto err_load_metadata; } obj = ucl_parser_get_object(parser); if (obj == NULL) { fprintf(stderr, "Failed to parse object.\n"); err = -1; goto err_load_metadata; } rstate->meta_parser = parser; rstate->meta_root_obj = (ucl_object_t *)obj; return (0); err_load_metadata: if (parser != NULL) ucl_parser_free(parser); return (err); } int load_restore_file(const char *filename, struct restore_state *rstate) { int err = 0; char *kdata_filename = NULL, *meta_filename = NULL; assert(filename != NULL); assert(rstate != NULL); memset(rstate, 0, sizeof(*rstate)); rstate->kdata_map = MAP_FAILED; err = 
load_vmmem_file(filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest RAM file.\n"); goto err_restore; } kdata_filename = strcat_extension(filename, ".kern"); if (kdata_filename == NULL) { fprintf(stderr, "Failed to construct kernel data filename.\n"); goto err_restore; } err = load_kdata_file(kdata_filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest kernel data file.\n"); goto err_restore; } meta_filename = strcat_extension(filename, ".meta"); if (meta_filename == NULL) { fprintf(stderr, "Failed to construct kernel metadata filename.\n"); goto err_restore; } err = load_metadata_file(meta_filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest metadata file.\n"); goto err_restore; } return (0); err_restore: destroy_restore_state(rstate); if (kdata_filename != NULL) free(kdata_filename); if (meta_filename != NULL) free(meta_filename); return (-1); } #define JSON_GET_INT_OR_RETURN(key, obj, result_ptr, ret) \ do { \ const ucl_object_t *obj__; \ obj__ = ucl_object_lookup(obj, key); \ if (obj__ == NULL) { \ fprintf(stderr, "Missing key: '%s'", key); \ return (ret); \ } \ if (!ucl_object_toint_safe(obj__, result_ptr)) { \ fprintf(stderr, "Cannot convert '%s' value to int.", key); \ return (ret); \ } \ } while(0) #define JSON_GET_STRING_OR_RETURN(key, obj, result_ptr, ret) \ do { \ const ucl_object_t *obj__; \ obj__ = ucl_object_lookup(obj, key); \ if (obj__ == NULL) { \ fprintf(stderr, "Missing key: '%s'", key); \ return (ret); \ } \ if (!ucl_object_tostring_safe(obj__, result_ptr)) { \ fprintf(stderr, "Cannot convert '%s' value to string.", key); \ return (ret); \ } \ } while(0) static void * lookup_check_dev(const char *dev_name, struct restore_state *rstate, const ucl_object_t *obj, size_t *data_size) { const char *snapshot_req; int64_t size, file_offset; snapshot_req = NULL; JSON_GET_STRING_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, &snapshot_req, NULL); assert(snapshot_req != NULL); if (!strcmp(snapshot_req, dev_name)) { JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, &size, NULL); assert(size >= 0); JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, &file_offset, NULL); assert(file_offset >= 0); assert((uint64_t)file_offset + size <= rstate->kdata_len); *data_size = (size_t)size; return ((uint8_t *)rstate->kdata_map + file_offset); } return (NULL); } static void * lookup_dev(const char *dev_name, const char *key, struct restore_state *rstate, size_t *data_size) { const ucl_object_t *devs = NULL, *obj = NULL; ucl_object_iter_t it = NULL; void *ret; devs = ucl_object_lookup(rstate->meta_root_obj, key); if (devs == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_DEV_ARR_KEY); return (NULL); } if (ucl_object_type(devs) != UCL_ARRAY) { fprintf(stderr, "Object '%s' is not an array.\n", JSON_DEV_ARR_KEY); return (NULL); } while ((obj = ucl_object_iterate(devs, &it, true)) != NULL) { ret = lookup_check_dev(dev_name, rstate, obj, data_size); if (ret != NULL) return (ret); } return (NULL); } static const ucl_object_t * lookup_basic_metadata_object(struct restore_state *rstate) { const ucl_object_t *basic_meta_obj = NULL; basic_meta_obj = ucl_object_lookup(rstate->meta_root_obj, JSON_BASIC_METADATA_KEY); if (basic_meta_obj == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_BASIC_METADATA_KEY); return (NULL); } if (ucl_object_type(basic_meta_obj) != UCL_OBJECT) { fprintf(stderr, "Object '%s' is not a JSON object.\n", JSON_BASIC_METADATA_KEY); return (NULL); } return (basic_meta_obj); } const char * lookup_vmname(struct 
restore_state *rstate) { const char *vmname; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (NULL); JSON_GET_STRING_OR_RETURN(JSON_VMNAME_KEY, obj, &vmname, NULL); return (vmname); } int lookup_memflags(struct restore_state *rstate) { int64_t memflags; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_MEMFLAGS_KEY, obj, &memflags, 0); return ((int)memflags); } size_t lookup_memsize(struct restore_state *rstate) { int64_t memsize; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_MEMSIZE_KEY, obj, &memsize, 0); if (memsize < 0) memsize = 0; return ((size_t)memsize); } int lookup_guest_ncpus(struct restore_state *rstate) { int64_t ncpus; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_NCPUS_KEY, obj, &ncpus, 0); return ((int)ncpus); } static void winch_handler(int signal __unused) { #ifdef TIOCGWINSZ ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); #endif /* TIOCGWINSZ */ } static int print_progress(size_t crtval, const size_t maxval) { size_t rc; double crtval_gb, maxval_gb; size_t i, win_width, prog_start, prog_done, prog_end; int mval_len; static char prog_buf[PROG_BUF_SZ]; static const size_t len = sizeof(prog_buf); static size_t div; static const char *div_str; static char wip_bar[] = { '/', '-', '\\', '|' }; static int wip_idx = 0; if (maxval == 0) { printf("[0B / 0B]\r\n"); return (0); } if (crtval > maxval) crtval = maxval; if (maxval > 10 * GB) { div = GB; div_str = "GiB"; } else if (maxval > 10 * MB) { div = MB; div_str = "MiB"; } else { div = KB; div_str = "KiB"; } crtval_gb = (double) crtval / div; maxval_gb = (double) maxval / div; rc = snprintf(prog_buf, len, "%.03lf", maxval_gb); if (rc == len) { fprintf(stderr, "Maxval too big\n"); return (-1); } mval_len = rc; rc = snprintf(prog_buf, len, "\r[%*.03lf%s / %.03lf%s] |", mval_len, crtval_gb, div_str, maxval_gb, div_str); if (rc == len) { fprintf(stderr, "Buffer too small to print progress\n"); return (-1); } win_width = min(winsize.ws_col, len); prog_start = rc; if (prog_start < (win_width - 2)) { prog_end = win_width - prog_start - 2; prog_done = prog_end * (crtval_gb / maxval_gb); for (i = prog_start; i < prog_start + prog_done; i++) prog_buf[i] = '#'; if (crtval != maxval) { prog_buf[i] = wip_bar[wip_idx]; wip_idx = (wip_idx + 1) % sizeof(wip_bar); i++; } else { prog_buf[i++] = '#'; } for (; i < win_width - 2; i++) prog_buf[i] = '_'; prog_buf[win_width - 2] = '|'; } prog_buf[win_width - 1] = '\0'; write(STDOUT_FILENO, prog_buf, win_width); return (0); } static void * snapshot_spinner_cb(void *arg) { int rc; size_t crtval, maxval, total; struct spinner_info *si; struct timespec ts; si = arg; if (si == NULL) pthread_exit(NULL); ts.tv_sec = 0; ts.tv_nsec = 50 * 1000 * 1000; /* 50 ms sleep time */ do { crtval = *si->crtval; maxval = si->maxval; total = si->total; rc = print_progress(crtval, total); if (rc < 0) { fprintf(stderr, "Failed to parse progress\n"); break; } nanosleep(&ts, NULL); } while (crtval < maxval); pthread_exit(NULL); return NULL; } static int vm_snapshot_mem_part(const int snapfd, const size_t foff, void *src, const size_t len, const size_t totalmem, const bool op_wr) { int rc; size_t part_done, todo, rem; ssize_t done; bool show_progress; pthread_t spinner_th; struct spinner_info *si; if (lseek(snapfd, foff, SEEK_SET) < 0) { perror("Failed to 
change file offset"); return (-1); } show_progress = false; if (isatty(STDIN_FILENO) && (winsize.ws_col != 0)) show_progress = true; part_done = foff; rem = len; if (show_progress) { si = &(struct spinner_info) { .crtval = &part_done, .maxval = foff + len, .total = totalmem }; rc = pthread_create(&spinner_th, 0, snapshot_spinner_cb, si); if (rc) { perror("Unable to create spinner thread"); show_progress = false; } } while (rem > 0) { if (show_progress) todo = min(SNAPSHOT_CHUNK, rem); else todo = rem; if (op_wr) done = write(snapfd, src, todo); else done = read(snapfd, src, todo); if (done < 0) { perror("Failed to write in file"); return (-1); } src = (uint8_t *)src + done; part_done += done; rem -= done; } if (show_progress) { rc = pthread_join(spinner_th, NULL); if (rc) perror("Unable to end spinner thread"); } return (0); } static size_t vm_snapshot_mem(struct vmctx *ctx, int snapfd, size_t memsz, const bool op_wr) { int ret; size_t lowmem, highmem, totalmem; char *baseaddr; ret = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem); if (ret) { fprintf(stderr, "%s: unable to retrieve guest memory size\r\n", __func__); return (0); } totalmem = lowmem + highmem; if ((op_wr == false) && (totalmem != memsz)) { fprintf(stderr, "%s: mem size mismatch: %ld vs %ld\r\n", __func__, totalmem, memsz); return (0); } winsize.ws_col = 80; #ifdef TIOCGWINSZ ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); #endif /* TIOCGWINSZ */ old_winch_handler = signal(SIGWINCH, winch_handler); ret = vm_snapshot_mem_part(snapfd, 0, baseaddr, lowmem, totalmem, op_wr); if (ret) { fprintf(stderr, "%s: Could not %s lowmem\r\n", __func__, op_wr ? "write" : "read"); totalmem = 0; goto done; } if (highmem == 0) goto done; ret = vm_snapshot_mem_part(snapfd, lowmem, baseaddr + 4*GB, highmem, totalmem, op_wr); if (ret) { fprintf(stderr, "%s: Could not %s highmem\r\n", __func__, op_wr ? 
"write" : "read"); totalmem = 0; goto done; } done: printf("\r\n"); signal(SIGWINCH, old_winch_handler); return (totalmem); } int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate) { size_t restored; restored = vm_snapshot_mem(ctx, rstate->vmmem_fd, rstate->vmmem_len, false); if (restored != rstate->vmmem_len) return (-1); return (0); } int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate) { for (unsigned i = 0; i < nitems(snapshot_kern_structs); i++) { const struct vm_snapshot_kern_info *info; struct vm_snapshot_meta *meta; void *data; size_t size; info = &snapshot_kern_structs[i]; data = lookup_dev(info->struct_name, JSON_KERNEL_ARR_KEY, rstate, &size); if (data == NULL) errx(EX_DATAERR, "Cannot find kern struct %s", info->struct_name); if (size == 0) errx(EX_DATAERR, "data with zero size for %s", info->struct_name); meta = &(struct vm_snapshot_meta) { .dev_name = info->struct_name, .dev_req = info->req, .buffer.buf_start = data, .buffer.buf_size = size, .buffer.buf = data, .buffer.buf_rem = size, .op = VM_SNAPSHOT_RESTORE, }; if (vm_snapshot_req(ctx, meta)) err(EX_DATAERR, "Failed to restore %s", info->struct_name); } return (0); } static int vm_restore_device(struct restore_state *rstate, vm_snapshot_dev_cb func, const char *name, void *data) { void *dev_ptr; size_t dev_size; int ret; struct vm_snapshot_meta *meta; dev_ptr = lookup_dev(name, JSON_DEV_ARR_KEY, rstate, &dev_size); if (dev_ptr == NULL) { EPRINTLN("Failed to lookup dev: %s", name); return (EINVAL); } if (dev_size == 0) { EPRINTLN("Restore device size is 0: %s", name); return (EINVAL); } meta = &(struct vm_snapshot_meta) { .dev_name = name, .dev_data = data, .buffer.buf_start = dev_ptr, .buffer.buf_size = dev_size, .buffer.buf = dev_ptr, .buffer.buf_rem = dev_size, .op = VM_SNAPSHOT_RESTORE, }; ret = func(meta); if (ret != 0) { EPRINTLN("Failed to restore dev: %s %d", name, ret); return (ret); } return (0); } int vm_restore_devices(struct restore_state *rstate) { int ret; struct pci_devinst *pdi = NULL; while ((pdi = pci_next(pdi)) != NULL) { ret = vm_restore_device(rstate, pci_snapshot, pdi->pi_name, pdi); if (ret) return (ret); } #ifdef __amd64__ ret = vm_restore_device(rstate, atkbdc_snapshot, "atkbdc", NULL); #else ret = 0; #endif return (ret); } int vm_pause_devices(void) { int ret; struct pci_devinst *pdi = NULL; while ((pdi = pci_next(pdi)) != NULL) { ret = pci_pause(pdi); if (ret) { EPRINTLN("Cannot pause dev %s: %d", pdi->pi_name, ret); return (ret); } } return (0); } int vm_resume_devices(void) { int ret; struct pci_devinst *pdi = NULL; while ((pdi = pci_next(pdi)) != NULL) { ret = pci_resume(pdi); if (ret) { EPRINTLN("Cannot resume '%s': %d", pdi->pi_name, ret); return (ret); } } return (0); } static int vm_save_kern_struct(struct vmctx *ctx, int data_fd, xo_handle_t *xop, const char *array_key, struct vm_snapshot_meta *meta, off_t *offset) { int ret; size_t data_size; ssize_t write_cnt; ret = vm_snapshot_req(ctx, meta); if (ret != 0) { fprintf(stderr, "%s: Failed to snapshot struct %s\r\n", __func__, meta->dev_name); ret = -1; goto done; } data_size = vm_get_snapshot_size(meta); /* XXX-MJ no handling for short writes. */ write_cnt = write(data_fd, meta->buffer.buf_start, data_size); if (write_cnt < 0 || (size_t)write_cnt != data_size) { perror("Failed to write all snapshotted data."); ret = -1; goto done; } /* Write metadata. 
*/ xo_open_instance_h(xop, array_key); xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name); xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); xo_close_instance_h(xop, JSON_KERNEL_ARR_KEY); *offset += data_size; done: return (ret); } static int vm_save_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) { int ret, error; size_t buf_size, i, offset; char *buffer; struct vm_snapshot_meta *meta; error = 0; offset = 0; buf_size = SNAPSHOT_BUFFER_SIZE; buffer = malloc(SNAPSHOT_BUFFER_SIZE * sizeof(char)); if (buffer == NULL) { error = ENOMEM; perror("Failed to allocate memory for snapshot buffer"); goto err_vm_snapshot_kern_data; } meta = &(struct vm_snapshot_meta) { .buffer.buf_start = buffer, .buffer.buf_size = buf_size, .op = VM_SNAPSHOT_SAVE, }; xo_open_list_h(xop, JSON_KERNEL_ARR_KEY); for (i = 0; i < nitems(snapshot_kern_structs); i++) { meta->dev_name = snapshot_kern_structs[i].struct_name; meta->dev_req = snapshot_kern_structs[i].req; memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); meta->buffer.buf = meta->buffer.buf_start; meta->buffer.buf_rem = meta->buffer.buf_size; ret = vm_save_kern_struct(ctx, data_fd, xop, JSON_DEV_ARR_KEY, meta, &offset); if (ret != 0) { error = -1; goto err_vm_snapshot_kern_data; } } xo_close_list_h(xop, JSON_KERNEL_ARR_KEY); err_vm_snapshot_kern_data: if (buffer != NULL) free(buffer); return (error); } static int vm_snapshot_basic_metadata(struct vmctx *ctx, xo_handle_t *xop, size_t memsz) { xo_open_container_h(xop, JSON_BASIC_METADATA_KEY); xo_emit_h(xop, "{:" JSON_NCPUS_KEY "/%ld}\n", guest_ncpus); xo_emit_h(xop, "{:" JSON_VMNAME_KEY "/%s}\n", vm_get_name(ctx)); xo_emit_h(xop, "{:" JSON_MEMSIZE_KEY "/%lu}\n", memsz); xo_emit_h(xop, "{:" JSON_MEMFLAGS_KEY "/%d}\n", vm_get_memflags(ctx)); xo_close_container_h(xop, JSON_BASIC_METADATA_KEY); return (0); } static int vm_snapshot_dev_write_data(int data_fd, xo_handle_t *xop, const char *array_key, struct vm_snapshot_meta *meta, off_t *offset) { ssize_t ret; size_t data_size; data_size = vm_get_snapshot_size(meta); /* XXX-MJ no handling for short writes. */ ret = write(data_fd, meta->buffer.buf_start, data_size); if (ret < 0 || (size_t)ret != data_size) { perror("Failed to write all snapshotted data."); return (-1); } /* Write metadata. 
*/ xo_open_instance_h(xop, array_key); xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name); xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); xo_close_instance_h(xop, array_key); *offset += data_size; return (0); } static int vm_snapshot_device(vm_snapshot_dev_cb func, const char *dev_name, void *devdata, int data_fd, xo_handle_t *xop, struct vm_snapshot_meta *meta, off_t *offset) { int ret; memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); meta->buffer.buf = meta->buffer.buf_start; meta->buffer.buf_rem = meta->buffer.buf_size; meta->dev_name = dev_name; meta->dev_data = devdata; ret = func(meta); if (ret != 0) { EPRINTLN("Failed to snapshot %s; ret=%d", dev_name, ret); return (ret); } ret = vm_snapshot_dev_write_data(data_fd, xop, JSON_DEV_ARR_KEY, meta, offset); if (ret != 0) return (ret); return (0); } static int vm_snapshot_devices(int data_fd, xo_handle_t *xop) { int ret; off_t offset; void *buffer; size_t buf_size; struct vm_snapshot_meta *meta; struct pci_devinst *pdi; buf_size = SNAPSHOT_BUFFER_SIZE; offset = lseek(data_fd, 0, SEEK_CUR); if (offset < 0) { perror("Failed to get data file current offset."); return (-1); } buffer = malloc(buf_size); if (buffer == NULL) { perror("Failed to allocate memory for snapshot buffer"); ret = ENOSPC; goto snapshot_err; } meta = &(struct vm_snapshot_meta) { .buffer.buf_start = buffer, .buffer.buf_size = buf_size, .op = VM_SNAPSHOT_SAVE, }; xo_open_list_h(xop, JSON_DEV_ARR_KEY); /* Save PCI devices */ pdi = NULL; while ((pdi = pci_next(pdi)) != NULL) { ret = vm_snapshot_device(pci_snapshot, pdi->pi_name, pdi, data_fd, xop, meta, &offset); if (ret != 0) goto snapshot_err; } #ifdef __amd64__ ret = vm_snapshot_device(atkbdc_snapshot, "atkbdc", NULL, data_fd, xop, meta, &offset); #else ret = 0; #endif xo_close_list_h(xop, JSON_DEV_ARR_KEY); snapshot_err: if (buffer != NULL) free(buffer); return (ret); } void checkpoint_cpu_add(int vcpu) { pthread_mutex_lock(&vcpu_lock); CPU_SET(vcpu, &vcpus_active); if (checkpoint_active) { CPU_SET(vcpu, &vcpus_suspended); while (checkpoint_active) pthread_cond_wait(&vcpus_can_run, &vcpu_lock); CPU_CLR(vcpu, &vcpus_suspended); } pthread_mutex_unlock(&vcpu_lock); } /* * When a vCPU is suspended for any reason, it calls * checkpoint_cpu_suspend(). This records that the vCPU is idle. * Before returning from suspension, checkpoint_cpu_resume() is * called. In suspend we note that the vCPU is idle. In resume we * pause the vCPU thread until the checkpoint is complete. The reason * for the two-step process is that vCPUs might already be stopped in * the debug server when a checkpoint is requested. This approach * allows us to account for and handle those vCPUs. 
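 *
 * For example, a vCPU already halted in the debug server has set its bit in
 * vcpus_suspended via checkpoint_cpu_suspend() before the checkpoint begins,
 * so vm_vcpu_pause() does not wait on it; once the debugger releases it, the
 * vCPU blocks in checkpoint_cpu_resume() until checkpoint_active is cleared.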
*/ void checkpoint_cpu_suspend(int vcpu) { pthread_mutex_lock(&vcpu_lock); CPU_SET(vcpu, &vcpus_suspended); if (checkpoint_active && CPU_CMP(&vcpus_active, &vcpus_suspended) == 0) pthread_cond_signal(&vcpus_idle); pthread_mutex_unlock(&vcpu_lock); } void checkpoint_cpu_resume(int vcpu) { pthread_mutex_lock(&vcpu_lock); while (checkpoint_active) pthread_cond_wait(&vcpus_can_run, &vcpu_lock); CPU_CLR(vcpu, &vcpus_suspended); pthread_mutex_unlock(&vcpu_lock); } static void vm_vcpu_pause(struct vmctx *ctx) { pthread_mutex_lock(&vcpu_lock); checkpoint_active = true; vm_suspend_all_cpus(ctx); while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0) pthread_cond_wait(&vcpus_idle, &vcpu_lock); pthread_mutex_unlock(&vcpu_lock); } static void vm_vcpu_resume(struct vmctx *ctx) { pthread_mutex_lock(&vcpu_lock); checkpoint_active = false; pthread_mutex_unlock(&vcpu_lock); vm_resume_all_cpus(ctx); pthread_cond_broadcast(&vcpus_can_run); } static int vm_checkpoint(struct vmctx *ctx, int fddir, const char *checkpoint_file, bool stop_vm) { int fd_checkpoint = 0, kdata_fd = 0, fd_meta; int ret = 0; int error = 0; size_t memsz; xo_handle_t *xop = NULL; char *meta_filename = NULL; char *kdata_filename = NULL; FILE *meta_file = NULL; kdata_filename = strcat_extension(checkpoint_file, ".kern"); if (kdata_filename == NULL) { fprintf(stderr, "Failed to construct kernel data filename.\n"); return (-1); } kdata_fd = openat(fddir, kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); if (kdata_fd < 0) { perror("Failed to open kernel data snapshot file."); error = -1; goto done; } fd_checkpoint = openat(fddir, checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700); if (fd_checkpoint < 0) { perror("Failed to create checkpoint file"); error = -1; goto done; } meta_filename = strcat_extension(checkpoint_file, ".meta"); if (meta_filename == NULL) { fprintf(stderr, "Failed to construct vm metadata filename.\n"); goto done; } fd_meta = openat(fddir, meta_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); if (fd_meta != -1) meta_file = fdopen(fd_meta, "w"); if (meta_file == NULL) { perror("Failed to open vm metadata snapshot file."); close(fd_meta); goto done; } xop = xo_create_to_file(meta_file, XO_STYLE_JSON, XOF_PRETTY); if (xop == NULL) { perror("Failed to get libxo handle on metadata file."); goto done; } vm_vcpu_pause(ctx); ret = vm_pause_devices(); if (ret != 0) { fprintf(stderr, "Could not pause devices\r\n"); error = ret; goto done; } memsz = vm_snapshot_mem(ctx, fd_checkpoint, 0, true); if (memsz == 0) { perror("Could not write guest memory to file"); error = -1; goto done; } ret = vm_snapshot_basic_metadata(ctx, xop, memsz); if (ret != 0) { fprintf(stderr, "Failed to snapshot vm basic metadata.\n"); error = -1; goto done; } ret = vm_save_kern_structs(ctx, kdata_fd, xop); if (ret != 0) { fprintf(stderr, "Failed to snapshot vm kernel data.\n"); error = -1; goto done; } ret = vm_snapshot_devices(kdata_fd, xop); if (ret != 0) { fprintf(stderr, "Failed to snapshot device state.\n"); error = -1; goto done; } xo_finish_h(xop); if (stop_vm) { vm_destroy(ctx); exit(0); } done: ret = vm_resume_devices(); if (ret != 0) fprintf(stderr, "Could not resume devices\r\n"); vm_vcpu_resume(ctx); if (fd_checkpoint > 0) close(fd_checkpoint); if (meta_filename != NULL) free(meta_filename); if (kdata_filename != NULL) free(kdata_filename); if (xop != NULL) xo_destroy(xop); if (meta_file != NULL) fclose(meta_file); if (kdata_fd > 0) close(kdata_fd); return (error); } static int handle_message(struct vmctx *ctx, nvlist_t *nvl) { const char *cmd; 
struct ipc_command **ipc_cmd; if (!nvlist_exists_string(nvl, "cmd")) return (EINVAL); cmd = nvlist_get_string(nvl, "cmd"); IPC_COMMAND_FOREACH(ipc_cmd, ipc_cmd_set) { if (strcmp(cmd, (*ipc_cmd)->name) == 0) return ((*ipc_cmd)->handler(ctx, nvl)); } return (EOPNOTSUPP); } /* * Listen for commands from bhyvectl */ void * checkpoint_thread(void *param) { int fd; struct checkpoint_thread_info *thread_info; nvlist_t *nvl; pthread_set_name_np(pthread_self(), "checkpoint thread"); thread_info = (struct checkpoint_thread_info *)param; while ((fd = accept(thread_info->socket_fd, NULL, NULL)) != -1) { nvl = nvlist_recv(fd, 0); if (nvl != NULL) handle_message(thread_info->ctx, nvl); else EPRINTLN("nvlist_recv() failed: %s", strerror(errno)); close(fd); nvlist_destroy(nvl); } return (NULL); } static int vm_do_checkpoint(struct vmctx *ctx, const nvlist_t *nvl) { int error; if (!nvlist_exists_string(nvl, "filename") || !nvlist_exists_bool(nvl, "suspend") || !nvlist_exists_descriptor(nvl, "fddir")) error = EINVAL; else error = vm_checkpoint(ctx, nvlist_get_descriptor(nvl, "fddir"), nvlist_get_string(nvl, "filename"), nvlist_get_bool(nvl, "suspend")); return (error); } IPC_COMMAND(ipc_cmd_set, checkpoint, vm_do_checkpoint); void init_snapshot(void) { int err; err = pthread_mutex_init(&vcpu_lock, NULL); if (err != 0) errc(1, err, "checkpoint mutex init"); err = pthread_cond_init(&vcpus_idle, NULL); if (err != 0) errc(1, err, "checkpoint cv init (vcpus_idle)"); err = pthread_cond_init(&vcpus_can_run, NULL); if (err != 0) errc(1, err, "checkpoint cv init (vcpus_can_run)"); } /* * Create the listening socket for IPC with bhyvectl */ int init_checkpoint_thread(struct vmctx *ctx) { struct checkpoint_thread_info *checkpoint_info = NULL; struct sockaddr_un addr; int socket_fd; pthread_t checkpoint_pthread; int err; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif memset(&addr, 0, sizeof(addr)); socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); if (socket_fd < 0) { EPRINTLN("Socket creation failed: %s", strerror(errno)); err = -1; goto fail; } addr.sun_family = AF_UNIX; snprintf(addr.sun_path, sizeof(addr.sun_path), "%s%s", BHYVE_RUN_DIR, vm_get_name(ctx)); addr.sun_len = SUN_LEN(&addr); unlink(addr.sun_path); if (bind(socket_fd, (struct sockaddr *)&addr, addr.sun_len) != 0) { EPRINTLN("Failed to bind socket \"%s\": %s\n", addr.sun_path, strerror(errno)); err = -1; goto fail; } if (listen(socket_fd, 10) < 0) { EPRINTLN("ipc socket listen: %s\n", strerror(errno)); err = errno; goto fail; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_ACCEPT, CAP_READ, CAP_RECV, CAP_WRITE, CAP_SEND, CAP_GETSOCKOPT); if (caph_rights_limit(socket_fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif checkpoint_info = calloc(1, sizeof(*checkpoint_info)); checkpoint_info->ctx = ctx; checkpoint_info->socket_fd = socket_fd; err = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread, checkpoint_info); if (err != 0) goto fail; return (0); fail: free(checkpoint_info); if (socket_fd > 0) close(socket_fd); unlink(addr.sun_path); return (err); } void vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op) { const char *__op; if (op == VM_SNAPSHOT_SAVE) __op = "save"; else if (op == VM_SNAPSHOT_RESTORE) __op = "restore"; else __op = "unknown"; fprintf(stderr, "%s: snapshot-%s failed for %s\r\n", __func__, __op, bufname); } int vm_snapshot_buf(void *data, size_t data_size, struct vm_snapshot_meta *meta) { struct vm_snapshot_buffer *buffer; int op; buffer = &meta->buffer; op = meta->op; if 
(buffer->buf_rem < data_size) { fprintf(stderr, "%s: buffer too small\r\n", __func__); return (E2BIG); } if (op == VM_SNAPSHOT_SAVE) memcpy(buffer->buf, data, data_size); else if (op == VM_SNAPSHOT_RESTORE) memcpy(data, buffer->buf, data_size); else return (EINVAL); buffer->buf += data_size; buffer->buf_rem -= data_size; return (0); } size_t vm_get_snapshot_size(struct vm_snapshot_meta *meta) { size_t length; struct vm_snapshot_buffer *buffer; buffer = &meta->buffer; if (buffer->buf_size < buffer->buf_rem) { fprintf(stderr, "%s: Invalid buffer: size = %zu, rem = %zu\r\n", __func__, buffer->buf_size, buffer->buf_rem); length = 0; } else { length = buffer->buf_size - buffer->buf_rem; } return (length); } int vm_snapshot_guest2host_addr(struct vmctx *ctx, void **addrp, size_t len, bool restore_null, struct vm_snapshot_meta *meta) { int ret; vm_paddr_t gaddr; if (meta->op == VM_SNAPSHOT_SAVE) { gaddr = paddr_host2guest(ctx, *addrp); if (gaddr == (vm_paddr_t) -1) { if (!restore_null || (restore_null && (*addrp != NULL))) { ret = EFAULT; goto done; } } SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); } else if (meta->op == VM_SNAPSHOT_RESTORE) { SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); if (gaddr == (vm_paddr_t) -1) { if (!restore_null) { ret = EFAULT; goto done; } } *addrp = paddr_guest2host(ctx, gaddr, len); } else { ret = EINVAL; } done: return (ret); } int vm_snapshot_buf_cmp(void *data, size_t data_size, struct vm_snapshot_meta *meta) { struct vm_snapshot_buffer *buffer; int op; int ret; buffer = &meta->buffer; op = meta->op; if (buffer->buf_rem < data_size) { fprintf(stderr, "%s: buffer too small\r\n", __func__); ret = E2BIG; goto done; } if (op == VM_SNAPSHOT_SAVE) { ret = 0; memcpy(buffer->buf, data, data_size); } else if (op == VM_SNAPSHOT_RESTORE) { ret = memcmp(data, buffer->buf, data_size); } else { ret = EINVAL; goto done; } buffer->buf += data_size; buffer->buf_rem -= data_size; done: return (ret); } diff --git a/usr.sbin/bhyve/snapshot.h b/usr.sbin/bhyve/snapshot.h index fdfbbcbe0cb8..179aafb6471d 100644 --- a/usr.sbin/bhyve/snapshot.h +++ b/usr.sbin/bhyve/snapshot.h @@ -1,128 +1,127 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2016 Flavius Anton * Copyright (c) 2016 Mihai Tiganus * Copyright (c) 2016-2019 Mihai Carabas * Copyright (c) 2017-2019 Darius Mihai * Copyright (c) 2017-2019 Elena Mihailescu * Copyright (c) 2018-2019 Sergiu Weisz * All rights reserved. * The bhyve-snapshot feature was developed under sponsorships * from Matthew Grooms. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _BHYVE_SNAPSHOT_ #define _BHYVE_SNAPSHOT_ #include #include #include #define BHYVE_RUN_DIR "/var/run/bhyve/" #define MAX_SNAPSHOT_FILENAME PATH_MAX struct vmctx; struct restore_state { int kdata_fd; int vmmem_fd; void *kdata_map; size_t kdata_len; size_t vmmem_len; struct ucl_parser *meta_parser; ucl_object_t *meta_root_obj; }; struct checkpoint_thread_info { struct vmctx *ctx; int socket_fd; }; typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *); typedef int (*vm_pause_dev_cb) (const char *); typedef int (*vm_resume_dev_cb) (const char *); struct vm_snapshot_dev_info { const char *dev_name; /* device name */ vm_snapshot_dev_cb snapshot_cb; /* callback for device snapshot */ vm_pause_dev_cb pause_cb; /* callback for device pause */ vm_resume_dev_cb resume_cb; /* callback for device resume */ }; struct vm_snapshot_kern_info { const char *struct_name; /* kernel structure name*/ enum snapshot_req req; /* request type */ }; void destroy_restore_state(struct restore_state *rstate); const char *lookup_vmname(struct restore_state *rstate); int lookup_memflags(struct restore_state *rstate); size_t lookup_memsize(struct restore_state *rstate); int lookup_guest_ncpus(struct restore_state *rstate); void checkpoint_cpu_add(int vcpu); void checkpoint_cpu_resume(int vcpu); void checkpoint_cpu_suspend(int vcpu); int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate); int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate); int vm_restore_devices(struct restore_state *rstate); int vm_pause_devices(void); int vm_resume_devices(void); int get_checkpoint_msg(int conn_fd, struct vmctx *ctx); void *checkpoint_thread(void *param); int init_checkpoint_thread(struct vmctx *ctx); void init_snapshot(void); int load_restore_file(const char *filename, struct restore_state *rstate); int vm_snapshot_guest2host_addr(struct vmctx *ctx, void **addrp, size_t len, bool restore_null, struct vm_snapshot_meta *meta); /* * Address variables are pointers to guest memory. * * When RNULL != 0, do not enforce invalid address checks; instead, make the * pointer NULL at restore time. */ #define SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(CTX, ADDR, LEN, RNULL, META, RES, LABEL) \ do { \ (RES) = vm_snapshot_guest2host_addr((CTX), (void **)&(ADDR), (LEN), \ (RNULL), (META)); \ if ((RES) != 0) { \ if ((RES) == EFAULT) \ - fprintf(stderr, "%s: invalid address: %s\r\n", \ - __func__, #ADDR); \ + EPRINTLN("%s: invalid address: %s", __func__, #ADDR); \ goto LABEL; \ } \ } while (0) #endif
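
The conversions in this change follow a single pattern: a raw fprintf(stderr, "...\n")
or perror() call becomes an EPRINTLN() call with the trailing "\n" or "\r\n" dropped,
because the logging macro supplies its own line termination, and perror() call sites
gain an explicit strerror(errno) so the errno text is preserved. The sketch below is
only an approximation of the EPRINTLN() provided by bhyve's debug.h (not its actual
definition) and is included to make the call-site transformation concrete:

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	/*
	 * Illustrative EPRINTLN-style wrapper (not the real debug.h macro): it
	 * appends the line terminator itself, so call sites pass bare format
	 * strings without "\n" or "\r\n".
	 */
	#define EPRINTLN(fmt, ...)	fprintf(stderr, fmt "\r\n", ##__VA_ARGS__)

	/*
	 * Before (as in strcat_extension() above):
	 *	fprintf(stderr, "Filename exceeds maximum length.\n");
	 *	perror("Failed to allocate memory.");
	 * After:
	 *	EPRINTLN("Filename exceeds maximum length.");
	 *	EPRINTLN("Failed to allocate memory: %s", strerror(errno));
	 */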