diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index aeafed1d0c69..caa284a8dd63 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -1,1604 +1,1604 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #ifdef BHYVE_SNAPSHOT #include #include #endif #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #endif #include #include #include #include #include #include #include #include #ifdef BHYVE_SNAPSHOT #include #include #include #endif #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "bootrom.h" #include "config.h" #include "inout.h" #include "debug.h" #include "fwctl.h" #include "gdb.h" #include "ioapic.h" #include "kernemu_dev.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" #ifdef BHYVE_SNAPSHOT #include "snapshot.h" #endif #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include "vmgenc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) #define GB (1024UL * MB) static const char * const vmx_exit_reason_desc[] = { [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", [EXIT_REASON_EXT_INTR] = "External interrupt", [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", [EXIT_REASON_INIT] = "INIT signal", [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", [EXIT_REASON_SMI] = "Other SMI", [EXIT_REASON_INTR_WINDOW] = "Interrupt window", [EXIT_REASON_NMI_WINDOW] = "NMI window", [EXIT_REASON_TASK_SWITCH] = "Task switch", [EXIT_REASON_CPUID] = "CPUID", [EXIT_REASON_GETSEC] = "GETSEC", [EXIT_REASON_HLT] = "HLT", [EXIT_REASON_INVD] = "INVD", [EXIT_REASON_INVLPG] = "INVLPG", [EXIT_REASON_RDPMC] = "RDPMC", [EXIT_REASON_RDTSC] = "RDTSC", [EXIT_REASON_RSM] = "RSM", [EXIT_REASON_VMCALL] = "VMCALL", [EXIT_REASON_VMCLEAR] = "VMCLEAR", [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", [EXIT_REASON_VMPTRLD] = "VMPTRLD", [EXIT_REASON_VMPTRST] = "VMPTRST", [EXIT_REASON_VMREAD] = "VMREAD", [EXIT_REASON_VMRESUME] = "VMRESUME", [EXIT_REASON_VMWRITE] = "VMWRITE", [EXIT_REASON_VMXOFF] = "VMXOFF", [EXIT_REASON_VMXON] = "VMXON", [EXIT_REASON_CR_ACCESS] = "Control-register accesses", [EXIT_REASON_DR_ACCESS] = "MOV DR", [EXIT_REASON_INOUT] = "I/O instruction", [EXIT_REASON_RDMSR] = "RDMSR", [EXIT_REASON_WRMSR] = "WRMSR", [EXIT_REASON_INVAL_VMCS] = "VM-entry failure due to invalid guest state", [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", [EXIT_REASON_MWAIT] = "MWAIT", [EXIT_REASON_MTF] = "Monitor trap flag", [EXIT_REASON_MONITOR] = "MONITOR", [EXIT_REASON_PAUSE] = "PAUSE", [EXIT_REASON_MCE_DURING_ENTRY] = "VM-entry failure due to machine-check event", [EXIT_REASON_TPR] = "TPR below threshold", [EXIT_REASON_APIC_ACCESS] = "APIC access", [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", [EXIT_REASON_EPT_FAULT] = "EPT violation", [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", [EXIT_REASON_INVEPT] = "INVEPT", [EXIT_REASON_RDTSCP] = "RDTSCP", [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", [EXIT_REASON_INVVPID] = "INVVPID", [EXIT_REASON_WBINVD] = "WBINVD", [EXIT_REASON_XSETBV] = "XSETBV", [EXIT_REASON_APIC_WRITE] = "APIC write", [EXIT_REASON_RDRAND] = "RDRAND", [EXIT_REASON_INVPCID] = "INVPCID", [EXIT_REASON_VMFUNC] = "VMFUNC", [EXIT_REASON_ENCLS] = "ENCLS", [EXIT_REASON_RDSEED] = "RDSEED", [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", [EXIT_REASON_XSAVES] = "XSAVES", [EXIT_REASON_XRSTORS] = "XRSTORS" }; typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); int guest_ncpus; uint16_t cpu_cores, cpu_sockets, cpu_threads; int raw_stdio = 0; static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, int vcpu); static struct bhyvestats { uint64_t vmexit_bogus; uint64_t vmexit_reqidle; uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; } stats; static struct mt_vmm_info { pthread_t mt_thr; struct vmctx *mt_ctx; int mt_vcpu; } *mt_vmm_info; static cpuset_t **vcpumap; static void usage(int code) { fprintf(stderr, "Usage: %s [-AaCDeHhPSuWwxY]\n" " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" " %*s [-G port] [-k config_file] [-l lpc] [-m mem] [-o var=value]\n" " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n" " -A: create ACPI tables\n" " -a: local apic is in xAPIC mode (deprecated)\n" " -C: include guest memory in core file\n" " -c: number of CPUs and/or topology specification\n" " -D: destroy on power-off\n" " -e: exit on unhandled I/O access\n" " -G: start a debug server\n" " -H: vmexit from the guest on HLT\n" " -h: help\n" " -k: key=value flat config file\n" " -K: PS2 keyboard layout\n" " -l: LPC device configuration\n" " -m: memory size\n" " -o: set config 'var' to 'value'\n" " -P: vmexit from the guest on pause\n" " -p: pin 'vcpu' to 'hostcpu'\n" #ifdef BHYVE_SNAPSHOT " -r: path to checkpoint file\n" #endif " -S: guest memory cannot be swapped\n" " -s: PCI slot config\n" " -U: UUID\n" " -u: RTC keeps UTC time\n" " -W: force virtio to use single-vector MSI\n" " -w: ignore unimplemented MSRs\n" " -x: local APIC is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), "", (int)strlen(progname), "", (int)strlen(progname), ""); exit(code); } /* * XXX This parser is known to have the following issues: * 1. It accepts null key=value tokens ",," as setting "cpus" to an * empty string. * * The acceptance of a null specification ('-c ""') is by design to match the * manual page syntax specification, this results in a topology of 1 vCPU. */ static int topology_parse(const char *opt) { char *cp, *str, *tofree; if (*opt == '\0') { set_config_value("sockets", "1"); set_config_value("cores", "1"); set_config_value("threads", "1"); set_config_value("cpus", "1"); return (0); } tofree = str = strdup(opt); if (str == NULL) errx(4, "Failed to allocate memory"); while ((cp = strsep(&str, ",")) != NULL) { if (strncmp(cp, "cpus=", strlen("cpus=")) == 0) set_config_value("cpus", cp + strlen("cpus=")); else if (strncmp(cp, "sockets=", strlen("sockets=")) == 0) set_config_value("sockets", cp + strlen("sockets=")); else if (strncmp(cp, "cores=", strlen("cores=")) == 0) set_config_value("cores", cp + strlen("cores=")); else if (strncmp(cp, "threads=", strlen("threads=")) == 0) set_config_value("threads", cp + strlen("threads=")); #ifdef notyet /* Do not expose this until vmm.ko implements it */ else if (strncmp(cp, "maxcpus=", strlen("maxcpus=")) == 0) set_config_value("maxcpus", cp + strlen("maxcpus=")); #endif else if (strchr(cp, '=') != NULL) goto out; else set_config_value("cpus", cp); } free(tofree); return (0); out: free(tofree); return (-1); } static int parse_int_value(const char *key, const char *value, int minval, int maxval) { char *cp; long lval; errno = 0; lval = strtol(value, &cp, 0); if (errno != 0 || *cp != '\0' || cp == value || lval < minval || lval > maxval) errx(4, "Invalid value for %s: '%s'", key, value); return (lval); } /* * Set the sockets, cores, threads, and guest_cpus variables based on * the configured topology. * * The limits of UINT16_MAX are due to the types passed to * vm_set_topology(). vmm.ko may enforce tighter limits. */ static void calc_topology(void) { const char *value; bool explicit_cpus; uint64_t ncpus; value = get_config_value("cpus"); if (value != NULL) { guest_ncpus = parse_int_value("cpus", value, 1, UINT16_MAX); explicit_cpus = true; } else { guest_ncpus = 1; explicit_cpus = false; } value = get_config_value("cores"); if (value != NULL) cpu_cores = parse_int_value("cores", value, 1, UINT16_MAX); else cpu_cores = 1; value = get_config_value("threads"); if (value != NULL) cpu_threads = parse_int_value("threads", value, 1, UINT16_MAX); else cpu_threads = 1; value = get_config_value("sockets"); if (value != NULL) cpu_sockets = parse_int_value("sockets", value, 1, UINT16_MAX); else cpu_sockets = guest_ncpus; /* * Compute sockets * cores * threads avoiding overflow. The * range check above insures these are 16 bit values. */ ncpus = (uint64_t)cpu_sockets * cpu_cores * cpu_threads; if (ncpus > UINT16_MAX) errx(4, "Computed number of vCPUs too high: %ju", (uintmax_t)ncpus); if (explicit_cpus) { if (guest_ncpus != (int)ncpus) errx(4, "Topology (%d sockets, %d cores, %d threads) " "does not match %d vCPUs", cpu_sockets, cpu_cores, cpu_threads, guest_ncpus); } else guest_ncpus = ncpus; } static int pincpu_parse(const char *opt) { const char *value; char *newval; char key[16]; int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0) { fprintf(stderr, "invalid vcpu '%d'\n", vcpu); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (asprintf(&newval, "%s%s%d", value != NULL ? value : "", value != NULL ? "," : "", pcpu) == -1) { perror("failed to build new cpuset string"); return (-1); } set_config_value(key, newval); free(newval); return (0); } static void parse_cpuset(int vcpu, const char *list, cpuset_t *set) { char *cp, *token; int pcpu, start; CPU_ZERO(set); start = -1; token = __DECONST(char *, list); for (;;) { pcpu = strtoul(token, &cp, 0); if (cp == token) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); if (pcpu < 0 || pcpu >= CPU_SETSIZE) errx(4, "hostcpu '%d' outside valid range from 0 to %d", pcpu, CPU_SETSIZE - 1); switch (*cp) { case ',': case '\0': if (start >= 0) { if (start > pcpu) errx(4, "Invalid hostcpu range %d-%d", start, pcpu); while (start < pcpu) { CPU_SET(start, set); start++; } start = -1; } CPU_SET(pcpu, set); break; case '-': if (start >= 0) errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); start = pcpu; break; default: errx(4, "invalid cpuset for vcpu %d: '%s'", vcpu, list); } if (*cp == '\0') break; token = cp + 1; } } static void build_vcpumaps(void) { char key[16]; const char *value; int vcpu; vcpumap = calloc(guest_ncpus, sizeof(*vcpumap)); for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { snprintf(key, sizeof(key), "vcpu.%d.cpuset", vcpu); value = get_config_value(key); if (value == NULL) continue; vcpumap[vcpu] = malloc(sizeof(cpuset_t)); if (vcpumap[vcpu] == NULL) err(4, "Failed to allocate cpuset for vcpu %d", vcpu); parse_cpuset(vcpu, value, vcpumap[vcpu]); } } void vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) { struct vmctx *ctx; int error, restart_instruction; ctx = arg; restart_instruction = 1; error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } #ifdef BHYVE_SNAPSHOT uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr) { return (vm_rev_map_gpa(ctx, addr)); } #endif int fbsdrun_virtio_msix(void) { return (get_config_bool_default("virtio_msix", true)); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct mt_vmm_info *mtp; int error, vcpu; mtp = param; vcpu = mtp->mt_vcpu; snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); if (vcpumap[vcpu] != NULL) { error = pthread_setaffinity_np(mtp->mt_thr, sizeof(cpuset_t), vcpumap[vcpu]); assert(error == 0); } #ifdef BHYVE_SNAPSHOT checkpoint_cpu_add(vcpu); #endif gdb_cpu_add(vcpu); vm_loop(mtp->mt_ctx, vcpu); /* not reached */ exit(1); return (NULL); } static void fbsdrun_addcpu(struct vmctx *ctx, int newcpu, bool suspend) { int error; error = vm_activate_cpu(ctx, newcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", newcpu); CPU_SET_ATOMIC(newcpu, &cpumask); if (suspend) vm_suspend_cpu(ctx, newcpu); mt_vmm_info[newcpu].mt_ctx = ctx; mt_vmm_info[newcpu].mt_vcpu = newcpu; error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, fbsdrun_start_thread, &mt_vmm_info[newcpu]); assert(error == 0); } static int fbsdrun_deletecpu(int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(4); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int vmexit_handle_notify(struct vmctx *ctx __unused, struct vm_exit *vme __unused, int *pvcpu __unused, uint32_t eax __unused) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; int bytes, port, in, out; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; out = !in; /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) { error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); return (error); } error = emulate_inout(ctx, vcpu, vme); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port, vme->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { uint64_t val; uint32_t eax, edx; int error; val = 0; error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, *pvcpu); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, *pvcpu); if (get_config_bool("x86.strictmsr")) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static const char * vmexit_vmx_desc(uint32_t exit_reason) { if (exit_reason >= nitems(vmx_exit_reason_desc) || vmx_exit_reason_desc[exit_reason] == NULL) return ("Unknown"); return (vmx_exit_reason_desc[exit_reason]); } static int vmexit_vmx(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status); fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason, vmexit_vmx_desc(vme->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vme->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(ctx, *pvcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip); fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vme->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu __unused) { assert(vme->inst_length == 0); stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu __unused) { assert(vme->inst_length == 0); stats.vmexit_reqidle++; return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx __unused, struct vm_exit *vme __unused, int *pvcpu __unused) { stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. */ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx __unused, struct vm_exit *vme __unused, int *pvcpu __unused) { stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu) { assert(vme->inst_length == 0); stats.vmexit_mtrap++; #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(*pvcpu); #endif gdb_cpu_mtrap(*pvcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(*pvcpu); #endif return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int err, i, cs_d; struct vie *vie; enum vm_cpu_mode mode; stats.vmexit_inst_emul++; vie = &vme->u.inst_emul.vie; if (!vie->decoded) { /* * Attempt to decode in userspace as a fallback. This allows * updating instruction decode in bhyve without rebooting the * kernel (rapid prototyping), albeit with much slower * emulation. */ vie_restart(vie); mode = vme->u.inst_emul.paging.cpu_mode; cs_d = vme->u.inst_emul.cs_d; if (vmm_decode_instruction(mode, cs_d, vie) != 0) goto fail; if (vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RIP, vme->rip + vie->num_processed) != 0) goto fail; } err = emulate_mem(ctx, *pvcpu, vme->u.inst_emul.gpa, vie, &vme->u.inst_emul.paging); if (err) { if (err == ESRCH) { EPRINTLN("Unhandled memory access to 0x%lx\n", vme->u.inst_emul.gpa); } goto fail; } return (VMEXIT_CONTINUE); fail: fprintf(stderr, "Failed to emulate instruction sequence [ "); for (i = 0; i < vie->num_valid; i++) fprintf(stderr, "%02x", vie->inst[i]); FPRINTLN(stderr, " ] at 0x%lx", vme->rip); return (VMEXIT_ABORT); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int vmexit_suspend(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { enum vm_suspend_how how; how = vme->u.suspended.how; fbsdrun_deletecpu(*pvcpu); if (*pvcpu != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: if (get_config_bool_default("destroy_on_poweroff", false)) vm_destroy(ctx); exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static int vmexit_debug(struct vmctx *ctx __unused, struct vm_exit *vme __unused, int *pvcpu) { #ifdef BHYVE_SNAPSHOT checkpoint_cpu_suspend(*pvcpu); #endif gdb_cpu_suspend(*pvcpu); #ifdef BHYVE_SNAPSHOT checkpoint_cpu_resume(*pvcpu); #endif return (VMEXIT_CONTINUE); } static int vmexit_breakpoint(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu) { gdb_cpu_breakpoint(*pvcpu, vme); return (VMEXIT_CONTINUE); } static int vmexit_ipi(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu __unused) { int error = -1; int i; switch (vme->u.ipi.mode) { case APIC_DELMODE_INIT: CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask) { error = vm_suspend_cpu(ctx, i); if (error) { warnx("%s: failed to suspend cpu %d\n", __func__, i); break; } } break; case APIC_DELMODE_STARTUP: CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask) { spinup_ap(ctx, i, vme->u.ipi.vector << PAGE_SHIFT); } error = 0; break; default: break; } return (error); } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, [VM_EXITCODE_IPI] = vmexit_ipi, }; static void vm_loop(struct vmctx *ctx, int vcpu) { struct vm_exit vme; int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus; error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu, &active_cpus)); while (1) { error = vm_run(ctx, vcpu, &vme); if (error != 0) break; exitcode = vme.exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(4); } rc = (*handler[exitcode])(ctx, &vme, &vcpu); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx) { uint16_t sockets, cores, threads, maxcpus; int tmp, error; /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); if (error != 0) return (1); error = vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); if (error == 0) return (maxcpus); else return (1); } static void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) { int err, tmp; if (get_config_bool_default("x86.vmexit_on_hlt", false)) { err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (get_config_bool_default("x86.vmexit_on_pause", false)) { /* * pause exit support required for this mode */ err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } if (get_config_bool_default("x86.x2apic", false)) err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(4); } vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); err = vm_set_capability(ctx, cpu, VM_CAP_IPI_EXIT, 1); assert(err == 0); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(4); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. */ fprintf(stderr, "virtual machine cannot be booted\n"); exit(4); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(4); } #ifndef WITHOUT_CAPSICUM if (vm_limit_rights(ctx) != 0) err(EX_OSERR, "vm_limit_rights"); #endif if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(4); } } error = vm_set_topology(ctx, cpu_sockets, cpu_cores, cpu_threads, 0 /* maxcpus, unimplemented */); if (error) errx(EX_OSERR, "vm_set_topology"); return (ctx); } static void spinup_vcpu(struct vmctx *ctx, int vcpu, bool suspend) { int error; if (vcpu != BSP) { fbsdrun_set_capabilities(ctx, vcpu); /* * Enable the 'unrestricted guest' mode for APs. * * APs startup in power-on 16-bit mode. */ error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); } fbsdrun_addcpu(ctx, vcpu, suspend); } static bool parse_config_option(const char *option) { const char *value; char *path; value = strchr(option, '='); if (value == NULL || value[1] == '\0') return (false); path = strndup(option, value - option); if (path == NULL) err(4, "Failed to allocate memory"); set_config_value(path, value + 1); return (true); } static void parse_simple_config_file(const char *path) { FILE *fp; char *line, *cp; size_t linecap; unsigned int lineno; fp = fopen(path, "r"); if (fp == NULL) err(4, "Failed to open configuration file %s", path); line = NULL; linecap = 0; lineno = 1; for (lineno = 1; getline(&line, &linecap, fp) > 0; lineno++) { if (*line == '#' || *line == '\n') continue; cp = strchr(line, '\n'); if (cp != NULL) *cp = '\0'; if (!parse_config_option(line)) errx(4, "%s line %u: invalid config option '%s'", path, lineno, line); } free(line); fclose(fp); } static void parse_gdb_options(const char *opt) { const char *sport; char *colon; if (opt[0] == 'w') { set_config_bool("gdb.wait", true); opt++; } colon = strrchr(opt, ':'); if (colon == NULL) { sport = opt; } else { *colon = '\0'; colon++; sport = colon; set_config_value("gdb.address", opt); } set_config_value("gdb.port", sport); } static void set_defaults(void) { set_config_bool("acpi_tables", false); set_config_value("memory.size", "256M"); set_config_bool("x86.strictmsr", true); } int main(int argc, char *argv[]) { int c, error, err; int max_vcpus, memflags; struct vmctx *ctx; uint64_t rip; size_t memsize; const char *optstr, *value, *vmname; #ifdef BHYVE_SNAPSHOT char *restore_file; struct restore_state rstate; restore_file = NULL; #endif init_config(); set_defaults(); progname = basename(argv[0]); #ifdef BHYVE_SNAPSHOT optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:K:U:r:"; #else optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:K:U:"; #endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': set_config_bool("x86.x2apic", false); break; case 'A': set_config_bool("acpi_tables", true); break; case 'D': set_config_bool("destroy_on_poweroff", true); break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': if (topology_parse(optarg) != 0) { errx(EX_USAGE, "invalid cpu topology " "'%s'", optarg); } break; case 'C': set_config_bool("memory.guest_in_core", true); break; case 'G': parse_gdb_options(optarg); break; case 'k': parse_simple_config_file(optarg); break; case 'K': set_config_value("keyboard.layout", optarg); break; case 'l': if (strncmp(optarg, "help", strlen(optarg)) == 0) { lpc_print_supported_devices(); exit(0); } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; #ifdef BHYVE_SNAPSHOT case 'r': restore_file = optarg; break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); else break; case 'S': set_config_bool("memory.wired", true); break; case 'm': set_config_value("memory.size", optarg); break; case 'o': if (!parse_config_option(optarg)) errx(EX_USAGE, "invalid configuration option '%s'", optarg); break; case 'H': set_config_bool("x86.vmexit_on_hlt", true); break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. */ break; case 'P': set_config_bool("x86.vmexit_on_pause", true); break; case 'e': set_config_bool("x86.strictio", true); break; case 'u': set_config_bool("rtc.use_localtime", false); break; case 'U': set_config_value("uuid", optarg); break; case 'w': set_config_bool("x86.strictmsr", false); break; case 'W': set_config_bool("virtio_msix", false); break; case 'x': set_config_bool("x86.x2apic", true); break; case 'Y': set_config_bool("x86.mptable", false); break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc > 1) usage(1); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { error = load_restore_file(restore_file, &rstate); if (error) { fprintf(stderr, "Failed to read checkpoint info from " "file: '%s'.\n", restore_file); exit(1); } vmname = lookup_vmname(&rstate); if (vmname != NULL) set_config_value("name", vmname); } #endif if (argc == 1) set_config_value("name", argv[0]); vmname = get_config_value("name"); if (vmname == NULL) usage(1); if (get_config_bool_default("config.dump", false)) { dump_config(); exit(1); } calc_topology(); build_vcpumaps(); value = get_config_value("memory.size"); error = vm_parse_memsize(value, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", value); ctx = do_open(vmname); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { guest_ncpus = lookup_guest_ncpus(&rstate); memflags = lookup_memflags(&rstate); memsize = lookup_memsize(&rstate); } if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } #endif max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(4); } fbsdrun_set_capabilities(ctx, BSP); memflags = 0; if (get_config_bool_default("memory.wired", false)) memflags |= VM_MEM_F_WIRED; if (get_config_bool_default("memory.guest_in_core", false)) memflags |= VM_MEM_F_INCORE; vm_set_memflags(ctx, memflags); err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (err) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(4); } init_mem(guest_ncpus); init_inout(); kernemu_dev_init(); init_bootrom(ctx); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx); sci_init(ctx); /* * Exit if a device emulation finds an error in its initilization */ if (init_pci(ctx) != 0) { perror("device emulation initialization error"); exit(4); } /* * Initialize after PCI, to allow a bootrom file to reserve the high * region. */ if (get_config_bool("acpi_tables")) vmgenc_init(ctx); init_gdb(ctx); if (lpc_bootrom()) { if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(4); } error = vcpu_reset(ctx, BSP); assert(error == 0); } #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) { fprintf(stdout, "Pausing pci devs...\r\n"); - if (vm_pause_user_devs(ctx) != 0) { + if (vm_pause_user_devs() != 0) { fprintf(stderr, "Failed to pause PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring vm mem...\r\n"); if (restore_vm_mem(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore VM memory.\n"); exit(1); } fprintf(stdout, "Restoring pci devs...\r\n"); if (vm_restore_user_devs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore PCI device state.\n"); exit(1); } fprintf(stdout, "Restoring kernel structs...\r\n"); if (vm_restore_kern_structs(ctx, &rstate) != 0) { fprintf(stderr, "Failed to restore kernel structs.\n"); exit(1); } fprintf(stdout, "Resuming pci devs...\r\n"); - if (vm_resume_user_devs(ctx) != 0) { + if (vm_resume_user_devs() != 0) { fprintf(stderr, "Failed to resume PCI device state.\n"); exit(1); } } #endif error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. */ if (get_config_bool_default("x86.mptable", true)) { error = mptable_build(ctx, guest_ncpus); if (error) { perror("error to build the guest tables"); exit(4); } } error = smbios_build(ctx); if (error != 0) exit(4); if (get_config_bool("acpi_tables")) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } if (lpc_bootrom()) fwctl_init(); /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); #ifndef WITHOUT_CAPSICUM caph_cache_catpages(); if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); if (caph_enter() == -1) errx(EX_OSERR, "cap_enter() failed"); #endif #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) destroy_restore_state(&rstate); /* initialize mutex/cond variables */ init_snapshot(); /* * checkpointing thread for communication with bhyvectl */ if (init_checkpoint_thread(ctx) < 0) printf("Failed to start checkpoint thread!\r\n"); if (restore_file != NULL) vm_restore_time(ctx); #endif /* Allocate per-VCPU resources. */ mt_vmm_info = calloc(guest_ncpus, sizeof(*mt_vmm_info)); /* * Add all vCPUs. */ for (int vcpu = 0; vcpu < guest_ncpus; vcpu++) { bool suspend = (vcpu != BSP); #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) suspend = false; #endif spinup_vcpu(ctx, vcpu, suspend); } /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(4); } diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c index 700170885a13..9c023d93cab9 100644 --- a/usr.sbin/bhyve/pci_ahci.c +++ b/usr.sbin/bhyve/pci_ahci.c @@ -1,2740 +1,2738 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Zhixiang Yu * Copyright (c) 2015-2016 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "ahci.h" #include "block_if.h" #define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ #define MAX_PORTS 32 /* AHCI supports 32 ports */ #define PxSIG_ATA 0x00000101 /* ATA drive */ #define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ enum sata_fis_type { FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */ FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */ FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */ FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */ FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */ FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */ FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */ FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */ }; /* * SCSI opcodes */ #define TEST_UNIT_READY 0x00 #define REQUEST_SENSE 0x03 #define INQUIRY 0x12 #define START_STOP_UNIT 0x1B #define PREVENT_ALLOW 0x1E #define READ_CAPACITY 0x25 #define READ_10 0x28 #define POSITION_TO_ELEMENT 0x2B #define READ_TOC 0x43 #define GET_EVENT_STATUS_NOTIFICATION 0x4A #define MODE_SENSE_10 0x5A #define REPORT_LUNS 0xA0 #define READ_12 0xA8 #define READ_CD 0xBE /* * SCSI mode page codes */ #define MODEPAGE_RW_ERROR_RECOVERY 0x01 #define MODEPAGE_CD_CAPABILITIES 0x2A /* * ATA commands */ #define ATA_SF_ENAB_SATA_SF 0x10 #define ATA_SATA_SF_AN 0x05 #define ATA_SF_DIS_SATA_SF 0x90 /* * Debug printf */ #ifdef AHCI_DEBUG static FILE *dbg; #define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0) #else #define DPRINTF(format, arg...) #endif #define WPRINTF(format, arg...) printf(format, ##arg) #define AHCI_PORT_IDENT 20 + 1 struct ahci_ioreq { struct blockif_req io_req; struct ahci_port *io_pr; STAILQ_ENTRY(ahci_ioreq) io_flist; TAILQ_ENTRY(ahci_ioreq) io_blist; uint8_t *cfis; uint32_t len; uint32_t done; int slot; int more; int readop; }; struct ahci_port { struct blockif_ctxt *bctx; struct pci_ahci_softc *pr_sc; struct ata_params ata_ident; uint8_t *cmd_lst; uint8_t *rfis; int port; int atapi; int reset; int waitforclear; int mult_sectors; uint8_t xfermode; uint8_t err_cfis[20]; uint8_t sense_key; uint8_t asc; u_int ccs; uint32_t pending; uint32_t clb; uint32_t clbu; uint32_t fb; uint32_t fbu; uint32_t is; uint32_t ie; uint32_t cmd; uint32_t unused0; uint32_t tfd; uint32_t sig; uint32_t ssts; uint32_t sctl; uint32_t serr; uint32_t sact; uint32_t ci; uint32_t sntf; uint32_t fbs; /* * i/o request info */ struct ahci_ioreq *ioreq; int ioqsz; STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd; TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd; }; struct ahci_cmd_hdr { uint16_t flags; uint16_t prdtl; uint32_t prdbc; uint64_t ctba; uint32_t reserved[4]; }; struct ahci_prdt_entry { uint64_t dba; uint32_t reserved; #define DBCMASK 0x3fffff uint32_t dbc; }; struct pci_ahci_softc { struct pci_devinst *asc_pi; pthread_mutex_t mtx; int ports; uint32_t cap; uint32_t ghc; uint32_t is; uint32_t pi; uint32_t vs; uint32_t ccc_ctl; uint32_t ccc_pts; uint32_t em_loc; uint32_t em_ctl; uint32_t cap2; uint32_t bohc; uint32_t lintr; struct ahci_port port[MAX_PORTS]; }; #define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) static void ahci_handle_port(struct ahci_port *p); static inline void lba_to_msf(uint8_t *buf, int lba) { lba += 150; buf[0] = (lba / 75) / 60; buf[1] = (lba / 75) % 60; buf[2] = lba % 75; } /* * Generate HBA interrupts on global IS register write. */ static void ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask) { struct pci_devinst *pi = sc->asc_pi; struct ahci_port *p; int i, nmsg; uint32_t mmask; /* Update global IS from PxIS/PxIE. */ for (i = 0; i < sc->ports; i++) { p = &sc->port[i]; if (p->is & p->ie) sc->is |= (1 << i); } DPRINTF("%s(%08x) %08x", __func__, mask, sc->is); /* If there is nothing enabled -- clear legacy interrupt and exit. */ if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) { if (sc->lintr) { pci_lintr_deassert(pi); sc->lintr = 0; } return; } /* If there is anything and no MSI -- assert legacy interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (nmsg == 0) { if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } return; } /* Assert respective MSIs for ports that were touched. */ for (i = 0; i < nmsg; i++) { if (sc->ports <= nmsg || i < nmsg - 1) mmask = 1 << i; else mmask = 0xffffffff << i; if (sc->is & mask && mmask & mask) pci_generate_msi(pi, i); } } /* * Generate HBA interrupt on specific port event. */ static void ahci_port_intr(struct ahci_port *p) { struct pci_ahci_softc *sc = p->pr_sc; struct pci_devinst *pi = sc->asc_pi; int nmsg; DPRINTF("%s(%d) %08x/%08x %08x", __func__, p->port, p->is, p->ie, sc->is); /* If there is nothing enabled -- we are done. */ if ((p->is & p->ie) == 0) return; /* In case of non-shared MSI always generate interrupt. */ nmsg = pci_msi_maxmsgnum(pi); if (sc->ports <= nmsg || p->port < nmsg - 1) { sc->is |= (1 << p->port); if ((sc->ghc & AHCI_GHC_IE) == 0) return; pci_generate_msi(pi, p->port); return; } /* If IS for this port is already set -- do nothing. */ if (sc->is & (1 << p->port)) return; sc->is |= (1 << p->port); /* If interrupts are enabled -- generate one. */ if ((sc->ghc & AHCI_GHC_IE) == 0) return; if (nmsg > 0) { pci_generate_msi(pi, nmsg - 1); } else if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } } static void ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) { int offset, len, irq; if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE)) return; switch (ft) { case FIS_TYPE_REGD2H: offset = 0x40; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; break; case FIS_TYPE_SETDEVBITS: offset = 0x58; len = 8; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; break; case FIS_TYPE_PIOSETUP: offset = 0x20; len = 20; irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_PS : 0; break; default: WPRINTF("unsupported fis type %d", ft); return; } if (fis[2] & ATA_S_ERROR) { p->waitforclear = 1; irq |= AHCI_P_IX_TFE; } memcpy(p->rfis + offset, fis, len); if (irq) { if (~p->is & irq) { p->is |= irq; ahci_port_intr(p); } } } static void ahci_write_fis_piosetup(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_PIOSETUP; ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis); } static void ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[8]; uint8_t error; error = (tfd >> 8) & 0xff; tfd &= 0x77; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_SETDEVBITS; fis[1] = (1 << 6); fis[2] = tfd; fis[3] = error; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = slot; p->err_cfis[2] = tfd; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else { *(uint32_t *)(fis + 4) = (1 << slot); p->sact &= ~(1 << slot); } p->tfd &= ~0x77; p->tfd |= tfd; ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); } static void ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[20]; uint8_t error; error = (tfd >> 8) & 0xff; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = (1 << 6); fis[2] = tfd & 0xff; fis[3] = error; fis[4] = cfis[4]; fis[5] = cfis[5]; fis[6] = cfis[6]; fis[7] = cfis[7]; fis[8] = cfis[8]; fis[9] = cfis[9]; fis[10] = cfis[10]; fis[11] = cfis[11]; fis[12] = cfis[12]; fis[13] = cfis[13]; if (fis[2] & ATA_S_ERROR) { p->err_cfis[0] = 0x80; p->err_cfis[2] = tfd & 0xff; p->err_cfis[3] = error; memcpy(&p->err_cfis[4], cfis + 4, 16); } else p->ci &= ~(1 << slot); p->tfd = tfd; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) { uint8_t fis[20]; p->tfd = ATA_S_READY | ATA_S_DSC; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[1] = 0; /* No interrupt */ fis[2] = p->tfd; /* Status */ fis[3] = 0; /* No error */ p->ci &= ~(1 << slot); ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_write_reset_fis_d2h(struct ahci_port *p) { uint8_t fis[20]; memset(fis, 0, sizeof(fis)); fis[0] = FIS_TYPE_REGD2H; fis[3] = 1; fis[4] = 1; if (p->atapi) { fis[5] = 0x14; fis[6] = 0xeb; } fis[12] = 1; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void ahci_check_stopped(struct ahci_port *p) { /* * If we are no longer processing the command list and nothing * is in-flight, clear the running bit, the current command * slot, the command issue and active bits. */ if (!(p->cmd & AHCI_P_CMD_ST)) { if (p->pending == 0) { p->ccs = 0; p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); p->ci = 0; p->sact = 0; p->waitforclear = 0; } } } static void ahci_port_stop(struct ahci_port *p) { struct ahci_ioreq *aior; uint8_t *cfis; int slot; int error; assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); TAILQ_FOREACH(aior, &p->iobhd, io_blist) { /* * Try to cancel the outstanding blockif request. */ error = blockif_cancel(p->bctx, &aior->io_req); if (error != 0) continue; slot = aior->slot; cfis = aior->cfis; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) p->sact &= ~(1 << slot); /* NCQ */ else p->ci &= ~(1 << slot); /* * This command is now done. */ p->pending &= ~(1 << slot); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); } ahci_check_stopped(p); } static void ahci_port_reset(struct ahci_port *pr) { pr->serr = 0; pr->sact = 0; pr->xfermode = ATA_UDMA6; pr->mult_sectors = 128; if (!pr->bctx) { pr->ssts = ATA_SS_DET_NO_DEVICE; pr->sig = 0xFFFFFFFF; pr->tfd = 0x7F; return; } pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; if (pr->sctl & ATA_SC_SPD_MASK) pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); else pr->ssts |= ATA_SS_SPD_GEN3; pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; if (!pr->atapi) { pr->sig = PxSIG_ATA; pr->tfd |= ATA_S_READY; } else pr->sig = PxSIG_ATAPI; ahci_write_reset_fis_d2h(pr); } static void ahci_reset(struct pci_ahci_softc *sc) { int i; sc->ghc = AHCI_GHC_AE; sc->is = 0; if (sc->lintr) { pci_lintr_deassert(sc->asc_pi); sc->lintr = 0; } for (i = 0; i < sc->ports; i++) { sc->port[i].ie = 0; sc->port[i].is = 0; sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); if (sc->port[i].bctx) sc->port[i].cmd |= AHCI_P_CMD_CPS; sc->port[i].sctl = 0; ahci_port_reset(&sc->port[i]); } } static void ata_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i ^ 1] = *src++; else dest[i ^ 1] = ' '; } } static void atapi_string(uint8_t *dest, const char *src, int len) { int i; for (i = 0; i < len; i++) { if (*src) dest[i] = *src++; else dest[i] = ' '; } } /* * Build up the iovec based on the PRDT, 'done' and 'len'. */ static void ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, struct ahci_prdt_entry *prdt, uint16_t prdtl) { struct blockif_req *breq = &aior->io_req; uint32_t dbcsz, extra, left, skip, todo; int i, j; assert(aior->len >= aior->done); /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ skip = aior->done; left = aior->len - aior->done; todo = 0; for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; i++, prdt++) { dbcsz = (prdt->dbc & DBCMASK) + 1; /* Skip already done part of the PRDT */ if (dbcsz <= skip) { skip -= dbcsz; continue; } dbcsz -= skip; if (dbcsz > left) dbcsz = left; breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba + skip, dbcsz); breq->br_iov[j].iov_len = dbcsz; todo += dbcsz; left -= dbcsz; skip = 0; j++; } /* If we got limited by IOV length, round I/O down to sector size. */ if (j == BLOCKIF_IOV_MAX) { extra = todo % blockif_sectsz(p->bctx); todo -= extra; assert(todo > 0); while (extra > 0) { if (breq->br_iov[j - 1].iov_len > extra) { breq->br_iov[j - 1].iov_len -= extra; break; } extra -= breq->br_iov[j - 1].iov_len; j--; } } breq->br_iovcnt = j; breq->br_resid = todo; aior->done += todo; aior->more = (aior->done < aior->len && i < prdtl); } static void ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; struct ahci_prdt_entry *prdt; struct ahci_cmd_hdr *hdr; uint64_t lba; uint32_t len; int err, first, ncq, readop; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); ncq = 0; readop = 1; first = (done == 0); if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || cfis[2] == ATA_WRITE_FPDMA_QUEUED) readop = 0; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[11] << 8 | cfis[3]; if (!len) len = 65536; ncq = 1; } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | ((uint64_t)cfis[6] << 16) | ((uint64_t)cfis[5] << 8) | cfis[4]; len = cfis[13] << 8 | cfis[12]; if (!len) len = 65536; } else { lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) | (cfis[5] << 8) | cfis[4]; len = cfis[12]; if (!len) len = 256; } lba *= blockif_sectsz(p->bctx); len *= blockif_sectsz(p->bctx); /* Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->readop = readop; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); if (readop) err = blockif_read(p->bctx, breq); else err = blockif_write(p->bctx, breq); assert(err == 0); } static void ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_ioreq *aior; struct blockif_req *breq; int err; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = 0; aior->done = 0; aior->more = 0; breq = &aior->io_req; /* * Mark this command in-flight. */ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_flush(p->bctx, breq); assert(err == 0); } static inline void read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, unsigned int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; uint8_t *to; unsigned int len; int i; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; to = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; unsigned int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(to, ptr, sublen); len -= sublen; to += sublen; prdt++; } } static void ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; uint8_t *entry; uint64_t elba; uint32_t len, elen; int err, first, ncq; uint8_t buf[512]; first = (done == 0); if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { len = (uint16_t)cfis[13] << 8 | cfis[12]; len *= 512; ncq = 0; } else { /* ATA_SEND_FPDMA_QUEUED */ len = (uint16_t)cfis[11] << 8 | cfis[3]; len *= 512; ncq = 1; } read_prdt(p, slot, cfis, buf, sizeof(buf)); next: entry = &buf[done]; elba = ((uint64_t)entry[5] << 40) | ((uint64_t)entry[4] << 32) | ((uint64_t)entry[3] << 24) | ((uint64_t)entry[2] << 16) | ((uint64_t)entry[1] << 8) | entry[0]; elen = (uint16_t)entry[7] << 8 | entry[6]; done += 8; if (elen == 0) { if (done >= len) { if (ncq) { if (first) ahci_write_fis_d2h_ncq(p, slot); ahci_write_fis_sdb(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } else { ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } p->pending &= ~(1 << slot); ahci_check_stopped(p); if (!first) ahci_handle_port(p); return; } goto next; } /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->more = (len != done); breq = &aior->io_req; breq->br_offset = elba * blockif_sectsz(p->bctx); breq->br_resid = elen * blockif_sectsz(p->bctx); /* * Mark this command in-flight. */ p->pending |= 1 << slot; /* * Stuff request onto busy list */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); if (ncq && first) ahci_write_fis_d2h_ncq(p, slot); err = blockif_delete(p->bctx, breq); assert(err == 0); } static inline void write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, unsigned int size) { struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; uint8_t *from; unsigned int len; int i; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); len = size; from = buf; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); for (i = 0; i < hdr->prdtl && len; i++) { uint8_t *ptr; uint32_t dbcsz; int sublen; dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); sublen = MIN(len, dbcsz); memcpy(ptr, from, sublen); len -= sublen; from += sublen; prdt++; } hdr->prdbc = size - len; } static void ahci_checksum(uint8_t *buf, int size) { int i; uint8_t sum = 0; for (i = 0; i < size - 1; i++) sum += buf[i]; buf[size - 1] = 0x100 - sum; } static void ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; uint32_t buf[128]; uint8_t *buf8 = (uint8_t *)buf; uint16_t *buf16 = (uint16_t *)buf; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 || cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } memset(buf, 0, sizeof(buf)); if (cfis[4] == 0x00) { /* Log directory */ buf16[0x00] = 1; /* Version -- 1 */ buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */ buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */ } else if (cfis[4] == 0x10) { /* NCQ Command Error Log */ memcpy(buf8, p->err_cfis, sizeof(p->err_cfis)); ahci_checksum(buf8, sizeof(buf)); } else if (cfis[4] == 0x13) { /* SATA NCQ Send and Receive Log */ if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) { buf[0x00] = 1; /* SFQ DSM supported */ buf[0x01] = 1; /* SFQ DSM TRIM supported */ } } else { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); return; } if (cfis[2] == ATA_READ_LOG_EXT) ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } static void handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void*)&p->ata_ident, sizeof(struct ata_params)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void ata_identify_init(struct ahci_port* p, int atapi) { struct ata_params* ata_ident = &p->ata_ident; if (atapi) { ata_ident->config = ATA_PROTO_ATAPI | ATA_ATAPI_TYPE_CDROM | ATA_ATAPI_REMOVABLE | ATA_DRQ_FAST; ata_ident->capabilities1 = ATA_SUPPORT_LBA | ATA_SUPPORT_DMA; ata_ident->capabilities2 = (1 << 14 | 1); ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88; ata_ident->obsolete62 = 0x3f; ata_ident->mwdmamodes = 7; if (p->xfermode & ATA_WDMA0) ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->apiomodes = 3; ata_ident->mwdmamin = 0x0078; ata_ident->mwdmarec = 0x0078; ata_ident->pioblind = 0x0078; ata_ident->pioiordy = 0x0078; ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); ata_ident->satacapabilities2 = ((p->ssts & ATA_SS_SPD_MASK) >> 3); ata_ident->satasupport = ATA_SUPPORT_NCQ_STREAM; ata_ident->version_major = 0x3f0; ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); ata_ident->support.command2 = (1 << 14); ata_ident->support.extension = (1 << 14); ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); ata_ident->enabled.extension = (1 << 14); ata_ident->udmamodes = 0x7f; if (p->xfermode & ATA_UDMA0) ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->transport_major = 0x1020; ata_ident->integrity = 0x00a5; } else { uint64_t sectors; int sectsz, psectsz, psectoff, candelete, ro; uint16_t cyl; uint8_t sech, heads; ro = blockif_is_ro(p->bctx); candelete = blockif_candelete(p->bctx); sectsz = blockif_sectsz(p->bctx); sectors = blockif_size(p->bctx) / sectsz; blockif_chs(p->bctx, &cyl, &heads, &sech); blockif_psectsz(p->bctx, &psectsz, &psectoff); ata_ident->config = ATA_DRQ_FAST; ata_ident->cylinders = cyl; ata_ident->heads = heads; ata_ident->sectors = sech; ata_ident->sectors_intr = (0x8000 | 128); ata_ident->tcg = 0; ata_ident->capabilities1 = ATA_SUPPORT_DMA | ATA_SUPPORT_LBA | ATA_SUPPORT_IORDY; ata_ident->capabilities2 = (1 << 14); ata_ident->atavalid = ATA_FLAG_64_70 | ATA_FLAG_88; if (p->mult_sectors) ata_ident->multi = (ATA_MULTI_VALID | p->mult_sectors); if (sectors <= 0x0fffffff) { ata_ident->lba_size_1 = sectors; ata_ident->lba_size_2 = (sectors >> 16); } else { ata_ident->lba_size_1 = 0xffff; ata_ident->lba_size_2 = 0x0fff; } ata_ident->mwdmamodes = 0x7; if (p->xfermode & ATA_WDMA0) ata_ident->mwdmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->apiomodes = 0x3; ata_ident->mwdmamin = 0x0078; ata_ident->mwdmarec = 0x0078; ata_ident->pioblind = 0x0078; ata_ident->pioiordy = 0x0078; ata_ident->support3 = 0; ata_ident->queue = 31; ata_ident->satacapabilities = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | ATA_SUPPORT_NCQ); ata_ident->satacapabilities2 = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | (p->ssts & ATA_SS_SPD_MASK) >> 3); ata_ident->version_major = 0x3f0; ata_ident->version_minor = 0x28; ata_ident->support.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); ata_ident->support.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); ata_ident->support.extension = (1 << 14); ata_ident->enabled.command1 = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE | ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); ata_ident->enabled.command2 = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); ata_ident->enabled.extension = (1 << 14); ata_ident->udmamodes = 0x7f; if (p->xfermode & ATA_UDMA0) ata_ident->udmamodes |= (1 << ((p->xfermode & 7) + 8)); ata_ident->lba_size48_1 = sectors; ata_ident->lba_size48_2 = (sectors >> 16); ata_ident->lba_size48_3 = (sectors >> 32); ata_ident->lba_size48_4 = (sectors >> 48); if (candelete && !ro) { ata_ident->support3 |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; ata_ident->max_dsm_blocks = 1; ata_ident->support_dsm = ATA_SUPPORT_DSM_TRIM; } ata_ident->pss = ATA_PSS_VALID_VALUE; ata_ident->lsalign = 0x4000; if (psectsz > sectsz) { ata_ident->pss |= ATA_PSS_MULTLS; ata_ident->pss |= ffsl(psectsz / sectsz) - 1; ata_ident->lsalign |= (psectoff / sectsz); } if (sectsz > 512) { ata_ident->pss |= ATA_PSS_LSSABOVE512; ata_ident->lss_1 = sectsz / 2; ata_ident->lss_2 = ((sectsz / 2) >> 16); } ata_ident->support2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); ata_ident->enabled2 = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); ata_ident->transport_major = 0x1020; ata_ident->integrity = 0x00a5; } ahci_checksum((uint8_t*)ata_ident, sizeof(struct ata_params)); } static void handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) { if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)&p->ata_ident, sizeof(struct ata_params)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } } static void atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[36]; uint8_t *acmd; unsigned int len; uint32_t tfd; acmd = cfis + 0x40; if (acmd[1] & 1) { /* VPD */ if (acmd[2] == 0) { /* Supported VPD pages */ buf[0] = 0x05; buf[1] = 0; buf[2] = 0; buf[3] = 1; buf[4] = 0; len = 4 + buf[3]; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } } else { buf[0] = 0x05; buf[1] = 0x80; buf[2] = 0x00; buf[3] = 0x21; buf[4] = 31; buf[5] = 0; buf[6] = 0; buf[7] = 0; atapi_string(buf + 8, "BHYVE", 8); atapi_string(buf + 16, "BHYVE DVD-ROM", 16); atapi_string(buf + 32, "001", 4); len = sizeof(buf); } if (len > acmd[4]) len = acmd[4]; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, len); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[8]; uint64_t sectors; sectors = blockif_size(p->bctx) / 2048; be32enc(buf, sectors - 1); be32enc(buf + 4, 2048); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint8_t format; unsigned int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); format = acmd[9] >> 6; switch (format) { case 0: { size_t size; int msf; uint64_t sectors; uint8_t start_track, buf[20], *bp; msf = (acmd[1] >> 1) & 1; start_track = acmd[6]; if (start_track > 1 && start_track != 0xaa) { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); return; } bp = buf + 2; *bp++ = 1; *bp++ = 1; if (start_track <= 1) { *bp++ = 0; *bp++ = 0x14; *bp++ = 1; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } } *bp++ = 0; *bp++ = 0x14; *bp++ = 0xaa; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 1: { uint8_t buf[12]; memset(buf, 0, sizeof(buf)); buf[1] = 0xa; buf[2] = 0x1; buf[3] = 0x1; if (len > sizeof(buf)) len = sizeof(buf); write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } case 2: { size_t size; int msf; uint64_t sectors; uint8_t *bp, buf[50]; msf = (acmd[1] >> 1) & 1; bp = buf + 2; *bp++ = 1; *bp++ = 1; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa1; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 0xa2; *bp++ = 0; *bp++ = 0; *bp++ = 0; sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); sectors >>= 2; if (msf) { *bp++ = 0; lba_to_msf(bp, sectors); bp += 3; } else { be32enc(bp, sectors); bp += 4; } *bp++ = 1; *bp++ = 0x14; *bp++ = 0; *bp++ = 1; *bp++ = 0; *bp++ = 0; *bp++ = 0; if (msf) { *bp++ = 0; lba_to_msf(bp, 0); bp += 3; } else { *bp++ = 0; *bp++ = 0; *bp++ = 0; *bp++ = 0; } size = bp - buf; be16enc(buf, size - 2); if (len > size) len = size; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; } default: { uint32_t tfd; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); break; } } } static void atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[16]; memset(buf, 0, sizeof(buf)); buf[3] = 8; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; write_prdt(p, slot, cfis, buf, sizeof(buf)); ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; struct blockif_req *breq; uint8_t *acmd; uint64_t lba; uint32_t len; int err; acmd = cfis + 0x40; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); prdt = (struct ahci_prdt_entry *)(cfis + 0x80); lba = be32dec(acmd + 2); if (acmd[0] == READ_10) len = be16dec(acmd + 7); else len = be32dec(acmd + 6); if (len == 0) { cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } lba *= 2048; len *= 2048; /* * Pull request off free list */ aior = STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; aior->readop = 1; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); /* Mark this command in-flight. */ p->pending |= 1 << slot; /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); err = blockif_read(p->bctx, breq); assert(err == 0); } static void atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t buf[64]; uint8_t *acmd; unsigned int len; acmd = cfis + 0x40; len = acmd[4]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, len); buf[0] = 0x70 | (1 << 7); buf[2] = p->sense_key; buf[7] = 10; buf[12] = p->asc; write_prdt(p, slot, cfis, buf, len); cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); } static void atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd = cfis + 0x40; uint32_t tfd; switch (acmd[4] & 3) { case 0: case 1: case 3: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; tfd = ATA_S_READY | ATA_S_DSC; break; case 2: /* TODO eject media */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x53; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; uint8_t pc, code; unsigned int len; acmd = cfis + 0x40; len = be16dec(acmd + 7); pc = acmd[2] >> 6; code = acmd[2] & 0x3f; switch (pc) { case 0: switch (code) { case MODEPAGE_RW_ERROR_RECOVERY: { uint8_t buf[16]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 16 - 2); buf[2] = 0x70; buf[8] = 0x01; buf[9] = 16 - 10; buf[11] = 0x05; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } case MODEPAGE_CD_CAPABILITIES: { uint8_t buf[30]; if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 30 - 2); buf[2] = 0x70; buf[8] = 0x2A; buf[9] = 30 - 10; buf[10] = 0x08; buf[12] = 0x71; be16enc(&buf[18], 2); be16enc(&buf[20], 512); write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; break; } default: goto error; break; } break; case 3: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x39; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; error: case 1: case 2: p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; break; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void atapi_get_event_status_notification(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; uint32_t tfd; acmd = cfis + 0x40; /* we don't support asynchronous operation */ if (!(acmd[1] & 1)) { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x24; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } else { uint8_t buf[8]; unsigned int len; len = be16dec(acmd + 7); if (len > sizeof(buf)) len = sizeof(buf); memset(buf, 0, sizeof(buf)); be16enc(buf, 8 - 2); buf[2] = 0x04; buf[3] = 0x10; buf[5] = 0x02; write_prdt(p, slot, cfis, buf, len); tfd = ATA_S_READY | ATA_S_DSC; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); } static void handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; acmd = cfis + 0x40; #ifdef AHCI_DEBUG { int i; DPRINTF("ACMD:"); for (i = 0; i < 16; i++) DPRINTF("%02x ", acmd[i]); DPRINTF(""); } #endif switch (acmd[0]) { case TEST_UNIT_READY: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case INQUIRY: atapi_inquiry(p, slot, cfis); break; case READ_CAPACITY: atapi_read_capacity(p, slot, cfis); break; case PREVENT_ALLOW: /* TODO */ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case READ_TOC: atapi_read_toc(p, slot, cfis); break; case REPORT_LUNS: atapi_report_luns(p, slot, cfis); break; case READ_10: case READ_12: atapi_read(p, slot, cfis, 0); break; case REQUEST_SENSE: atapi_request_sense(p, slot, cfis); break; case START_STOP_UNIT: atapi_start_stop_unit(p, slot, cfis); break; case MODE_SENSE_10: atapi_mode_sense(p, slot, cfis); break; case GET_EVENT_STATUS_NOTIFICATION: atapi_get_event_status_notification(p, slot, cfis); break; default: cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x20; ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { p->tfd |= ATA_S_BUSY; switch (cfis[2]) { case ATA_ATA_IDENTIFY: handle_identify(p, slot, cfis); break; case ATA_SETFEATURES: { switch (cfis[3]) { case ATA_SF_ENAB_SATA_SF: switch (cfis[12]) { case ATA_SATA_SF_AN: p->tfd = ATA_S_DSC | ATA_S_READY; break; default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } break; case ATA_SF_ENAB_WCACHE: case ATA_SF_DIS_WCACHE: case ATA_SF_ENAB_RCACHE: case ATA_SF_DIS_RCACHE: p->tfd = ATA_S_DSC | ATA_S_READY; break; case ATA_SF_SETXFER: { switch (cfis[12] & 0xf8) { case ATA_PIO: case ATA_PIO0: break; case ATA_WDMA0: case ATA_UDMA0: p->xfermode = (cfis[12] & 0x7); break; } p->tfd = ATA_S_DSC | ATA_S_READY; break; } default: p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); break; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; } case ATA_SET_MULTI: if (cfis[12] != 0 && (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) { p->tfd = ATA_S_ERROR | ATA_S_READY; p->tfd |= (ATA_ERROR_ABORT << 8); } else { p->mult_sectors = cfis[12]; p->tfd = ATA_S_DSC | ATA_S_READY; } ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; case ATA_READ: case ATA_WRITE: case ATA_READ48: case ATA_WRITE48: case ATA_READ_MUL: case ATA_WRITE_MUL: case ATA_READ_MUL48: case ATA_WRITE_MUL48: case ATA_READ_DMA: case ATA_WRITE_DMA: case ATA_READ_DMA48: case ATA_WRITE_DMA48: case ATA_READ_FPDMA_QUEUED: case ATA_WRITE_FPDMA_QUEUED: ahci_handle_rw(p, slot, cfis, 0); break; case ATA_FLUSHCACHE: case ATA_FLUSHCACHE48: ahci_handle_flush(p, slot, cfis); break; case ATA_DATA_SET_MANAGEMENT: if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && cfis[13] == 0 && cfis[12] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_SEND_FPDMA_QUEUED: if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && cfis[11] == 0 && cfis[3] == 1) { ahci_handle_dsm_trim(p, slot, cfis, 0); break; } ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_READ_LOG_EXT: case ATA_READ_LOG_DMA_EXT: ahci_handle_read_log(p, slot, cfis); break; case ATA_SECURITY_FREEZE_LOCK: case ATA_SMART_CMD: case ATA_NOP: ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; case ATA_CHECK_POWER_MODE: cfis[12] = 0xff; /* always on */ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_STANDBY_CMD: case ATA_STANDBY_IMMEDIATE: case ATA_IDLE_CMD: case ATA_IDLE_IMMEDIATE: case ATA_SLEEP: case ATA_READ_VERIFY: case ATA_READ_VERIFY48: ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_ATAPI_IDENTIFY: handle_atapi_identify(p, slot, cfis); break; case ATA_PACKET_CMD: if (!p->atapi) { ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else handle_packet_cmd(p, slot, cfis); break; default: WPRINTF("Unsupported cmd:%02x", cfis[2]); ahci_write_fis_d2h(p, slot, cfis, (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; } } static void ahci_handle_slot(struct ahci_port *p, int slot) { struct ahci_cmd_hdr *hdr; #ifdef AHCI_DEBUG struct ahci_prdt_entry *prdt; #endif struct pci_ahci_softc *sc; uint8_t *cfis; #ifdef AHCI_DEBUG int cfl, i; #endif sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); #ifdef AHCI_DEBUG cfl = (hdr->flags & 0x1f) * 4; #endif cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); #ifdef AHCI_DEBUG prdt = (struct ahci_prdt_entry *)(cfis + 0x80); DPRINTF("cfis:"); for (i = 0; i < cfl; i++) { if (i % 10 == 0) DPRINTF(""); DPRINTF("%02x ", cfis[i]); } DPRINTF(""); for (i = 0; i < hdr->prdtl; i++) { DPRINTF("%d@%08"PRIx64"", prdt->dbc & 0x3fffff, prdt->dba); prdt++; } #endif if (cfis[0] != FIS_TYPE_REGH2D) { WPRINTF("Not a H2D FIS:%02x", cfis[0]); return; } if (cfis[1] & 0x80) { ahci_handle_cmd(p, slot, cfis); } else { if (cfis[15] & (1 << 2)) p->reset = 1; else if (p->reset) { p->reset = 0; ahci_port_reset(p); } p->ci &= ~(1 << slot); } } static void ahci_handle_port(struct ahci_port *p) { if (!(p->cmd & AHCI_P_CMD_ST)) return; /* * Search for any new commands to issue ignoring those that * are already in-flight. Stop if device is busy or in error. */ for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) break; if (p->waitforclear) break; if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { p->cmd &= ~AHCI_P_CMD_CCS_MASK; p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; ahci_handle_slot(p, p->ccs); } } } /* * blockif callback routine - this runs in the context of the blockif * i/o thread, so the mutex needs to be acquired. */ static void ata_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint32_t tfd; uint8_t *cfis; int slot, ncq, dsm; DPRINTF("%s %d", __func__, err); ncq = dsm = 0; aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || cfis[2] == ATA_READ_FPDMA_QUEUED || cfis[2] == ATA_SEND_FPDMA_QUEUED) ncq = 1; if (cfis[2] == ATA_DATA_SET_MANAGEMENT || (cfis[2] == ATA_SEND_FPDMA_QUEUED && (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) dsm = 1; pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { if (dsm) ahci_handle_dsm_trim(p, slot, cfis, aior->done); else ahci_handle_rw(p, slot, cfis, aior->done); goto out; } if (!err) tfd = ATA_S_READY | ATA_S_DSC; else tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; if (ncq) ahci_write_fis_sdb(p, slot, cfis, tfd); else ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. */ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void atapi_ioreq_cb(struct blockif_req *br, int err) { struct ahci_cmd_hdr *hdr; struct ahci_ioreq *aior; struct ahci_port *p; struct pci_ahci_softc *sc; uint8_t *cfis; uint32_t tfd; int slot; DPRINTF("%s %d", __func__, err); aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); pthread_mutex_lock(&sc->mtx); /* * Delete the blockif request from the busy list */ TAILQ_REMOVE(&p->iobhd, aior, io_blist); /* * Move the blockif request back to the free list */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); if (!err) hdr->prdbc = aior->done; if (!err && aior->more) { atapi_read(p, slot, cfis, aior->done); goto out; } if (!err) { tfd = ATA_S_READY | ATA_S_DSC; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x21; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; } cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); /* * This command is now complete. */ p->pending &= ~(1 << slot); ahci_check_stopped(p); ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit", __func__); } static void pci_ahci_ioreq_init(struct ahci_port *pr) { struct ahci_ioreq *vr; int i; pr->ioqsz = blockif_queuesz(pr->bctx); pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq)); STAILQ_INIT(&pr->iofhd); /* * Add all i/o request entries to the free queue */ for (i = 0; i < pr->ioqsz; i++) { vr = &pr->ioreq[i]; vr->io_pr = pr; if (!pr->atapi) vr->io_req.br_callback = ata_ioreq_cb; else vr->io_req.br_callback = atapi_ioreq_cb; vr->io_req.br_param = vr; STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist); } TAILQ_INIT(&pr->iobhd); } static void pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; struct ahci_port *p = &sc->port[port]; DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"", port, offset, value); switch (offset) { case AHCI_P_CLB: p->clb = value; break; case AHCI_P_CLBU: p->clbu = value; break; case AHCI_P_FB: p->fb = value; break; case AHCI_P_FBU: p->fbu = value; break; case AHCI_P_IS: p->is &= ~value; ahci_port_intr(p); break; case AHCI_P_IE: p->ie = value & 0xFDC000FF; ahci_port_intr(p); break; case AHCI_P_CMD: { p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; if (!(value & AHCI_P_CMD_ST)) { ahci_port_stop(p); } else { uint64_t clb; p->cmd |= AHCI_P_CMD_CR; clb = (uint64_t)p->clbu << 32 | p->clb; p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb, AHCI_CL_SIZE * AHCI_MAX_SLOTS); } if (value & AHCI_P_CMD_FRE) { uint64_t fb; p->cmd |= AHCI_P_CMD_FR; fb = (uint64_t)p->fbu << 32 | p->fb; /* we don't support FBSCP, so rfis size is 256Bytes */ p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256); } else { p->cmd &= ~AHCI_P_CMD_FR; } if (value & AHCI_P_CMD_CLO) { p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ); p->cmd &= ~AHCI_P_CMD_CLO; } if (value & AHCI_P_CMD_ICC_MASK) { p->cmd &= ~AHCI_P_CMD_ICC_MASK; } ahci_handle_port(p); break; } case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"", offset); break; case AHCI_P_SCTL: p->sctl = value; if (!(p->cmd & AHCI_P_CMD_ST)) { if (value & ATA_SC_DET_RESET) ahci_port_reset(p); } break; case AHCI_P_SERR: p->serr &= ~value; break; case AHCI_P_SACT: p->sact |= value; break; case AHCI_P_CI: p->ci |= value; ahci_handle_port(p); break; case AHCI_P_SNTF: case AHCI_P_FBS: default: break; } } static void pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) { DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"", offset, value); switch (offset) { case AHCI_CAP: case AHCI_PI: case AHCI_VS: case AHCI_CAP2: DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"", offset); break; case AHCI_GHC: if (value & AHCI_GHC_HR) { ahci_reset(sc); break; } if (value & AHCI_GHC_IE) sc->ghc |= AHCI_GHC_IE; else sc->ghc &= ~AHCI_GHC_IE; ahci_generate_intr(sc, 0xffffffff); break; case AHCI_IS: sc->is &= ~value; ahci_generate_intr(sc, value); break; default: break; } } static void -pci_ahci_write(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size, +pci_ahci_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_ahci_softc *sc = pi->pi_arg; assert(baridx == 5); assert((offset % 4) == 0 && size == 4); pthread_mutex_lock(&sc->mtx); if (offset < AHCI_OFFSET) pci_ahci_host_write(sc, offset, value); else if (offset < (uint64_t)AHCI_OFFSET + sc->ports * AHCI_STEP) pci_ahci_port_write(sc, offset, value); else WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"", offset); pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; switch (offset) { case AHCI_CAP: case AHCI_GHC: case AHCI_IS: case AHCI_PI: case AHCI_VS: case AHCI_CCCC: case AHCI_CCCP: case AHCI_EM_LOC: case AHCI_EM_CTL: case AHCI_CAP2: { uint32_t *p = &sc->cap; p += (offset - AHCI_CAP) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x", offset, value); return (value); } static uint64_t pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) { uint32_t value; int port = (offset - AHCI_OFFSET) / AHCI_STEP; offset = (offset - AHCI_OFFSET) % AHCI_STEP; switch (offset) { case AHCI_P_CLB: case AHCI_P_CLBU: case AHCI_P_FB: case AHCI_P_FBU: case AHCI_P_IS: case AHCI_P_IE: case AHCI_P_CMD: case AHCI_P_TFD: case AHCI_P_SIG: case AHCI_P_SSTS: case AHCI_P_SCTL: case AHCI_P_SERR: case AHCI_P_SACT: case AHCI_P_CI: case AHCI_P_SNTF: case AHCI_P_FBS: { uint32_t *p= &sc->port[port].clb; p += (offset - AHCI_P_CLB) / sizeof(uint32_t); value = *p; break; } default: value = 0; break; } DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x", port, offset, value); return value; } static uint64_t -pci_ahci_read(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t regoff, int size) +pci_ahci_read(struct pci_devinst *pi, int baridx, uint64_t regoff, int size) { struct pci_ahci_softc *sc = pi->pi_arg; uint64_t offset; uint32_t value; assert(baridx == 5); assert(size == 1 || size == 2 || size == 4); assert((regoff & (size - 1)) == 0); pthread_mutex_lock(&sc->mtx); offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */ if (offset < AHCI_OFFSET) value = pci_ahci_host_read(sc, offset); else if (offset < (uint64_t)AHCI_OFFSET + sc->ports * AHCI_STEP) value = pci_ahci_port_read(sc, offset); else { value = 0; WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"", regoff); } value >>= 8 * (regoff & 0x3); pthread_mutex_unlock(&sc->mtx); return (value); } /* * Each AHCI controller has a "port" node which contains nodes for * each port named after the decimal number of the port (no leading * zeroes). Port nodes contain a "type" ("hd" or "cd"), as well as * options for blockif. For example: * * pci.0.1.0 * .device="ahci" * .port * .0 * .type="hd" * .path="/path/to/image" */ static int pci_ahci_legacy_config_port(nvlist_t *nvl, int port, const char *type, const char *opts) { char node_name[sizeof("XX")]; nvlist_t *port_nvl; snprintf(node_name, sizeof(node_name), "%d", port); port_nvl = create_relative_config_node(nvl, node_name); set_config_value_node(port_nvl, "type", type); return (blockif_legacy_config(port_nvl, opts)); } static int pci_ahci_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; const char *type; char *next, *next2, *str, *tofree; int p, ret; if (opts == NULL) return (0); ports_nvl = create_relative_config_node(nvl, "port"); ret = 1; tofree = str = strdup(opts); for (p = 0; p < MAX_PORTS && str != NULL; p++, str = next) { /* Identify and cut off type of present port. */ if (strncmp(str, "hd:", 3) == 0) { type = "hd"; str += 3; } else if (strncmp(str, "cd:", 3) == 0) { type = "cd"; str += 3; } else type = NULL; /* Find and cut off the next port options. */ next = strstr(str, ",hd:"); next2 = strstr(str, ",cd:"); if (next == NULL || (next2 != NULL && next2 < next)) next = next2; if (next != NULL) { next[0] = 0; next++; } if (str[0] == 0) continue; if (type == NULL) { EPRINTLN("Missing or invalid type for port %d: \"%s\"", p, str); goto out; } if (pci_ahci_legacy_config_port(ports_nvl, p, type, str) != 0) goto out; } ret = 0; out: free(tofree); return (ret); } static int pci_ahci_cd_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; ports_nvl = create_relative_config_node(nvl, "port"); return (pci_ahci_legacy_config_port(ports_nvl, 0, "cd", opts)); } static int pci_ahci_hd_legacy_config(nvlist_t *nvl, const char *opts) { nvlist_t *ports_nvl; ports_nvl = create_relative_config_node(nvl, "port"); return (pci_ahci_legacy_config_port(ports_nvl, 0, "hd", opts)); } static int -pci_ahci_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) +pci_ahci_init(struct pci_devinst *pi, nvlist_t *nvl) { char bident[sizeof("XXX:XXX:XXX")]; char node_name[sizeof("XX")]; struct blockif_ctxt *bctxt; struct pci_ahci_softc *sc; int atapi, ret, slots, p; MD5_CTX mdctx; u_char digest[16]; const char *path, *type, *value; nvlist_t *ports_nvl, *port_nvl; ret = 0; #ifdef AHCI_DEBUG dbg = fopen("/tmp/log", "w+"); #endif sc = calloc(1, sizeof(struct pci_ahci_softc)); pi->pi_arg = sc; sc->asc_pi = pi; pthread_mutex_init(&sc->mtx, NULL); sc->ports = 0; sc->pi = 0; slots = 32; ports_nvl = find_relative_config_node(nvl, "port"); for (p = 0; ports_nvl != NULL && p < MAX_PORTS; p++) { struct ata_params *ata_ident = &sc->port[p].ata_ident; char ident[AHCI_PORT_IDENT]; snprintf(node_name, sizeof(node_name), "%d", p); port_nvl = find_relative_config_node(ports_nvl, node_name); if (port_nvl == NULL) continue; type = get_config_value_node(port_nvl, "type"); if (type == NULL) continue; if (strcmp(type, "hd") == 0) atapi = 0; else atapi = 1; /* * Attempt to open the backing image. Use the PCI slot/func * and the port number for the identifier string. */ snprintf(bident, sizeof(bident), "%u:%u:%u", pi->pi_slot, pi->pi_func, p); bctxt = blockif_open(port_nvl, bident); if (bctxt == NULL) { sc->ports = p; ret = 1; goto open_fail; } sc->port[p].bctx = bctxt; sc->port[p].pr_sc = sc; sc->port[p].port = p; sc->port[p].atapi = atapi; /* * Create an identifier for the backing file. * Use parts of the md5 sum of the filename */ path = get_config_value_node(port_nvl, "path"); MD5Init(&mdctx); MD5Update(&mdctx, path, strlen(path)); MD5Final(digest, &mdctx); snprintf(ident, AHCI_PORT_IDENT, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); memset(ata_ident, 0, sizeof(struct ata_params)); ata_string((uint8_t*)&ata_ident->serial, ident, 20); ata_string((uint8_t*)&ata_ident->revision, "001", 8); if (atapi) ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DVD ROM", 40); else ata_string((uint8_t*)&ata_ident->model, "BHYVE SATA DISK", 40); value = get_config_value_node(port_nvl, "nmrr"); if (value != NULL) ata_ident->media_rotation_rate = atoi(value); value = get_config_value_node(port_nvl, "ser"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->serial), value, 20); value = get_config_value_node(port_nvl, "rev"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->revision), value, 8); value = get_config_value_node(port_nvl, "model"); if (value != NULL) ata_string((uint8_t*)(&ata_ident->model), value, 40); ata_identify_init(&sc->port[p], atapi); /* * Allocate blockif request structures and add them * to the free list */ pci_ahci_ioreq_init(&sc->port[p]); sc->pi |= (1 << p); if (sc->port[p].ioqsz < slots) slots = sc->port[p].ioqsz; } sc->ports = p; /* Intel ICH8 AHCI */ --slots; if (sc->ports < DEF_PORTS) sc->ports = DEF_PORTS; sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); sc->vs = 0x10300; sc->cap2 = AHCI_CAP2_APST; ahci_reset(sc); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); p = MIN(sc->ports, 16); p = flsl(p) - ((p & (p - 1)) ? 0 : 1); pci_emul_add_msicap(pi, 1 << p); pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, AHCI_OFFSET + sc->ports * AHCI_STEP); pci_lintr_request(pi); open_fail: if (ret) { for (p = 0; p < sc->ports; p++) { if (sc->port[p].bctx != NULL) blockif_close(sc->port[p].bctx); } free(sc); } return (ret); } #ifdef BHYVE_SNAPSHOT static int pci_ahci_snapshot(struct vm_snapshot_meta *meta) { int i, ret; void *bctx; struct pci_devinst *pi; struct pci_ahci_softc *sc; struct ahci_port *port; pi = meta->dev_data; sc = pi->pi_arg; /* TODO: add mtx lock/unlock */ SNAPSHOT_VAR_OR_LEAVE(sc->ports, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->cap, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ghc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->is, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->pi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->vs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ccc_ctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->ccc_pts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->em_loc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->em_ctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->cap2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->bohc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->lintr, meta, ret, done); for (i = 0; i < MAX_PORTS; i++) { port = &sc->port[i]; if (meta->op == VM_SNAPSHOT_SAVE) bctx = port->bctx; SNAPSHOT_VAR_OR_LEAVE(bctx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->port, meta, ret, done); /* Mostly for restore; save is ensured by the lines above. */ if (((bctx == NULL) && (port->bctx != NULL)) || ((bctx != NULL) && (port->bctx == NULL))) { fprintf(stderr, "%s: ports not matching\r\n", __func__); ret = EINVAL; goto done; } if (port->bctx == NULL) continue; if (port->port != i) { fprintf(stderr, "%s: ports not matching: " "actual: %d expected: %d\r\n", __func__, port->port, i); ret = EINVAL; goto done; } SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->cmd_lst, AHCI_CL_SIZE * AHCI_MAX_SLOTS, false, meta, ret, done); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->rfis, 256, false, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ata_ident, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->atapi, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->reset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->waitforclear, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->mult_sectors, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->xfermode, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->err_cfis, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sense_key, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->asc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ccs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->pending, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->clb, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->clbu, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fb, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fbu, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ie, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->cmd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->unused0, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->tfd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sig, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ssts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sctl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->serr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sact, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ci, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->sntf, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done); assert(TAILQ_EMPTY(&port->iobhd)); } done: return (ret); } static int -pci_ahci_pause(struct vmctx *ctx __unused, struct pci_devinst *pi) +pci_ahci_pause(struct pci_devinst *pi) { struct pci_ahci_softc *sc; struct blockif_ctxt *bctxt; int i; sc = pi->pi_arg; for (i = 0; i < MAX_PORTS; i++) { bctxt = sc->port[i].bctx; if (bctxt == NULL) continue; blockif_pause(bctxt); } return (0); } static int -pci_ahci_resume(struct vmctx *ctx __unused, struct pci_devinst *pi) +pci_ahci_resume(struct pci_devinst *pi) { struct pci_ahci_softc *sc; struct blockif_ctxt *bctxt; int i; sc = pi->pi_arg; for (i = 0; i < MAX_PORTS; i++) { bctxt = sc->port[i].bctx; if (bctxt == NULL) continue; blockif_resume(bctxt); } return (0); } #endif /* BHYVE_SNAPSHOT */ /* * Use separate emulation names to distinguish drive and atapi devices */ static const struct pci_devemu pci_de_ahci = { .pe_emu = "ahci", .pe_init = pci_ahci_init, .pe_legacy_config = pci_ahci_legacy_config, .pe_barwrite = pci_ahci_write, .pe_barread = pci_ahci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_ahci_snapshot, .pe_pause = pci_ahci_pause, .pe_resume = pci_ahci_resume, #endif }; PCI_EMUL_SET(pci_de_ahci); static const struct pci_devemu pci_de_ahci_hd = { .pe_emu = "ahci-hd", .pe_legacy_config = pci_ahci_hd_legacy_config, .pe_alias = "ahci", }; PCI_EMUL_SET(pci_de_ahci_hd); static const struct pci_devemu pci_de_ahci_cd = { .pe_emu = "ahci-cd", .pe_legacy_config = pci_ahci_cd_legacy_config, .pe_alias = "ahci", }; PCI_EMUL_SET(pci_de_ahci_cd); diff --git a/usr.sbin/bhyve/pci_e82545.c b/usr.sbin/bhyve/pci_e82545.c index 49c5f986a341..bec8edb17caf 100644 --- a/usr.sbin/bhyve/pci_e82545.c +++ b/usr.sbin/bhyve/pci_e82545.c @@ -1,2541 +1,2539 @@ /* * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Alexander Motin * Copyright (c) 2015 Peter Grehan * Copyright (c) 2013 Jeremiah Lott, Avere Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "e1000_regs.h" #include "e1000_defines.h" #include "mii.h" #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "mevent.h" #include "net_utils.h" #include "net_backends.h" /* Hardware/register definitions XXX: move some to common code. */ #define E82545_VENDOR_ID_INTEL 0x8086 #define E82545_DEV_ID_82545EM_COPPER 0x100F #define E82545_SUBDEV_ID 0x1008 #define E82545_REVISION_4 4 #define E82545_MDIC_DATA_MASK 0x0000FFFF #define E82545_MDIC_OP_MASK 0x0c000000 #define E82545_MDIC_IE 0x20000000 #define E82545_EECD_FWE_DIS 0x00000010 /* Flash writes disabled */ #define E82545_EECD_FWE_EN 0x00000020 /* Flash writes enabled */ #define E82545_EECD_FWE_MASK 0x00000030 /* Flash writes mask */ #define E82545_BAR_REGISTER 0 #define E82545_BAR_REGISTER_LEN (128*1024) #define E82545_BAR_FLASH 1 #define E82545_BAR_FLASH_LEN (64*1024) #define E82545_BAR_IO 2 #define E82545_BAR_IO_LEN 8 #define E82545_IOADDR 0x00000000 #define E82545_IODATA 0x00000004 #define E82545_IO_REGISTER_MAX 0x0001FFFF #define E82545_IO_FLASH_BASE 0x00080000 #define E82545_IO_FLASH_MAX 0x000FFFFF #define E82545_ARRAY_ENTRY(reg, offset) (reg + (offset<<2)) #define E82545_RAR_MAX 15 #define E82545_MTA_MAX 127 #define E82545_VFTA_MAX 127 /* Slightly modified from the driver versions, hardcoded for 3 opcode bits, * followed by 6 address bits. * TODO: make opcode bits and addr bits configurable? * NVM Commands - Microwire */ #define E82545_NVM_OPCODE_BITS 3 #define E82545_NVM_ADDR_BITS 6 #define E82545_NVM_DATA_BITS 16 #define E82545_NVM_OPADDR_BITS (E82545_NVM_OPCODE_BITS + E82545_NVM_ADDR_BITS) #define E82545_NVM_ADDR_MASK ((1 << E82545_NVM_ADDR_BITS)-1) #define E82545_NVM_OPCODE_MASK \ (((1 << E82545_NVM_OPCODE_BITS) - 1) << E82545_NVM_ADDR_BITS) #define E82545_NVM_OPCODE_READ (0x6 << E82545_NVM_ADDR_BITS) /* read */ #define E82545_NVM_OPCODE_WRITE (0x5 << E82545_NVM_ADDR_BITS) /* write */ #define E82545_NVM_OPCODE_ERASE (0x7 << E82545_NVM_ADDR_BITS) /* erase */ #define E82545_NVM_OPCODE_EWEN (0x4 << E82545_NVM_ADDR_BITS) /* wr-enable */ #define E82545_NVM_EEPROM_SIZE 64 /* 64 * 16-bit values == 128K */ #define E1000_ICR_SRPD 0x00010000 /* This is an arbitrary number. There is no hard limit on the chip. */ #define I82545_MAX_TXSEGS 64 /* Legacy receive descriptor */ struct e1000_rx_desc { uint64_t buffer_addr; /* Address of the descriptor's data buffer */ uint16_t length; /* Length of data DMAed into data buffer */ uint16_t csum; /* Packet checksum */ uint8_t status; /* Descriptor status */ uint8_t errors; /* Descriptor Errors */ uint16_t special; }; /* Transmit descriptor types */ #define E1000_TXD_MASK (E1000_TXD_CMD_DEXT | 0x00F00000) #define E1000_TXD_TYP_L (0) #define E1000_TXD_TYP_C (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C) #define E1000_TXD_TYP_D (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D) /* Legacy transmit descriptor */ struct e1000_tx_desc { uint64_t buffer_addr; /* Address of the descriptor's data buffer */ union { uint32_t data; struct { uint16_t length; /* Data buffer length */ uint8_t cso; /* Checksum offset */ uint8_t cmd; /* Descriptor control */ } flags; } lower; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t css; /* Checksum start */ uint16_t special; } fields; } upper; }; /* Context descriptor */ struct e1000_context_desc { union { uint32_t ip_config; struct { uint8_t ipcss; /* IP checksum start */ uint8_t ipcso; /* IP checksum offset */ uint16_t ipcse; /* IP checksum end */ } ip_fields; } lower_setup; union { uint32_t tcp_config; struct { uint8_t tucss; /* TCP checksum start */ uint8_t tucso; /* TCP checksum offset */ uint16_t tucse; /* TCP checksum end */ } tcp_fields; } upper_setup; uint32_t cmd_and_length; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t hdr_len; /* Header length */ uint16_t mss; /* Maximum segment size */ } fields; } tcp_seg_setup; }; /* Data descriptor */ struct e1000_data_desc { uint64_t buffer_addr; /* Address of the descriptor's buffer address */ union { uint32_t data; struct { uint16_t length; /* Data buffer length */ uint8_t typ_len_ext; uint8_t cmd; } flags; } lower; union { uint32_t data; struct { uint8_t status; /* Descriptor status */ uint8_t popts; /* Packet Options */ uint16_t special; } fields; } upper; }; union e1000_tx_udesc { struct e1000_tx_desc td; struct e1000_context_desc cd; struct e1000_data_desc dd; }; /* Tx checksum info for a packet. */ struct ck_info { int ck_valid; /* ck_info is valid */ uint8_t ck_start; /* start byte of cksum calcuation */ uint8_t ck_off; /* offset of cksum insertion */ uint16_t ck_len; /* length of cksum calc: 0 is to packet-end */ }; /* * Debug printf */ static int e82545_debug = 0; #define WPRINTF(msg,params...) PRINTLN("e82545: " msg, ##params) #define DPRINTF(msg,params...) if (e82545_debug) WPRINTF(msg, params) #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) /* s/w representation of the RAL/RAH regs */ struct eth_uni { int eu_valid; int eu_addrsel; struct ether_addr eu_eth; }; struct e82545_softc { struct pci_devinst *esc_pi; struct vmctx *esc_ctx; struct mevent *esc_mevpitr; pthread_mutex_t esc_mtx; struct ether_addr esc_mac; net_backend_t *esc_be; /* General */ uint32_t esc_CTRL; /* x0000 device ctl */ uint32_t esc_FCAL; /* x0028 flow ctl addr lo */ uint32_t esc_FCAH; /* x002C flow ctl addr hi */ uint32_t esc_FCT; /* x0030 flow ctl type */ uint32_t esc_VET; /* x0038 VLAN eth type */ uint32_t esc_FCTTV; /* x0170 flow ctl tx timer */ uint32_t esc_LEDCTL; /* x0E00 LED control */ uint32_t esc_PBA; /* x1000 pkt buffer allocation */ /* Interrupt control */ int esc_irq_asserted; uint32_t esc_ICR; /* x00C0 cause read/clear */ uint32_t esc_ITR; /* x00C4 intr throttling */ uint32_t esc_ICS; /* x00C8 cause set */ uint32_t esc_IMS; /* x00D0 mask set/read */ uint32_t esc_IMC; /* x00D8 mask clear */ /* Transmit */ union e1000_tx_udesc *esc_txdesc; struct e1000_context_desc esc_txctx; pthread_t esc_tx_tid; pthread_cond_t esc_tx_cond; int esc_tx_enabled; int esc_tx_active; uint32_t esc_TXCW; /* x0178 transmit config */ uint32_t esc_TCTL; /* x0400 transmit ctl */ uint32_t esc_TIPG; /* x0410 inter-packet gap */ uint16_t esc_AIT; /* x0458 Adaptive Interframe Throttle */ uint64_t esc_tdba; /* verified 64-bit desc table addr */ uint32_t esc_TDBAL; /* x3800 desc table addr, low bits */ uint32_t esc_TDBAH; /* x3804 desc table addr, hi 32-bits */ uint32_t esc_TDLEN; /* x3808 # descriptors in bytes */ uint16_t esc_TDH; /* x3810 desc table head idx */ uint16_t esc_TDHr; /* internal read version of TDH */ uint16_t esc_TDT; /* x3818 desc table tail idx */ uint32_t esc_TIDV; /* x3820 intr delay */ uint32_t esc_TXDCTL; /* x3828 desc control */ uint32_t esc_TADV; /* x382C intr absolute delay */ /* L2 frame acceptance */ struct eth_uni esc_uni[16]; /* 16 x unicast MAC addresses */ uint32_t esc_fmcast[128]; /* Multicast filter bit-match */ uint32_t esc_fvlan[128]; /* VLAN 4096-bit filter */ /* Receive */ struct e1000_rx_desc *esc_rxdesc; pthread_cond_t esc_rx_cond; int esc_rx_enabled; int esc_rx_active; int esc_rx_loopback; uint32_t esc_RCTL; /* x0100 receive ctl */ uint32_t esc_FCRTL; /* x2160 flow cntl thresh, low */ uint32_t esc_FCRTH; /* x2168 flow cntl thresh, hi */ uint64_t esc_rdba; /* verified 64-bit desc table addr */ uint32_t esc_RDBAL; /* x2800 desc table addr, low bits */ uint32_t esc_RDBAH; /* x2804 desc table addr, hi 32-bits*/ uint32_t esc_RDLEN; /* x2808 #descriptors */ uint16_t esc_RDH; /* x2810 desc table head idx */ uint16_t esc_RDT; /* x2818 desc table tail idx */ uint32_t esc_RDTR; /* x2820 intr delay */ uint32_t esc_RXDCTL; /* x2828 desc control */ uint32_t esc_RADV; /* x282C intr absolute delay */ uint32_t esc_RSRPD; /* x2C00 recv small packet detect */ uint32_t esc_RXCSUM; /* x5000 receive cksum ctl */ /* IO Port register access */ uint32_t io_addr; /* Shadow copy of MDIC */ uint32_t mdi_control; /* Shadow copy of EECD */ uint32_t eeprom_control; /* Latest NVM in/out */ uint16_t nvm_data; uint16_t nvm_opaddr; /* stats */ uint32_t missed_pkt_count; /* dropped for no room in rx queue */ uint32_t pkt_rx_by_size[6]; uint32_t pkt_tx_by_size[6]; uint32_t good_pkt_rx_count; uint32_t bcast_pkt_rx_count; uint32_t mcast_pkt_rx_count; uint32_t good_pkt_tx_count; uint32_t bcast_pkt_tx_count; uint32_t mcast_pkt_tx_count; uint32_t oversize_rx_count; uint32_t tso_tx_count; uint64_t good_octets_rx; uint64_t good_octets_tx; uint64_t missed_octets; /* counts missed and oversized */ uint8_t nvm_bits:6; /* number of bits remaining in/out */ uint8_t nvm_mode:2; #define E82545_NVM_MODE_OPADDR 0x0 #define E82545_NVM_MODE_DATAIN 0x1 #define E82545_NVM_MODE_DATAOUT 0x2 /* EEPROM data */ uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE]; }; static void e82545_reset(struct e82545_softc *sc, int dev); static void e82545_rx_enable(struct e82545_softc *sc); static void e82545_rx_disable(struct e82545_softc *sc); static void e82545_rx_callback(int fd, enum ev_type type, void *param); static void e82545_tx_start(struct e82545_softc *sc); static void e82545_tx_enable(struct e82545_softc *sc); static void e82545_tx_disable(struct e82545_softc *sc); static inline int __unused e82545_size_stat_index(uint32_t size) { if (size <= 64) { return 0; } else if (size >= 1024) { return 5; } else { /* should be 1-4 */ return (ffs(size) - 6); } } static void e82545_init_eeprom(struct e82545_softc *sc) { uint16_t checksum, i; /* mac addr */ sc->eeprom_data[NVM_MAC_ADDR] = ((uint16_t)sc->esc_mac.octet[0]) | (((uint16_t)sc->esc_mac.octet[1]) << 8); sc->eeprom_data[NVM_MAC_ADDR+1] = ((uint16_t)sc->esc_mac.octet[2]) | (((uint16_t)sc->esc_mac.octet[3]) << 8); sc->eeprom_data[NVM_MAC_ADDR+2] = ((uint16_t)sc->esc_mac.octet[4]) | (((uint16_t)sc->esc_mac.octet[5]) << 8); /* pci ids */ sc->eeprom_data[NVM_SUB_DEV_ID] = E82545_SUBDEV_ID; sc->eeprom_data[NVM_SUB_VEN_ID] = E82545_VENDOR_ID_INTEL; sc->eeprom_data[NVM_DEV_ID] = E82545_DEV_ID_82545EM_COPPER; sc->eeprom_data[NVM_VEN_ID] = E82545_VENDOR_ID_INTEL; /* fill in the checksum */ checksum = 0; for (i = 0; i < NVM_CHECKSUM_REG; i++) { checksum += sc->eeprom_data[i]; } checksum = NVM_SUM - checksum; sc->eeprom_data[NVM_CHECKSUM_REG] = checksum; DPRINTF("eeprom checksum: 0x%x", checksum); } static void e82545_write_mdi(struct e82545_softc *sc __unused, uint8_t reg_addr, uint8_t phy_addr, uint32_t data) { DPRINTF("Write mdi reg:0x%x phy:0x%x data: 0x%x", reg_addr, phy_addr, data); } static uint32_t e82545_read_mdi(struct e82545_softc *sc __unused, uint8_t reg_addr, uint8_t phy_addr) { //DPRINTF("Read mdi reg:0x%x phy:0x%x", reg_addr, phy_addr); switch (reg_addr) { case PHY_STATUS: return (MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS | MII_SR_AUTONEG_COMPLETE); case PHY_AUTONEG_ADV: return NWAY_AR_SELECTOR_FIELD; case PHY_LP_ABILITY: return 0; case PHY_1000T_STATUS: return (SR_1000T_LP_FD_CAPS | SR_1000T_REMOTE_RX_STATUS | SR_1000T_LOCAL_RX_STATUS); case PHY_ID1: return (M88E1011_I_PHY_ID >> 16) & 0xFFFF; case PHY_ID2: return (M88E1011_I_PHY_ID | E82545_REVISION_4) & 0xFFFF; default: DPRINTF("Unknown mdi read reg:0x%x phy:0x%x", reg_addr, phy_addr); return 0; } /* not reached */ } static void e82545_eecd_strobe(struct e82545_softc *sc) { /* Microwire state machine */ /* DPRINTF("eeprom state machine srtobe " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data);*/ if (sc->nvm_bits == 0) { DPRINTF("eeprom state machine not expecting data! " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data); return; } sc->nvm_bits--; if (sc->nvm_mode == E82545_NVM_MODE_DATAOUT) { /* shifting out */ if (sc->nvm_data & 0x8000) { sc->eeprom_control |= E1000_EECD_DO; } else { sc->eeprom_control &= ~E1000_EECD_DO; } sc->nvm_data <<= 1; if (sc->nvm_bits == 0) { /* read done, back to opcode mode. */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } else if (sc->nvm_mode == E82545_NVM_MODE_DATAIN) { /* shifting in */ sc->nvm_data <<= 1; if (sc->eeprom_control & E1000_EECD_DI) { sc->nvm_data |= 1; } if (sc->nvm_bits == 0) { /* eeprom write */ uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; if (op != E82545_NVM_OPCODE_WRITE) { DPRINTF("Illegal eeprom write op 0x%x", sc->nvm_opaddr); } else if (addr >= E82545_NVM_EEPROM_SIZE) { DPRINTF("Illegal eeprom write addr 0x%x", sc->nvm_opaddr); } else { DPRINTF("eeprom write eeprom[0x%x] = 0x%x", addr, sc->nvm_data); sc->eeprom_data[addr] = sc->nvm_data; } /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } else if (sc->nvm_mode == E82545_NVM_MODE_OPADDR) { sc->nvm_opaddr <<= 1; if (sc->eeprom_control & E1000_EECD_DI) { sc->nvm_opaddr |= 1; } if (sc->nvm_bits == 0) { uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; switch (op) { case E82545_NVM_OPCODE_EWEN: DPRINTF("eeprom write enable: 0x%x", sc->nvm_opaddr); /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; break; case E82545_NVM_OPCODE_READ: { uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; sc->nvm_mode = E82545_NVM_MODE_DATAOUT; sc->nvm_bits = E82545_NVM_DATA_BITS; if (addr < E82545_NVM_EEPROM_SIZE) { sc->nvm_data = sc->eeprom_data[addr]; DPRINTF("eeprom read: eeprom[0x%x] = 0x%x", addr, sc->nvm_data); } else { DPRINTF("eeprom illegal read: 0x%x", sc->nvm_opaddr); sc->nvm_data = 0; } break; } case E82545_NVM_OPCODE_WRITE: sc->nvm_mode = E82545_NVM_MODE_DATAIN; sc->nvm_bits = E82545_NVM_DATA_BITS; sc->nvm_data = 0; break; default: DPRINTF("eeprom unknown op: 0x%x", sc->nvm_opaddr); /* back to opcode mode */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; } } } else { DPRINTF("eeprom state machine wrong state! " "0x%x 0x%x 0x%x 0x%x", sc->nvm_mode, sc->nvm_bits, sc->nvm_opaddr, sc->nvm_data); } } static void e82545_itr_callback(int fd __unused, enum ev_type type __unused, void *param) { uint32_t new; struct e82545_softc *sc = param; pthread_mutex_lock(&sc->esc_mtx); new = sc->esc_ICR & sc->esc_IMS; if (new && !sc->esc_irq_asserted) { DPRINTF("itr callback: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); } else { mevent_delete(sc->esc_mevpitr); sc->esc_mevpitr = NULL; } pthread_mutex_unlock(&sc->esc_mtx); } static void e82545_icr_assert(struct e82545_softc *sc, uint32_t bits) { uint32_t new; DPRINTF("icr assert: 0x%x", bits); /* * An interrupt is only generated if bits are set that * aren't already in the ICR, these bits are unmasked, * and there isn't an interrupt already pending. */ new = bits & ~sc->esc_ICR & sc->esc_IMS; sc->esc_ICR |= bits; if (new == 0) { DPRINTF("icr assert: masked %x, ims %x", new, sc->esc_IMS); } else if (sc->esc_mevpitr != NULL) { DPRINTF("icr assert: throttled %x, ims %x", new, sc->esc_IMS); } else if (!sc->esc_irq_asserted) { DPRINTF("icr assert: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); if (sc->esc_ITR != 0) { sc->esc_mevpitr = mevent_add( (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ EVF_TIMER, e82545_itr_callback, sc); } } } static void e82545_ims_change(struct e82545_softc *sc, uint32_t bits) { uint32_t new; /* * Changing the mask may allow previously asserted * but masked interrupt requests to generate an interrupt. */ new = bits & sc->esc_ICR & ~sc->esc_IMS; sc->esc_IMS |= bits; if (new == 0) { DPRINTF("ims change: masked %x, ims %x", new, sc->esc_IMS); } else if (sc->esc_mevpitr != NULL) { DPRINTF("ims change: throttled %x, ims %x", new, sc->esc_IMS); } else if (!sc->esc_irq_asserted) { DPRINTF("ims change: lintr assert %x", new); sc->esc_irq_asserted = 1; pci_lintr_assert(sc->esc_pi); if (sc->esc_ITR != 0) { sc->esc_mevpitr = mevent_add( (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ EVF_TIMER, e82545_itr_callback, sc); } } } static void e82545_icr_deassert(struct e82545_softc *sc, uint32_t bits) { DPRINTF("icr deassert: 0x%x", bits); sc->esc_ICR &= ~bits; /* * If there are no longer any interrupt sources and there * was an asserted interrupt, clear it */ if (sc->esc_irq_asserted && !(sc->esc_ICR & sc->esc_IMS)) { DPRINTF("icr deassert: lintr deassert %x", bits); pci_lintr_deassert(sc->esc_pi); sc->esc_irq_asserted = 0; } } static void e82545_intr_write(struct e82545_softc *sc, uint32_t offset, uint32_t value) { DPRINTF("intr_write: off %x, val %x", offset, value); switch (offset) { case E1000_ICR: e82545_icr_deassert(sc, value); break; case E1000_ITR: sc->esc_ITR = value; break; case E1000_ICS: sc->esc_ICS = value; /* not used: store for debug */ e82545_icr_assert(sc, value); break; case E1000_IMS: e82545_ims_change(sc, value); break; case E1000_IMC: sc->esc_IMC = value; /* for debug */ sc->esc_IMS &= ~value; // XXX clear interrupts if all ICR bits now masked // and interrupt was pending ? break; default: break; } } static uint32_t e82545_intr_read(struct e82545_softc *sc, uint32_t offset) { uint32_t retval; retval = 0; DPRINTF("intr_read: off %x", offset); switch (offset) { case E1000_ICR: retval = sc->esc_ICR; sc->esc_ICR = 0; e82545_icr_deassert(sc, ~0); break; case E1000_ITR: retval = sc->esc_ITR; break; case E1000_ICS: /* write-only register */ break; case E1000_IMS: retval = sc->esc_IMS; break; case E1000_IMC: /* write-only register */ break; default: break; } return (retval); } static void e82545_devctl(struct e82545_softc *sc, uint32_t val) { sc->esc_CTRL = val & ~E1000_CTRL_RST; if (val & E1000_CTRL_RST) { DPRINTF("e1k: s/w reset, ctl %x", val); e82545_reset(sc, 1); } /* XXX check for phy reset ? */ } static void e82545_rx_update_rdba(struct e82545_softc *sc) { /* XXX verify desc base/len within phys mem range */ sc->esc_rdba = (uint64_t)sc->esc_RDBAH << 32 | sc->esc_RDBAL; /* Cache host mapping of guest descriptor array */ sc->esc_rxdesc = paddr_guest2host(sc->esc_ctx, sc->esc_rdba, sc->esc_RDLEN); } static void e82545_rx_ctl(struct e82545_softc *sc, uint32_t val) { int on; on = ((val & E1000_RCTL_EN) == E1000_RCTL_EN); /* Save RCTL after stripping reserved bits 31:27,24,21,14,11:10,0 */ sc->esc_RCTL = val & ~0xF9204c01; DPRINTF("rx_ctl - %s RCTL %x, val %x", on ? "on" : "off", sc->esc_RCTL, val); /* state change requested */ if (on != sc->esc_rx_enabled) { if (on) { /* Catch disallowed/unimplemented settings */ //assert(!(val & E1000_RCTL_LBM_TCVR)); if (sc->esc_RCTL & E1000_RCTL_LBM_TCVR) { sc->esc_rx_loopback = 1; } else { sc->esc_rx_loopback = 0; } e82545_rx_update_rdba(sc); e82545_rx_enable(sc); } else { e82545_rx_disable(sc); sc->esc_rx_loopback = 0; sc->esc_rdba = 0; sc->esc_rxdesc = NULL; } } } static void e82545_tx_update_tdba(struct e82545_softc *sc) { /* XXX verify desc base/len within phys mem range */ sc->esc_tdba = (uint64_t)sc->esc_TDBAH << 32 | sc->esc_TDBAL; /* Cache host mapping of guest descriptor array */ sc->esc_txdesc = paddr_guest2host(sc->esc_ctx, sc->esc_tdba, sc->esc_TDLEN); } static void e82545_tx_ctl(struct e82545_softc *sc, uint32_t val) { int on; on = ((val & E1000_TCTL_EN) == E1000_TCTL_EN); /* ignore TCTL_EN settings that don't change state */ if (on == sc->esc_tx_enabled) return; if (on) { e82545_tx_update_tdba(sc); e82545_tx_enable(sc); } else { e82545_tx_disable(sc); sc->esc_tdba = 0; sc->esc_txdesc = NULL; } /* Save TCTL value after stripping reserved bits 31:25,23,2,0 */ sc->esc_TCTL = val & ~0xFE800005; } static int e82545_bufsz(uint32_t rctl) { switch (rctl & (E1000_RCTL_BSEX | E1000_RCTL_SZ_256)) { case (E1000_RCTL_SZ_2048): return (2048); case (E1000_RCTL_SZ_1024): return (1024); case (E1000_RCTL_SZ_512): return (512); case (E1000_RCTL_SZ_256): return (256); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_16384): return (16384); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_8192): return (8192); case (E1000_RCTL_BSEX|E1000_RCTL_SZ_4096): return (4096); } return (256); /* Forbidden value. */ } /* XXX one packet at a time until this is debugged */ static void e82545_rx_callback(int fd __unused, enum ev_type type __unused, void *param) { struct e82545_softc *sc = param; struct e1000_rx_desc *rxd; struct iovec vec[64]; ssize_t len; int left, lim, maxpktsz, maxpktdesc, bufsz, i, n, size; uint32_t cause = 0; uint16_t *tp, tag, head; pthread_mutex_lock(&sc->esc_mtx); DPRINTF("rx_run: head %x, tail %x", sc->esc_RDH, sc->esc_RDT); if (!sc->esc_rx_enabled || sc->esc_rx_loopback) { DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped", sc->esc_rx_enabled, sc->esc_rx_loopback); while (netbe_rx_discard(sc->esc_be) > 0) { } goto done1; } bufsz = e82545_bufsz(sc->esc_RCTL); maxpktsz = (sc->esc_RCTL & E1000_RCTL_LPE) ? 16384 : 1522; maxpktdesc = (maxpktsz + bufsz - 1) / bufsz; size = sc->esc_RDLEN / 16; head = sc->esc_RDH; left = (size + sc->esc_RDT - head) % size; if (left < maxpktdesc) { DPRINTF("rx overflow (%d < %d) -- packet(s) dropped", left, maxpktdesc); while (netbe_rx_discard(sc->esc_be) > 0) { } goto done1; } sc->esc_rx_active = 1; pthread_mutex_unlock(&sc->esc_mtx); for (lim = size / 4; lim > 0 && left >= maxpktdesc; lim -= n) { /* Grab rx descriptor pointed to by the head pointer */ for (i = 0; i < maxpktdesc; i++) { rxd = &sc->esc_rxdesc[(head + i) % size]; vec[i].iov_base = paddr_guest2host(sc->esc_ctx, rxd->buffer_addr, bufsz); vec[i].iov_len = bufsz; } len = netbe_recv(sc->esc_be, vec, maxpktdesc); if (len <= 0) { DPRINTF("netbe_recv() returned %zd", len); goto done; } /* * Adjust the packet length based on whether the CRC needs * to be stripped or if the packet is less than the minimum * eth packet size. */ if (len < ETHER_MIN_LEN - ETHER_CRC_LEN) len = ETHER_MIN_LEN - ETHER_CRC_LEN; if (!(sc->esc_RCTL & E1000_RCTL_SECRC)) len += ETHER_CRC_LEN; n = (len + bufsz - 1) / bufsz; DPRINTF("packet read %zd bytes, %d segs, head %d", len, n, head); /* Apply VLAN filter. */ tp = (uint16_t *)vec[0].iov_base + 6; if ((sc->esc_RCTL & E1000_RCTL_VFE) && (ntohs(tp[0]) == sc->esc_VET)) { tag = ntohs(tp[1]) & 0x0fff; if ((sc->esc_fvlan[tag >> 5] & (1 << (tag & 0x1f))) != 0) { DPRINTF("known VLAN %d", tag); } else { DPRINTF("unknown VLAN %d", tag); n = 0; continue; } } /* Update all consumed descriptors. */ for (i = 0; i < n - 1; i++) { rxd = &sc->esc_rxdesc[(head + i) % size]; rxd->length = bufsz; rxd->csum = 0; rxd->errors = 0; rxd->special = 0; rxd->status = E1000_RXD_STAT_DD; } rxd = &sc->esc_rxdesc[(head + i) % size]; rxd->length = len % bufsz; rxd->csum = 0; rxd->errors = 0; rxd->special = 0; /* XXX signal no checksum for now */ rxd->status = E1000_RXD_STAT_PIF | E1000_RXD_STAT_IXSM | E1000_RXD_STAT_EOP | E1000_RXD_STAT_DD; /* Schedule receive interrupts. */ if ((uint32_t)len <= sc->esc_RSRPD) { cause |= E1000_ICR_SRPD | E1000_ICR_RXT0; } else { /* XXX: RDRT and RADV timers should be here. */ cause |= E1000_ICR_RXT0; } head = (head + n) % size; left -= n; } done: pthread_mutex_lock(&sc->esc_mtx); sc->esc_rx_active = 0; if (sc->esc_rx_enabled == 0) pthread_cond_signal(&sc->esc_rx_cond); sc->esc_RDH = head; /* Respect E1000_RCTL_RDMTS */ left = (size + sc->esc_RDT - head) % size; if (left < (size >> (((sc->esc_RCTL >> 8) & 3) + 1))) cause |= E1000_ICR_RXDMT0; /* Assert all accumulated interrupts. */ if (cause != 0) e82545_icr_assert(sc, cause); done1: DPRINTF("rx_run done: head %x, tail %x", sc->esc_RDH, sc->esc_RDT); pthread_mutex_unlock(&sc->esc_mtx); } static uint16_t e82545_carry(uint32_t sum) { sum = (sum & 0xFFFF) + (sum >> 16); if (sum > 0xFFFF) sum -= 0xFFFF; return (sum); } static uint16_t e82545_buf_checksum(uint8_t *buf, int len) { int i; uint32_t sum = 0; /* Checksum all the pairs of bytes first... */ for (i = 0; i < (len & ~1); i += 2) sum += *((u_int16_t *)(buf + i)); /* * If there's a single byte left over, checksum it, too. * Network byte order is big-endian, so the remaining byte is * the high byte. */ if (i < len) sum += htons(buf[i] << 8); return (e82545_carry(sum)); } static uint16_t e82545_iov_checksum(struct iovec *iov, int iovcnt, unsigned int off, unsigned int len) { unsigned int now, odd; uint32_t sum = 0, s; /* Skip completely unneeded vectors. */ while (iovcnt > 0 && iov->iov_len <= off && off > 0) { off -= iov->iov_len; iov++; iovcnt--; } /* Calculate checksum of requested range. */ odd = 0; while (len > 0 && iovcnt > 0) { now = MIN(len, iov->iov_len - off); s = e82545_buf_checksum((uint8_t *)iov->iov_base + off, now); sum += odd ? (s << 8) : s; odd ^= (now & 1); len -= now; off = 0; iov++; iovcnt--; } return (e82545_carry(sum)); } /* * Return the transmit descriptor type. */ static int e82545_txdesc_type(uint32_t lower) { int type; type = 0; if (lower & E1000_TXD_CMD_DEXT) type = lower & E1000_TXD_MASK; return (type); } static void e82545_transmit_checksum(struct iovec *iov, int iovcnt, struct ck_info *ck) { uint16_t cksum; unsigned int cklen; DPRINTF("tx cksum: iovcnt/s/off/len %d/%d/%d/%d", iovcnt, ck->ck_start, ck->ck_off, ck->ck_len); cklen = ck->ck_len ? ck->ck_len - ck->ck_start + 1U : UINT_MAX; cksum = e82545_iov_checksum(iov, iovcnt, ck->ck_start, cklen); *(uint16_t *)((uint8_t *)iov[0].iov_base + ck->ck_off) = ~cksum; } static void e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt) { if (sc->esc_be == NULL) return; (void) netbe_send(sc->esc_be, iov, iovcnt); } static void e82545_transmit_done(struct e82545_softc *sc, uint16_t head, uint16_t tail, uint16_t dsize, int *tdwb) { union e1000_tx_udesc *dsc; for ( ; head != tail; head = (head + 1) % dsize) { dsc = &sc->esc_txdesc[head]; if (dsc->td.lower.data & E1000_TXD_CMD_RS) { dsc->td.upper.data |= E1000_TXD_STAT_DD; *tdwb = 1; } } } static int e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail, uint16_t dsize, uint16_t *rhead, int *tdwb) { uint8_t *hdr, *hdrp; struct iovec iovb[I82545_MAX_TXSEGS + 2]; struct iovec tiov[I82545_MAX_TXSEGS + 2]; struct e1000_context_desc *cd; struct ck_info ckinfo[2]; struct iovec *iov; union e1000_tx_udesc *dsc; int desc, dtype, ntype, iovcnt, tcp, tso, paylen, seg, tiovcnt, pv; unsigned hdrlen, vlen, pktlen, len, left, mss, now, nnow, nleft, pvoff; uint32_t tcpsum, tcpseq; uint16_t ipcs, tcpcs, ipid, ohead; bool invalid; ckinfo[0].ck_valid = ckinfo[1].ck_valid = 0; iovcnt = 0; ntype = 0; tso = 0; pktlen = 0; ohead = head; invalid = false; /* iovb[0/1] may be used for writable copy of headers. */ iov = &iovb[2]; for (desc = 0; ; desc++, head = (head + 1) % dsize) { if (head == tail) { *rhead = head; return (0); } dsc = &sc->esc_txdesc[head]; dtype = e82545_txdesc_type(dsc->td.lower.data); if (desc == 0) { switch (dtype) { case E1000_TXD_TYP_C: DPRINTF("tx ctxt desc idx %d: %016jx " "%08x%08x", head, dsc->td.buffer_addr, dsc->td.upper.data, dsc->td.lower.data); /* Save context and return */ sc->esc_txctx = dsc->cd; goto done; case E1000_TXD_TYP_L: DPRINTF("tx legacy desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); /* * legacy cksum start valid in first descriptor */ ntype = dtype; ckinfo[0].ck_start = dsc->td.upper.fields.css; break; case E1000_TXD_TYP_D: DPRINTF("tx data desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); ntype = dtype; break; default: break; } } else { /* Descriptor type must be consistent */ assert(dtype == ntype); DPRINTF("tx next desc idx %d: %08x%08x", head, dsc->td.upper.data, dsc->td.lower.data); } len = (dtype == E1000_TXD_TYP_L) ? dsc->td.lower.flags.length : dsc->dd.lower.data & 0xFFFFF; /* Strip checksum supplied by guest. */ if ((dsc->td.lower.data & E1000_TXD_CMD_EOP) != 0 && (dsc->td.lower.data & E1000_TXD_CMD_IFCS) == 0) { if (len <= 2) { WPRINTF("final descriptor too short (%d) -- dropped", len); invalid = true; } else len -= 2; } if (len > 0 && iovcnt < I82545_MAX_TXSEGS) { iov[iovcnt].iov_base = paddr_guest2host(sc->esc_ctx, dsc->td.buffer_addr, len); iov[iovcnt].iov_len = len; iovcnt++; pktlen += len; } /* * Pull out info that is valid in the final descriptor * and exit descriptor loop. */ if (dsc->td.lower.data & E1000_TXD_CMD_EOP) { if (dtype == E1000_TXD_TYP_L) { if (dsc->td.lower.data & E1000_TXD_CMD_IC) { ckinfo[0].ck_valid = 1; ckinfo[0].ck_off = dsc->td.lower.flags.cso; ckinfo[0].ck_len = 0; } } else { cd = &sc->esc_txctx; if (dsc->dd.lower.data & E1000_TXD_CMD_TSE) tso = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_IXSM) ckinfo[0].ck_valid = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_IXSM || tso) { ckinfo[0].ck_start = cd->lower_setup.ip_fields.ipcss; ckinfo[0].ck_off = cd->lower_setup.ip_fields.ipcso; ckinfo[0].ck_len = cd->lower_setup.ip_fields.ipcse; } if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_TXSM) ckinfo[1].ck_valid = 1; if (dsc->dd.upper.fields.popts & E1000_TXD_POPTS_TXSM || tso) { ckinfo[1].ck_start = cd->upper_setup.tcp_fields.tucss; ckinfo[1].ck_off = cd->upper_setup.tcp_fields.tucso; ckinfo[1].ck_len = cd->upper_setup.tcp_fields.tucse; } } break; } } if (invalid) goto done; if (iovcnt > I82545_MAX_TXSEGS) { WPRINTF("tx too many descriptors (%d > %d) -- dropped", iovcnt, I82545_MAX_TXSEGS); goto done; } hdrlen = vlen = 0; /* Estimate writable space for VLAN header insertion. */ if ((sc->esc_CTRL & E1000_CTRL_VME) && (dsc->td.lower.data & E1000_TXD_CMD_VLE)) { hdrlen = ETHER_ADDR_LEN*2; vlen = ETHER_VLAN_ENCAP_LEN; } if (!tso) { /* Estimate required writable space for checksums. */ if (ckinfo[0].ck_valid) hdrlen = MAX(hdrlen, ckinfo[0].ck_off + 2U); if (ckinfo[1].ck_valid) hdrlen = MAX(hdrlen, ckinfo[1].ck_off + 2U); /* Round up writable space to the first vector. */ if (hdrlen != 0 && iov[0].iov_len > hdrlen && iov[0].iov_len < hdrlen + 100) hdrlen = iov[0].iov_len; } else { /* In case of TSO header length provided by software. */ hdrlen = sc->esc_txctx.tcp_seg_setup.fields.hdr_len; /* * Cap the header length at 240 based on 7.2.4.5 of * the Intel 82576EB (Rev 2.63) datasheet. */ if (hdrlen > 240) { WPRINTF("TSO hdrlen too large: %d", hdrlen); goto done; } /* * If VLAN insertion is requested, ensure the header * at least holds the amount of data copied during * VLAN insertion below. * * XXX: Realistic packets will include a full Ethernet * header before the IP header at ckinfo[0].ck_start, * but this check is sufficient to prevent * out-of-bounds access below. */ if (vlen != 0 && hdrlen < ETHER_ADDR_LEN*2) { WPRINTF("TSO hdrlen too small for vlan insertion " "(%d vs %d) -- dropped", hdrlen, ETHER_ADDR_LEN*2); goto done; } /* * Ensure that the header length covers the used fields * in the IP and TCP headers as well as the IP and TCP * checksums. The following fields are accessed below: * * Header | Field | Offset | Length * -------+-------+--------+------- * IPv4 | len | 2 | 2 * IPv4 | ID | 4 | 2 * IPv6 | len | 4 | 2 * TCP | seq # | 4 | 4 * TCP | flags | 13 | 1 * UDP | len | 4 | 4 */ if (hdrlen < ckinfo[0].ck_start + 6U || hdrlen < ckinfo[0].ck_off + 2U) { WPRINTF("TSO hdrlen too small for IP fields (%d) " "-- dropped", hdrlen); goto done; } if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) { if (hdrlen < ckinfo[1].ck_start + 14U) { WPRINTF("TSO hdrlen too small for TCP fields " "(%d) -- dropped", hdrlen); goto done; } } else { if (hdrlen < ckinfo[1].ck_start + 8U) { WPRINTF("TSO hdrlen too small for UDP fields " "(%d) -- dropped", hdrlen); goto done; } } if (ckinfo[1].ck_valid && hdrlen < ckinfo[1].ck_off + 2U) { WPRINTF("TSO hdrlen too small for TCP/UDP fields " "(%d) -- dropped", hdrlen); goto done; } } if (pktlen < hdrlen + vlen) { WPRINTF("packet too small for writable header"); goto done; } /* Allocate, fill and prepend writable header vector. */ if (hdrlen + vlen != 0) { hdr = __builtin_alloca(hdrlen + vlen); hdr += vlen; for (left = hdrlen, hdrp = hdr; left > 0; left -= now, hdrp += now) { now = MIN(left, iov->iov_len); memcpy(hdrp, iov->iov_base, now); iov->iov_base = (uint8_t *)iov->iov_base + now; iov->iov_len -= now; if (iov->iov_len == 0) { iov++; iovcnt--; } } iov--; iovcnt++; iov->iov_base = hdr; iov->iov_len = hdrlen; } else hdr = NULL; /* Insert VLAN tag. */ if (vlen != 0) { hdr -= ETHER_VLAN_ENCAP_LEN; memmove(hdr, hdr + ETHER_VLAN_ENCAP_LEN, ETHER_ADDR_LEN*2); hdrlen += ETHER_VLAN_ENCAP_LEN; hdr[ETHER_ADDR_LEN*2 + 0] = sc->esc_VET >> 8; hdr[ETHER_ADDR_LEN*2 + 1] = sc->esc_VET & 0xff; hdr[ETHER_ADDR_LEN*2 + 2] = dsc->td.upper.fields.special >> 8; hdr[ETHER_ADDR_LEN*2 + 3] = dsc->td.upper.fields.special & 0xff; iov->iov_base = hdr; iov->iov_len += ETHER_VLAN_ENCAP_LEN; /* Correct checksum offsets after VLAN tag insertion. */ ckinfo[0].ck_start += ETHER_VLAN_ENCAP_LEN; ckinfo[0].ck_off += ETHER_VLAN_ENCAP_LEN; if (ckinfo[0].ck_len != 0) ckinfo[0].ck_len += ETHER_VLAN_ENCAP_LEN; ckinfo[1].ck_start += ETHER_VLAN_ENCAP_LEN; ckinfo[1].ck_off += ETHER_VLAN_ENCAP_LEN; if (ckinfo[1].ck_len != 0) ckinfo[1].ck_len += ETHER_VLAN_ENCAP_LEN; } /* Simple non-TSO case. */ if (!tso) { /* Calculate checksums and transmit. */ if (ckinfo[0].ck_valid) e82545_transmit_checksum(iov, iovcnt, &ckinfo[0]); if (ckinfo[1].ck_valid) e82545_transmit_checksum(iov, iovcnt, &ckinfo[1]); e82545_transmit_backend(sc, iov, iovcnt); goto done; } /* Doing TSO. */ tcp = (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) != 0; mss = sc->esc_txctx.tcp_seg_setup.fields.mss; paylen = (sc->esc_txctx.cmd_and_length & 0x000fffff); DPRINTF("tx %s segmentation offload %d+%d/%u bytes %d iovs", tcp ? "TCP" : "UDP", hdrlen, paylen, mss, iovcnt); ipid = ntohs(*(uint16_t *)&hdr[ckinfo[0].ck_start + 4]); tcpseq = 0; if (tcp) tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]); ipcs = *(uint16_t *)&hdr[ckinfo[0].ck_off]; tcpcs = 0; if (ckinfo[1].ck_valid) /* Save partial pseudo-header checksum. */ tcpcs = *(uint16_t *)&hdr[ckinfo[1].ck_off]; pv = 1; pvoff = 0; for (seg = 0, left = paylen; left > 0; seg++, left -= now) { now = MIN(left, mss); /* Construct IOVs for the segment. */ /* Include whole original header. */ tiov[0].iov_base = hdr; tiov[0].iov_len = hdrlen; tiovcnt = 1; /* Include respective part of payload IOV. */ for (nleft = now; pv < iovcnt && nleft > 0; nleft -= nnow) { nnow = MIN(nleft, iov[pv].iov_len - pvoff); tiov[tiovcnt].iov_base = (uint8_t *)iov[pv].iov_base + pvoff; tiov[tiovcnt++].iov_len = nnow; if (pvoff + nnow == iov[pv].iov_len) { pv++; pvoff = 0; } else pvoff += nnow; } DPRINTF("tx segment %d %d+%d bytes %d iovs", seg, hdrlen, now, tiovcnt); /* Update IP header. */ if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_IP) { /* IPv4 -- set length and ID */ *(uint16_t *)&hdr[ckinfo[0].ck_start + 2] = htons(hdrlen - ckinfo[0].ck_start + now); *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = htons(ipid + seg); } else { /* IPv6 -- set length */ *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = htons(hdrlen - ckinfo[0].ck_start - 40 + now); } /* Update pseudo-header checksum. */ tcpsum = tcpcs; tcpsum += htons(hdrlen - ckinfo[1].ck_start + now); /* Update TCP/UDP headers. */ if (tcp) { /* Update sequence number and FIN/PUSH flags. */ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = htonl(tcpseq + paylen - left); if (now < left) { hdr[ckinfo[1].ck_start + 13] &= ~(TH_FIN | TH_PUSH); } } else { /* Update payload length. */ *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = hdrlen - ckinfo[1].ck_start + now; } /* Calculate checksums and transmit. */ if (ckinfo[0].ck_valid) { *(uint16_t *)&hdr[ckinfo[0].ck_off] = ipcs; e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[0]); } if (ckinfo[1].ck_valid) { *(uint16_t *)&hdr[ckinfo[1].ck_off] = e82545_carry(tcpsum); e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[1]); } e82545_transmit_backend(sc, tiov, tiovcnt); } done: head = (head + 1) % dsize; e82545_transmit_done(sc, ohead, head, dsize, tdwb); *rhead = head; return (desc + 1); } static void e82545_tx_run(struct e82545_softc *sc) { uint32_t cause; uint16_t head, rhead, tail, size; int lim, tdwb, sent; size = sc->esc_TDLEN / 16; if (size == 0) return; head = sc->esc_TDH % size; tail = sc->esc_TDT % size; DPRINTF("tx_run: head %x, rhead %x, tail %x", sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); pthread_mutex_unlock(&sc->esc_mtx); rhead = head; tdwb = 0; for (lim = size / 4; sc->esc_tx_enabled && lim > 0; lim -= sent) { sent = e82545_transmit(sc, head, tail, size, &rhead, &tdwb); if (sent == 0) break; head = rhead; } pthread_mutex_lock(&sc->esc_mtx); sc->esc_TDH = head; sc->esc_TDHr = rhead; cause = 0; if (tdwb) cause |= E1000_ICR_TXDW; if (lim != size / 4 && sc->esc_TDH == sc->esc_TDT) cause |= E1000_ICR_TXQE; if (cause) e82545_icr_assert(sc, cause); DPRINTF("tx_run done: head %x, rhead %x, tail %x", sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); } static _Noreturn void * e82545_tx_thread(void *param) { struct e82545_softc *sc = param; pthread_mutex_lock(&sc->esc_mtx); for (;;) { while (!sc->esc_tx_enabled || sc->esc_TDHr == sc->esc_TDT) { if (sc->esc_tx_enabled && sc->esc_TDHr != sc->esc_TDT) break; sc->esc_tx_active = 0; if (sc->esc_tx_enabled == 0) pthread_cond_signal(&sc->esc_tx_cond); pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); } sc->esc_tx_active = 1; /* Process some tx descriptors. Lock dropped inside. */ e82545_tx_run(sc); } } static void e82545_tx_start(struct e82545_softc *sc) { if (sc->esc_tx_active == 0) pthread_cond_signal(&sc->esc_tx_cond); } static void e82545_tx_enable(struct e82545_softc *sc) { sc->esc_tx_enabled = 1; } static void e82545_tx_disable(struct e82545_softc *sc) { sc->esc_tx_enabled = 0; while (sc->esc_tx_active) pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); } static void e82545_rx_enable(struct e82545_softc *sc) { sc->esc_rx_enabled = 1; } static void e82545_rx_disable(struct e82545_softc *sc) { sc->esc_rx_enabled = 0; while (sc->esc_rx_active) pthread_cond_wait(&sc->esc_rx_cond, &sc->esc_mtx); } static void e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval) { struct eth_uni *eu; int idx; idx = reg >> 1; assert(idx < 15); eu = &sc->esc_uni[idx]; if (reg & 0x1) { /* RAH */ eu->eu_valid = ((wval & E1000_RAH_AV) == E1000_RAH_AV); eu->eu_addrsel = (wval >> 16) & 0x3; eu->eu_eth.octet[5] = wval >> 8; eu->eu_eth.octet[4] = wval; } else { /* RAL */ eu->eu_eth.octet[3] = wval >> 24; eu->eu_eth.octet[2] = wval >> 16; eu->eu_eth.octet[1] = wval >> 8; eu->eu_eth.octet[0] = wval; } } static uint32_t e82545_read_ra(struct e82545_softc *sc, int reg) { struct eth_uni *eu; uint32_t retval; int idx; idx = reg >> 1; assert(idx < 15); eu = &sc->esc_uni[idx]; if (reg & 0x1) { /* RAH */ retval = (eu->eu_valid << 31) | (eu->eu_addrsel << 16) | (eu->eu_eth.octet[5] << 8) | eu->eu_eth.octet[4]; } else { /* RAL */ retval = (eu->eu_eth.octet[3] << 24) | (eu->eu_eth.octet[2] << 16) | (eu->eu_eth.octet[1] << 8) | eu->eu_eth.octet[0]; } return (retval); } static void e82545_write_register(struct e82545_softc *sc, uint32_t offset, uint32_t value) { int ridx; if (offset & 0x3) { DPRINTF("Unaligned register write offset:0x%x value:0x%x", offset, value); return; } DPRINTF("Register write: 0x%x value: 0x%x", offset, value); switch (offset) { case E1000_CTRL: case E1000_CTRL_DUP: e82545_devctl(sc, value); break; case E1000_FCAL: sc->esc_FCAL = value; break; case E1000_FCAH: sc->esc_FCAH = value & ~0xFFFF0000; break; case E1000_FCT: sc->esc_FCT = value & ~0xFFFF0000; break; case E1000_VET: sc->esc_VET = value & ~0xFFFF0000; break; case E1000_FCTTV: sc->esc_FCTTV = value & ~0xFFFF0000; break; case E1000_LEDCTL: sc->esc_LEDCTL = value & ~0x30303000; break; case E1000_PBA: sc->esc_PBA = value & 0x0000FF80; break; case E1000_ICR: case E1000_ITR: case E1000_ICS: case E1000_IMS: case E1000_IMC: e82545_intr_write(sc, offset, value); break; case E1000_RCTL: e82545_rx_ctl(sc, value); break; case E1000_FCRTL: sc->esc_FCRTL = value & ~0xFFFF0007; break; case E1000_FCRTH: sc->esc_FCRTH = value & ~0xFFFF0007; break; case E1000_RDBAL(0): sc->esc_RDBAL = value & ~0xF; if (sc->esc_rx_enabled) { /* Apparently legal: update cached address */ e82545_rx_update_rdba(sc); } break; case E1000_RDBAH(0): assert(!sc->esc_rx_enabled); sc->esc_RDBAH = value; break; case E1000_RDLEN(0): assert(!sc->esc_rx_enabled); sc->esc_RDLEN = value & ~0xFFF0007F; break; case E1000_RDH(0): /* XXX should only ever be zero ? Range check ? */ sc->esc_RDH = value; break; case E1000_RDT(0): /* XXX if this opens up the rx ring, do something ? */ sc->esc_RDT = value; break; case E1000_RDTR: /* ignore FPD bit 31 */ sc->esc_RDTR = value & ~0xFFFF0000; break; case E1000_RXDCTL(0): sc->esc_RXDCTL = value & ~0xFEC0C0C0; break; case E1000_RADV: sc->esc_RADV = value & ~0xFFFF0000; break; case E1000_RSRPD: sc->esc_RSRPD = value & ~0xFFFFF000; break; case E1000_RXCSUM: sc->esc_RXCSUM = value & ~0xFFFFF800; break; case E1000_TXCW: sc->esc_TXCW = value & ~0x3FFF0000; break; case E1000_TCTL: e82545_tx_ctl(sc, value); break; case E1000_TIPG: sc->esc_TIPG = value; break; case E1000_AIT: sc->esc_AIT = value; break; case E1000_TDBAL(0): sc->esc_TDBAL = value & ~0xF; if (sc->esc_tx_enabled) e82545_tx_update_tdba(sc); break; case E1000_TDBAH(0): sc->esc_TDBAH = value; if (sc->esc_tx_enabled) e82545_tx_update_tdba(sc); break; case E1000_TDLEN(0): sc->esc_TDLEN = value & ~0xFFF0007F; if (sc->esc_tx_enabled) e82545_tx_update_tdba(sc); break; case E1000_TDH(0): if (sc->esc_tx_enabled) { WPRINTF("ignoring write to TDH while transmit enabled"); break; } if (value != 0) { WPRINTF("ignoring non-zero value written to TDH"); break; } sc->esc_TDHr = sc->esc_TDH = value; break; case E1000_TDT(0): sc->esc_TDT = value; if (sc->esc_tx_enabled) e82545_tx_start(sc); break; case E1000_TIDV: sc->esc_TIDV = value & ~0xFFFF0000; break; case E1000_TXDCTL(0): //assert(!sc->esc_tx_enabled); sc->esc_TXDCTL = value & ~0xC0C0C0; break; case E1000_TADV: sc->esc_TADV = value & ~0xFFFF0000; break; case E1000_RAL(0) ... E1000_RAH(15): /* convert to u32 offset */ ridx = (offset - E1000_RAL(0)) >> 2; e82545_write_ra(sc, ridx, value); break; case E1000_MTA ... (E1000_MTA + (127*4)): sc->esc_fmcast[(offset - E1000_MTA) >> 2] = value; break; case E1000_VFTA ... (E1000_VFTA + (127*4)): sc->esc_fvlan[(offset - E1000_VFTA) >> 2] = value; break; case E1000_EECD: { //DPRINTF("EECD write 0x%x -> 0x%x", sc->eeprom_control, value); /* edge triggered low->high */ uint32_t eecd_strobe = ((sc->eeprom_control & E1000_EECD_SK) ? 0 : (value & E1000_EECD_SK)); uint32_t eecd_mask = (E1000_EECD_SK|E1000_EECD_CS| E1000_EECD_DI|E1000_EECD_REQ); sc->eeprom_control &= ~eecd_mask; sc->eeprom_control |= (value & eecd_mask); /* grant/revoke immediately */ if (value & E1000_EECD_REQ) { sc->eeprom_control |= E1000_EECD_GNT; } else { sc->eeprom_control &= ~E1000_EECD_GNT; } if (eecd_strobe && (sc->eeprom_control & E1000_EECD_CS)) { e82545_eecd_strobe(sc); } return; } case E1000_MDIC: { uint8_t reg_addr = (uint8_t)((value & E1000_MDIC_REG_MASK) >> E1000_MDIC_REG_SHIFT); uint8_t phy_addr = (uint8_t)((value & E1000_MDIC_PHY_MASK) >> E1000_MDIC_PHY_SHIFT); sc->mdi_control = (value & ~(E1000_MDIC_ERROR|E1000_MDIC_DEST)); if ((value & E1000_MDIC_READY) != 0) { DPRINTF("Incorrect MDIC ready bit: 0x%x", value); return; } switch (value & E82545_MDIC_OP_MASK) { case E1000_MDIC_OP_READ: sc->mdi_control &= ~E82545_MDIC_DATA_MASK; sc->mdi_control |= e82545_read_mdi(sc, reg_addr, phy_addr); break; case E1000_MDIC_OP_WRITE: e82545_write_mdi(sc, reg_addr, phy_addr, value & E82545_MDIC_DATA_MASK); break; default: DPRINTF("Unknown MDIC op: 0x%x", value); return; } /* TODO: barrier? */ sc->mdi_control |= E1000_MDIC_READY; if (value & E82545_MDIC_IE) { // TODO: generate interrupt } return; } case E1000_MANC: case E1000_STATUS: return; default: DPRINTF("Unknown write register: 0x%x value:%x", offset, value); return; } } static uint32_t e82545_read_register(struct e82545_softc *sc, uint32_t offset) { uint32_t retval; int ridx; if (offset & 0x3) { DPRINTF("Unaligned register read offset:0x%x", offset); return 0; } DPRINTF("Register read: 0x%x", offset); switch (offset) { case E1000_CTRL: retval = sc->esc_CTRL; break; case E1000_STATUS: retval = E1000_STATUS_FD | E1000_STATUS_LU | E1000_STATUS_SPEED_1000; break; case E1000_FCAL: retval = sc->esc_FCAL; break; case E1000_FCAH: retval = sc->esc_FCAH; break; case E1000_FCT: retval = sc->esc_FCT; break; case E1000_VET: retval = sc->esc_VET; break; case E1000_FCTTV: retval = sc->esc_FCTTV; break; case E1000_LEDCTL: retval = sc->esc_LEDCTL; break; case E1000_PBA: retval = sc->esc_PBA; break; case E1000_ICR: case E1000_ITR: case E1000_ICS: case E1000_IMS: case E1000_IMC: retval = e82545_intr_read(sc, offset); break; case E1000_RCTL: retval = sc->esc_RCTL; break; case E1000_FCRTL: retval = sc->esc_FCRTL; break; case E1000_FCRTH: retval = sc->esc_FCRTH; break; case E1000_RDBAL(0): retval = sc->esc_RDBAL; break; case E1000_RDBAH(0): retval = sc->esc_RDBAH; break; case E1000_RDLEN(0): retval = sc->esc_RDLEN; break; case E1000_RDH(0): retval = sc->esc_RDH; break; case E1000_RDT(0): retval = sc->esc_RDT; break; case E1000_RDTR: retval = sc->esc_RDTR; break; case E1000_RXDCTL(0): retval = sc->esc_RXDCTL; break; case E1000_RADV: retval = sc->esc_RADV; break; case E1000_RSRPD: retval = sc->esc_RSRPD; break; case E1000_RXCSUM: retval = sc->esc_RXCSUM; break; case E1000_TXCW: retval = sc->esc_TXCW; break; case E1000_TCTL: retval = sc->esc_TCTL; break; case E1000_TIPG: retval = sc->esc_TIPG; break; case E1000_AIT: retval = sc->esc_AIT; break; case E1000_TDBAL(0): retval = sc->esc_TDBAL; break; case E1000_TDBAH(0): retval = sc->esc_TDBAH; break; case E1000_TDLEN(0): retval = sc->esc_TDLEN; break; case E1000_TDH(0): retval = sc->esc_TDH; break; case E1000_TDT(0): retval = sc->esc_TDT; break; case E1000_TIDV: retval = sc->esc_TIDV; break; case E1000_TXDCTL(0): retval = sc->esc_TXDCTL; break; case E1000_TADV: retval = sc->esc_TADV; break; case E1000_RAL(0) ... E1000_RAH(15): /* convert to u32 offset */ ridx = (offset - E1000_RAL(0)) >> 2; retval = e82545_read_ra(sc, ridx); break; case E1000_MTA ... (E1000_MTA + (127*4)): retval = sc->esc_fmcast[(offset - E1000_MTA) >> 2]; break; case E1000_VFTA ... (E1000_VFTA + (127*4)): retval = sc->esc_fvlan[(offset - E1000_VFTA) >> 2]; break; case E1000_EECD: //DPRINTF("EECD read %x", sc->eeprom_control); retval = sc->eeprom_control; break; case E1000_MDIC: retval = sc->mdi_control; break; case E1000_MANC: retval = 0; break; /* stats that we emulate. */ case E1000_MPC: retval = sc->missed_pkt_count; break; case E1000_PRC64: retval = sc->pkt_rx_by_size[0]; break; case E1000_PRC127: retval = sc->pkt_rx_by_size[1]; break; case E1000_PRC255: retval = sc->pkt_rx_by_size[2]; break; case E1000_PRC511: retval = sc->pkt_rx_by_size[3]; break; case E1000_PRC1023: retval = sc->pkt_rx_by_size[4]; break; case E1000_PRC1522: retval = sc->pkt_rx_by_size[5]; break; case E1000_GPRC: retval = sc->good_pkt_rx_count; break; case E1000_BPRC: retval = sc->bcast_pkt_rx_count; break; case E1000_MPRC: retval = sc->mcast_pkt_rx_count; break; case E1000_GPTC: case E1000_TPT: retval = sc->good_pkt_tx_count; break; case E1000_GORCL: retval = (uint32_t)sc->good_octets_rx; break; case E1000_GORCH: retval = (uint32_t)(sc->good_octets_rx >> 32); break; case E1000_TOTL: case E1000_GOTCL: retval = (uint32_t)sc->good_octets_tx; break; case E1000_TOTH: case E1000_GOTCH: retval = (uint32_t)(sc->good_octets_tx >> 32); break; case E1000_ROC: retval = sc->oversize_rx_count; break; case E1000_TORL: retval = (uint32_t)(sc->good_octets_rx + sc->missed_octets); break; case E1000_TORH: retval = (uint32_t)((sc->good_octets_rx + sc->missed_octets) >> 32); break; case E1000_TPR: retval = sc->good_pkt_rx_count + sc->missed_pkt_count + sc->oversize_rx_count; break; case E1000_PTC64: retval = sc->pkt_tx_by_size[0]; break; case E1000_PTC127: retval = sc->pkt_tx_by_size[1]; break; case E1000_PTC255: retval = sc->pkt_tx_by_size[2]; break; case E1000_PTC511: retval = sc->pkt_tx_by_size[3]; break; case E1000_PTC1023: retval = sc->pkt_tx_by_size[4]; break; case E1000_PTC1522: retval = sc->pkt_tx_by_size[5]; break; case E1000_MPTC: retval = sc->mcast_pkt_tx_count; break; case E1000_BPTC: retval = sc->bcast_pkt_tx_count; break; case E1000_TSCTC: retval = sc->tso_tx_count; break; /* stats that are always 0. */ case E1000_CRCERRS: case E1000_ALGNERRC: case E1000_SYMERRS: case E1000_RXERRC: case E1000_SCC: case E1000_ECOL: case E1000_MCC: case E1000_LATECOL: case E1000_COLC: case E1000_DC: case E1000_TNCRS: case E1000_SEC: case E1000_CEXTERR: case E1000_RLEC: case E1000_XONRXC: case E1000_XONTXC: case E1000_XOFFRXC: case E1000_XOFFTXC: case E1000_FCRUC: case E1000_RNBC: case E1000_RUC: case E1000_RFC: case E1000_RJC: case E1000_MGTPRC: case E1000_MGTPDC: case E1000_MGTPTC: case E1000_TSCTFC: retval = 0; break; default: DPRINTF("Unknown read register: 0x%x", offset); retval = 0; break; } return (retval); } static void -e82545_write(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size, +e82545_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct e82545_softc *sc; //DPRINTF("Write bar:%d offset:0x%lx value:0x%lx size:%d", baridx, offset, value, size); sc = pi->pi_arg; pthread_mutex_lock(&sc->esc_mtx); switch (baridx) { case E82545_BAR_IO: switch (offset) { case E82545_IOADDR: if (size != 4) { DPRINTF("Wrong io addr write sz:%d value:0x%lx", size, value); } else sc->io_addr = (uint32_t)value; break; case E82545_IODATA: if (size != 4) { DPRINTF("Wrong io data write size:%d value:0x%lx", size, value); } else if (sc->io_addr > E82545_IO_REGISTER_MAX) { DPRINTF("Non-register io write addr:0x%x value:0x%lx", sc->io_addr, value); } else e82545_write_register(sc, sc->io_addr, (uint32_t)value); break; default: DPRINTF("Unknown io bar write offset:0x%lx value:0x%lx size:%d", offset, value, size); break; } break; case E82545_BAR_REGISTER: if (size != 4) { DPRINTF("Wrong register write size:%d offset:0x%lx value:0x%lx", size, offset, value); } else e82545_write_register(sc, (uint32_t)offset, (uint32_t)value); break; default: DPRINTF("Unknown write bar:%d off:0x%lx val:0x%lx size:%d", baridx, offset, value, size); } pthread_mutex_unlock(&sc->esc_mtx); } static uint64_t -e82545_read(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size) +e82545_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct e82545_softc *sc; uint64_t retval; //DPRINTF("Read bar:%d offset:0x%lx size:%d", baridx, offset, size); sc = pi->pi_arg; retval = 0; pthread_mutex_lock(&sc->esc_mtx); switch (baridx) { case E82545_BAR_IO: switch (offset) { case E82545_IOADDR: if (size != 4) { DPRINTF("Wrong io addr read sz:%d", size); } else retval = sc->io_addr; break; case E82545_IODATA: if (size != 4) { DPRINTF("Wrong io data read sz:%d", size); } if (sc->io_addr > E82545_IO_REGISTER_MAX) { DPRINTF("Non-register io read addr:0x%x", sc->io_addr); } else retval = e82545_read_register(sc, sc->io_addr); break; default: DPRINTF("Unknown io bar read offset:0x%lx size:%d", offset, size); break; } break; case E82545_BAR_REGISTER: if (size != 4) { DPRINTF("Wrong register read size:%d offset:0x%lx", size, offset); } else retval = e82545_read_register(sc, (uint32_t)offset); break; default: DPRINTF("Unknown read bar:%d offset:0x%lx size:%d", baridx, offset, size); break; } pthread_mutex_unlock(&sc->esc_mtx); return (retval); } static void e82545_reset(struct e82545_softc *sc, int drvr) { int i; e82545_rx_disable(sc); e82545_tx_disable(sc); /* clear outstanding interrupts */ if (sc->esc_irq_asserted) pci_lintr_deassert(sc->esc_pi); /* misc */ if (!drvr) { sc->esc_FCAL = 0; sc->esc_FCAH = 0; sc->esc_FCT = 0; sc->esc_VET = 0; sc->esc_FCTTV = 0; } sc->esc_LEDCTL = 0x07061302; sc->esc_PBA = 0x00100030; /* start nvm in opcode mode. */ sc->nvm_opaddr = 0; sc->nvm_mode = E82545_NVM_MODE_OPADDR; sc->nvm_bits = E82545_NVM_OPADDR_BITS; sc->eeprom_control = E1000_EECD_PRES | E82545_EECD_FWE_EN; e82545_init_eeprom(sc); /* interrupt */ sc->esc_ICR = 0; sc->esc_ITR = 250; sc->esc_ICS = 0; sc->esc_IMS = 0; sc->esc_IMC = 0; /* L2 filters */ if (!drvr) { memset(sc->esc_fvlan, 0, sizeof(sc->esc_fvlan)); memset(sc->esc_fmcast, 0, sizeof(sc->esc_fmcast)); memset(sc->esc_uni, 0, sizeof(sc->esc_uni)); /* XXX not necessary on 82545 ?? */ sc->esc_uni[0].eu_valid = 1; memcpy(sc->esc_uni[0].eu_eth.octet, sc->esc_mac.octet, ETHER_ADDR_LEN); } else { /* Clear RAH valid bits */ for (i = 0; i < 16; i++) sc->esc_uni[i].eu_valid = 0; } /* receive */ if (!drvr) { sc->esc_RDBAL = 0; sc->esc_RDBAH = 0; } sc->esc_RCTL = 0; sc->esc_FCRTL = 0; sc->esc_FCRTH = 0; sc->esc_RDLEN = 0; sc->esc_RDH = 0; sc->esc_RDT = 0; sc->esc_RDTR = 0; sc->esc_RXDCTL = (1 << 24) | (1 << 16); /* default GRAN/WTHRESH */ sc->esc_RADV = 0; sc->esc_RXCSUM = 0; /* transmit */ if (!drvr) { sc->esc_TDBAL = 0; sc->esc_TDBAH = 0; sc->esc_TIPG = 0; sc->esc_AIT = 0; sc->esc_TIDV = 0; sc->esc_TADV = 0; } sc->esc_tdba = 0; sc->esc_txdesc = NULL; sc->esc_TXCW = 0; sc->esc_TCTL = 0; sc->esc_TDLEN = 0; sc->esc_TDT = 0; sc->esc_TDHr = sc->esc_TDH = 0; sc->esc_TXDCTL = 0; } static int -e82545_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) +e82545_init(struct pci_devinst *pi, nvlist_t *nvl) { char nstr[80]; struct e82545_softc *sc; const char *mac; int err; /* Setup our softc */ sc = calloc(1, sizeof(*sc)); pi->pi_arg = sc; sc->esc_pi = pi; - sc->esc_ctx = ctx; + sc->esc_ctx = pi->pi_vmctx; pthread_mutex_init(&sc->esc_mtx, NULL); pthread_cond_init(&sc->esc_rx_cond, NULL); pthread_cond_init(&sc->esc_tx_cond, NULL); pthread_create(&sc->esc_tx_tid, NULL, e82545_tx_thread, sc); snprintf(nstr, sizeof(nstr), "e82545-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->esc_tx_tid, nstr); pci_set_cfgdata16(pi, PCIR_DEVICE, E82545_DEV_ID_82545EM_COPPER); pci_set_cfgdata16(pi, PCIR_VENDOR, E82545_VENDOR_ID_INTEL); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_NETWORK_ETHERNET); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, E82545_SUBDEV_ID); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, E82545_VENDOR_ID_INTEL); pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); pci_set_cfgdata8(pi, PCIR_INTPIN, 0x1); /* TODO: this card also supports msi, but the freebsd driver for it * does not, so I have not implemented it. */ pci_lintr_request(pi); pci_emul_alloc_bar(pi, E82545_BAR_REGISTER, PCIBAR_MEM32, E82545_BAR_REGISTER_LEN); pci_emul_alloc_bar(pi, E82545_BAR_FLASH, PCIBAR_MEM32, E82545_BAR_FLASH_LEN); pci_emul_alloc_bar(pi, E82545_BAR_IO, PCIBAR_IO, E82545_BAR_IO_LEN); mac = get_config_value_node(nvl, "mac"); if (mac != NULL) { err = net_parsemac(mac, sc->esc_mac.octet); if (err) { free(sc); return (err); } } else net_genmac(pi, sc->esc_mac.octet); err = netbe_init(&sc->esc_be, nvl, e82545_rx_callback, sc); if (err) { free(sc); return (err); } netbe_rx_enable(sc->esc_be); /* H/w initiated reset */ e82545_reset(sc, 0); return (0); } #ifdef BHYVE_SNAPSHOT static int e82545_snapshot(struct vm_snapshot_meta *meta) { int i; int ret; struct e82545_softc *sc; struct pci_devinst *pi; uint64_t bitmap_value; pi = meta->dev_data; sc = pi->pi_arg; /* esc_mevp and esc_mevpitr should be reinitiated at init. */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_mac, meta, ret, done); /* General */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_CTRL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCT, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_VET, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCTTV, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_LEDCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_PBA, meta, ret, done); /* Interrupt control */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_irq_asserted, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICR, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_ITR, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICS, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMS, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMC, meta, ret, done); /* * Transmit * * The fields in the unions are in superposition to access certain * bytes in the larger uint variables. * e.g., ip_config = [ipcss|ipcso|ipcse0|ipcse1] */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.lower_setup.ip_config, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.upper_setup.tcp_config, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.cmd_and_length, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.tcp_seg_setup.data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_active, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXCW, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIPG, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_AIT, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_tdba, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDLEN, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDHr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDT, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIDV, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXDCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_TADV, meta, ret, done); /* Has dependency on esc_TDLEN; reoreder of fields from struct. */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_txdesc, sc->esc_TDLEN, true, meta, ret, done); /* L2 frame acceptance */ for (i = 0; i < (int)nitems(sc->esc_uni); i++) { SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_valid, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_addrsel, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_eth, meta, ret, done); } SNAPSHOT_BUF_OR_LEAVE(sc->esc_fmcast, sizeof(sc->esc_fmcast), meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->esc_fvlan, sizeof(sc->esc_fvlan), meta, ret, done); /* Receive */ SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_active, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_loopback, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_rdba, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDLEN, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDH, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDT, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDTR, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXDCTL, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RADV, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RSRPD, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXCSUM, meta, ret, done); /* Has dependency on esc_RDLEN; reoreder of fields from struct. */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_rxdesc, sc->esc_TDLEN, true, meta, ret, done); /* IO Port register access */ SNAPSHOT_VAR_OR_LEAVE(sc->io_addr, meta, ret, done); /* Shadow copy of MDIC */ SNAPSHOT_VAR_OR_LEAVE(sc->mdi_control, meta, ret, done); /* Shadow copy of EECD */ SNAPSHOT_VAR_OR_LEAVE(sc->eeprom_control, meta, ret, done); /* Latest NVM in/out */ SNAPSHOT_VAR_OR_LEAVE(sc->nvm_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->nvm_opaddr, meta, ret, done); /* Stats */ SNAPSHOT_VAR_OR_LEAVE(sc->missed_pkt_count, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->pkt_rx_by_size, sizeof(sc->pkt_rx_by_size), meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->pkt_tx_by_size, sizeof(sc->pkt_tx_by_size), meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_rx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_rx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_rx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_tx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_tx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_tx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->oversize_rx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->tso_tx_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_rx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_tx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->missed_octets, meta, ret, done); if (meta->op == VM_SNAPSHOT_SAVE) bitmap_value = sc->nvm_bits; SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) sc->nvm_bits = bitmap_value; if (meta->op == VM_SNAPSHOT_SAVE) bitmap_value = sc->nvm_bits; SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) sc->nvm_bits = bitmap_value; /* EEPROM data */ SNAPSHOT_BUF_OR_LEAVE(sc->eeprom_data, sizeof(sc->eeprom_data), meta, ret, done); done: return (ret); } #endif static const struct pci_devemu pci_de_e82545 = { .pe_emu = "e1000", .pe_init = e82545_init, .pe_legacy_config = netbe_legacy_config, .pe_barwrite = e82545_write, .pe_barread = e82545_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = e82545_snapshot, #endif }; PCI_EMUL_SET(pci_de_e82545); diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c index cca0932180d6..93411751d635 100644 --- a/usr.sbin/bhyve/pci_emul.c +++ b/usr.sbin/bhyve/pci_emul.c @@ -1,2646 +1,2642 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "acpi.h" #include "bhyverun.h" #include "config.h" #include "debug.h" #include "inout.h" #include "ioapic.h" #include "mem.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #define CONF1_ADDR_PORT 0x0cf8 #define CONF1_DATA_PORT 0x0cfc #define CONF1_ENABLE 0x80000000ul #define MAXBUSES (PCI_BUSMAX + 1) #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) #define GB (1024 * 1024 * 1024UL) struct funcinfo { nvlist_t *fi_config; struct pci_devemu *fi_pde; struct pci_devinst *fi_devi; }; struct intxinfo { int ii_count; int ii_pirq_pin; int ii_ioapic_irq; }; struct slotinfo { struct intxinfo si_intpins[4]; struct funcinfo si_funcs[MAXFUNCS]; }; struct businfo { uint16_t iobase, iolimit; /* I/O window */ uint32_t membase32, memlimit32; /* mmio window below 4GB */ uint64_t membase64, memlimit64; /* mmio window above 4GB */ struct slotinfo slotinfo[MAXSLOTS]; }; static struct businfo *pci_businfo[MAXBUSES]; SET_DECLARE(pci_devemu_set, struct pci_devemu); static uint64_t pci_emul_iobase; static uint8_t *pci_emul_rombase; static uint64_t pci_emul_romoffset; static uint8_t *pci_emul_romlim; static uint64_t pci_emul_membase32; static uint64_t pci_emul_membase64; static uint64_t pci_emul_memlim64; struct pci_bar_allocation { TAILQ_ENTRY(pci_bar_allocation) chain; struct pci_devinst *pdi; int idx; enum pcibar_type type; uint64_t size; }; static TAILQ_HEAD(pci_bar_list, pci_bar_allocation) pci_bars = TAILQ_HEAD_INITIALIZER(pci_bars); #define PCI_EMUL_IOBASE 0x2000 #define PCI_EMUL_IOLIMIT 0x10000 #define PCI_EMUL_ROMSIZE 0x10000000 #define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */ #define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */ SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE); /* * OVMF always uses 0xC0000000 as base address for 32 bit PCI MMIO. Don't * change this address without changing it in OVMF. */ #define PCI_EMUL_MEMBASE32 0xC0000000 #define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE #define PCI_EMUL_MEMSIZE64 (32*GB) static struct pci_devemu *pci_emul_finddev(const char *name); static void pci_lintr_route(struct pci_devinst *pi); static void pci_lintr_update(struct pci_devinst *pi); -static void pci_cfgrw(struct vmctx *ctx, int in, int bus, int slot, - int func, int coff, int bytes, uint32_t *val); +static void pci_cfgrw(int in, int bus, int slot, int func, int coff, + int bytes, uint32_t *val); static __inline void CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) { if (bytes == 1) pci_set_cfgdata8(pi, coff, val); else if (bytes == 2) pci_set_cfgdata16(pi, coff, val); else pci_set_cfgdata32(pi, coff, val); } static __inline uint32_t CFGREAD(struct pci_devinst *pi, int coff, int bytes) { if (bytes == 1) return (pci_get_cfgdata8(pi, coff)); else if (bytes == 2) return (pci_get_cfgdata16(pi, coff)); else return (pci_get_cfgdata32(pi, coff)); } static int is_pcir_bar(int coff) { return (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)); } static int is_pcir_bios(int coff) { return (coff >= PCIR_BIOS && coff < PCIR_BIOS + 4); } /* * I/O access */ /* * Slot options are in the form: * * ::,[,] * [:],[,] * * slot is 0..31 * func is 0..7 * emul is a string describing the type of PCI device e.g. virtio-net * config is an optional string, depending on the device, that can be * used for configuration. * Examples are: * 1,virtio-net,tap0 * 3:0,dummy */ static void pci_parse_slot_usage(char *aopt) { EPRINTLN("Invalid PCI slot info field \"%s\"", aopt); } /* * Helper function to parse a list of comma-separated options where * each option is formatted as "name[=value]". If no value is * provided, the option is treated as a boolean and is given a value * of true. */ int pci_parse_legacy_config(nvlist_t *nvl, const char *opt) { char *config, *name, *tofree, *value; if (opt == NULL) return (0); config = tofree = strdup(opt); while ((name = strsep(&config, ",")) != NULL) { value = strchr(name, '='); if (value != NULL) { *value = '\0'; value++; set_config_value_node(nvl, name, value); } else set_config_bool_node(nvl, name, true); } free(tofree); return (0); } /* * PCI device configuration is stored in MIBs that encode the device's * location: * * pci... * * Where "bus", "slot", and "func" are all decimal values without * leading zeroes. Each valid device must have a "device" node which * identifies the driver model of the device. * * Device backends can provide a parser for the "config" string. If * a custom parser is not provided, pci_parse_legacy_config() is used * to parse the string. */ int pci_parse_slot(char *opt) { char node_name[sizeof("pci.XXX.XX.X")]; struct pci_devemu *pde; char *emul, *config, *str, *cp; int error, bnum, snum, fnum; nvlist_t *nvl; error = -1; str = strdup(opt); emul = config = NULL; if ((cp = strchr(str, ',')) != NULL) { *cp = '\0'; emul = cp + 1; if ((cp = strchr(emul, ',')) != NULL) { *cp = '\0'; config = cp + 1; } } else { pci_parse_slot_usage(opt); goto done; } /* :: */ if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { bnum = 0; /* : */ if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { fnum = 0; /* */ if (sscanf(str, "%d", &snum) != 1) { snum = -1; } } } if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) { pci_parse_slot_usage(opt); goto done; } pde = pci_emul_finddev(emul); if (pde == NULL) { EPRINTLN("pci slot %d:%d:%d: unknown device \"%s\"", bnum, snum, fnum, emul); goto done; } snprintf(node_name, sizeof(node_name), "pci.%d.%d.%d", bnum, snum, fnum); nvl = find_config_node(node_name); if (nvl != NULL) { EPRINTLN("pci slot %d:%d:%d already occupied!", bnum, snum, fnum); goto done; } nvl = create_config_node(node_name); if (pde->pe_alias != NULL) set_config_value_node(nvl, "device", pde->pe_alias); else set_config_value_node(nvl, "device", pde->pe_emu); if (pde->pe_legacy_config != NULL) error = pde->pe_legacy_config(nvl, config); else error = pci_parse_legacy_config(nvl, config); done: free(str); return (error); } void pci_print_supported_devices(void) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; printf("%s\n", pdp->pe_emu); } } static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { if (offset < pi->pi_msix.pba_offset) return (0); if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { return (0); } return (1); } int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value) { int msix_entry_offset; int tab_index; char *dest; /* support only 4 or 8 byte writes */ if (size != 4 && size != 8) return (-1); /* * Return if table index is beyond what device supports */ tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index >= pi->pi_msix.table_count) return (-1); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned writes */ if ((msix_entry_offset % size) != 0) return (-1); dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 4) *((uint32_t *)dest) = value; else *((uint64_t *)dest) = value; return (0); } uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) { char *dest; int msix_entry_offset; int tab_index; uint64_t retval = ~0; /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X * table but we also allow 1 byte access to accommodate reads from * ddb. */ if (size != 1 && size != 4 && size != 8) return (retval); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned reads */ if ((msix_entry_offset % size) != 0) { return (retval); } tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index < pi->pi_msix.table_count) { /* valid MSI-X Table access */ dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 1) retval = *((uint8_t *)dest); else if (size == 4) retval = *((uint32_t *)dest); else retval = *((uint64_t *)dest); } else if (pci_valid_pba_offset(pi, offset)) { /* return 0 for PBA access */ retval = 0; } return (retval); } int pci_msix_table_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.table_bar); else return (-1); } int pci_msix_pba_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.pba_bar); else return (-1); } static int -pci_emul_io_handler(struct vmctx *ctx, int in, int port, +pci_emul_io_handler(struct vmctx *ctx __unused, int in, int port, int bytes, uint32_t *eax, void *arg) { struct pci_devinst *pdi = arg; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int i; assert(port >= 0); for (i = 0; i <= PCI_BARMAX; i++) { if (pdi->pi_bar[i].type == PCIBAR_IO && (uint64_t)port >= pdi->pi_bar[i].addr && (uint64_t)port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { offset = port - pdi->pi_bar[i].addr; if (in) - *eax = (*pe->pe_barread)(ctx, pdi, i, + *eax = (*pe->pe_barread)(pdi, i, offset, bytes); else - (*pe->pe_barwrite)(ctx, pdi, i, offset, + (*pe->pe_barwrite)(pdi, i, offset, bytes, *eax); return (0); } } return (-1); } static int -pci_emul_mem_handler(struct vmctx *ctx, int vcpu __unused, int dir, +pci_emul_mem_handler(struct vmctx *ctx __unused, int vcpu __unused, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { struct pci_devinst *pdi = arg1; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int bidx = (int) arg2; assert(bidx <= PCI_BARMAX); assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || pdi->pi_bar[bidx].type == PCIBAR_MEM64); assert(addr >= pdi->pi_bar[bidx].addr && addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); offset = addr - pdi->pi_bar[bidx].addr; if (dir == MEM_F_WRITE) { if (size == 8) { - (*pe->pe_barwrite)(ctx, pdi, bidx, offset, + (*pe->pe_barwrite)(pdi, bidx, offset, 4, *val & 0xffffffff); - (*pe->pe_barwrite)(ctx, pdi, bidx, offset + 4, + (*pe->pe_barwrite)(pdi, bidx, offset + 4, 4, *val >> 32); } else { - (*pe->pe_barwrite)(ctx, pdi, bidx, offset, + (*pe->pe_barwrite)(pdi, bidx, offset, size, *val); } } else { if (size == 8) { - *val = (*pe->pe_barread)(ctx, pdi, bidx, + *val = (*pe->pe_barread)(pdi, bidx, offset, 4); - *val |= (*pe->pe_barread)(ctx, pdi, bidx, + *val |= (*pe->pe_barread)(pdi, bidx, offset + 4, 4) << 32; } else { - *val = (*pe->pe_barread)(ctx, pdi, bidx, + *val = (*pe->pe_barread)(pdi, bidx, offset, size); } } return (0); } static int pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, uint64_t *addr) { uint64_t base; assert((size & (size - 1)) == 0); /* must be a power of 2 */ base = roundup2(*baseptr, size); if (base + size <= limit) { *addr = base; *baseptr = base + size; return (0); } else return (-1); } /* * Register (or unregister) the MMIO or I/O region associated with the BAR * register 'idx' of an emulated pci device. */ static void modify_bar_registration(struct pci_devinst *pi, int idx, int registration) { struct pci_devemu *pe; int error; struct inout_port iop; struct mem_range mr; pe = pi->pi_d; switch (pi->pi_bar[idx].type) { case PCIBAR_IO: bzero(&iop, sizeof(struct inout_port)); iop.name = pi->pi_name; iop.port = pi->pi_bar[idx].addr; iop.size = pi->pi_bar[idx].size; if (registration) { iop.flags = IOPORT_F_INOUT; iop.handler = pci_emul_io_handler; iop.arg = pi; error = register_inout(&iop); } else error = unregister_inout(&iop); if (pe->pe_baraddr != NULL) - (*pe->pe_baraddr)(pi->pi_vmctx, pi, idx, registration, + (*pe->pe_baraddr)(pi, idx, registration, pi->pi_bar[idx].addr); break; case PCIBAR_MEM32: case PCIBAR_MEM64: bzero(&mr, sizeof(struct mem_range)); mr.name = pi->pi_name; mr.base = pi->pi_bar[idx].addr; mr.size = pi->pi_bar[idx].size; if (registration) { mr.flags = MEM_F_RW; mr.handler = pci_emul_mem_handler; mr.arg1 = pi; mr.arg2 = idx; error = register_mem(&mr); } else error = unregister_mem(&mr); if (pe->pe_baraddr != NULL) - (*pe->pe_baraddr)(pi->pi_vmctx, pi, idx, registration, + (*pe->pe_baraddr)(pi, idx, registration, pi->pi_bar[idx].addr); break; case PCIBAR_ROM: error = 0; if (pe->pe_baraddr != NULL) - (*pe->pe_baraddr)(pi->pi_vmctx, pi, idx, registration, + (*pe->pe_baraddr)(pi, idx, registration, pi->pi_bar[idx].addr); break; default: error = EINVAL; break; } assert(error == 0); } static void unregister_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 0); } static void register_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 1); } /* Is the ROM enabled for the emulated pci device? */ static int romen(struct pci_devinst *pi) { return (pi->pi_bar[PCI_ROM_IDX].lobits & PCIM_BIOS_ENABLE) == PCIM_BIOS_ENABLE; } /* Are we decoding i/o port accesses for the emulated pci device? */ static int porten(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_PORTEN); } /* Are we decoding memory accesses for the emulated pci device? */ static int memen(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_MEMEN); } /* * Update the MMIO or I/O address that is decoded by the BAR register. * * If the pci device has enabled the address space decoding then intercept * the address range decoded by the BAR register. */ static void update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) { int decode; if (pi->pi_bar[idx].type == PCIBAR_IO) decode = porten(pi); else decode = memen(pi); if (decode) unregister_bar(pi, idx); switch (type) { case PCIBAR_IO: case PCIBAR_MEM32: pi->pi_bar[idx].addr = addr; break; case PCIBAR_MEM64: pi->pi_bar[idx].addr &= ~0xffffffffUL; pi->pi_bar[idx].addr |= addr; break; case PCIBAR_MEMHI64: pi->pi_bar[idx].addr &= 0xffffffff; pi->pi_bar[idx].addr |= addr; break; default: assert(0); } if (decode) register_bar(pi, idx); } int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size) { assert((type == PCIBAR_ROM) || (idx >= 0 && idx <= PCI_BARMAX)); assert((type != PCIBAR_ROM) || (idx == PCI_ROM_IDX)); if ((size & (size - 1)) != 0) size = 1UL << flsl(size); /* round up to a power of 2 */ /* Enforce minimum BAR sizes required by the PCI standard */ if (type == PCIBAR_IO) { if (size < 4) size = 4; } else if (type == PCIBAR_ROM) { if (size < ~PCIM_BIOS_ADDR_MASK + 1) size = ~PCIM_BIOS_ADDR_MASK + 1; } else { if (size < 16) size = 16; } /* * To reduce fragmentation of the MMIO space, we allocate the BARs by * size. Therefore, don't allocate the BAR yet. We create a list of all * BAR allocation which is sorted by BAR size. When all PCI devices are * initialized, we will assign an address to the BARs. */ /* create a new list entry */ struct pci_bar_allocation *const new_bar = malloc(sizeof(*new_bar)); memset(new_bar, 0, sizeof(*new_bar)); new_bar->pdi = pdi; new_bar->idx = idx; new_bar->type = type; new_bar->size = size; /* * Search for a BAR which size is lower than the size of our newly * allocated BAR. */ struct pci_bar_allocation *bar = NULL; TAILQ_FOREACH(bar, &pci_bars, chain) { if (bar->size < size) { break; } } if (bar == NULL) { /* * Either the list is empty or new BAR is the smallest BAR of * the list. Append it to the end of our list. */ TAILQ_INSERT_TAIL(&pci_bars, new_bar, chain); } else { /* * The found BAR is smaller than our new BAR. For that reason, * insert our new BAR before the found BAR. */ TAILQ_INSERT_BEFORE(bar, new_bar, chain); } /* * pci_passthru devices synchronize their physical and virtual command * register on init. For that reason, the virtual cmd reg should be * updated as early as possible. */ uint16_t enbit = 0; switch (type) { case PCIBAR_IO: enbit = PCIM_CMD_PORTEN; break; case PCIBAR_MEM64: case PCIBAR_MEM32: enbit = PCIM_CMD_MEMEN; break; default: enbit = 0; break; } const uint16_t cmd = pci_get_cfgdata16(pdi, PCIR_COMMAND); pci_set_cfgdata16(pdi, PCIR_COMMAND, cmd | enbit); return (0); } static int pci_emul_assign_bar(struct pci_devinst *const pdi, const int idx, const enum pcibar_type type, const uint64_t size) { int error; uint64_t *baseptr, limit, addr, mask, lobits, bar; switch (type) { case PCIBAR_NONE: baseptr = NULL; addr = mask = lobits = 0; break; case PCIBAR_IO: baseptr = &pci_emul_iobase; limit = PCI_EMUL_IOLIMIT; mask = PCIM_BAR_IO_BASE; lobits = PCIM_BAR_IO_SPACE; break; case PCIBAR_MEM64: /* * XXX * Some drivers do not work well if the 64-bit BAR is allocated * above 4GB. Allow for this by allocating small requests under * 4GB unless then allocation size is larger than some arbitrary * number (128MB currently). */ if (size > 128 * 1024 * 1024) { baseptr = &pci_emul_membase64; limit = pci_emul_memlim64; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; } else { baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; } break; case PCIBAR_MEM32: baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; break; case PCIBAR_ROM: /* do not claim memory for ROM. OVMF will do it for us. */ baseptr = NULL; limit = 0; mask = PCIM_BIOS_ADDR_MASK; lobits = 0; break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); assert(0); } if (baseptr != NULL) { error = pci_emul_alloc_resource(baseptr, limit, size, &addr); if (error != 0) return (error); } else { addr = 0; } pdi->pi_bar[idx].type = type; pdi->pi_bar[idx].addr = addr; pdi->pi_bar[idx].size = size; /* * passthru devices are using same lobits as physical device they set * this property */ if (pdi->pi_bar[idx].lobits != 0) { lobits = pdi->pi_bar[idx].lobits; } else { pdi->pi_bar[idx].lobits = lobits; } /* Initialize the BAR register in config space */ bar = (addr & mask) | lobits; pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); if (type == PCIBAR_MEM64) { assert(idx + 1 <= PCI_BARMAX); pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } if (type != PCIBAR_ROM) { register_bar(pdi, idx); } return (0); } int pci_emul_alloc_rom(struct pci_devinst *const pdi, const uint64_t size, void **const addr) { /* allocate ROM space once on first call */ if (pci_emul_rombase == 0) { pci_emul_rombase = vm_create_devmem(pdi->pi_vmctx, VM_PCIROM, "pcirom", PCI_EMUL_ROMSIZE); if (pci_emul_rombase == MAP_FAILED) { warnx("%s: failed to create rom segment", __func__); return (-1); } pci_emul_romlim = pci_emul_rombase + PCI_EMUL_ROMSIZE; pci_emul_romoffset = 0; } /* ROM size should be a power of 2 and greater than 2 KB */ const uint64_t rom_size = MAX(1UL << flsl(size), ~PCIM_BIOS_ADDR_MASK + 1); /* check if ROM fits into ROM space */ if (pci_emul_romoffset + rom_size > PCI_EMUL_ROMSIZE) { warnx("%s: no space left in rom segment:", __func__); warnx("%16lu bytes left", PCI_EMUL_ROMSIZE - pci_emul_romoffset); warnx("%16lu bytes required by %d/%d/%d", rom_size, pdi->pi_bus, pdi->pi_slot, pdi->pi_func); return (-1); } /* allocate ROM BAR */ const int error = pci_emul_alloc_bar(pdi, PCI_ROM_IDX, PCIBAR_ROM, rom_size); if (error) return error; /* return address */ *addr = pci_emul_rombase + pci_emul_romoffset; /* save offset into ROM Space */ pdi->pi_romoffset = pci_emul_romoffset; /* increase offset for next ROM */ pci_emul_romoffset += rom_size; return (0); } #define CAP_START_OFFSET 0x40 static int pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) { int i, capoff, reallen; uint16_t sts; assert(caplen > 0); reallen = roundup2(caplen, 4); /* dword aligned */ sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) == 0) capoff = CAP_START_OFFSET; else capoff = pi->pi_capend + 1; /* Check if we have enough space */ if (capoff + reallen > PCI_REGMAX + 1) return (-1); /* Set the previous capability pointer */ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); } else pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); /* Copy the capability */ for (i = 0; i < caplen; i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); /* Set the next capability pointer */ pci_set_cfgdata8(pi, capoff + 1, 0); pi->pi_prevcap = capoff; pi->pi_capend = capoff + reallen - 1; return (0); } static struct pci_devemu * pci_emul_finddev(const char *name) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; if (!strcmp(pdp->pe_emu, name)) { return (pdp); } } return (NULL); } static int pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, int func, struct funcinfo *fi) { struct pci_devinst *pdi; int err; pdi = calloc(1, sizeof(struct pci_devinst)); pdi->pi_vmctx = ctx; pdi->pi_bus = bus; pdi->pi_slot = slot; pdi->pi_func = func; pthread_mutex_init(&pdi->pi_lintr.lock, NULL); pdi->pi_lintr.pin = 0; pdi->pi_lintr.state = IDLE; pdi->pi_lintr.pirq_pin = 0; pdi->pi_lintr.ioapic_irq = 0; pdi->pi_d = pde; snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot); /* Disable legacy interrupts */ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN); - err = (*pde->pe_init)(ctx, pdi, fi->fi_config); + err = (*pde->pe_init)(pdi, fi->fi_config); if (err == 0) fi->fi_devi = pdi; else free(pdi); return (err); } void pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); mmc = ffs(msgnum) - 1; bzero(msicap, sizeof(struct msicap)); msicap->capid = PCIY_MSI; msicap->nextptr = nextptr; msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); } int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) { struct msicap msicap; pci_populate_msicap(&msicap, msgnum, 0); return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); } static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { assert(msix_tab_size % 4096 == 0); bzero(msixcap, sizeof(struct msixcap)); msixcap->capid = PCIY_MSIX; /* * Message Control Register, all fields set to * zero except for the Table Size. * Note: Table size N is encoded as N-1 */ msixcap->msgctrl = msgnum - 1; /* * MSI-X BAR setup: * - MSI-X table start at offset 0 * - PBA table starts at a 4K aligned offset after the MSI-X table */ msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); } static void pci_msix_table_init(struct pci_devinst *pi, int table_entries) { int i, table_size; assert(table_entries > 0); assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* set mask bit of vector control register */ for (i = 0; i < table_entries; i++) pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) { uint32_t tab_size; struct msixcap msixcap; assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ tab_size = roundup2(tab_size, 4096); pi->pi_msix.table_bar = barnum; pi->pi_msix.pba_bar = barnum; pi->pi_msix.table_offset = 0; pi->pi_msix.table_count = msgnum; pi->pi_msix.pba_offset = tab_size; pi->pi_msix.pba_size = PBA_SIZE(msgnum); pci_msix_table_init(pi, msgnum); pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); /* allocate memory for MSI-X Table and PBA */ pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, tab_size + pi->pi_msix.pba_size); return (pci_emul_add_capability(pi, (u_char *)&msixcap, sizeof(msixcap))); } static void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; int off; off = offset - capoff; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } static void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask, msgdata, mme; uint32_t addrlo; /* * If guest is writing to the message control register make sure * we do not overwrite read-only fields. */ if ((offset - capoff) == 2 && bytes == 2) { rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; } CFGWRITE(pi, offset, val, bytes); msgctrl = pci_get_cfgdata16(pi, capoff + 2); addrlo = pci_get_cfgdata32(pi, capoff + 4); if (msgctrl & PCIM_MSICTRL_64BIT) msgdata = pci_get_cfgdata16(pi, capoff + 12); else msgdata = pci_get_cfgdata16(pi, capoff + 8); mme = msgctrl & PCIM_MSICTRL_MME_MASK; pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0; if (pi->pi_msi.enabled) { pi->pi_msi.addr = addrlo; pi->pi_msi.msg_data = msgdata; pi->pi_msi.maxmsgnum = 1 << (mme >> 4); } else { pi->pi_msi.maxmsgnum = 0; } pci_lintr_update(pi); } static void pciecap_cfgwrite(struct pci_devinst *pi, int capoff __unused, int offset, int bytes, uint32_t val) { /* XXX don't write to the readonly parts */ CFGWRITE(pi, offset, val, bytes); } #define PCIECAP_VERSION 0x2 int pci_emul_add_pciecap(struct pci_devinst *pi, int type) { int err; struct pciecap pciecap; bzero(&pciecap, sizeof(pciecap)); /* * Use the integrated endpoint type for endpoints on a root complex bus. * * NB: bhyve currently only supports a single PCI bus that is the root * complex bus, so all endpoints are integrated. */ if ((type == PCIEM_TYPE_ENDPOINT) && (pi->pi_bus == 0)) type = PCIEM_TYPE_ROOT_INT_EP; pciecap.capid = PCIY_EXPRESS; pciecap.pcie_capabilities = PCIECAP_VERSION | type; if (type != PCIEM_TYPE_ROOT_INT_EP) { pciecap.link_capabilities = 0x411; /* gen1, x1 */ pciecap.link_status = 0x11; /* gen1, x1 */ } err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); return (err); } /* * This function assumes that 'coff' is in the capabilities region of the * config space. A capoff parameter of zero will force a search for the * offset and type. */ void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val, uint8_t capoff, int capid) { uint8_t nextoff; /* Do not allow un-aligned writes */ if ((offset & (bytes - 1)) != 0) return; if (capoff == 0) { /* Find the capability that we want to update */ capoff = CAP_START_OFFSET; while (1) { nextoff = pci_get_cfgdata8(pi, capoff + 1); if (nextoff == 0) break; if (offset >= capoff && offset < nextoff) break; capoff = nextoff; } assert(offset >= capoff); capid = pci_get_cfgdata8(pi, capoff); } /* * Capability ID and Next Capability Pointer are readonly. * However, some o/s's do 4-byte writes that include these. * For this case, trim the write back to 2 bytes and adjust * the data. */ if (offset == capoff || offset == capoff + 1) { if (offset == capoff && bytes == 4) { bytes = 2; offset += 2; val >>= 16; } else return; } switch (capid) { case PCIY_MSI: msicap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_MSIX: msixcap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_EXPRESS: pciecap_cfgwrite(pi, capoff, offset, bytes, val); break; default: break; } } static int pci_emul_iscap(struct pci_devinst *pi, int offset) { uint16_t sts; sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) return (1); } return (0); } static int pci_emul_fallback_handler(struct vmctx *ctx __unused, int vcpu __unused, int dir, uint64_t addr __unused, int size __unused, uint64_t *val, void *arg1 __unused, long arg2 __unused) { /* * Ignore writes; return 0xff's for reads. The mem read code * will take care of truncating to the correct size. */ if (dir == MEM_F_READ) { *val = 0xffffffffffffffff; } return (0); } static int -pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu __unused, int dir, +pci_emul_ecfg_handler(struct vmctx *ctx __unused, int vcpu __unused, int dir, uint64_t addr, int bytes, uint64_t *val, void *arg1 __unused, long arg2 __unused) { int bus, slot, func, coff, in; coff = addr & 0xfff; func = (addr >> 12) & 0x7; slot = (addr >> 15) & 0x1f; bus = (addr >> 20) & 0xff; in = (dir == MEM_F_READ); if (in) *val = ~0UL; - pci_cfgrw(ctx, in, bus, slot, func, coff, bytes, (uint32_t *)val); + pci_cfgrw(in, bus, slot, func, coff, bytes, (uint32_t *)val); return (0); } uint64_t pci_ecfg_base(void) { return (PCI_EMUL_ECFG_BASE); } #define BUSIO_ROUNDUP 32 #define BUSMEM32_ROUNDUP (1024 * 1024) #define BUSMEM64_ROUNDUP (512 * 1024 * 1024) int init_pci(struct vmctx *ctx) { char node_name[sizeof("pci.XXX.XX.X")]; struct mem_range mr; struct pci_devemu *pde; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; nvlist_t *nvl; const char *emul; size_t lowmem; int bus, slot, func; int error; if (vm_get_lowmem_limit(ctx) > PCI_EMUL_MEMBASE32) errx(EX_OSERR, "Invalid lowmem limit"); pci_emul_iobase = PCI_EMUL_IOBASE; pci_emul_membase32 = PCI_EMUL_MEMBASE32; pci_emul_membase64 = 4*GB + vm_get_highmem_size(ctx); pci_emul_membase64 = roundup2(pci_emul_membase64, PCI_EMUL_MEMSIZE64); pci_emul_memlim64 = pci_emul_membase64 + PCI_EMUL_MEMSIZE64; for (bus = 0; bus < MAXBUSES; bus++) { snprintf(node_name, sizeof(node_name), "pci.%d", bus); nvl = find_config_node(node_name); if (nvl == NULL) continue; pci_businfo[bus] = calloc(1, sizeof(struct businfo)); bi = pci_businfo[bus]; /* * Keep track of the i/o and memory resources allocated to * this bus. */ bi->iobase = pci_emul_iobase; bi->membase32 = pci_emul_membase32; bi->membase64 = pci_emul_membase64; /* first run: init devices */ for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; snprintf(node_name, sizeof(node_name), "pci.%d.%d.%d", bus, slot, func); nvl = find_config_node(node_name); if (nvl == NULL) continue; fi->fi_config = nvl; emul = get_config_value_node(nvl, "device"); if (emul == NULL) { EPRINTLN("pci slot %d:%d:%d: missing " "\"device\" value", bus, slot, func); return (EINVAL); } pde = pci_emul_finddev(emul); if (pde == NULL) { EPRINTLN("pci slot %d:%d:%d: unknown " "device \"%s\"", bus, slot, func, emul); return (EINVAL); } if (pde->pe_alias != NULL) { EPRINTLN("pci slot %d:%d:%d: legacy " "device \"%s\", use \"%s\" instead", bus, slot, func, emul, pde->pe_alias); return (EINVAL); } fi->fi_pde = pde; error = pci_emul_init(ctx, pde, bus, slot, func, fi); if (error) return (error); } } /* second run: assign BARs and free list */ struct pci_bar_allocation *bar; struct pci_bar_allocation *bar_tmp; TAILQ_FOREACH_SAFE(bar, &pci_bars, chain, bar_tmp) { pci_emul_assign_bar(bar->pdi, bar->idx, bar->type, bar->size); free(bar); } TAILQ_INIT(&pci_bars); /* * Add some slop to the I/O and memory resources decoded by * this bus to give a guest some flexibility if it wants to * reprogram the BARs. */ pci_emul_iobase += BUSIO_ROUNDUP; pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); bi->iolimit = pci_emul_iobase; pci_emul_membase32 += BUSMEM32_ROUNDUP; pci_emul_membase32 = roundup2(pci_emul_membase32, BUSMEM32_ROUNDUP); bi->memlimit32 = pci_emul_membase32; pci_emul_membase64 += BUSMEM64_ROUNDUP; pci_emul_membase64 = roundup2(pci_emul_membase64, BUSMEM64_ROUNDUP); bi->memlimit64 = pci_emul_membase64; } /* * PCI backends are initialized before routing INTx interrupts * so that LPC devices are able to reserve ISA IRQs before * routing PIRQ pins. */ for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; pci_lintr_route(fi->fi_devi); } } } lpc_pirq_routed(); /* * The guest physical memory map looks like the following: * [0, lowmem) guest system memory * [lowmem, 0xC0000000) memory hole (may be absent) * [0xC0000000, 0xE0000000) PCI hole (32-bit BAR allocation) * [0xE0000000, 0xF0000000) PCI extended config window * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware * [4GB, 4GB + highmem) */ /* * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. */ lowmem = vm_get_lowmem_size(ctx); bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI hole"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = lowmem; mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; mr.handler = pci_emul_fallback_handler; error = register_mem_fallback(&mr); assert(error == 0); /* PCI extended config space */ bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI ECFG"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = PCI_EMUL_ECFG_BASE; mr.size = PCI_EMUL_ECFG_SIZE; mr.handler = pci_emul_ecfg_handler; error = register_mem(&mr); assert(error == 0); return (0); } static void pci_apic_prt_entry(int bus __unused, int slot, int pin, int pirq_pin __unused, int ioapic_irq, void *arg __unused) { dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" Zero,"); dsdt_line(" 0x%X", ioapic_irq); dsdt_line(" },"); } static void pci_pirq_prt_entry(int bus __unused, int slot, int pin, int pirq_pin, int ioapic_irq __unused, void *arg __unused) { char *name; name = lpc_pirq_name(pirq_pin); if (name == NULL) return; dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" %s,", name); dsdt_line(" 0x00"); dsdt_line(" },"); free(name); } /* * A bhyve virtual machine has a flat PCI hierarchy with a root port * corresponding to each PCI bus. */ static void pci_bus_write_dsdt(int bus) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; int count, func, slot; /* * If there are no devices on this 'bus' then just return. */ if ((bi = pci_businfo[bus]) == NULL) { /* * Bus 0 is special because it decodes the I/O ports used * for PCI config space access even if there are no devices * on it. */ if (bus != 0) return; } dsdt_line(" Device (PC%02X)", bus); dsdt_line(" {"); dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); dsdt_line(" Method (_BBN, 0, NotSerialized)"); dsdt_line(" {"); dsdt_line(" Return (0x%08X)", bus); dsdt_line(" }"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " "MaxFixed, PosDecode,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bus); dsdt_line(" 0x%04X, // Range Maximum", bus); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0001, // Length"); dsdt_line(" ,, )"); if (bus == 0) { dsdt_indent(3); dsdt_fixed_ioport(0xCF8, 8); dsdt_unindent(3); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0000, // Range Minimum"); dsdt_line(" 0x0CF7, // Range Maximum"); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0CF8, // Length"); dsdt_line(" ,, , TypeStatic)"); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0D00, // Range Minimum"); dsdt_line(" 0x%04X, // Range Maximum", PCI_EMUL_IOBASE - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", PCI_EMUL_IOBASE - 0x0D00); dsdt_line(" ,, , TypeStatic)"); if (bi == NULL) { dsdt_line(" })"); goto done; } } assert(bi != NULL); /* i/o window */ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); dsdt_line(" 0x%04X, // Range Maximum", bi->iolimit - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", bi->iolimit - bi->iobase); dsdt_line(" ,, , TypeStatic)"); /* mmio window (32-bit) */ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x00000000, // Granularity"); dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); dsdt_line(" 0x%08X, // Range Maximum\n", bi->memlimit32 - 1); dsdt_line(" 0x00000000, // Translation Offset"); dsdt_line(" 0x%08X, // Length\n", bi->memlimit32 - bi->membase32); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); /* mmio window (64-bit) */ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x0000000000000000, // Granularity"); dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); dsdt_line(" 0x%016lX, // Range Maximum\n", bi->memlimit64 - 1); dsdt_line(" 0x0000000000000000, // Translation Offset"); dsdt_line(" 0x%016lX, // Length\n", bi->memlimit64 - bi->membase64); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); dsdt_line(" })"); count = pci_count_lintr(bus); if (count != 0) { dsdt_indent(2); dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If (PICM)"); dsdt_line(" {"); dsdt_line(" Return (APRT)"); dsdt_line(" }"); dsdt_line(" Else"); dsdt_line(" {"); dsdt_line(" Return (PPRT)"); dsdt_line(" }"); dsdt_line("}"); dsdt_unindent(2); } dsdt_indent(2); for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { pi = si->si_funcs[func].fi_devi; if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) pi->pi_d->pe_write_dsdt(pi); } } dsdt_unindent(2); done: dsdt_line(" }"); } void pci_write_dsdt(void) { int bus; dsdt_indent(1); dsdt_line("Name (PICM, 0x00)"); dsdt_line("Method (_PIC, 1, NotSerialized)"); dsdt_line("{"); dsdt_line(" Store (Arg0, PICM)"); dsdt_line("}"); dsdt_line(""); dsdt_line("Scope (_SB)"); dsdt_line("{"); for (bus = 0; bus < MAXBUSES; bus++) pci_bus_write_dsdt(bus); dsdt_line("}"); dsdt_unindent(1); } int pci_bus_configured(int bus) { assert(bus >= 0 && bus < MAXBUSES); return (pci_businfo[bus] != NULL); } int pci_msi_enabled(struct pci_devinst *pi) { return (pi->pi_msi.enabled); } int pci_msi_maxmsgnum(struct pci_devinst *pi) { if (pi->pi_msi.enabled) return (pi->pi_msi.maxmsgnum); else return (0); } int pci_msix_enabled(struct pci_devinst *pi) { return (pi->pi_msix.enabled && !pi->pi_msi.enabled); } void pci_generate_msix(struct pci_devinst *pi, int index) { struct msix_table_entry *mte; if (!pci_msix_enabled(pi)) return; if (pi->pi_msix.function_mask) return; if (index >= pi->pi_msix.table_count) return; mte = &pi->pi_msix.table[index]; if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* XXX Set PBA bit if interrupt is disabled */ vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data); } } void pci_generate_msi(struct pci_devinst *pi, int index) { if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr, pi->pi_msi.msg_data + index); } } static bool pci_lintr_permitted(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || (cmd & PCIM_CMD_INTxDIS))); } void pci_lintr_request(struct pci_devinst *pi) { struct businfo *bi; struct slotinfo *si; int bestpin, bestcount, pin; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); /* * Just allocate a pin from our slot. The pin will be * assigned IRQs later when interrupts are routed. */ si = &bi->slotinfo[pi->pi_slot]; bestpin = 0; bestcount = si->si_intpins[0].ii_count; for (pin = 1; pin < 4; pin++) { if (si->si_intpins[pin].ii_count < bestcount) { bestpin = pin; bestcount = si->si_intpins[pin].ii_count; } } si->si_intpins[bestpin].ii_count++; pi->pi_lintr.pin = bestpin + 1; pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); } static void pci_lintr_route(struct pci_devinst *pi) { struct businfo *bi; struct intxinfo *ii; if (pi->pi_lintr.pin == 0) return; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; /* * Attempt to allocate an I/O APIC pin for this intpin if one * is not yet assigned. */ if (ii->ii_ioapic_irq == 0) ii->ii_ioapic_irq = ioapic_pci_alloc_irq(pi); assert(ii->ii_ioapic_irq > 0); /* * Attempt to allocate a PIRQ pin for this intpin if one is * not yet assigned. */ if (ii->ii_pirq_pin == 0) ii->ii_pirq_pin = pirq_alloc_pin(pi); assert(ii->ii_pirq_pin > 0); pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; pi->pi_lintr.pirq_pin = ii->ii_pirq_pin; pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin)); } void pci_lintr_assert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == IDLE) { if (pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } else pi->pi_lintr.state = PENDING; } pthread_mutex_unlock(&pi->pi_lintr.lock); } void pci_lintr_deassert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED) { pi->pi_lintr.state = IDLE; pci_irq_deassert(pi); } else if (pi->pi_lintr.state == PENDING) pi->pi_lintr.state = IDLE; pthread_mutex_unlock(&pi->pi_lintr.lock); } static void pci_lintr_update(struct pci_devinst *pi) { pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) { pci_irq_deassert(pi); pi->pi_lintr.state = PENDING; } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } pthread_mutex_unlock(&pi->pi_lintr.lock); } int pci_count_lintr(int bus) { int count, slot, pin; struct slotinfo *slotinfo; count = 0; if (pci_businfo[bus] != NULL) { for (slot = 0; slot < MAXSLOTS; slot++) { slotinfo = &pci_businfo[bus]->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { if (slotinfo->si_intpins[pin].ii_count != 0) count++; } } } return (count); } void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg) { struct businfo *bi; struct slotinfo *si; struct intxinfo *ii; int slot, pin; if ((bi = pci_businfo[bus]) == NULL) return; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { ii = &si->si_intpins[pin]; if (ii->ii_count != 0) cb(bus, slot, pin + 1, ii->ii_pirq_pin, ii->ii_ioapic_irq, arg); } } } /* * Return 1 if the emulated device in 'slot' is a multi-function device. * Return 0 otherwise. */ static int pci_emul_is_mfdev(int bus, int slot) { struct businfo *bi; struct slotinfo *si; int f, numfuncs; numfuncs = 0; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; for (f = 0; f < MAXFUNCS; f++) { if (si->si_funcs[f].fi_devi != NULL) { numfuncs++; } } } return (numfuncs > 1); } /* * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on * whether or not is a multi-function being emulated in the pci 'slot'. */ static void pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) { int mfdev; if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) { mfdev = pci_emul_is_mfdev(bus, slot); switch (bytes) { case 1: case 2: *rv &= ~PCIM_MFDEV; if (mfdev) { *rv |= PCIM_MFDEV; } break; case 4: *rv &= ~(PCIM_MFDEV << 16); if (mfdev) { *rv |= (PCIM_MFDEV << 16); } break; } } } /* * Update device state in response to changes to the PCI command * register. */ void pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old) { int i; uint16_t changed, new; new = pci_get_cfgdata16(pi, PCIR_COMMAND); changed = old ^ new; /* * If the MMIO or I/O address space decoding has changed then * register/unregister all BARs that decode that address space. */ for (i = 0; i <= PCI_BARMAX_WITH_ROM; i++) { switch (pi->pi_bar[i].type) { case PCIBAR_NONE: case PCIBAR_MEMHI64: break; case PCIBAR_IO: /* I/O address space decoding changed? */ if (changed & PCIM_CMD_PORTEN) { if (new & PCIM_CMD_PORTEN) register_bar(pi, i); else unregister_bar(pi, i); } break; case PCIBAR_ROM: /* skip (un-)register of ROM if it disabled */ if (!romen(pi)) break; /* fallthrough */ case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ if (changed & PCIM_CMD_MEMEN) { if (new & PCIM_CMD_MEMEN) register_bar(pi, i); else unregister_bar(pi, i); } break; default: assert(0); } } /* * If INTx has been unmasked and is pending, assert the * interrupt. */ pci_lintr_update(pi); } static void pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) { int rshift; uint32_t cmd, old, readonly; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ /* * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. * * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are * 'write 1 to clear'. However these bits are not set to '1' by * any device emulation so it is simpler to treat them as readonly. */ rshift = (coff & 0x3) * 8; readonly = 0xFFFFF880 >> rshift; old = CFGREAD(pi, coff, bytes); new &= ~readonly; new |= (old & readonly); CFGWRITE(pi, coff, new, bytes); /* update config */ pci_emul_cmd_changed(pi, cmd); } static void -pci_cfgrw(struct vmctx *ctx, int in, int bus, int slot, int func, - int coff, int bytes, uint32_t *eax) +pci_cfgrw(int in, int bus, int slot, int func, int coff, int bytes, + uint32_t *eax) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; struct pci_devemu *pe; int idx, needcfg; uint64_t addr, bar, mask; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; pi = si->si_funcs[func].fi_devi; } else pi = NULL; /* * Just return if there is no device at this slot:func or if the * the guest is doing an un-aligned access. */ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || (coff & (bytes - 1)) != 0) { if (in) *eax = 0xffffffff; return; } /* * Ignore all writes beyond the standard config space and return all * ones on reads. */ if (coff >= PCI_REGMAX + 1) { if (in) { *eax = 0xffffffff; /* * Extended capabilities begin at offset 256 in config * space. Absence of extended capabilities is signaled * with all 0s in the extended capability header at * offset 256. */ if (coff <= PCI_REGMAX + 4) *eax = 0x00000000; } return; } pe = pi->pi_d; /* * Config read */ if (in) { /* Let the device emulation override the default handler */ if (pe->pe_cfgread != NULL) { - needcfg = pe->pe_cfgread(ctx, pi, coff, bytes, eax); + needcfg = pe->pe_cfgread(pi, coff, bytes, eax); } else { needcfg = 1; } if (needcfg) *eax = CFGREAD(pi, coff, bytes); pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); } else { /* Let the device emulation override the default handler */ if (pe->pe_cfgwrite != NULL && - (*pe->pe_cfgwrite)(ctx, pi, coff, bytes, *eax) == 0) + (*pe->pe_cfgwrite)(pi, coff, bytes, *eax) == 0) return; /* * Special handling for write to BAR and ROM registers */ if (is_pcir_bar(coff) || is_pcir_bios(coff)) { /* * Ignore writes to BAR registers that are not * 4-byte aligned. */ if (bytes != 4 || (coff & 0x3) != 0) return; if (is_pcir_bar(coff)) { idx = (coff - PCIR_BAR(0)) / 4; } else if (is_pcir_bios(coff)) { idx = PCI_ROM_IDX; } else { errx(4, "%s: invalid BAR offset %d", __func__, coff); } mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { case PCIBAR_NONE: pi->pi_bar[idx].addr = bar = 0; break; case PCIBAR_IO: addr = *eax & mask; addr &= 0xffff; bar = addr | pi->pi_bar[idx].lobits; /* * Register the new BAR value for interception */ if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_IO); } break; case PCIBAR_MEM32: addr = bar = *eax & mask; bar |= pi->pi_bar[idx].lobits; if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM32); } break; case PCIBAR_MEM64: addr = bar = *eax & mask; bar |= pi->pi_bar[idx].lobits; if (addr != (uint32_t)pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM64); } break; case PCIBAR_MEMHI64: mask = ~(pi->pi_bar[idx - 1].size - 1); addr = ((uint64_t)*eax << 32) & mask; bar = addr >> 32; if (bar != pi->pi_bar[idx - 1].addr >> 32) { update_bar_address(pi, addr, idx - 1, PCIBAR_MEMHI64); } break; case PCIBAR_ROM: addr = bar = *eax & mask; if (memen(pi) && romen(pi)) { unregister_bar(pi, idx); } pi->pi_bar[idx].addr = addr; pi->pi_bar[idx].lobits = *eax & PCIM_BIOS_ENABLE; /* romen could have changed it value */ if (memen(pi) && romen(pi)) { register_bar(pi, idx); } bar |= pi->pi_bar[idx].lobits; break; default: assert(0); } pci_set_cfgdata32(pi, coff, bar); } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *eax, 0, 0); } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { pci_emul_cmdsts_write(pi, coff, *eax, bytes); } else { CFGWRITE(pi, coff, *eax, bytes); } } } static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; static int pci_emul_cfgaddr(struct vmctx *ctx __unused, int in, int port __unused, int bytes, uint32_t *eax, void *arg __unused) { uint32_t x; if (bytes != 4) { if (in) *eax = (bytes == 2) ? 0xffff : 0xff; return (0); } if (in) { x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; if (cfgenable) x |= CONF1_ENABLE; *eax = x; } else { x = *eax; cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; cfgoff = (x & PCI_REGMAX) & ~0x03; cfgfunc = (x >> 8) & PCI_FUNCMAX; cfgslot = (x >> 11) & PCI_SLOTMAX; cfgbus = (x >> 16) & PCI_BUSMAX; } return (0); } INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); static int -pci_emul_cfgdata(struct vmctx *ctx, int in, int port, +pci_emul_cfgdata(struct vmctx *ctx __unused, int in, int port, int bytes, uint32_t *eax, void *arg __unused) { int coff; assert(bytes == 1 || bytes == 2 || bytes == 4); coff = cfgoff + (port - CONF1_DATA_PORT); if (cfgenable) { - pci_cfgrw(ctx, in, cfgbus, cfgslot, cfgfunc, coff, bytes, - eax); + pci_cfgrw(in, cfgbus, cfgslot, cfgfunc, coff, bytes, eax); } else { /* Ignore accesses to cfgdata if not enabled by cfgaddr */ if (in) *eax = 0xffffffff; } return (0); } INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); #ifdef BHYVE_SNAPSHOT /* * Saves/restores PCI device emulated state. Returns 0 on success. */ static int pci_snapshot_pci_dev(struct vm_snapshot_meta *meta) { struct pci_devinst *pi; int i; int ret; pi = meta->dev_data; SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata), meta, ret, done); for (i = 0; i < (int)nitems(pi->pi_bar); i++) { SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done); } /* Restore MSI-X table. */ for (i = 0; i < pi->pi_msix.table_count; i++) { SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control, meta, ret, done); } done: return (ret); } static int pci_find_slotted_dev(const char *dev_name, struct pci_devemu **pde, struct pci_devinst **pdi) { struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; int bus, slot, func; assert(dev_name != NULL); assert(pde != NULL); assert(pdi != NULL); for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_pde == NULL) continue; if (strcmp(dev_name, fi->fi_pde->pe_emu) != 0) continue; *pde = fi->fi_pde; *pdi = fi->fi_devi; return (0); } } } return (EINVAL); } int pci_snapshot(struct vm_snapshot_meta *meta) { struct pci_devemu *pde; struct pci_devinst *pdi; int ret; assert(meta->dev_name != NULL); ret = pci_find_slotted_dev(meta->dev_name, &pde, &pdi); if (ret != 0) { fprintf(stderr, "%s: no such name: %s\r\n", __func__, meta->dev_name); memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); return (0); } meta->dev_data = pdi; if (pde->pe_snapshot == NULL) { fprintf(stderr, "%s: not implemented yet for: %s\r\n", __func__, meta->dev_name); return (-1); } ret = pci_snapshot_pci_dev(meta); if (ret != 0) { fprintf(stderr, "%s: failed to snapshot pci dev\r\n", __func__); return (-1); } ret = (*pde->pe_snapshot)(meta); return (ret); } int -pci_pause(struct vmctx *ctx, const char *dev_name) +pci_pause(const char *dev_name) { struct pci_devemu *pde; struct pci_devinst *pdi; int ret; assert(dev_name != NULL); ret = pci_find_slotted_dev(dev_name, &pde, &pdi); if (ret != 0) { /* * It is possible to call this function without * checking that the device is inserted first. */ fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); return (0); } if (pde->pe_pause == NULL) { /* The pause/resume functionality is optional. */ fprintf(stderr, "%s: not implemented for: %s\n", __func__, dev_name); return (0); } - return (*pde->pe_pause)(ctx, pdi); + return (*pde->pe_pause)(pdi); } int -pci_resume(struct vmctx *ctx, const char *dev_name) +pci_resume(const char *dev_name) { struct pci_devemu *pde; struct pci_devinst *pdi; int ret; assert(dev_name != NULL); ret = pci_find_slotted_dev(dev_name, &pde, &pdi); if (ret != 0) { /* * It is possible to call this function without * checking that the device is inserted first. */ fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); return (0); } if (pde->pe_resume == NULL) { /* The pause/resume functionality is optional. */ fprintf(stderr, "%s: not implemented for: %s\n", __func__, dev_name); return (0); } - return (*pde->pe_resume)(ctx, pdi); + return (*pde->pe_resume)(pdi); } #endif #define PCI_EMUL_TEST #ifdef PCI_EMUL_TEST /* * Define a dummy test device */ #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { uint8_t ioregs[DIOSZ]; uint8_t memregs[2][DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 #define PCI_EMUL_MSIX_MSGS 16 static int -pci_emul_dinit(struct vmctx *ctx __unused, struct pci_devinst *pi, - nvlist_t *nvl __unused) +pci_emul_dinit(struct pci_devinst *pi, nvlist_t *nvl __unused) { int error; struct pci_emul_dsoftc *sc; sc = calloc(1, sizeof(struct pci_emul_dsoftc)); pi->pi_arg = sc; pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); assert(error == 0); error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); assert(error == 0); return (0); } static void -pci_emul_diow(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size, +pci_emul_diow(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { int i; struct pci_emul_dsoftc *sc = pi->pi_arg; if (baridx == 0) { if (offset + size > DIOSZ) { printf("diow: iow too large, offset %ld size %d\n", offset, size); return; } if (size == 1) { sc->ioregs[offset] = value & 0xff; } else if (size == 2) { *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; } else if (size == 4) { *(uint32_t *)&sc->ioregs[offset] = value; } else { printf("diow: iow unknown size %d\n", size); } /* * Special magic value to generate an interrupt */ if (offset == 4 && size == 4 && pci_msi_enabled(pi)) pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); if (value == 0xabcdef) { for (i = 0; i < pci_msi_maxmsgnum(pi); i++) pci_generate_msi(pi, i); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("diow: memw too large, offset %ld size %d\n", offset, size); return; } i = baridx - 1; /* 'memregs' index */ if (size == 1) { sc->memregs[i][offset] = value; } else if (size == 2) { *(uint16_t *)&sc->memregs[i][offset] = value; } else if (size == 4) { *(uint32_t *)&sc->memregs[i][offset] = value; } else if (size == 8) { *(uint64_t *)&sc->memregs[i][offset] = value; } else { printf("diow: memw unknown size %d\n", size); } /* * magic interrupt ?? */ } if (baridx > 2 || baridx < 0) { printf("diow: unknown bar idx %d\n", baridx); } } static uint64_t -pci_emul_dior(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size) +pci_emul_dior(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; int i; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } value = 0; if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { value = *(uint16_t *) &sc->ioregs[offset]; } else if (size == 4) { value = *(uint32_t *) &sc->ioregs[offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } i = baridx - 1; /* 'memregs' index */ if (size == 1) { value = sc->memregs[i][offset]; } else if (size == 2) { value = *(uint16_t *) &sc->memregs[i][offset]; } else if (size == 4) { value = *(uint32_t *) &sc->memregs[i][offset]; } else if (size == 8) { value = *(uint64_t *) &sc->memregs[i][offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx > 2 || baridx < 0) { printf("dior: unknown bar idx %d\n", baridx); return (0); } return (value); } #ifdef BHYVE_SNAPSHOT static int pci_emul_snapshot(struct vm_snapshot_meta *meta __unused) { return (0); } #endif static const struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, .pe_barwrite = pci_emul_diow, .pe_barread = pci_emul_dior, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_emul_snapshot, #endif }; PCI_EMUL_SET(pci_dummy); #endif /* PCI_EMUL_TEST */ diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h index 8a7bcb7cc83a..117fc78898a2 100644 --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -1,315 +1,310 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _PCI_EMUL_H_ #define _PCI_EMUL_H_ #include #include #include #include #include #include #include #define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */ #define PCI_BARMAX_WITH_ROM (PCI_BARMAX + 1) #define PCI_ROM_IDX (PCI_BARMAX + 1) struct vmctx; struct pci_devinst; struct memory_region; struct vm_snapshot_meta; struct pci_devemu { const char *pe_emu; /* Name of device emulation */ /* instance creation */ - int (*pe_init)(struct vmctx *, struct pci_devinst *, - nvlist_t *); + int (*pe_init)(struct pci_devinst *, nvlist_t *); int (*pe_legacy_config)(nvlist_t *, const char *); const char *pe_alias; /* ACPI DSDT enumeration */ void (*pe_write_dsdt)(struct pci_devinst *); /* config space read/write callbacks */ - int (*pe_cfgwrite)(struct vmctx *ctx, - struct pci_devinst *pi, int offset, + int (*pe_cfgwrite)(struct pci_devinst *pi, int offset, int bytes, uint32_t val); - int (*pe_cfgread)(struct vmctx *ctx, - struct pci_devinst *pi, int offset, + int (*pe_cfgread)(struct pci_devinst *pi, int offset, int bytes, uint32_t *retval); /* BAR read/write callbacks */ - void (*pe_barwrite)(struct vmctx *ctx, - struct pci_devinst *pi, int baridx, + void (*pe_barwrite)(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); - uint64_t (*pe_barread)(struct vmctx *ctx, - struct pci_devinst *pi, int baridx, + uint64_t (*pe_barread)(struct pci_devinst *pi, int baridx, uint64_t offset, int size); - void (*pe_baraddr)(struct vmctx *ctx, struct pci_devinst *pi, + void (*pe_baraddr)(struct pci_devinst *pi, int baridx, int enabled, uint64_t address); /* Save/restore device state */ int (*pe_snapshot)(struct vm_snapshot_meta *meta); - int (*pe_pause)(struct vmctx *ctx, struct pci_devinst *pi); - int (*pe_resume)(struct vmctx *ctx, struct pci_devinst *pi); + int (*pe_pause)(struct pci_devinst *pi); + int (*pe_resume)(struct pci_devinst *pi); }; #define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); enum pcibar_type { PCIBAR_NONE, PCIBAR_IO, PCIBAR_MEM32, PCIBAR_MEM64, PCIBAR_MEMHI64, PCIBAR_ROM, }; struct pcibar { enum pcibar_type type; /* io or memory */ uint64_t size; uint64_t addr; uint8_t lobits; }; #define PI_NAMESZ 40 struct msix_table_entry { uint64_t addr; uint32_t msg_data; uint32_t vector_control; } __packed; /* * In case the structure is modified to hold extra information, use a define * for the size that should be emulated. */ #define MSIX_TABLE_ENTRY_SIZE 16 #define MAX_MSIX_TABLE_ENTRIES 2048 #define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8) enum lintr_stat { IDLE, ASSERTED, PENDING }; struct pci_devinst { struct pci_devemu *pi_d; struct vmctx *pi_vmctx; uint8_t pi_bus, pi_slot, pi_func; char pi_name[PI_NAMESZ]; int pi_bar_getsize; int pi_prevcap; int pi_capend; struct { int8_t pin; enum lintr_stat state; int pirq_pin; int ioapic_irq; pthread_mutex_t lock; } pi_lintr; struct { int enabled; uint64_t addr; uint64_t msg_data; int maxmsgnum; } pi_msi; struct { int enabled; int table_bar; int pba_bar; uint32_t table_offset; int table_count; uint32_t pba_offset; int pba_size; int function_mask; struct msix_table_entry *table; /* allocated at runtime */ uint8_t *mapped_addr; size_t mapped_size; } pi_msix; void *pi_arg; /* devemu-private data */ u_char pi_cfgdata[PCI_REGMAX + 1]; /* ROM is handled like a BAR */ struct pcibar pi_bar[PCI_BARMAX_WITH_ROM + 1]; uint64_t pi_romoffset; }; struct msicap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t addrlo; uint32_t addrhi; uint16_t msgdata; } __packed; static_assert(sizeof(struct msicap) == 14, "compile-time assertion failed"); struct msixcap { uint8_t capid; uint8_t nextptr; uint16_t msgctrl; uint32_t table_info; /* bar index and offset within it */ uint32_t pba_info; /* bar index and offset within it */ } __packed; static_assert(sizeof(struct msixcap) == 12, "compile-time assertion failed"); struct pciecap { uint8_t capid; uint8_t nextptr; uint16_t pcie_capabilities; uint32_t dev_capabilities; /* all devices */ uint16_t dev_control; uint16_t dev_status; uint32_t link_capabilities; /* devices with links */ uint16_t link_control; uint16_t link_status; uint32_t slot_capabilities; /* ports with slots */ uint16_t slot_control; uint16_t slot_status; uint16_t root_control; /* root ports */ uint16_t root_capabilities; uint32_t root_status; uint32_t dev_capabilities2; /* all devices */ uint16_t dev_control2; uint16_t dev_status2; uint32_t link_capabilities2; /* devices with links */ uint16_t link_control2; uint16_t link_status2; uint32_t slot_capabilities2; /* ports with slots */ uint16_t slot_control2; uint16_t slot_status2; } __packed; static_assert(sizeof(struct pciecap) == 60, "compile-time assertion failed"); typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, int ioapic_irq, void *arg); int init_pci(struct vmctx *ctx); void pci_callback(void); int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size); int pci_emul_alloc_rom(struct pci_devinst *const pdi, const uint64_t size, void **const addr); int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum); int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type); void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val, uint8_t capoff, int capid); void pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old); void pci_generate_msi(struct pci_devinst *pi, int msgnum); void pci_generate_msix(struct pci_devinst *pi, int msgnum); void pci_lintr_assert(struct pci_devinst *pi); void pci_lintr_deassert(struct pci_devinst *pi); void pci_lintr_request(struct pci_devinst *pi); int pci_msi_enabled(struct pci_devinst *pi); int pci_msix_enabled(struct pci_devinst *pi); int pci_msix_table_bar(struct pci_devinst *pi); int pci_msix_pba_bar(struct pci_devinst *pi); int pci_msi_maxmsgnum(struct pci_devinst *pi); int pci_parse_legacy_config(nvlist_t *nvl, const char *opt); int pci_parse_slot(char *opt); void pci_print_supported_devices(void); void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value); uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size); int pci_count_lintr(int bus); void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg); void pci_write_dsdt(void); uint64_t pci_ecfg_base(void); int pci_bus_configured(int bus); #ifdef BHYVE_SNAPSHOT int pci_snapshot(struct vm_snapshot_meta *meta); -int pci_pause(struct vmctx *ctx, const char *dev_name); -int pci_resume(struct vmctx *ctx, const char *dev_name); +int pci_pause(const char *dev_name); +int pci_resume(const char *dev_name); #endif static __inline void pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) { assert(offset <= PCI_REGMAX); *(uint8_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); *(uint16_t *)(pi->pi_cfgdata + offset) = val; } static __inline void pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); *(uint32_t *)(pi->pi_cfgdata + offset) = val; } static __inline uint8_t pci_get_cfgdata8(struct pci_devinst *pi, int offset) { assert(offset <= PCI_REGMAX); return (*(uint8_t *)(pi->pi_cfgdata + offset)); } static __inline uint16_t pci_get_cfgdata16(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0); return (*(uint16_t *)(pi->pi_cfgdata + offset)); } static __inline uint32_t pci_get_cfgdata32(struct pci_devinst *pi, int offset) { assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0); return (*(uint32_t *)(pi->pi_cfgdata + offset)); } #endif /* _PCI_EMUL_H_ */ diff --git a/usr.sbin/bhyve/pci_fbuf.c b/usr.sbin/bhyve/pci_fbuf.c index 4cfe872bd934..91215a124c59 100644 --- a/usr.sbin/bhyve/pci_fbuf.c +++ b/usr.sbin/bhyve/pci_fbuf.c @@ -1,475 +1,474 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Nahanni Systems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "bhyvegc.h" #include "bhyverun.h" #include "config.h" #include "debug.h" #include "console.h" #include "inout.h" #include "pci_emul.h" #include "rfb.h" #include "vga.h" /* * bhyve Framebuffer device emulation. * BAR0 points to the current mode information. * BAR1 is the 32-bit framebuffer address. * * -s ,fbuf,wait,vga=on|io|off,rfb=:port,w=width,h=height */ static int fbuf_debug = 1; #define DEBUG_INFO 1 #define DEBUG_VERBOSE 4 #define DPRINTF(level, params) if (level <= fbuf_debug) PRINTLN params #define KB (1024UL) #define MB (1024 * 1024UL) #define DMEMSZ 128 #define FB_SIZE (16*MB) #define COLS_MAX 1920 #define ROWS_MAX 1200 #define COLS_DEFAULT 1024 #define ROWS_DEFAULT 768 #define COLS_MIN 640 #define ROWS_MIN 480 struct pci_fbuf_softc { struct pci_devinst *fsc_pi; struct { uint32_t fbsize; uint16_t width; uint16_t height; uint16_t depth; uint16_t refreshrate; uint8_t reserved[116]; } __packed memregs; /* rfb server */ char *rfb_host; char *rfb_password; int rfb_port; int rfb_wait; int vga_enabled; int vga_full; uint32_t fbaddr; char *fb_base; uint16_t gc_width; uint16_t gc_height; void *vgasc; struct bhyvegc_image *gc_image; }; static struct pci_fbuf_softc *fbuf_sc; #define PCI_FBUF_MSI_MSGS 4 static void -pci_fbuf_write(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size, +pci_fbuf_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct pci_fbuf_softc *sc; uint8_t *p; assert(baridx == 0); sc = pi->pi_arg; DPRINTF(DEBUG_VERBOSE, ("fbuf wr: offset 0x%lx, size: %d, value: 0x%lx", offset, size, value)); if (offset + size > DMEMSZ) { printf("fbuf: write too large, offset %ld size %d\n", offset, size); return; } p = (uint8_t *)&sc->memregs + offset; switch (size) { case 1: *p = value; break; case 2: *(uint16_t *)p = value; break; case 4: *(uint32_t *)p = value; break; case 8: *(uint64_t *)p = value; break; default: printf("fbuf: write unknown size %d\n", size); break; } if (!sc->gc_image->vgamode && sc->memregs.width == 0 && sc->memregs.height == 0) { DPRINTF(DEBUG_INFO, ("switching to VGA mode")); sc->gc_image->vgamode = 1; sc->gc_width = 0; sc->gc_height = 0; } else if (sc->gc_image->vgamode && sc->memregs.width != 0 && sc->memregs.height != 0) { DPRINTF(DEBUG_INFO, ("switching to VESA mode")); sc->gc_image->vgamode = 0; } } static uint64_t -pci_fbuf_read(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size) +pci_fbuf_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_fbuf_softc *sc; uint8_t *p; uint64_t value; assert(baridx == 0); sc = pi->pi_arg; if (offset + size > DMEMSZ) { printf("fbuf: read too large, offset %ld size %d\n", offset, size); return (0); } p = (uint8_t *)&sc->memregs + offset; value = 0; switch (size) { case 1: value = *p; break; case 2: value = *(uint16_t *)p; break; case 4: value = *(uint32_t *)p; break; case 8: value = *(uint64_t *)p; break; default: printf("fbuf: read unknown size %d\n", size); break; } DPRINTF(DEBUG_VERBOSE, ("fbuf rd: offset 0x%lx, size: %d, value: 0x%lx", offset, size, value)); return (value); } static void -pci_fbuf_baraddr(struct vmctx *ctx, struct pci_devinst *pi, int baridx, - int enabled, uint64_t address) +pci_fbuf_baraddr(struct pci_devinst *pi, int baridx, int enabled, + uint64_t address) { struct pci_fbuf_softc *sc; int prot; if (baridx != 1) return; sc = pi->pi_arg; if (!enabled) { - if (vm_munmap_memseg(ctx, sc->fbaddr, FB_SIZE) != 0) + if (vm_munmap_memseg(pi->pi_vmctx, sc->fbaddr, FB_SIZE) != 0) EPRINTLN("pci_fbuf: munmap_memseg failed"); sc->fbaddr = 0; } else { prot = PROT_READ | PROT_WRITE; - if (vm_mmap_memseg(ctx, address, VM_FRAMEBUFFER, 0, FB_SIZE, prot) != 0) + if (vm_mmap_memseg(pi->pi_vmctx, address, VM_FRAMEBUFFER, 0, + FB_SIZE, prot) != 0) EPRINTLN("pci_fbuf: mmap_memseg failed"); sc->fbaddr = address; } } static int pci_fbuf_parse_config(struct pci_fbuf_softc *sc, nvlist_t *nvl) { const char *value; char *cp; sc->rfb_wait = get_config_bool_node_default(nvl, "wait", false); /* Prefer "rfb" to "tcp". */ value = get_config_value_node(nvl, "rfb"); if (value == NULL) value = get_config_value_node(nvl, "tcp"); if (value != NULL) { /* * IPv4 -- host-ip:port * IPv6 -- [host-ip%zone]:port * XXX for now port is mandatory for IPv4. */ if (value[0] == '[') { cp = strchr(value + 1, ']'); if (cp == NULL || cp == value + 1) { EPRINTLN("fbuf: Invalid IPv6 address: \"%s\"", value); return (-1); } sc->rfb_host = strndup(value + 1, cp - (value + 1)); cp++; if (*cp == ':') { cp++; if (*cp == '\0') { EPRINTLN( "fbuf: Missing port number: \"%s\"", value); return (-1); } sc->rfb_port = atoi(cp); } else if (*cp != '\0') { EPRINTLN("fbuf: Invalid IPv6 address: \"%s\"", value); return (-1); } } else { cp = strchr(value, ':'); if (cp == NULL) { sc->rfb_port = atoi(value); } else { sc->rfb_host = strndup(value, cp - value); cp++; if (*cp == '\0') { EPRINTLN( "fbuf: Missing port number: \"%s\"", value); return (-1); } sc->rfb_port = atoi(cp); } } } value = get_config_value_node(nvl, "vga"); if (value != NULL) { if (strcmp(value, "off") == 0) { sc->vga_enabled = 0; } else if (strcmp(value, "io") == 0) { sc->vga_enabled = 1; sc->vga_full = 0; } else if (strcmp(value, "on") == 0) { sc->vga_enabled = 1; sc->vga_full = 1; } else { EPRINTLN("fbuf: Invalid vga setting: \"%s\"", value); return (-1); } } value = get_config_value_node(nvl, "w"); if (value != NULL) { sc->memregs.width = atoi(value); if (sc->memregs.width > COLS_MAX) { EPRINTLN("fbuf: width %d too large", sc->memregs.width); return (-1); } if (sc->memregs.width == 0) sc->memregs.width = 1920; } value = get_config_value_node(nvl, "h"); if (value != NULL) { sc->memregs.height = atoi(value); if (sc->memregs.height > ROWS_MAX) { EPRINTLN("fbuf: height %d too large", sc->memregs.height); return (-1); } if (sc->memregs.height == 0) sc->memregs.height = 1080; } value = get_config_value_node(nvl, "password"); if (value != NULL) sc->rfb_password = strdup(value); return (0); } static void pci_fbuf_render(struct bhyvegc *gc, void *arg) { struct pci_fbuf_softc *sc; sc = arg; if (sc->vga_full && sc->gc_image->vgamode) { /* TODO: mode switching to vga and vesa should use the special * EFI-bhyve protocol port. */ vga_render(gc, sc->vgasc); return; } if (sc->gc_width != sc->memregs.width || sc->gc_height != sc->memregs.height) { bhyvegc_resize(gc, sc->memregs.width, sc->memregs.height); sc->gc_width = sc->memregs.width; sc->gc_height = sc->memregs.height; } return; } static int -pci_fbuf_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) +pci_fbuf_init(struct pci_devinst *pi, nvlist_t *nvl) { int error; struct pci_fbuf_softc *sc; if (fbuf_sc != NULL) { EPRINTLN("Only one frame buffer device is allowed."); return (-1); } sc = calloc(1, sizeof(struct pci_fbuf_softc)); pi->pi_arg = sc; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x40FB); pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_DISPLAY); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_DISPLAY_VGA); - sc->fb_base = vm_create_devmem( - ctx, VM_FRAMEBUFFER, "framebuffer", FB_SIZE); + sc->fb_base = vm_create_devmem(pi->pi_vmctx, VM_FRAMEBUFFER, + "framebuffer", FB_SIZE); if (sc->fb_base == MAP_FAILED) { error = -1; goto done; } error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, FB_SIZE); assert(error == 0); error = pci_emul_add_msicap(pi, PCI_FBUF_MSI_MSGS); assert(error == 0); sc->memregs.fbsize = FB_SIZE; sc->memregs.width = COLS_DEFAULT; sc->memregs.height = ROWS_DEFAULT; sc->memregs.depth = 32; sc->vga_enabled = 1; sc->vga_full = 0; sc->fsc_pi = pi; error = pci_fbuf_parse_config(sc, nvl); if (error != 0) goto done; /* XXX until VGA rendering is enabled */ if (sc->vga_full != 0) { EPRINTLN("pci_fbuf: VGA rendering not enabled"); goto done; } DPRINTF(DEBUG_INFO, ("fbuf frame buffer base: %p [sz %lu]", sc->fb_base, FB_SIZE)); console_init(sc->memregs.width, sc->memregs.height, sc->fb_base); console_fb_register(pci_fbuf_render, sc); if (sc->vga_enabled) sc->vgasc = vga_init(!sc->vga_full); sc->gc_image = console_get_image(); fbuf_sc = sc; memset((void *)sc->fb_base, 0, FB_SIZE); error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, sc->rfb_password); done: if (error) free(sc); return (error); } #ifdef BHYVE_SNAPSHOT static int pci_fbuf_snapshot(struct vm_snapshot_meta *meta) { int ret; SNAPSHOT_BUF_OR_LEAVE(fbuf_sc->fb_base, FB_SIZE, meta, ret, err); err: return (ret); } #endif static const struct pci_devemu pci_fbuf = { .pe_emu = "fbuf", .pe_init = pci_fbuf_init, .pe_barwrite = pci_fbuf_write, .pe_barread = pci_fbuf_read, .pe_baraddr = pci_fbuf_baraddr, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_fbuf_snapshot, #endif }; PCI_EMUL_SET(pci_fbuf); diff --git a/usr.sbin/bhyve/pci_hda.c b/usr.sbin/bhyve/pci_hda.c index a85935a5cae8..9b0d4bd02ba0 100644 --- a/usr.sbin/bhyve/pci_hda.c +++ b/usr.sbin/bhyve/pci_hda.c @@ -1,1292 +1,1289 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Alex Teaca * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include "pci_hda.h" #include "bhyverun.h" #include "config.h" #include "pci_emul.h" #include "hdac_reg.h" /* * HDA defines */ #define PCIR_HDCTL 0x40 #define INTEL_VENDORID 0x8086 #define HDA_INTEL_82801G 0x27d8 #define HDA_IOSS_NO 0x08 #define HDA_OSS_NO 0x04 #define HDA_ISS_NO 0x04 #define HDA_CODEC_MAX 0x0f #define HDA_LAST_OFFSET \ (0x2084 + ((HDA_ISS_NO) * 0x20) + ((HDA_OSS_NO) * 0x20)) #define HDA_SET_REG_TABLE_SZ \ (0x80 + ((HDA_ISS_NO) * 0x20) + ((HDA_OSS_NO) * 0x20)) #define HDA_CORB_ENTRY_LEN 0x04 #define HDA_RIRB_ENTRY_LEN 0x08 #define HDA_BDL_ENTRY_LEN 0x10 #define HDA_DMA_PIB_ENTRY_LEN 0x08 #define HDA_STREAM_TAGS_CNT 0x10 #define HDA_STREAM_REGS_BASE 0x80 #define HDA_STREAM_REGS_LEN 0x20 #define HDA_DMA_ACCESS_LEN (sizeof(uint32_t)) #define HDA_BDL_MAX_LEN 0x0100 #define HDAC_SDSTS_FIFORDY (1 << 5) #define HDA_RIRBSTS_IRQ_MASK (HDAC_RIRBSTS_RINTFL | HDAC_RIRBSTS_RIRBOIS) #define HDA_STATESTS_IRQ_MASK ((1 << HDA_CODEC_MAX) - 1) #define HDA_SDSTS_IRQ_MASK \ (HDAC_SDSTS_DESE | HDAC_SDSTS_FIFOE | HDAC_SDSTS_BCIS) /* * HDA data structures */ struct hda_softc; typedef void (*hda_set_reg_handler)(struct hda_softc *sc, uint32_t offset, uint32_t old); struct hda_bdle { uint32_t addrl; uint32_t addrh; uint32_t len; uint32_t ioc; } __packed; struct hda_bdle_desc { void *addr; uint8_t ioc; uint32_t len; }; struct hda_codec_cmd_ctl { const char *name; void *dma_vaddr; uint8_t run; uint16_t rp; uint16_t size; uint16_t wp; }; struct hda_stream_desc { uint8_t dir; uint8_t run; uint8_t stream; /* bp is the no. of bytes transferred in the current bdle */ uint32_t bp; /* be is the no. of bdles transferred in the bdl */ uint32_t be; uint32_t bdl_cnt; struct hda_bdle_desc bdl[HDA_BDL_MAX_LEN]; }; struct hda_softc { struct pci_devinst *pci_dev; uint32_t regs[HDA_LAST_OFFSET]; uint8_t lintr; uint8_t rirb_cnt; uint64_t wall_clock_start; struct hda_codec_cmd_ctl corb; struct hda_codec_cmd_ctl rirb; uint8_t codecs_no; struct hda_codec_inst *codecs[HDA_CODEC_MAX]; /* Base Address of the DMA Position Buffer */ void *dma_pib_vaddr; struct hda_stream_desc streams[HDA_IOSS_NO]; /* 2 tables for output and input */ uint8_t stream_map[2][HDA_STREAM_TAGS_CNT]; }; /* * HDA module function declarations */ static inline void hda_set_reg_by_offset(struct hda_softc *sc, uint32_t offset, uint32_t value); static inline uint32_t hda_get_reg_by_offset(struct hda_softc *sc, uint32_t offset); static inline void hda_set_field_by_offset(struct hda_softc *sc, uint32_t offset, uint32_t mask, uint32_t value); static struct hda_softc *hda_init(nvlist_t *nvl); static void hda_update_intr(struct hda_softc *sc); static void hda_response_interrupt(struct hda_softc *sc); static int hda_codec_constructor(struct hda_softc *sc, struct hda_codec_class *codec, const char *play, const char *rec); static struct hda_codec_class *hda_find_codec_class(const char *name); static int hda_send_command(struct hda_softc *sc, uint32_t verb); static int hda_notify_codecs(struct hda_softc *sc, uint8_t run, uint8_t stream, uint8_t dir); static void hda_reset(struct hda_softc *sc); static void hda_reset_regs(struct hda_softc *sc); static void hda_stream_reset(struct hda_softc *sc, uint8_t stream_ind); static int hda_stream_start(struct hda_softc *sc, uint8_t stream_ind); static int hda_stream_stop(struct hda_softc *sc, uint8_t stream_ind); static uint32_t hda_read(struct hda_softc *sc, uint32_t offset); static int hda_write(struct hda_softc *sc, uint32_t offset, uint8_t size, uint32_t value); static inline void hda_print_cmd_ctl_data(struct hda_codec_cmd_ctl *p); static int hda_corb_start(struct hda_softc *sc); static int hda_corb_run(struct hda_softc *sc); static int hda_rirb_start(struct hda_softc *sc); static void *hda_dma_get_vaddr(struct hda_softc *sc, uint64_t dma_paddr, size_t len); static void hda_dma_st_dword(void *dma_vaddr, uint32_t data); static uint32_t hda_dma_ld_dword(void *dma_vaddr); static inline uint8_t hda_get_stream_by_offsets(uint32_t offset, uint8_t reg_offset); static inline uint32_t hda_get_offset_stream(uint8_t stream_ind); static void hda_set_gctl(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_statests(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_corbwp(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_corbctl(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_rirbctl(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_rirbsts(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_dpiblbase(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_sdctl(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_sdctl2(struct hda_softc *sc, uint32_t offset, uint32_t old); static void hda_set_sdsts(struct hda_softc *sc, uint32_t offset, uint32_t old); static int hda_signal_state_change(struct hda_codec_inst *hci); static int hda_response(struct hda_codec_inst *hci, uint32_t response, uint8_t unsol); static int hda_transfer(struct hda_codec_inst *hci, uint8_t stream, uint8_t dir, uint8_t *buf, size_t count); static void hda_set_pib(struct hda_softc *sc, uint8_t stream_ind, uint32_t pib); static uint64_t hda_get_clock_ns(void); /* * PCI HDA function declarations */ -static int pci_hda_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl); -static void pci_hda_write(struct vmctx *ctx, struct pci_devinst *pi, - int baridx, uint64_t offset, int size, uint64_t value); -static uint64_t pci_hda_read(struct vmctx *ctx, struct pci_devinst *pi, - int baridx, uint64_t offset, int size); +static int pci_hda_init(struct pci_devinst *pi, nvlist_t *nvl); +static void pci_hda_write(struct pci_devinst *pi, int baridx, uint64_t offset, + int size, uint64_t value); +static uint64_t pci_hda_read(struct pci_devinst *pi, int baridx, + uint64_t offset, int size); /* * HDA global data */ static const hda_set_reg_handler hda_set_reg_table[] = { [HDAC_GCTL] = hda_set_gctl, [HDAC_STATESTS] = hda_set_statests, [HDAC_CORBWP] = hda_set_corbwp, [HDAC_CORBCTL] = hda_set_corbctl, [HDAC_RIRBCTL] = hda_set_rirbctl, [HDAC_RIRBSTS] = hda_set_rirbsts, [HDAC_DPIBLBASE] = hda_set_dpiblbase, #define HDAC_ISTREAM(n, iss, oss) \ [_HDAC_ISDCTL(n, iss, oss)] = hda_set_sdctl, \ [_HDAC_ISDCTL(n, iss, oss) + 2] = hda_set_sdctl2, \ [_HDAC_ISDSTS(n, iss, oss)] = hda_set_sdsts, \ #define HDAC_OSTREAM(n, iss, oss) \ [_HDAC_OSDCTL(n, iss, oss)] = hda_set_sdctl, \ [_HDAC_OSDCTL(n, iss, oss) + 2] = hda_set_sdctl2, \ [_HDAC_OSDSTS(n, iss, oss)] = hda_set_sdsts, \ HDAC_ISTREAM(0, HDA_ISS_NO, HDA_OSS_NO) HDAC_ISTREAM(1, HDA_ISS_NO, HDA_OSS_NO) HDAC_ISTREAM(2, HDA_ISS_NO, HDA_OSS_NO) HDAC_ISTREAM(3, HDA_ISS_NO, HDA_OSS_NO) HDAC_OSTREAM(0, HDA_ISS_NO, HDA_OSS_NO) HDAC_OSTREAM(1, HDA_ISS_NO, HDA_OSS_NO) HDAC_OSTREAM(2, HDA_ISS_NO, HDA_OSS_NO) HDAC_OSTREAM(3, HDA_ISS_NO, HDA_OSS_NO) [HDA_SET_REG_TABLE_SZ] = NULL, }; static const uint16_t hda_corb_sizes[] = { [HDAC_CORBSIZE_CORBSIZE_2] = 2, [HDAC_CORBSIZE_CORBSIZE_16] = 16, [HDAC_CORBSIZE_CORBSIZE_256] = 256, [HDAC_CORBSIZE_CORBSIZE_MASK] = 0, }; static const uint16_t hda_rirb_sizes[] = { [HDAC_RIRBSIZE_RIRBSIZE_2] = 2, [HDAC_RIRBSIZE_RIRBSIZE_16] = 16, [HDAC_RIRBSIZE_RIRBSIZE_256] = 256, [HDAC_RIRBSIZE_RIRBSIZE_MASK] = 0, }; static const struct hda_ops hops = { .signal = hda_signal_state_change, .response = hda_response, .transfer = hda_transfer, }; static const struct pci_devemu pci_de_hda = { .pe_emu = "hda", .pe_init = pci_hda_init, .pe_barwrite = pci_hda_write, .pe_barread = pci_hda_read }; PCI_EMUL_SET(pci_de_hda); SET_DECLARE(hda_codec_class_set, struct hda_codec_class); #if DEBUG_HDA == 1 FILE *dbg; #endif /* * HDA module function definitions */ static inline void hda_set_reg_by_offset(struct hda_softc *sc, uint32_t offset, uint32_t value) { assert(offset < HDA_LAST_OFFSET); sc->regs[offset] = value; } static inline uint32_t hda_get_reg_by_offset(struct hda_softc *sc, uint32_t offset) { assert(offset < HDA_LAST_OFFSET); return sc->regs[offset]; } static inline void hda_set_field_by_offset(struct hda_softc *sc, uint32_t offset, uint32_t mask, uint32_t value) { uint32_t reg_value = 0; reg_value = hda_get_reg_by_offset(sc, offset); reg_value &= ~mask; reg_value |= (value & mask); hda_set_reg_by_offset(sc, offset, reg_value); } static struct hda_softc * hda_init(nvlist_t *nvl) { struct hda_softc *sc = NULL; struct hda_codec_class *codec = NULL; const char *value; char *play; char *rec; int err; #if DEBUG_HDA == 1 dbg = fopen("/tmp/bhyve_hda.log", "w+"); #endif sc = calloc(1, sizeof(*sc)); if (!sc) return (NULL); hda_reset_regs(sc); /* * TODO search all configured codecs * For now we play with one single codec */ codec = hda_find_codec_class("hda_codec"); if (codec) { value = get_config_value_node(nvl, "play"); if (value == NULL) play = NULL; else play = strdup(value); value = get_config_value_node(nvl, "rec"); if (value == NULL) rec = NULL; else rec = strdup(value); DPRINTF("play: %s rec: %s", play, rec); if (play != NULL || rec != NULL) { err = hda_codec_constructor(sc, codec, play, rec); assert(!err); } free(play); free(rec); } return (sc); } static void hda_update_intr(struct hda_softc *sc) { struct pci_devinst *pi = sc->pci_dev; uint32_t intctl = hda_get_reg_by_offset(sc, HDAC_INTCTL); uint32_t intsts = 0; uint32_t sdsts = 0; uint32_t rirbsts = 0; uint32_t wakeen = 0; uint32_t statests = 0; uint32_t off = 0; int i; /* update the CIS bits */ rirbsts = hda_get_reg_by_offset(sc, HDAC_RIRBSTS); if (rirbsts & (HDAC_RIRBSTS_RINTFL | HDAC_RIRBSTS_RIRBOIS)) intsts |= HDAC_INTSTS_CIS; wakeen = hda_get_reg_by_offset(sc, HDAC_WAKEEN); statests = hda_get_reg_by_offset(sc, HDAC_STATESTS); if (statests & wakeen) intsts |= HDAC_INTSTS_CIS; /* update the SIS bits */ for (i = 0; i < HDA_IOSS_NO; i++) { off = hda_get_offset_stream(i); sdsts = hda_get_reg_by_offset(sc, off + HDAC_SDSTS); if (sdsts & HDAC_SDSTS_BCIS) intsts |= (1 << i); } /* update the GIS bit */ if (intsts) intsts |= HDAC_INTSTS_GIS; hda_set_reg_by_offset(sc, HDAC_INTSTS, intsts); if ((intctl & HDAC_INTCTL_GIE) && ((intsts & \ ~HDAC_INTSTS_GIS) & intctl)) { if (!sc->lintr) { pci_lintr_assert(pi); sc->lintr = 1; } } else { if (sc->lintr) { pci_lintr_deassert(pi); sc->lintr = 0; } } } static void hda_response_interrupt(struct hda_softc *sc) { uint8_t rirbctl = hda_get_reg_by_offset(sc, HDAC_RIRBCTL); if ((rirbctl & HDAC_RIRBCTL_RINTCTL) && sc->rirb_cnt) { sc->rirb_cnt = 0; hda_set_field_by_offset(sc, HDAC_RIRBSTS, HDAC_RIRBSTS_RINTFL, HDAC_RIRBSTS_RINTFL); hda_update_intr(sc); } } static int hda_codec_constructor(struct hda_softc *sc, struct hda_codec_class *codec, const char *play, const char *rec) { struct hda_codec_inst *hci = NULL; if (sc->codecs_no >= HDA_CODEC_MAX) return (-1); hci = calloc(1, sizeof(struct hda_codec_inst)); if (!hci) return (-1); hci->hda = sc; hci->hops = &hops; hci->cad = sc->codecs_no; hci->codec = codec; sc->codecs[sc->codecs_no++] = hci; if (!codec->init) { DPRINTF("This codec does not implement the init function"); return (-1); } return (codec->init(hci, play, rec)); } static struct hda_codec_class * hda_find_codec_class(const char *name) { struct hda_codec_class **pdpp = NULL, *pdp = NULL; SET_FOREACH(pdpp, hda_codec_class_set) { pdp = *pdpp; if (!strcmp(pdp->name, name)) { return (pdp); } } return (NULL); } static int hda_send_command(struct hda_softc *sc, uint32_t verb) { struct hda_codec_inst *hci = NULL; struct hda_codec_class *codec = NULL; uint8_t cad = (verb >> HDA_CMD_CAD_SHIFT) & 0x0f; hci = sc->codecs[cad]; if (!hci) return (-1); DPRINTF("cad: 0x%x verb: 0x%x", cad, verb); codec = hci->codec; assert(codec); if (!codec->command) { DPRINTF("This codec does not implement the command function"); return (-1); } return (codec->command(hci, verb)); } static int hda_notify_codecs(struct hda_softc *sc, uint8_t run, uint8_t stream, uint8_t dir) { struct hda_codec_inst *hci = NULL; struct hda_codec_class *codec = NULL; int err; int i; /* Notify each codec */ for (i = 0; i < sc->codecs_no; i++) { hci = sc->codecs[i]; assert(hci); codec = hci->codec; assert(codec); if (codec->notify) { err = codec->notify(hci, run, stream, dir); if (!err) break; } } return (i == sc->codecs_no ? (-1) : 0); } static void hda_reset(struct hda_softc *sc) { int i; struct hda_codec_inst *hci = NULL; struct hda_codec_class *codec = NULL; hda_reset_regs(sc); /* Reset each codec */ for (i = 0; i < sc->codecs_no; i++) { hci = sc->codecs[i]; assert(hci); codec = hci->codec; assert(codec); if (codec->reset) codec->reset(hci); } sc->wall_clock_start = hda_get_clock_ns(); } static void hda_reset_regs(struct hda_softc *sc) { uint32_t off = 0; uint8_t i; DPRINTF("Reset the HDA controller registers ..."); memset(sc->regs, 0, sizeof(sc->regs)); hda_set_reg_by_offset(sc, HDAC_GCAP, HDAC_GCAP_64OK | (HDA_ISS_NO << HDAC_GCAP_ISS_SHIFT) | (HDA_OSS_NO << HDAC_GCAP_OSS_SHIFT)); hda_set_reg_by_offset(sc, HDAC_VMAJ, 0x01); hda_set_reg_by_offset(sc, HDAC_OUTPAY, 0x3c); hda_set_reg_by_offset(sc, HDAC_INPAY, 0x1d); hda_set_reg_by_offset(sc, HDAC_CORBSIZE, HDAC_CORBSIZE_CORBSZCAP_256 | HDAC_CORBSIZE_CORBSIZE_256); hda_set_reg_by_offset(sc, HDAC_RIRBSIZE, HDAC_RIRBSIZE_RIRBSZCAP_256 | HDAC_RIRBSIZE_RIRBSIZE_256); for (i = 0; i < HDA_IOSS_NO; i++) { off = hda_get_offset_stream(i); hda_set_reg_by_offset(sc, off + HDAC_SDFIFOS, HDA_FIFO_SIZE); } } static void hda_stream_reset(struct hda_softc *sc, uint8_t stream_ind) { struct hda_stream_desc *st = &sc->streams[stream_ind]; uint32_t off = hda_get_offset_stream(stream_ind); DPRINTF("Reset the HDA stream: 0x%x", stream_ind); /* Reset the Stream Descriptor registers */ memset(sc->regs + HDA_STREAM_REGS_BASE + off, 0, HDA_STREAM_REGS_LEN); /* Reset the Stream Descriptor */ memset(st, 0, sizeof(*st)); hda_set_field_by_offset(sc, off + HDAC_SDSTS, HDAC_SDSTS_FIFORDY, HDAC_SDSTS_FIFORDY); hda_set_field_by_offset(sc, off + HDAC_SDCTL0, HDAC_SDCTL_SRST, HDAC_SDCTL_SRST); } static int hda_stream_start(struct hda_softc *sc, uint8_t stream_ind) { struct hda_stream_desc *st = &sc->streams[stream_ind]; struct hda_bdle_desc *bdle_desc = NULL; struct hda_bdle *bdle = NULL; uint32_t lvi = 0; uint32_t bdl_cnt = 0; uint64_t bdpl = 0; uint64_t bdpu = 0; uint64_t bdl_paddr = 0; void *bdl_vaddr = NULL; uint32_t bdle_sz = 0; uint64_t bdle_addrl = 0; uint64_t bdle_addrh = 0; uint64_t bdle_paddr = 0; void *bdle_vaddr = NULL; uint32_t off = hda_get_offset_stream(stream_ind); uint32_t sdctl = 0; uint8_t strm = 0; uint8_t dir = 0; assert(!st->run); lvi = hda_get_reg_by_offset(sc, off + HDAC_SDLVI); bdpl = hda_get_reg_by_offset(sc, off + HDAC_SDBDPL); bdpu = hda_get_reg_by_offset(sc, off + HDAC_SDBDPU); bdl_cnt = lvi + 1; assert(bdl_cnt <= HDA_BDL_MAX_LEN); bdl_paddr = bdpl | (bdpu << 32); bdl_vaddr = hda_dma_get_vaddr(sc, bdl_paddr, HDA_BDL_ENTRY_LEN * bdl_cnt); if (!bdl_vaddr) { DPRINTF("Fail to get the guest virtual address"); return (-1); } DPRINTF("stream: 0x%x bdl_cnt: 0x%x bdl_paddr: 0x%lx", stream_ind, bdl_cnt, bdl_paddr); st->bdl_cnt = bdl_cnt; bdle = (struct hda_bdle *)bdl_vaddr; for (size_t i = 0; i < bdl_cnt; i++, bdle++) { bdle_sz = bdle->len; assert(!(bdle_sz % HDA_DMA_ACCESS_LEN)); bdle_addrl = bdle->addrl; bdle_addrh = bdle->addrh; bdle_paddr = bdle_addrl | (bdle_addrh << 32); bdle_vaddr = hda_dma_get_vaddr(sc, bdle_paddr, bdle_sz); if (!bdle_vaddr) { DPRINTF("Fail to get the guest virtual address"); return (-1); } bdle_desc = &st->bdl[i]; bdle_desc->addr = bdle_vaddr; bdle_desc->len = bdle_sz; bdle_desc->ioc = bdle->ioc; DPRINTF("bdle: 0x%zx bdle_sz: 0x%x", i, bdle_sz); } sdctl = hda_get_reg_by_offset(sc, off + HDAC_SDCTL0); strm = (sdctl >> 20) & 0x0f; dir = stream_ind >= HDA_ISS_NO; DPRINTF("strm: 0x%x, dir: 0x%x", strm, dir); sc->stream_map[dir][strm] = stream_ind; st->stream = strm; st->dir = dir; st->bp = 0; st->be = 0; hda_set_pib(sc, stream_ind, 0); st->run = 1; hda_notify_codecs(sc, 1, strm, dir); return (0); } static int hda_stream_stop(struct hda_softc *sc, uint8_t stream_ind) { struct hda_stream_desc *st = &sc->streams[stream_ind]; uint8_t strm = st->stream; uint8_t dir = st->dir; DPRINTF("stream: 0x%x, strm: 0x%x, dir: 0x%x", stream_ind, strm, dir); st->run = 0; hda_notify_codecs(sc, 0, strm, dir); return (0); } static uint32_t hda_read(struct hda_softc *sc, uint32_t offset) { if (offset == HDAC_WALCLK) return (24 * (hda_get_clock_ns() - \ sc->wall_clock_start) / 1000); return (hda_get_reg_by_offset(sc, offset)); } static int hda_write(struct hda_softc *sc, uint32_t offset, uint8_t size, uint32_t value) { uint32_t old = hda_get_reg_by_offset(sc, offset); uint32_t masks[] = {0x00000000, 0x000000ff, 0x0000ffff, 0x00ffffff, 0xffffffff}; hda_set_reg_handler set_reg_handler = hda_set_reg_table[offset]; hda_set_field_by_offset(sc, offset, masks[size], value); if (set_reg_handler) set_reg_handler(sc, offset, old); return (0); } static inline void hda_print_cmd_ctl_data(struct hda_codec_cmd_ctl *p) { #if DEBUG_HDA == 1 const char *name = p->name; #endif DPRINTF("%s size: %d", name, p->size); DPRINTF("%s dma_vaddr: %p", name, p->dma_vaddr); DPRINTF("%s wp: 0x%x", name, p->wp); DPRINTF("%s rp: 0x%x", name, p->rp); } static int hda_corb_start(struct hda_softc *sc) { struct hda_codec_cmd_ctl *corb = &sc->corb; uint8_t corbsize = 0; uint64_t corblbase = 0; uint64_t corbubase = 0; uint64_t corbpaddr = 0; corb->name = "CORB"; corbsize = hda_get_reg_by_offset(sc, HDAC_CORBSIZE) & \ HDAC_CORBSIZE_CORBSIZE_MASK; corb->size = hda_corb_sizes[corbsize]; if (!corb->size) { DPRINTF("Invalid corb size"); return (-1); } corblbase = hda_get_reg_by_offset(sc, HDAC_CORBLBASE); corbubase = hda_get_reg_by_offset(sc, HDAC_CORBUBASE); corbpaddr = corblbase | (corbubase << 32); DPRINTF("CORB dma_paddr: %p", (void *)corbpaddr); corb->dma_vaddr = hda_dma_get_vaddr(sc, corbpaddr, HDA_CORB_ENTRY_LEN * corb->size); if (!corb->dma_vaddr) { DPRINTF("Fail to get the guest virtual address"); return (-1); } corb->wp = hda_get_reg_by_offset(sc, HDAC_CORBWP); corb->rp = hda_get_reg_by_offset(sc, HDAC_CORBRP); corb->run = 1; hda_print_cmd_ctl_data(corb); return (0); } static int hda_corb_run(struct hda_softc *sc) { struct hda_codec_cmd_ctl *corb = &sc->corb; uint32_t verb = 0; int err; corb->wp = hda_get_reg_by_offset(sc, HDAC_CORBWP); while (corb->rp != corb->wp && corb->run) { corb->rp++; corb->rp %= corb->size; verb = hda_dma_ld_dword((uint8_t *)corb->dma_vaddr + HDA_CORB_ENTRY_LEN * corb->rp); err = hda_send_command(sc, verb); assert(!err); } hda_set_reg_by_offset(sc, HDAC_CORBRP, corb->rp); if (corb->run) hda_response_interrupt(sc); return (0); } static int hda_rirb_start(struct hda_softc *sc) { struct hda_codec_cmd_ctl *rirb = &sc->rirb; uint8_t rirbsize = 0; uint64_t rirblbase = 0; uint64_t rirbubase = 0; uint64_t rirbpaddr = 0; rirb->name = "RIRB"; rirbsize = hda_get_reg_by_offset(sc, HDAC_RIRBSIZE) & \ HDAC_RIRBSIZE_RIRBSIZE_MASK; rirb->size = hda_rirb_sizes[rirbsize]; if (!rirb->size) { DPRINTF("Invalid rirb size"); return (-1); } rirblbase = hda_get_reg_by_offset(sc, HDAC_RIRBLBASE); rirbubase = hda_get_reg_by_offset(sc, HDAC_RIRBUBASE); rirbpaddr = rirblbase | (rirbubase << 32); DPRINTF("RIRB dma_paddr: %p", (void *)rirbpaddr); rirb->dma_vaddr = hda_dma_get_vaddr(sc, rirbpaddr, HDA_RIRB_ENTRY_LEN * rirb->size); if (!rirb->dma_vaddr) { DPRINTF("Fail to get the guest virtual address"); return (-1); } rirb->wp = hda_get_reg_by_offset(sc, HDAC_RIRBWP); rirb->rp = 0x0000; rirb->run = 1; hda_print_cmd_ctl_data(rirb); return (0); } static void * hda_dma_get_vaddr(struct hda_softc *sc, uint64_t dma_paddr, size_t len) { struct pci_devinst *pi = sc->pci_dev; assert(pi); return (paddr_guest2host(pi->pi_vmctx, (uintptr_t)dma_paddr, len)); } static void hda_dma_st_dword(void *dma_vaddr, uint32_t data) { *(uint32_t*)dma_vaddr = data; } static uint32_t hda_dma_ld_dword(void *dma_vaddr) { return (*(uint32_t*)dma_vaddr); } static inline uint8_t hda_get_stream_by_offsets(uint32_t offset, uint8_t reg_offset) { uint8_t stream_ind = (offset - reg_offset) >> 5; assert(stream_ind < HDA_IOSS_NO); return (stream_ind); } static inline uint32_t hda_get_offset_stream(uint8_t stream_ind) { return (stream_ind << 5); } static void hda_set_gctl(struct hda_softc *sc, uint32_t offset, uint32_t old __unused) { uint32_t value = hda_get_reg_by_offset(sc, offset); if (!(value & HDAC_GCTL_CRST)) { hda_reset(sc); } } static void hda_set_statests(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint32_t value = hda_get_reg_by_offset(sc, offset); hda_set_reg_by_offset(sc, offset, old); /* clear the corresponding bits written by the software (guest) */ hda_set_field_by_offset(sc, offset, value & HDA_STATESTS_IRQ_MASK, 0); hda_update_intr(sc); } static void hda_set_corbwp(struct hda_softc *sc, uint32_t offset __unused, uint32_t old __unused) { hda_corb_run(sc); } static void hda_set_corbctl(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint32_t value = hda_get_reg_by_offset(sc, offset); int err; struct hda_codec_cmd_ctl *corb = NULL; if (value & HDAC_CORBCTL_CORBRUN) { if (!(old & HDAC_CORBCTL_CORBRUN)) { err = hda_corb_start(sc); assert(!err); } } else { corb = &sc->corb; memset(corb, 0, sizeof(*corb)); } hda_corb_run(sc); } static void hda_set_rirbctl(struct hda_softc *sc, uint32_t offset, uint32_t old __unused) { uint32_t value = hda_get_reg_by_offset(sc, offset); int err; struct hda_codec_cmd_ctl *rirb = NULL; if (value & HDAC_RIRBCTL_RIRBDMAEN) { err = hda_rirb_start(sc); assert(!err); } else { rirb = &sc->rirb; memset(rirb, 0, sizeof(*rirb)); } } static void hda_set_rirbsts(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint32_t value = hda_get_reg_by_offset(sc, offset); hda_set_reg_by_offset(sc, offset, old); /* clear the corresponding bits written by the software (guest) */ hda_set_field_by_offset(sc, offset, value & HDA_RIRBSTS_IRQ_MASK, 0); hda_update_intr(sc); } static void hda_set_dpiblbase(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint32_t value = hda_get_reg_by_offset(sc, offset); uint64_t dpiblbase = 0; uint64_t dpibubase = 0; uint64_t dpibpaddr = 0; if ((value & HDAC_DPLBASE_DPLBASE_DMAPBE) != (old & \ HDAC_DPLBASE_DPLBASE_DMAPBE)) { if (value & HDAC_DPLBASE_DPLBASE_DMAPBE) { dpiblbase = value & HDAC_DPLBASE_DPLBASE_MASK; dpibubase = hda_get_reg_by_offset(sc, HDAC_DPIBUBASE); dpibpaddr = dpiblbase | (dpibubase << 32); DPRINTF("DMA Position In Buffer dma_paddr: %p", (void *)dpibpaddr); sc->dma_pib_vaddr = hda_dma_get_vaddr(sc, dpibpaddr, HDA_DMA_PIB_ENTRY_LEN * HDA_IOSS_NO); if (!sc->dma_pib_vaddr) { DPRINTF("Fail to get the guest \ virtual address"); assert(0); } } else { DPRINTF("DMA Position In Buffer Reset"); sc->dma_pib_vaddr = NULL; } } } static void hda_set_sdctl(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint8_t stream_ind = hda_get_stream_by_offsets(offset, HDAC_SDCTL0); uint32_t value = hda_get_reg_by_offset(sc, offset); int err; DPRINTF("stream_ind: 0x%x old: 0x%x value: 0x%x", stream_ind, old, value); if (value & HDAC_SDCTL_SRST) { hda_stream_reset(sc, stream_ind); } if ((value & HDAC_SDCTL_RUN) != (old & HDAC_SDCTL_RUN)) { if (value & HDAC_SDCTL_RUN) { err = hda_stream_start(sc, stream_ind); assert(!err); } else { err = hda_stream_stop(sc, stream_ind); assert(!err); } } } static void hda_set_sdctl2(struct hda_softc *sc, uint32_t offset, uint32_t old __unused) { uint32_t value = hda_get_reg_by_offset(sc, offset); hda_set_field_by_offset(sc, offset - 2, 0x00ff0000, value << 16); } static void hda_set_sdsts(struct hda_softc *sc, uint32_t offset, uint32_t old) { uint32_t value = hda_get_reg_by_offset(sc, offset); hda_set_reg_by_offset(sc, offset, old); /* clear the corresponding bits written by the software (guest) */ hda_set_field_by_offset(sc, offset, value & HDA_SDSTS_IRQ_MASK, 0); hda_update_intr(sc); } static int hda_signal_state_change(struct hda_codec_inst *hci) { struct hda_softc *sc = NULL; uint32_t sdiwake = 0; assert(hci); assert(hci->hda); DPRINTF("cad: 0x%x", hci->cad); sc = hci->hda; sdiwake = 1 << hci->cad; hda_set_field_by_offset(sc, HDAC_STATESTS, sdiwake, sdiwake); hda_update_intr(sc); return (0); } static int hda_response(struct hda_codec_inst *hci, uint32_t response, uint8_t unsol) { struct hda_softc *sc = NULL; struct hda_codec_cmd_ctl *rirb = NULL; uint32_t response_ex = 0; uint8_t rintcnt = 0; assert(hci); assert(hci->cad <= HDA_CODEC_MAX); response_ex = hci->cad | unsol; sc = hci->hda; assert(sc); rirb = &sc->rirb; if (rirb->run) { rirb->wp++; rirb->wp %= rirb->size; hda_dma_st_dword((uint8_t *)rirb->dma_vaddr + HDA_RIRB_ENTRY_LEN * rirb->wp, response); hda_dma_st_dword((uint8_t *)rirb->dma_vaddr + HDA_RIRB_ENTRY_LEN * rirb->wp + 0x04, response_ex); hda_set_reg_by_offset(sc, HDAC_RIRBWP, rirb->wp); sc->rirb_cnt++; } rintcnt = hda_get_reg_by_offset(sc, HDAC_RINTCNT); if (sc->rirb_cnt == rintcnt) hda_response_interrupt(sc); return (0); } static int hda_transfer(struct hda_codec_inst *hci, uint8_t stream, uint8_t dir, uint8_t *buf, size_t count) { struct hda_softc *sc = NULL; struct hda_stream_desc *st = NULL; struct hda_bdle_desc *bdl = NULL; struct hda_bdle_desc *bdle_desc = NULL; uint8_t stream_ind = 0; uint32_t lpib = 0; uint32_t off = 0; size_t left = 0; uint8_t irq = 0; assert(hci); assert(hci->hda); assert(buf); assert(!(count % HDA_DMA_ACCESS_LEN)); if (!stream) { DPRINTF("Invalid stream"); return (-1); } sc = hci->hda; assert(stream < HDA_STREAM_TAGS_CNT); stream_ind = sc->stream_map[dir][stream]; if (!dir) assert(stream_ind < HDA_ISS_NO); else assert(stream_ind >= HDA_ISS_NO && stream_ind < HDA_IOSS_NO); st = &sc->streams[stream_ind]; if (!st->run) { DPRINTF("Stream 0x%x stopped", stream); return (-1); } assert(st->stream == stream); off = hda_get_offset_stream(stream_ind); lpib = hda_get_reg_by_offset(sc, off + HDAC_SDLPIB); bdl = st->bdl; assert(st->be < st->bdl_cnt); assert(st->bp < bdl[st->be].len); left = count; while (left) { bdle_desc = &bdl[st->be]; if (dir) *(uint32_t *)buf = hda_dma_ld_dword( (uint8_t *)bdle_desc->addr + st->bp); else hda_dma_st_dword((uint8_t *)bdle_desc->addr + st->bp, *(uint32_t *)buf); buf += HDA_DMA_ACCESS_LEN; st->bp += HDA_DMA_ACCESS_LEN; lpib += HDA_DMA_ACCESS_LEN; left -= HDA_DMA_ACCESS_LEN; if (st->bp == bdle_desc->len) { st->bp = 0; if (bdle_desc->ioc) irq = 1; st->be++; if (st->be == st->bdl_cnt) { st->be = 0; lpib = 0; } bdle_desc = &bdl[st->be]; } } hda_set_pib(sc, stream_ind, lpib); if (irq) { hda_set_field_by_offset(sc, off + HDAC_SDSTS, HDAC_SDSTS_BCIS, HDAC_SDSTS_BCIS); hda_update_intr(sc); } return (0); } static void hda_set_pib(struct hda_softc *sc, uint8_t stream_ind, uint32_t pib) { uint32_t off = hda_get_offset_stream(stream_ind); hda_set_reg_by_offset(sc, off + HDAC_SDLPIB, pib); /* LPIB Alias */ hda_set_reg_by_offset(sc, 0x2000 + off + HDAC_SDLPIB, pib); if (sc->dma_pib_vaddr) *(uint32_t *)((uint8_t *)sc->dma_pib_vaddr + stream_ind * HDA_DMA_PIB_ENTRY_LEN) = pib; } static uint64_t hda_get_clock_ns(void) { struct timespec ts; int err; err = clock_gettime(CLOCK_MONOTONIC, &ts); assert(!err); return (ts.tv_sec * 1000000000LL + ts.tv_nsec); } /* * PCI HDA function definitions */ static int -pci_hda_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) +pci_hda_init(struct pci_devinst *pi, nvlist_t *nvl) { struct hda_softc *sc = NULL; - assert(ctx != NULL); assert(pi != NULL); pci_set_cfgdata16(pi, PCIR_VENDOR, INTEL_VENDORID); pci_set_cfgdata16(pi, PCIR_DEVICE, HDA_INTEL_82801G); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_MULTIMEDIA_HDA); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_MULTIMEDIA); /* select the Intel HDA mode */ pci_set_cfgdata8(pi, PCIR_HDCTL, 0x01); /* allocate one BAR register for the Memory address offsets */ pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, HDA_LAST_OFFSET); /* allocate an IRQ pin for our slot */ pci_lintr_request(pi); sc = hda_init(nvl); if (!sc) return (-1); sc->pci_dev = pi; pi->pi_arg = sc; return (0); } static void -pci_hda_write(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size, +pci_hda_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct hda_softc *sc = pi->pi_arg; int err; assert(sc); assert(baridx == 0); assert(size <= 4); DPRINTF("offset: 0x%lx value: 0x%lx", offset, value); err = hda_write(sc, offset, size, value); assert(!err); } static uint64_t -pci_hda_read(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size) +pci_hda_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct hda_softc *sc = pi->pi_arg; uint64_t value = 0; assert(sc); assert(baridx == 0); assert(size <= 4); value = hda_read(sc, offset); DPRINTF("offset: 0x%lx value: 0x%lx", offset, value); return (value); } diff --git a/usr.sbin/bhyve/pci_hostbridge.c b/usr.sbin/bhyve/pci_hostbridge.c index 50c334d8e408..f0878d97a77e 100644 --- a/usr.sbin/bhyve/pci_hostbridge.c +++ b/usr.sbin/bhyve/pci_hostbridge.c @@ -1,88 +1,87 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include "config.h" #include "pci_emul.h" static int -pci_hostbridge_init(struct vmctx *ctx __unused, struct pci_devinst *pi, - nvlist_t *nvl) +pci_hostbridge_init(struct pci_devinst *pi, nvlist_t *nvl) { const char *value; u_int vendor, device; vendor = 0x1275; /* NetApp */ device = 0x1275; /* NetApp */ value = get_config_value_node(nvl, "vendor"); if (value != NULL) vendor = strtol(value, NULL, 0); value = get_config_value_node(nvl, "devid"); if (value != NULL) device = strtol(value, NULL, 0); /* config space */ pci_set_cfgdata16(pi, PCIR_VENDOR, vendor); pci_set_cfgdata16(pi, PCIR_DEVICE, device); pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT); return (0); } static int pci_amd_hostbridge_legacy_config(nvlist_t *nvl, const char *opts __unused) { set_config_value_node(nvl, "vendor", "0x1022"); /* AMD */ set_config_value_node(nvl, "devid", "0x7432"); /* made up */ return (0); } static const struct pci_devemu pci_de_amd_hostbridge = { .pe_emu = "amd_hostbridge", .pe_legacy_config = pci_amd_hostbridge_legacy_config, .pe_alias = "hostbridge", }; PCI_EMUL_SET(pci_de_amd_hostbridge); static const struct pci_devemu pci_de_hostbridge = { .pe_emu = "hostbridge", .pe_init = pci_hostbridge_init, }; PCI_EMUL_SET(pci_de_hostbridge); diff --git a/usr.sbin/bhyve/pci_lpc.c b/usr.sbin/bhyve/pci_lpc.c index e275946d9621..548726e27d0d 100644 --- a/usr.sbin/bhyve/pci_lpc.c +++ b/usr.sbin/bhyve/pci_lpc.c @@ -1,530 +1,527 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Neel Natu * Copyright (c) 2013 Tycho Nightingale * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "acpi.h" #include "debug.h" #include "bootrom.h" #include "config.h" #include "inout.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "pctestdev.h" #include "uart_emul.h" #define IO_ICU1 0x20 #define IO_ICU2 0xA0 SET_DECLARE(lpc_dsdt_set, struct lpc_dsdt); SET_DECLARE(lpc_sysres_set, struct lpc_sysres); #define ELCR_PORT 0x4d0 SYSRES_IO(ELCR_PORT, 2); #define IO_TIMER1_PORT 0x40 #define NMISC_PORT 0x61 SYSRES_IO(NMISC_PORT, 1); static struct pci_devinst *lpc_bridge; #define LPC_UART_NUM 4 static struct lpc_uart_softc { struct uart_softc *uart_softc; int iobase; int irq; int enabled; } lpc_uart_softc[LPC_UART_NUM]; static const char *lpc_uart_names[LPC_UART_NUM] = { "com1", "com2", "com3", "com4" }; static const char *lpc_uart_acpi_names[LPC_UART_NUM] = { "COM1", "COM2", "COM3", "COM4" }; /* * LPC device configuration is in the following form: * [,] * For e.g. "com1,stdio" or "bootrom,/var/romfile" */ int lpc_device_parse(const char *opts) { int unit, error; char *str, *cpy, *lpcdev, *node_name; const char *romfile, *varfile; error = -1; str = cpy = strdup(opts); lpcdev = strsep(&str, ","); if (lpcdev != NULL) { if (strcasecmp(lpcdev, "bootrom") == 0) { romfile = strsep(&str, ","); if (romfile == NULL) { errx(4, "invalid bootrom option \"%s\"", opts); } set_config_value("lpc.bootrom", romfile); varfile = strsep(&str, ","); if (varfile != NULL) { set_config_value("lpc.bootvars", varfile); } error = 0; goto done; } for (unit = 0; unit < LPC_UART_NUM; unit++) { if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) { asprintf(&node_name, "lpc.%s.path", lpc_uart_names[unit]); set_config_value(node_name, str); free(node_name); error = 0; goto done; } } if (strcasecmp(lpcdev, pctestdev_getname()) == 0) { asprintf(&node_name, "lpc.%s", pctestdev_getname()); set_config_bool(node_name, true); free(node_name); error = 0; goto done; } } done: free(cpy); return (error); } void lpc_print_supported_devices(void) { size_t i; printf("bootrom\n"); for (i = 0; i < LPC_UART_NUM; i++) printf("%s\n", lpc_uart_names[i]); printf("%s\n", pctestdev_getname()); } const char * lpc_bootrom(void) { return (get_config_value("lpc.bootrom")); } static void lpc_uart_intr_assert(void *arg) { struct lpc_uart_softc *sc = arg; assert(sc->irq >= 0); vm_isa_pulse_irq(lpc_bridge->pi_vmctx, sc->irq, sc->irq); } static void lpc_uart_intr_deassert(void *arg __unused) { /* * The COM devices on the LPC bus generate edge triggered interrupts, * so nothing more to do here. */ } static int lpc_uart_io_handler(struct vmctx *ctx __unused, int in, int port, int bytes, uint32_t *eax, void *arg) { int offset; struct lpc_uart_softc *sc = arg; offset = port - sc->iobase; switch (bytes) { case 1: if (in) *eax = uart_read(sc->uart_softc, offset); else uart_write(sc->uart_softc, offset, *eax); break; case 2: if (in) { *eax = uart_read(sc->uart_softc, offset); *eax |= uart_read(sc->uart_softc, offset + 1) << 8; } else { uart_write(sc->uart_softc, offset, *eax); uart_write(sc->uart_softc, offset + 1, *eax >> 8); } break; default: return (-1); } return (0); } static int lpc_init(struct vmctx *ctx) { struct lpc_uart_softc *sc; struct inout_port iop; const char *backend, *name; char *node_name; int unit, error; const nvlist_t *nvl; nvl = find_config_node("lpc"); if (nvl != NULL && nvlist_exists(nvl, "bootrom")) { error = bootrom_loadrom(ctx, nvl); if (error) return (error); } /* COM1 and COM2 */ for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = &lpc_uart_softc[unit]; name = lpc_uart_names[unit]; if (uart_legacy_alloc(unit, &sc->iobase, &sc->irq) != 0) { EPRINTLN("Unable to allocate resources for " "LPC device %s", name); return (-1); } pci_irq_reserve(sc->irq); sc->uart_softc = uart_init(lpc_uart_intr_assert, lpc_uart_intr_deassert, sc); asprintf(&node_name, "lpc.%s.path", name); backend = get_config_value(node_name); free(node_name); if (uart_set_backend(sc->uart_softc, backend) != 0) { EPRINTLN("Unable to initialize backend '%s' " "for LPC device %s", backend, name); return (-1); } bzero(&iop, sizeof(struct inout_port)); iop.name = name; iop.port = sc->iobase; iop.size = UART_IO_BAR_SIZE; iop.flags = IOPORT_F_INOUT; iop.handler = lpc_uart_io_handler; iop.arg = sc; error = register_inout(&iop); assert(error == 0); sc->enabled = 1; } /* pc-testdev */ asprintf(&node_name, "lpc.%s", pctestdev_getname()); if (get_config_bool_default(node_name, false)) { error = pctestdev_init(ctx); if (error) return (error); } free(node_name); return (0); } static void pci_lpc_write_dsdt(struct pci_devinst *pi) { struct lpc_dsdt **ldpp, *ldp; dsdt_line(""); dsdt_line("Device (ISA)"); dsdt_line("{"); dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func); dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)"); dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)"); dsdt_line(" {"); dsdt_line(" Offset (0x60),"); dsdt_line(" PIRA, 8,"); dsdt_line(" PIRB, 8,"); dsdt_line(" PIRC, 8,"); dsdt_line(" PIRD, 8,"); dsdt_line(" Offset (0x68),"); dsdt_line(" PIRE, 8,"); dsdt_line(" PIRF, 8,"); dsdt_line(" PIRG, 8,"); dsdt_line(" PIRH, 8"); dsdt_line(" }"); dsdt_line(""); dsdt_indent(1); SET_FOREACH(ldpp, lpc_dsdt_set) { ldp = *ldpp; ldp->handler(); } dsdt_line(""); dsdt_line("Device (PIC)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(IO_ICU1, 2); dsdt_fixed_ioport(IO_ICU2, 2); dsdt_fixed_irq(2); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); dsdt_line(""); dsdt_line("Device (TIMR)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(IO_TIMER1_PORT, 4); dsdt_fixed_irq(0); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); dsdt_unindent(1); dsdt_line("}"); } static void pci_lpc_sysres_dsdt(void) { struct lpc_sysres **lspp, *lsp; dsdt_line(""); dsdt_line("Device (SIO)"); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); SET_FOREACH(lspp, lpc_sysres_set) { lsp = *lspp; switch (lsp->type) { case LPC_SYSRES_IO: dsdt_fixed_ioport(lsp->base, lsp->length); break; case LPC_SYSRES_MEM: dsdt_fixed_mem32(lsp->base, lsp->length); break; } } dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); } LPC_DSDT(pci_lpc_sysres_dsdt); static void pci_lpc_uart_dsdt(void) { struct lpc_uart_softc *sc; int unit; for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = &lpc_uart_softc[unit]; if (!sc->enabled) continue; dsdt_line(""); dsdt_line("Device (%s)", lpc_uart_acpi_names[unit]); dsdt_line("{"); dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))"); dsdt_line(" Name (_UID, %d)", unit + 1); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_indent(2); dsdt_fixed_ioport(sc->iobase, UART_IO_BAR_SIZE); dsdt_fixed_irq(sc->irq); dsdt_unindent(2); dsdt_line(" })"); dsdt_line("}"); } } LPC_DSDT(pci_lpc_uart_dsdt); static int -pci_lpc_cfgwrite(struct vmctx *ctx, struct pci_devinst *pi, - int coff, int bytes, uint32_t val) +pci_lpc_cfgwrite(struct pci_devinst *pi, int coff, int bytes, uint32_t val) { int pirq_pin; if (bytes == 1) { pirq_pin = 0; if (coff >= 0x60 && coff <= 0x63) pirq_pin = coff - 0x60 + 1; if (coff >= 0x68 && coff <= 0x6b) pirq_pin = coff - 0x68 + 5; if (pirq_pin != 0) { - pirq_write(ctx, pirq_pin, val); + pirq_write(pi->pi_vmctx, pirq_pin, val); pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin)); return (0); } } return (-1); } static void -pci_lpc_write(struct vmctx *ctx __unused, - struct pci_devinst *pi __unused, int baridx __unused, +pci_lpc_write(struct pci_devinst *pi __unused, int baridx __unused, uint64_t offset __unused, int size __unused, uint64_t value __unused) { } static uint64_t -pci_lpc_read(struct vmctx *ctx __unused, - struct pci_devinst *pi __unused, int baridx __unused, uint64_t offset __unused, - int size __unused) +pci_lpc_read(struct pci_devinst *pi __unused, int baridx __unused, + uint64_t offset __unused, int size __unused) { return (0); } #define LPC_DEV 0x7000 #define LPC_VENDOR 0x8086 static int -pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl __unused) +pci_lpc_init(struct pci_devinst *pi, nvlist_t *nvl __unused) { /* * Do not allow more than one LPC bridge to be configured. */ if (lpc_bridge != NULL) { EPRINTLN("Only one LPC bridge is allowed."); return (-1); } /* * Enforce that the LPC can only be configured on bus 0. This * simplifies the ACPI DSDT because it can provide a decode for * all legacy i/o ports behind bus 0. */ if (pi->pi_bus != 0) { EPRINTLN("LPC bridge can be present only on bus 0."); return (-1); } - if (lpc_init(ctx) != 0) + if (lpc_init(pi->pi_vmctx) != 0) return (-1); /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV); pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA); lpc_bridge = pi; return (0); } char * lpc_pirq_name(int pin) { char *name; if (lpc_bridge == NULL) return (NULL); asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1); return (name); } void lpc_pirq_routed(void) { int pin; if (lpc_bridge == NULL) return; for (pin = 0; pin < 4; pin++) pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1)); for (pin = 0; pin < 4; pin++) pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5)); } #ifdef BHYVE_SNAPSHOT static int pci_lpc_snapshot(struct vm_snapshot_meta *meta) { int unit, ret; struct uart_softc *sc; for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = lpc_uart_softc[unit].uart_softc; ret = uart_snapshot(sc, meta); if (ret != 0) goto done; } done: return (ret); } #endif static const struct pci_devemu pci_de_lpc = { .pe_emu = "lpc", .pe_init = pci_lpc_init, .pe_write_dsdt = pci_lpc_write_dsdt, .pe_cfgwrite = pci_lpc_cfgwrite, .pe_barwrite = pci_lpc_write, .pe_barread = pci_lpc_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_lpc_snapshot, #endif }; PCI_EMUL_SET(pci_de_lpc); diff --git a/usr.sbin/bhyve/pci_nvme.c b/usr.sbin/bhyve/pci_nvme.c index 75e6f247501a..cf1c815096d3 100644 --- a/usr.sbin/bhyve/pci_nvme.c +++ b/usr.sbin/bhyve/pci_nvme.c @@ -1,3395 +1,3394 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2017 Shunsuke Mie * Copyright (c) 2018 Leon Dang * Copyright (c) 2020 Chuck Tuffli * * Function crc16 Copyright (c) 2017, Fedor Uporov * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * bhyve PCIe-NVMe device emulation. * * options: * -s ,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm= * * accepted devpath: * /dev/blockdev * /path/to/image * ram=size_in_MiB * * maxq = max number of queues * qsz = max elements in each queue * ioslots = max number of concurrent io requests * sectsz = sector size (defaults to blockif sector size) * ser = serial number (20-chars max) * eui64 = IEEE Extended Unique Identifier (8 byte value) * dsm = DataSet Management support. Option is one of auto, enable,disable * */ /* TODO: - create async event for smart and log - intr coalesce */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "block_if.h" #include "config.h" #include "debug.h" #include "pci_emul.h" static int nvme_debug = 0; #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args) /* defaults; can be overridden */ #define NVME_MSIX_BAR 4 #define NVME_IOSLOTS 8 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ #define NVME_MMIO_SPACE_MIN (1 << 14) #define NVME_QUEUES 16 #define NVME_MAX_QENTRIES 2048 /* Memory Page size Minimum reported in CAP register */ #define NVME_MPSMIN 0 /* MPSMIN converted to bytes */ #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) #define NVME_MDTS 9 /* Note the + 1 allows for the initial descriptor to not be page aligned */ #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) /* This is a synthetic status code to indicate there is no status */ #define NVME_NO_STATUS 0xffff #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) /* Reported temperature in Kelvin (i.e. room temperature) */ #define NVME_TEMPERATURE 296 /* helpers */ /* Convert a zero-based value into a one-based value */ #define ONE_BASED(zero) ((zero) + 1) /* Convert a one-based value into a zero-based value */ #define ZERO_BASED(one) ((one) - 1) /* Encode number of SQ's and CQ's for Set/Get Features */ #define NVME_FEATURE_NUM_QUEUES(sc) \ (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) enum nvme_controller_register_offsets { NVME_CR_CAP_LOW = 0x00, NVME_CR_CAP_HI = 0x04, NVME_CR_VS = 0x08, NVME_CR_INTMS = 0x0c, NVME_CR_INTMC = 0x10, NVME_CR_CC = 0x14, NVME_CR_CSTS = 0x1c, NVME_CR_NSSR = 0x20, NVME_CR_AQA = 0x24, NVME_CR_ASQ_LOW = 0x28, NVME_CR_ASQ_HI = 0x2c, NVME_CR_ACQ_LOW = 0x30, NVME_CR_ACQ_HI = 0x34, }; enum nvme_cmd_cdw11 { NVME_CMD_CDW11_PC = 0x0001, NVME_CMD_CDW11_IEN = 0x0002, NVME_CMD_CDW11_IV = 0xFFFF0000, }; enum nvme_copy_dir { NVME_COPY_TO_PRP, NVME_COPY_FROM_PRP, }; #define NVME_CQ_INTEN 0x01 #define NVME_CQ_INTCOAL 0x02 struct nvme_completion_queue { struct nvme_completion *qbase; pthread_mutex_t mtx; uint32_t size; uint16_t tail; /* nvme progress */ uint16_t head; /* guest progress */ uint16_t intr_vec; uint32_t intr_en; }; struct nvme_submission_queue { struct nvme_command *qbase; pthread_mutex_t mtx; uint32_t size; uint16_t head; /* nvme progress */ uint16_t tail; /* guest progress */ uint16_t cqid; /* completion queue id */ int qpriority; }; enum nvme_storage_type { NVME_STOR_BLOCKIF = 0, NVME_STOR_RAM = 1, }; struct pci_nvme_blockstore { enum nvme_storage_type type; void *ctx; uint64_t size; uint32_t sectsz; uint32_t sectsz_bits; uint64_t eui64; uint32_t deallocate:1; }; /* * Calculate the number of additional page descriptors for guest IO requests * based on the advertised Max Data Transfer (MDTS) and given the number of * default iovec's in a struct blockif_req. */ #define MDTS_PAD_SIZE \ ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \ NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 0 ) struct pci_nvme_ioreq { struct pci_nvme_softc *sc; STAILQ_ENTRY(pci_nvme_ioreq) link; struct nvme_submission_queue *nvme_sq; uint16_t sqid; /* command information */ uint16_t opc; uint16_t cid; uint32_t nsid; uint64_t prev_gpaddr; size_t prev_size; size_t bytes; struct blockif_req io_req; struct iovec iovpadding[MDTS_PAD_SIZE]; }; enum nvme_dsm_type { /* Dataset Management bit in ONCS reflects backing storage capability */ NVME_DATASET_MANAGEMENT_AUTO, /* Unconditionally set Dataset Management bit in ONCS */ NVME_DATASET_MANAGEMENT_ENABLE, /* Unconditionally clear Dataset Management bit in ONCS */ NVME_DATASET_MANAGEMENT_DISABLE, }; struct pci_nvme_softc; struct nvme_feature_obj; typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); struct nvme_feature_obj { uint32_t cdw11; nvme_feature_cb set; nvme_feature_cb get; bool namespace_specific; }; #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) typedef enum { PCI_NVME_AE_TYPE_ERROR = 0, PCI_NVME_AE_TYPE_SMART, PCI_NVME_AE_TYPE_NOTICE, PCI_NVME_AE_TYPE_IO_CMD = 6, PCI_NVME_AE_TYPE_VENDOR = 7, PCI_NVME_AE_TYPE_MAX /* Must be last */ } pci_nvme_async_type; /* Asynchronous Event Requests */ struct pci_nvme_aer { STAILQ_ENTRY(pci_nvme_aer) link; uint16_t cid; /* Command ID of the submitted AER */ }; /** Asynchronous Event Information - Notice */ typedef enum { PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0, PCI_NVME_AEI_NOTICE_FW_ACTIVATION, PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE, PCI_NVME_AEI_NOTICE_ANA_CHANGE, PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE, PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT, PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE, PCI_NVME_AEI_NOTICE_MAX, } pci_nvme_async_event_info_notice; #define PCI_NVME_AEI_NOTICE_SHIFT 8 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT)) /* Asynchronous Event Notifications */ struct pci_nvme_aen { pci_nvme_async_type atype; uint32_t event_data; bool posted; }; /* * By default, enable all Asynchrnous Event Notifications: * SMART / Health Critical Warnings * Namespace Attribute Notices */ #define PCI_NVME_AEN_DEFAULT_MASK 0x11f typedef enum { NVME_CNTRLTYPE_IO = 1, NVME_CNTRLTYPE_DISCOVERY = 2, NVME_CNTRLTYPE_ADMIN = 3, } pci_nvme_cntrl_type; struct pci_nvme_softc { struct pci_devinst *nsc_pi; pthread_mutex_t mtx; struct nvme_registers regs; struct nvme_namespace_data nsdata; struct nvme_controller_data ctrldata; struct nvme_error_information_entry err_log; struct nvme_health_information_page health_log; struct nvme_firmware_page fw_log; struct nvme_ns_list ns_log; struct pci_nvme_blockstore nvstore; uint16_t max_qentries; /* max entries per queue */ uint32_t max_queues; /* max number of IO SQ's or CQ's */ uint32_t num_cqueues; uint32_t num_squeues; bool num_q_is_set; /* Has host set Number of Queues */ struct pci_nvme_ioreq *ioreqs; STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ uint32_t pending_ios; uint32_t ioslots; sem_t iosemlock; /* * Memory mapped Submission and Completion queues * Each array includes both Admin and IO queues */ struct nvme_completion_queue *compl_queues; struct nvme_submission_queue *submit_queues; struct nvme_feature_obj feat[NVME_FID_MAX]; enum nvme_dsm_type dataset_management; /* Accounting for SMART data */ __uint128_t read_data_units; __uint128_t write_data_units; __uint128_t read_commands; __uint128_t write_commands; uint32_t read_dunits_remainder; uint32_t write_dunits_remainder; STAILQ_HEAD(, pci_nvme_aer) aer_list; pthread_mutex_t aer_mtx; uint32_t aer_count; struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; pthread_t aen_tid; pthread_mutex_t aen_mtx; pthread_cond_t aen_cond; }; static void pci_nvme_cq_update(struct pci_nvme_softc *sc, struct nvme_completion_queue *cq, uint32_t cdw0, uint16_t cid, uint16_t sqid, uint16_t status); static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); static void pci_nvme_io_done(struct blockif_req *, int); /* Controller Configuration utils */ #define NVME_CC_GET_EN(cc) \ ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) #define NVME_CC_GET_CSS(cc) \ ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) #define NVME_CC_GET_SHN(cc) \ ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) #define NVME_CC_GET_IOSQES(cc) \ ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) #define NVME_CC_GET_IOCQES(cc) \ ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) #define NVME_CC_WRITE_MASK \ ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) #define NVME_CC_NEN_WRITE_MASK \ ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) /* Controller Status utils */ #define NVME_CSTS_GET_RDY(sts) \ ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) #define NVME_CSTS_CFS (1 << NVME_CSTS_REG_CFS_SHIFT) /* Completion Queue status word utils */ #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) #define NVME_STATUS_MASK \ ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ NVME_CTRLR_DATA_ONCS_DSM_SHIFT) static void nvme_feature_invalid_cb(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_temperature(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_num_queues(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_iv_config(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void nvme_feature_async_event(struct pci_nvme_softc *, struct nvme_feature_obj *, struct nvme_command *, struct nvme_completion *); static void *aen_thr(void *arg); static __inline void cpywithpad(char *dst, size_t dst_size, const char *src, char pad) { size_t len; len = strnlen(src, dst_size); memset(dst, pad, dst_size); memcpy(dst, src, len); } static __inline void pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) { *status &= ~NVME_STATUS_MASK; *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; } static __inline void pci_nvme_status_genc(uint16_t *status, uint16_t code) { pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); } /* * Initialize the requested number or IO Submission and Completion Queues. * Admin queues are allocated implicitly. */ static void pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) { uint32_t i; /* * Allocate and initialize the Submission Queues */ if (nsq > NVME_QUEUES) { WPRINTF("%s: clamping number of SQ from %u to %u", __func__, nsq, NVME_QUEUES); nsq = NVME_QUEUES; } sc->num_squeues = nsq; sc->submit_queues = calloc(sc->num_squeues + 1, sizeof(struct nvme_submission_queue)); if (sc->submit_queues == NULL) { WPRINTF("%s: SQ allocation failed", __func__); sc->num_squeues = 0; } else { struct nvme_submission_queue *sq = sc->submit_queues; for (i = 0; i < sc->num_squeues + 1; i++) pthread_mutex_init(&sq[i].mtx, NULL); } /* * Allocate and initialize the Completion Queues */ if (ncq > NVME_QUEUES) { WPRINTF("%s: clamping number of CQ from %u to %u", __func__, ncq, NVME_QUEUES); ncq = NVME_QUEUES; } sc->num_cqueues = ncq; sc->compl_queues = calloc(sc->num_cqueues + 1, sizeof(struct nvme_completion_queue)); if (sc->compl_queues == NULL) { WPRINTF("%s: CQ allocation failed", __func__); sc->num_cqueues = 0; } else { struct nvme_completion_queue *cq = sc->compl_queues; for (i = 0; i < sc->num_cqueues + 1; i++) pthread_mutex_init(&cq[i].mtx, NULL); } } static void pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) { struct nvme_controller_data *cd = &sc->ctrldata; cd->vid = 0xFB5D; cd->ssvid = 0x0000; cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); /* Num of submission commands that we can handle at a time (2^rab) */ cd->rab = 4; /* FreeBSD OUI */ cd->ieee[0] = 0x58; cd->ieee[1] = 0x9c; cd->ieee[2] = 0xfc; cd->mic = 0; cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ cd->ver = NVME_REV(1,4); cd->cntrltype = NVME_CNTRLTYPE_IO; cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR); cd->acl = 2; cd->aerl = 4; /* Advertise 1, Read-only firmware slot */ cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) | (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); cd->lpa = 0; /* TODO: support some simple things like SMART */ cd->elpe = 0; /* max error log page entries */ /* * Report a single power state (zero-based value) * power_state[] values are left as zero to indicate "Not reported" */ cd->npss = 0; /* Warning Composite Temperature Threshold */ cd->wctemp = 0x0157; cd->cctemp = 0x0157; /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */ cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO << NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT); cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); cd->nn = 1; /* number of namespaces */ cd->oncs = 0; switch (sc->dataset_management) { case NVME_DATASET_MANAGEMENT_AUTO: if (sc->nvstore.deallocate) cd->oncs |= NVME_ONCS_DSM; break; case NVME_DATASET_MANAGEMENT_ENABLE: cd->oncs |= NVME_ONCS_DSM; break; default: break; } cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK << NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT; cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT; } /* * Calculate the CRC-16 of the given buffer * See copyright attribution at top of file */ static uint16_t crc16(uint16_t crc, const void *buffer, unsigned int len) { const unsigned char *cp = buffer; /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ static uint16_t const crc16_table[256] = { 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 }; while (len--) crc = (((crc >> 8) & 0xffU) ^ crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; return crc; } static void pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, struct nvme_namespace_data *nd) { /* Get capacity and block size information from backing store */ nd->nsze = nvstore->size / nvstore->sectsz; nd->ncap = nd->nsze; nd->nuse = nd->nsze; } static void pci_nvme_init_nsdata(struct pci_nvme_softc *sc, struct nvme_namespace_data *nd, uint32_t nsid, struct pci_nvme_blockstore *nvstore) { pci_nvme_init_nsdata_size(nvstore, nd); if (nvstore->type == NVME_STOR_BLOCKIF) nvstore->deallocate = blockif_candelete(nvstore->ctx); nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ nd->flbas = 0; /* Create an EUI-64 if user did not provide one */ if (nvstore->eui64 == 0) { char *data = NULL; uint64_t eui64 = nvstore->eui64; asprintf(&data, "%s%u%u%u", get_config_value("name"), sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); if (data != NULL) { eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); free(data); } nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); } be64enc(nd->eui64, nvstore->eui64); /* LBA data-sz = 2^lbads */ nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; } static void pci_nvme_init_logpages(struct pci_nvme_softc *sc) { __uint128_t power_cycles = 1; memset(&sc->err_log, 0, sizeof(sc->err_log)); memset(&sc->health_log, 0, sizeof(sc->health_log)); memset(&sc->fw_log, 0, sizeof(sc->fw_log)); memset(&sc->ns_log, 0, sizeof(sc->ns_log)); /* Set read/write remainder to round up according to spec */ sc->read_dunits_remainder = 999; sc->write_dunits_remainder = 999; /* Set nominal Health values checked by implementations */ sc->health_log.temperature = NVME_TEMPERATURE; sc->health_log.available_spare = 100; sc->health_log.available_spare_threshold = 10; /* Set Active Firmware Info to slot 1 */ sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT); memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr, sizeof(sc->fw_log.revision[0])); memcpy(&sc->health_log.power_cycles, &power_cycles, sizeof(sc->health_log.power_cycles)); } static void pci_nvme_init_features(struct pci_nvme_softc *sc) { enum nvme_feature fid; for (fid = 0; fid < NVME_FID_MAX; fid++) { switch (fid) { case NVME_FEAT_ARBITRATION: case NVME_FEAT_POWER_MANAGEMENT: case NVME_FEAT_INTERRUPT_COALESCING: //XXX case NVME_FEAT_WRITE_ATOMICITY: /* Mandatory but no special handling required */ //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: // this returns a data buffer break; case NVME_FEAT_TEMPERATURE_THRESHOLD: sc->feat[fid].set = nvme_feature_temperature; break; case NVME_FEAT_ERROR_RECOVERY: sc->feat[fid].namespace_specific = true; break; case NVME_FEAT_NUMBER_OF_QUEUES: sc->feat[fid].set = nvme_feature_num_queues; break; case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: sc->feat[fid].set = nvme_feature_iv_config; break; case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: sc->feat[fid].set = nvme_feature_async_event; /* Enable all AENs by default */ sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK; break; default: sc->feat[fid].set = nvme_feature_invalid_cb; sc->feat[fid].get = nvme_feature_invalid_cb; } } } static void pci_nvme_aer_reset(struct pci_nvme_softc *sc) { STAILQ_INIT(&sc->aer_list); sc->aer_count = 0; } static void pci_nvme_aer_init(struct pci_nvme_softc *sc) { pthread_mutex_init(&sc->aer_mtx, NULL); pci_nvme_aer_reset(sc); } static void pci_nvme_aer_destroy(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer = NULL; pthread_mutex_lock(&sc->aer_mtx); while (!STAILQ_EMPTY(&sc->aer_list)) { aer = STAILQ_FIRST(&sc->aer_list); STAILQ_REMOVE_HEAD(&sc->aer_list, link); free(aer); } pthread_mutex_unlock(&sc->aer_mtx); pci_nvme_aer_reset(sc); } static bool pci_nvme_aer_available(struct pci_nvme_softc *sc) { return (sc->aer_count != 0); } static bool pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) { struct nvme_controller_data *cd = &sc->ctrldata; /* AERL is a zero based value while aer_count is one's based */ return (sc->aer_count == (cd->aerl + 1U)); } /* * Add an Async Event Request * * Stores an AER to be returned later if the Controller needs to notify the * host of an event. * Note that while the NVMe spec doesn't require Controllers to return AER's * in order, this implementation does preserve the order. */ static int pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) { struct pci_nvme_aer *aer = NULL; aer = calloc(1, sizeof(struct pci_nvme_aer)); if (aer == NULL) return (-1); /* Save the Command ID for use in the completion message */ aer->cid = cid; pthread_mutex_lock(&sc->aer_mtx); sc->aer_count++; STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); pthread_mutex_unlock(&sc->aer_mtx); return (0); } /* * Get an Async Event Request structure * * Returns a pointer to an AER previously submitted by the host or NULL if * no AER's exist. Caller is responsible for freeing the returned struct. */ static struct pci_nvme_aer * pci_nvme_aer_get(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer = NULL; pthread_mutex_lock(&sc->aer_mtx); aer = STAILQ_FIRST(&sc->aer_list); if (aer != NULL) { STAILQ_REMOVE_HEAD(&sc->aer_list, link); sc->aer_count--; } pthread_mutex_unlock(&sc->aer_mtx); return (aer); } static void pci_nvme_aen_reset(struct pci_nvme_softc *sc) { uint32_t atype; memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { sc->aen[atype].atype = atype; } } static void pci_nvme_aen_init(struct pci_nvme_softc *sc) { char nstr[80]; pci_nvme_aen_reset(sc); pthread_mutex_init(&sc->aen_mtx, NULL); pthread_create(&sc->aen_tid, NULL, aen_thr, sc); snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); pthread_set_name_np(sc->aen_tid, nstr); } static void pci_nvme_aen_destroy(struct pci_nvme_softc *sc) { pci_nvme_aen_reset(sc); } /* Notify the AEN thread of pending work */ static void pci_nvme_aen_notify(struct pci_nvme_softc *sc) { pthread_cond_signal(&sc->aen_cond); } /* * Post an Asynchronous Event Notification */ static int32_t pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, uint32_t event_data) { struct pci_nvme_aen *aen; if (atype >= PCI_NVME_AE_TYPE_MAX) { return(EINVAL); } pthread_mutex_lock(&sc->aen_mtx); aen = &sc->aen[atype]; /* Has the controller already posted an event of this type? */ if (aen->posted) { pthread_mutex_unlock(&sc->aen_mtx); return(EALREADY); } aen->event_data = event_data; aen->posted = true; pthread_mutex_unlock(&sc->aen_mtx); pci_nvme_aen_notify(sc); return(0); } static void pci_nvme_aen_process(struct pci_nvme_softc *sc) { struct pci_nvme_aer *aer; struct pci_nvme_aen *aen; pci_nvme_async_type atype; uint32_t mask; uint16_t status; uint8_t lid; assert(pthread_mutex_isowned_np(&sc->aen_mtx)); for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { aen = &sc->aen[atype]; /* Previous iterations may have depleted the available AER's */ if (!pci_nvme_aer_available(sc)) { DPRINTF("%s: no AER", __func__); break; } if (!aen->posted) { DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); continue; } status = NVME_SC_SUCCESS; /* Is the event masked? */ mask = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); switch (atype) { case PCI_NVME_AE_TYPE_ERROR: lid = NVME_LOG_ERROR; break; case PCI_NVME_AE_TYPE_SMART: mask &= 0xff; if ((mask & aen->event_data) == 0) continue; lid = NVME_LOG_HEALTH_INFORMATION; break; case PCI_NVME_AE_TYPE_NOTICE: if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) { EPRINTLN("%s unknown AEN notice type %u", __func__, aen->event_data); status = NVME_SC_INTERNAL_DEVICE_ERROR; lid = 0; break; } if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0) continue; switch (aen->event_data) { case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED: lid = NVME_LOG_CHANGED_NAMESPACE; break; case PCI_NVME_AEI_NOTICE_FW_ACTIVATION: lid = NVME_LOG_FIRMWARE_SLOT; break; case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE: lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; break; case PCI_NVME_AEI_NOTICE_ANA_CHANGE: lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS; break; case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE: lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; break; case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT: lid = NVME_LOG_LBA_STATUS_INFORMATION; break; case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE: lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; break; default: lid = 0; } break; default: /* bad type?!? */ EPRINTLN("%s unknown AEN type %u", __func__, atype); status = NVME_SC_INTERNAL_DEVICE_ERROR; lid = 0; break; } aer = pci_nvme_aer_get(sc); assert(aer != NULL); DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); pci_nvme_cq_update(sc, &sc->compl_queues[0], (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ aer->cid, 0, /* SQID */ status); aen->event_data = 0; aen->posted = false; pci_generate_msix(sc->nsc_pi, 0); } } static void * aen_thr(void *arg) { struct pci_nvme_softc *sc; sc = arg; pthread_mutex_lock(&sc->aen_mtx); for (;;) { pci_nvme_aen_process(sc); pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); } pthread_mutex_unlock(&sc->aen_mtx); pthread_exit(NULL); return (NULL); } static void pci_nvme_reset_locked(struct pci_nvme_softc *sc) { uint32_t i; DPRINTF("%s", __func__); sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | (1 << NVME_CAP_LO_REG_CQR_SHIFT) | (60 << NVME_CAP_LO_REG_TO_SHIFT); sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */ sc->regs.cc = 0; assert(sc->submit_queues != NULL); for (i = 0; i < sc->num_squeues + 1; i++) { sc->submit_queues[i].qbase = NULL; sc->submit_queues[i].size = 0; sc->submit_queues[i].cqid = 0; sc->submit_queues[i].tail = 0; sc->submit_queues[i].head = 0; } assert(sc->compl_queues != NULL); for (i = 0; i < sc->num_cqueues + 1; i++) { sc->compl_queues[i].qbase = NULL; sc->compl_queues[i].size = 0; sc->compl_queues[i].tail = 0; sc->compl_queues[i].head = 0; } sc->num_q_is_set = false; pci_nvme_aer_destroy(sc); pci_nvme_aen_destroy(sc); /* * Clear CSTS.RDY last to prevent the host from enabling Controller * before cleanup completes */ sc->regs.csts = 0; } static void pci_nvme_reset(struct pci_nvme_softc *sc) { pthread_mutex_lock(&sc->mtx); pci_nvme_reset_locked(sc); pthread_mutex_unlock(&sc->mtx); } static int -pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) +pci_nvme_init_controller(struct pci_nvme_softc *sc) { uint16_t acqs, asqs; DPRINTF("%s", __func__); /* * NVMe 2.0 states that "enabling a controller while this field is * cleared to 0h produces undefined results" for both ACQS and * ASQS. If zero, set CFS and do not become ready. */ asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK); if (asqs < 2) { EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__, asqs - 1, sc->regs.aqa); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } sc->submit_queues[0].size = asqs; - sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, - sizeof(struct nvme_command) * asqs); + sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, + sc->regs.asq, sizeof(struct nvme_command) * asqs); if (sc->submit_queues[0].qbase == NULL) { EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__, sc->regs.asq); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", __func__, sc->regs.asq, sc->submit_queues[0].qbase); acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & NVME_AQA_REG_ACQS_MASK); if (acqs < 2) { EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__, acqs - 1, sc->regs.aqa); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } sc->compl_queues[0].size = acqs; - sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, - sizeof(struct nvme_completion) * acqs); + sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, + sc->regs.acq, sizeof(struct nvme_completion) * acqs); if (sc->compl_queues[0].qbase == NULL) { EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__, sc->regs.acq); sc->regs.csts |= NVME_CSTS_CFS; return (-1); } sc->compl_queues[0].intr_en = NVME_CQ_INTEN; DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", __func__, sc->regs.acq, sc->compl_queues[0].qbase); return (0); } static int nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, size_t len, enum nvme_copy_dir dir) { uint8_t *p; size_t bytes; if (len > (8 * 1024)) { return (-1); } /* Copy from the start of prp1 to the end of the physical page */ bytes = PAGE_SIZE - (prp1 & PAGE_MASK); bytes = MIN(bytes, len); p = vm_map_gpa(ctx, prp1, bytes); if (p == NULL) { return (-1); } if (dir == NVME_COPY_TO_PRP) memcpy(p, b, bytes); else memcpy(b, p, bytes); b += bytes; len -= bytes; if (len == 0) { return (0); } len = MIN(len, PAGE_SIZE); p = vm_map_gpa(ctx, prp2, len); if (p == NULL) { return (-1); } if (dir == NVME_COPY_TO_PRP) memcpy(p, b, len); else memcpy(b, p, len); return (0); } /* * Write a Completion Queue Entry update * * Write the completion and update the doorbell value */ static void pci_nvme_cq_update(struct pci_nvme_softc *sc, struct nvme_completion_queue *cq, uint32_t cdw0, uint16_t cid, uint16_t sqid, uint16_t status) { struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; struct nvme_completion *cqe; assert(cq->qbase != NULL); pthread_mutex_lock(&cq->mtx); cqe = &cq->qbase[cq->tail]; /* Flip the phase bit */ status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; cqe->cdw0 = cdw0; cqe->sqhd = sq->head; cqe->sqid = sqid; cqe->cid = cid; cqe->status = status; cq->tail++; if (cq->tail >= cq->size) { cq->tail = 0; } pthread_mutex_unlock(&cq->mtx); } static int nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint16_t qid = command->cdw10 & 0xffff; DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); if (qid == 0 || qid > sc->num_squeues || (sc->submit_queues[qid].qbase == NULL)) { WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", __func__, qid, sc->num_squeues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } sc->submit_queues[qid].qbase = NULL; sc->submit_queues[qid].cqid = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { if (command->cdw11 & NVME_CMD_CDW11_PC) { uint16_t qid = command->cdw10 & 0xffff; struct nvme_submission_queue *nsq; if ((qid == 0) || (qid > sc->num_squeues) || (sc->submit_queues[qid].qbase != NULL)) { WPRINTF("%s queue index %u > num_squeues %u", __func__, qid, sc->num_squeues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } nsq = &sc->submit_queues[qid]; nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { /* * Queues must specify at least two entries * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec */ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); return (1); } nsq->head = nsq->tail = 0; nsq->cqid = (command->cdw11 >> 16) & 0xffff; if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } if (sc->compl_queues[nsq->cqid].qbase == NULL) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_COMPLETION_QUEUE_INVALID); return (1); } nsq->qpriority = (command->cdw11 >> 1) & 0x03; nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(struct nvme_command) * (size_t)nsq->size); DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, qid, nsq->size, nsq->qbase, nsq->cqid); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); DPRINTF("%s completed creating IOSQ qid %u", __func__, qid); } else { /* * Guest sent non-cont submission queue request. * This setting is unsupported by this emulation. */ WPRINTF("%s unsupported non-contig (list-based) " "create i/o submission queue", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } return (1); } static int nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint16_t qid = command->cdw10 & 0xffff; uint16_t sqid; DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); if (qid == 0 || qid > sc->num_cqueues || (sc->compl_queues[qid].qbase == NULL)) { WPRINTF("%s queue index %u / num_cqueues %u", __func__, qid, sc->num_cqueues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } /* Deleting an Active CQ is an error */ for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) if (sc->submit_queues[sqid].cqid == qid) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_DELETION); return (1); } sc->compl_queues[qid].qbase = NULL; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { struct nvme_completion_queue *ncq; uint16_t qid = command->cdw10 & 0xffff; /* Only support Physically Contiguous queues */ if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { WPRINTF("%s unsupported non-contig (list-based) " "create i/o completion queue", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if ((qid == 0) || (qid > sc->num_cqueues) || (sc->compl_queues[qid].qbase != NULL)) { WPRINTF("%s queue index %u > num_cqueues %u", __func__, qid, sc->num_cqueues); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_QUEUE_IDENTIFIER); return (1); } ncq = &sc->compl_queues[qid]; ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; if (ncq->intr_vec > (sc->max_queues + 1)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_INTERRUPT_VECTOR); return (1); } ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { /* * Queues must specify at least two entries * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec */ pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); return (1); } ncq->head = ncq->tail = 0; ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(struct nvme_command) * (size_t)ncq->size); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint64_t logoff; uint32_t logsize; uint8_t logpage; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); /* * Command specifies the number of dwords to return in fields NUMDU * and NUMDL. This is a zero-based value. */ logpage = command->cdw10 & 0xFF; logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; logsize *= sizeof(uint32_t); logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12; DPRINTF("%s log page %u len %u", __func__, logpage, logsize); switch (logpage) { case NVME_LOG_ERROR: if (logoff >= sizeof(sc->err_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->err_log + logoff, MIN(logsize - logoff, sizeof(sc->err_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_HEALTH_INFORMATION: if (logoff >= sizeof(sc->health_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } pthread_mutex_lock(&sc->mtx); memcpy(&sc->health_log.data_units_read, &sc->read_data_units, sizeof(sc->health_log.data_units_read)); memcpy(&sc->health_log.data_units_written, &sc->write_data_units, sizeof(sc->health_log.data_units_written)); memcpy(&sc->health_log.host_read_commands, &sc->read_commands, sizeof(sc->health_log.host_read_commands)); memcpy(&sc->health_log.host_write_commands, &sc->write_commands, sizeof(sc->health_log.host_write_commands)); pthread_mutex_unlock(&sc->mtx); nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->health_log + logoff, MIN(logsize - logoff, sizeof(sc->health_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_FIRMWARE_SLOT: if (logoff >= sizeof(sc->fw_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->fw_log + logoff, MIN(logsize - logoff, sizeof(sc->fw_log)), NVME_COPY_TO_PRP); break; case NVME_LOG_CHANGED_NAMESPACE: if (logoff >= sizeof(sc->ns_log)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->ns_log + logoff, MIN(logsize - logoff, sizeof(sc->ns_log)), NVME_COPY_TO_PRP); memset(&sc->ns_log, 0, sizeof(sc->ns_log)); break; default: DPRINTF("%s get log page %x command not supported", __func__, logpage); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_LOG_PAGE); } return (1); } static int nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { void *dest; uint16_t status; DPRINTF("%s identify 0x%x nsid 0x%x", __func__, command->cdw10 & 0xFF, command->nsid); status = 0; pci_nvme_status_genc(&status, NVME_SC_SUCCESS); switch (command->cdw10 & 0xFF) { case 0x00: /* return Identify Namespace data structure */ /* Global NS only valid with NS Management */ if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); break; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), NVME_COPY_TO_PRP); break; case 0x01: /* return Identify Controller data structure */ nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, command->prp2, (uint8_t *)&sc->ctrldata, sizeof(sc->ctrldata), NVME_COPY_TO_PRP); break; case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint32_t) * 1024); /* All unused entries shall be zero */ memset(dest, 0, sizeof(uint32_t) * 1024); ((uint32_t *)dest)[0] = 1; break; case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ if (command->nsid != 1) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); break; } dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint32_t) * 1024); /* All bytes after the descriptor shall be zero */ memset(dest, 0, sizeof(uint32_t) * 1024); /* Return NIDT=1 (i.e. EUI64) descriptor */ ((uint8_t *)dest)[0] = 1; ((uint8_t *)dest)[1] = sizeof(uint64_t); memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t)); break; case 0x13: /* * Controller list is optional but used by UNH tests. Return * a valid but empty list. */ dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(uint16_t) * 2048); memset(dest, 0, sizeof(uint16_t) * 2048); break; default: DPRINTF("%s unsupported identify command requested 0x%x", __func__, command->cdw10 & 0xFF); pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); break; } compl->status = status; return (1); } static const char * nvme_fid_to_name(uint8_t fid) { const char *name; switch (fid) { case NVME_FEAT_ARBITRATION: name = "Arbitration"; break; case NVME_FEAT_POWER_MANAGEMENT: name = "Power Management"; break; case NVME_FEAT_LBA_RANGE_TYPE: name = "LBA Range Type"; break; case NVME_FEAT_TEMPERATURE_THRESHOLD: name = "Temperature Threshold"; break; case NVME_FEAT_ERROR_RECOVERY: name = "Error Recovery"; break; case NVME_FEAT_VOLATILE_WRITE_CACHE: name = "Volatile Write Cache"; break; case NVME_FEAT_NUMBER_OF_QUEUES: name = "Number of Queues"; break; case NVME_FEAT_INTERRUPT_COALESCING: name = "Interrupt Coalescing"; break; case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: name = "Interrupt Vector Configuration"; break; case NVME_FEAT_WRITE_ATOMICITY: name = "Write Atomicity Normal"; break; case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: name = "Asynchronous Event Configuration"; break; case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: name = "Autonomous Power State Transition"; break; case NVME_FEAT_HOST_MEMORY_BUFFER: name = "Host Memory Buffer"; break; case NVME_FEAT_TIMESTAMP: name = "Timestamp"; break; case NVME_FEAT_KEEP_ALIVE_TIMER: name = "Keep Alive Timer"; break; case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: name = "Host Controlled Thermal Management"; break; case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: name = "Non-Operation Power State Config"; break; case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: name = "Read Recovery Level Config"; break; case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: name = "Predictable Latency Mode Config"; break; case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: name = "Predictable Latency Mode Window"; break; case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: name = "LBA Status Information Report Interval"; break; case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: name = "Host Behavior Support"; break; case NVME_FEAT_SANITIZE_CONFIG: name = "Sanitize Config"; break; case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: name = "Endurance Group Event Configuration"; break; case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: name = "Software Progress Marker"; break; case NVME_FEAT_HOST_IDENTIFIER: name = "Host Identifier"; break; case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: name = "Reservation Notification Mask"; break; case NVME_FEAT_RESERVATION_PERSISTENCE: name = "Reservation Persistence"; break; case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: name = "Namespace Write Protection Config"; break; default: name = "Unknown"; break; } return (name); } static void nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused, struct nvme_feature_obj *feat __unused, struct nvme_command *command __unused, struct nvme_completion *compl) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } static void nvme_feature_iv_config(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { uint32_t i; uint32_t cdw11 = command->cdw11; uint16_t iv; bool cd; pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); iv = cdw11 & 0xffff; cd = cdw11 & (1 << 16); if (iv > (sc->max_queues + 1)) { return; } /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ if ((iv == 0) && !cd) return; /* Requested Interrupt Vector must be used by a CQ */ for (i = 0; i < sc->num_cqueues + 1; i++) { if (sc->compl_queues[i].intr_vec == iv) { pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); } } } #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000) static void nvme_feature_async_event(struct pci_nvme_softc *sc __unused, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP) pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); } #define NVME_TEMP_THRESH_OVER 0 #define NVME_TEMP_THRESH_UNDER 1 static void nvme_feature_temperature(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { uint16_t tmpth; /* Temperature Threshold */ uint8_t tmpsel; /* Threshold Temperature Select */ uint8_t thsel; /* Threshold Type Select */ bool set_crit = false; bool report_crit; tmpth = command->cdw11 & 0xffff; tmpsel = (command->cdw11 >> 16) & 0xf; thsel = (command->cdw11 >> 20) & 0x3; DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel); /* Check for unsupported values */ if (((tmpsel != 0) && (tmpsel != 0xf)) || (thsel > NVME_TEMP_THRESH_UNDER)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) || ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth))) set_crit = true; pthread_mutex_lock(&sc->mtx); if (set_crit) sc->health_log.critical_warning |= NVME_CRIT_WARN_ST_TEMPERATURE; else sc->health_log.critical_warning &= ~NVME_CRIT_WARN_ST_TEMPERATURE; pthread_mutex_unlock(&sc->mtx); report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 & NVME_CRIT_WARN_ST_TEMPERATURE; if (set_crit && report_crit) pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, sc->health_log.critical_warning); DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status); } static void nvme_feature_num_queues(struct pci_nvme_softc *sc, struct nvme_feature_obj *feat __unused, struct nvme_command *command, struct nvme_completion *compl) { uint16_t nqr; /* Number of Queues Requested */ if (sc->num_q_is_set) { WPRINTF("%s: Number of Queues already set", __func__); pci_nvme_status_genc(&compl->status, NVME_SC_COMMAND_SEQUENCE_ERROR); return; } nqr = command->cdw11 & 0xFFFF; if (nqr == 0xffff) { WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } sc->num_squeues = ONE_BASED(nqr); if (sc->num_squeues > sc->max_queues) { DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, sc->max_queues); sc->num_squeues = sc->max_queues; } nqr = (command->cdw11 >> 16) & 0xFFFF; if (nqr == 0xffff) { WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return; } sc->num_cqueues = ONE_BASED(nqr); if (sc->num_cqueues > sc->max_queues) { DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, sc->max_queues); sc->num_cqueues = sc->max_queues; } /* Patch the command value which will be saved on callback's return */ command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); sc->num_q_is_set = true; } static int nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, struct nvme_completion *compl) { struct nvme_feature_obj *feat; uint32_t nsid = command->nsid; uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10); bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10); DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); if (fid >= NVME_FID_MAX) { DPRINTF("%s invalid feature 0x%x", __func__, fid); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if (sv) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_FEATURE_NOT_SAVEABLE); return (1); } feat = &sc->feat[fid]; if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if (!feat->namespace_specific && !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_FEATURE_NOT_NS_SPECIFIC); return (1); } compl->cdw0 = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); if (feat->set) feat->set(sc, feat, command, compl); else { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_FEATURE_NOT_CHANGEABLE); return (1); } DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); if (compl->status == NVME_SC_SUCCESS) { feat->cdw11 = command->cdw11; if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && (command->cdw11 != 0)) pci_nvme_aen_notify(sc); } return (0); } #define NVME_FEATURES_SEL_SUPPORTED 0x3 #define NVME_FEATURES_NS_SPECIFIC (1 << 1) static int nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { struct nvme_feature_obj *feat; uint8_t fid = command->cdw10 & 0xFF; uint8_t sel = (command->cdw10 >> 8) & 0x7; DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); if (fid >= NVME_FID_MAX) { DPRINTF("%s invalid feature 0x%x", __func__, fid); pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } compl->cdw0 = 0; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); feat = &sc->feat[fid]; if (feat->get) { feat->get(sc, feat, command, compl); } if (compl->status == NVME_SC_SUCCESS) { if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific) compl->cdw0 = NVME_FEATURES_NS_SPECIFIC; else compl->cdw0 = feat->cdw11; } return (0); } static int nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { uint8_t ses, lbaf, pi; /* Only supports Secure Erase Setting - User Data Erase */ ses = (command->cdw10 >> 9) & 0x7; if (ses > 0x1) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } /* Only supports a single LBA Format */ lbaf = command->cdw10 & 0xf; if (lbaf != 0) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_FORMAT); return (1); } /* Doesn't support Protection Infomation */ pi = (command->cdw10 >> 5) & 0x7; if (pi != 0) { pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); return (1); } if (sc->nvstore.type == NVME_STOR_RAM) { if (sc->nvstore.ctx) free(sc->nvstore.ctx); sc->nvstore.ctx = calloc(1, sc->nvstore.size); pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); } else { struct pci_nvme_ioreq *req; int err; req = pci_nvme_get_ioreq(sc); if (req == NULL) { pci_nvme_status_genc(&compl->status, NVME_SC_INTERNAL_DEVICE_ERROR); WPRINTF("%s: unable to allocate IO req", __func__); return (1); } req->nvme_sq = &sc->submit_queues[0]; req->sqid = 0; req->opc = command->opc; req->cid = command->cid; req->nsid = command->nsid; req->io_req.br_offset = 0; req->io_req.br_resid = sc->nvstore.size; req->io_req.br_callback = pci_nvme_io_done; err = blockif_delete(sc->nvstore.ctx, &req->io_req); if (err) { pci_nvme_status_genc(&compl->status, NVME_SC_INTERNAL_DEVICE_ERROR); pci_nvme_release_ioreq(sc, req); } else compl->status = NVME_NO_STATUS; } return (1); } static int nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command, struct nvme_completion *compl) { DPRINTF("%s submission queue %u, command ID 0x%x", __func__, command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); /* TODO: search for the command ID and abort it */ compl->cdw0 = 1; pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); return (1); } static int nvme_opc_async_event_req(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, sc->aer_count, sc->ctrldata.aerl, command->cid); /* Don't exceed the Async Event Request Limit (AERL). */ if (pci_nvme_aer_limit_reached(sc)) { pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); return (1); } if (pci_nvme_aer_add(sc, command->cid)) { pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, NVME_SC_INTERNAL_DEVICE_ERROR); return (1); } /* * Raise events when they happen based on the Set Features cmd. * These events happen async, so only set completion successful if * there is an event reflective of the request to get event. */ compl->status = NVME_NO_STATUS; pci_nvme_aen_notify(sc); return (0); } static void pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) { struct nvme_completion compl; struct nvme_command *cmd; struct nvme_submission_queue *sq; struct nvme_completion_queue *cq; uint16_t sqhead; DPRINTF("%s index %u", __func__, (uint32_t)value); sq = &sc->submit_queues[0]; cq = &sc->compl_queues[0]; pthread_mutex_lock(&sq->mtx); sqhead = sq->head; DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); while (sqhead != atomic_load_acq_short(&sq->tail)) { cmd = &(sq->qbase)[sqhead]; compl.cdw0 = 0; compl.status = 0; switch (cmd->opc) { case NVME_OPC_DELETE_IO_SQ: DPRINTF("%s command DELETE_IO_SQ", __func__); nvme_opc_delete_io_sq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_SQ: DPRINTF("%s command CREATE_IO_SQ", __func__); nvme_opc_create_io_sq(sc, cmd, &compl); break; case NVME_OPC_DELETE_IO_CQ: DPRINTF("%s command DELETE_IO_CQ", __func__); nvme_opc_delete_io_cq(sc, cmd, &compl); break; case NVME_OPC_CREATE_IO_CQ: DPRINTF("%s command CREATE_IO_CQ", __func__); nvme_opc_create_io_cq(sc, cmd, &compl); break; case NVME_OPC_GET_LOG_PAGE: DPRINTF("%s command GET_LOG_PAGE", __func__); nvme_opc_get_log_page(sc, cmd, &compl); break; case NVME_OPC_IDENTIFY: DPRINTF("%s command IDENTIFY", __func__); nvme_opc_identify(sc, cmd, &compl); break; case NVME_OPC_ABORT: DPRINTF("%s command ABORT", __func__); nvme_opc_abort(sc, cmd, &compl); break; case NVME_OPC_SET_FEATURES: DPRINTF("%s command SET_FEATURES", __func__); nvme_opc_set_features(sc, cmd, &compl); break; case NVME_OPC_GET_FEATURES: DPRINTF("%s command GET_FEATURES", __func__); nvme_opc_get_features(sc, cmd, &compl); break; case NVME_OPC_FIRMWARE_ACTIVATE: DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); pci_nvme_status_tc(&compl.status, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_INVALID_FIRMWARE_SLOT); break; case NVME_OPC_ASYNC_EVENT_REQUEST: DPRINTF("%s command ASYNC_EVENT_REQ", __func__); nvme_opc_async_event_req(sc, cmd, &compl); break; case NVME_OPC_FORMAT_NVM: DPRINTF("%s command FORMAT_NVM", __func__); if ((sc->ctrldata.oacs & (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); break; } nvme_opc_format_nvm(sc, cmd, &compl); break; case NVME_OPC_SECURITY_SEND: case NVME_OPC_SECURITY_RECEIVE: case NVME_OPC_SANITIZE: case NVME_OPC_GET_LBA_STATUS: DPRINTF("%s command OPC=%#x (unsupported)", __func__, cmd->opc); /* Valid but unsupported opcodes */ pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); break; default: DPRINTF("%s command OPC=%#X (not implemented)", __func__, cmd->opc); pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); } sqhead = (sqhead + 1) % sq->size; if (NVME_COMPLETION_VALID(compl)) { pci_nvme_cq_update(sc, &sc->compl_queues[0], compl.cdw0, cmd->cid, 0, /* SQID */ compl.status); } } DPRINTF("setting sqhead %u", sqhead); sq->head = sqhead; if (cq->head != cq->tail) pci_generate_msix(sc->nsc_pi, 0); pthread_mutex_unlock(&sq->mtx); } /* * Update the Write and Read statistics reported in SMART data * * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999. */ static void pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, size_t bytes, uint16_t status) { pthread_mutex_lock(&sc->mtx); switch (opc) { case NVME_OPC_WRITE: sc->write_commands++; if (status != NVME_SC_SUCCESS) break; sc->write_dunits_remainder += (bytes / 512); while (sc->write_dunits_remainder >= 1000) { sc->write_data_units++; sc->write_dunits_remainder -= 1000; } break; case NVME_OPC_READ: sc->read_commands++; if (status != NVME_SC_SUCCESS) break; sc->read_dunits_remainder += (bytes / 512); while (sc->read_dunits_remainder >= 1000) { sc->read_data_units++; sc->read_dunits_remainder -= 1000; } break; default: DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); break; } pthread_mutex_unlock(&sc->mtx); } /* * Check if the combination of Starting LBA (slba) and number of blocks * exceeds the range of the underlying storage. * * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores * the capacity in bytes as a uint64_t, care must be taken to avoid integer * overflow. */ static bool pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, uint32_t nblocks) { size_t offset, bytes; /* Overflow check of multiplying Starting LBA by the sector size */ if (slba >> (64 - nvstore->sectsz_bits)) return (true); offset = slba << nvstore->sectsz_bits; bytes = nblocks << nvstore->sectsz_bits; /* Overflow check of Number of Logical Blocks */ if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) return (true); return (false); } static int pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) { int iovidx; bool range_is_contiguous; if (req == NULL) return (-1); if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { return (-1); } /* * Minimize the number of IOVs by concatenating contiguous address * ranges. If the IOV count is zero, there is no previous range to * concatenate. */ if (req->io_req.br_iovcnt == 0) range_is_contiguous = false; else range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; if (range_is_contiguous) { iovidx = req->io_req.br_iovcnt - 1; req->io_req.br_iov[iovidx].iov_base = paddr_guest2host(req->sc->nsc_pi->pi_vmctx, req->prev_gpaddr, size); if (req->io_req.br_iov[iovidx].iov_base == NULL) return (-1); req->prev_size += size; req->io_req.br_resid += size; req->io_req.br_iov[iovidx].iov_len = req->prev_size; } else { iovidx = req->io_req.br_iovcnt; if (iovidx == 0) { req->io_req.br_offset = offset; req->io_req.br_resid = 0; req->io_req.br_param = req; } req->io_req.br_iov[iovidx].iov_base = paddr_guest2host(req->sc->nsc_pi->pi_vmctx, gpaddr, size); if (req->io_req.br_iov[iovidx].iov_base == NULL) return (-1); req->io_req.br_iov[iovidx].iov_len = size; req->prev_gpaddr = gpaddr; req->prev_size = size; req->io_req.br_resid += size; req->io_req.br_iovcnt++; } return (0); } static void pci_nvme_set_completion(struct pci_nvme_softc *sc, struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) { struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), NVME_STATUS_GET_SC(status)); pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); if (cq->head != cq->tail) { if (cq->intr_en & NVME_CQ_INTEN) { pci_generate_msix(sc->nsc_pi, cq->intr_vec); } else { DPRINTF("%s: CQ%u interrupt disabled", __func__, sq->cqid); } } } static void pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) { req->sc = NULL; req->nvme_sq = NULL; req->sqid = 0; pthread_mutex_lock(&sc->mtx); STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); sc->pending_ios--; /* when no more IO pending, can set to ready if device reset/enabled */ if (sc->pending_ios == 0 && NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) sc->regs.csts |= NVME_CSTS_RDY; pthread_mutex_unlock(&sc->mtx); sem_post(&sc->iosemlock); } static struct pci_nvme_ioreq * pci_nvme_get_ioreq(struct pci_nvme_softc *sc) { struct pci_nvme_ioreq *req = NULL; sem_wait(&sc->iosemlock); pthread_mutex_lock(&sc->mtx); req = STAILQ_FIRST(&sc->ioreqs_free); assert(req != NULL); STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); req->sc = sc; sc->pending_ios++; pthread_mutex_unlock(&sc->mtx); req->io_req.br_iovcnt = 0; req->io_req.br_offset = 0; req->io_req.br_resid = 0; req->io_req.br_param = req; req->prev_gpaddr = 0; req->prev_size = 0; return req; } static void pci_nvme_io_done(struct blockif_req *br, int err) { struct pci_nvme_ioreq *req = br->br_param; struct nvme_submission_queue *sq = req->nvme_sq; uint16_t code, status; DPRINTF("%s error %d %s", __func__, err, strerror(err)); /* TODO return correct error */ code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; status = 0; pci_nvme_status_genc(&status, code); pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); pci_nvme_stats_write_read_update(req->sc, req->opc, req->bytes, status); pci_nvme_release_ioreq(req->sc, req); } /* * Implements the Flush command. The specification states: * If a volatile write cache is not present, Flush commands complete * successfully and have no effect * in the description of the Volatile Write Cache (VWC) field of the Identify * Controller data. Therefore, set status to Success if the command is * not supported (i.e. RAM or as indicated by the blockif). */ static bool nvme_opc_flush(struct pci_nvme_softc *sc __unused, struct nvme_command *cmd __unused, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { bool pending = false; if (nvstore->type == NVME_STOR_RAM) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); } else { int err; req->io_req.br_callback = pci_nvme_io_done; err = blockif_flush(nvstore->ctx, &req->io_req); switch (err) { case 0: pending = true; break; case EOPNOTSUPP: pci_nvme_status_genc(status, NVME_SC_SUCCESS); break; default: pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); } } return (pending); } static uint16_t nvme_write_read_ram(struct pci_nvme_softc *sc, struct pci_nvme_blockstore *nvstore, uint64_t prp1, uint64_t prp2, size_t offset, uint64_t bytes, bool is_write) { uint8_t *buf = nvstore->ctx; enum nvme_copy_dir dir; uint16_t status; if (is_write) dir = NVME_COPY_TO_PRP; else dir = NVME_COPY_FROM_PRP; status = 0; if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, buf + offset, bytes, dir)) pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); else pci_nvme_status_genc(&status, NVME_SC_SUCCESS); return (status); } static uint16_t nvme_write_read_blockif(struct pci_nvme_softc *sc, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint64_t prp1, uint64_t prp2, size_t offset, uint64_t bytes, bool is_write) { uint64_t size; int err; uint16_t status = NVME_NO_STATUS; size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { err = -1; goto out; } offset += size; bytes -= size; if (bytes == 0) { ; } else if (bytes <= PAGE_SIZE) { size = bytes; if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { err = -1; goto out; } } else { void *vmctx = sc->nsc_pi->pi_vmctx; uint64_t *prp_list = &prp2; uint64_t *last = prp_list; /* PRP2 is pointer to a physical region page list */ while (bytes) { /* Last entry in list points to the next list */ if ((prp_list == last) && (bytes > PAGE_SIZE)) { uint64_t prp = *prp_list; prp_list = paddr_guest2host(vmctx, prp, PAGE_SIZE - (prp % PAGE_SIZE)); if (prp_list == NULL) { err = -1; goto out; } last = prp_list + (NVME_PRP2_ITEMS - 1); } size = MIN(bytes, PAGE_SIZE); if (pci_nvme_append_iov_req(sc, req, *prp_list, size, offset)) { err = -1; goto out; } offset += size; bytes -= size; prp_list++; } } req->io_req.br_callback = pci_nvme_io_done; if (is_write) err = blockif_write(nvstore->ctx, &req->io_req); else err = blockif_read(nvstore->ctx, &req->io_req); out: if (err) pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); return (status); } static bool nvme_opc_write_read(struct pci_nvme_softc *sc, struct nvme_command *cmd, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { uint64_t lba, nblocks, bytes; size_t offset; bool is_write = cmd->opc == NVME_OPC_WRITE; bool pending = false; lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; nblocks = (cmd->cdw12 & 0xFFFF) + 1; bytes = nblocks << nvstore->sectsz_bits; if (bytes > NVME_MAX_DATA_SIZE) { WPRINTF("%s command would exceed MDTS", __func__); pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); goto out; } if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", __func__, lba, nblocks); pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } offset = lba << nvstore->sectsz_bits; req->bytes = bytes; req->io_req.br_offset = lba; /* PRP bits 1:0 must be zero */ cmd->prp1 &= ~0x3UL; cmd->prp2 &= ~0x3UL; if (nvstore->type == NVME_STOR_RAM) { *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, cmd->prp2, offset, bytes, is_write); } else { *status = nvme_write_read_blockif(sc, nvstore, req, cmd->prp1, cmd->prp2, offset, bytes, is_write); if (*status == NVME_NO_STATUS) pending = true; } out: if (!pending) pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); return (pending); } static void pci_nvme_dealloc_sm(struct blockif_req *br, int err) { struct pci_nvme_ioreq *req = br->br_param; struct pci_nvme_softc *sc = req->sc; bool done = true; uint16_t status; status = 0; if (err) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { pci_nvme_status_genc(&status, NVME_SC_SUCCESS); } else { struct iovec *iov = req->io_req.br_iov; req->prev_gpaddr++; iov += req->prev_gpaddr; /* The iov_* values already include the sector size */ req->io_req.br_offset = (off_t)iov->iov_base; req->io_req.br_resid = iov->iov_len; if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); } else done = false; } if (done) { pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, status); pci_nvme_release_ioreq(sc, req); } } static bool nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, struct nvme_command *cmd, struct pci_nvme_blockstore *nvstore, struct pci_nvme_ioreq *req, uint16_t *status) { struct nvme_dsm_range *range = NULL; uint32_t nr, r, non_zero, dr; int err; bool pending = false; if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); goto out; } nr = cmd->cdw10 & 0xff; /* copy locally because a range entry could straddle PRPs */ range = calloc(1, NVME_MAX_DSM_TRIM); if (range == NULL) { pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); goto out; } nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); /* Check for invalid ranges and the number of non-zero lengths */ non_zero = 0; for (r = 0; r <= nr; r++) { if (pci_nvme_out_of_range(nvstore, range[r].starting_lba, range[r].length)) { pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } if (range[r].length != 0) non_zero++; } if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { size_t offset, bytes; int sectsz_bits = sc->nvstore.sectsz_bits; /* * DSM calls are advisory only, and compliant controllers * may choose to take no actions (i.e. return Success). */ if (!nvstore->deallocate) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); goto out; } /* If all ranges have a zero length, return Success */ if (non_zero == 0) { pci_nvme_status_genc(status, NVME_SC_SUCCESS); goto out; } if (req == NULL) { pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); goto out; } offset = range[0].starting_lba << sectsz_bits; bytes = range[0].length << sectsz_bits; /* * If the request is for more than a single range, store * the ranges in the br_iov. Optimize for the common case * of a single range. * * Note that NVMe Number of Ranges is a zero based value */ req->io_req.br_iovcnt = 0; req->io_req.br_offset = offset; req->io_req.br_resid = bytes; if (nr == 0) { req->io_req.br_callback = pci_nvme_io_done; } else { struct iovec *iov = req->io_req.br_iov; for (r = 0, dr = 0; r <= nr; r++) { offset = range[r].starting_lba << sectsz_bits; bytes = range[r].length << sectsz_bits; if (bytes == 0) continue; if ((nvstore->size - offset) < bytes) { pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); goto out; } iov[dr].iov_base = (void *)offset; iov[dr].iov_len = bytes; dr++; } req->io_req.br_callback = pci_nvme_dealloc_sm; /* * Use prev_gpaddr to track the current entry and * prev_size to track the number of entries */ req->prev_gpaddr = 0; req->prev_size = dr; } err = blockif_delete(nvstore->ctx, &req->io_req); if (err) pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); else pending = true; } out: free(range); return (pending); } static void pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) { struct nvme_submission_queue *sq; uint16_t status; uint16_t sqhead; /* handle all submissions up to sq->tail index */ sq = &sc->submit_queues[idx]; pthread_mutex_lock(&sq->mtx); sqhead = sq->head; DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", idx, sqhead, sq->tail, sq->qbase); while (sqhead != atomic_load_acq_short(&sq->tail)) { struct nvme_command *cmd; struct pci_nvme_ioreq *req; uint32_t nsid; bool pending; pending = false; req = NULL; status = 0; cmd = &sq->qbase[sqhead]; sqhead = (sqhead + 1) % sq->size; nsid = le32toh(cmd->nsid); if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { pci_nvme_status_genc(&status, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); status |= NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; goto complete; } req = pci_nvme_get_ioreq(sc); if (req == NULL) { pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); WPRINTF("%s: unable to allocate IO req", __func__); goto complete; } req->nvme_sq = sq; req->sqid = idx; req->opc = cmd->opc; req->cid = cmd->cid; req->nsid = cmd->nsid; switch (cmd->opc) { case NVME_OPC_FLUSH: pending = nvme_opc_flush(sc, cmd, &sc->nvstore, req, &status); break; case NVME_OPC_WRITE: case NVME_OPC_READ: pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, req, &status); break; case NVME_OPC_WRITE_ZEROES: /* TODO: write zeroes WPRINTF("%s write zeroes lba 0x%lx blocks %u", __func__, lba, cmd->cdw12 & 0xFFFF); */ pci_nvme_status_genc(&status, NVME_SC_SUCCESS); break; case NVME_OPC_DATASET_MANAGEMENT: pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, req, &status); break; default: WPRINTF("%s unhandled io command 0x%x", __func__, cmd->opc); pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); } complete: if (!pending) { pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); if (req != NULL) pci_nvme_release_ioreq(sc, req); } } sq->head = sqhead; pthread_mutex_unlock(&sq->mtx); } static void pci_nvme_handle_doorbell(struct pci_nvme_softc* sc, uint64_t idx, int is_sq, uint64_t value) { DPRINTF("nvme doorbell %lu, %s, val 0x%lx", idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); if (is_sq) { if (idx > sc->num_squeues) { WPRINTF("%s queue index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_squeues); return; } atomic_store_short(&sc->submit_queues[idx].tail, (uint16_t)value); if (idx == 0) { pci_nvme_handle_admin_cmd(sc, value); } else { /* submission queue; handle new entries in SQ */ if (idx > sc->num_squeues) { WPRINTF("%s SQ index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_squeues); return; } pci_nvme_handle_io_cmd(sc, (uint16_t)idx); } } else { if (idx > sc->num_cqueues) { WPRINTF("%s queue index %lu overflow from " "guest (max %u)", __func__, idx, sc->num_cqueues); return; } atomic_store_short(&sc->compl_queues[idx].head, (uint16_t)value); } } static void pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) { const char *s = iswrite ? "WRITE" : "READ"; switch (offset) { case NVME_CR_CAP_LOW: DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); break; case NVME_CR_CAP_HI: DPRINTF("%s %s NVME_CR_CAP_HI", func, s); break; case NVME_CR_VS: DPRINTF("%s %s NVME_CR_VS", func, s); break; case NVME_CR_INTMS: DPRINTF("%s %s NVME_CR_INTMS", func, s); break; case NVME_CR_INTMC: DPRINTF("%s %s NVME_CR_INTMC", func, s); break; case NVME_CR_CC: DPRINTF("%s %s NVME_CR_CC", func, s); break; case NVME_CR_CSTS: DPRINTF("%s %s NVME_CR_CSTS", func, s); break; case NVME_CR_NSSR: DPRINTF("%s %s NVME_CR_NSSR", func, s); break; case NVME_CR_AQA: DPRINTF("%s %s NVME_CR_AQA", func, s); break; case NVME_CR_ASQ_LOW: DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); break; case NVME_CR_ASQ_HI: DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); break; case NVME_CR_ACQ_LOW: DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); break; case NVME_CR_ACQ_HI: DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); break; default: DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); } } static void -pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, - uint64_t offset, int size, uint64_t value) +pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size, + uint64_t value) { uint32_t ccreg; if (offset >= NVME_DOORBELL_OFFSET) { uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; uint64_t idx = belloffset / 8; /* door bell size = 2*int */ int is_sq = (belloffset % 8) < 4; if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", offset); return; } if (belloffset > ((sc->max_queues+1) * 8 - 4)) { WPRINTF("guest attempted an overflow write offset " "0x%lx, val 0x%lx in %s", offset, value, __func__); return; } if (is_sq) { if (sc->submit_queues[idx].qbase == NULL) return; } else if (sc->compl_queues[idx].qbase == NULL) return; pci_nvme_handle_doorbell(sc, idx, is_sq, value); return; } DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", offset, size, value); if (size != 4) { WPRINTF("guest wrote invalid size %d (offset 0x%lx, " "val 0x%lx) to bar0 in %s", size, offset, value, __func__); /* TODO: shutdown device */ return; } pci_nvme_bar0_reg_dumps(__func__, offset, 1); pthread_mutex_lock(&sc->mtx); switch (offset) { case NVME_CR_CAP_LOW: case NVME_CR_CAP_HI: /* readonly */ break; case NVME_CR_VS: /* readonly */ break; case NVME_CR_INTMS: /* MSI-X, so ignore */ break; case NVME_CR_INTMC: /* MSI-X, so ignore */ break; case NVME_CR_CC: ccreg = (uint32_t)value; DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " "iocqes %u", __func__, NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), NVME_CC_GET_IOCQES(ccreg)); if (NVME_CC_GET_SHN(ccreg)) { /* perform shutdown - flush out data to backend */ sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << NVME_CSTS_REG_SHST_SHIFT); sc->regs.csts |= NVME_SHST_COMPLETE << NVME_CSTS_REG_SHST_SHIFT; } if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { if (NVME_CC_GET_EN(ccreg) == 0) /* transition 1-> causes controller reset */ pci_nvme_reset_locked(sc); else - pci_nvme_init_controller(ctx, sc); + pci_nvme_init_controller(sc); } /* Insert the iocqes, iosqes and en bits from the write */ sc->regs.cc &= ~NVME_CC_WRITE_MASK; sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; if (NVME_CC_GET_EN(ccreg) == 0) { /* Insert the ams, mps and css bit fields */ sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; sc->regs.csts &= ~NVME_CSTS_RDY; } else if ((sc->pending_ios == 0) && !(sc->regs.csts & NVME_CSTS_CFS)) { sc->regs.csts |= NVME_CSTS_RDY; } break; case NVME_CR_CSTS: break; case NVME_CR_NSSR: /* ignore writes; don't support subsystem reset */ break; case NVME_CR_AQA: sc->regs.aqa = (uint32_t)value; break; case NVME_CR_ASQ_LOW: sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | (0xFFFFF000 & value); break; case NVME_CR_ASQ_HI: sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | (value << 32); break; case NVME_CR_ACQ_LOW: sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | (0xFFFFF000 & value); break; case NVME_CR_ACQ_HI: sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | (value << 32); break; default: DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", __func__, offset, value, size); } pthread_mutex_unlock(&sc->mtx); } static void -pci_nvme_write(struct vmctx *ctx, struct pci_devinst *pi, - int baridx, uint64_t offset, int size, uint64_t value) +pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, + uint64_t value) { struct pci_nvme_softc* sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " " value 0x%lx", baridx, offset, size, value); pci_emul_msix_twrite(pi, offset, size, value); return; } switch (baridx) { case 0: - pci_nvme_write_bar_0(ctx, sc, offset, size, value); + pci_nvme_write_bar_0(sc, offset, size, value); break; default: DPRINTF("%s unknown baridx %d, val 0x%lx", __func__, baridx, value); } } static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size) { uint64_t value; pci_nvme_bar0_reg_dumps(__func__, offset, 0); if (offset < NVME_DOORBELL_OFFSET) { void *p = &(sc->regs); pthread_mutex_lock(&sc->mtx); memcpy(&value, (void *)((uintptr_t)p + offset), size); pthread_mutex_unlock(&sc->mtx); } else { value = 0; WPRINTF("pci_nvme: read invalid offset %ld", offset); } switch (size) { case 1: value &= 0xFF; break; case 2: value &= 0xFFFF; break; case 4: value &= 0xFFFFFFFF; break; } DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", offset, size, (uint32_t)value); return (value); } static uint64_t -pci_nvme_read(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size) +pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_nvme_softc* sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", baridx, offset, size); return pci_emul_msix_tread(pi, offset, size); } switch (baridx) { case 0: return pci_nvme_read_bar_0(sc, offset, size); default: DPRINTF("unknown bar %d, 0x%lx", baridx, offset); } return (0); } static int pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) { char bident[sizeof("XXX:XXX")]; const char *value; uint32_t sectsz; sc->max_queues = NVME_QUEUES; sc->max_qentries = NVME_MAX_QENTRIES; sc->ioslots = NVME_IOSLOTS; sc->num_squeues = sc->max_queues; sc->num_cqueues = sc->max_queues; sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; sectsz = 0; snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); value = get_config_value_node(nvl, "maxq"); if (value != NULL) sc->max_queues = atoi(value); value = get_config_value_node(nvl, "qsz"); if (value != NULL) { sc->max_qentries = atoi(value); if (sc->max_qentries <= 0) { EPRINTLN("nvme: Invalid qsz option %d", sc->max_qentries); return (-1); } } value = get_config_value_node(nvl, "ioslots"); if (value != NULL) { sc->ioslots = atoi(value); if (sc->ioslots <= 0) { EPRINTLN("Invalid ioslots option %d", sc->ioslots); return (-1); } } value = get_config_value_node(nvl, "sectsz"); if (value != NULL) sectsz = atoi(value); value = get_config_value_node(nvl, "ser"); if (value != NULL) { /* * This field indicates the Product Serial Number in * 7-bit ASCII, unused bytes should be space characters. * Ref: NVMe v1.3c. */ cpywithpad((char *)sc->ctrldata.sn, sizeof(sc->ctrldata.sn), value, ' '); } value = get_config_value_node(nvl, "eui64"); if (value != NULL) sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); value = get_config_value_node(nvl, "dsm"); if (value != NULL) { if (strcmp(value, "auto") == 0) sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; else if (strcmp(value, "enable") == 0) sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; else if (strcmp(value, "disable") == 0) sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; } value = get_config_value_node(nvl, "ram"); if (value != NULL) { uint64_t sz = strtoull(value, NULL, 10); sc->nvstore.type = NVME_STOR_RAM; sc->nvstore.size = sz * 1024 * 1024; sc->nvstore.ctx = calloc(1, sc->nvstore.size); sc->nvstore.sectsz = 4096; sc->nvstore.sectsz_bits = 12; if (sc->nvstore.ctx == NULL) { EPRINTLN("nvme: Unable to allocate RAM"); return (-1); } } else { snprintf(bident, sizeof(bident), "%u:%u", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); sc->nvstore.ctx = blockif_open(nvl, bident); if (sc->nvstore.ctx == NULL) { EPRINTLN("nvme: Could not open backing file: %s", strerror(errno)); return (-1); } sc->nvstore.type = NVME_STOR_BLOCKIF; sc->nvstore.size = blockif_size(sc->nvstore.ctx); } if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) sc->nvstore.sectsz = sectsz; else if (sc->nvstore.type != NVME_STOR_RAM) sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); for (sc->nvstore.sectsz_bits = 9; (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; sc->nvstore.sectsz_bits++); if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) sc->max_queues = NVME_QUEUES; return (0); } static void pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, size_t new_size) { struct pci_nvme_softc *sc; struct pci_nvme_blockstore *nvstore; struct nvme_namespace_data *nd; sc = arg; nvstore = &sc->nvstore; nd = &sc->nsdata; nvstore->size = new_size; pci_nvme_init_nsdata_size(nvstore, nd); /* Add changed NSID to list */ sc->ns_log.ns[0] = 1; sc->ns_log.ns[1] = 0; pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); } static int -pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) +pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl) { struct pci_nvme_softc *sc; uint32_t pci_membar_sz; int error; error = 0; sc = calloc(1, sizeof(struct pci_nvme_softc)); pi->pi_arg = sc; sc->nsc_pi = pi; error = pci_nvme_parse_config(sc, nvl); if (error < 0) goto done; else error = 0; STAILQ_INIT(&sc->ioreqs_free); sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); for (uint32_t i = 0; i < sc->ioslots; i++) { STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); } pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); /* * Allocate size of NVMe registers + doorbell space for all queues. * * The specification requires a minimum memory I/O window size of 16K. * The Windows driver will refuse to start a device with a smaller * window. */ pci_membar_sz = sizeof(struct nvme_registers) + 2 * sizeof(uint32_t) * (sc->max_queues + 1); pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); DPRINTF("nvme membar size: %u", pci_membar_sz); error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); if (error) { WPRINTF("%s pci alloc mem bar failed", __func__); goto done; } error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); if (error) { WPRINTF("%s pci add msixcap failed", __func__); goto done; } error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); if (error) { WPRINTF("%s pci add Express capability failed", __func__); goto done; } pthread_mutex_init(&sc->mtx, NULL); sem_init(&sc->iosemlock, 0, sc->ioslots); blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); /* * Controller data depends on Namespace data so initialize Namespace * data first. */ pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); pci_nvme_init_ctrldata(sc); pci_nvme_init_logpages(sc); pci_nvme_init_features(sc); pci_nvme_aer_init(sc); pci_nvme_aen_init(sc); pci_nvme_reset(sc); pci_lintr_request(pi); done: return (error); } static int pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) { char *cp, *ram; if (opts == NULL) return (0); if (strncmp(opts, "ram=", 4) == 0) { cp = strchr(opts, ','); if (cp == NULL) { set_config_value_node(nvl, "ram", opts + 4); return (0); } ram = strndup(opts + 4, cp - opts - 4); set_config_value_node(nvl, "ram", ram); free(ram); return (pci_parse_legacy_config(nvl, cp + 1)); } else return (blockif_legacy_config(nvl, opts)); } static const struct pci_devemu pci_de_nvme = { .pe_emu = "nvme", .pe_init = pci_nvme_init, .pe_legacy_config = pci_nvme_legacy_config, .pe_barwrite = pci_nvme_write, .pe_barread = pci_nvme_read }; PCI_EMUL_SET(pci_de_nvme); diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c index 3dbb80eec9fa..f42bbbda655f 100644 --- a/usr.sbin/bhyve/pci_passthru.c +++ b/usr.sbin/bhyve/pci_passthru.c @@ -1,1220 +1,1218 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include "config.h" #include "debug.h" #include "mem.h" #include "pci_passthru.h" #ifndef _PATH_DEVPCI #define _PATH_DEVPCI "/dev/pci" #endif #define LEGACY_SUPPORT 1 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) #define MSIX_CAPLEN 12 static int pcifd = -1; struct passthru_softc { struct pci_devinst *psc_pi; /* ROM is handled like a BAR */ struct pcibar psc_bar[PCI_BARMAX_WITH_ROM + 1]; struct { int capoff; int msgctrl; int emulated; } psc_msi; struct { int capoff; } psc_msix; struct pcisel psc_sel; }; static int msi_caplen(int msgctrl) { int len; len = 10; /* minimum length of msi capability */ if (msgctrl & PCIM_MSICTRL_64BIT) len += 4; #if 0 /* * Ignore the 'mask' and 'pending' bits in the MSI capability. * We'll let the guest manipulate them directly. */ if (msgctrl & PCIM_MSICTRL_VECTOR) len += 10; #endif return (len); } static int pcifd_init(void) { pcifd = open(_PATH_DEVPCI, O_RDWR, 0); if (pcifd < 0) { warn("failed to open %s", _PATH_DEVPCI); return (1); } #ifndef WITHOUT_CAPSICUM cap_rights_t pcifd_rights; cap_rights_init(&pcifd_rights, CAP_IOCTL, CAP_READ, CAP_WRITE); if (caph_rights_limit(pcifd, &pcifd_rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); const cap_ioctl_t pcifd_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR, PCIOCBARIO, PCIOCBARMMAP, PCIOCGETCONF }; if (caph_ioctls_limit(pcifd, pcifd_ioctls, nitems(pcifd_ioctls)) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif return (0); } uint32_t read_config(const struct pcisel *sel, long reg, int width) { struct pci_io pi; if (pcifd < 0 && pcifd_init()) { return (0); } bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; if (ioctl(pcifd, PCIOCREAD, &pi) < 0) return (0); /* XXX */ else return (pi.pi_data); } void write_config(const struct pcisel *sel, long reg, int width, uint32_t data) { struct pci_io pi; if (pcifd < 0 && pcifd_init()) { return; } bzero(&pi, sizeof(pi)); pi.pi_sel = *sel; pi.pi_reg = reg; pi.pi_width = width; pi.pi_data = data; (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ } #ifdef LEGACY_SUPPORT static int passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) { int capoff; struct msicap msicap; u_char *capdata; pci_populate_msicap(&msicap, msgnum, nextptr); /* * XXX * Copy the msi capability structure in the last 16 bytes of the * config space. This is wrong because it could shadow something * useful to the device. */ capoff = 256 - roundup(sizeof(msicap), 4); capdata = (u_char *)&msicap; for (size_t i = 0; i < sizeof(msicap); i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); return (capoff); } #endif /* LEGACY_SUPPORT */ static int cfginitmsi(struct passthru_softc *sc) { int i, ptr, capptr, cap, sts, caplen, table_size; uint32_t u32; struct pcisel sel; struct pci_devinst *pi; struct msixcap msixcap; char *msixcap_ptr; pi = sc->psc_pi; sel = sc->psc_sel; /* * Parse the capabilities and cache the location of the MSI * and MSI-X capabilities. */ sts = read_config(&sel, PCIR_STATUS, 2); if (sts & PCIM_STATUS_CAPPRESENT) { ptr = read_config(&sel, PCIR_CAP_PTR, 1); while (ptr != 0 && ptr != 0xff) { cap = read_config(&sel, ptr + PCICAP_ID, 1); if (cap == PCIY_MSI) { /* * Copy the MSI capability into the config * space of the emulated pci device */ sc->psc_msi.capoff = ptr; sc->psc_msi.msgctrl = read_config(&sel, ptr + 2, 2); sc->psc_msi.emulated = 0; caplen = msi_caplen(sc->psc_msi.msgctrl); capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; } } else if (cap == PCIY_MSIX) { /* * Copy the MSI-X capability */ sc->psc_msix.capoff = ptr; caplen = 12; msixcap_ptr = (char *)&msixcap; capptr = ptr; while (caplen > 0) { u32 = read_config(&sel, capptr, 4); memcpy(msixcap_ptr, &u32, 4); pci_set_cfgdata32(pi, capptr, u32); caplen -= 4; capptr += 4; msixcap_ptr += 4; } } ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); } } if (sc->psc_msix.capoff != 0) { pi->pi_msix.pba_bar = msixcap.pba_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.pba_offset = msixcap.pba_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_bar = msixcap.table_info & PCIM_MSIX_BIR_MASK; pi->pi_msix.table_offset = msixcap.table_info & ~PCIM_MSIX_BIR_MASK; pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count); /* Allocate the emulated MSI-X table array */ table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* Mask all table entries */ for (i = 0; i < pi->pi_msix.table_count; i++) { pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } } #ifdef LEGACY_SUPPORT /* * If the passthrough device does not support MSI then craft a * MSI capability for it. We link the new MSI capability at the * head of the list of capabilities. */ if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { int origptr, msiptr; origptr = read_config(&sel, PCIR_CAP_PTR, 1); msiptr = passthru_add_msicap(pi, 1, origptr); sc->psc_msi.capoff = msiptr; sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); sc->psc_msi.emulated = 1; pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); } #endif /* Make sure one of the capabilities is present */ if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) return (-1); else return (0); } static uint64_t msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *src8; uint16_t *src16; uint32_t *src32; uint64_t *src64; uint64_t data; size_t entry_offset; uint32_t table_offset; int index, table_count; pi = sc->psc_pi; table_offset = pi->pi_msix.table_offset; table_count = pi->pi_msix.table_count; if (offset < table_offset || offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) { switch (size) { case 1: src8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset); data = *src8; break; case 2: src16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset); data = *src16; break; case 4: src32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset); data = *src32; break; case 8: src64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset); data = *src64; break; default: return (-1); } return (data); } offset -= table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; assert(index < table_count); entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; switch (size) { case 1: src8 = (uint8_t *)((uint8_t *)entry + entry_offset); data = *src8; break; case 2: src16 = (uint16_t *)((uint8_t *)entry + entry_offset); data = *src16; break; case 4: src32 = (uint32_t *)((uint8_t *)entry + entry_offset); data = *src32; break; case 8: src64 = (uint64_t *)((uint8_t *)entry + entry_offset); data = *src64; break; default: return (-1); } return (data); } static void -msix_table_write(struct vmctx *ctx, struct passthru_softc *sc, - uint64_t offset, int size, uint64_t data) +msix_table_write(struct passthru_softc *sc, uint64_t offset, int size, + uint64_t data) { struct pci_devinst *pi; struct msix_table_entry *entry; uint8_t *dest8; uint16_t *dest16; uint32_t *dest32; uint64_t *dest64; size_t entry_offset; uint32_t table_offset, vector_control; int index, table_count; pi = sc->psc_pi; table_offset = pi->pi_msix.table_offset; table_count = pi->pi_msix.table_count; if (offset < table_offset || offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) { switch (size) { case 1: dest8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset); *dest8 = data; break; case 2: dest16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset); *dest16 = data; break; case 4: dest32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset); *dest32 = data; break; case 8: dest64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset); *dest64 = data; break; } return; } offset -= table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; assert(index < table_count); entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* Only 4 byte naturally-aligned writes are supported */ assert(size == 4); assert(entry_offset % 4 == 0); vector_control = entry->vector_control; dest32 = (uint32_t *)((uint8_t *)entry + entry_offset); *dest32 = data; /* If MSI-X hasn't been enabled, do nothing */ if (pi->pi_msix.enabled) { /* If the entry is masked, don't set it up */ if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { - (void)vm_setup_pptdev_msix(ctx, 0, + (void)vm_setup_pptdev_msix(sc->psc_pi->pi_vmctx, 0, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, index, entry->addr, entry->msg_data, entry->vector_control); } } } static int -init_msix_table(struct vmctx *ctx __unused, struct passthru_softc *sc) +init_msix_table(struct passthru_softc *sc) { struct pci_devinst *pi = sc->psc_pi; struct pci_bar_mmap pbm; int b, s, f; uint32_t table_size, table_offset; assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); b = sc->psc_sel.pc_bus; s = sc->psc_sel.pc_dev; f = sc->psc_sel.pc_func; /* * Map the region of the BAR containing the MSI-X table. This is * necessary for two reasons: * 1. The PBA may reside in the first or last page containing the MSI-X * table. * 2. While PCI devices are not supposed to use the page(s) containing * the MSI-X table for other purposes, some do in practice. */ memset(&pbm, 0, sizeof(pbm)); pbm.pbm_sel = sc->psc_sel; pbm.pbm_flags = PCIIO_BAR_MMAP_RW; pbm.pbm_reg = PCIR_BAR(pi->pi_msix.table_bar); pbm.pbm_memattr = VM_MEMATTR_DEVICE; if (ioctl(pcifd, PCIOCBARMMAP, &pbm) != 0) { warn("Failed to map MSI-X table BAR on %d/%d/%d", b, s, f); return (-1); } assert(pbm.pbm_bar_off == 0); pi->pi_msix.mapped_addr = (uint8_t *)(uintptr_t)pbm.pbm_map_base; pi->pi_msix.mapped_size = pbm.pbm_map_length; table_offset = rounddown2(pi->pi_msix.table_offset, 4096); table_size = pi->pi_msix.table_offset - table_offset; table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; table_size = roundup2(table_size, 4096); /* * Unmap any pages not containing the table, we do not need to emulate * accesses to them. Avoid releasing address space to help ensure that * a buggy out-of-bounds access causes a crash. */ if (table_offset != 0) if (mprotect(pi->pi_msix.mapped_addr, table_offset, PROT_NONE) != 0) warn("Failed to unmap MSI-X table BAR region"); if (table_offset + table_size != pi->pi_msix.mapped_size) if (mprotect( pi->pi_msix.mapped_addr + table_offset + table_size, pi->pi_msix.mapped_size - (table_offset + table_size), PROT_NONE) != 0) warn("Failed to unmap MSI-X table BAR region"); return (0); } static int -cfginitbar(struct vmctx *ctx __unused, struct passthru_softc *sc) +cfginitbar(struct passthru_softc *sc) { int i, error; struct pci_devinst *pi; struct pci_bar_io bar; enum pcibar_type bartype; uint64_t base, size; pi = sc->psc_pi; /* * Initialize BAR registers */ for (i = 0; i <= PCI_BARMAX; i++) { bzero(&bar, sizeof(bar)); bar.pbi_sel = sc->psc_sel; bar.pbi_reg = PCIR_BAR(i); if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) continue; if (PCI_BAR_IO(bar.pbi_base)) { bartype = PCIBAR_IO; base = bar.pbi_base & PCIM_BAR_IO_BASE; } else { switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) { case PCIM_BAR_MEM_64: bartype = PCIBAR_MEM64; break; default: bartype = PCIBAR_MEM32; break; } base = bar.pbi_base & PCIM_BAR_MEM_BASE; } size = bar.pbi_length; if (bartype != PCIBAR_IO) { if (((base | size) & PAGE_MASK) != 0) { warnx("passthru device %d/%d/%d BAR %d: " "base %#lx or size %#lx not page aligned\n", sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, base, size); return (-1); } } /* Cache information about the "real" BAR */ sc->psc_bar[i].type = bartype; sc->psc_bar[i].size = size; sc->psc_bar[i].addr = base; sc->psc_bar[i].lobits = 0; /* Allocate the BAR in the guest I/O or MMIO space */ error = pci_emul_alloc_bar(pi, i, bartype, size); if (error) return (-1); /* Use same lobits as physical bar */ uint8_t lobits = read_config(&sc->psc_sel, PCIR_BAR(i), 0x01); if (bartype == PCIBAR_MEM32 || bartype == PCIBAR_MEM64) { lobits &= ~PCIM_BAR_MEM_BASE; } else { lobits &= ~PCIM_BAR_IO_BASE; } sc->psc_bar[i].lobits = lobits; pi->pi_bar[i].lobits = lobits; /* * 64-bit BAR takes up two slots so skip the next one. */ if (bartype == PCIBAR_MEM64) { i++; assert(i <= PCI_BARMAX); sc->psc_bar[i].type = PCIBAR_MEMHI64; } } return (0); } static int -cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) +cfginit(struct pci_devinst *pi, int bus, int slot, int func) { int error; struct passthru_softc *sc; error = 1; sc = pi->pi_arg; bzero(&sc->psc_sel, sizeof(struct pcisel)); sc->psc_sel.pc_bus = bus; sc->psc_sel.pc_dev = slot; sc->psc_sel.pc_func = func; if (cfginitmsi(sc) != 0) { warnx("failed to initialize MSI for PCI %d/%d/%d", bus, slot, func); goto done; } - if (cfginitbar(ctx, sc) != 0) { + if (cfginitbar(sc) != 0) { warnx("failed to initialize BARs for PCI %d/%d/%d", bus, slot, func); goto done; } write_config(&sc->psc_sel, PCIR_COMMAND, 2, pci_get_cfgdata16(pi, PCIR_COMMAND)); /* * We need to do this after PCIR_COMMAND got possibly updated, e.g., * a BAR was enabled, as otherwise the PCIOCBARMMAP might fail on us. */ if (pci_msix_table_bar(pi) >= 0) { - error = init_msix_table(ctx, sc); + error = init_msix_table(sc); if (error != 0) { warnx( "failed to initialize MSI-X table for PCI %d/%d/%d: %d", bus, slot, func, error); goto done; } } error = 0; /* success */ done: return (error); } static int passthru_legacy_config(nvlist_t *nvl, const char *opts) { const char *cp; char *tofree; char value[16]; int bus, slot, func; if (opts == NULL) return (0); cp = strchr(opts, ','); if (strncmp(opts, "ppt", strlen("ppt")) == 0) { tofree = strndup(opts, cp - opts); set_config_value_node(nvl, "pptdev", tofree); free(tofree); } else if (sscanf(opts, "pci0:%d:%d:%d", &bus, &slot, &func) == 3 || sscanf(opts, "pci%d:%d:%d", &bus, &slot, &func) == 3 || sscanf(opts, "%d/%d/%d", &bus, &slot, &func) == 3) { snprintf(value, sizeof(value), "%d", bus); set_config_value_node(nvl, "bus", value); snprintf(value, sizeof(value), "%d", slot); set_config_value_node(nvl, "slot", value); snprintf(value, sizeof(value), "%d", func); set_config_value_node(nvl, "func", value); } else { EPRINTLN("passthru: invalid options \"%s\"", opts); return (-1); } if (cp == NULL) { return (0); } return (pci_parse_legacy_config(nvl, cp + 1)); } static int -passthru_init_rom(struct vmctx *const ctx __unused, - struct passthru_softc *const sc, const char *const romfile) +passthru_init_rom(struct passthru_softc *const sc, const char *const romfile) { if (romfile == NULL) { return (0); } const int fd = open(romfile, O_RDONLY); if (fd < 0) { warnx("%s: can't open romfile \"%s\"", __func__, romfile); return (-1); } struct stat sbuf; if (fstat(fd, &sbuf) < 0) { warnx("%s: can't fstat romfile \"%s\"", __func__, romfile); close(fd); return (-1); } const uint64_t rom_size = sbuf.st_size; void *const rom_data = mmap(NULL, rom_size, PROT_READ, MAP_SHARED, fd, 0); if (rom_data == MAP_FAILED) { warnx("%s: unable to mmap romfile \"%s\" (%d)", __func__, romfile, errno); close(fd); return (-1); } void *rom_addr; int error = pci_emul_alloc_rom(sc->psc_pi, rom_size, &rom_addr); if (error) { warnx("%s: failed to alloc rom segment", __func__); munmap(rom_data, rom_size); close(fd); return (error); } memcpy(rom_addr, rom_data, rom_size); sc->psc_bar[PCI_ROM_IDX].type = PCIBAR_ROM; sc->psc_bar[PCI_ROM_IDX].addr = (uint64_t)rom_addr; sc->psc_bar[PCI_ROM_IDX].size = rom_size; munmap(rom_data, rom_size); close(fd); return (0); } static bool passthru_lookup_pptdev(const char *name, int *bus, int *slot, int *func) { struct pci_conf_io pc; struct pci_conf conf[1]; struct pci_match_conf patterns[1]; char *cp; bzero(&pc, sizeof(struct pci_conf_io)); pc.match_buf_len = sizeof(conf); pc.matches = conf; bzero(&patterns, sizeof(patterns)); /* * The pattern structure requires the unit to be split out from * the driver name. Walk backwards from the end of the name to * find the start of the unit. */ cp = strchr(name, '\0'); assert(cp != NULL); while (cp != name && isdigit(cp[-1])) cp--; if (cp == name || !isdigit(*cp)) { EPRINTLN("Invalid passthru device name %s", name); return (false); } if ((size_t)(cp - name) + 1 > sizeof(patterns[0].pd_name)) { EPRINTLN("Passthru device name %s is too long", name); return (false); } memcpy(patterns[0].pd_name, name, cp - name); patterns[0].pd_unit = strtol(cp, &cp, 10); if (*cp != '\0') { EPRINTLN("Invalid passthru device name %s", name); return (false); } patterns[0].flags = PCI_GETCONF_MATCH_NAME | PCI_GETCONF_MATCH_UNIT; pc.num_patterns = 1; pc.pat_buf_len = sizeof(patterns); pc.patterns = patterns; if (ioctl(pcifd, PCIOCGETCONF, &pc) == -1) { EPRINTLN("ioctl(PCIOCGETCONF): %s", strerror(errno)); return (false); } if (pc.status != PCI_GETCONF_LAST_DEVICE && pc.status != PCI_GETCONF_MORE_DEVS) { EPRINTLN("error returned from PCIOCGETCONF ioctl"); return (false); } if (pc.num_matches == 0) { EPRINTLN("Passthru device %s not found", name); return (false); } if (conf[0].pc_sel.pc_domain != 0) { EPRINTLN("Passthru device %s on unsupported domain", name); return (false); } *bus = conf[0].pc_sel.pc_bus; *slot = conf[0].pc_sel.pc_dev; *func = conf[0].pc_sel.pc_func; return (true); } static int -passthru_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) +passthru_init(struct pci_devinst *pi, nvlist_t *nvl) { int bus, slot, func, error, memflags; struct passthru_softc *sc; const char *value; sc = NULL; error = 1; - memflags = vm_get_memflags(ctx); + memflags = vm_get_memflags(pi->pi_vmctx); if (!(memflags & VM_MEM_F_WIRED)) { warnx("passthru requires guest memory to be wired"); return (error); } if (pcifd < 0 && pcifd_init()) { return (error); } #define GET_INT_CONFIG(var, name) do { \ value = get_config_value_node(nvl, name); \ if (value == NULL) { \ EPRINTLN("passthru: missing required %s setting", name); \ return (error); \ } \ var = atoi(value); \ } while (0) value = get_config_value_node(nvl, "pptdev"); if (value != NULL) { if (!passthru_lookup_pptdev(value, &bus, &slot, &func)) return (error); } else { GET_INT_CONFIG(bus, "bus"); GET_INT_CONFIG(slot, "slot"); GET_INT_CONFIG(func, "func"); } - if (vm_assign_pptdev(ctx, bus, slot, func) != 0) { + if (vm_assign_pptdev(pi->pi_vmctx, bus, slot, func) != 0) { warnx("PCI device at %d/%d/%d is not using the ppt(4) driver", bus, slot, func); goto done; } sc = calloc(1, sizeof(struct passthru_softc)); pi->pi_arg = sc; sc->psc_pi = pi; /* initialize config space */ - if ((error = cfginit(ctx, pi, bus, slot, func)) != 0) + if ((error = cfginit(pi, bus, slot, func)) != 0) goto done; /* initialize ROM */ - if ((error = passthru_init_rom(ctx, sc, + if ((error = passthru_init_rom(sc, get_config_value_node(nvl, "rom"))) != 0) goto done; error = 0; /* success */ done: if (error) { free(sc); - vm_unassign_pptdev(ctx, bus, slot, func); + vm_unassign_pptdev(pi->pi_vmctx, bus, slot, func); } return (error); } static int bar_access(int coff) { if ((coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) || coff == PCIR_BIOS) return (1); else return (0); } static int msicap_access(struct passthru_softc *sc, int coff) { int caplen; if (sc->psc_msi.capoff == 0) return (0); caplen = msi_caplen(sc->psc_msi.msgctrl); if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen) return (1); else return (0); } static int msixcap_access(struct passthru_softc *sc, int coff) { if (sc->psc_msix.capoff == 0) return (0); return (coff >= sc->psc_msix.capoff && coff < sc->psc_msix.capoff + MSIX_CAPLEN); } static int -passthru_cfgread(struct vmctx *ctx __unused, struct pci_devinst *pi, int coff, - int bytes, uint32_t *rv) +passthru_cfgread(struct pci_devinst *pi, int coff, int bytes, uint32_t *rv) { struct passthru_softc *sc; sc = pi->pi_arg; /* * PCI BARs and MSI capability is emulated. */ if (bar_access(coff) || msicap_access(sc, coff) || msixcap_access(sc, coff)) return (-1); #ifdef LEGACY_SUPPORT /* * Emulate PCIR_CAP_PTR if this device does not support MSI capability * natively. */ if (sc->psc_msi.emulated) { if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) return (-1); } #endif /* * Emulate the command register. If a single read reads both the * command and status registers, read the status register from the * device's config space. */ if (coff == PCIR_COMMAND) { if (bytes <= 2) return (-1); *rv = read_config(&sc->psc_sel, PCIR_STATUS, 2) << 16 | pci_get_cfgdata16(pi, PCIR_COMMAND); return (0); } /* Everything else just read from the device's config space */ *rv = read_config(&sc->psc_sel, coff, bytes); return (0); } static int -passthru_cfgwrite(struct vmctx *ctx, struct pci_devinst *pi, int coff, - int bytes, uint32_t val) +passthru_cfgwrite(struct pci_devinst *pi, int coff, int bytes, uint32_t val) { int error, msix_table_entries, i; struct passthru_softc *sc; uint16_t cmd_old; sc = pi->pi_arg; /* * PCI BARs are emulated */ if (bar_access(coff)) return (-1); /* * MSI capability is emulated */ if (msicap_access(sc, coff)) { pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff, PCIY_MSI); - error = vm_setup_pptdev_msi(ctx, 0, sc->psc_sel.pc_bus, + error = vm_setup_pptdev_msi(pi->pi_vmctx, 0, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum); if (error != 0) err(1, "vm_setup_pptdev_msi"); return (0); } if (msixcap_access(sc, coff)) { pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff, PCIY_MSIX); if (pi->pi_msix.enabled) { msix_table_entries = pi->pi_msix.table_count; for (i = 0; i < msix_table_entries; i++) { - error = vm_setup_pptdev_msix(ctx, 0, + error = vm_setup_pptdev_msix(pi->pi_vmctx, 0, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, i, pi->pi_msix.table[i].addr, pi->pi_msix.table[i].msg_data, pi->pi_msix.table[i].vector_control); if (error) err(1, "vm_setup_pptdev_msix"); } } else { - error = vm_disable_pptdev_msix(ctx, sc->psc_sel.pc_bus, - sc->psc_sel.pc_dev, sc->psc_sel.pc_func); + error = vm_disable_pptdev_msix(pi->pi_vmctx, + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, + sc->psc_sel.pc_func); if (error) err(1, "vm_disable_pptdev_msix"); } return (0); } #ifdef LEGACY_SUPPORT /* * If this device does not support MSI natively then we cannot let * the guest disable legacy interrupts from the device. It is the * legacy interrupt that is triggering the virtual MSI to the guest. */ if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { if (coff == PCIR_COMMAND && bytes == 2) val &= ~PCIM_CMD_INTxDIS; } #endif write_config(&sc->psc_sel, coff, bytes, val); if (coff == PCIR_COMMAND) { cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND); if (bytes == 1) pci_set_cfgdata8(pi, PCIR_COMMAND, val); else if (bytes == 2) pci_set_cfgdata16(pi, PCIR_COMMAND, val); pci_emul_cmd_changed(pi, cmd_old); } return (0); } static void -passthru_write(struct vmctx *ctx, struct pci_devinst *pi, int baridx, - uint64_t offset, int size, uint64_t value) +passthru_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, + uint64_t value) { struct passthru_softc *sc; struct pci_bar_ioreq pio; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { - msix_table_write(ctx, sc, offset, size, value); + msix_table_write(sc, offset, size, value); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); assert(size == 1 || size == 2 || size == 4); assert(offset <= UINT32_MAX && offset + size <= UINT32_MAX); bzero(&pio, sizeof(pio)); pio.pbi_sel = sc->psc_sel; pio.pbi_op = PCIBARIO_WRITE; pio.pbi_bar = baridx; pio.pbi_offset = (uint32_t)offset; pio.pbi_width = size; pio.pbi_value = (uint32_t)value; (void)ioctl(pcifd, PCIOCBARIO, &pio); } } static uint64_t -passthru_read(struct vmctx *ctx __unused, struct pci_devinst *pi, int baridx, - uint64_t offset, int size) +passthru_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct passthru_softc *sc; struct pci_bar_ioreq pio; uint64_t val; sc = pi->pi_arg; if (baridx == pci_msix_table_bar(pi)) { val = msix_table_read(sc, offset, size); } else { assert(pi->pi_bar[baridx].type == PCIBAR_IO); assert(size == 1 || size == 2 || size == 4); assert(offset <= UINT32_MAX && offset + size <= UINT32_MAX); bzero(&pio, sizeof(pio)); pio.pbi_sel = sc->psc_sel; pio.pbi_op = PCIBARIO_READ; pio.pbi_bar = baridx; pio.pbi_offset = (uint32_t)offset; pio.pbi_width = size; (void)ioctl(pcifd, PCIOCBARIO, &pio); val = pio.pbi_value; } return (val); } static void -passthru_msix_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx, - int enabled, uint64_t address) +passthru_msix_addr(struct pci_devinst *pi, int baridx, int enabled, + uint64_t address) { struct passthru_softc *sc; size_t remaining; uint32_t table_size, table_offset; sc = pi->pi_arg; table_offset = rounddown2(pi->pi_msix.table_offset, 4096); if (table_offset > 0) { if (!enabled) { - if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + if (vm_unmap_pptdev_mmio(pi->pi_vmctx, + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, table_offset) != 0) warnx("pci_passthru: unmap_pptdev_mmio failed"); } else { - if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + if (vm_map_pptdev_mmio(pi->pi_vmctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, table_offset, sc->psc_bar[baridx].addr) != 0) warnx("pci_passthru: map_pptdev_mmio failed"); } } table_size = pi->pi_msix.table_offset - table_offset; table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; table_size = roundup2(table_size, 4096); remaining = pi->pi_bar[baridx].size - table_offset - table_size; if (remaining > 0) { address += table_offset + table_size; if (!enabled) { - if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + if (vm_unmap_pptdev_mmio(pi->pi_vmctx, + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, remaining) != 0) warnx("pci_passthru: unmap_pptdev_mmio failed"); } else { - if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + if (vm_map_pptdev_mmio(pi->pi_vmctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, remaining, sc->psc_bar[baridx].addr + table_offset + table_size) != 0) warnx("pci_passthru: map_pptdev_mmio failed"); } } } static void -passthru_mmio_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx, - int enabled, uint64_t address) +passthru_mmio_addr(struct pci_devinst *pi, int baridx, int enabled, + uint64_t address) { struct passthru_softc *sc; sc = pi->pi_arg; if (!enabled) { - if (vm_unmap_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + if (vm_unmap_pptdev_mmio(pi->pi_vmctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, sc->psc_bar[baridx].size) != 0) warnx("pci_passthru: unmap_pptdev_mmio failed"); } else { - if (vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + if (vm_map_pptdev_mmio(pi->pi_vmctx, sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, sc->psc_sel.pc_func, address, sc->psc_bar[baridx].size, sc->psc_bar[baridx].addr) != 0) warnx("pci_passthru: map_pptdev_mmio failed"); } } static void passthru_addr_rom(struct pci_devinst *const pi, const int idx, const int enabled) { const uint64_t addr = pi->pi_bar[idx].addr; const uint64_t size = pi->pi_bar[idx].size; if (!enabled) { if (vm_munmap_memseg(pi->pi_vmctx, addr, size) != 0) { errx(4, "%s: munmap_memseg @ [%016lx - %016lx] failed", __func__, addr, addr + size); } } else { if (vm_mmap_memseg(pi->pi_vmctx, addr, VM_PCIROM, pi->pi_romoffset, size, PROT_READ | PROT_EXEC) != 0) { errx(4, "%s: mmap_memseg @ [%016lx - %016lx] failed", __func__, addr, addr + size); } } } static void -passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx, - int enabled, uint64_t address) +passthru_addr(struct pci_devinst *pi, int baridx, int enabled, uint64_t address) { switch (pi->pi_bar[baridx].type) { case PCIBAR_IO: /* IO BARs are emulated */ break; case PCIBAR_ROM: passthru_addr_rom(pi, baridx, enabled); break; case PCIBAR_MEM32: case PCIBAR_MEM64: if (baridx == pci_msix_table_bar(pi)) - passthru_msix_addr(ctx, pi, baridx, enabled, address); + passthru_msix_addr(pi, baridx, enabled, address); else - passthru_mmio_addr(ctx, pi, baridx, enabled, address); + passthru_mmio_addr(pi, baridx, enabled, address); break; default: errx(4, "%s: invalid BAR type %d", __func__, pi->pi_bar[baridx].type); } } static const struct pci_devemu passthru = { .pe_emu = "passthru", .pe_init = passthru_init, .pe_legacy_config = passthru_legacy_config, .pe_cfgwrite = passthru_cfgwrite, .pe_cfgread = passthru_cfgread, .pe_barwrite = passthru_write, .pe_barread = passthru_read, .pe_baraddr = passthru_addr, }; PCI_EMUL_SET(passthru); diff --git a/usr.sbin/bhyve/pci_uart.c b/usr.sbin/bhyve/pci_uart.c index 7ccc3c7b6c62..f610c6236dfc 100644 --- a/usr.sbin/bhyve/pci_uart.c +++ b/usr.sbin/bhyve/pci_uart.c @@ -1,135 +1,133 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "uart_emul.h" /* * Pick a PCI vid/did of a chip with a single uart at * BAR0, that most versions of FreeBSD can understand: * Siig CyberSerial 1-port. */ #define COM_VENDOR 0x131f #define COM_DEV 0x2000 static void pci_uart_intr_assert(void *arg) { struct pci_devinst *pi = arg; pci_lintr_assert(pi); } static void pci_uart_intr_deassert(void *arg) { struct pci_devinst *pi = arg; pci_lintr_deassert(pi); } static void -pci_uart_write(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size, +pci_uart_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { assert(baridx == 0); assert(size == 1); uart_write(pi->pi_arg, offset, value); } static uint64_t -pci_uart_read(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size) +pci_uart_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { uint8_t val; assert(baridx == 0); assert(size == 1); val = uart_read(pi->pi_arg, offset); return (val); } static int pci_uart_legacy_config(nvlist_t *nvl, const char *opts) { if (opts != NULL) set_config_value_node(nvl, "path", opts); return (0); } static int -pci_uart_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) +pci_uart_init(struct pci_devinst *pi, nvlist_t *nvl) { struct uart_softc *sc; const char *device; pci_emul_alloc_bar(pi, 0, PCIBAR_IO, UART_IO_BAR_SIZE); pci_lintr_request(pi); /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV); pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); sc = uart_init(pci_uart_intr_assert, pci_uart_intr_deassert, pi); pi->pi_arg = sc; device = get_config_value_node(nvl, "path"); if (uart_set_backend(sc, device) != 0) { EPRINTLN("Unable to initialize backend '%s' for " "pci uart at %d:%d", device, pi->pi_slot, pi->pi_func); return (-1); } return (0); } static const struct pci_devemu pci_de_com = { .pe_emu = "uart", .pe_init = pci_uart_init, .pe_legacy_config = pci_uart_legacy_config, .pe_barwrite = pci_uart_write, .pe_barread = pci_uart_read }; PCI_EMUL_SET(pci_de_com); diff --git a/usr.sbin/bhyve/pci_virtio_9p.c b/usr.sbin/bhyve/pci_virtio_9p.c index c24b33acc299..405692bec37f 100644 --- a/usr.sbin/bhyve/pci_virtio_9p.c +++ b/usr.sbin/bhyve/pci_virtio_9p.c @@ -1,351 +1,351 @@ /*- * Copyright (c) 2015 iXsystems Inc. * Copyright (c) 2017-2018 Jakub Klama * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * VirtIO filesystem passthrough using 9p protocol. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #define VT9P_MAX_IOV 128 #define VT9P_RINGSZ 256 #define VT9P_MAXTAGSZ 256 #define VT9P_CONFIGSPACESZ (VT9P_MAXTAGSZ + sizeof(uint16_t)) static int pci_vt9p_debug; #define DPRINTF(params) if (pci_vt9p_debug) printf params #define WPRINTF(params) printf params /* * Per-device softc */ struct pci_vt9p_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_vq; pthread_mutex_t vsc_mtx; uint64_t vsc_cfg; uint64_t vsc_features; char * vsc_rootpath; struct pci_vt9p_config * vsc_config; struct l9p_backend * vsc_fs_backend; struct l9p_server * vsc_server; struct l9p_connection * vsc_conn; }; struct pci_vt9p_request { struct pci_vt9p_softc * vsr_sc; struct iovec * vsr_iov; size_t vsr_niov; size_t vsr_respidx; size_t vsr_iolen; uint16_t vsr_idx; }; struct pci_vt9p_config { uint16_t tag_len; char tag[0]; } __attribute__((packed)); static int pci_vt9p_send(struct l9p_request *, const struct iovec *, const size_t, const size_t, void *); static void pci_vt9p_drop(struct l9p_request *, const struct iovec *, size_t, void *); static void pci_vt9p_reset(void *); static void pci_vt9p_notify(void *, struct vqueue_info *); static int pci_vt9p_cfgread(void *, int, int, uint32_t *); static void pci_vt9p_neg_features(void *, uint64_t); static struct virtio_consts vt9p_vi_consts = { .vc_name = "vt9p", .vc_nvq = 1, .vc_cfgsize = VT9P_CONFIGSPACESZ, .vc_reset = pci_vt9p_reset, .vc_qnotify = pci_vt9p_notify, .vc_cfgread = pci_vt9p_cfgread, .vc_apply_features = pci_vt9p_neg_features, .vc_hv_caps = (1 << 0), }; static void pci_vt9p_reset(void *vsc) { struct pci_vt9p_softc *sc; sc = vsc; DPRINTF(("vt9p: device reset requested !\n")); vi_reset_dev(&sc->vsc_vs); } static void pci_vt9p_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vt9p_softc *sc = vsc; sc->vsc_features = negotiated_features; } static int pci_vt9p_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vt9p_softc *sc = vsc; void *ptr; ptr = (uint8_t *)sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static int pci_vt9p_get_buffer(struct l9p_request *req, struct iovec *iov, size_t *niov, void *arg __unused) { struct pci_vt9p_request *preq = req->lr_aux; size_t n = preq->vsr_niov - preq->vsr_respidx; memcpy(iov, preq->vsr_iov + preq->vsr_respidx, n * sizeof(struct iovec)); *niov = n; return (0); } static int pci_vt9p_send(struct l9p_request *req, const struct iovec *iov __unused, const size_t niov __unused, const size_t iolen, void *arg __unused) { struct pci_vt9p_request *preq = req->lr_aux; struct pci_vt9p_softc *sc = preq->vsr_sc; preq->vsr_iolen = iolen; pthread_mutex_lock(&sc->vsc_mtx); vq_relchain(&sc->vsc_vq, preq->vsr_idx, preq->vsr_iolen); vq_endchains(&sc->vsc_vq, 1); pthread_mutex_unlock(&sc->vsc_mtx); free(preq); return (0); } static void pci_vt9p_drop(struct l9p_request *req, const struct iovec *iov __unused, size_t niov __unused, void *arg __unused) { struct pci_vt9p_request *preq = req->lr_aux; struct pci_vt9p_softc *sc = preq->vsr_sc; pthread_mutex_lock(&sc->vsc_mtx); vq_relchain(&sc->vsc_vq, preq->vsr_idx, 0); vq_endchains(&sc->vsc_vq, 1); pthread_mutex_unlock(&sc->vsc_mtx); free(preq); } static void pci_vt9p_notify(void *vsc, struct vqueue_info *vq) { struct iovec iov[VT9P_MAX_IOV]; struct pci_vt9p_softc *sc; struct pci_vt9p_request *preq; struct vi_req req; int n; sc = vsc; while (vq_has_descs(vq)) { n = vq_getchain(vq, iov, VT9P_MAX_IOV, &req); assert(n >= 1 && n <= VT9P_MAX_IOV); preq = calloc(1, sizeof(struct pci_vt9p_request)); preq->vsr_sc = sc; preq->vsr_idx = req.idx; preq->vsr_iov = iov; preq->vsr_niov = n; preq->vsr_respidx = req.readable; for (int i = 0; i < n; i++) { DPRINTF(("vt9p: vt9p_notify(): desc%d base=%p, " "len=%zu\r\n", i, iov[i].iov_base, iov[i].iov_len)); } l9p_connection_recv(sc->vsc_conn, iov, preq->vsr_respidx, preq); } } static int pci_vt9p_legacy_config(nvlist_t *nvl, const char *opts) { char *sharename = NULL, *tofree, *token, *tokens; if (opts == NULL) return (0); tokens = tofree = strdup(opts); while ((token = strsep(&tokens, ",")) != NULL) { if (strchr(token, '=') != NULL) { if (sharename != NULL) { EPRINTLN( "virtio-9p: more than one share name given"); return (-1); } sharename = strsep(&token, "="); set_config_value_node(nvl, "sharename", sharename); set_config_value_node(nvl, "path", token); } else set_config_bool_node(nvl, token, true); } free(tofree); return (0); } static int -pci_vt9p_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) +pci_vt9p_init(struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vt9p_softc *sc; const char *value; const char *sharename; int rootfd; bool ro; cap_rights_t rootcap; ro = get_config_bool_node_default(nvl, "ro", false); value = get_config_value_node(nvl, "path"); if (value == NULL) { EPRINTLN("virtio-9p: path required"); return (1); } rootfd = open(value, O_DIRECTORY); if (rootfd < 0) { EPRINTLN("virtio-9p: failed to open '%s': %s", value, strerror(errno)); return (-1); } sharename = get_config_value_node(nvl, "sharename"); if (sharename == NULL) { EPRINTLN("virtio-9p: share name required"); return (1); } if (strlen(sharename) > VT9P_MAXTAGSZ) { EPRINTLN("virtio-9p: share name too long"); return (1); } sc = calloc(1, sizeof(struct pci_vt9p_softc)); sc->vsc_config = calloc(1, sizeof(struct pci_vt9p_config) + VT9P_MAXTAGSZ); pthread_mutex_init(&sc->vsc_mtx, NULL); cap_rights_init(&rootcap, CAP_LOOKUP, CAP_ACL_CHECK, CAP_ACL_DELETE, CAP_ACL_GET, CAP_ACL_SET, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSTAT, CAP_CREATE, CAP_FCHMODAT, CAP_FCHOWNAT, CAP_FTRUNCATE, CAP_LINKAT_SOURCE, CAP_LINKAT_TARGET, CAP_MKDIRAT, CAP_MKNODAT, CAP_PREAD, CAP_PWRITE, CAP_RENAMEAT_SOURCE, CAP_RENAMEAT_TARGET, CAP_SEEK, CAP_SYMLINKAT, CAP_UNLINKAT, CAP_EXTATTR_DELETE, CAP_EXTATTR_GET, CAP_EXTATTR_LIST, CAP_EXTATTR_SET, CAP_FUTIMES, CAP_FSTATFS, CAP_FSYNC, CAP_FPATHCONF); if (cap_rights_limit(rootfd, &rootcap) != 0) return (1); sc->vsc_config->tag_len = (uint16_t)strlen(sharename); memcpy(sc->vsc_config->tag, sharename, sc->vsc_config->tag_len); if (l9p_backend_fs_init(&sc->vsc_fs_backend, rootfd, ro) != 0) { errno = ENXIO; return (1); } if (l9p_server_init(&sc->vsc_server, sc->vsc_fs_backend) != 0) { errno = ENXIO; return (1); } if (l9p_connection_init(sc->vsc_server, &sc->vsc_conn) != 0) { errno = EIO; return (1); } sc->vsc_conn->lc_msize = L9P_MAX_IOV * PAGE_SIZE; sc->vsc_conn->lc_lt.lt_get_response_buffer = pci_vt9p_get_buffer; sc->vsc_conn->lc_lt.lt_send_response = pci_vt9p_send; sc->vsc_conn->lc_lt.lt_drop_response = pci_vt9p_drop; vi_softc_linkup(&sc->vsc_vs, &vt9p_vi_consts, sc, pi, &sc->vsc_vq); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; sc->vsc_vq.vq_qsize = VT9P_RINGSZ; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_9P); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_9P); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vsc_vs, 0); return (0); } static const struct pci_devemu pci_de_v9p = { .pe_emu = "virtio-9p", .pe_legacy_config = pci_vt9p_legacy_config, .pe_init = pci_vt9p_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_v9p); diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c index 06d2a9671c0b..7c412891f295 100644 --- a/usr.sbin/bhyve/pci_virtio_block.c +++ b/usr.sbin/bhyve/pci_virtio_block.c @@ -1,602 +1,601 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright 2020-2021 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "block_if.h" #define VTBLK_BSIZE 512 #define VTBLK_RINGSZ 128 _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); #define VTBLK_S_OK 0 #define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 #define VTBLK_BLK_ID_BYTES 20 + 1 /* Capability bits */ #define VTBLK_F_BARRIER (1 << 0) /* Does host support barriers? */ #define VTBLK_F_SIZE_MAX (1 << 1) /* Indicates maximum segment size */ #define VTBLK_F_SEG_MAX (1 << 2) /* Indicates maximum # of segments */ #define VTBLK_F_GEOMETRY (1 << 4) /* Legacy geometry available */ #define VTBLK_F_RO (1 << 5) /* Disk is read-only */ #define VTBLK_F_BLK_SIZE (1 << 6) /* Block size of disk is available*/ #define VTBLK_F_SCSI (1 << 7) /* Supports scsi command passthru */ #define VTBLK_F_FLUSH (1 << 9) /* Writeback mode enabled after reset */ #define VTBLK_F_WCE (1 << 9) /* Legacy alias for FLUSH */ #define VTBLK_F_TOPOLOGY (1 << 10) /* Topology information is available */ #define VTBLK_F_CONFIG_WCE (1 << 11) /* Writeback mode available in config */ #define VTBLK_F_MQ (1 << 12) /* Multi-Queue */ #define VTBLK_F_DISCARD (1 << 13) /* Trim blocks */ #define VTBLK_F_WRITE_ZEROES (1 << 14) /* Write zeros */ /* * Host capabilities */ #define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ VTBLK_F_FLUSH | \ VTBLK_F_TOPOLOGY | \ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* * The current blockif_delete() interface only allows a single delete * request at a time. */ #define VTBLK_MAX_DISCARD_SEG 1 /* * An arbitrary limit to prevent excessive latency due to large * delete requests. */ #define VTBLK_MAX_DISCARD_SECT ((16 << 20) / VTBLK_BSIZE) /* 16 MiB */ /* * Config space "registers" */ struct vtblk_config { uint64_t vbc_capacity; uint32_t vbc_size_max; uint32_t vbc_seg_max; struct { uint16_t cylinders; uint8_t heads; uint8_t sectors; } vbc_geometry; uint32_t vbc_blk_size; struct { uint8_t physical_block_exp; uint8_t alignment_offset; uint16_t min_io_size; uint32_t opt_io_size; } vbc_topology; uint8_t vbc_writeback; uint8_t unused0[1]; uint16_t num_queues; uint32_t max_discard_sectors; uint32_t max_discard_seg; uint32_t discard_sector_alignment; uint32_t max_write_zeroes_sectors; uint32_t max_write_zeroes_seg; uint8_t write_zeroes_may_unmap; uint8_t unused1[3]; } __packed; /* * Fixed-size block header */ struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 #define VBH_OP_SCSI_CMD 2 #define VBH_OP_SCSI_CMD_OUT 3 #define VBH_OP_FLUSH 4 #define VBH_OP_FLUSH_OUT 5 #define VBH_OP_IDENT 8 #define VBH_OP_DISCARD 11 #define VBH_OP_WRITE_ZEROES 13 #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ uint32_t vbh_type; uint32_t vbh_ioprio; uint64_t vbh_sector; } __packed; /* * Debug printf */ static int pci_vtblk_debug; #define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params #define WPRINTF(params) PRINTLN params struct pci_vtblk_ioreq { struct blockif_req io_req; struct pci_vtblk_softc *io_sc; uint8_t *io_status; uint16_t io_idx; }; struct virtio_blk_discard_write_zeroes { uint64_t sector; uint32_t num_sectors; struct { uint32_t unmap:1; uint32_t reserved:31; } flags; }; /* * Per-device softc */ struct pci_vtblk_softc { struct virtio_softc vbsc_vs; pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; struct vtblk_config vbsc_cfg; struct virtio_consts vbsc_consts; struct blockif_ctxt *bc; char vbsc_ident[VTBLK_BLK_ID_BYTES]; struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; }; static void pci_vtblk_reset(void *); static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); #ifdef BHYVE_SNAPSHOT static void pci_vtblk_pause(void *); static void pci_vtblk_resume(void *); static int pci_vtblk_snapshot(void *, struct vm_snapshot_meta *); #endif static struct virtio_consts vtblk_vi_consts = { .vc_name = "vtblk", .vc_nvq = 1, .vc_cfgsize = sizeof(struct vtblk_config), .vc_reset = pci_vtblk_reset, .vc_qnotify = pci_vtblk_notify, .vc_cfgread = pci_vtblk_cfgread, .vc_cfgwrite = pci_vtblk_cfgwrite, .vc_apply_features = NULL, .vc_hv_caps = VTBLK_S_HOSTCAPS, #ifdef BHYVE_SNAPSHOT .vc_pause = pci_vtblk_pause, .vc_resume = pci_vtblk_resume, .vc_snapshot = pci_vtblk_snapshot, #endif }; static void pci_vtblk_reset(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device reset requested !")); vi_reset_dev(&sc->vbsc_vs); } static void pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err) { struct pci_vtblk_softc *sc = io->io_sc; /* convert errno into a virtio block error return */ if (err == EOPNOTSUPP || err == ENOSYS) *io->io_status = VTBLK_S_UNSUPP; else if (err != 0) *io->io_status = VTBLK_S_IOERR; else *io->io_status = VTBLK_S_OK; /* * Return the descriptor back to the host. * We wrote 1 byte (our status) to host. */ vq_relchain(&sc->vbsc_vq, io->io_idx, 1); vq_endchains(&sc->vbsc_vq, 0); } #ifdef BHYVE_SNAPSHOT static void pci_vtblk_pause(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device pause requested !\n")); blockif_pause(sc->bc); } static void pci_vtblk_resume(void *vsc) { struct pci_vtblk_softc *sc = vsc; DPRINTF(("vtblk: device resume requested !\n")); blockif_resume(sc->bc); } static int pci_vtblk_snapshot(void *vsc, struct vm_snapshot_meta *meta) { int ret; struct pci_vtblk_softc *sc = vsc; SNAPSHOT_VAR_OR_LEAVE(sc->vbsc_cfg, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(sc->vbsc_ident, sizeof(sc->vbsc_ident), meta, ret, done); done: return (ret); } #endif static void pci_vtblk_done(struct blockif_req *br, int err) { struct pci_vtblk_ioreq *io = br->br_param; struct pci_vtblk_softc *sc = io->io_sc; pthread_mutex_lock(&sc->vsc_mtx); pci_vtblk_done_locked(io, err); pthread_mutex_unlock(&sc->vsc_mtx); } static void pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) { struct virtio_blk_hdr *vbh; struct pci_vtblk_ioreq *io; int i, n; int err; ssize_t iolen; int writeop, type; struct vi_req req; struct iovec iov[BLOCKIF_IOV_MAX + 2]; struct virtio_blk_discard_write_zeroes *discard; n = vq_getchain(vq, iov, BLOCKIF_IOV_MAX + 2, &req); /* * The first descriptor will be the read-only fixed header, * and the last is for status (hence +2 above and below). * The remaining iov's are the actual data I/O vectors. * * XXX - note - this fails on crash dump, which does a * VIRTIO_BLK_T_FLUSH with a zero transfer length */ assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); io = &sc->vbsc_ios[req.idx]; assert(req.readable != 0); assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); vbh = (struct virtio_blk_hdr *)iov[0].iov_base; memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); io->io_req.br_iovcnt = n - 2; io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE; io->io_status = (uint8_t *)iov[--n].iov_base; assert(req.writable != 0); assert(iov[n].iov_len == 1); /* * XXX * The guest should not be setting the BARRIER flag because * we don't advertise the capability. */ type = vbh->vbh_type & ~VBH_FLAG_BARRIER; writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD); /* * - Write op implies read-only descriptor * - Read/ident op implies write-only descriptor * * By taking away either the read-only fixed header or the write-only * status iovec, the following condition should hold true. */ assert(n == (writeop ? req.readable : req.writable)); iolen = 0; for (i = 1; i < n; i++) { iolen += iov[i].iov_len; } io->io_req.br_resid = iolen; DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld", writeop ? "write/discard" : "read/ident", iolen, i - 1, io->io_req.br_offset)); switch (type) { case VBH_OP_READ: err = blockif_read(sc->bc, &io->io_req); break; case VBH_OP_WRITE: err = blockif_write(sc->bc, &io->io_req); break; case VBH_OP_DISCARD: /* * We currently only support a single request, if the guest * has submitted a request that doesn't conform to the * requirements, we return a error. */ if (iov[1].iov_len != sizeof (*discard)) { pci_vtblk_done_locked(io, EINVAL); return; } /* The segments to discard are provided rather than data */ discard = (struct virtio_blk_discard_write_zeroes *) iov[1].iov_base; /* * virtio v1.1 5.2.6.2: * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP * for discard and write zeroes commands if any unknown flag is * set. Furthermore, the device MUST set the status byte to * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag * is set. * * Currently there are no known flags for a DISCARD request. */ if (discard->flags.unmap != 0 || discard->flags.reserved != 0) { pci_vtblk_done_locked(io, ENOTSUP); return; } /* Make sure the request doesn't exceed our size limit */ if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) { pci_vtblk_done_locked(io, EINVAL); return; } io->io_req.br_offset = discard->sector * VTBLK_BSIZE; io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE; err = blockif_delete(sc->bc, &io->io_req); break; case VBH_OP_FLUSH: case VBH_OP_FLUSH_OUT: err = blockif_flush(sc->bc, &io->io_req); break; case VBH_OP_IDENT: /* Assume a single buffer */ /* S/n equal to buffer is not zero-terminated. */ memset(iov[1].iov_base, 0, iov[1].iov_len); strncpy(iov[1].iov_base, sc->vbsc_ident, MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); pci_vtblk_done_locked(io, 0); return; default: pci_vtblk_done_locked(io, EOPNOTSUPP); return; } assert(err == 0); } static void pci_vtblk_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtblk_softc *sc = vsc; while (vq_has_descs(vq)) pci_vtblk_proc(sc, vq); } static void pci_vtblk_resized(struct blockif_ctxt *bctxt __unused, void *arg, size_t new_size) { struct pci_vtblk_softc *sc; sc = arg; sc->vbsc_cfg.vbc_capacity = new_size / VTBLK_BSIZE; /* 512-byte units */ vi_interrupt(&sc->vbsc_vs, VIRTIO_PCI_ISR_CONFIG, sc->vbsc_vs.vs_msix_cfg_idx); } static int -pci_vtblk_init(struct vmctx *ctx __unused, struct pci_devinst *pi, - nvlist_t *nvl) +pci_vtblk_init(struct pci_devinst *pi, nvlist_t *nvl) { char bident[sizeof("XXX:XXX")]; struct blockif_ctxt *bctxt; const char *path, *serial; MD5_CTX mdctx; u_char digest[16]; struct pci_vtblk_softc *sc; off_t size; int i, sectsz, sts, sto; /* * The supplied backing file has to exist */ snprintf(bident, sizeof(bident), "%u:%u", pi->pi_slot, pi->pi_func); bctxt = blockif_open(nvl, bident); if (bctxt == NULL) { perror("Could not open backing file"); return (1); } size = blockif_size(bctxt); sectsz = blockif_sectsz(bctxt); blockif_psectsz(bctxt, &sts, &sto); sc = calloc(1, sizeof(struct pci_vtblk_softc)); sc->bc = bctxt; for (i = 0; i < VTBLK_RINGSZ; i++) { struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; io->io_req.br_callback = pci_vtblk_done; io->io_req.br_param = io; io->io_sc = sc; io->io_idx = i; } bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts)); if (blockif_candelete(sc->bc)) sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD; pthread_mutex_init(&sc->vsc_mtx, NULL); /* init virtio softc and virtqueues */ vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq); sc->vbsc_vs.vs_mtx = &sc->vsc_mtx; sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ /* * If an explicit identifier is not given, create an * identifier using parts of the md5 sum of the filename. */ bzero(sc->vbsc_ident, VTBLK_BLK_ID_BYTES); if ((serial = get_config_value_node(nvl, "serial")) != NULL || (serial = get_config_value_node(nvl, "ser")) != NULL) { strlcpy(sc->vbsc_ident, serial, VTBLK_BLK_ID_BYTES); } else { path = get_config_value_node(nvl, "path"); MD5Init(&mdctx); MD5Update(&mdctx, path, strlen(path)); MD5Final(digest, &mdctx); snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); } /* setup virtio block config space */ sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ /* * If Linux is presented with a seg_max greater than the virtio queue * size, it can stumble into situations where it violates its own * invariants and panics. For safety, we keep seg_max clamped, paying * heed to the two extra descriptors needed for the header and status * of a request. */ sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ sc->vbsc_cfg.vbc_geometry.heads = 0; sc->vbsc_cfg.vbc_geometry.sectors = 0; sc->vbsc_cfg.vbc_blk_size = sectsz; sc->vbsc_cfg.vbc_topology.physical_block_exp = (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; sc->vbsc_cfg.vbc_topology.alignment_offset = (sto != 0) ? ((sts - sto) / sectsz) : 0; sc->vbsc_cfg.vbc_topology.min_io_size = 0; sc->vbsc_cfg.vbc_topology.opt_io_size = 0; sc->vbsc_cfg.vbc_writeback = 0; sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT; sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG; sc->vbsc_cfg.discard_sector_alignment = MAX(sectsz, sts) / VTBLK_BSIZE; /* * Should we move some of this into virtio.c? Could * have the device, class, and subdev_0 as fields in * the virtio constants structure. */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_BLOCK); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { blockif_close(sc->bc); free(sc); return (1); } vi_set_io_bar(&sc->vbsc_vs, 0); blockif_register_resize_callback(sc->bc, pci_vtblk_resized, sc); return (0); } static int pci_vtblk_cfgwrite(void *vsc __unused, int offset, int size __unused, uint32_t value __unused) { DPRINTF(("vtblk: write to readonly reg %d", offset)); return (1); } static int pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtblk_softc *sc = vsc; void *ptr; /* our caller has already verified offset and size */ ptr = (uint8_t *)&sc->vbsc_cfg + offset; memcpy(retval, ptr, size); return (0); } static const struct pci_devemu pci_de_vblk = { .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, .pe_legacy_config = blockif_legacy_config, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = vi_pci_snapshot, .pe_pause = vi_pci_pause, .pe_resume = vi_pci_resume, #endif }; PCI_EMUL_SET(pci_de_vblk); diff --git a/usr.sbin/bhyve/pci_virtio_console.c b/usr.sbin/bhyve/pci_virtio_console.c index 46005d155f49..0f57c7a8248f 100644 --- a/usr.sbin/bhyve/pci_virtio_console.c +++ b/usr.sbin/bhyve/pci_virtio_console.c @@ -1,761 +1,760 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 iXsystems Inc. * All rights reserved. * * This software was developed by Jakub Klama * under sponsorship from iXsystems Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "mevent.h" #include "sockstream.h" #define VTCON_RINGSZ 64 #define VTCON_MAXPORTS 16 #define VTCON_MAXQ (VTCON_MAXPORTS * 2 + 2) #define VTCON_DEVICE_READY 0 #define VTCON_DEVICE_ADD 1 #define VTCON_DEVICE_REMOVE 2 #define VTCON_PORT_READY 3 #define VTCON_CONSOLE_PORT 4 #define VTCON_CONSOLE_RESIZE 5 #define VTCON_PORT_OPEN 6 #define VTCON_PORT_NAME 7 #define VTCON_F_SIZE 0 #define VTCON_F_MULTIPORT 1 #define VTCON_F_EMERG_WRITE 2 #define VTCON_S_HOSTCAPS \ (VTCON_F_SIZE | VTCON_F_MULTIPORT | VTCON_F_EMERG_WRITE) static int pci_vtcon_debug; #define DPRINTF(params) if (pci_vtcon_debug) PRINTLN params #define WPRINTF(params) PRINTLN params struct pci_vtcon_softc; struct pci_vtcon_port; struct pci_vtcon_config; typedef void (pci_vtcon_cb_t)(struct pci_vtcon_port *, void *, struct iovec *, int); struct pci_vtcon_port { struct pci_vtcon_softc * vsp_sc; int vsp_id; const char * vsp_name; bool vsp_enabled; bool vsp_console; bool vsp_rx_ready; bool vsp_open; int vsp_rxq; int vsp_txq; void * vsp_arg; pci_vtcon_cb_t * vsp_cb; }; struct pci_vtcon_sock { struct pci_vtcon_port * vss_port; const char * vss_path; struct mevent * vss_server_evp; struct mevent * vss_conn_evp; int vss_server_fd; int vss_conn_fd; bool vss_open; }; struct pci_vtcon_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTCON_MAXQ]; pthread_mutex_t vsc_mtx; uint64_t vsc_cfg; uint64_t vsc_features; char * vsc_rootdir; int vsc_kq; bool vsc_ready; struct pci_vtcon_port vsc_control_port; struct pci_vtcon_port vsc_ports[VTCON_MAXPORTS]; struct pci_vtcon_config *vsc_config; }; struct pci_vtcon_config { uint16_t cols; uint16_t rows; uint32_t max_nr_ports; uint32_t emerg_wr; } __attribute__((packed)); struct pci_vtcon_control { uint32_t id; uint16_t event; uint16_t value; } __attribute__((packed)); struct pci_vtcon_console_resize { uint16_t cols; uint16_t rows; } __attribute__((packed)); static void pci_vtcon_reset(void *); static void pci_vtcon_notify_rx(void *, struct vqueue_info *); static void pci_vtcon_notify_tx(void *, struct vqueue_info *); static int pci_vtcon_cfgread(void *, int, int, uint32_t *); static int pci_vtcon_cfgwrite(void *, int, int, uint32_t); static void pci_vtcon_neg_features(void *, uint64_t); static void pci_vtcon_sock_accept(int, enum ev_type, void *); static void pci_vtcon_sock_rx(int, enum ev_type, void *); static void pci_vtcon_sock_tx(struct pci_vtcon_port *, void *, struct iovec *, int); static void pci_vtcon_control_send(struct pci_vtcon_softc *, struct pci_vtcon_control *, const void *, size_t); static void pci_vtcon_announce_port(struct pci_vtcon_port *); static void pci_vtcon_open_port(struct pci_vtcon_port *, bool); static struct virtio_consts vtcon_vi_consts = { .vc_name = "vtcon", .vc_nvq = VTCON_MAXQ, .vc_cfgsize = sizeof(struct pci_vtcon_config), .vc_reset = pci_vtcon_reset, .vc_cfgread = pci_vtcon_cfgread, .vc_cfgwrite = pci_vtcon_cfgwrite, .vc_apply_features = pci_vtcon_neg_features, .vc_hv_caps = VTCON_S_HOSTCAPS, }; static void pci_vtcon_reset(void *vsc) { struct pci_vtcon_softc *sc; sc = vsc; DPRINTF(("vtcon: device reset requested!")); vi_reset_dev(&sc->vsc_vs); } static void pci_vtcon_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtcon_softc *sc = vsc; sc->vsc_features = negotiated_features; } static int pci_vtcon_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtcon_softc *sc = vsc; void *ptr; ptr = (uint8_t *)sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static int pci_vtcon_cfgwrite(void *vsc __unused, int offset __unused, int size __unused, uint32_t val __unused) { return (0); } static inline struct pci_vtcon_port * pci_vtcon_vq_to_port(struct pci_vtcon_softc *sc, struct vqueue_info *vq) { uint16_t num = vq->vq_num; if (num == 0 || num == 1) return (&sc->vsc_ports[0]); if (num == 2 || num == 3) return (&sc->vsc_control_port); return (&sc->vsc_ports[(num / 2) - 1]); } static inline struct vqueue_info * pci_vtcon_port_to_vq(struct pci_vtcon_port *port, bool tx_queue) { int qnum; qnum = tx_queue ? port->vsp_txq : port->vsp_rxq; return (&port->vsp_sc->vsc_queues[qnum]); } static struct pci_vtcon_port * pci_vtcon_port_add(struct pci_vtcon_softc *sc, int port_id, const char *name, pci_vtcon_cb_t *cb, void *arg) { struct pci_vtcon_port *port; port = &sc->vsc_ports[port_id]; if (port->vsp_enabled) { errno = EBUSY; return (NULL); } port->vsp_id = port_id; port->vsp_sc = sc; port->vsp_name = name; port->vsp_cb = cb; port->vsp_arg = arg; if (port->vsp_id == 0) { /* port0 */ port->vsp_txq = 0; port->vsp_rxq = 1; } else { port->vsp_txq = (port_id + 1) * 2; port->vsp_rxq = port->vsp_txq + 1; } port->vsp_enabled = true; return (port); } static int pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *port_name, const nvlist_t *nvl) { struct pci_vtcon_sock *sock = NULL; struct sockaddr_un sun; const char *name, *path; char *cp, *pathcopy; long port; int s = -1, fd = -1, error = 0; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif port = strtol(port_name, &cp, 0); if (*cp != '\0' || port < 0 || port >= VTCON_MAXPORTS) { EPRINTLN("vtcon: Invalid port %s", port_name); error = -1; goto out; } path = get_config_value_node(nvl, "path"); if (path == NULL) { EPRINTLN("vtcon: required path missing for port %ld", port); error = -1; goto out; } sock = calloc(1, sizeof(struct pci_vtcon_sock)); if (sock == NULL) { error = -1; goto out; } s = socket(AF_UNIX, SOCK_STREAM, 0); if (s < 0) { error = -1; goto out; } pathcopy = strdup(path); if (pathcopy == NULL) { error = -1; goto out; } fd = open(dirname(pathcopy), O_RDONLY | O_DIRECTORY); if (fd < 0) { free(pathcopy); error = -1; goto out; } sun.sun_family = AF_UNIX; sun.sun_len = sizeof(struct sockaddr_un); strcpy(pathcopy, path); strlcpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path)); free(pathcopy); if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0) { error = -1; goto out; } if (fcntl(s, F_SETFL, O_NONBLOCK) < 0) { error = -1; goto out; } if (listen(s, 1) < 0) { error = -1; goto out; } #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); if (caph_rights_limit(s, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif name = get_config_value_node(nvl, "name"); if (name == NULL) { EPRINTLN("vtcon: required name missing for port %ld", port); error = -1; goto out; } sock->vss_port = pci_vtcon_port_add(sc, port, name, pci_vtcon_sock_tx, sock); if (sock->vss_port == NULL) { error = -1; goto out; } sock->vss_open = false; sock->vss_conn_fd = -1; sock->vss_server_fd = s; sock->vss_server_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_accept, sock); if (sock->vss_server_evp == NULL) { error = -1; goto out; } out: if (fd != -1) close(fd); if (error != 0) { if (s != -1) close(s); free(sock); } return (error); } static void pci_vtcon_sock_accept(int fd __unused, enum ev_type t __unused, void *arg) { struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; int s; s = accept(sock->vss_server_fd, NULL, NULL); if (s < 0) return; if (sock->vss_open) { close(s); return; } sock->vss_open = true; sock->vss_conn_fd = s; sock->vss_conn_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_rx, sock); pci_vtcon_open_port(sock->vss_port, true); } static void pci_vtcon_sock_rx(int fd __unused, enum ev_type t __unused, void *arg) { struct pci_vtcon_port *port; struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; struct vqueue_info *vq; struct vi_req req; struct iovec iov; static char dummybuf[2048]; int len, n; port = sock->vss_port; vq = pci_vtcon_port_to_vq(port, true); if (!sock->vss_open || !port->vsp_rx_ready) { len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); if (len == 0) goto close; return; } if (!vq_has_descs(vq)) { len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); vq_endchains(vq, 1); if (len == 0) goto close; return; } do { n = vq_getchain(vq, &iov, 1, &req); assert(n == 1); len = readv(sock->vss_conn_fd, &iov, n); if (len == 0 || (len < 0 && errno == EWOULDBLOCK)) { vq_retchains(vq, 1); vq_endchains(vq, 0); if (len == 0) goto close; return; } vq_relchain(vq, req.idx, len); } while (vq_has_descs(vq)); vq_endchains(vq, 1); close: mevent_delete_close(sock->vss_conn_evp); sock->vss_conn_fd = -1; sock->vss_open = false; } static void pci_vtcon_sock_tx(struct pci_vtcon_port *port __unused, void *arg __unused, struct iovec *iov, int niov) { struct pci_vtcon_sock *sock; int i, ret; sock = (struct pci_vtcon_sock *)arg; if (sock->vss_conn_fd == -1) return; for (i = 0; i < niov; i++) { ret = stream_write(sock->vss_conn_fd, iov[i].iov_base, iov[i].iov_len); if (ret <= 0) break; } if (ret <= 0) { mevent_delete_close(sock->vss_conn_evp); sock->vss_conn_fd = -1; sock->vss_open = false; } } static void pci_vtcon_control_tx(struct pci_vtcon_port *port, void *arg __unused, struct iovec *iov, int niov) { struct pci_vtcon_softc *sc; struct pci_vtcon_port *tmp; struct pci_vtcon_control resp, *ctrl; int i; assert(niov == 1); sc = port->vsp_sc; ctrl = (struct pci_vtcon_control *)iov->iov_base; switch (ctrl->event) { case VTCON_DEVICE_READY: sc->vsc_ready = true; /* set port ready events for registered ports */ for (i = 0; i < VTCON_MAXPORTS; i++) { tmp = &sc->vsc_ports[i]; if (tmp->vsp_enabled) pci_vtcon_announce_port(tmp); if (tmp->vsp_open) pci_vtcon_open_port(tmp, true); } break; case VTCON_PORT_READY: tmp = &sc->vsc_ports[ctrl->id]; if (ctrl->id >= VTCON_MAXPORTS || !tmp->vsp_enabled) { WPRINTF(("VTCON_PORT_READY event for unknown port %d", ctrl->id)); return; } if (tmp->vsp_console) { resp.event = VTCON_CONSOLE_PORT; resp.id = ctrl->id; resp.value = 1; pci_vtcon_control_send(sc, &resp, NULL, 0); } break; } } static void pci_vtcon_announce_port(struct pci_vtcon_port *port) { struct pci_vtcon_control event; event.id = port->vsp_id; event.event = VTCON_DEVICE_ADD; event.value = 1; pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); event.event = VTCON_PORT_NAME; pci_vtcon_control_send(port->vsp_sc, &event, port->vsp_name, strlen(port->vsp_name)); } static void pci_vtcon_open_port(struct pci_vtcon_port *port, bool open) { struct pci_vtcon_control event; if (!port->vsp_sc->vsc_ready) { port->vsp_open = true; return; } event.id = port->vsp_id; event.event = VTCON_PORT_OPEN; event.value = (int)open; pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); } static void pci_vtcon_control_send(struct pci_vtcon_softc *sc, struct pci_vtcon_control *ctrl, const void *payload, size_t len) { struct vqueue_info *vq; struct vi_req req; struct iovec iov; int n; vq = pci_vtcon_port_to_vq(&sc->vsc_control_port, true); if (!vq_has_descs(vq)) return; n = vq_getchain(vq, &iov, 1, &req); assert(n == 1); memcpy(iov.iov_base, ctrl, sizeof(struct pci_vtcon_control)); if (payload != NULL && len > 0) memcpy((uint8_t *)iov.iov_base + sizeof(struct pci_vtcon_control), payload, len); vq_relchain(vq, req.idx, sizeof(struct pci_vtcon_control) + len); vq_endchains(vq, 1); } static void pci_vtcon_notify_tx(void *vsc, struct vqueue_info *vq) { struct pci_vtcon_softc *sc; struct pci_vtcon_port *port; struct iovec iov[1]; struct vi_req req; int n; sc = vsc; port = pci_vtcon_vq_to_port(sc, vq); while (vq_has_descs(vq)) { n = vq_getchain(vq, iov, 1, &req); assert(n == 1); if (port != NULL) port->vsp_cb(port, port->vsp_arg, iov, 1); /* * Release this chain and handle more */ vq_relchain(vq, req.idx, 0); } vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ } static void pci_vtcon_notify_rx(void *vsc, struct vqueue_info *vq) { struct pci_vtcon_softc *sc; struct pci_vtcon_port *port; sc = vsc; port = pci_vtcon_vq_to_port(sc, vq); if (!port->vsp_rx_ready) { port->vsp_rx_ready = 1; vq_kick_disable(vq); } } /* * Each console device has a "port" node which contains nodes for * each port. Ports are numbered starting at 0. */ static int pci_vtcon_legacy_config_port(nvlist_t *nvl, int port, char *opt) { char *name, *path; char node_name[sizeof("XX")]; nvlist_t *port_nvl; name = strsep(&opt, "="); path = opt; if (path == NULL) { EPRINTLN("vtcon: port %s requires a path", name); return (-1); } if (port >= VTCON_MAXPORTS) { EPRINTLN("vtcon: too many ports"); return (-1); } snprintf(node_name, sizeof(node_name), "%d", port); port_nvl = create_relative_config_node(nvl, node_name); set_config_value_node(port_nvl, "name", name); set_config_value_node(port_nvl, "path", path); return (0); } static int pci_vtcon_legacy_config(nvlist_t *nvl, const char *opts) { char *opt, *str, *tofree; nvlist_t *ports_nvl; int error, port; ports_nvl = create_relative_config_node(nvl, "port"); tofree = str = strdup(opts); error = 0; port = 0; while ((opt = strsep(&str, ",")) != NULL) { error = pci_vtcon_legacy_config_port(ports_nvl, port, opt); if (error) break; port++; } free(tofree); return (error); } static int -pci_vtcon_init(struct vmctx *ctx __unused, struct pci_devinst *pi, - nvlist_t *nvl) +pci_vtcon_init(struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vtcon_softc *sc; nvlist_t *ports_nvl; int i; sc = calloc(1, sizeof(struct pci_vtcon_softc)); sc->vsc_config = calloc(1, sizeof(struct pci_vtcon_config)); sc->vsc_config->max_nr_ports = VTCON_MAXPORTS; sc->vsc_config->cols = 80; sc->vsc_config->rows = 25; pthread_mutex_init(&sc->vsc_mtx, NULL); vi_softc_linkup(&sc->vsc_vs, &vtcon_vi_consts, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; for (i = 0; i < VTCON_MAXQ; i++) { sc->vsc_queues[i].vq_qsize = VTCON_RINGSZ; sc->vsc_queues[i].vq_notify = i % 2 == 0 ? pci_vtcon_notify_rx : pci_vtcon_notify_tx; } /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_CONSOLE); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_CONSOLE); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vsc_vs, 0); /* create control port */ sc->vsc_control_port.vsp_sc = sc; sc->vsc_control_port.vsp_txq = 2; sc->vsc_control_port.vsp_rxq = 3; sc->vsc_control_port.vsp_cb = pci_vtcon_control_tx; sc->vsc_control_port.vsp_enabled = true; ports_nvl = find_relative_config_node(nvl, "port"); if (ports_nvl != NULL) { const char *name; void *cookie; int type; cookie = NULL; while ((name = nvlist_next(ports_nvl, &type, &cookie)) != NULL) { if (type != NV_TYPE_NVLIST) continue; if (pci_vtcon_sock_add(sc, name, nvlist_get_nvlist(ports_nvl, name)) < 0) { EPRINTLN("cannot create port %s: %s", name, strerror(errno)); return (1); } } } return (0); } static const struct pci_devemu pci_de_vcon = { .pe_emu = "virtio-console", .pe_init = pci_vtcon_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read, .pe_legacy_config = pci_vtcon_legacy_config, }; PCI_EMUL_SET(pci_de_vcon); diff --git a/usr.sbin/bhyve/pci_virtio_input.c b/usr.sbin/bhyve/pci_virtio_input.c index d4a39ed0afdd..503121d00422 100644 --- a/usr.sbin/bhyve/pci_virtio_input.c +++ b/usr.sbin/bhyve/pci_virtio_input.c @@ -1,780 +1,779 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2021 Beckhoff Automation GmbH & Co. KG * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * virtio input device emulation. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "mevent.h" #include "pci_emul.h" #include "virtio.h" #define VTINPUT_RINGSZ 64 #define VTINPUT_MAX_PKT_LEN 10 /* * Queue definitions. */ #define VTINPUT_EVENTQ 0 #define VTINPUT_STATUSQ 1 #define VTINPUT_MAXQ 2 static int pci_vtinput_debug; #define DPRINTF(params) \ if (pci_vtinput_debug) \ PRINTLN params #define WPRINTF(params) PRINTLN params enum vtinput_config_select { VTINPUT_CFG_UNSET = 0x00, VTINPUT_CFG_ID_NAME = 0x01, VTINPUT_CFG_ID_SERIAL = 0x02, VTINPUT_CFG_ID_DEVIDS = 0x03, VTINPUT_CFG_PROP_BITS = 0x10, VTINPUT_CFG_EV_BITS = 0x11, VTINPUT_CFG_ABS_INFO = 0x12 }; struct vtinput_absinfo { uint32_t min; uint32_t max; uint32_t fuzz; uint32_t flat; uint32_t res; } __packed; struct vtinput_devids { uint16_t bustype; uint16_t vendor; uint16_t product; uint16_t version; } __packed; struct vtinput_config { uint8_t select; uint8_t subsel; uint8_t size; uint8_t reserved[5]; union { char string[128]; uint8_t bitmap[128]; struct vtinput_absinfo abs; struct vtinput_devids ids; } u; } __packed; struct vtinput_event { uint16_t type; uint16_t code; uint32_t value; } __packed; struct vtinput_event_elem { struct vtinput_event event; struct iovec iov; uint16_t idx; }; struct vtinput_eventqueue { struct vtinput_event_elem *events; uint32_t size; uint32_t idx; }; /* * Per-device softc */ struct pci_vtinput_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTINPUT_MAXQ]; pthread_mutex_t vsc_mtx; const char *vsc_evdev; int vsc_fd; struct vtinput_config vsc_config; int vsc_config_valid; struct mevent *vsc_evp; struct vtinput_eventqueue vsc_eventqueue; }; static void pci_vtinput_reset(void *); static int pci_vtinput_cfgread(void *, int, int, uint32_t *); static int pci_vtinput_cfgwrite(void *, int, int, uint32_t); static struct virtio_consts vtinput_vi_consts = { .vc_name = "vtinput", .vc_nvq = VTINPUT_MAXQ, .vc_cfgsize = sizeof(struct vtinput_config), .vc_reset = pci_vtinput_reset, .vc_cfgread = pci_vtinput_cfgread, .vc_cfgwrite = pci_vtinput_cfgwrite, .vc_hv_caps = 0, }; static void pci_vtinput_reset(void *vsc) { struct pci_vtinput_softc *sc = vsc; DPRINTF(("%s: device reset requested", __func__)); vi_reset_dev(&sc->vsc_vs); } static void pci_vtinput_notify_eventq(void *vsc __unused, struct vqueue_info *vq __unused) { DPRINTF(("%s", __func__)); } static void pci_vtinput_notify_statusq(void *vsc, struct vqueue_info *vq) { struct pci_vtinput_softc *sc = vsc; while (vq_has_descs(vq)) { /* get descriptor chain */ struct iovec iov; struct vi_req req; const int n = vq_getchain(vq, &iov, 1, &req); if (n <= 0) { WPRINTF(("%s: invalid descriptor: %d", __func__, n)); return; } /* get event */ struct vtinput_event event; memcpy(&event, iov.iov_base, sizeof(event)); /* * on multi touch devices: * - host send EV_MSC to guest * - guest sends EV_MSC back to host * - host writes EV_MSC to evdev * - evdev saves EV_MSC in it's event buffer * - host receives an extra EV_MSC by reading the evdev event * buffer * - frames become larger and larger * avoid endless loops by ignoring EV_MSC */ if (event.type == EV_MSC) { vq_relchain(vq, req.idx, sizeof(event)); continue; } /* send event to evdev */ struct input_event host_event; host_event.type = event.type; host_event.code = event.code; host_event.value = event.value; if (gettimeofday(&host_event.time, NULL) != 0) { WPRINTF(("%s: failed gettimeofday", __func__)); } if (write(sc->vsc_fd, &host_event, sizeof(host_event)) == -1) { WPRINTF(("%s: failed to write host_event", __func__)); } vq_relchain(vq, req.idx, sizeof(event)); } vq_endchains(vq, 1); } static int pci_vtinput_get_bitmap(struct pci_vtinput_softc *sc, int cmd, int count) { if (count <= 0 || !sc) { return (-1); } /* query bitmap */ memset(sc->vsc_config.u.bitmap, 0, sizeof(sc->vsc_config.u.bitmap)); if (ioctl(sc->vsc_fd, cmd, sc->vsc_config.u.bitmap) < 0) { return (-1); } /* get number of set bytes in bitmap */ for (int i = count - 1; i >= 0; i--) { if (sc->vsc_config.u.bitmap[i]) { return i + 1; } } return (-1); } static int pci_vtinput_read_config_id_name(struct pci_vtinput_softc *sc) { char name[128]; if (ioctl(sc->vsc_fd, EVIOCGNAME(sizeof(name) - 1), name) < 0) { return (1); } memcpy(sc->vsc_config.u.string, name, sizeof(name)); sc->vsc_config.size = strnlen(name, sizeof(name)); return (0); } static int pci_vtinput_read_config_id_serial(struct pci_vtinput_softc *sc) { /* serial isn't supported */ sc->vsc_config.size = 0; return (0); } static int pci_vtinput_read_config_id_devids(struct pci_vtinput_softc *sc) { struct input_id devids; if (ioctl(sc->vsc_fd, EVIOCGID, &devids)) { return (1); } sc->vsc_config.u.ids.bustype = devids.bustype; sc->vsc_config.u.ids.vendor = devids.vendor; sc->vsc_config.u.ids.product = devids.product; sc->vsc_config.u.ids.version = devids.version; sc->vsc_config.size = sizeof(struct vtinput_devids); return (0); } static int pci_vtinput_read_config_prop_bits(struct pci_vtinput_softc *sc) { /* * Evdev bitmap countains 1 bit per count. Additionally evdev bitmaps * are arrays of longs instead of chars. Calculate how many longs are * required for evdev bitmap. Multiply that with sizeof(long) to get the * number of elements. */ const int count = howmany(INPUT_PROP_CNT, sizeof(long) * 8) * sizeof(long); const unsigned int cmd = EVIOCGPROP(count); const int size = pci_vtinput_get_bitmap(sc, cmd, count); if (size <= 0) { return (1); } sc->vsc_config.size = size; return (0); } static int pci_vtinput_read_config_ev_bits(struct pci_vtinput_softc *sc, uint8_t type) { int count; switch (type) { case EV_KEY: count = KEY_CNT; break; case EV_REL: count = REL_CNT; break; case EV_ABS: count = ABS_CNT; break; case EV_MSC: count = MSC_CNT; break; case EV_SW: count = SW_CNT; break; case EV_LED: count = LED_CNT; break; default: return (1); } /* * Evdev bitmap countains 1 bit per count. Additionally evdev bitmaps * are arrays of longs instead of chars. Calculate how many longs are * required for evdev bitmap. Multiply that with sizeof(long) to get the * number of elements. */ count = howmany(count, sizeof(long) * 8) * sizeof(long); const unsigned int cmd = EVIOCGBIT(sc->vsc_config.subsel, count); const int size = pci_vtinput_get_bitmap(sc, cmd, count); if (size <= 0) { return (1); } sc->vsc_config.size = size; return (0); } static int pci_vtinput_read_config_abs_info(struct pci_vtinput_softc *sc) { /* check if evdev has EV_ABS */ if (!pci_vtinput_read_config_ev_bits(sc, EV_ABS)) { return (1); } /* get abs information */ struct input_absinfo abs; if (ioctl(sc->vsc_fd, EVIOCGABS(sc->vsc_config.subsel), &abs) < 0) { return (1); } /* save abs information */ sc->vsc_config.u.abs.min = abs.minimum; sc->vsc_config.u.abs.max = abs.maximum; sc->vsc_config.u.abs.fuzz = abs.fuzz; sc->vsc_config.u.abs.flat = abs.flat; sc->vsc_config.u.abs.res = abs.resolution; sc->vsc_config.size = sizeof(struct vtinput_absinfo); return (0); } static int pci_vtinput_read_config(struct pci_vtinput_softc *sc) { switch (sc->vsc_config.select) { case VTINPUT_CFG_UNSET: return (0); case VTINPUT_CFG_ID_NAME: return pci_vtinput_read_config_id_name(sc); case VTINPUT_CFG_ID_SERIAL: return pci_vtinput_read_config_id_serial(sc); case VTINPUT_CFG_ID_DEVIDS: return pci_vtinput_read_config_id_devids(sc); case VTINPUT_CFG_PROP_BITS: return pci_vtinput_read_config_prop_bits(sc); case VTINPUT_CFG_EV_BITS: return pci_vtinput_read_config_ev_bits( sc, sc->vsc_config.subsel); case VTINPUT_CFG_ABS_INFO: return pci_vtinput_read_config_abs_info(sc); default: return (1); } } static int pci_vtinput_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtinput_softc *sc = vsc; /* check for valid offset and size */ if (offset + size > (int)sizeof(struct vtinput_config)) { WPRINTF(("%s: read to invalid offset/size %d/%d", __func__, offset, size)); memset(retval, 0, size); return (0); } /* read new config values, if select and subsel changed. */ if (!sc->vsc_config_valid) { if (pci_vtinput_read_config(sc) != 0) { DPRINTF(("%s: could not read config %d/%d", __func__, sc->vsc_config.select, sc->vsc_config.subsel)); memset(retval, 0, size); return (0); } sc->vsc_config_valid = 1; } uint8_t *ptr = (uint8_t *)&sc->vsc_config; memcpy(retval, ptr + offset, size); return (0); } static int pci_vtinput_cfgwrite(void *vsc, int offset, int size, uint32_t value) { struct pci_vtinput_softc *sc = vsc; /* guest can only write to select and subsel fields */ if (offset + size > 2) { WPRINTF(("%s: write to readonly reg %d", __func__, offset)); return (1); } /* copy value into config */ uint8_t *ptr = (uint8_t *)&sc->vsc_config; memcpy(ptr + offset, &value, size); /* select/subsel changed, query new config on next cfgread */ sc->vsc_config_valid = 0; return (0); } static int vtinput_eventqueue_add_event( struct vtinput_eventqueue *queue, struct input_event *e) { /* check if queue is full */ if (queue->idx >= queue->size) { /* alloc new elements for queue */ const uint32_t newSize = queue->idx; void *newPtr = realloc(queue->events, queue->size * sizeof(struct vtinput_event_elem)); if (newPtr == NULL) { WPRINTF(("%s: realloc memory for eventqueue failed!", __func__)); return (1); } queue->events = newPtr; queue->size = newSize; } /* save event */ struct vtinput_event *event = &queue->events[queue->idx].event; event->type = e->type; event->code = e->code; event->value = e->value; queue->idx++; return (0); } static void vtinput_eventqueue_clear(struct vtinput_eventqueue *queue) { /* just reset index to clear queue */ queue->idx = 0; } static void vtinput_eventqueue_send_events( struct vtinput_eventqueue *queue, struct vqueue_info *vq) { /* * First iteration through eventqueue: * Get descriptor chains. */ for (uint32_t i = 0; i < queue->idx; ++i) { /* get descriptor */ if (!vq_has_descs(vq)) { /* * We don't have enough descriptors for all events. * Return chains back to guest. */ vq_retchains(vq, i); WPRINTF(( "%s: not enough available descriptors, dropping %d events", __func__, queue->idx)); goto done; } /* get descriptor chain */ struct iovec iov; struct vi_req req; const int n = vq_getchain(vq, &iov, 1, &req); if (n <= 0) { WPRINTF(("%s: invalid descriptor: %d", __func__, n)); return; } if (n != 1) { WPRINTF( ("%s: invalid number of descriptors in chain: %d", __func__, n)); /* release invalid chain */ vq_relchain(vq, req.idx, 0); return; } if (iov.iov_len < sizeof(struct vtinput_event)) { WPRINTF(("%s: invalid descriptor length: %lu", __func__, iov.iov_len)); /* release invalid chain */ vq_relchain(vq, req.idx, 0); return; } /* save descriptor */ queue->events[i].iov = iov; queue->events[i].idx = req.idx; } /* * Second iteration through eventqueue: * Send events to guest by releasing chains */ for (uint32_t i = 0; i < queue->idx; ++i) { struct vtinput_event_elem event = queue->events[i]; memcpy(event.iov.iov_base, &event.event, sizeof(struct vtinput_event)); vq_relchain(vq, event.idx, sizeof(struct vtinput_event)); } done: /* clear queue and send interrupt to guest */ vtinput_eventqueue_clear(queue); vq_endchains(vq, 1); return; } static int vtinput_read_event_from_host(int fd, struct input_event *event) { const int len = read(fd, event, sizeof(struct input_event)); if (len != sizeof(struct input_event)) { if (len == -1 && errno != EAGAIN) { WPRINTF(("%s: event read failed! len = %d, errno = %d", __func__, len, errno)); } /* host doesn't have more events for us */ return (1); } return (0); } static void vtinput_read_event(int fd __attribute((unused)), enum ev_type t __attribute__((unused)), void *arg __attribute__((unused))) { struct pci_vtinput_softc *sc = arg; /* skip if driver isn't ready */ if (!(sc->vsc_vs.vs_status & VIRTIO_CONFIG_STATUS_DRIVER_OK)) return; /* read all events from host */ struct input_event event; while (vtinput_read_event_from_host(sc->vsc_fd, &event) == 0) { /* add events to our queue */ vtinput_eventqueue_add_event(&sc->vsc_eventqueue, &event); /* only send events to guest on EV_SYN or SYN_REPORT */ if (event.type != EV_SYN || event.type != SYN_REPORT) { continue; } /* send host events to guest */ vtinput_eventqueue_send_events( &sc->vsc_eventqueue, &sc->vsc_queues[VTINPUT_EVENTQ]); } } static int pci_vtinput_legacy_config(nvlist_t *nvl, const char *opts) { if (opts == NULL) return (-1); /* * parse opts: * virtio-input,/dev/input/eventX */ char *cp = strchr(opts, ','); if (cp == NULL) { set_config_value_node(nvl, "path", opts); return (0); } char *path = strndup(opts, cp - opts); set_config_value_node(nvl, "path", path); free(path); return (pci_parse_legacy_config(nvl, cp + 1)); } static int -pci_vtinput_init(struct vmctx *ctx __unused, struct pci_devinst *pi, - nvlist_t *nvl) +pci_vtinput_init(struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vtinput_softc *sc; /* * Keep it here. * Else it's possible to access it uninitialized by jumping to failed. */ pthread_mutexattr_t mtx_attr = NULL; sc = calloc(1, sizeof(struct pci_vtinput_softc)); sc->vsc_evdev = get_config_value_node(nvl, "path"); if (sc->vsc_evdev == NULL) { WPRINTF(("%s: missing required path config value", __func__)); goto failed; } /* * open evdev by using non blocking I/O: * read from /dev/input/eventX would block our thread otherwise */ sc->vsc_fd = open(sc->vsc_evdev, O_RDWR | O_NONBLOCK); if (sc->vsc_fd < 0) { WPRINTF(("%s: failed to open %s", __func__, sc->vsc_evdev)); goto failed; } /* check if evdev is really a evdev */ int evversion; int error = ioctl(sc->vsc_fd, EVIOCGVERSION, &evversion); if (error < 0) { WPRINTF(("%s: %s is no evdev", __func__, sc->vsc_evdev)); goto failed; } /* gain exclusive access to evdev */ error = ioctl(sc->vsc_fd, EVIOCGRAB, 1); if (error < 0) { WPRINTF(("%s: failed to grab %s", __func__, sc->vsc_evdev)); goto failed; } if (pthread_mutexattr_init(&mtx_attr)) { WPRINTF(("%s: init mutexattr failed", __func__)); goto failed; } if (pthread_mutexattr_settype(&mtx_attr, PTHREAD_MUTEX_RECURSIVE)) { WPRINTF(("%s: settype mutexattr failed", __func__)); goto failed; } if (pthread_mutex_init(&sc->vsc_mtx, &mtx_attr)) { WPRINTF(("%s: init mutex failed", __func__)); goto failed; } /* init softc */ sc->vsc_eventqueue.idx = 0; sc->vsc_eventqueue.size = VTINPUT_MAX_PKT_LEN; sc->vsc_eventqueue.events = calloc( sc->vsc_eventqueue.size, sizeof(struct vtinput_event_elem)); sc->vsc_config_valid = 0; if (sc->vsc_eventqueue.events == NULL) { WPRINTF(("%s: failed to alloc eventqueue", __func__)); goto failed; } /* register event handler */ sc->vsc_evp = mevent_add(sc->vsc_fd, EVF_READ, vtinput_read_event, sc); if (sc->vsc_evp == NULL) { WPRINTF(("%s: could not register mevent", __func__)); goto failed; } #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, CAP_WRITE); if (caph_rights_limit(sc->vsc_fd, &rights) == -1) { errx(EX_OSERR, "Unable to apply rights for sandbox"); } #endif /* link virtio to softc */ vi_softc_linkup( &sc->vsc_vs, &vtinput_vi_consts, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; /* init virtio queues */ sc->vsc_queues[VTINPUT_EVENTQ].vq_qsize = VTINPUT_RINGSZ; sc->vsc_queues[VTINPUT_EVENTQ].vq_notify = pci_vtinput_notify_eventq; sc->vsc_queues[VTINPUT_STATUSQ].vq_qsize = VTINPUT_RINGSZ; sc->vsc_queues[VTINPUT_STATUSQ].vq_notify = pci_vtinput_notify_statusq; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_INPUT); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_INPUTDEV); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_INPUTDEV_OTHER); pci_set_cfgdata8(pi, PCIR_REVID, VIRTIO_REV_INPUT); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_SUBDEV_INPUT); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_SUBVEN_INPUT); /* add MSI-X table BAR */ if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) goto failed; /* add virtio register */ vi_set_io_bar(&sc->vsc_vs, 0); return (0); failed: if (sc == NULL) { return (-1); } if (sc->vsc_evp) mevent_delete(sc->vsc_evp); if (sc->vsc_eventqueue.events) free(sc->vsc_eventqueue.events); if (sc->vsc_mtx) pthread_mutex_destroy(&sc->vsc_mtx); if (mtx_attr) pthread_mutexattr_destroy(&mtx_attr); if (sc->vsc_fd) close(sc->vsc_fd); free(sc); return (-1); } static struct pci_devemu pci_de_vinput = { .pe_emu = "virtio-input", .pe_init = pci_vtinput_init, .pe_legacy_config = pci_vtinput_legacy_config, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read, }; PCI_EMUL_SET(pci_de_vinput); diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c index 85fa09ca194b..92ea78826391 100644 --- a/usr.sbin/bhyve/pci_virtio_net.c +++ b/usr.sbin/bhyve/pci_virtio_net.c @@ -1,821 +1,820 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "mevent.h" #include "virtio.h" #include "net_utils.h" #include "net_backends.h" #include "iov.h" #define VTNET_RINGSZ 1024 #define VTNET_MAXSEGS 256 #define VTNET_MAX_PKT_LEN (65536 + 64) #define VTNET_MIN_MTU ETHERMIN #define VTNET_MAX_MTU 65535 #define VTNET_S_HOSTCAPS \ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | \ VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) /* * PCI config-space "registers" */ struct virtio_net_config { uint8_t mac[6]; uint16_t status; uint16_t max_virtqueue_pairs; uint16_t mtu; } __packed; /* * Queue definitions. */ #define VTNET_RXQ 0 #define VTNET_TXQ 1 #define VTNET_CTLQ 2 /* NB: not yet supported */ #define VTNET_MAXQ 3 /* * Debug printf */ static int pci_vtnet_debug; #define DPRINTF(params) if (pci_vtnet_debug) PRINTLN params #define WPRINTF(params) PRINTLN params /* * Per-device softc */ struct pci_vtnet_softc { struct virtio_softc vsc_vs; struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; pthread_mutex_t vsc_mtx; net_backend_t *vsc_be; bool features_negotiated; /* protected by rx_mtx */ int resetting; /* protected by tx_mtx */ uint64_t vsc_features; /* negotiated features */ pthread_mutex_t rx_mtx; int rx_merge; /* merged rx bufs in use */ pthread_t tx_tid; pthread_mutex_t tx_mtx; pthread_cond_t tx_cond; int tx_in_progress; size_t vhdrlen; size_t be_vhdrlen; struct virtio_net_config vsc_config; struct virtio_consts vsc_consts; }; static void pci_vtnet_reset(void *); /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ static int pci_vtnet_cfgread(void *, int, int, uint32_t *); static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); static void pci_vtnet_neg_features(void *, uint64_t); #ifdef BHYVE_SNAPSHOT static void pci_vtnet_pause(void *); static void pci_vtnet_resume(void *); static int pci_vtnet_snapshot(void *, struct vm_snapshot_meta *); #endif static struct virtio_consts vtnet_vi_consts = { .vc_name = "vtnet", .vc_nvq = VTNET_MAXQ - 1, .vc_cfgsize = sizeof(struct virtio_net_config), .vc_reset = pci_vtnet_reset, .vc_cfgread = pci_vtnet_cfgread, .vc_cfgwrite = pci_vtnet_cfgwrite, .vc_apply_features = pci_vtnet_neg_features, .vc_hv_caps = VTNET_S_HOSTCAPS, #ifdef BHYVE_SNAPSHOT .vc_pause = pci_vtnet_pause, .vc_resume = pci_vtnet_resume, .vc_snapshot = pci_vtnet_snapshot, #endif }; static void pci_vtnet_reset(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device reset requested !")); /* Acquire the RX lock to block RX processing. */ pthread_mutex_lock(&sc->rx_mtx); /* * Make sure receive operation is disabled at least until we * re-negotiate the features, since receive operation depends * on the value of sc->rx_merge and the header length, which * are both set in pci_vtnet_neg_features(). * Receive operation will be enabled again once the guest adds * the first receive buffers and kicks us. */ sc->features_negotiated = false; netbe_rx_disable(sc->vsc_be); /* Set sc->resetting and give a chance to the TX thread to stop. */ pthread_mutex_lock(&sc->tx_mtx); sc->resetting = 1; while (sc->tx_in_progress) { pthread_mutex_unlock(&sc->tx_mtx); usleep(10000); pthread_mutex_lock(&sc->tx_mtx); } /* * Now reset rings, MSI-X vectors, and negotiated capabilities. * Do that with the TX lock held, since we need to reset * sc->resetting. */ vi_reset_dev(&sc->vsc_vs); sc->resetting = 0; pthread_mutex_unlock(&sc->tx_mtx); pthread_mutex_unlock(&sc->rx_mtx); } static __inline struct iovec * iov_trim_hdr(struct iovec *iov, int *iovcnt, unsigned int hlen) { struct iovec *riov; if (iov[0].iov_len < hlen) { /* * Not enough header space in the first fragment. * That's not ok for us. */ return NULL; } iov[0].iov_len -= hlen; if (iov[0].iov_len == 0) { *iovcnt -= 1; if (*iovcnt == 0) { /* * Only space for the header. That's not * enough for us. */ return NULL; } riov = &iov[1]; } else { iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + hlen); riov = &iov[0]; } return (riov); } struct virtio_mrg_rxbuf_info { uint16_t idx; uint16_t pad; uint32_t len; }; static void pci_vtnet_rx(struct pci_vtnet_softc *sc) { int prepend_hdr_len = sc->vhdrlen - sc->be_vhdrlen; struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS]; struct iovec iov[VTNET_MAXSEGS + 1]; struct vqueue_info *vq; struct vi_req req; vq = &sc->vsc_queues[VTNET_RXQ]; /* Features must be negotiated */ if (!sc->features_negotiated) { return; } for (;;) { struct virtio_net_rxhdr *hdr; uint32_t riov_bytes; struct iovec *riov; uint32_t ulen; int riov_len; int n_chains; ssize_t rlen; ssize_t plen; plen = netbe_peek_recvlen(sc->vsc_be); if (plen <= 0) { /* * No more packets (plen == 0), or backend errored * (plen < 0). Interrupt if needed and stop. */ vq_endchains(vq, /*used_all_avail=*/0); return; } plen += prepend_hdr_len; /* * Get a descriptor chain to store the next ingress * packet. In case of mergeable rx buffers, get as * many chains as necessary in order to make room * for plen bytes. */ riov_bytes = 0; riov_len = 0; riov = iov; n_chains = 0; do { int n = vq_getchain(vq, riov, VTNET_MAXSEGS - riov_len, &req); info[n_chains].idx = req.idx; if (n == 0) { /* * No rx buffers. Enable RX kicks and double * check. */ vq_kick_enable(vq); if (!vq_has_descs(vq)) { /* * Still no buffers. Return the unused * chains (if any), interrupt if needed * (including for NOTIFY_ON_EMPTY), and * disable the backend until the next * kick. */ vq_retchains(vq, n_chains); vq_endchains(vq, /*used_all_avail=*/1); netbe_rx_disable(sc->vsc_be); return; } /* More rx buffers found, so keep going. */ vq_kick_disable(vq); continue; } assert(n >= 1 && riov_len + n <= VTNET_MAXSEGS); riov_len += n; if (!sc->rx_merge) { n_chains = 1; break; } info[n_chains].len = (uint32_t)count_iov(riov, n); riov_bytes += info[n_chains].len; riov += n; n_chains++; } while (riov_bytes < plen && riov_len < VTNET_MAXSEGS); riov = iov; hdr = riov[0].iov_base; if (prepend_hdr_len > 0) { /* * The frontend uses a virtio-net header, but the * backend does not. We need to prepend a zeroed * header. */ riov = iov_trim_hdr(riov, &riov_len, prepend_hdr_len); if (riov == NULL) { /* * The first collected chain is nonsensical, * as it is not even enough to store the * virtio-net header. Just drop it. */ vq_relchain(vq, info[0].idx, 0); vq_retchains(vq, n_chains - 1); continue; } memset(hdr, 0, prepend_hdr_len); } rlen = netbe_recv(sc->vsc_be, riov, riov_len); if (rlen != plen - prepend_hdr_len) { /* * If this happens it means there is something * wrong with the backend (e.g., some other * process is stealing our packets). */ WPRINTF(("netbe_recv: expected %zd bytes, " "got %zd", plen - prepend_hdr_len, rlen)); vq_retchains(vq, n_chains); continue; } ulen = (uint32_t)plen; /* * Publish the used buffers to the guest, reporting the * number of bytes that we wrote. */ if (!sc->rx_merge) { vq_relchain(vq, info[0].idx, ulen); } else { uint32_t iolen; int i = 0; do { iolen = info[i].len; if (iolen > ulen) { iolen = ulen; } vq_relchain_prepare(vq, info[i].idx, iolen); ulen -= iolen; i++; } while (ulen > 0); hdr->vrh_bufs = i; vq_relchain_publish(vq); assert(i == n_chains); } } } /* * Called when there is read activity on the backend file descriptor. * Each buffer posted by the guest is assumed to be able to contain * an entire ethernet frame + rx header. */ static void pci_vtnet_rx_callback(int fd __unused, enum ev_type type __unused, void *param) { struct pci_vtnet_softc *sc = param; pthread_mutex_lock(&sc->rx_mtx); pci_vtnet_rx(sc); pthread_mutex_unlock(&sc->rx_mtx); } /* Called on RX kick. */ static void pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * A qnotify means that the rx process can now begin. * Enable RX only if features are negotiated. */ pthread_mutex_lock(&sc->rx_mtx); if (!sc->features_negotiated) { pthread_mutex_unlock(&sc->rx_mtx); return; } vq_kick_disable(vq); netbe_rx_enable(sc->vsc_be); pthread_mutex_unlock(&sc->rx_mtx); } /* TX virtqueue processing, called by the TX thread. */ static void pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) { struct iovec iov[VTNET_MAXSEGS + 1]; struct iovec *siov = iov; struct vi_req req; ssize_t len; int n; /* * Obtain chain of descriptors. The first descriptor also * contains the virtio-net header. */ n = vq_getchain(vq, iov, VTNET_MAXSEGS, &req); assert(n >= 1 && n <= VTNET_MAXSEGS); if (sc->vhdrlen != sc->be_vhdrlen) { /* * The frontend uses a virtio-net header, but the backend * does not. We simply strip the header and ignore it, as * it should be zero-filled. */ siov = iov_trim_hdr(siov, &n, sc->vhdrlen); } if (siov == NULL) { /* The chain is nonsensical. Just drop it. */ len = 0; } else { len = netbe_send(sc->vsc_be, siov, n); if (len < 0) { /* * If send failed, report that 0 bytes * were read. */ len = 0; } } /* * Return the processed chain to the guest, reporting * the number of bytes that we read. */ vq_relchain(vq, req.idx, len); } /* Called on TX kick. */ static void pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) { struct pci_vtnet_softc *sc = vsc; /* * Any ring entries to process? */ if (!vq_has_descs(vq)) return; /* Signal the tx thread for processing */ pthread_mutex_lock(&sc->tx_mtx); vq_kick_disable(vq); if (sc->tx_in_progress == 0) pthread_cond_signal(&sc->tx_cond); pthread_mutex_unlock(&sc->tx_mtx); } /* * Thread which will handle processing of TX desc */ static void * pci_vtnet_tx_thread(void *param) { struct pci_vtnet_softc *sc = param; struct vqueue_info *vq; int error; vq = &sc->vsc_queues[VTNET_TXQ]; /* * Let us wait till the tx queue pointers get initialised & * first tx signaled */ pthread_mutex_lock(&sc->tx_mtx); error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); for (;;) { /* note - tx mutex is locked here */ while (sc->resetting || !vq_has_descs(vq)) { vq_kick_enable(vq); if (!sc->resetting && vq_has_descs(vq)) break; sc->tx_in_progress = 0; error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); } vq_kick_disable(vq); sc->tx_in_progress = 1; pthread_mutex_unlock(&sc->tx_mtx); do { /* * Run through entries, placing them into * iovecs and sending when an end-of-packet * is found */ pci_vtnet_proctx(sc, vq); } while (vq_has_descs(vq)); /* * Generate an interrupt if needed. */ vq_endchains(vq, /*used_all_avail=*/1); pthread_mutex_lock(&sc->tx_mtx); } } #ifdef notyet static void pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) { DPRINTF(("vtnet: control qnotify!")); } #endif static int -pci_vtnet_init(struct vmctx *ctx __unused, struct pci_devinst *pi, - nvlist_t *nvl) +pci_vtnet_init(struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vtnet_softc *sc; const char *value; char tname[MAXCOMLEN + 1]; unsigned long mtu = ETHERMTU; int err; /* * Allocate data structures for further virtio initializations. * sc also contains a copy of vtnet_vi_consts, since capabilities * change depending on the backend. */ sc = calloc(1, sizeof(struct pci_vtnet_softc)); sc->vsc_consts = vtnet_vi_consts; pthread_mutex_init(&sc->vsc_mtx, NULL); sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; #ifdef notyet sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; #endif value = get_config_value_node(nvl, "mac"); if (value != NULL) { err = net_parsemac(value, sc->vsc_config.mac); if (err) { free(sc); return (err); } } else net_genmac(pi, sc->vsc_config.mac); value = get_config_value_node(nvl, "mtu"); if (value != NULL) { err = net_parsemtu(value, &mtu); if (err) { free(sc); return (err); } if (mtu < VTNET_MIN_MTU || mtu > VTNET_MAX_MTU) { err = EINVAL; errno = EINVAL; free(sc); return (err); } sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MTU; } sc->vsc_config.mtu = mtu; /* Permit interfaces without a configured backend. */ if (get_config_value_node(nvl, "backend") != NULL) { err = netbe_init(&sc->vsc_be, nvl, pci_vtnet_rx_callback, sc); if (err) { free(sc); return (err); } } sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF | netbe_get_cap(sc->vsc_be); /* * Since we do not actually support multiqueue, * set the maximum virtqueue pairs to 1. */ sc->vsc_config.max_virtqueue_pairs = 1; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); /* Link is always up. */ sc->vsc_config.status = 1; vi_softc_linkup(&sc->vsc_vs, &sc->vsc_consts, sc, pi, sc->vsc_queues); sc->vsc_vs.vs_mtx = &sc->vsc_mtx; /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) { free(sc); return (1); } /* use BAR 0 to map config regs in IO space */ vi_set_io_bar(&sc->vsc_vs, 0); sc->resetting = 0; sc->rx_merge = 0; sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2; pthread_mutex_init(&sc->rx_mtx, NULL); /* * Initialize tx semaphore & spawn TX processing thread. * As of now, only one thread for TX desc processing is * spawned. */ sc->tx_in_progress = 0; pthread_mutex_init(&sc->tx_mtx, NULL); pthread_cond_init(&sc->tx_cond, NULL); pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, pi->pi_func); pthread_set_name_np(sc->tx_tid, tname); return (0); } static int pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) { struct pci_vtnet_softc *sc = vsc; void *ptr; if (offset < (int)sizeof(sc->vsc_config.mac)) { assert(offset + size <= (int)sizeof(sc->vsc_config.mac)); /* * The driver is allowed to change the MAC address */ ptr = &sc->vsc_config.mac[offset]; memcpy(ptr, &value, size); } else { /* silently ignore other writes */ DPRINTF(("vtnet: write to readonly reg %d", offset)); } return (0); } static int pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtnet_softc *sc = vsc; void *ptr; ptr = (uint8_t *)&sc->vsc_config + offset; memcpy(retval, ptr, size); return (0); } static void pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtnet_softc *sc = vsc; sc->vsc_features = negotiated_features; if (negotiated_features & VIRTIO_NET_F_MRG_RXBUF) { sc->vhdrlen = sizeof(struct virtio_net_rxhdr); sc->rx_merge = 1; } else { /* * Without mergeable rx buffers, virtio-net header is 2 * bytes shorter than sizeof(struct virtio_net_rxhdr). */ sc->vhdrlen = sizeof(struct virtio_net_rxhdr) - 2; sc->rx_merge = 0; } /* Tell the backend to enable some capabilities it has advertised. */ netbe_set_cap(sc->vsc_be, negotiated_features, sc->vhdrlen); sc->be_vhdrlen = netbe_get_vnet_hdr_len(sc->vsc_be); assert(sc->be_vhdrlen == 0 || sc->be_vhdrlen == sc->vhdrlen); pthread_mutex_lock(&sc->rx_mtx); sc->features_negotiated = true; pthread_mutex_unlock(&sc->rx_mtx); } #ifdef BHYVE_SNAPSHOT static void pci_vtnet_pause(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device pause requested !\n")); /* Acquire the RX lock to block RX processing. */ pthread_mutex_lock(&sc->rx_mtx); /* Wait for the transmit thread to finish its processing. */ pthread_mutex_lock(&sc->tx_mtx); while (sc->tx_in_progress) { pthread_mutex_unlock(&sc->tx_mtx); usleep(10000); pthread_mutex_lock(&sc->tx_mtx); } } static void pci_vtnet_resume(void *vsc) { struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device resume requested !\n")); pthread_mutex_unlock(&sc->tx_mtx); /* The RX lock should have been acquired in vtnet_pause. */ pthread_mutex_unlock(&sc->rx_mtx); } static int pci_vtnet_snapshot(void *vsc, struct vm_snapshot_meta *meta) { int ret; struct pci_vtnet_softc *sc = vsc; DPRINTF(("vtnet: device snapshot requested !\n")); /* * Queues and consts should have been saved by the more generic * vi_pci_snapshot function. We need to save only our features and * config. */ SNAPSHOT_VAR_OR_LEAVE(sc->vsc_features, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->features_negotiated, meta, ret, done); /* Force reapply negociated features at restore time */ if (meta->op == VM_SNAPSHOT_RESTORE && sc->features_negotiated) { pci_vtnet_neg_features(sc, sc->vsc_features); netbe_rx_enable(sc->vsc_be); } SNAPSHOT_VAR_OR_LEAVE(sc->vsc_config, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rx_merge, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->vhdrlen, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->be_vhdrlen, meta, ret, done); done: return (ret); } #endif static const struct pci_devemu pci_de_vnet = { .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, .pe_legacy_config = netbe_legacy_config, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = vi_pci_snapshot, .pe_pause = vi_pci_pause, .pe_resume = vi_pci_resume, #endif }; PCI_EMUL_SET(pci_de_vnet); diff --git a/usr.sbin/bhyve/pci_virtio_rnd.c b/usr.sbin/bhyve/pci_virtio_rnd.c index 416a1b605e9f..ff7e6b7fc7c8 100644 --- a/usr.sbin/bhyve/pci_virtio_rnd.c +++ b/usr.sbin/bhyve/pci_virtio_rnd.c @@ -1,213 +1,212 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Nahanni Systems Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * virtio entropy device emulation. * Randomness is sourced from /dev/random which does not block * once it has been seeded at bootup. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #define VTRND_RINGSZ 64 static int pci_vtrnd_debug; #define DPRINTF(params) if (pci_vtrnd_debug) PRINTLN params #define WPRINTF(params) PRINTLN params /* * Per-device softc */ struct pci_vtrnd_softc { struct virtio_softc vrsc_vs; struct vqueue_info vrsc_vq; pthread_mutex_t vrsc_mtx; uint64_t vrsc_cfg; int vrsc_fd; }; static void pci_vtrnd_reset(void *); static void pci_vtrnd_notify(void *, struct vqueue_info *); static struct virtio_consts vtrnd_vi_consts = { .vc_name = "vtrnd", .vc_nvq = 1, .vc_cfgsize = 0, .vc_reset = pci_vtrnd_reset, .vc_qnotify = pci_vtrnd_notify, .vc_hv_caps = 0, }; static void pci_vtrnd_reset(void *vsc) { struct pci_vtrnd_softc *sc; sc = vsc; DPRINTF(("vtrnd: device reset requested !")); vi_reset_dev(&sc->vrsc_vs); } static void pci_vtrnd_notify(void *vsc, struct vqueue_info *vq) { struct iovec iov; struct pci_vtrnd_softc *sc; struct vi_req req; int len, n; sc = vsc; if (sc->vrsc_fd < 0) { vq_endchains(vq, 0); return; } while (vq_has_descs(vq)) { n = vq_getchain(vq, &iov, 1, &req); assert(n == 1); len = read(sc->vrsc_fd, iov.iov_base, iov.iov_len); DPRINTF(("vtrnd: vtrnd_notify(): %d", len)); /* Catastrophe if unable to read from /dev/random */ assert(len > 0); /* * Release this chain and handle more */ vq_relchain(vq, req.idx, len); } vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ } static int -pci_vtrnd_init(struct vmctx *ctx __unused, struct pci_devinst *pi, - nvlist_t *nvl __unused) +pci_vtrnd_init(struct pci_devinst *pi, nvlist_t *nvl __unused) { struct pci_vtrnd_softc *sc; int fd; int len; uint8_t v; #ifndef WITHOUT_CAPSICUM cap_rights_t rights; #endif /* * Should always be able to open /dev/random. */ fd = open("/dev/random", O_RDONLY | O_NONBLOCK); assert(fd >= 0); #ifndef WITHOUT_CAPSICUM cap_rights_init(&rights, CAP_READ); if (caph_rights_limit(fd, &rights) == -1) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif /* * Check that device is seeded and non-blocking. */ len = read(fd, &v, sizeof(v)); if (len <= 0) { WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len)); close(fd); return (1); } sc = calloc(1, sizeof(struct pci_vtrnd_softc)); pthread_mutex_init(&sc->vrsc_mtx, NULL); vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq); sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx; sc->vrsc_vq.vq_qsize = VTRND_RINGSZ; /* keep /dev/random opened while emulating */ sc->vrsc_fd = fd; /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_RANDOM); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_ENTROPY); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vrsc_vs, 0); return (0); } static const struct pci_devemu pci_de_vrnd = { .pe_emu = "virtio-rnd", .pe_init = pci_vtrnd_init, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = vi_pci_snapshot, #endif }; PCI_EMUL_SET(pci_de_vrnd); diff --git a/usr.sbin/bhyve/pci_virtio_scsi.c b/usr.sbin/bhyve/pci_virtio_scsi.c index bad95e4e85b0..7da825ad4050 100644 --- a/usr.sbin/bhyve/pci_virtio_scsi.c +++ b/usr.sbin/bhyve/pci_virtio_scsi.c @@ -1,763 +1,762 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Jakub Klama . * Copyright (c) 2018 Marcelo Araujo . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" #include "iov.h" #define VTSCSI_RINGSZ 64 #define VTSCSI_REQUESTQ 1 #define VTSCSI_THR_PER_Q 16 #define VTSCSI_MAXQ (VTSCSI_REQUESTQ + 2) #define VTSCSI_MAXSEG 64 #define VTSCSI_IN_HEADER_LEN(_sc) \ (sizeof(struct pci_vtscsi_req_cmd_rd) + _sc->vss_config.cdb_size) #define VTSCSI_OUT_HEADER_LEN(_sc) \ (sizeof(struct pci_vtscsi_req_cmd_wr) + _sc->vss_config.sense_size) #define VIRTIO_SCSI_MAX_CHANNEL 0 #define VIRTIO_SCSI_MAX_TARGET 0 #define VIRTIO_SCSI_MAX_LUN 16383 #define VIRTIO_SCSI_F_INOUT (1 << 0) #define VIRTIO_SCSI_F_HOTPLUG (1 << 1) #define VIRTIO_SCSI_F_CHANGE (1 << 2) static int pci_vtscsi_debug = 0; #define WPRINTF(msg, params...) PRINTLN("virtio-scsi: " msg, ##params) #define DPRINTF(msg, params...) if (pci_vtscsi_debug) WPRINTF(msg, ##params) struct pci_vtscsi_config { uint32_t num_queues; uint32_t seg_max; uint32_t max_sectors; uint32_t cmd_per_lun; uint32_t event_info_size; uint32_t sense_size; uint32_t cdb_size; uint16_t max_channel; uint16_t max_target; uint32_t max_lun; } __attribute__((packed)); struct pci_vtscsi_queue { struct pci_vtscsi_softc * vsq_sc; struct vqueue_info * vsq_vq; pthread_mutex_t vsq_mtx; pthread_mutex_t vsq_qmtx; pthread_cond_t vsq_cv; STAILQ_HEAD(, pci_vtscsi_request) vsq_requests; LIST_HEAD(, pci_vtscsi_worker) vsq_workers; }; struct pci_vtscsi_worker { struct pci_vtscsi_queue * vsw_queue; pthread_t vsw_thread; bool vsw_exiting; LIST_ENTRY(pci_vtscsi_worker) vsw_link; }; struct pci_vtscsi_request { struct pci_vtscsi_queue * vsr_queue; struct iovec vsr_iov_in[VTSCSI_MAXSEG]; int vsr_niov_in; struct iovec vsr_iov_out[VTSCSI_MAXSEG]; int vsr_niov_out; uint32_t vsr_idx; STAILQ_ENTRY(pci_vtscsi_request) vsr_link; }; /* * Per-device softc */ struct pci_vtscsi_softc { struct virtio_softc vss_vs; struct vqueue_info vss_vq[VTSCSI_MAXQ]; struct pci_vtscsi_queue vss_queues[VTSCSI_REQUESTQ]; pthread_mutex_t vss_mtx; int vss_iid; int vss_ctl_fd; uint32_t vss_features; struct pci_vtscsi_config vss_config; }; #define VIRTIO_SCSI_T_TMF 0 #define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 #define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 #define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 #define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 #define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 #define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 #define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 #define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 /* command-specific response values */ #define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 #define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 #define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 struct pci_vtscsi_ctrl_tmf { uint32_t type; uint32_t subtype; uint8_t lun[8]; uint64_t id; uint8_t response; } __attribute__((packed)); #define VIRTIO_SCSI_T_AN_QUERY 1 #define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 #define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 #define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 #define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 #define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 #define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 struct pci_vtscsi_ctrl_an { uint32_t type; uint8_t lun[8]; uint32_t event_requested; uint32_t event_actual; uint8_t response; } __attribute__((packed)); /* command-specific response values */ #define VIRTIO_SCSI_S_OK 0 #define VIRTIO_SCSI_S_OVERRUN 1 #define VIRTIO_SCSI_S_ABORTED 2 #define VIRTIO_SCSI_S_BAD_TARGET 3 #define VIRTIO_SCSI_S_RESET 4 #define VIRTIO_SCSI_S_BUSY 5 #define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 #define VIRTIO_SCSI_S_TARGET_FAILURE 7 #define VIRTIO_SCSI_S_NEXUS_FAILURE 8 #define VIRTIO_SCSI_S_FAILURE 9 #define VIRTIO_SCSI_S_INCORRECT_LUN 12 /* task_attr */ #define VIRTIO_SCSI_S_SIMPLE 0 #define VIRTIO_SCSI_S_ORDERED 1 #define VIRTIO_SCSI_S_HEAD 2 #define VIRTIO_SCSI_S_ACA 3 struct pci_vtscsi_event { uint32_t event; uint8_t lun[8]; uint32_t reason; } __attribute__((packed)); struct pci_vtscsi_req_cmd_rd { uint8_t lun[8]; uint64_t id; uint8_t task_attr; uint8_t prio; uint8_t crn; uint8_t cdb[]; } __attribute__((packed)); struct pci_vtscsi_req_cmd_wr { uint32_t sense_len; uint32_t residual; uint16_t status_qualifier; uint8_t status; uint8_t response; uint8_t sense[]; } __attribute__((packed)); static void *pci_vtscsi_proc(void *); static void pci_vtscsi_reset(void *); static void pci_vtscsi_neg_features(void *, uint64_t); static int pci_vtscsi_cfgread(void *, int, int, uint32_t *); static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t); static inline int pci_vtscsi_get_lun(uint8_t *); static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t); static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *, struct pci_vtscsi_ctrl_tmf *); static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *, struct pci_vtscsi_ctrl_an *); static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *, int, struct iovec *, int); static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *); static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *); static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *); static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *, struct pci_vtscsi_queue *, int); -static int pci_vtscsi_init(struct vmctx *, struct pci_devinst *, nvlist_t *); +static int pci_vtscsi_init(struct pci_devinst *, nvlist_t *); static struct virtio_consts vtscsi_vi_consts = { .vc_name = "vtscsi", .vc_nvq = VTSCSI_MAXQ, .vc_cfgsize = sizeof(struct pci_vtscsi_config), .vc_reset = pci_vtscsi_reset, .vc_cfgread = pci_vtscsi_cfgread, .vc_cfgwrite = pci_vtscsi_cfgwrite, .vc_apply_features = pci_vtscsi_neg_features, .vc_hv_caps = 0, }; static void * pci_vtscsi_proc(void *arg) { struct pci_vtscsi_worker *worker = (struct pci_vtscsi_worker *)arg; struct pci_vtscsi_queue *q = worker->vsw_queue; struct pci_vtscsi_request *req; int iolen; for (;;) { pthread_mutex_lock(&q->vsq_mtx); while (STAILQ_EMPTY(&q->vsq_requests) && !worker->vsw_exiting) pthread_cond_wait(&q->vsq_cv, &q->vsq_mtx); if (worker->vsw_exiting) break; req = STAILQ_FIRST(&q->vsq_requests); STAILQ_REMOVE_HEAD(&q->vsq_requests, vsr_link); pthread_mutex_unlock(&q->vsq_mtx); iolen = pci_vtscsi_request_handle(q, req->vsr_iov_in, req->vsr_niov_in, req->vsr_iov_out, req->vsr_niov_out); pthread_mutex_lock(&q->vsq_qmtx); vq_relchain(q->vsq_vq, req->vsr_idx, iolen); vq_endchains(q->vsq_vq, 0); pthread_mutex_unlock(&q->vsq_qmtx); DPRINTF("request completed", req->vsr_idx); free(req); } pthread_mutex_unlock(&q->vsq_mtx); return (NULL); } static void pci_vtscsi_reset(void *vsc) { struct pci_vtscsi_softc *sc; sc = vsc; DPRINTF("device reset requested"); vi_reset_dev(&sc->vss_vs); /* initialize config structure */ sc->vss_config = (struct pci_vtscsi_config){ .num_queues = VTSCSI_REQUESTQ, /* Leave room for the request and the response. */ .seg_max = VTSCSI_MAXSEG - 2, .max_sectors = 2, .cmd_per_lun = 1, .event_info_size = sizeof(struct pci_vtscsi_event), .sense_size = 96, .cdb_size = 32, .max_channel = VIRTIO_SCSI_MAX_CHANNEL, .max_target = VIRTIO_SCSI_MAX_TARGET, .max_lun = VIRTIO_SCSI_MAX_LUN }; } static void pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features) { struct pci_vtscsi_softc *sc = vsc; sc->vss_features = negotiated_features; } static int pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval) { struct pci_vtscsi_softc *sc = vsc; void *ptr; ptr = (uint8_t *)&sc->vss_config + offset; memcpy(retval, ptr, size); return (0); } static int pci_vtscsi_cfgwrite(void *vsc __unused, int offset __unused, int size __unused, uint32_t val __unused) { return (0); } static inline int pci_vtscsi_get_lun(uint8_t *lun) { return (((lun[2] << 8) | lun[3]) & 0x3fff); } static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf, size_t bufsize) { struct pci_vtscsi_ctrl_tmf *tmf; struct pci_vtscsi_ctrl_an *an; uint32_t type; if (bufsize < sizeof(uint32_t)) { WPRINTF("ignoring truncated control request"); return (0); } type = *(uint32_t *)buf; if (type == VIRTIO_SCSI_T_TMF) { if (bufsize != sizeof(*tmf)) { WPRINTF("ignoring tmf request with size %zu", bufsize); return (0); } tmf = (struct pci_vtscsi_ctrl_tmf *)buf; return (pci_vtscsi_tmf_handle(sc, tmf)); } if (type == VIRTIO_SCSI_T_AN_QUERY) { if (bufsize != sizeof(*an)) { WPRINTF("ignoring AN request with size %zu", bufsize); return (0); } an = (struct pci_vtscsi_ctrl_an *)buf; return (pci_vtscsi_an_handle(sc, an)); } return (0); } static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc, struct pci_vtscsi_ctrl_tmf *tmf) { union ctl_io *io; int err; io = ctl_scsi_alloc_io(sc->vss_iid); ctl_scsi_zero_io(io); io->io_hdr.io_type = CTL_IO_TASK; io->io_hdr.nexus.initid = sc->vss_iid; io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun); io->taskio.tag_type = CTL_TAG_SIMPLE; io->taskio.tag_num = (uint32_t)tmf->id; switch (tmf->subtype) { case VIRTIO_SCSI_T_TMF_ABORT_TASK: io->taskio.task_action = CTL_TASK_ABORT_TASK; break; case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: io->taskio.task_action = CTL_TASK_ABORT_TASK_SET; break; case VIRTIO_SCSI_T_TMF_CLEAR_ACA: io->taskio.task_action = CTL_TASK_CLEAR_ACA; break; case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: io->taskio.task_action = CTL_TASK_CLEAR_TASK_SET; break; case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET; break; case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: io->taskio.task_action = CTL_TASK_LUN_RESET; break; case VIRTIO_SCSI_T_TMF_QUERY_TASK: io->taskio.task_action = CTL_TASK_QUERY_TASK; break; case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: io->taskio.task_action = CTL_TASK_QUERY_TASK_SET; break; } if (pci_vtscsi_debug) { struct sbuf *sb = sbuf_new_auto(); ctl_io_sbuf(io, sb); sbuf_finish(sb); DPRINTF("%s", sbuf_data(sb)); sbuf_delete(sb); } err = ioctl(sc->vss_ctl_fd, CTL_IO, io); if (err != 0) WPRINTF("CTL_IO: err=%d (%s)", errno, strerror(errno)); tmf->response = io->taskio.task_status; ctl_scsi_free_io(io); return (1); } static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc __unused, struct pci_vtscsi_ctrl_an *an __unused) { return (0); } static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in, int niov_in, struct iovec *iov_out, int niov_out) { struct pci_vtscsi_softc *sc = q->vsq_sc; struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL; struct pci_vtscsi_req_cmd_wr *cmd_wr; struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG]; union ctl_io *io; int data_niov_in, data_niov_out; void *ext_data_ptr = NULL; uint32_t ext_data_len = 0, ext_sg_entries = 0; int err, nxferred; if (count_iov(iov_out, niov_out) < VTSCSI_OUT_HEADER_LEN(sc)) { WPRINTF("ignoring request with insufficient output"); return (0); } if (count_iov(iov_in, niov_in) < VTSCSI_IN_HEADER_LEN(sc)) { WPRINTF("ignoring request with incomplete header"); return (0); } seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in, VTSCSI_IN_HEADER_LEN(sc)); seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out, VTSCSI_OUT_HEADER_LEN(sc)); truncate_iov(iov_in, &niov_in, VTSCSI_IN_HEADER_LEN(sc)); truncate_iov(iov_out, &niov_out, VTSCSI_OUT_HEADER_LEN(sc)); iov_to_buf(iov_in, niov_in, (void **)&cmd_rd); cmd_wr = calloc(1, VTSCSI_OUT_HEADER_LEN(sc)); io = ctl_scsi_alloc_io(sc->vss_iid); ctl_scsi_zero_io(io); io->io_hdr.nexus.initid = sc->vss_iid; io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun); io->io_hdr.io_type = CTL_IO_SCSI; if (data_niov_in > 0) { ext_data_ptr = (void *)data_iov_in; ext_sg_entries = data_niov_in; ext_data_len = count_iov(data_iov_in, data_niov_in); io->io_hdr.flags |= CTL_FLAG_DATA_OUT; } else if (data_niov_out > 0) { ext_data_ptr = (void *)data_iov_out; ext_sg_entries = data_niov_out; ext_data_len = count_iov(data_iov_out, data_niov_out); io->io_hdr.flags |= CTL_FLAG_DATA_IN; } io->scsiio.sense_len = sc->vss_config.sense_size; io->scsiio.tag_num = (uint32_t)cmd_rd->id; switch (cmd_rd->task_attr) { case VIRTIO_SCSI_S_ORDERED: io->scsiio.tag_type = CTL_TAG_ORDERED; break; case VIRTIO_SCSI_S_HEAD: io->scsiio.tag_type = CTL_TAG_HEAD_OF_QUEUE; break; case VIRTIO_SCSI_S_ACA: io->scsiio.tag_type = CTL_TAG_ACA; break; case VIRTIO_SCSI_S_SIMPLE: default: io->scsiio.tag_type = CTL_TAG_SIMPLE; break; } io->scsiio.ext_sg_entries = ext_sg_entries; io->scsiio.ext_data_ptr = ext_data_ptr; io->scsiio.ext_data_len = ext_data_len; io->scsiio.ext_data_filled = 0; io->scsiio.cdb_len = sc->vss_config.cdb_size; memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size); if (pci_vtscsi_debug) { struct sbuf *sb = sbuf_new_auto(); ctl_io_sbuf(io, sb); sbuf_finish(sb); DPRINTF("%s", sbuf_data(sb)); sbuf_delete(sb); } err = ioctl(sc->vss_ctl_fd, CTL_IO, io); if (err != 0) { WPRINTF("CTL_IO: err=%d (%s)", errno, strerror(errno)); cmd_wr->response = VIRTIO_SCSI_S_FAILURE; } else { cmd_wr->sense_len = MIN(io->scsiio.sense_len, sc->vss_config.sense_size); cmd_wr->residual = ext_data_len - io->scsiio.ext_data_filled; cmd_wr->status = io->scsiio.scsi_status; cmd_wr->response = VIRTIO_SCSI_S_OK; memcpy(&cmd_wr->sense, &io->scsiio.sense_data, cmd_wr->sense_len); } buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0); nxferred = VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled; free(cmd_rd); free(cmd_wr); ctl_scsi_free_io(io); return (nxferred); } static void pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtscsi_softc *sc; struct iovec iov[VTSCSI_MAXSEG]; struct vi_req req; void *buf = NULL; size_t bufsize; int iolen, n; sc = vsc; while (vq_has_descs(vq)) { n = vq_getchain(vq, iov, VTSCSI_MAXSEG, &req); assert(n >= 1 && n <= VTSCSI_MAXSEG); bufsize = iov_to_buf(iov, n, &buf); iolen = pci_vtscsi_control_handle(sc, buf, bufsize); buf_to_iov((uint8_t *)buf + bufsize - iolen, iolen, iov, n, bufsize - iolen); /* * Release this chain and handle more */ vq_relchain(vq, req.idx, iolen); } vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ free(buf); } static void pci_vtscsi_eventq_notify(void *vsc __unused, struct vqueue_info *vq) { vq_kick_disable(vq); } static void pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtscsi_softc *sc; struct pci_vtscsi_queue *q; struct pci_vtscsi_request *req; struct iovec iov[VTSCSI_MAXSEG]; struct vi_req vireq; int n; sc = vsc; q = &sc->vss_queues[vq->vq_num - 2]; while (vq_has_descs(vq)) { n = vq_getchain(vq, iov, VTSCSI_MAXSEG, &vireq); assert(n >= 1 && n <= VTSCSI_MAXSEG); req = calloc(1, sizeof(struct pci_vtscsi_request)); req->vsr_idx = vireq.idx; req->vsr_queue = q; req->vsr_niov_in = vireq.readable; req->vsr_niov_out = vireq.writable; memcpy(req->vsr_iov_in, iov, req->vsr_niov_in * sizeof(struct iovec)); memcpy(req->vsr_iov_out, iov + vireq.readable, req->vsr_niov_out * sizeof(struct iovec)); pthread_mutex_lock(&q->vsq_mtx); STAILQ_INSERT_TAIL(&q->vsq_requests, req, vsr_link); pthread_cond_signal(&q->vsq_cv); pthread_mutex_unlock(&q->vsq_mtx); DPRINTF("request enqueued", vireq.idx); } } static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc, struct pci_vtscsi_queue *queue, int num) { struct pci_vtscsi_worker *worker; char tname[MAXCOMLEN + 1]; int i; queue->vsq_sc = sc; queue->vsq_vq = &sc->vss_vq[num + 2]; pthread_mutex_init(&queue->vsq_mtx, NULL); pthread_mutex_init(&queue->vsq_qmtx, NULL); pthread_cond_init(&queue->vsq_cv, NULL); STAILQ_INIT(&queue->vsq_requests); LIST_INIT(&queue->vsq_workers); for (i = 0; i < VTSCSI_THR_PER_Q; i++) { worker = calloc(1, sizeof(struct pci_vtscsi_worker)); worker->vsw_queue = queue; pthread_create(&worker->vsw_thread, NULL, &pci_vtscsi_proc, (void *)worker); snprintf(tname, sizeof(tname), "vtscsi:%d-%d", num, i); pthread_set_name_np(worker->vsw_thread, tname); LIST_INSERT_HEAD(&queue->vsq_workers, worker, vsw_link); } return (0); } static int pci_vtscsi_legacy_config(nvlist_t *nvl, const char *opts) { char *cp, *devname; if (opts == NULL) return (0); cp = strchr(opts, ','); if (cp == NULL) { set_config_value_node(nvl, "dev", opts); return (0); } devname = strndup(opts, cp - opts); set_config_value_node(nvl, "dev", devname); free(devname); return (pci_parse_legacy_config(nvl, cp + 1)); } static int -pci_vtscsi_init(struct vmctx *ctx __unused, struct pci_devinst *pi, - nvlist_t *nvl) +pci_vtscsi_init(struct pci_devinst *pi, nvlist_t *nvl) { struct pci_vtscsi_softc *sc; const char *devname, *value; int i; sc = calloc(1, sizeof(struct pci_vtscsi_softc)); value = get_config_value_node(nvl, "iid"); if (value != NULL) sc->vss_iid = strtoul(value, NULL, 10); devname = get_config_value_node(nvl, "dev"); if (devname == NULL) devname = "/dev/cam/ctl"; sc->vss_ctl_fd = open(devname, O_RDWR); if (sc->vss_ctl_fd < 0) { WPRINTF("cannot open %s: %s", devname, strerror(errno)); free(sc); return (1); } pthread_mutex_init(&sc->vss_mtx, NULL); vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq); sc->vss_vs.vs_mtx = &sc->vss_mtx; /* controlq */ sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ; sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify; /* eventq */ sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ; sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify; /* request queues */ for (i = 2; i < VTSCSI_MAXQ; i++) { sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ; sc->vss_vq[i].vq_notify = pci_vtscsi_requestq_notify; pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2); } /* initialize config space */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_SCSI); pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix())) return (1); vi_set_io_bar(&sc->vss_vs, 0); return (0); } static const struct pci_devemu pci_de_vscsi = { .pe_emu = "virtio-scsi", .pe_init = pci_vtscsi_init, .pe_legacy_config = pci_vtscsi_legacy_config, .pe_barwrite = vi_pci_write, .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vscsi); diff --git a/usr.sbin/bhyve/pci_xhci.c b/usr.sbin/bhyve/pci_xhci.c index 1eceb6fe0fd3..0ac1423f0736 100644 --- a/usr.sbin/bhyve/pci_xhci.c +++ b/usr.sbin/bhyve/pci_xhci.c @@ -1,3212 +1,3210 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Leon Dang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* XHCI options: -s ,xhci,{devices} devices: tablet USB tablet mouse */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "config.h" #include "debug.h" #include "pci_emul.h" #include "pci_xhci.h" #include "usb_emul.h" static int xhci_debug = 0; #define DPRINTF(params) if (xhci_debug) PRINTLN params #define WPRINTF(params) PRINTLN params #define XHCI_NAME "xhci" #define XHCI_MAX_DEVS 8 /* 4 USB3 + 4 USB2 devs */ #define XHCI_MAX_SLOTS 64 /* min allowed by Windows drivers */ /* * XHCI data structures can be up to 64k, but limit paddr_guest2host mapping * to 4k to avoid going over the guest physical memory barrier. */ #define XHCI_PADDR_SZ 4096 /* paddr_guest2host max size */ #define XHCI_ERST_MAX 0 /* max 2^entries event ring seg tbl */ #define XHCI_CAPLEN (4*8) /* offset of op register space */ #define XHCI_HCCPRAMS2 0x1C /* offset of HCCPARAMS2 register */ #define XHCI_PORTREGS_START 0x400 #define XHCI_DOORBELL_MAX 256 #define XHCI_STREAMS_MAX 1 /* 4-15 in XHCI spec */ /* caplength and hci-version registers */ #define XHCI_SET_CAPLEN(x) ((x) & 0xFF) #define XHCI_SET_HCIVERSION(x) (((x) & 0xFFFF) << 16) #define XHCI_GET_HCIVERSION(x) (((x) >> 16) & 0xFFFF) /* hcsparams1 register */ #define XHCI_SET_HCSP1_MAXSLOTS(x) ((x) & 0xFF) #define XHCI_SET_HCSP1_MAXINTR(x) (((x) & 0x7FF) << 8) #define XHCI_SET_HCSP1_MAXPORTS(x) (((x) & 0xFF) << 24) /* hcsparams2 register */ #define XHCI_SET_HCSP2_IST(x) ((x) & 0x0F) #define XHCI_SET_HCSP2_ERSTMAX(x) (((x) & 0x0F) << 4) #define XHCI_SET_HCSP2_MAXSCRATCH_HI(x) (((x) & 0x1F) << 21) #define XHCI_SET_HCSP2_MAXSCRATCH_LO(x) (((x) & 0x1F) << 27) /* hcsparams3 register */ #define XHCI_SET_HCSP3_U1EXITLATENCY(x) ((x) & 0xFF) #define XHCI_SET_HCSP3_U2EXITLATENCY(x) (((x) & 0xFFFF) << 16) /* hccparams1 register */ #define XHCI_SET_HCCP1_AC64(x) ((x) & 0x01) #define XHCI_SET_HCCP1_BNC(x) (((x) & 0x01) << 1) #define XHCI_SET_HCCP1_CSZ(x) (((x) & 0x01) << 2) #define XHCI_SET_HCCP1_PPC(x) (((x) & 0x01) << 3) #define XHCI_SET_HCCP1_PIND(x) (((x) & 0x01) << 4) #define XHCI_SET_HCCP1_LHRC(x) (((x) & 0x01) << 5) #define XHCI_SET_HCCP1_LTC(x) (((x) & 0x01) << 6) #define XHCI_SET_HCCP1_NSS(x) (((x) & 0x01) << 7) #define XHCI_SET_HCCP1_PAE(x) (((x) & 0x01) << 8) #define XHCI_SET_HCCP1_SPC(x) (((x) & 0x01) << 9) #define XHCI_SET_HCCP1_SEC(x) (((x) & 0x01) << 10) #define XHCI_SET_HCCP1_CFC(x) (((x) & 0x01) << 11) #define XHCI_SET_HCCP1_MAXPSA(x) (((x) & 0x0F) << 12) #define XHCI_SET_HCCP1_XECP(x) (((x) & 0xFFFF) << 16) /* hccparams2 register */ #define XHCI_SET_HCCP2_U3C(x) ((x) & 0x01) #define XHCI_SET_HCCP2_CMC(x) (((x) & 0x01) << 1) #define XHCI_SET_HCCP2_FSC(x) (((x) & 0x01) << 2) #define XHCI_SET_HCCP2_CTC(x) (((x) & 0x01) << 3) #define XHCI_SET_HCCP2_LEC(x) (((x) & 0x01) << 4) #define XHCI_SET_HCCP2_CIC(x) (((x) & 0x01) << 5) /* other registers */ #define XHCI_SET_DOORBELL(x) ((x) & ~0x03) #define XHCI_SET_RTSOFFSET(x) ((x) & ~0x0F) /* register masks */ #define XHCI_PS_PLS_MASK (0xF << 5) /* port link state */ #define XHCI_PS_SPEED_MASK (0xF << 10) /* port speed */ #define XHCI_PS_PIC_MASK (0x3 << 14) /* port indicator */ /* port register set */ #define XHCI_PORTREGS_BASE 0x400 /* base offset */ #define XHCI_PORTREGS_PORT0 0x3F0 #define XHCI_PORTREGS_SETSZ 0x10 /* size of a set */ #define MASK_64_HI(x) ((x) & ~0xFFFFFFFFULL) #define MASK_64_LO(x) ((x) & 0xFFFFFFFFULL) #define FIELD_REPLACE(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & (m)) << (s))) #define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & ((m) << (s))))) #define SNAP_DEV_NAME_LEN 128 struct pci_xhci_trb_ring { uint64_t ringaddr; /* current dequeue guest address */ uint32_t ccs; /* consumer cycle state */ }; /* device endpoint transfer/stream rings */ struct pci_xhci_dev_ep { union { struct xhci_trb *_epu_tr; struct xhci_stream_ctx *_epu_sctx; } _ep_trbsctx; #define ep_tr _ep_trbsctx._epu_tr #define ep_sctx _ep_trbsctx._epu_sctx /* * Caches the value of MaxPStreams from the endpoint context * when an endpoint is initialized and is used to validate the * use of ep_ringaddr vs ep_sctx_trbs[] as well as the length * of ep_sctx_trbs[]. */ uint32_t ep_MaxPStreams; union { struct pci_xhci_trb_ring _epu_trb; struct pci_xhci_trb_ring *_epu_sctx_trbs; } _ep_trb_rings; #define ep_ringaddr _ep_trb_rings._epu_trb.ringaddr #define ep_ccs _ep_trb_rings._epu_trb.ccs #define ep_sctx_trbs _ep_trb_rings._epu_sctx_trbs struct usb_data_xfer *ep_xfer; /* transfer chain */ }; /* device context base address array: maps slot->device context */ struct xhci_dcbaa { uint64_t dcba[USB_MAX_DEVICES+1]; /* xhci_dev_ctx ptrs */ }; /* port status registers */ struct pci_xhci_portregs { uint32_t portsc; /* port status and control */ uint32_t portpmsc; /* port pwr mgmt status & control */ uint32_t portli; /* port link info */ uint32_t porthlpmc; /* port hardware LPM control */ } __packed; #define XHCI_PS_SPEED_SET(x) (((x) & 0xF) << 10) /* xHC operational registers */ struct pci_xhci_opregs { uint32_t usbcmd; /* usb command */ uint32_t usbsts; /* usb status */ uint32_t pgsz; /* page size */ uint32_t dnctrl; /* device notification control */ uint64_t crcr; /* command ring control */ uint64_t dcbaap; /* device ctx base addr array ptr */ uint32_t config; /* configure */ /* guest mapped addresses: */ struct xhci_trb *cr_p; /* crcr dequeue */ struct xhci_dcbaa *dcbaa_p; /* dev ctx array ptr */ }; /* xHC runtime registers */ struct pci_xhci_rtsregs { uint32_t mfindex; /* microframe index */ struct { /* interrupter register set */ uint32_t iman; /* interrupter management */ uint32_t imod; /* interrupter moderation */ uint32_t erstsz; /* event ring segment table size */ uint32_t rsvd; uint64_t erstba; /* event ring seg-tbl base addr */ uint64_t erdp; /* event ring dequeue ptr */ } intrreg __packed; /* guest mapped addresses */ struct xhci_event_ring_seg *erstba_p; struct xhci_trb *erst_p; /* event ring segment tbl */ int er_deq_seg; /* event ring dequeue segment */ int er_enq_idx; /* event ring enqueue index - xHCI */ int er_enq_seg; /* event ring enqueue segment */ uint32_t er_events_cnt; /* number of events in ER */ uint32_t event_pcs; /* producer cycle state flag */ }; struct pci_xhci_softc; /* * USB device emulation container. * This is referenced from usb_hci->hci_sc; 1 pci_xhci_dev_emu for each * emulated device instance. */ struct pci_xhci_dev_emu { struct pci_xhci_softc *xsc; /* XHCI contexts */ struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep eps[XHCI_MAX_ENDPOINTS]; int dev_slotstate; struct usb_devemu *dev_ue; /* USB emulated dev */ void *dev_sc; /* device's softc */ struct usb_hci hci; }; struct pci_xhci_softc { struct pci_devinst *xsc_pi; pthread_mutex_t mtx; uint32_t caplength; /* caplen & hciversion */ uint32_t hcsparams1; /* structural parameters 1 */ uint32_t hcsparams2; /* structural parameters 2 */ uint32_t hcsparams3; /* structural parameters 3 */ uint32_t hccparams1; /* capability parameters 1 */ uint32_t dboff; /* doorbell offset */ uint32_t rtsoff; /* runtime register space offset */ uint32_t hccparams2; /* capability parameters 2 */ uint32_t regsend; /* end of configuration registers */ struct pci_xhci_opregs opregs; struct pci_xhci_rtsregs rtsregs; struct pci_xhci_portregs *portregs; struct pci_xhci_dev_emu **devices; /* XHCI[port] = device */ struct pci_xhci_dev_emu **slots; /* slots assigned from 1 */ int usb2_port_start; int usb3_port_start; }; /* port and slot numbering start from 1 */ #define XHCI_PORTREG_PTR(x,n) &((x)->portregs[(n) - 1]) #define XHCI_DEVINST_PTR(x,n) ((x)->devices[(n) - 1]) #define XHCI_SLOTDEV_PTR(x,n) ((x)->slots[(n) - 1]) #define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH) #define XHCI_GADDR_SIZE(a) (XHCI_PADDR_SZ - \ (((uint64_t) (a)) & (XHCI_PADDR_SZ - 1))) #define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \ (a), XHCI_GADDR_SIZE(a)) static int xhci_in_use; /* map USB errors to XHCI */ static const int xhci_usb_errors[USB_ERR_MAX] = { [USB_ERR_NORMAL_COMPLETION] = XHCI_TRB_ERROR_SUCCESS, [USB_ERR_PENDING_REQUESTS] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NOT_STARTED] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_INVAL] = XHCI_TRB_ERROR_INVALID, [USB_ERR_NOMEM] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_CANCELLED] = XHCI_TRB_ERROR_STOPPED, [USB_ERR_BAD_ADDRESS] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_BAD_BUFSIZE] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_BAD_FLAG] = XHCI_TRB_ERROR_PARAMETER, [USB_ERR_NO_CALLBACK] = XHCI_TRB_ERROR_STALL, [USB_ERR_IN_USE] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_ADDR] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_PIPE] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_ZERO_NFRAMES] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_ZERO_MAXP] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_SET_ADDR_FAILED] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_NO_POWER] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_TOO_DEEP] = XHCI_TRB_ERROR_RESOURCE, [USB_ERR_IOERROR] = XHCI_TRB_ERROR_TRB, [USB_ERR_NOT_CONFIGURED] = XHCI_TRB_ERROR_ENDP_NOT_ON, [USB_ERR_TIMEOUT] = XHCI_TRB_ERROR_CMD_ABORTED, [USB_ERR_SHORT_XFER] = XHCI_TRB_ERROR_SHORT_PKT, [USB_ERR_STALLED] = XHCI_TRB_ERROR_STALL, [USB_ERR_INTERRUPTED] = XHCI_TRB_ERROR_CMD_ABORTED, [USB_ERR_DMA_LOAD_FAILED] = XHCI_TRB_ERROR_DATA_BUF, [USB_ERR_BAD_CONTEXT] = XHCI_TRB_ERROR_TRB, [USB_ERR_NO_ROOT_HUB] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_NO_INTR_THREAD] = XHCI_TRB_ERROR_UNDEFINED, [USB_ERR_NOT_LOCKED] = XHCI_TRB_ERROR_UNDEFINED, }; #define USB_TO_XHCI_ERR(e) ((e) < USB_ERR_MAX ? xhci_usb_errors[(e)] : \ XHCI_TRB_ERROR_INVALID) static int pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, int do_intr); static void pci_xhci_dump_trb(struct xhci_trb *trb); static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc); static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot); static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm); static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t streamid, uint64_t ringaddr, int ccs); static void pci_xhci_set_evtrb(struct xhci_trb *evtrb, uint64_t port, uint32_t errcode, uint32_t evtype) { evtrb->qwTrb0 = port << 24; evtrb->dwTrb2 = XHCI_TRB_2_ERROR_SET(errcode); evtrb->dwTrb3 = XHCI_TRB_3_TYPE_SET(evtype); } /* controller reset */ static void pci_xhci_reset(struct pci_xhci_softc *sc) { int i; sc->rtsregs.er_enq_idx = 0; sc->rtsregs.er_events_cnt = 0; sc->rtsregs.event_pcs = 1; for (i = 1; i <= XHCI_MAX_SLOTS; i++) { pci_xhci_reset_slot(sc, i); } } static uint32_t pci_xhci_usbcmd_write(struct pci_xhci_softc *sc, uint32_t cmd) { int do_intr = 0; int i; if (cmd & XHCI_CMD_RS) { do_intr = (sc->opregs.usbcmd & XHCI_CMD_RS) == 0; sc->opregs.usbcmd |= XHCI_CMD_RS; sc->opregs.usbsts &= ~XHCI_STS_HCH; sc->opregs.usbsts |= XHCI_STS_PCD; /* Queue port change event on controller run from stop */ if (do_intr) for (i = 1; i <= XHCI_MAX_DEVS; i++) { struct pci_xhci_dev_emu *dev; struct pci_xhci_portregs *port; struct xhci_trb evtrb; if ((dev = XHCI_DEVINST_PTR(sc, i)) == NULL) continue; port = XHCI_PORTREG_PTR(sc, i); port->portsc |= XHCI_PS_CSC | XHCI_PS_CCS; port->portsc &= ~XHCI_PS_PLS_MASK; /* * XHCI 4.19.3 USB2 RxDetect->Polling, * USB3 Polling->U0 */ if (dev->dev_ue->ue_usbver == 2) port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL); else port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0); pci_xhci_set_evtrb(&evtrb, i, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); if (pci_xhci_insert_event(sc, &evtrb, 0) != XHCI_TRB_ERROR_SUCCESS) break; } } else { sc->opregs.usbcmd &= ~XHCI_CMD_RS; sc->opregs.usbsts |= XHCI_STS_HCH; sc->opregs.usbsts &= ~XHCI_STS_PCD; } /* start execution of schedule; stop when set to 0 */ cmd |= sc->opregs.usbcmd & XHCI_CMD_RS; if (cmd & XHCI_CMD_HCRST) { /* reset controller */ pci_xhci_reset(sc); cmd &= ~XHCI_CMD_HCRST; } cmd &= ~(XHCI_CMD_CSS | XHCI_CMD_CRS); if (do_intr) pci_xhci_assert_interrupt(sc); return (cmd); } static void pci_xhci_portregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { struct xhci_trb evtrb; struct pci_xhci_portregs *p; int port; uint32_t oldpls, newpls; if (sc->portregs == NULL) return; port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ; offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ; DPRINTF(("pci_xhci: portregs wr offset 0x%lx, port %u: 0x%lx", offset, port, value)); assert(port >= 0); if (port > XHCI_MAX_DEVS) { DPRINTF(("pci_xhci: portregs_write port %d > ndevices", port)); return; } if (XHCI_DEVINST_PTR(sc, port) == NULL) { DPRINTF(("pci_xhci: portregs_write to unattached port %d", port)); } p = XHCI_PORTREG_PTR(sc, port); switch (offset) { case 0: /* port reset or warm reset */ if (value & (XHCI_PS_PR | XHCI_PS_WPR)) { pci_xhci_reset_port(sc, port, value & XHCI_PS_WPR); break; } if ((p->portsc & XHCI_PS_PP) == 0) { WPRINTF(("pci_xhci: portregs_write to unpowered " "port %d", port)); break; } /* Port status and control register */ oldpls = XHCI_PS_PLS_GET(p->portsc); newpls = XHCI_PS_PLS_GET(value); p->portsc &= XHCI_PS_PED | XHCI_PS_PLS_MASK | XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK; if (XHCI_DEVINST_PTR(sc, port)) p->portsc |= XHCI_PS_CCS; p->portsc |= (value & ~(XHCI_PS_OCA | XHCI_PS_PR | XHCI_PS_PED | XHCI_PS_PLS_MASK | /* link state */ XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK | /* port indicator */ XHCI_PS_LWS | XHCI_PS_DR | XHCI_PS_WPR)); /* clear control bits */ p->portsc &= ~(value & (XHCI_PS_CSC | XHCI_PS_PEC | XHCI_PS_WRC | XHCI_PS_OCC | XHCI_PS_PRC | XHCI_PS_PLC | XHCI_PS_CEC | XHCI_PS_CAS)); /* port disable request; for USB3, don't care */ if (value & XHCI_PS_PED) DPRINTF(("Disable port %d request", port)); if (!(value & XHCI_PS_LWS)) break; DPRINTF(("Port new PLS: %d", newpls)); switch (newpls) { case 0: /* U0 */ case 3: /* U3 */ if (oldpls != newpls) { p->portsc &= ~XHCI_PS_PLS_MASK; p->portsc |= XHCI_PS_PLS_SET(newpls) | XHCI_PS_PLC; if (oldpls != 0 && newpls == 0) { pci_xhci_set_evtrb(&evtrb, port, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); pci_xhci_insert_event(sc, &evtrb, 1); } } break; default: DPRINTF(("Unhandled change port %d PLS %u", port, newpls)); break; } break; case 4: /* Port power management status and control register */ p->portpmsc = value; break; case 8: /* Port link information register */ DPRINTF(("pci_xhci attempted write to PORTLI, port %d", port)); break; case 12: /* * Port hardware LPM control register. * For USB3, this register is reserved. */ p->porthlpmc = value; break; default: DPRINTF(("pci_xhci: unaligned portreg write offset %#lx", offset)); break; } } static struct xhci_dev_ctx * pci_xhci_get_dev_ctx(struct pci_xhci_softc *sc, uint32_t slot) { uint64_t devctx_addr; struct xhci_dev_ctx *devctx; assert(slot > 0 && slot <= XHCI_MAX_DEVS); assert(XHCI_SLOTDEV_PTR(sc, slot) != NULL); assert(sc->opregs.dcbaa_p != NULL); devctx_addr = sc->opregs.dcbaa_p->dcba[slot]; if (devctx_addr == 0) { DPRINTF(("get_dev_ctx devctx_addr == 0")); return (NULL); } DPRINTF(("pci_xhci: get dev ctx, slot %u devctx addr %016lx", slot, devctx_addr)); devctx = XHCI_GADDR(sc, devctx_addr & ~0x3FUL); return (devctx); } static struct xhci_trb * pci_xhci_trb_next(struct pci_xhci_softc *sc, struct xhci_trb *curtrb, uint64_t *guestaddr) { struct xhci_trb *next; assert(curtrb != NULL); if (XHCI_TRB_3_TYPE_GET(curtrb->dwTrb3) == XHCI_TRB_TYPE_LINK) { if (guestaddr) *guestaddr = curtrb->qwTrb0 & ~0xFUL; next = XHCI_GADDR(sc, curtrb->qwTrb0 & ~0xFUL); } else { if (guestaddr) *guestaddr += sizeof(struct xhci_trb) & ~0xFUL; next = curtrb + 1; } return (next); } static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc) { sc->rtsregs.intrreg.erdp |= XHCI_ERDP_LO_BUSY; sc->rtsregs.intrreg.iman |= XHCI_IMAN_INTR_PEND; sc->opregs.usbsts |= XHCI_STS_EINT; /* only trigger interrupt if permitted */ if ((sc->opregs.usbcmd & XHCI_CMD_INTE) && (sc->rtsregs.intrreg.iman & XHCI_IMAN_INTR_ENA)) { if (pci_msi_enabled(sc->xsc_pi)) pci_generate_msi(sc->xsc_pi, 0); else pci_lintr_assert(sc->xsc_pi); } } static void pci_xhci_deassert_interrupt(struct pci_xhci_softc *sc) { if (!pci_msi_enabled(sc->xsc_pi)) pci_lintr_assert(sc->xsc_pi); } static void pci_xhci_init_ep(struct pci_xhci_dev_emu *dev, int epid) { struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep *devep; struct xhci_endp_ctx *ep_ctx; uint32_t i, pstreams; dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; devep = &dev->eps[epid]; pstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0); if (pstreams > 0) { DPRINTF(("init_ep %d with pstreams %d", epid, pstreams)); assert(devep->ep_sctx_trbs == NULL); devep->ep_sctx = XHCI_GADDR(dev->xsc, ep_ctx->qwEpCtx2 & XHCI_EPCTX_2_TR_DQ_PTR_MASK); devep->ep_sctx_trbs = calloc(pstreams, sizeof(struct pci_xhci_trb_ring)); for (i = 0; i < pstreams; i++) { devep->ep_sctx_trbs[i].ringaddr = devep->ep_sctx[i].qwSctx0 & XHCI_SCTX_0_TR_DQ_PTR_MASK; devep->ep_sctx_trbs[i].ccs = XHCI_SCTX_0_DCS_GET(devep->ep_sctx[i].qwSctx0); } } else { DPRINTF(("init_ep %d with no pstreams", epid)); devep->ep_ringaddr = ep_ctx->qwEpCtx2 & XHCI_EPCTX_2_TR_DQ_PTR_MASK; devep->ep_ccs = XHCI_EPCTX_2_DCS_GET(ep_ctx->qwEpCtx2); devep->ep_tr = XHCI_GADDR(dev->xsc, devep->ep_ringaddr); DPRINTF(("init_ep tr DCS %x", devep->ep_ccs)); } devep->ep_MaxPStreams = pstreams; if (devep->ep_xfer == NULL) { devep->ep_xfer = malloc(sizeof(struct usb_data_xfer)); USB_DATA_XFER_INIT(devep->ep_xfer); } } static void pci_xhci_disable_ep(struct pci_xhci_dev_emu *dev, int epid) { struct xhci_dev_ctx *dev_ctx; struct pci_xhci_dev_ep *devep; struct xhci_endp_ctx *ep_ctx; DPRINTF(("pci_xhci disable_ep %d", epid)); dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_DISABLED; devep = &dev->eps[epid]; if (devep->ep_MaxPStreams > 0) free(devep->ep_sctx_trbs); if (devep->ep_xfer != NULL) { free(devep->ep_xfer); devep->ep_xfer = NULL; } memset(devep, 0, sizeof(struct pci_xhci_dev_ep)); } /* reset device at slot and data structures related to it */ static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot) { struct pci_xhci_dev_emu *dev; dev = XHCI_SLOTDEV_PTR(sc, slot); if (!dev) { DPRINTF(("xhci reset unassigned slot (%d)?", slot)); } else { dev->dev_slotstate = XHCI_ST_DISABLED; } /* TODO: reset ring buffer pointers */ } static int pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, int do_intr) { struct pci_xhci_rtsregs *rts; uint64_t erdp; int erdp_idx; int err; struct xhci_trb *evtrbptr; err = XHCI_TRB_ERROR_SUCCESS; rts = &sc->rtsregs; erdp = rts->intrreg.erdp & ~0xF; erdp_idx = (erdp - rts->erstba_p[rts->er_deq_seg].qwEvrsTablePtr) / sizeof(struct xhci_trb); DPRINTF(("pci_xhci: insert event 0[%lx] 2[%x] 3[%x]", evtrb->qwTrb0, evtrb->dwTrb2, evtrb->dwTrb3)); DPRINTF(("\terdp idx %d/seg %d, enq idx %d/seg %d, pcs %u", erdp_idx, rts->er_deq_seg, rts->er_enq_idx, rts->er_enq_seg, rts->event_pcs)); DPRINTF(("\t(erdp=0x%lx, erst=0x%lx, tblsz=%u, do_intr %d)", erdp, rts->erstba_p->qwEvrsTablePtr, rts->erstba_p->dwEvrsTableSize, do_intr)); evtrbptr = &rts->erst_p[rts->er_enq_idx]; /* TODO: multi-segment table */ if (rts->er_events_cnt >= rts->erstba_p->dwEvrsTableSize) { DPRINTF(("pci_xhci[%d] cannot insert event; ring full", __LINE__)); err = XHCI_TRB_ERROR_EV_RING_FULL; goto done; } if (rts->er_events_cnt == rts->erstba_p->dwEvrsTableSize - 1) { struct xhci_trb errev; if ((evtrbptr->dwTrb3 & 0x1) == (rts->event_pcs & 0x1)) { DPRINTF(("pci_xhci[%d] insert evt err: ring full", __LINE__)); errev.qwTrb0 = 0; errev.dwTrb2 = XHCI_TRB_2_ERROR_SET( XHCI_TRB_ERROR_EV_RING_FULL); errev.dwTrb3 = XHCI_TRB_3_TYPE_SET( XHCI_TRB_EVENT_HOST_CTRL) | rts->event_pcs; rts->er_events_cnt++; memcpy(&rts->erst_p[rts->er_enq_idx], &errev, sizeof(struct xhci_trb)); rts->er_enq_idx = (rts->er_enq_idx + 1) % rts->erstba_p->dwEvrsTableSize; err = XHCI_TRB_ERROR_EV_RING_FULL; do_intr = 1; goto done; } } else { rts->er_events_cnt++; } evtrb->dwTrb3 &= ~XHCI_TRB_3_CYCLE_BIT; evtrb->dwTrb3 |= rts->event_pcs; memcpy(&rts->erst_p[rts->er_enq_idx], evtrb, sizeof(struct xhci_trb)); rts->er_enq_idx = (rts->er_enq_idx + 1) % rts->erstba_p->dwEvrsTableSize; if (rts->er_enq_idx == 0) rts->event_pcs ^= 1; done: if (do_intr) pci_xhci_assert_interrupt(sc); return (err); } static uint32_t pci_xhci_cmd_enable_slot(struct pci_xhci_softc *sc, uint32_t *slot) { struct pci_xhci_dev_emu *dev; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs != NULL) for (i = 1; i <= XHCI_MAX_SLOTS; i++) { dev = XHCI_SLOTDEV_PTR(sc, i); if (dev && dev->dev_slotstate == XHCI_ST_DISABLED) { *slot = i; dev->dev_slotstate = XHCI_ST_ENABLED; cmderr = XHCI_TRB_ERROR_SUCCESS; dev->hci.hci_address = i; break; } } DPRINTF(("pci_xhci enable slot (error=%d) slot %u", cmderr != XHCI_TRB_ERROR_SUCCESS, *slot)); return (cmderr); } static uint32_t pci_xhci_cmd_disable_slot(struct pci_xhci_softc *sc, uint32_t slot) { struct pci_xhci_dev_emu *dev; uint32_t cmderr; DPRINTF(("pci_xhci disable slot %u", slot)); cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs == NULL) goto done; if (slot > XHCI_MAX_SLOTS) { cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; goto done; } dev = XHCI_SLOTDEV_PTR(sc, slot); if (dev) { if (dev->dev_slotstate == XHCI_ST_DISABLED) { cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; } else { dev->dev_slotstate = XHCI_ST_DISABLED; cmderr = XHCI_TRB_ERROR_SUCCESS; /* TODO: reset events and endpoints */ } } else cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; done: return (cmderr); } static uint32_t pci_xhci_cmd_reset_device(struct pci_xhci_softc *sc, uint32_t slot) { struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_NO_SLOTS; if (sc->portregs == NULL) goto done; DPRINTF(("pci_xhci reset device slot %u", slot)); dev = XHCI_SLOTDEV_PTR(sc, slot); if (!dev || dev->dev_slotstate == XHCI_ST_DISABLED) cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; else { dev->dev_slotstate = XHCI_ST_DEFAULT; dev->hci.hci_address = 0; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); /* slot state */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_DEFAULT, 0x1F, 27); /* number of contexts */ dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); /* reset all eps other than ep-0 */ for (i = 2; i <= 31; i++) { ep_ctx = &dev_ctx->ctx_ep[i]; ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_DISABLED, 0x7, 0); } cmderr = XHCI_TRB_ERROR_SUCCESS; } pci_xhci_reset_slot(sc, slot); done: return (cmderr); } static uint32_t pci_xhci_cmd_address_device(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct xhci_input_dev_ctx *input_ctx; struct xhci_slot_ctx *islot_ctx; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep0_ctx; uint32_t cmderr; input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); islot_ctx = &input_ctx->ctx_slot; ep0_ctx = &input_ctx->ctx_ep[1]; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci: address device, input ctl: D 0x%08x A 0x%08x,", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1)); DPRINTF((" slot %08x %08x %08x %08x", islot_ctx->dwSctx0, islot_ctx->dwSctx1, islot_ctx->dwSctx2, islot_ctx->dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); /* when setting address: drop-ctx=0, add-ctx=slot+ep0 */ if ((input_ctx->ctx_input.dwInCtx0 != 0) || (input_ctx->ctx_input.dwInCtx1 & 0x03) != 0x03) { DPRINTF(("pci_xhci: address device, input ctl invalid")); cmderr = XHCI_TRB_ERROR_TRB; goto done; } /* assign address to slot */ dev_ctx = pci_xhci_get_dev_ctx(sc, slot); DPRINTF(("pci_xhci: address device, dev ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); dev->hci.hci_address = slot; dev->dev_ctx = dev_ctx; if (dev->dev_ue->ue_reset == NULL || dev->dev_ue->ue_reset(dev->dev_sc) < 0) { cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; goto done; } memcpy(&dev_ctx->ctx_slot, islot_ctx, sizeof(struct xhci_slot_ctx)); dev_ctx->ctx_slot.dwSctx3 = XHCI_SCTX_3_SLOT_STATE_SET(XHCI_ST_SLCTX_ADDRESSED) | XHCI_SCTX_3_DEV_ADDR_SET(slot); memcpy(&dev_ctx->ctx_ep[1], ep0_ctx, sizeof(struct xhci_endp_ctx)); ep0_ctx = &dev_ctx->ctx_ep[1]; ep0_ctx->dwEpCtx0 = (ep0_ctx->dwEpCtx0 & ~0x7) | XHCI_EPCTX_0_EPSTATE_SET(XHCI_ST_EPCTX_RUNNING); pci_xhci_init_ep(dev, 1); dev->dev_slotstate = XHCI_ST_ADDRESSED; DPRINTF(("pci_xhci: address device, output ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); done: return (cmderr); } static uint32_t pci_xhci_cmd_config_ep(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct xhci_input_dev_ctx *input_ctx; struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx, *iep_ctx; uint32_t cmderr; int i; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci config_ep slot %u", slot)); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); if ((trb->dwTrb3 & XHCI_TRB_3_DCEP_BIT) != 0) { DPRINTF(("pci_xhci config_ep - deconfigure ep slot %u", slot)); if (dev->dev_ue->ue_stop != NULL) dev->dev_ue->ue_stop(dev->dev_sc); dev->dev_slotstate = XHCI_ST_ADDRESSED; dev->hci.hci_address = 0; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); /* number of contexts */ dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); /* slot state */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_ADDRESSED, 0x1F, 27); /* disable endpoints */ for (i = 2; i < 32; i++) pci_xhci_disable_ep(dev, i); cmderr = XHCI_TRB_ERROR_SUCCESS; goto done; } if (dev->dev_slotstate < XHCI_ST_ADDRESSED) { DPRINTF(("pci_xhci: config_ep slotstate x%x != addressed", dev->dev_slotstate)); cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; goto done; } /* In addressed/configured state; * for each drop endpoint ctx flag: * ep->state = DISABLED * for each add endpoint ctx flag: * cp(ep-in, ep-out) * ep->state = RUNNING * for each drop+add endpoint flag: * reset ep resources * cp(ep-in, ep-out) * ep->state = RUNNING * if input->DisabledCtx[2-31] < 30: (at least 1 ep not disabled) * slot->state = configured */ input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); dev_ctx = dev->dev_ctx; DPRINTF(("pci_xhci: config_ep inputctx: D:x%08x A:x%08x 7:x%08x", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, input_ctx->ctx_input.dwInCtx7)); for (i = 2; i <= 31; i++) { ep_ctx = &dev_ctx->ctx_ep[i]; if (input_ctx->ctx_input.dwInCtx0 & XHCI_INCTX_0_DROP_MASK(i)) { DPRINTF((" config ep - dropping ep %d", i)); pci_xhci_disable_ep(dev, i); } if (input_ctx->ctx_input.dwInCtx1 & XHCI_INCTX_1_ADD_MASK(i)) { iep_ctx = &input_ctx->ctx_ep[i]; DPRINTF((" enable ep[%d] %08x %08x %016lx %08x", i, iep_ctx->dwEpCtx0, iep_ctx->dwEpCtx1, iep_ctx->qwEpCtx2, iep_ctx->dwEpCtx4)); memcpy(ep_ctx, iep_ctx, sizeof(struct xhci_endp_ctx)); pci_xhci_init_ep(dev, i); /* ep state */ ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); } } /* slot state to configured */ dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_CONFIGURED, 0x1F, 27); dev_ctx->ctx_slot.dwSctx0 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx0, input_ctx->ctx_slot.dwSctx0, 0x1F, 27); dev->dev_slotstate = XHCI_ST_CONFIGURED; DPRINTF(("EP configured; slot %u [0]=0x%08x [1]=0x%08x [2]=0x%08x " "[3]=0x%08x", slot, dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); done: return (cmderr); } static uint32_t pci_xhci_cmd_reset_ep(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr, epid; uint32_t type; epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); DPRINTF(("pci_xhci: reset ep %u: slot %u", epid, slot)); cmderr = XHCI_TRB_ERROR_SUCCESS; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); if (type == XHCI_TRB_TYPE_STOP_EP && (trb->dwTrb3 & XHCI_TRB_3_SUSP_EP_BIT) != 0) { /* XXX suspend endpoint for 10ms */ } if (epid < 1 || epid > 31) { DPRINTF(("pci_xhci: reset ep: invalid epid %u", epid)); cmderr = XHCI_TRB_ERROR_TRB; goto done; } devep = &dev->eps[epid]; if (devep->ep_xfer != NULL) USB_DATA_XFER_RESET(devep->ep_xfer); dev_ctx = dev->dev_ctx; assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; if (devep->ep_MaxPStreams == 0) ep_ctx->qwEpCtx2 = devep->ep_ringaddr | devep->ep_ccs; DPRINTF(("pci_xhci: reset ep[%u] %08x %08x %016lx %08x", epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, ep_ctx->dwEpCtx4)); if (type == XHCI_TRB_TYPE_RESET_EP && (dev->dev_ue->ue_reset == NULL || dev->dev_ue->ue_reset(dev->dev_sc) < 0)) { cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; goto done; } done: return (cmderr); } static uint32_t pci_xhci_find_stream(struct pci_xhci_softc *sc, struct xhci_endp_ctx *ep, struct pci_xhci_dev_ep *devep, uint32_t streamid) { struct xhci_stream_ctx *sctx; if (devep->ep_MaxPStreams == 0) return (XHCI_TRB_ERROR_TRB); if (devep->ep_MaxPStreams > XHCI_STREAMS_MAX) return (XHCI_TRB_ERROR_INVALID_SID); if (XHCI_EPCTX_0_LSA_GET(ep->dwEpCtx0) == 0) { DPRINTF(("pci_xhci: find_stream; LSA bit not set")); return (XHCI_TRB_ERROR_INVALID_SID); } /* only support primary stream */ if (streamid > devep->ep_MaxPStreams) return (XHCI_TRB_ERROR_STREAM_TYPE); sctx = (struct xhci_stream_ctx *)XHCI_GADDR(sc, ep->qwEpCtx2 & ~0xFUL) + streamid; if (!XHCI_SCTX_0_SCT_GET(sctx->qwSctx0)) return (XHCI_TRB_ERROR_STREAM_TYPE); return (XHCI_TRB_ERROR_SUCCESS); } static uint32_t pci_xhci_cmd_set_tr(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; uint32_t cmderr, epid; uint32_t streamid; cmderr = XHCI_TRB_ERROR_SUCCESS; dev = XHCI_SLOTDEV_PTR(sc, slot); assert(dev != NULL); DPRINTF(("pci_xhci set_tr: new-tr x%016lx, SCT %u DCS %u", (trb->qwTrb0 & ~0xF), (uint32_t)((trb->qwTrb0 >> 1) & 0x7), (uint32_t)(trb->qwTrb0 & 0x1))); DPRINTF((" stream-id %u, slot %u, epid %u, C %u", (trb->dwTrb2 >> 16) & 0xFFFF, XHCI_TRB_3_SLOT_GET(trb->dwTrb3), XHCI_TRB_3_EP_GET(trb->dwTrb3), trb->dwTrb3 & 0x1)); epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); if (epid < 1 || epid > 31) { DPRINTF(("pci_xhci: set_tr_deq: invalid epid %u", epid)); cmderr = XHCI_TRB_ERROR_TRB; goto done; } dev_ctx = dev->dev_ctx; assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; devep = &dev->eps[epid]; switch (XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0)) { case XHCI_ST_EPCTX_STOPPED: case XHCI_ST_EPCTX_ERROR: break; default: DPRINTF(("pci_xhci cmd set_tr invalid state %x", XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0))); cmderr = XHCI_TRB_ERROR_CONTEXT_STATE; goto done; } streamid = XHCI_TRB_2_STREAM_GET(trb->dwTrb2); if (devep->ep_MaxPStreams > 0) { cmderr = pci_xhci_find_stream(sc, ep_ctx, devep, streamid); if (cmderr == XHCI_TRB_ERROR_SUCCESS) { assert(devep->ep_sctx != NULL); devep->ep_sctx[streamid].qwSctx0 = trb->qwTrb0; devep->ep_sctx_trbs[streamid].ringaddr = trb->qwTrb0 & ~0xF; devep->ep_sctx_trbs[streamid].ccs = XHCI_EPCTX_2_DCS_GET(trb->qwTrb0); } } else { if (streamid != 0) { DPRINTF(("pci_xhci cmd set_tr streamid %x != 0", streamid)); } ep_ctx->qwEpCtx2 = trb->qwTrb0 & ~0xFUL; devep->ep_ringaddr = ep_ctx->qwEpCtx2 & ~0xFUL; devep->ep_ccs = trb->qwTrb0 & 0x1; devep->ep_tr = XHCI_GADDR(sc, devep->ep_ringaddr); DPRINTF(("pci_xhci set_tr first TRB:")); pci_xhci_dump_trb(devep->ep_tr); } ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; done: return (cmderr); } static uint32_t pci_xhci_cmd_eval_ctx(struct pci_xhci_softc *sc, uint32_t slot, struct xhci_trb *trb) { struct xhci_input_dev_ctx *input_ctx; struct xhci_slot_ctx *islot_ctx; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep0_ctx; uint32_t cmderr; input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); islot_ctx = &input_ctx->ctx_slot; ep0_ctx = &input_ctx->ctx_ep[1]; cmderr = XHCI_TRB_ERROR_SUCCESS; DPRINTF(("pci_xhci: eval ctx, input ctl: D 0x%08x A 0x%08x,", input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1)); DPRINTF((" slot %08x %08x %08x %08x", islot_ctx->dwSctx0, islot_ctx->dwSctx1, islot_ctx->dwSctx2, islot_ctx->dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); /* this command expects drop-ctx=0 & add-ctx=slot+ep0 */ if ((input_ctx->ctx_input.dwInCtx0 != 0) || (input_ctx->ctx_input.dwInCtx1 & 0x03) == 0) { DPRINTF(("pci_xhci: eval ctx, input ctl invalid")); cmderr = XHCI_TRB_ERROR_TRB; goto done; } /* assign address to slot; in this emulation, slot_id = address */ dev_ctx = pci_xhci_get_dev_ctx(sc, slot); DPRINTF(("pci_xhci: eval ctx, dev ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); if (input_ctx->ctx_input.dwInCtx1 & 0x01) { /* slot ctx */ /* set max exit latency */ dev_ctx->ctx_slot.dwSctx1 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx1, input_ctx->ctx_slot.dwSctx1, 0xFFFF, 0); /* set interrupter target */ dev_ctx->ctx_slot.dwSctx2 = FIELD_COPY( dev_ctx->ctx_slot.dwSctx2, input_ctx->ctx_slot.dwSctx2, 0x3FF, 22); } if (input_ctx->ctx_input.dwInCtx1 & 0x02) { /* control ctx */ /* set max packet size */ dev_ctx->ctx_ep[1].dwEpCtx1 = FIELD_COPY( dev_ctx->ctx_ep[1].dwEpCtx1, ep0_ctx->dwEpCtx1, 0xFFFF, 16); ep0_ctx = &dev_ctx->ctx_ep[1]; } DPRINTF(("pci_xhci: eval ctx, output ctx")); DPRINTF((" slot %08x %08x %08x %08x", dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); DPRINTF((" ep0 %08x %08x %016lx %08x", ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, ep0_ctx->dwEpCtx4)); done: return (cmderr); } static int pci_xhci_complete_commands(struct pci_xhci_softc *sc) { struct xhci_trb evtrb; struct xhci_trb *trb; uint64_t crcr; uint32_t ccs; /* cycle state (XHCI 4.9.2) */ uint32_t type; uint32_t slot; uint32_t cmderr; int error; error = 0; sc->opregs.crcr |= XHCI_CRCR_LO_CRR; trb = sc->opregs.cr_p; ccs = sc->opregs.crcr & XHCI_CRCR_LO_RCS; crcr = sc->opregs.crcr & ~0xF; while (1) { sc->opregs.cr_p = trb; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); if ((trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT) != (ccs & XHCI_TRB_3_CYCLE_BIT)) break; DPRINTF(("pci_xhci: cmd type 0x%x, Trb0 x%016lx dwTrb2 x%08x" " dwTrb3 x%08x, TRB_CYCLE %u/ccs %u", type, trb->qwTrb0, trb->dwTrb2, trb->dwTrb3, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT, ccs)); cmderr = XHCI_TRB_ERROR_SUCCESS; evtrb.dwTrb2 = 0; evtrb.dwTrb3 = (ccs & XHCI_TRB_3_CYCLE_BIT) | XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_CMD_COMPLETE); slot = 0; switch (type) { case XHCI_TRB_TYPE_LINK: /* 0x06 */ if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) ccs ^= XHCI_CRCR_LO_RCS; break; case XHCI_TRB_TYPE_ENABLE_SLOT: /* 0x09 */ cmderr = pci_xhci_cmd_enable_slot(sc, &slot); break; case XHCI_TRB_TYPE_DISABLE_SLOT: /* 0x0A */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_disable_slot(sc, slot); break; case XHCI_TRB_TYPE_ADDRESS_DEVICE: /* 0x0B */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_address_device(sc, slot, trb); break; case XHCI_TRB_TYPE_CONFIGURE_EP: /* 0x0C */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_config_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_EVALUATE_CTX: /* 0x0D */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_eval_ctx(sc, slot, trb); break; case XHCI_TRB_TYPE_RESET_EP: /* 0x0E */ DPRINTF(("Reset Endpoint on slot %d", slot)); slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_STOP_EP: /* 0x0F */ DPRINTF(("Stop Endpoint on slot %d", slot)); slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); break; case XHCI_TRB_TYPE_SET_TR_DEQUEUE: /* 0x10 */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_set_tr(sc, slot, trb); break; case XHCI_TRB_TYPE_RESET_DEVICE: /* 0x11 */ slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); cmderr = pci_xhci_cmd_reset_device(sc, slot); break; case XHCI_TRB_TYPE_FORCE_EVENT: /* 0x12 */ /* TODO: */ break; case XHCI_TRB_TYPE_NEGOTIATE_BW: /* 0x13 */ break; case XHCI_TRB_TYPE_SET_LATENCY_TOL: /* 0x14 */ break; case XHCI_TRB_TYPE_GET_PORT_BW: /* 0x15 */ break; case XHCI_TRB_TYPE_FORCE_HEADER: /* 0x16 */ break; case XHCI_TRB_TYPE_NOOP_CMD: /* 0x17 */ break; default: DPRINTF(("pci_xhci: unsupported cmd %x", type)); break; } if (type != XHCI_TRB_TYPE_LINK) { /* * insert command completion event and assert intr */ evtrb.qwTrb0 = crcr; evtrb.dwTrb2 |= XHCI_TRB_2_ERROR_SET(cmderr); evtrb.dwTrb3 |= XHCI_TRB_3_SLOT_SET(slot); DPRINTF(("pci_xhci: command 0x%x result: 0x%x", type, cmderr)); pci_xhci_insert_event(sc, &evtrb, 1); } trb = pci_xhci_trb_next(sc, trb, &crcr); } sc->opregs.crcr = crcr | (sc->opregs.crcr & XHCI_CRCR_LO_CA) | ccs; sc->opregs.crcr &= ~XHCI_CRCR_LO_CRR; return (error); } static void pci_xhci_dump_trb(struct xhci_trb *trb) { static const char *trbtypes[] = { "RESERVED", "NORMAL", "SETUP_STAGE", "DATA_STAGE", "STATUS_STAGE", "ISOCH", "LINK", "EVENT_DATA", "NOOP", "ENABLE_SLOT", "DISABLE_SLOT", "ADDRESS_DEVICE", "CONFIGURE_EP", "EVALUATE_CTX", "RESET_EP", "STOP_EP", "SET_TR_DEQUEUE", "RESET_DEVICE", "FORCE_EVENT", "NEGOTIATE_BW", "SET_LATENCY_TOL", "GET_PORT_BW", "FORCE_HEADER", "NOOP_CMD" }; uint32_t type; type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); DPRINTF(("pci_xhci: trb[@%p] type x%02x %s 0:x%016lx 2:x%08x 3:x%08x", trb, type, type <= XHCI_TRB_TYPE_NOOP_CMD ? trbtypes[type] : "INVALID", trb->qwTrb0, trb->dwTrb2, trb->dwTrb3)); } static int pci_xhci_xfer_complete(struct pci_xhci_softc *sc, struct usb_data_xfer *xfer, uint32_t slot, uint32_t epid, int *do_intr) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; struct xhci_trb *trb; struct xhci_trb evtrb; uint32_t trbflags; uint32_t edtla; int i, err; dev = XHCI_SLOTDEV_PTR(sc, slot); devep = &dev->eps[epid]; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); assert(dev_ctx != NULL); ep_ctx = &dev_ctx->ctx_ep[epid]; err = XHCI_TRB_ERROR_SUCCESS; *do_intr = 0; edtla = 0; /* go through list of TRBs and insert event(s) */ for (i = xfer->head; xfer->ndata > 0; ) { evtrb.qwTrb0 = (uint64_t)xfer->data[i].hci_data; trb = XHCI_GADDR(sc, evtrb.qwTrb0); trbflags = trb->dwTrb3; DPRINTF(("pci_xhci: xfer[%d] done?%u:%d trb %x %016lx %x " "(err %d) IOC?%d", i, xfer->data[i].processed, xfer->data[i].blen, XHCI_TRB_3_TYPE_GET(trbflags), evtrb.qwTrb0, trbflags, err, trb->dwTrb3 & XHCI_TRB_3_IOC_BIT ? 1 : 0)); if (!xfer->data[i].processed) { xfer->head = i; break; } xfer->ndata--; edtla += xfer->data[i].bdone; trb->dwTrb3 = (trb->dwTrb3 & ~0x1) | (xfer->data[i].ccs); pci_xhci_update_ep_ring(sc, dev, devep, ep_ctx, xfer->data[i].streamid, xfer->data[i].trbnext, xfer->data[i].ccs); /* Only interrupt if IOC or short packet */ if (!(trb->dwTrb3 & XHCI_TRB_3_IOC_BIT) && !((err == XHCI_TRB_ERROR_SHORT_PKT) && (trb->dwTrb3 & XHCI_TRB_3_ISP_BIT))) { i = (i + 1) % USB_MAX_XFER_BLOCKS; continue; } evtrb.dwTrb2 = XHCI_TRB_2_ERROR_SET(err) | XHCI_TRB_2_REM_SET(xfer->data[i].blen); evtrb.dwTrb3 = XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_TRANSFER) | XHCI_TRB_3_SLOT_SET(slot) | XHCI_TRB_3_EP_SET(epid); if (XHCI_TRB_3_TYPE_GET(trbflags) == XHCI_TRB_TYPE_EVENT_DATA) { DPRINTF(("pci_xhci EVENT_DATA edtla %u", edtla)); evtrb.qwTrb0 = trb->qwTrb0; evtrb.dwTrb2 = (edtla & 0xFFFFF) | XHCI_TRB_2_ERROR_SET(err); evtrb.dwTrb3 |= XHCI_TRB_3_ED_BIT; edtla = 0; } *do_intr = 1; err = pci_xhci_insert_event(sc, &evtrb, 0); if (err != XHCI_TRB_ERROR_SUCCESS) { break; } i = (i + 1) % USB_MAX_XFER_BLOCKS; } return (err); } static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev __unused, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t streamid, uint64_t ringaddr, int ccs) { if (devep->ep_MaxPStreams != 0) { devep->ep_sctx[streamid].qwSctx0 = (ringaddr & ~0xFUL) | (ccs & 0x1); devep->ep_sctx_trbs[streamid].ringaddr = ringaddr & ~0xFUL; devep->ep_sctx_trbs[streamid].ccs = ccs & 0x1; ep_ctx->qwEpCtx2 = (ep_ctx->qwEpCtx2 & ~0x1) | (ccs & 0x1); DPRINTF(("xhci update ep-ring stream %d, addr %lx", streamid, devep->ep_sctx[streamid].qwSctx0)); } else { devep->ep_ringaddr = ringaddr & ~0xFUL; devep->ep_ccs = ccs & 0x1; devep->ep_tr = XHCI_GADDR(sc, ringaddr & ~0xFUL); ep_ctx->qwEpCtx2 = (ringaddr & ~0xFUL) | (ccs & 0x1); DPRINTF(("xhci update ep-ring, addr %lx", (devep->ep_ringaddr | devep->ep_ccs))); } } /* * Outstanding transfer still in progress (device NAK'd earlier) so retry * the transfer again to see if it succeeds. */ static int pci_xhci_try_usb_xfer(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, uint32_t slot, uint32_t epid) { struct usb_data_xfer *xfer; int err; int do_intr; ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); err = 0; do_intr = 0; xfer = devep->ep_xfer; USB_DATA_XFER_LOCK(xfer); /* outstanding requests queued up */ if (dev->dev_ue->ue_data != NULL) { err = dev->dev_ue->ue_data(dev->dev_sc, xfer, epid & 0x1 ? USB_XFER_IN : USB_XFER_OUT, epid/2); if (err == USB_ERR_CANCELLED) { if (USB_DATA_GET_ERRCODE(&xfer->data[xfer->head]) == USB_NAK) err = XHCI_TRB_ERROR_SUCCESS; } else { err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); if (err == XHCI_TRB_ERROR_SUCCESS && do_intr) { pci_xhci_assert_interrupt(sc); } /* XXX should not do it if error? */ USB_DATA_XFER_RESET(xfer); } } USB_DATA_XFER_UNLOCK(xfer); return (err); } static int pci_xhci_handle_transfer(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, struct xhci_trb *trb, uint32_t slot, uint32_t epid, uint64_t addr, uint32_t ccs, uint32_t streamid) { struct xhci_trb *setup_trb; struct usb_data_xfer *xfer; struct usb_data_xfer_block *xfer_block; uint64_t val; uint32_t trbflags; int do_intr, err; int do_retry; ep_ctx->dwEpCtx0 = FIELD_REPLACE(ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); xfer = devep->ep_xfer; USB_DATA_XFER_LOCK(xfer); DPRINTF(("pci_xhci handle_transfer slot %u", slot)); retry: err = XHCI_TRB_ERROR_INVALID; do_retry = 0; do_intr = 0; setup_trb = NULL; while (1) { pci_xhci_dump_trb(trb); trbflags = trb->dwTrb3; if (XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK && (trbflags & XHCI_TRB_3_CYCLE_BIT) != (ccs & XHCI_TRB_3_CYCLE_BIT)) { DPRINTF(("Cycle-bit changed trbflags %x, ccs %x", trbflags & XHCI_TRB_3_CYCLE_BIT, ccs)); break; } xfer_block = NULL; switch (XHCI_TRB_3_TYPE_GET(trbflags)) { case XHCI_TRB_TYPE_LINK: if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) ccs ^= 0x1; xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_SETUP_STAGE: if ((trbflags & XHCI_TRB_3_IDT_BIT) == 0 || XHCI_TRB_2_BYTES_GET(trb->dwTrb2) != 8) { DPRINTF(("pci_xhci: invalid setup trb")); err = XHCI_TRB_ERROR_TRB; goto errout; } setup_trb = trb; val = trb->qwTrb0; if (!xfer->ureq) xfer->ureq = malloc( sizeof(struct usb_device_request)); memcpy(xfer->ureq, &val, sizeof(struct usb_device_request)); xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_NORMAL: case XHCI_TRB_TYPE_ISOCH: if (setup_trb != NULL) { DPRINTF(("pci_xhci: trb not supposed to be in " "ctl scope")); err = XHCI_TRB_ERROR_TRB; goto errout; } /* fall through */ case XHCI_TRB_TYPE_DATA_STAGE: xfer_block = usb_data_xfer_append(xfer, (void *)(trbflags & XHCI_TRB_3_IDT_BIT ? &trb->qwTrb0 : XHCI_GADDR(sc, trb->qwTrb0)), trb->dwTrb2 & 0x1FFFF, (void *)addr, ccs); break; case XHCI_TRB_TYPE_STATUS_STAGE: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); break; case XHCI_TRB_TYPE_NOOP: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); xfer_block->processed = 1; break; case XHCI_TRB_TYPE_EVENT_DATA: xfer_block = usb_data_xfer_append(xfer, NULL, 0, (void *)addr, ccs); if ((epid > 1) && (trbflags & XHCI_TRB_3_IOC_BIT)) { xfer_block->processed = 1; } break; default: DPRINTF(("pci_xhci: handle xfer unexpected trb type " "0x%x", XHCI_TRB_3_TYPE_GET(trbflags))); err = XHCI_TRB_ERROR_TRB; goto errout; } trb = pci_xhci_trb_next(sc, trb, &addr); DPRINTF(("pci_xhci: next trb: 0x%lx", (uint64_t)trb)); if (xfer_block) { xfer_block->trbnext = addr; xfer_block->streamid = streamid; } if (!setup_trb && !(trbflags & XHCI_TRB_3_CHAIN_BIT) && XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK) { break; } /* handle current batch that requires interrupt on complete */ if (trbflags & XHCI_TRB_3_IOC_BIT) { DPRINTF(("pci_xhci: trb IOC bit set")); if (epid == 1) do_retry = 1; break; } } DPRINTF(("pci_xhci[%d]: xfer->ndata %u", __LINE__, xfer->ndata)); if (xfer->ndata <= 0) goto errout; if (epid == 1) { int usberr; if (dev->dev_ue->ue_request != NULL) usberr = dev->dev_ue->ue_request(dev->dev_sc, xfer); else usberr = USB_ERR_NOT_STARTED; err = USB_TO_XHCI_ERR(usberr); if (err == XHCI_TRB_ERROR_SUCCESS || err == XHCI_TRB_ERROR_STALL || err == XHCI_TRB_ERROR_SHORT_PKT) { err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); if (err != XHCI_TRB_ERROR_SUCCESS) do_retry = 0; } } else { /* handle data transfer */ pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); err = XHCI_TRB_ERROR_SUCCESS; } errout: if (err == XHCI_TRB_ERROR_EV_RING_FULL) DPRINTF(("pci_xhci[%d]: event ring full", __LINE__)); if (!do_retry) USB_DATA_XFER_UNLOCK(xfer); if (do_intr) pci_xhci_assert_interrupt(sc); if (do_retry) { USB_DATA_XFER_RESET(xfer); DPRINTF(("pci_xhci[%d]: retry:continuing with next TRBs", __LINE__)); goto retry; } if (epid == 1) USB_DATA_XFER_RESET(xfer); return (err); } static void pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot, uint32_t epid, uint32_t streamid) { struct pci_xhci_dev_emu *dev; struct pci_xhci_dev_ep *devep; struct xhci_dev_ctx *dev_ctx; struct xhci_endp_ctx *ep_ctx; struct pci_xhci_trb_ring *sctx_tr; struct xhci_trb *trb; uint64_t ringaddr; uint32_t ccs; int error; DPRINTF(("pci_xhci doorbell slot %u epid %u stream %u", slot, epid, streamid)); if (slot == 0 || slot > XHCI_MAX_SLOTS) { DPRINTF(("pci_xhci: invalid doorbell slot %u", slot)); return; } if (epid == 0 || epid >= XHCI_MAX_ENDPOINTS) { DPRINTF(("pci_xhci: invalid endpoint %u", epid)); return; } dev = XHCI_SLOTDEV_PTR(sc, slot); devep = &dev->eps[epid]; dev_ctx = pci_xhci_get_dev_ctx(sc, slot); if (!dev_ctx) { return; } ep_ctx = &dev_ctx->ctx_ep[epid]; sctx_tr = NULL; DPRINTF(("pci_xhci: device doorbell ep[%u] %08x %08x %016lx %08x", epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, ep_ctx->dwEpCtx4)); if (ep_ctx->qwEpCtx2 == 0) return; /* handle pending transfers */ if (devep->ep_xfer->ndata > 0) { pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); return; } /* get next trb work item */ if (devep->ep_MaxPStreams != 0) { /* * Stream IDs of 0, 65535 (any stream), and 65534 * (prime) are invalid. */ if (streamid == 0 || streamid == 65534 || streamid == 65535) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } error = pci_xhci_find_stream(sc, ep_ctx, devep, streamid); if (error != XHCI_TRB_ERROR_SUCCESS) { DPRINTF(("pci_xhci: invalid stream %u: %d", streamid, error)); return; } sctx_tr = &devep->ep_sctx_trbs[streamid]; ringaddr = sctx_tr->ringaddr; ccs = sctx_tr->ccs; trb = XHCI_GADDR(sc, sctx_tr->ringaddr & ~0xFUL); DPRINTF(("doorbell, stream %u, ccs %lx, trb ccs %x", streamid, ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); } else { if (streamid != 0) { DPRINTF(("pci_xhci: invalid stream %u", streamid)); return; } ringaddr = devep->ep_ringaddr; ccs = devep->ep_ccs; trb = devep->ep_tr; DPRINTF(("doorbell, ccs %lx, trb ccs %x", ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); } if (XHCI_TRB_3_TYPE_GET(trb->dwTrb3) == 0) { DPRINTF(("pci_xhci: ring %lx trb[%lx] EP %u is RESERVED?", ep_ctx->qwEpCtx2, devep->ep_ringaddr, epid)); return; } pci_xhci_handle_transfer(sc, dev, devep, ep_ctx, trb, slot, epid, ringaddr, ccs, streamid); } static void pci_xhci_dbregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { offset = (offset - sc->dboff) / sizeof(uint32_t); DPRINTF(("pci_xhci: doorbell write offset 0x%lx: 0x%lx", offset, value)); if (XHCI_HALTED(sc)) { DPRINTF(("pci_xhci: controller halted")); return; } if (offset == 0) pci_xhci_complete_commands(sc); else if (sc->portregs != NULL) pci_xhci_device_doorbell(sc, offset, XHCI_DB_TARGET_GET(value), XHCI_DB_SID_GET(value)); } static void pci_xhci_rtsregs_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { struct pci_xhci_rtsregs *rts; offset -= sc->rtsoff; if (offset == 0) { DPRINTF(("pci_xhci attempted write to MFINDEX")); return; } DPRINTF(("pci_xhci: runtime regs write offset 0x%lx: 0x%lx", offset, value)); offset -= 0x20; /* start of intrreg */ rts = &sc->rtsregs; switch (offset) { case 0x00: if (value & XHCI_IMAN_INTR_PEND) rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; rts->intrreg.iman = (value & XHCI_IMAN_INTR_ENA) | (rts->intrreg.iman & XHCI_IMAN_INTR_PEND); if (!(value & XHCI_IMAN_INTR_ENA)) pci_xhci_deassert_interrupt(sc); break; case 0x04: rts->intrreg.imod = value; break; case 0x08: rts->intrreg.erstsz = value & 0xFFFF; break; case 0x10: /* ERSTBA low bits */ rts->intrreg.erstba = MASK_64_HI(sc->rtsregs.intrreg.erstba) | (value & ~0x3F); break; case 0x14: /* ERSTBA high bits */ rts->intrreg.erstba = (value << 32) | MASK_64_LO(sc->rtsregs.intrreg.erstba); rts->erstba_p = XHCI_GADDR(sc, sc->rtsregs.intrreg.erstba & ~0x3FUL); rts->erst_p = XHCI_GADDR(sc, sc->rtsregs.erstba_p->qwEvrsTablePtr & ~0x3FUL); rts->er_enq_idx = 0; rts->er_events_cnt = 0; DPRINTF(("pci_xhci: wr erstba erst (%p) ptr 0x%lx, sz %u", rts->erstba_p, rts->erstba_p->qwEvrsTablePtr, rts->erstba_p->dwEvrsTableSize)); break; case 0x18: /* ERDP low bits */ rts->intrreg.erdp = MASK_64_HI(sc->rtsregs.intrreg.erdp) | (rts->intrreg.erdp & XHCI_ERDP_LO_BUSY) | (value & ~0xF); if (value & XHCI_ERDP_LO_BUSY) { rts->intrreg.erdp &= ~XHCI_ERDP_LO_BUSY; rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; } rts->er_deq_seg = XHCI_ERDP_LO_SINDEX(value); break; case 0x1C: /* ERDP high bits */ rts->intrreg.erdp = (value << 32) | MASK_64_LO(sc->rtsregs.intrreg.erdp); if (rts->er_events_cnt > 0) { uint64_t erdp; int erdp_i; erdp = rts->intrreg.erdp & ~0xF; erdp_i = (erdp - rts->erstba_p->qwEvrsTablePtr) / sizeof(struct xhci_trb); if (erdp_i <= rts->er_enq_idx) rts->er_events_cnt = rts->er_enq_idx - erdp_i; else rts->er_events_cnt = rts->erstba_p->dwEvrsTableSize - (erdp_i - rts->er_enq_idx); DPRINTF(("pci_xhci: erdp 0x%lx, events cnt %u", erdp, rts->er_events_cnt)); } break; default: DPRINTF(("pci_xhci attempted write to RTS offset 0x%lx", offset)); break; } } static uint64_t pci_xhci_portregs_read(struct pci_xhci_softc *sc, uint64_t offset) { struct pci_xhci_portregs *portregs; int port; uint32_t reg; if (sc->portregs == NULL) return (0); port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ; offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ; if (port > XHCI_MAX_DEVS) { DPRINTF(("pci_xhci: portregs_read port %d >= XHCI_MAX_DEVS", port)); /* return default value for unused port */ return (XHCI_PS_SPEED_SET(3)); } portregs = XHCI_PORTREG_PTR(sc, port); switch (offset) { case 0: reg = portregs->portsc; break; case 4: reg = portregs->portpmsc; break; case 8: reg = portregs->portli; break; case 12: reg = portregs->porthlpmc; break; default: DPRINTF(("pci_xhci: unaligned portregs read offset %#lx", offset)); reg = 0xffffffff; break; } DPRINTF(("pci_xhci: portregs read offset 0x%lx port %u -> 0x%x", offset, port, reg)); return (reg); } static void pci_xhci_hostop_write(struct pci_xhci_softc *sc, uint64_t offset, uint64_t value) { offset -= XHCI_CAPLEN; if (offset < 0x400) DPRINTF(("pci_xhci: hostop write offset 0x%lx: 0x%lx", offset, value)); switch (offset) { case XHCI_USBCMD: sc->opregs.usbcmd = pci_xhci_usbcmd_write(sc, value & 0x3F0F); break; case XHCI_USBSTS: /* clear bits on write */ sc->opregs.usbsts &= ~(value & (XHCI_STS_HSE|XHCI_STS_EINT|XHCI_STS_PCD|XHCI_STS_SSS| XHCI_STS_RSS|XHCI_STS_SRE|XHCI_STS_CNR)); break; case XHCI_PAGESIZE: /* read only */ break; case XHCI_DNCTRL: sc->opregs.dnctrl = value & 0xFFFF; break; case XHCI_CRCR_LO: if (sc->opregs.crcr & XHCI_CRCR_LO_CRR) { sc->opregs.crcr &= ~(XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); sc->opregs.crcr |= value & (XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); } else { sc->opregs.crcr = MASK_64_HI(sc->opregs.crcr) | (value & (0xFFFFFFC0 | XHCI_CRCR_LO_RCS)); } break; case XHCI_CRCR_HI: if (!(sc->opregs.crcr & XHCI_CRCR_LO_CRR)) { sc->opregs.crcr = MASK_64_LO(sc->opregs.crcr) | (value << 32); sc->opregs.cr_p = XHCI_GADDR(sc, sc->opregs.crcr & ~0xF); } if (sc->opregs.crcr & XHCI_CRCR_LO_CS) { /* Stop operation of Command Ring */ } if (sc->opregs.crcr & XHCI_CRCR_LO_CA) { /* Abort command */ } break; case XHCI_DCBAAP_LO: sc->opregs.dcbaap = MASK_64_HI(sc->opregs.dcbaap) | (value & 0xFFFFFFC0); break; case XHCI_DCBAAP_HI: sc->opregs.dcbaap = MASK_64_LO(sc->opregs.dcbaap) | (value << 32); sc->opregs.dcbaa_p = XHCI_GADDR(sc, sc->opregs.dcbaap & ~0x3FUL); DPRINTF(("pci_xhci: opregs dcbaap = 0x%lx (vaddr 0x%lx)", sc->opregs.dcbaap, (uint64_t)sc->opregs.dcbaa_p)); break; case XHCI_CONFIG: sc->opregs.config = value & 0x03FF; break; default: if (offset >= 0x400) pci_xhci_portregs_write(sc, offset, value); break; } } static void -pci_xhci_write(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size __unused, - uint64_t value) +pci_xhci_write(struct pci_devinst *pi, int baridx, uint64_t offset, + int size __unused, uint64_t value) { struct pci_xhci_softc *sc; sc = pi->pi_arg; assert(baridx == 0); pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) /* read only registers */ WPRINTF(("pci_xhci: write RO-CAPs offset %ld", offset)); else if (offset < sc->dboff) pci_xhci_hostop_write(sc, offset, value); else if (offset < sc->rtsoff) pci_xhci_dbregs_write(sc, offset, value); else if (offset < sc->regsend) pci_xhci_rtsregs_write(sc, offset, value); else WPRINTF(("pci_xhci: write invalid offset %ld", offset)); pthread_mutex_unlock(&sc->mtx); } static uint64_t pci_xhci_hostcap_read(struct pci_xhci_softc *sc, uint64_t offset) { uint64_t value; switch (offset) { case XHCI_CAPLENGTH: /* 0x00 */ value = sc->caplength; break; case XHCI_HCSPARAMS1: /* 0x04 */ value = sc->hcsparams1; break; case XHCI_HCSPARAMS2: /* 0x08 */ value = sc->hcsparams2; break; case XHCI_HCSPARAMS3: /* 0x0C */ value = sc->hcsparams3; break; case XHCI_HCSPARAMS0: /* 0x10 */ value = sc->hccparams1; break; case XHCI_DBOFF: /* 0x14 */ value = sc->dboff; break; case XHCI_RTSOFF: /* 0x18 */ value = sc->rtsoff; break; case XHCI_HCCPRAMS2: /* 0x1C */ value = sc->hccparams2; break; default: value = 0; break; } DPRINTF(("pci_xhci: hostcap read offset 0x%lx -> 0x%lx", offset, value)); return (value); } static uint64_t pci_xhci_hostop_read(struct pci_xhci_softc *sc, uint64_t offset) { uint64_t value; offset = (offset - XHCI_CAPLEN); switch (offset) { case XHCI_USBCMD: /* 0x00 */ value = sc->opregs.usbcmd; break; case XHCI_USBSTS: /* 0x04 */ value = sc->opregs.usbsts; break; case XHCI_PAGESIZE: /* 0x08 */ value = sc->opregs.pgsz; break; case XHCI_DNCTRL: /* 0x14 */ value = sc->opregs.dnctrl; break; case XHCI_CRCR_LO: /* 0x18 */ value = sc->opregs.crcr & XHCI_CRCR_LO_CRR; break; case XHCI_CRCR_HI: /* 0x1C */ value = 0; break; case XHCI_DCBAAP_LO: /* 0x30 */ value = sc->opregs.dcbaap & 0xFFFFFFFF; break; case XHCI_DCBAAP_HI: /* 0x34 */ value = (sc->opregs.dcbaap >> 32) & 0xFFFFFFFF; break; case XHCI_CONFIG: /* 0x38 */ value = sc->opregs.config; break; default: if (offset >= 0x400) value = pci_xhci_portregs_read(sc, offset); else value = 0; break; } if (offset < 0x400) DPRINTF(("pci_xhci: hostop read offset 0x%lx -> 0x%lx", offset, value)); return (value); } static uint64_t pci_xhci_dbregs_read(struct pci_xhci_softc *sc __unused, uint64_t offset __unused) { /* read doorbell always returns 0 */ return (0); } static uint64_t pci_xhci_rtsregs_read(struct pci_xhci_softc *sc, uint64_t offset) { uint32_t value; offset -= sc->rtsoff; value = 0; if (offset == XHCI_MFINDEX) { value = sc->rtsregs.mfindex; } else if (offset >= 0x20) { int item; uint32_t *p; offset -= 0x20; item = offset % 32; assert(offset < sizeof(sc->rtsregs.intrreg)); p = &sc->rtsregs.intrreg.iman; p += item / sizeof(uint32_t); value = *p; } DPRINTF(("pci_xhci: rtsregs read offset 0x%lx -> 0x%x", offset, value)); return (value); } static uint64_t pci_xhci_xecp_read(struct pci_xhci_softc *sc, uint64_t offset) { uint32_t value; offset -= sc->regsend; value = 0; switch (offset) { case 0: /* rev major | rev minor | next-cap | cap-id */ value = (0x02 << 24) | (4 << 8) | XHCI_ID_PROTOCOLS; break; case 4: /* name string = "USB" */ value = 0x20425355; break; case 8: /* psic | proto-defined | compat # | compat offset */ value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb2_port_start; break; case 12: break; case 16: /* rev major | rev minor | next-cap | cap-id */ value = (0x03 << 24) | XHCI_ID_PROTOCOLS; break; case 20: /* name string = "USB" */ value = 0x20425355; break; case 24: /* psic | proto-defined | compat # | compat offset */ value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb3_port_start; break; case 28: break; default: DPRINTF(("pci_xhci: xecp invalid offset 0x%lx", offset)); break; } DPRINTF(("pci_xhci: xecp read offset 0x%lx -> 0x%x", offset, value)); return (value); } static uint64_t -pci_xhci_read(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size) +pci_xhci_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_xhci_softc *sc; uint32_t value; sc = pi->pi_arg; assert(baridx == 0); pthread_mutex_lock(&sc->mtx); if (offset < XHCI_CAPLEN) value = pci_xhci_hostcap_read(sc, offset); else if (offset < sc->dboff) value = pci_xhci_hostop_read(sc, offset); else if (offset < sc->rtsoff) value = pci_xhci_dbregs_read(sc, offset); else if (offset < sc->regsend) value = pci_xhci_rtsregs_read(sc, offset); else if (offset < (sc->regsend + 4*32)) value = pci_xhci_xecp_read(sc, offset); else { value = 0; WPRINTF(("pci_xhci: read invalid offset %ld", offset)); } pthread_mutex_unlock(&sc->mtx); switch (size) { case 1: value &= 0xFF; break; case 2: value &= 0xFFFF; break; case 4: value &= 0xFFFFFFFF; break; } return (value); } static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm) { struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; struct xhci_trb evtrb; int error; assert(portn <= XHCI_MAX_DEVS); DPRINTF(("xhci reset port %d", portn)); port = XHCI_PORTREG_PTR(sc, portn); dev = XHCI_DEVINST_PTR(sc, portn); if (dev) { port->portsc &= ~(XHCI_PS_PLS_MASK | XHCI_PS_PR | XHCI_PS_PRC); port->portsc |= XHCI_PS_PED | XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); if (warm && dev->dev_ue->ue_usbver == 3) { port->portsc |= XHCI_PS_WRC; } if ((port->portsc & XHCI_PS_PRC) == 0) { port->portsc |= XHCI_PS_PRC; pci_xhci_set_evtrb(&evtrb, portn, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); error = pci_xhci_insert_event(sc, &evtrb, 1); if (error != XHCI_TRB_ERROR_SUCCESS) DPRINTF(("xhci reset port insert event " "failed")); } } } static void pci_xhci_init_port(struct pci_xhci_softc *sc, int portn) { struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; port = XHCI_PORTREG_PTR(sc, portn); dev = XHCI_DEVINST_PTR(sc, portn); if (dev) { port->portsc = XHCI_PS_CCS | /* connected */ XHCI_PS_PP; /* port power */ if (dev->dev_ue->ue_usbver == 2) { port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL) | XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); } else { port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0) | XHCI_PS_PED | /* enabled */ XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); } DPRINTF(("Init port %d 0x%x", portn, port->portsc)); } else { port->portsc = XHCI_PS_PLS_SET(UPS_PORT_LS_RX_DET) | XHCI_PS_PP; DPRINTF(("Init empty port %d 0x%x", portn, port->portsc)); } } static int pci_xhci_dev_intr(struct usb_hci *hci, int epctx) { struct pci_xhci_dev_emu *dev; struct xhci_dev_ctx *dev_ctx; struct xhci_trb evtrb; struct pci_xhci_softc *sc; struct pci_xhci_portregs *p; struct xhci_endp_ctx *ep_ctx; int error = 0; int dir_in; int epid; dir_in = epctx & 0x80; epid = epctx & ~0x80; /* HW endpoint contexts are 0-15; convert to epid based on dir */ epid = (epid * 2) + (dir_in ? 1 : 0); assert(epid >= 1 && epid <= 31); dev = hci->hci_sc; sc = dev->xsc; /* check if device is ready; OS has to initialise it */ if (sc->rtsregs.erstba_p == NULL || (sc->opregs.usbcmd & XHCI_CMD_RS) == 0 || dev->dev_ctx == NULL) return (0); p = XHCI_PORTREG_PTR(sc, hci->hci_port); /* raise event if link U3 (suspended) state */ if (XHCI_PS_PLS_GET(p->portsc) == 3) { p->portsc &= ~XHCI_PS_PLS_MASK; p->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_RESUME); if ((p->portsc & XHCI_PS_PLC) != 0) return (0); p->portsc |= XHCI_PS_PLC; pci_xhci_set_evtrb(&evtrb, hci->hci_port, XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); error = pci_xhci_insert_event(sc, &evtrb, 0); if (error != XHCI_TRB_ERROR_SUCCESS) goto done; } dev_ctx = dev->dev_ctx; ep_ctx = &dev_ctx->ctx_ep[epid]; if ((ep_ctx->dwEpCtx0 & 0x7) == XHCI_ST_EPCTX_DISABLED) { DPRINTF(("xhci device interrupt on disabled endpoint %d", epid)); return (0); } DPRINTF(("xhci device interrupt on endpoint %d", epid)); pci_xhci_device_doorbell(sc, hci->hci_port, epid, 0); done: return (error); } static int pci_xhci_dev_event(struct usb_hci *hci, enum hci_usbev evid __unused, void *param __unused) { DPRINTF(("xhci device event port %d", hci->hci_port)); return (0); } /* * Each controller contains a "slot" node which contains a list of * child nodes each of which is a device. Each slot node's name * corresponds to a specific controller slot. These nodes * contain a "device" variable identifying the device model of the * USB device. For example: * * pci.0.1.0 * .device="xhci" * .slot * .1 * .device="tablet" */ static int pci_xhci_legacy_config(nvlist_t *nvl, const char *opts) { char node_name[16]; nvlist_t *slots_nvl, *slot_nvl; char *cp, *opt, *str, *tofree; int slot; if (opts == NULL) return (0); slots_nvl = create_relative_config_node(nvl, "slot"); slot = 1; tofree = str = strdup(opts); while ((opt = strsep(&str, ",")) != NULL) { /* device[=] */ cp = strchr(opt, '='); if (cp != NULL) { *cp = '\0'; cp++; } snprintf(node_name, sizeof(node_name), "%d", slot); slot++; slot_nvl = create_relative_config_node(slots_nvl, node_name); set_config_value_node(slot_nvl, "device", opt); /* * NB: Given that we split on commas above, the legacy * format only supports a single option. */ if (cp != NULL && *cp != '\0') pci_parse_legacy_config(slot_nvl, cp); } free(tofree); return (0); } static int pci_xhci_parse_devices(struct pci_xhci_softc *sc, nvlist_t *nvl) { struct pci_xhci_dev_emu *dev; struct usb_devemu *ue; const nvlist_t *slots_nvl, *slot_nvl; const char *name, *device; char *cp; void *devsc, *cookie; long slot; int type, usb3_port, usb2_port, i, ndevices; usb3_port = sc->usb3_port_start; usb2_port = sc->usb2_port_start; sc->devices = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_dev_emu *)); sc->slots = calloc(XHCI_MAX_SLOTS, sizeof(struct pci_xhci_dev_emu *)); ndevices = 0; slots_nvl = find_relative_config_node(nvl, "slot"); if (slots_nvl == NULL) goto portsfinal; cookie = NULL; while ((name = nvlist_next(slots_nvl, &type, &cookie)) != NULL) { if (usb2_port == ((sc->usb2_port_start) + XHCI_MAX_DEVS/2) || usb3_port == ((sc->usb3_port_start) + XHCI_MAX_DEVS/2)) { WPRINTF(("pci_xhci max number of USB 2 or 3 " "devices reached, max %d", XHCI_MAX_DEVS/2)); goto bad; } if (type != NV_TYPE_NVLIST) { EPRINTLN( "pci_xhci: config variable '%s' under slot node", name); goto bad; } slot = strtol(name, &cp, 0); if (*cp != '\0' || slot <= 0 || slot > XHCI_MAX_SLOTS) { EPRINTLN("pci_xhci: invalid slot '%s'", name); goto bad; } if (XHCI_SLOTDEV_PTR(sc, slot) != NULL) { EPRINTLN("pci_xhci: duplicate slot '%s'", name); goto bad; } slot_nvl = nvlist_get_nvlist(slots_nvl, name); device = get_config_value_node(slot_nvl, "device"); if (device == NULL) { EPRINTLN( "pci_xhci: missing \"device\" value for slot '%s'", name); goto bad; } ue = usb_emu_finddev(device); if (ue == NULL) { EPRINTLN("pci_xhci: unknown device model \"%s\"", device); goto bad; } DPRINTF(("pci_xhci adding device %s", device)); dev = calloc(1, sizeof(struct pci_xhci_dev_emu)); dev->xsc = sc; dev->hci.hci_sc = dev; dev->hci.hci_intr = pci_xhci_dev_intr; dev->hci.hci_event = pci_xhci_dev_event; if (ue->ue_usbver == 2) { if (usb2_port == sc->usb2_port_start + XHCI_MAX_DEVS / 2) { WPRINTF(("pci_xhci max number of USB 2 devices " "reached, max %d", XHCI_MAX_DEVS / 2)); goto bad; } dev->hci.hci_port = usb2_port; usb2_port++; } else { if (usb3_port == sc->usb3_port_start + XHCI_MAX_DEVS / 2) { WPRINTF(("pci_xhci max number of USB 3 devices " "reached, max %d", XHCI_MAX_DEVS / 2)); goto bad; } dev->hci.hci_port = usb3_port; usb3_port++; } XHCI_DEVINST_PTR(sc, dev->hci.hci_port) = dev; dev->hci.hci_address = 0; devsc = ue->ue_init(&dev->hci, nvl); if (devsc == NULL) { goto bad; } dev->dev_ue = ue; dev->dev_sc = devsc; XHCI_SLOTDEV_PTR(sc, slot) = dev; ndevices++; } portsfinal: sc->portregs = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_portregs)); if (ndevices > 0) { for (i = 1; i <= XHCI_MAX_DEVS; i++) { pci_xhci_init_port(sc, i); } } else { WPRINTF(("pci_xhci no USB devices configured")); } return (0); bad: for (i = 1; i <= XHCI_MAX_DEVS; i++) { free(XHCI_DEVINST_PTR(sc, i)); } free(sc->devices); free(sc->slots); return (-1); } static int -pci_xhci_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl) +pci_xhci_init(struct pci_devinst *pi, nvlist_t *nvl) { struct pci_xhci_softc *sc; int error; if (xhci_in_use) { WPRINTF(("pci_xhci controller already defined")); return (-1); } xhci_in_use = 1; sc = calloc(1, sizeof(struct pci_xhci_softc)); pi->pi_arg = sc; sc->xsc_pi = pi; sc->usb2_port_start = (XHCI_MAX_DEVS/2) + 1; sc->usb3_port_start = 1; /* discover devices */ error = pci_xhci_parse_devices(sc, nvl); if (error < 0) goto done; else error = 0; sc->caplength = XHCI_SET_CAPLEN(XHCI_CAPLEN) | XHCI_SET_HCIVERSION(0x0100); sc->hcsparams1 = XHCI_SET_HCSP1_MAXPORTS(XHCI_MAX_DEVS) | XHCI_SET_HCSP1_MAXINTR(1) | /* interrupters */ XHCI_SET_HCSP1_MAXSLOTS(XHCI_MAX_SLOTS); sc->hcsparams2 = XHCI_SET_HCSP2_ERSTMAX(XHCI_ERST_MAX) | XHCI_SET_HCSP2_IST(0x04); sc->hcsparams3 = 0; /* no latency */ sc->hccparams1 = XHCI_SET_HCCP1_AC64(1) | /* 64-bit addrs */ XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */ XHCI_SET_HCCP1_SPC(1) | /* short packet */ XHCI_SET_HCCP1_MAXPSA(XHCI_STREAMS_MAX); sc->hccparams2 = XHCI_SET_HCCP2_LEC(1) | XHCI_SET_HCCP2_U3C(1); sc->dboff = XHCI_SET_DOORBELL(XHCI_CAPLEN + XHCI_PORTREGS_START + XHCI_MAX_DEVS * sizeof(struct pci_xhci_portregs)); /* dboff must be 32-bit aligned */ if (sc->dboff & 0x3) sc->dboff = (sc->dboff + 0x3) & ~0x3; /* rtsoff must be 32-bytes aligned */ sc->rtsoff = XHCI_SET_RTSOFFSET(sc->dboff + (XHCI_MAX_SLOTS+1) * 32); if (sc->rtsoff & 0x1F) sc->rtsoff = (sc->rtsoff + 0x1F) & ~0x1F; DPRINTF(("pci_xhci dboff: 0x%x, rtsoff: 0x%x", sc->dboff, sc->rtsoff)); sc->opregs.usbsts = XHCI_STS_HCH; sc->opregs.pgsz = XHCI_PAGESIZE_4K; pci_xhci_reset(sc); sc->regsend = sc->rtsoff + 0x20 + 32; /* only 1 intrpter */ /* * Set extended capabilities pointer to be after regsend; * value of xecp field is 32-bit offset. */ sc->hccparams1 |= XHCI_SET_HCCP1_XECP(sc->regsend/4); pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1E31); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SERIALBUS); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_SERIALBUS_USB); pci_set_cfgdata8(pi, PCIR_PROGIF,PCIP_SERIALBUS_USB_XHCI); pci_set_cfgdata8(pi, PCI_USBREV, PCI_USB_REV_3_0); pci_emul_add_msicap(pi, 1); /* regsend + xecp registers */ pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, sc->regsend + 4*32); DPRINTF(("pci_xhci pci_emu_alloc: %d", sc->regsend + 4*32)); pci_lintr_request(pi); pthread_mutex_init(&sc->mtx, NULL); done: if (error) { free(sc); } return (error); } #ifdef BHYVE_SNAPSHOT static void pci_xhci_map_devs_slots(struct pci_xhci_softc *sc, int maps[]) { int i, j; struct pci_xhci_dev_emu *dev, *slot; memset(maps, 0, sizeof(maps[0]) * XHCI_MAX_SLOTS); for (i = 1; i <= XHCI_MAX_SLOTS; i++) { for (j = 1; j <= XHCI_MAX_DEVS; j++) { slot = XHCI_SLOTDEV_PTR(sc, i); dev = XHCI_DEVINST_PTR(sc, j); if (slot == dev) maps[i] = j; } } } static int pci_xhci_snapshot_ep(struct pci_xhci_softc *sc __unused, struct pci_xhci_dev_emu *dev, int idx, struct vm_snapshot_meta *meta) { int k; int ret; struct usb_data_xfer *xfer; struct usb_data_xfer_block *xfer_block; /* some sanity checks */ if (meta->op == VM_SNAPSHOT_SAVE) xfer = dev->eps[idx].ep_xfer; SNAPSHOT_VAR_OR_LEAVE(xfer, meta, ret, done); if (xfer == NULL) { ret = 0; goto done; } if (meta->op == VM_SNAPSHOT_RESTORE) { pci_xhci_init_ep(dev, idx); xfer = dev->eps[idx].ep_xfer; } /* save / restore proper */ for (k = 0; k < USB_MAX_XFER_BLOCKS; k++) { xfer_block = &xfer->data[k]; SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(xfer_block->buf, XHCI_GADDR_SIZE(xfer_block->buf), true, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->blen, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->bdone, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->processed, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->hci_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->ccs, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->streamid, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer_block->trbnext, meta, ret, done); } SNAPSHOT_VAR_OR_LEAVE(xfer->ureq, meta, ret, done); if (xfer->ureq) { /* xfer->ureq is not allocated at restore time */ if (meta->op == VM_SNAPSHOT_RESTORE) xfer->ureq = malloc(sizeof(struct usb_device_request)); SNAPSHOT_BUF_OR_LEAVE(xfer->ureq, sizeof(struct usb_device_request), meta, ret, done); } SNAPSHOT_VAR_OR_LEAVE(xfer->ndata, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer->head, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(xfer->tail, meta, ret, done); done: return (ret); } static int pci_xhci_snapshot(struct vm_snapshot_meta *meta) { int i, j; int ret; int restore_idx; struct pci_devinst *pi; struct pci_xhci_softc *sc; struct pci_xhci_portregs *port; struct pci_xhci_dev_emu *dev; char dname[SNAP_DEV_NAME_LEN]; int maps[XHCI_MAX_SLOTS + 1]; pi = meta->dev_data; sc = pi->pi_arg; SNAPSHOT_VAR_OR_LEAVE(sc->caplength, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams1, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams3, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hccparams1, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->dboff, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsoff, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->hccparams2, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->regsend, meta, ret, done); /* opregs */ SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbcmd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbsts, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.pgsz, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dnctrl, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.crcr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dcbaap, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->opregs.config, meta, ret, done); /* opregs.cr_p */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.cr_p, XHCI_GADDR_SIZE(sc->opregs.cr_p), true, meta, ret, done); /* opregs.dcbaa_p */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.dcbaa_p, XHCI_GADDR_SIZE(sc->opregs.dcbaa_p), true, meta, ret, done); /* rtsregs */ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.mfindex, meta, ret, done); /* rtsregs.intrreg */ SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.iman, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.imod, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstsz, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.rsvd, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstba, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erdp, meta, ret, done); /* rtsregs.erstba_p */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erstba_p, XHCI_GADDR_SIZE(sc->rtsregs.erstba_p), true, meta, ret, done); /* rtsregs.erst_p */ SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erst_p, XHCI_GADDR_SIZE(sc->rtsregs.erst_p), true, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_deq_seg, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_idx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_seg, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_events_cnt, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.event_pcs, meta, ret, done); /* sanity checking */ for (i = 1; i <= XHCI_MAX_DEVS; i++) { dev = XHCI_DEVINST_PTR(sc, i); if (dev == NULL) continue; if (meta->op == VM_SNAPSHOT_SAVE) restore_idx = i; SNAPSHOT_VAR_OR_LEAVE(restore_idx, meta, ret, done); /* check if the restored device (when restoring) is sane */ if (restore_idx != i) { fprintf(stderr, "%s: idx not matching: actual: %d, " "expected: %d\r\n", __func__, restore_idx, i); ret = EINVAL; goto done; } if (meta->op == VM_SNAPSHOT_SAVE) { memset(dname, 0, sizeof(dname)); strncpy(dname, dev->dev_ue->ue_emu, sizeof(dname) - 1); } SNAPSHOT_BUF_OR_LEAVE(dname, sizeof(dname), meta, ret, done); if (meta->op == VM_SNAPSHOT_RESTORE) { dname[sizeof(dname) - 1] = '\0'; if (strcmp(dev->dev_ue->ue_emu, dname)) { fprintf(stderr, "%s: device names mismatch: " "actual: %s, expected: %s\r\n", __func__, dname, dev->dev_ue->ue_emu); ret = EINVAL; goto done; } } } /* portregs */ for (i = 1; i <= XHCI_MAX_DEVS; i++) { port = XHCI_PORTREG_PTR(sc, i); dev = XHCI_DEVINST_PTR(sc, i); if (dev == NULL) continue; SNAPSHOT_VAR_OR_LEAVE(port->portsc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->portpmsc, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->portli, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(port->porthlpmc, meta, ret, done); } /* slots */ if (meta->op == VM_SNAPSHOT_SAVE) pci_xhci_map_devs_slots(sc, maps); for (i = 1; i <= XHCI_MAX_SLOTS; i++) { SNAPSHOT_VAR_OR_LEAVE(maps[i], meta, ret, done); if (meta->op == VM_SNAPSHOT_SAVE) { dev = XHCI_SLOTDEV_PTR(sc, i); } else if (meta->op == VM_SNAPSHOT_RESTORE) { if (maps[i] != 0) dev = XHCI_DEVINST_PTR(sc, maps[i]); else dev = NULL; XHCI_SLOTDEV_PTR(sc, i) = dev; } else { /* error */ ret = EINVAL; goto done; } if (dev == NULL) continue; SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(dev->dev_ctx, XHCI_GADDR_SIZE(dev->dev_ctx), true, meta, ret, done); if (dev->dev_ctx != NULL) { for (j = 1; j < XHCI_MAX_ENDPOINTS; j++) { ret = pci_xhci_snapshot_ep(sc, dev, j, meta); if (ret != 0) goto done; } } SNAPSHOT_VAR_OR_LEAVE(dev->dev_slotstate, meta, ret, done); /* devices[i]->dev_sc */ dev->dev_ue->ue_snapshot(dev->dev_sc, meta); /* devices[i]->hci */ SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_address, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_port, meta, ret, done); } SNAPSHOT_VAR_OR_LEAVE(sc->usb2_port_start, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(sc->usb3_port_start, meta, ret, done); done: return (ret); } #endif static const struct pci_devemu pci_de_xhci = { .pe_emu = "xhci", .pe_init = pci_xhci_init, .pe_legacy_config = pci_xhci_legacy_config, .pe_barwrite = pci_xhci_write, .pe_barread = pci_xhci_read, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_xhci_snapshot, #endif }; PCI_EMUL_SET(pci_de_xhci); diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c index 6f4e3c1f91a2..0c9e32abd326 100644 --- a/usr.sbin/bhyve/snapshot.c +++ b/usr.sbin/bhyve/snapshot.c @@ -1,1699 +1,1699 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Flavius Anton * Copyright (c) 2016 Mihai Tiganus * Copyright (c) 2016-2019 Mihai Carabas * Copyright (c) 2017-2019 Darius Mihai * Copyright (c) 2017-2019 Elena Mihailescu * Copyright (c) 2018-2019 Sergiu Weisz * All rights reserved. * The bhyve-snapshot feature was developed under sponsorships * from Matthew Grooms. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef WITHOUT_CAPSICUM #include #endif #include #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "debug.h" #include "inout.h" #include "ipc.h" #include "fwctl.h" #include "ioapic.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" #include "snapshot.h" #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #include #include struct spinner_info { const size_t *crtval; const size_t maxval; const size_t total; }; extern int guest_ncpus; static struct winsize winsize; static sig_t old_winch_handler; #define KB (1024UL) #define MB (1024UL * KB) #define GB (1024UL * MB) #define SNAPSHOT_CHUNK (4 * MB) #define PROG_BUF_SZ (8192) #define SNAPSHOT_BUFFER_SIZE (20 * MB) #define JSON_STRUCT_ARR_KEY "structs" #define JSON_DEV_ARR_KEY "devices" #define JSON_BASIC_METADATA_KEY "basic metadata" #define JSON_SNAPSHOT_REQ_KEY "snapshot_req" #define JSON_SIZE_KEY "size" #define JSON_FILE_OFFSET_KEY "file_offset" #define JSON_NCPUS_KEY "ncpus" #define JSON_VMNAME_KEY "vmname" #define JSON_MEMSIZE_KEY "memsize" #define JSON_MEMFLAGS_KEY "memflags" #define min(a,b) \ ({ \ __typeof__ (a) _a = (a); \ __typeof__ (b) _b = (b); \ _a < _b ? _a : _b; \ }) static const struct vm_snapshot_dev_info snapshot_devs[] = { { "atkbdc", atkbdc_snapshot, NULL, NULL }, { "virtio-net", pci_snapshot, pci_pause, pci_resume }, { "virtio-blk", pci_snapshot, pci_pause, pci_resume }, { "virtio-rnd", pci_snapshot, NULL, NULL }, { "lpc", pci_snapshot, NULL, NULL }, { "fbuf", pci_snapshot, NULL, NULL }, { "xhci", pci_snapshot, NULL, NULL }, { "e1000", pci_snapshot, NULL, NULL }, { "ahci", pci_snapshot, pci_pause, pci_resume }, { "ahci-hd", pci_snapshot, pci_pause, pci_resume }, { "ahci-cd", pci_snapshot, pci_pause, pci_resume }, }; static const struct vm_snapshot_kern_info snapshot_kern_structs[] = { { "vhpet", STRUCT_VHPET }, { "vm", STRUCT_VM }, { "vmx", STRUCT_VMX }, { "vioapic", STRUCT_VIOAPIC }, { "vlapic", STRUCT_VLAPIC }, { "vmcx", STRUCT_VMCX }, { "vatpit", STRUCT_VATPIT }, { "vatpic", STRUCT_VATPIC }, { "vpmtmr", STRUCT_VPMTMR }, { "vrtc", STRUCT_VRTC }, }; static cpuset_t vcpus_active, vcpus_suspended; static pthread_mutex_t vcpu_lock; static pthread_cond_t vcpus_idle, vcpus_can_run; static bool checkpoint_active; /* * TODO: Harden this function and all of its callers since 'base_str' is a user * provided string. */ static char * strcat_extension(const char *base_str, const char *ext) { char *res; size_t base_len, ext_len; base_len = strnlen(base_str, NAME_MAX); ext_len = strnlen(ext, NAME_MAX); if (base_len + ext_len > NAME_MAX) { fprintf(stderr, "Filename exceeds maximum length.\n"); return (NULL); } res = malloc(base_len + ext_len + 1); if (res == NULL) { perror("Failed to allocate memory."); return (NULL); } memcpy(res, base_str, base_len); memcpy(res + base_len, ext, ext_len); res[base_len + ext_len] = 0; return (res); } void destroy_restore_state(struct restore_state *rstate) { if (rstate == NULL) { fprintf(stderr, "Attempting to destroy NULL restore struct.\n"); return; } if (rstate->kdata_map != MAP_FAILED) munmap(rstate->kdata_map, rstate->kdata_len); if (rstate->kdata_fd > 0) close(rstate->kdata_fd); if (rstate->vmmem_fd > 0) close(rstate->vmmem_fd); if (rstate->meta_root_obj != NULL) ucl_object_unref(rstate->meta_root_obj); if (rstate->meta_parser != NULL) ucl_parser_free(rstate->meta_parser); } static int load_vmmem_file(const char *filename, struct restore_state *rstate) { struct stat sb; int err; rstate->vmmem_fd = open(filename, O_RDONLY); if (rstate->vmmem_fd < 0) { perror("Failed to open restore file"); return (-1); } err = fstat(rstate->vmmem_fd, &sb); if (err < 0) { perror("Failed to stat restore file"); goto err_load_vmmem; } if (sb.st_size == 0) { fprintf(stderr, "Restore file is empty.\n"); goto err_load_vmmem; } rstate->vmmem_len = sb.st_size; return (0); err_load_vmmem: if (rstate->vmmem_fd > 0) close(rstate->vmmem_fd); return (-1); } static int load_kdata_file(const char *filename, struct restore_state *rstate) { struct stat sb; int err; rstate->kdata_fd = open(filename, O_RDONLY); if (rstate->kdata_fd < 0) { perror("Failed to open kernel data file"); return (-1); } err = fstat(rstate->kdata_fd, &sb); if (err < 0) { perror("Failed to stat kernel data file"); goto err_load_kdata; } if (sb.st_size == 0) { fprintf(stderr, "Kernel data file is empty.\n"); goto err_load_kdata; } rstate->kdata_len = sb.st_size; rstate->kdata_map = mmap(NULL, rstate->kdata_len, PROT_READ, MAP_SHARED, rstate->kdata_fd, 0); if (rstate->kdata_map == MAP_FAILED) { perror("Failed to map restore file"); goto err_load_kdata; } return (0); err_load_kdata: if (rstate->kdata_fd > 0) close(rstate->kdata_fd); return (-1); } static int load_metadata_file(const char *filename, struct restore_state *rstate) { ucl_object_t *obj; struct ucl_parser *parser; int err; parser = ucl_parser_new(UCL_PARSER_DEFAULT); if (parser == NULL) { fprintf(stderr, "Failed to initialize UCL parser.\n"); err = -1; goto err_load_metadata; } err = ucl_parser_add_file(parser, filename); if (err == 0) { fprintf(stderr, "Failed to parse metadata file: '%s'\n", filename); err = -1; goto err_load_metadata; } obj = ucl_parser_get_object(parser); if (obj == NULL) { fprintf(stderr, "Failed to parse object.\n"); err = -1; goto err_load_metadata; } rstate->meta_parser = parser; rstate->meta_root_obj = (ucl_object_t *)obj; return (0); err_load_metadata: if (parser != NULL) ucl_parser_free(parser); return (err); } int load_restore_file(const char *filename, struct restore_state *rstate) { int err = 0; char *kdata_filename = NULL, *meta_filename = NULL; assert(filename != NULL); assert(rstate != NULL); memset(rstate, 0, sizeof(*rstate)); rstate->kdata_map = MAP_FAILED; err = load_vmmem_file(filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest RAM file.\n"); goto err_restore; } kdata_filename = strcat_extension(filename, ".kern"); if (kdata_filename == NULL) { fprintf(stderr, "Failed to construct kernel data filename.\n"); goto err_restore; } err = load_kdata_file(kdata_filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest kernel data file.\n"); goto err_restore; } meta_filename = strcat_extension(filename, ".meta"); if (meta_filename == NULL) { fprintf(stderr, "Failed to construct kernel metadata filename.\n"); goto err_restore; } err = load_metadata_file(meta_filename, rstate); if (err != 0) { fprintf(stderr, "Failed to load guest metadata file.\n"); goto err_restore; } return (0); err_restore: destroy_restore_state(rstate); if (kdata_filename != NULL) free(kdata_filename); if (meta_filename != NULL) free(meta_filename); return (-1); } #define JSON_GET_INT_OR_RETURN(key, obj, result_ptr, ret) \ do { \ const ucl_object_t *obj__; \ obj__ = ucl_object_lookup(obj, key); \ if (obj__ == NULL) { \ fprintf(stderr, "Missing key: '%s'", key); \ return (ret); \ } \ if (!ucl_object_toint_safe(obj__, result_ptr)) { \ fprintf(stderr, "Cannot convert '%s' value to int.", key); \ return (ret); \ } \ } while(0) #define JSON_GET_STRING_OR_RETURN(key, obj, result_ptr, ret) \ do { \ const ucl_object_t *obj__; \ obj__ = ucl_object_lookup(obj, key); \ if (obj__ == NULL) { \ fprintf(stderr, "Missing key: '%s'", key); \ return (ret); \ } \ if (!ucl_object_tostring_safe(obj__, result_ptr)) { \ fprintf(stderr, "Cannot convert '%s' value to string.", key); \ return (ret); \ } \ } while(0) static void * lookup_struct(enum snapshot_req struct_id, struct restore_state *rstate, size_t *struct_size) { const ucl_object_t *structs = NULL, *obj = NULL; ucl_object_iter_t it = NULL; int64_t snapshot_req, size, file_offset; structs = ucl_object_lookup(rstate->meta_root_obj, JSON_STRUCT_ARR_KEY); if (structs == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_STRUCT_ARR_KEY); return (NULL); } if (ucl_object_type(structs) != UCL_ARRAY) { fprintf(stderr, "Object '%s' is not an array.\n", JSON_STRUCT_ARR_KEY); return (NULL); } while ((obj = ucl_object_iterate(structs, &it, true)) != NULL) { snapshot_req = -1; JSON_GET_INT_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, &snapshot_req, NULL); assert(snapshot_req >= 0); if ((enum snapshot_req) snapshot_req == struct_id) { JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, &size, NULL); assert(size >= 0); JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, &file_offset, NULL); assert(file_offset >= 0); assert((uint64_t)file_offset + size <= rstate->kdata_len); *struct_size = (size_t)size; return ((uint8_t *)rstate->kdata_map + file_offset); } } return (NULL); } static void * lookup_check_dev(const char *dev_name, struct restore_state *rstate, const ucl_object_t *obj, size_t *data_size) { const char *snapshot_req; int64_t size, file_offset; snapshot_req = NULL; JSON_GET_STRING_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, &snapshot_req, NULL); assert(snapshot_req != NULL); if (!strcmp(snapshot_req, dev_name)) { JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, &size, NULL); assert(size >= 0); JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, &file_offset, NULL); assert(file_offset >= 0); assert((uint64_t)file_offset + size <= rstate->kdata_len); *data_size = (size_t)size; return ((uint8_t *)rstate->kdata_map + file_offset); } return (NULL); } static void* lookup_dev(const char *dev_name, struct restore_state *rstate, size_t *data_size) { const ucl_object_t *devs = NULL, *obj = NULL; ucl_object_iter_t it = NULL; void *ret; devs = ucl_object_lookup(rstate->meta_root_obj, JSON_DEV_ARR_KEY); if (devs == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_DEV_ARR_KEY); return (NULL); } if (ucl_object_type(devs) != UCL_ARRAY) { fprintf(stderr, "Object '%s' is not an array.\n", JSON_DEV_ARR_KEY); return (NULL); } while ((obj = ucl_object_iterate(devs, &it, true)) != NULL) { ret = lookup_check_dev(dev_name, rstate, obj, data_size); if (ret != NULL) return (ret); } return (NULL); } static const ucl_object_t * lookup_basic_metadata_object(struct restore_state *rstate) { const ucl_object_t *basic_meta_obj = NULL; basic_meta_obj = ucl_object_lookup(rstate->meta_root_obj, JSON_BASIC_METADATA_KEY); if (basic_meta_obj == NULL) { fprintf(stderr, "Failed to find '%s' object.\n", JSON_BASIC_METADATA_KEY); return (NULL); } if (ucl_object_type(basic_meta_obj) != UCL_OBJECT) { fprintf(stderr, "Object '%s' is not a JSON object.\n", JSON_BASIC_METADATA_KEY); return (NULL); } return (basic_meta_obj); } const char * lookup_vmname(struct restore_state *rstate) { const char *vmname; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (NULL); JSON_GET_STRING_OR_RETURN(JSON_VMNAME_KEY, obj, &vmname, NULL); return (vmname); } int lookup_memflags(struct restore_state *rstate) { int64_t memflags; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_MEMFLAGS_KEY, obj, &memflags, 0); return ((int)memflags); } size_t lookup_memsize(struct restore_state *rstate) { int64_t memsize; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_MEMSIZE_KEY, obj, &memsize, 0); if (memsize < 0) memsize = 0; return ((size_t)memsize); } int lookup_guest_ncpus(struct restore_state *rstate) { int64_t ncpus; const ucl_object_t *obj; obj = lookup_basic_metadata_object(rstate); if (obj == NULL) return (0); JSON_GET_INT_OR_RETURN(JSON_NCPUS_KEY, obj, &ncpus, 0); return ((int)ncpus); } static void winch_handler(int signal __unused) { #ifdef TIOCGWINSZ ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); #endif /* TIOCGWINSZ */ } static int print_progress(size_t crtval, const size_t maxval) { size_t rc; double crtval_gb, maxval_gb; size_t i, win_width, prog_start, prog_done, prog_end; int mval_len; static char prog_buf[PROG_BUF_SZ]; static const size_t len = sizeof(prog_buf); static size_t div; static const char *div_str; static char wip_bar[] = { '/', '-', '\\', '|' }; static int wip_idx = 0; if (maxval == 0) { printf("[0B / 0B]\r\n"); return (0); } if (crtval > maxval) crtval = maxval; if (maxval > 10 * GB) { div = GB; div_str = "GiB"; } else if (maxval > 10 * MB) { div = MB; div_str = "MiB"; } else { div = KB; div_str = "KiB"; } crtval_gb = (double) crtval / div; maxval_gb = (double) maxval / div; rc = snprintf(prog_buf, len, "%.03lf", maxval_gb); if (rc == len) { fprintf(stderr, "Maxval too big\n"); return (-1); } mval_len = rc; rc = snprintf(prog_buf, len, "\r[%*.03lf%s / %.03lf%s] |", mval_len, crtval_gb, div_str, maxval_gb, div_str); if (rc == len) { fprintf(stderr, "Buffer too small to print progress\n"); return (-1); } win_width = min(winsize.ws_col, len); prog_start = rc; if (prog_start < (win_width - 2)) { prog_end = win_width - prog_start - 2; prog_done = prog_end * (crtval_gb / maxval_gb); for (i = prog_start; i < prog_start + prog_done; i++) prog_buf[i] = '#'; if (crtval != maxval) { prog_buf[i] = wip_bar[wip_idx]; wip_idx = (wip_idx + 1) % sizeof(wip_bar); i++; } else { prog_buf[i++] = '#'; } for (; i < win_width - 2; i++) prog_buf[i] = '_'; prog_buf[win_width - 2] = '|'; } prog_buf[win_width - 1] = '\0'; write(STDOUT_FILENO, prog_buf, win_width); return (0); } static void * snapshot_spinner_cb(void *arg) { int rc; size_t crtval, maxval, total; struct spinner_info *si; struct timespec ts; si = arg; if (si == NULL) pthread_exit(NULL); ts.tv_sec = 0; ts.tv_nsec = 50 * 1000 * 1000; /* 50 ms sleep time */ do { crtval = *si->crtval; maxval = si->maxval; total = si->total; rc = print_progress(crtval, total); if (rc < 0) { fprintf(stderr, "Failed to parse progress\n"); break; } nanosleep(&ts, NULL); } while (crtval < maxval); pthread_exit(NULL); return NULL; } static int vm_snapshot_mem_part(const int snapfd, const size_t foff, void *src, const size_t len, const size_t totalmem, const bool op_wr) { int rc; size_t part_done, todo, rem; ssize_t done; bool show_progress; pthread_t spinner_th; struct spinner_info *si; if (lseek(snapfd, foff, SEEK_SET) < 0) { perror("Failed to change file offset"); return (-1); } show_progress = false; if (isatty(STDIN_FILENO) && (winsize.ws_col != 0)) show_progress = true; part_done = foff; rem = len; if (show_progress) { si = &(struct spinner_info) { .crtval = &part_done, .maxval = foff + len, .total = totalmem }; rc = pthread_create(&spinner_th, 0, snapshot_spinner_cb, si); if (rc) { perror("Unable to create spinner thread"); show_progress = false; } } while (rem > 0) { if (show_progress) todo = min(SNAPSHOT_CHUNK, rem); else todo = rem; if (op_wr) done = write(snapfd, src, todo); else done = read(snapfd, src, todo); if (done < 0) { perror("Failed to write in file"); return (-1); } src = (uint8_t *)src + done; part_done += done; rem -= done; } if (show_progress) { rc = pthread_join(spinner_th, NULL); if (rc) perror("Unable to end spinner thread"); } return (0); } static size_t vm_snapshot_mem(struct vmctx *ctx, int snapfd, size_t memsz, const bool op_wr) { int ret; size_t lowmem, highmem, totalmem; char *baseaddr; ret = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem); if (ret) { fprintf(stderr, "%s: unable to retrieve guest memory size\r\n", __func__); return (0); } totalmem = lowmem + highmem; if ((op_wr == false) && (totalmem != memsz)) { fprintf(stderr, "%s: mem size mismatch: %ld vs %ld\r\n", __func__, totalmem, memsz); return (0); } winsize.ws_col = 80; #ifdef TIOCGWINSZ ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize); #endif /* TIOCGWINSZ */ old_winch_handler = signal(SIGWINCH, winch_handler); ret = vm_snapshot_mem_part(snapfd, 0, baseaddr, lowmem, totalmem, op_wr); if (ret) { fprintf(stderr, "%s: Could not %s lowmem\r\n", __func__, op_wr ? "write" : "read"); totalmem = 0; goto done; } if (highmem == 0) goto done; ret = vm_snapshot_mem_part(snapfd, lowmem, baseaddr + 4*GB, highmem, totalmem, op_wr); if (ret) { fprintf(stderr, "%s: Could not %s highmem\r\n", __func__, op_wr ? "write" : "read"); totalmem = 0; goto done; } done: printf("\r\n"); signal(SIGWINCH, old_winch_handler); return (totalmem); } int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate) { size_t restored; restored = vm_snapshot_mem(ctx, rstate->vmmem_fd, rstate->vmmem_len, false); if (restored != rstate->vmmem_len) return (-1); return (0); } static int vm_restore_kern_struct(struct vmctx *ctx, struct restore_state *rstate, const struct vm_snapshot_kern_info *info) { void *struct_ptr; size_t struct_size; int ret; struct vm_snapshot_meta *meta; struct_ptr = lookup_struct(info->req, rstate, &struct_size); if (struct_ptr == NULL) { fprintf(stderr, "%s: Failed to lookup struct %s\r\n", __func__, info->struct_name); ret = -1; goto done; } if (struct_size == 0) { fprintf(stderr, "%s: Kernel struct size was 0 for: %s\r\n", __func__, info->struct_name); ret = -1; goto done; } meta = &(struct vm_snapshot_meta) { .ctx = ctx, .dev_name = info->struct_name, .dev_req = info->req, .buffer.buf_start = struct_ptr, .buffer.buf_size = struct_size, .buffer.buf = struct_ptr, .buffer.buf_rem = struct_size, .op = VM_SNAPSHOT_RESTORE, }; ret = vm_snapshot_req(meta); if (ret != 0) { fprintf(stderr, "%s: Failed to restore struct: %s\r\n", __func__, info->struct_name); goto done; } done: return (ret); } int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate) { size_t i; int ret; for (i = 0; i < nitems(snapshot_kern_structs); i++) { ret = vm_restore_kern_struct(ctx, rstate, &snapshot_kern_structs[i]); if (ret != 0) return (ret); } return (0); } static int vm_restore_user_dev(struct vmctx *ctx, struct restore_state *rstate, const struct vm_snapshot_dev_info *info) { void *dev_ptr; size_t dev_size; int ret; struct vm_snapshot_meta *meta; dev_ptr = lookup_dev(info->dev_name, rstate, &dev_size); if (dev_ptr == NULL) { fprintf(stderr, "Failed to lookup dev: %s\r\n", info->dev_name); fprintf(stderr, "Continuing the restore/migration process\r\n"); return (0); } if (dev_size == 0) { fprintf(stderr, "%s: Device size is 0. " "Assuming %s is not used\r\n", __func__, info->dev_name); return (0); } meta = &(struct vm_snapshot_meta) { .ctx = ctx, .dev_name = info->dev_name, .buffer.buf_start = dev_ptr, .buffer.buf_size = dev_size, .buffer.buf = dev_ptr, .buffer.buf_rem = dev_size, .op = VM_SNAPSHOT_RESTORE, }; ret = (*info->snapshot_cb)(meta); if (ret != 0) { fprintf(stderr, "Failed to restore dev: %s\r\n", info->dev_name); return (-1); } return (0); } int vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate) { size_t i; int ret; for (i = 0; i < nitems(snapshot_devs); i++) { ret = vm_restore_user_dev(ctx, rstate, &snapshot_devs[i]); if (ret != 0) return (ret); } return 0; } int -vm_pause_user_devs(struct vmctx *ctx) +vm_pause_user_devs(void) { const struct vm_snapshot_dev_info *info; size_t i; int ret; for (i = 0; i < nitems(snapshot_devs); i++) { info = &snapshot_devs[i]; if (info->pause_cb == NULL) continue; - ret = info->pause_cb(ctx, info->dev_name); + ret = info->pause_cb(info->dev_name); if (ret != 0) return (ret); } return (0); } int -vm_resume_user_devs(struct vmctx *ctx) +vm_resume_user_devs(void) { const struct vm_snapshot_dev_info *info; size_t i; int ret; for (i = 0; i < nitems(snapshot_devs); i++) { info = &snapshot_devs[i]; if (info->resume_cb == NULL) continue; - ret = info->resume_cb(ctx, info->dev_name); + ret = info->resume_cb(info->dev_name); if (ret != 0) return (ret); } return (0); } static int vm_snapshot_kern_struct(int data_fd, xo_handle_t *xop, const char *array_key, struct vm_snapshot_meta *meta, off_t *offset) { int ret; size_t data_size; ssize_t write_cnt; ret = vm_snapshot_req(meta); if (ret != 0) { fprintf(stderr, "%s: Failed to snapshot struct %s\r\n", __func__, meta->dev_name); ret = -1; goto done; } data_size = vm_get_snapshot_size(meta); /* XXX-MJ no handling for short writes. */ write_cnt = write(data_fd, meta->buffer.buf_start, data_size); if (write_cnt < 0 || (size_t)write_cnt != data_size) { perror("Failed to write all snapshotted data."); ret = -1; goto done; } /* Write metadata. */ xo_open_instance_h(xop, array_key); xo_emit_h(xop, "{:debug_name/%s}\n", meta->dev_name); xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%d}\n", meta->dev_req); xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); xo_close_instance_h(xop, JSON_STRUCT_ARR_KEY); *offset += data_size; done: return (ret); } static int vm_snapshot_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) { int ret, error; size_t buf_size, i, offset; char *buffer; struct vm_snapshot_meta *meta; error = 0; offset = 0; buf_size = SNAPSHOT_BUFFER_SIZE; buffer = malloc(SNAPSHOT_BUFFER_SIZE * sizeof(char)); if (buffer == NULL) { error = ENOMEM; perror("Failed to allocate memory for snapshot buffer"); goto err_vm_snapshot_kern_data; } meta = &(struct vm_snapshot_meta) { .ctx = ctx, .buffer.buf_start = buffer, .buffer.buf_size = buf_size, .op = VM_SNAPSHOT_SAVE, }; xo_open_list_h(xop, JSON_STRUCT_ARR_KEY); for (i = 0; i < nitems(snapshot_kern_structs); i++) { meta->dev_name = snapshot_kern_structs[i].struct_name; meta->dev_req = snapshot_kern_structs[i].req; memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); meta->buffer.buf = meta->buffer.buf_start; meta->buffer.buf_rem = meta->buffer.buf_size; ret = vm_snapshot_kern_struct(data_fd, xop, JSON_DEV_ARR_KEY, meta, &offset); if (ret != 0) { error = -1; goto err_vm_snapshot_kern_data; } } xo_close_list_h(xop, JSON_STRUCT_ARR_KEY); err_vm_snapshot_kern_data: if (buffer != NULL) free(buffer); return (error); } static int vm_snapshot_basic_metadata(struct vmctx *ctx, xo_handle_t *xop, size_t memsz) { xo_open_container_h(xop, JSON_BASIC_METADATA_KEY); xo_emit_h(xop, "{:" JSON_NCPUS_KEY "/%ld}\n", guest_ncpus); xo_emit_h(xop, "{:" JSON_VMNAME_KEY "/%s}\n", vm_get_name(ctx)); xo_emit_h(xop, "{:" JSON_MEMSIZE_KEY "/%lu}\n", memsz); xo_emit_h(xop, "{:" JSON_MEMFLAGS_KEY "/%d}\n", vm_get_memflags(ctx)); xo_close_container_h(xop, JSON_BASIC_METADATA_KEY); return (0); } static int vm_snapshot_dev_write_data(int data_fd, xo_handle_t *xop, const char *array_key, struct vm_snapshot_meta *meta, off_t *offset) { ssize_t ret; size_t data_size; data_size = vm_get_snapshot_size(meta); /* XXX-MJ no handling for short writes. */ ret = write(data_fd, meta->buffer.buf_start, data_size); if (ret < 0 || (size_t)ret != data_size) { perror("Failed to write all snapshotted data."); return (-1); } /* Write metadata. */ xo_open_instance_h(xop, array_key); xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name); xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); xo_close_instance_h(xop, array_key); *offset += data_size; return (0); } static int vm_snapshot_user_dev(const struct vm_snapshot_dev_info *info, int data_fd, xo_handle_t *xop, struct vm_snapshot_meta *meta, off_t *offset) { int ret; ret = (*info->snapshot_cb)(meta); if (ret != 0) { fprintf(stderr, "Failed to snapshot %s; ret=%d\r\n", meta->dev_name, ret); return (ret); } ret = vm_snapshot_dev_write_data(data_fd, xop, JSON_DEV_ARR_KEY, meta, offset); if (ret != 0) return (ret); return (0); } static int vm_snapshot_user_devs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) { int ret; off_t offset; void *buffer; size_t buf_size, i; struct vm_snapshot_meta *meta; buf_size = SNAPSHOT_BUFFER_SIZE; offset = lseek(data_fd, 0, SEEK_CUR); if (offset < 0) { perror("Failed to get data file current offset."); return (-1); } buffer = malloc(buf_size); if (buffer == NULL) { perror("Failed to allocate memory for snapshot buffer"); ret = ENOSPC; goto snapshot_err; } meta = &(struct vm_snapshot_meta) { .ctx = ctx, .buffer.buf_start = buffer, .buffer.buf_size = buf_size, .op = VM_SNAPSHOT_SAVE, }; xo_open_list_h(xop, JSON_DEV_ARR_KEY); /* Restore other devices that support this feature */ for (i = 0; i < nitems(snapshot_devs); i++) { meta->dev_name = snapshot_devs[i].dev_name; memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); meta->buffer.buf = meta->buffer.buf_start; meta->buffer.buf_rem = meta->buffer.buf_size; ret = vm_snapshot_user_dev(&snapshot_devs[i], data_fd, xop, meta, &offset); if (ret != 0) goto snapshot_err; } xo_close_list_h(xop, JSON_DEV_ARR_KEY); snapshot_err: if (buffer != NULL) free(buffer); return (ret); } void checkpoint_cpu_add(int vcpu) { pthread_mutex_lock(&vcpu_lock); CPU_SET(vcpu, &vcpus_active); if (checkpoint_active) { CPU_SET(vcpu, &vcpus_suspended); while (checkpoint_active) pthread_cond_wait(&vcpus_can_run, &vcpu_lock); CPU_CLR(vcpu, &vcpus_suspended); } pthread_mutex_unlock(&vcpu_lock); } /* * When a vCPU is suspended for any reason, it calls * checkpoint_cpu_suspend(). This records that the vCPU is idle. * Before returning from suspension, checkpoint_cpu_resume() is * called. In suspend we note that the vCPU is idle. In resume we * pause the vCPU thread until the checkpoint is complete. The reason * for the two-step process is that vCPUs might already be stopped in * the debug server when a checkpoint is requested. This approach * allows us to account for and handle those vCPUs. */ void checkpoint_cpu_suspend(int vcpu) { pthread_mutex_lock(&vcpu_lock); CPU_SET(vcpu, &vcpus_suspended); if (checkpoint_active && CPU_CMP(&vcpus_active, &vcpus_suspended) == 0) pthread_cond_signal(&vcpus_idle); pthread_mutex_unlock(&vcpu_lock); } void checkpoint_cpu_resume(int vcpu) { pthread_mutex_lock(&vcpu_lock); while (checkpoint_active) pthread_cond_wait(&vcpus_can_run, &vcpu_lock); CPU_CLR(vcpu, &vcpus_suspended); pthread_mutex_unlock(&vcpu_lock); } static void vm_vcpu_pause(struct vmctx *ctx) { pthread_mutex_lock(&vcpu_lock); checkpoint_active = true; vm_suspend_cpu(ctx, -1); while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0) pthread_cond_wait(&vcpus_idle, &vcpu_lock); pthread_mutex_unlock(&vcpu_lock); } static void vm_vcpu_resume(struct vmctx *ctx) { pthread_mutex_lock(&vcpu_lock); checkpoint_active = false; pthread_mutex_unlock(&vcpu_lock); vm_resume_cpu(ctx, -1); pthread_cond_broadcast(&vcpus_can_run); } static int vm_checkpoint(struct vmctx *ctx, const char *checkpoint_file, bool stop_vm) { int fd_checkpoint = 0, kdata_fd = 0; int ret = 0; int error = 0; size_t memsz; xo_handle_t *xop = NULL; char *meta_filename = NULL; char *kdata_filename = NULL; FILE *meta_file = NULL; kdata_filename = strcat_extension(checkpoint_file, ".kern"); if (kdata_filename == NULL) { fprintf(stderr, "Failed to construct kernel data filename.\n"); return (-1); } kdata_fd = open(kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); if (kdata_fd < 0) { perror("Failed to open kernel data snapshot file."); error = -1; goto done; } fd_checkpoint = open(checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700); if (fd_checkpoint < 0) { perror("Failed to create checkpoint file"); error = -1; goto done; } meta_filename = strcat_extension(checkpoint_file, ".meta"); if (meta_filename == NULL) { fprintf(stderr, "Failed to construct vm metadata filename.\n"); goto done; } meta_file = fopen(meta_filename, "w"); if (meta_file == NULL) { perror("Failed to open vm metadata snapshot file."); goto done; } xop = xo_create_to_file(meta_file, XO_STYLE_JSON, XOF_PRETTY); if (xop == NULL) { perror("Failed to get libxo handle on metadata file."); goto done; } vm_vcpu_pause(ctx); - ret = vm_pause_user_devs(ctx); + ret = vm_pause_user_devs(); if (ret != 0) { fprintf(stderr, "Could not pause devices\r\n"); error = ret; goto done; } memsz = vm_snapshot_mem(ctx, fd_checkpoint, 0, true); if (memsz == 0) { perror("Could not write guest memory to file"); error = -1; goto done; } ret = vm_snapshot_basic_metadata(ctx, xop, memsz); if (ret != 0) { fprintf(stderr, "Failed to snapshot vm basic metadata.\n"); error = -1; goto done; } ret = vm_snapshot_kern_structs(ctx, kdata_fd, xop); if (ret != 0) { fprintf(stderr, "Failed to snapshot vm kernel data.\n"); error = -1; goto done; } ret = vm_snapshot_user_devs(ctx, kdata_fd, xop); if (ret != 0) { fprintf(stderr, "Failed to snapshot device state.\n"); error = -1; goto done; } xo_finish_h(xop); if (stop_vm) { vm_destroy(ctx); exit(0); } done: - ret = vm_resume_user_devs(ctx); + ret = vm_resume_user_devs(); if (ret != 0) fprintf(stderr, "Could not resume devices\r\n"); vm_vcpu_resume(ctx); if (fd_checkpoint > 0) close(fd_checkpoint); if (meta_filename != NULL) free(meta_filename); if (kdata_filename != NULL) free(kdata_filename); if (xop != NULL) xo_destroy(xop); if (meta_file != NULL) fclose(meta_file); if (kdata_fd > 0) close(kdata_fd); return (error); } static int handle_message(struct vmctx *ctx, nvlist_t *nvl) { const char *cmd; struct ipc_command **ipc_cmd; if (!nvlist_exists_string(nvl, "cmd")) return (EINVAL); cmd = nvlist_get_string(nvl, "cmd"); IPC_COMMAND_FOREACH(ipc_cmd, ipc_cmd_set) { if (strcmp(cmd, (*ipc_cmd)->name) == 0) return ((*ipc_cmd)->handler(ctx, nvl)); } return (EOPNOTSUPP); } /* * Listen for commands from bhyvectl */ void * checkpoint_thread(void *param) { int fd; struct checkpoint_thread_info *thread_info; nvlist_t *nvl; pthread_set_name_np(pthread_self(), "checkpoint thread"); thread_info = (struct checkpoint_thread_info *)param; while ((fd = accept(thread_info->socket_fd, NULL, NULL)) != -1) { nvl = nvlist_recv(fd, 0); if (nvl != NULL) handle_message(thread_info->ctx, nvl); else EPRINTLN("nvlist_recv() failed: %s", strerror(errno)); close(fd); nvlist_destroy(nvl); } return (NULL); } static int vm_do_checkpoint(struct vmctx *ctx, const nvlist_t *nvl) { int error; if (!nvlist_exists_string(nvl, "filename") || !nvlist_exists_bool(nvl, "suspend")) error = EINVAL; else error = vm_checkpoint(ctx, nvlist_get_string(nvl, "filename"), nvlist_get_bool(nvl, "suspend")); return (error); } IPC_COMMAND(ipc_cmd_set, checkpoint, vm_do_checkpoint); void init_snapshot(void) { int err; err = pthread_mutex_init(&vcpu_lock, NULL); if (err != 0) errc(1, err, "checkpoint mutex init"); err = pthread_cond_init(&vcpus_idle, NULL); if (err != 0) errc(1, err, "checkpoint cv init (vcpus_idle)"); err = pthread_cond_init(&vcpus_can_run, NULL); if (err != 0) errc(1, err, "checkpoint cv init (vcpus_can_run)"); } /* * Create the listening socket for IPC with bhyvectl */ int init_checkpoint_thread(struct vmctx *ctx) { struct checkpoint_thread_info *checkpoint_info = NULL; struct sockaddr_un addr; int socket_fd; pthread_t checkpoint_pthread; int err; memset(&addr, 0, sizeof(addr)); socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); if (socket_fd < 0) { EPRINTLN("Socket creation failed: %s", strerror(errno)); err = -1; goto fail; } addr.sun_family = AF_UNIX; snprintf(addr.sun_path, sizeof(addr.sun_path), "%s%s", BHYVE_RUN_DIR, vm_get_name(ctx)); addr.sun_len = SUN_LEN(&addr); unlink(addr.sun_path); if (bind(socket_fd, (struct sockaddr *)&addr, addr.sun_len) != 0) { EPRINTLN("Failed to bind socket \"%s\": %s\n", addr.sun_path, strerror(errno)); err = -1; goto fail; } if (listen(socket_fd, 10) < 0) { EPRINTLN("ipc socket listen: %s\n", strerror(errno)); err = errno; goto fail; } checkpoint_info = calloc(1, sizeof(*checkpoint_info)); checkpoint_info->ctx = ctx; checkpoint_info->socket_fd = socket_fd; err = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread, checkpoint_info); if (err != 0) goto fail; return (0); fail: free(checkpoint_info); if (socket_fd > 0) close(socket_fd); unlink(addr.sun_path); return (err); } void vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op) { const char *__op; if (op == VM_SNAPSHOT_SAVE) __op = "save"; else if (op == VM_SNAPSHOT_RESTORE) __op = "restore"; else __op = "unknown"; fprintf(stderr, "%s: snapshot-%s failed for %s\r\n", __func__, __op, bufname); } int vm_snapshot_buf(void *data, size_t data_size, struct vm_snapshot_meta *meta) { struct vm_snapshot_buffer *buffer; int op; buffer = &meta->buffer; op = meta->op; if (buffer->buf_rem < data_size) { fprintf(stderr, "%s: buffer too small\r\n", __func__); return (E2BIG); } if (op == VM_SNAPSHOT_SAVE) memcpy(buffer->buf, data, data_size); else if (op == VM_SNAPSHOT_RESTORE) memcpy(data, buffer->buf, data_size); else return (EINVAL); buffer->buf += data_size; buffer->buf_rem -= data_size; return (0); } size_t vm_get_snapshot_size(struct vm_snapshot_meta *meta) { size_t length; struct vm_snapshot_buffer *buffer; buffer = &meta->buffer; if (buffer->buf_size < buffer->buf_rem) { fprintf(stderr, "%s: Invalid buffer: size = %zu, rem = %zu\r\n", __func__, buffer->buf_size, buffer->buf_rem); length = 0; } else { length = buffer->buf_size - buffer->buf_rem; } return (length); } int vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null, struct vm_snapshot_meta *meta) { int ret; vm_paddr_t gaddr; if (meta->op == VM_SNAPSHOT_SAVE) { gaddr = paddr_host2guest(meta->ctx, *addrp); if (gaddr == (vm_paddr_t) -1) { if (!restore_null || (restore_null && (*addrp != NULL))) { ret = EFAULT; goto done; } } SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); } else if (meta->op == VM_SNAPSHOT_RESTORE) { SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); if (gaddr == (vm_paddr_t) -1) { if (!restore_null) { ret = EFAULT; goto done; } } *addrp = paddr_guest2host(meta->ctx, gaddr, len); } else { ret = EINVAL; } done: return (ret); } int vm_snapshot_buf_cmp(void *data, size_t data_size, struct vm_snapshot_meta *meta) { struct vm_snapshot_buffer *buffer; int op; int ret; buffer = &meta->buffer; op = meta->op; if (buffer->buf_rem < data_size) { fprintf(stderr, "%s: buffer too small\r\n", __func__); ret = E2BIG; goto done; } if (op == VM_SNAPSHOT_SAVE) { ret = 0; memcpy(buffer->buf, data, data_size); } else if (op == VM_SNAPSHOT_RESTORE) { ret = memcmp(data, buffer->buf, data_size); } else { ret = EINVAL; goto done; } buffer->buf += data_size; buffer->buf_rem -= data_size; done: return (ret); } diff --git a/usr.sbin/bhyve/snapshot.h b/usr.sbin/bhyve/snapshot.h index 718e48467f56..e59c92f2326f 100644 --- a/usr.sbin/bhyve/snapshot.h +++ b/usr.sbin/bhyve/snapshot.h @@ -1,109 +1,109 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016 Flavius Anton * Copyright (c) 2016 Mihai Tiganus * Copyright (c) 2016-2019 Mihai Carabas * Copyright (c) 2017-2019 Darius Mihai * Copyright (c) 2017-2019 Elena Mihailescu * Copyright (c) 2018-2019 Sergiu Weisz * All rights reserved. * The bhyve-snapshot feature was developed under sponsorships * from Matthew Grooms. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _BHYVE_SNAPSHOT_ #define _BHYVE_SNAPSHOT_ #include #include #include #define BHYVE_RUN_DIR "/var/run/bhyve/" #define MAX_SNAPSHOT_FILENAME PATH_MAX struct vmctx; struct restore_state { int kdata_fd; int vmmem_fd; void *kdata_map; size_t kdata_len; size_t vmmem_len; struct ucl_parser *meta_parser; ucl_object_t *meta_root_obj; }; struct checkpoint_thread_info { struct vmctx *ctx; int socket_fd; }; typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *); -typedef int (*vm_pause_dev_cb) (struct vmctx *, const char *); -typedef int (*vm_resume_dev_cb) (struct vmctx *, const char *); +typedef int (*vm_pause_dev_cb) (const char *); +typedef int (*vm_resume_dev_cb) (const char *); struct vm_snapshot_dev_info { const char *dev_name; /* device name */ vm_snapshot_dev_cb snapshot_cb; /* callback for device snapshot */ vm_pause_dev_cb pause_cb; /* callback for device pause */ vm_resume_dev_cb resume_cb; /* callback for device resume */ }; struct vm_snapshot_kern_info { const char *struct_name; /* kernel structure name*/ enum snapshot_req req; /* request type */ }; void destroy_restore_state(struct restore_state *rstate); const char *lookup_vmname(struct restore_state *rstate); int lookup_memflags(struct restore_state *rstate); size_t lookup_memsize(struct restore_state *rstate); int lookup_guest_ncpus(struct restore_state *rstate); void checkpoint_cpu_add(int vcpu); void checkpoint_cpu_resume(int vcpu); void checkpoint_cpu_suspend(int vcpu); int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate); int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate); int vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate); -int vm_pause_user_devs(struct vmctx *ctx); -int vm_resume_user_devs(struct vmctx *ctx); +int vm_pause_user_devs(void); +int vm_resume_user_devs(void); int get_checkpoint_msg(int conn_fd, struct vmctx *ctx); void *checkpoint_thread(void *param); int init_checkpoint_thread(struct vmctx *ctx); void init_snapshot(void); int load_restore_file(const char *filename, struct restore_state *rstate); #endif diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c index 87d49c2a5e4b..68304f76dc49 100644 --- a/usr.sbin/bhyve/virtio.c +++ b/usr.sbin/bhyve/virtio.c @@ -1,964 +1,962 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chris Torek * All rights reserved. * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "debug.h" #include "pci_emul.h" #include "virtio.h" /* * Functions for dealing with generalized "virtual devices" as * defined by */ /* * In case we decide to relax the "virtio softc comes at the * front of virtio-based device softc" constraint, let's use * this to convert. */ #define DEV_SOFTC(vs) ((void *)(vs)) /* * Link a virtio_softc to its constants, the device softc, and * the PCI emulation. */ void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues) { int i; /* vs and dev_softc addresses must match */ assert((void *)vs == dev_softc); vs->vs_vc = vc; vs->vs_pi = pi; pi->pi_arg = vs; vs->vs_queues = queues; for (i = 0; i < vc->vc_nvq; i++) { queues[i].vq_vs = vs; queues[i].vq_num = i; } } /* * Reset device (device-wide). This erases all queues, i.e., * all the queues become invalid (though we don't wipe out the * internal pointers, we just clear the VQ_ALLOC flag). * * It resets negotiated features to "none". * * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. */ void vi_reset_dev(struct virtio_softc *vs) { struct vqueue_info *vq; int i, nvq; if (vs->vs_mtx) assert(pthread_mutex_isowned_np(vs->vs_mtx)); nvq = vs->vs_vc->vc_nvq; for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { vq->vq_flags = 0; vq->vq_last_avail = 0; vq->vq_next_used = 0; vq->vq_save_used = 0; vq->vq_pfn = 0; vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; } vs->vs_negotiated_caps = 0; vs->vs_curq = 0; /* vs->vs_status = 0; -- redundant */ if (vs->vs_isr) pci_lintr_deassert(vs->vs_pi); vs->vs_isr = 0; vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; } /* * Set I/O BAR (usually 0) to map PCI config registers. */ void vi_set_io_bar(struct virtio_softc *vs, int barnum) { size_t size; /* * ??? should we use VIRTIO_PCI_CONFIG_OFF(0) if MSI-X is disabled? * Existing code did not... */ size = VIRTIO_PCI_CONFIG_OFF(1) + vs->vs_vc->vc_cfgsize; pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); } /* * Initialize MSI-X vector capabilities if we're to use MSI-X, * or MSI capabilities if not. * * We assume we want one MSI-X vector per queue, here, plus one * for the config vec. */ int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) { int nvec; if (use_msix) { vs->vs_flags |= VIRTIO_USE_MSIX; VS_LOCK(vs); vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ VS_UNLOCK(vs); nvec = vs->vs_vc->vc_nvq + 1; if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) return (1); } else vs->vs_flags &= ~VIRTIO_USE_MSIX; /* Only 1 MSI vector for bhyve */ pci_emul_add_msicap(vs->vs_pi, 1); /* Legacy interrupts are mandatory for virtio devices */ pci_lintr_request(vs->vs_pi); return (0); } /* * Initialize the currently-selected virtio queue (vs->vs_curq). * The guest just gave us a page frame number, from which we can * calculate the addresses of the queue. */ static void vi_vq_init(struct virtio_softc *vs, uint32_t pfn) { struct vqueue_info *vq; uint64_t phys; size_t size; char *base; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_pfn = pfn; phys = (uint64_t)pfn << VRING_PFN; size = vring_size_aligned(vq->vq_qsize); base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); /* First page(s) are descriptors... */ vq->vq_desc = (struct vring_desc *)base; base += vq->vq_qsize * sizeof(struct vring_desc); /* ... immediately followed by "avail" ring (entirely uint16_t's) */ vq->vq_avail = (struct vring_avail *)base; base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); /* Then it's rounded up to the next page... */ base = (char *)roundup2((uintptr_t)base, VRING_ALIGN); /* ... and the last page(s) are the used ring. */ vq->vq_used = (struct vring_used *)base; /* Mark queue as allocated, and start at 0 when we use it. */ vq->vq_flags = VQ_ALLOC; vq->vq_last_avail = 0; vq->vq_next_used = 0; vq->vq_save_used = 0; } /* * Helper inline for vq_getchain(): record the i'th "real" * descriptor. */ static inline void _vq_record(int i, struct vring_desc *vd, struct vmctx *ctx, struct iovec *iov, int n_iov, struct vi_req *reqp) { if (i >= n_iov) return; iov[i].iov_base = paddr_guest2host(ctx, vd->addr, vd->len); iov[i].iov_len = vd->len; if ((vd->flags & VRING_DESC_F_WRITE) == 0) reqp->readable++; else reqp->writable++; } #define VQ_MAX_DESCRIPTORS 512 /* see below */ /* * Examine the chain of descriptors starting at the "next one" to * make sure that they describe a sensible request. If so, return * the number of "real" descriptors that would be needed/used in * acting on this request. This may be smaller than the number of * available descriptors, e.g., if there are two available but * they are two separate requests, this just returns 1. Or, it * may be larger: if there are indirect descriptors involved, * there may only be one descriptor available but it may be an * indirect pointing to eight more. We return 8 in this case, * i.e., we do not count the indirect descriptors, only the "real" * ones. * * Basically, this vets the "flags" and "next" field of each * descriptor and tells you how many are involved. Since some may * be indirect, this also needs the vmctx (in the pci_devinst * at vs->vs_pi) so that it can find indirect descriptors. * * As we process each descriptor, we copy and adjust it (guest to * host address wise, also using the vmtctx) into the given iov[] * array (of the given size). If the array overflows, we stop * placing values into the array but keep processing descriptors, * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. * So you, the caller, must not assume that iov[] is as big as the * return value (you can process the same thing twice to allocate * a larger iov array if needed, or supply a zero length to find * out how much space is needed). * * If some descriptor(s) are invalid, this prints a diagnostic message * and returns -1. If no descriptors are ready now it simply returns 0. * * You are assumed to have done a vq_ring_ready() if needed (note * that vq_has_descs() does one). */ int vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov, struct vi_req *reqp) { int i; u_int ndesc, n_indir; u_int idx, next; struct vi_req req; struct vring_desc *vdir, *vindir, *vp; struct vmctx *ctx; struct virtio_softc *vs; const char *name; vs = vq->vq_vs; name = vs->vs_vc->vc_name; memset(&req, 0, sizeof(req)); /* * Note: it's the responsibility of the guest not to * update vq->vq_avail->idx until all of the descriptors * the guest has written are valid (including all their * "next" fields and "flags"). * * Compute (vq_avail->idx - last_avail) in integers mod 2**16. This is * the number of descriptors the device has made available * since the last time we updated vq->vq_last_avail. * * We just need to do the subtraction as an unsigned int, * then trim off excess bits. */ idx = vq->vq_last_avail; ndesc = (uint16_t)((u_int)vq->vq_avail->idx - idx); if (ndesc == 0) return (0); if (ndesc > vq->vq_qsize) { /* XXX need better way to diagnose issues */ EPRINTLN( "%s: ndesc (%u) out of range, driver confused?", name, (u_int)ndesc); return (-1); } /* * Now count/parse "involved" descriptors starting from * the head of the chain. * * To prevent loops, we could be more complicated and * check whether we're re-visiting a previously visited * index, but we just abort if the count gets excessive. */ ctx = vs->vs_pi->pi_vmctx; req.idx = next = vq->vq_avail->ring[idx & (vq->vq_qsize - 1)]; vq->vq_last_avail++; for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->next) { if (next >= vq->vq_qsize) { EPRINTLN( "%s: descriptor index %u out of range, " "driver confused?", name, next); return (-1); } vdir = &vq->vq_desc[next]; if ((vdir->flags & VRING_DESC_F_INDIRECT) == 0) { _vq_record(i, vdir, ctx, iov, niov, &req); i++; } else if ((vs->vs_vc->vc_hv_caps & VIRTIO_RING_F_INDIRECT_DESC) == 0) { EPRINTLN( "%s: descriptor has forbidden INDIRECT flag, " "driver confused?", name); return (-1); } else { n_indir = vdir->len / 16; if ((vdir->len & 0xf) || n_indir == 0) { EPRINTLN( "%s: invalid indir len 0x%x, " "driver confused?", name, (u_int)vdir->len); return (-1); } vindir = paddr_guest2host(ctx, vdir->addr, vdir->len); /* * Indirects start at the 0th, then follow * their own embedded "next"s until those run * out. Each one's indirect flag must be off * (we don't really have to check, could just * ignore errors...). */ next = 0; for (;;) { vp = &vindir[next]; if (vp->flags & VRING_DESC_F_INDIRECT) { EPRINTLN( "%s: indirect desc has INDIR flag," " driver confused?", name); return (-1); } _vq_record(i, vp, ctx, iov, niov, &req); if (++i > VQ_MAX_DESCRIPTORS) goto loopy; if ((vp->flags & VRING_DESC_F_NEXT) == 0) break; next = vp->next; if (next >= n_indir) { EPRINTLN( "%s: invalid next %u > %u, " "driver confused?", name, (u_int)next, n_indir); return (-1); } } } if ((vdir->flags & VRING_DESC_F_NEXT) == 0) goto done; } loopy: EPRINTLN( "%s: descriptor loop? count > %d - driver confused?", name, i); return (-1); done: *reqp = req; return (i); } /* * Return the first n_chain request chains back to the available queue. * * (These chains are the ones you handled when you called vq_getchain() * and used its positive return value.) */ void vq_retchains(struct vqueue_info *vq, uint16_t n_chains) { vq->vq_last_avail -= n_chains; } void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { struct vring_used *vuh; struct vring_used_elem *vue; uint16_t mask; /* * Notes: * - mask is N-1 where N is a power of 2 so computes x % N * - vuh points to the "used" data shared with guest * - vue points to the "used" ring entry we want to update */ mask = vq->vq_qsize - 1; vuh = vq->vq_used; vue = &vuh->ring[vq->vq_next_used++ & mask]; vue->id = idx; vue->len = iolen; } void vq_relchain_publish(struct vqueue_info *vq) { /* * Ensure the used descriptor is visible before updating the index. * This is necessary on ISAs with memory ordering less strict than x86 * (and even on x86 to act as a compiler barrier). */ atomic_thread_fence_rel(); vq->vq_used->idx = vq->vq_next_used; } /* * Return specified request chain to the guest, setting its I/O length * to the provided value. * * (This chain is the one you handled when you called vq_getchain() * and used its positive return value.) */ void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) { vq_relchain_prepare(vq, idx, iolen); vq_relchain_publish(vq); } /* * Driver has finished processing "available" chains and calling * vq_relchain on each one. If driver used all the available * chains, used_all should be set. * * If the "used" index moved we may need to inform the guest, i.e., * deliver an interrupt. Even if the used index did NOT move we * may need to deliver an interrupt, if the avail ring is empty and * we are supposed to interrupt on empty. * * Note that used_all_avail is provided by the caller because it's * a snapshot of the ring state when he decided to finish interrupt * processing -- it's possible that descriptors became available after * that point. (It's also typically a constant 1/True as well.) */ void vq_endchains(struct vqueue_info *vq, int used_all_avail) { struct virtio_softc *vs; uint16_t event_idx, new_idx, old_idx; int intr; /* * Interrupt generation: if we're using EVENT_IDX, * interrupt if we've crossed the event threshold. * Otherwise interrupt is generated if we added "used" entries, * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. * * In any case, though, if NOTIFY_ON_EMPTY is set and the * entire avail was processed, we need to interrupt always. */ vs = vq->vq_vs; old_idx = vq->vq_save_used; vq->vq_save_used = new_idx = vq->vq_used->idx; /* * Use full memory barrier between "idx" store from preceding * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or * "flags" field below. */ atomic_thread_fence_seq_cst(); if (used_all_avail && (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) intr = 1; else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) { event_idx = VQ_USED_EVENT_IDX(vq); /* * This calculation is per docs and the kernel * (see src/sys/dev/virtio/virtio_ring.h). */ intr = (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old_idx); } else { intr = new_idx != old_idx && !(vq->vq_avail->flags & VRING_AVAIL_F_NO_INTERRUPT); } if (intr) vq_interrupt(vs, vq); } /* Note: these are in sorted order to make for a fast search */ static struct config_reg { uint16_t cr_offset; /* register offset */ uint8_t cr_size; /* size (bytes) */ uint8_t cr_ro; /* true => reg is read only */ const char *cr_name; /* name of reg */ } config_regs[] = { { VIRTIO_PCI_HOST_FEATURES, 4, 1, "HOST_FEATURES" }, { VIRTIO_PCI_GUEST_FEATURES, 4, 0, "GUEST_FEATURES" }, { VIRTIO_PCI_QUEUE_PFN, 4, 0, "QUEUE_PFN" }, { VIRTIO_PCI_QUEUE_NUM, 2, 1, "QUEUE_NUM" }, { VIRTIO_PCI_QUEUE_SEL, 2, 0, "QUEUE_SEL" }, { VIRTIO_PCI_QUEUE_NOTIFY, 2, 0, "QUEUE_NOTIFY" }, { VIRTIO_PCI_STATUS, 1, 0, "STATUS" }, { VIRTIO_PCI_ISR, 1, 0, "ISR" }, { VIRTIO_MSI_CONFIG_VECTOR, 2, 0, "CONFIG_VECTOR" }, { VIRTIO_MSI_QUEUE_VECTOR, 2, 0, "QUEUE_VECTOR" }, }; static inline struct config_reg * vi_find_cr(int offset) { u_int hi, lo, mid; struct config_reg *cr; lo = 0; hi = sizeof(config_regs) / sizeof(*config_regs) - 1; while (hi >= lo) { mid = (hi + lo) >> 1; cr = &config_regs[mid]; if (cr->cr_offset == offset) return (cr); if (cr->cr_offset < offset) lo = mid + 1; else hi = mid - 1; } return (NULL); } /* * Handle pci config space reads. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. */ uint64_t -vi_pci_read(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size) +vi_pci_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct virtio_softc *vs = pi->pi_arg; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; uint32_t value; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { return (pci_emul_msix_tread(pi, offset, size)); } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; if (size != 1 && size != 2 && size != 4) goto bad; virtio_config_size = VIRTIO_PCI_CONFIG_OFF(pci_msix_enabled(pi)); if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. * If that fails, fall into general code. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; if (vc->vc_cfgread != NULL) error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value); else error = 0; if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size) { if (cr != NULL) { /* offset must be OK, so size must be bad */ EPRINTLN( "%s: read from %s: bad size %d", name, cr->cr_name, size); } else { EPRINTLN( "%s: read from bad offset/size %jd/%d", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VIRTIO_PCI_HOST_FEATURES: value = vc->vc_hv_caps; break; case VIRTIO_PCI_GUEST_FEATURES: value = vs->vs_negotiated_caps; break; case VIRTIO_PCI_QUEUE_PFN: if (vs->vs_curq < vc->vc_nvq) value = vs->vs_queues[vs->vs_curq].vq_pfn; break; case VIRTIO_PCI_QUEUE_NUM: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_qsize : 0; break; case VIRTIO_PCI_QUEUE_SEL: value = vs->vs_curq; break; case VIRTIO_PCI_QUEUE_NOTIFY: value = 0; /* XXX */ break; case VIRTIO_PCI_STATUS: value = vs->vs_status; break; case VIRTIO_PCI_ISR: value = vs->vs_isr; vs->vs_isr = 0; /* a read clears this flag */ if (value) pci_lintr_deassert(pi); break; case VIRTIO_MSI_CONFIG_VECTOR: value = vs->vs_msix_cfg_idx; break; case VIRTIO_MSI_QUEUE_VECTOR: value = vs->vs_curq < vc->vc_nvq ? vs->vs_queues[vs->vs_curq].vq_msix_idx : VIRTIO_MSI_NO_VECTOR; break; } done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); return (value); } /* * Handle pci config space writes. * If it's to the MSI-X info, do that. * If it's part of the virtio standard stuff, do that. * Otherwise dispatch to the actual driver. */ void -vi_pci_write(struct vmctx *ctx __unused, - struct pci_devinst *pi, int baridx, uint64_t offset, int size, +vi_pci_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { struct virtio_softc *vs = pi->pi_arg; struct vqueue_info *vq; struct virtio_consts *vc; struct config_reg *cr; uint64_t virtio_config_size, max; const char *name; uint32_t newoff; int error; if (vs->vs_flags & VIRTIO_USE_MSIX) { if (baridx == pci_msix_table_bar(pi) || baridx == pci_msix_pba_bar(pi)) { pci_emul_msix_twrite(pi, offset, size, value); return; } } /* XXX probably should do something better than just assert() */ assert(baridx == 0); if (vs->vs_mtx) pthread_mutex_lock(vs->vs_mtx); vc = vs->vs_vc; name = vc->vc_name; if (size != 1 && size != 2 && size != 4) goto bad; virtio_config_size = VIRTIO_PCI_CONFIG_OFF(pci_msix_enabled(pi)); if (offset >= virtio_config_size) { /* * Subtract off the standard size (including MSI-X * registers if enabled) and dispatch to underlying driver. */ newoff = offset - virtio_config_size; max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000; if (newoff + size > max) goto bad; if (vc->vc_cfgwrite != NULL) error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); else error = 0; if (!error) goto done; } bad: cr = vi_find_cr(offset); if (cr == NULL || cr->cr_size != size || cr->cr_ro) { if (cr != NULL) { /* offset must be OK, wrong size and/or reg is R/O */ if (cr->cr_size != size) EPRINTLN( "%s: write to %s: bad size %d", name, cr->cr_name, size); if (cr->cr_ro) EPRINTLN( "%s: write to read-only reg %s", name, cr->cr_name); } else { EPRINTLN( "%s: write to bad offset/size %jd/%d", name, (uintmax_t)offset, size); } goto done; } switch (offset) { case VIRTIO_PCI_GUEST_FEATURES: vs->vs_negotiated_caps = value & vc->vc_hv_caps; if (vc->vc_apply_features) (*vc->vc_apply_features)(DEV_SOFTC(vs), vs->vs_negotiated_caps); break; case VIRTIO_PCI_QUEUE_PFN: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vi_vq_init(vs, value); break; case VIRTIO_PCI_QUEUE_SEL: /* * Note that the guest is allowed to select an * invalid queue; we just need to return a QNUM * of 0 while the bad queue is selected. */ vs->vs_curq = value; break; case VIRTIO_PCI_QUEUE_NOTIFY: if (value >= (unsigned int)vc->vc_nvq) { EPRINTLN("%s: queue %d notify out of range", name, (int)value); goto done; } vq = &vs->vs_queues[value]; if (vq->vq_notify) (*vq->vq_notify)(DEV_SOFTC(vs), vq); else if (vc->vc_qnotify) (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); else EPRINTLN( "%s: qnotify queue %d: missing vq/vc notify", name, (int)value); break; case VIRTIO_PCI_STATUS: vs->vs_status = value; if (value == 0) (*vc->vc_reset)(DEV_SOFTC(vs)); break; case VIRTIO_MSI_CONFIG_VECTOR: vs->vs_msix_cfg_idx = value; break; case VIRTIO_MSI_QUEUE_VECTOR: if (vs->vs_curq >= vc->vc_nvq) goto bad_qindex; vq = &vs->vs_queues[vs->vs_curq]; vq->vq_msix_idx = value; break; } goto done; bad_qindex: EPRINTLN( "%s: write config reg %s: curq %d >= max %d", name, cr->cr_name, vs->vs_curq, vc->vc_nvq); done: if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); } #ifdef BHYVE_SNAPSHOT int -vi_pci_pause(struct vmctx *ctx __unused, struct pci_devinst *pi) +vi_pci_pause(struct pci_devinst *pi) { struct virtio_softc *vs; struct virtio_consts *vc; vs = pi->pi_arg; vc = vs->vs_vc; vc = vs->vs_vc; assert(vc->vc_pause != NULL); (*vc->vc_pause)(DEV_SOFTC(vs)); return (0); } int -vi_pci_resume(struct vmctx *ctx __unused, struct pci_devinst *pi) +vi_pci_resume(struct pci_devinst *pi) { struct virtio_softc *vs; struct virtio_consts *vc; vs = pi->pi_arg; vc = vs->vs_vc; vc = vs->vs_vc; assert(vc->vc_resume != NULL); (*vc->vc_resume)(DEV_SOFTC(vs)); return (0); } static int vi_pci_snapshot_softc(struct virtio_softc *vs, struct vm_snapshot_meta *meta) { int ret; SNAPSHOT_VAR_OR_LEAVE(vs->vs_flags, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_negotiated_caps, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_curq, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_status, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_isr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vs->vs_msix_cfg_idx, meta, ret, done); done: return (ret); } static int vi_pci_snapshot_consts(struct virtio_consts *vc, struct vm_snapshot_meta *meta) { int ret; SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_nvq, meta, ret, done); SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_cfgsize, meta, ret, done); SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_hv_caps, meta, ret, done); done: return (ret); } static int vi_pci_snapshot_queues(struct virtio_softc *vs, struct vm_snapshot_meta *meta) { int i; int ret; struct virtio_consts *vc; struct vqueue_info *vq; uint64_t addr_size; vc = vs->vs_vc; /* Save virtio queue info */ for (i = 0; i < vc->vc_nvq; i++) { vq = &vs->vs_queues[i]; SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_qsize, meta, ret, done); SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_num, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_flags, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_last_avail, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_next_used, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_save_used, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_msix_idx, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(vq->vq_pfn, meta, ret, done); if (!vq_ring_ready(vq)) continue; addr_size = vq->vq_qsize * sizeof(struct vring_desc); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_desc, addr_size, false, meta, ret, done); addr_size = (2 + vq->vq_qsize + 1) * sizeof(uint16_t); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_avail, addr_size, false, meta, ret, done); addr_size = (2 + 2 * vq->vq_qsize + 1) * sizeof(uint16_t); SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_used, addr_size, false, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(vq->vq_desc, vring_size_aligned(vq->vq_qsize), meta, ret, done); } done: return (ret); } int vi_pci_snapshot(struct vm_snapshot_meta *meta) { int ret; struct pci_devinst *pi; struct virtio_softc *vs; struct virtio_consts *vc; pi = meta->dev_data; vs = pi->pi_arg; vc = vs->vs_vc; /* Save virtio softc */ ret = vi_pci_snapshot_softc(vs, meta); if (ret != 0) goto done; /* Save virtio consts */ ret = vi_pci_snapshot_consts(vc, meta); if (ret != 0) goto done; /* Save virtio queue info */ ret = vi_pci_snapshot_queues(vs, meta); if (ret != 0) goto done; /* Save device softc, if needed */ if (vc->vc_snapshot != NULL) { ret = (*vc->vc_snapshot)(DEV_SOFTC(vs), meta); if (ret != 0) goto done; } done: return (ret); } #endif diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h index b6ef2bc8d9de..7fc3e7c3fc84 100644 --- a/usr.sbin/bhyve/virtio.h +++ b/usr.sbin/bhyve/virtio.h @@ -1,438 +1,437 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 Chris Torek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _BHYVE_VIRTIO_H_ #define _BHYVE_VIRTIO_H_ #include #include #include #include /* * These are derived from several virtio specifications. * * Some useful links: * https://github.com/rustyrussell/virtio-spec * http://people.redhat.com/pbonzini/virtio-spec.pdf */ /* * A virtual device has zero or more "virtual queues" (virtqueue). * Each virtqueue uses at least two 4096-byte pages, laid out thus: * * +-----------------------------------------------+ * | "desc": descriptors, 16 bytes each | * | ----------------------------------------- | * | "avail": 2 uint16; uint16; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * | "used": 2 x uint16; elems; 1 uint16 | * | ----------------------------------------- | * | pad to 4k boundary | * +-----------------------------------------------+ * * The number that appears here is always a power of two and is * limited to no more than 32768 (as it must fit in a 16-bit field). * If is sufficiently large, the above will occupy more than * two pages. In any case, all pages must be physically contiguous * within the guest's physical address space. * * The 16-byte "desc" descriptors consist of a 64-bit guest * physical address , a 32-bit length , a 16-bit * , and a 16-bit field (all in guest byte order). * * There are three flags that may be set : * NEXT descriptor is chained, so use its "next" field * WRITE descriptor is for host to write into guest RAM * (else host is to read from guest RAM) * INDIRECT descriptor address field is (guest physical) * address of a linear array of descriptors * * Unless INDIRECT is set, is the number of bytes that may * be read/written from guest physical address . If * INDIRECT is set, WRITE is ignored and provides the length * of the indirect descriptors (and must be a multiple of * 16). Note that NEXT may still be set in the main descriptor * pointing to the indirect, and should be set in each indirect * descriptor that uses the next descriptor (these should generally * be numbered sequentially). However, INDIRECT must not be set * in the indirect descriptors. Upon reaching an indirect descriptor * without a NEXT bit, control returns to the direct descriptors. * * Except inside an indirect, each value must be in the * range [0 .. N) (i.e., the half-open interval). (Inside an * indirect, each must be in the range [0 .. /16).) * * The "avail" data structures reside in the same pages as the * "desc" structures since both together are used by the device to * pass information to the hypervisor's virtual driver. These * begin with a 16-bit field and 16-bit index , then * have 16-bit values, followed by one final 16-bit * field . The entries are simply indices * indices into the descriptor ring (and thus must meet the same * constraints as each value). However, is counted * up from 0 (initially) and simply wraps around after 65535; it * is taken mod to find the next available entry. * * The "used" ring occupies a separate page or pages, and contains * values written from the virtual driver back to the guest OS. * This begins with a 16-bit and 16-bit , then there * are "vring_used" elements, followed by a 16-bit . * The "vring_used" elements consist of a 32-bit and a * 32-bit (vu_tlen below). The is simply the index of * the head of a descriptor chain the guest made available * earlier, and the is the number of bytes actually written, * e.g., in the case of a network driver that provided a large * receive buffer but received only a small amount of data. * * The two event fields, and , in the * avail and used rings (respectively -- note the reversal!), are * always provided, but are used only if the virtual device * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature * negotiation. Similarly, both rings provide a flag -- * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in * their field, indicating that the guest does not need an * interrupt, or that the hypervisor driver does not need a * notify, when descriptors are added to the corresponding ring. * (These are provided only for interrupt optimization and need * not be implemented.) */ #define VRING_ALIGN 4096 /* * The address of any given virtual queue is determined by a single * Page Frame Number register. The guest writes the PFN into the * PCI config space. However, a device that has two or more * virtqueues can have a different PFN, and size, for each queue. * The number of queues is determinable via the PCI config space * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means * queue #0, 1 means queue#1, etc. Once a queue is selected, the * remaining PFN and QNUM registers refer to that queue. * * QNUM is a read-only register containing a nonzero power of two * that indicates the (hypervisor's) queue size. Or, if reading it * produces zero, the hypervisor does not have a corresponding * queue. (The number of possible queues depends on the virtual * device. The block device has just one; the network device * provides either two -- 0 = receive, 1 = transmit -- or three, * with 2 = control.) * * PFN is a read/write register giving the physical page address of * the virtqueue in guest memory (the guest must allocate enough space * based on the hypervisor's provided QNUM). * * QNOTIFY is effectively write-only: when the guest writes a queue * number to the register, the hypervisor should scan the specified * virtqueue. (Reading QNOTIFY currently always gets 0). */ /* * PFN register shift amount */ #define VRING_PFN 12 /* * PCI vendor/device IDs */ #define VIRTIO_VENDOR 0x1AF4 #define VIRTIO_DEV_NET 0x1000 #define VIRTIO_DEV_BLOCK 0x1001 #define VIRTIO_DEV_CONSOLE 0x1003 #define VIRTIO_DEV_SCSI 0x1004 #define VIRTIO_DEV_RANDOM 0x1005 #define VIRTIO_DEV_9P 0x1009 #define VIRTIO_DEV_INPUT 0x1052 /* * PCI revision IDs */ #define VIRTIO_REV_INPUT 1 /* * PCI subvendor IDs */ #define VIRTIO_SUBVEN_INPUT 0x108E /* * PCI subdevice IDs */ #define VIRTIO_SUBDEV_INPUT 0x1100 /* From section 2.3, "Virtqueue Configuration", of the virtio specification */ static inline int vring_size_aligned(u_int qsz) { return (roundup2(vring_size(qsz, VRING_ALIGN), VRING_ALIGN)); } -struct vmctx; struct pci_devinst; struct vqueue_info; struct vm_snapshot_meta; /* * A virtual device, with some number (possibly 0) of virtual * queues and some size (possibly 0) of configuration-space * registers private to the device. The virtio_softc should come * at the front of each "derived class", so that a pointer to the * virtio_softc is also a pointer to the more specific, derived- * from-virtio driver's softc. * * Note: inside each hypervisor virtio driver, changes to these * data structures must be locked against other threads, if any. * Except for PCI config space register read/write, we assume each * driver does the required locking, but we need a pointer to the * lock (if there is one) for PCI config space read/write ops. * * When the guest reads or writes the device's config space, the * generic layer checks for operations on the special registers * described above. If the offset of the register(s) being read * or written is past the CFG area (CFG0 or CFG1), the request is * passed on to the virtual device, after subtracting off the * generic-layer size. (So, drivers can just use the offset as * an offset into "struct config", for instance.) * * (The virtio layer also makes sure that the read or write is to/ * from a "good" config offset, hence vc_cfgsize, and on BAR #0. * However, the driver must verify the read or write size and offset * and that no one is writing a readonly register.) * * The BROKED flag ("this thing done gone and broked") is for future * use. */ #define VIRTIO_USE_MSIX 0x01 #define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ #define VIRTIO_BROKED 0x08 /* ??? */ struct virtio_softc { struct virtio_consts *vs_vc; /* constants (see below) */ int vs_flags; /* VIRTIO_* flags from above */ pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ struct pci_devinst *vs_pi; /* PCI device instance */ uint32_t vs_negotiated_caps; /* negotiated capabilities */ struct vqueue_info *vs_queues; /* one per vc_nvq */ int vs_curq; /* current queue */ uint8_t vs_status; /* value from last status write */ uint8_t vs_isr; /* ISR flags, if not MSI-X */ uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */ }; #define VS_LOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_lock(vs->vs_mtx); \ } while (0) #define VS_UNLOCK(vs) \ do { \ if (vs->vs_mtx) \ pthread_mutex_unlock(vs->vs_mtx); \ } while (0) struct virtio_consts { const char *vc_name; /* name of driver (for diagnostics) */ int vc_nvq; /* number of virtual queues */ size_t vc_cfgsize; /* size of dev-specific config regs */ void (*vc_reset)(void *); /* called on virtual device reset */ void (*vc_qnotify)(void *, struct vqueue_info *); /* called on QNOTIFY if no VQ notify */ int (*vc_cfgread)(void *, int, int, uint32_t *); /* called to read config regs */ int (*vc_cfgwrite)(void *, int, int, uint32_t); /* called to write config regs */ void (*vc_apply_features)(void *, uint64_t); /* called to apply negotiated features */ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ void (*vc_pause)(void *); /* called to pause device activity */ void (*vc_resume)(void *); /* called to resume device activity */ int (*vc_snapshot)(void *, struct vm_snapshot_meta *); /* called to save / restore device state */ }; /* * Data structure allocated (statically) per virtual queue. * * Drivers may change vq_qsize after a reset. When the guest OS * requests a device reset, the hypervisor first calls * vs->vs_vc->vc_reset(); then the data structure below is * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). * * The remaining fields should only be fussed-with by the generic * code. * * Note: the addresses of vq_desc, vq_avail, and vq_used are all * computable from each other, but it's a lot simpler if we just * keep a pointer to each one. The event indices are similarly * (but more easily) computable, and this time we'll compute them: * they're just XX_ring[N]. */ #define VQ_ALLOC 0x01 /* set once we have a pfn */ #define VQ_BROKED 0x02 /* ??? */ struct vqueue_info { uint16_t vq_qsize; /* size of this queue (a power of 2) */ void (*vq_notify)(void *, struct vqueue_info *); /* called instead of vc_notify, if not NULL */ struct virtio_softc *vq_vs; /* backpointer to softc */ uint16_t vq_num; /* we're the num'th queue in the softc */ uint16_t vq_flags; /* flags (see above) */ uint16_t vq_last_avail; /* a recent value of vq_avail->idx */ uint16_t vq_next_used; /* index of the next used slot to be filled */ uint16_t vq_save_used; /* saved vq_used->idx; see vq_endchains */ uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */ uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */ struct vring_desc *vq_desc; /* descriptor array */ struct vring_avail *vq_avail; /* the "avail" ring */ struct vring_used *vq_used; /* the "used" ring */ }; /* as noted above, these are sort of backwards, name-wise */ #define VQ_AVAIL_EVENT_IDX(vq) \ (*(uint16_t *)&(vq)->vq_used->ring[(vq)->vq_qsize]) #define VQ_USED_EVENT_IDX(vq) \ ((vq)->vq_avail->ring[(vq)->vq_qsize]) /* * Is this ring ready for I/O? */ static inline int vq_ring_ready(struct vqueue_info *vq) { return (vq->vq_flags & VQ_ALLOC); } /* * Are there "available" descriptors? (This does not count * how many, just returns True if there are some.) */ static inline int vq_has_descs(struct vqueue_info *vq) { return (vq_ring_ready(vq) && vq->vq_last_avail != vq->vq_avail->idx); } /* * Deliver an interrupt to the guest for a specific MSI-X queue or * event. */ static inline void vi_interrupt(struct virtio_softc *vs, uint8_t isr, uint16_t msix_idx) { if (pci_msix_enabled(vs->vs_pi)) pci_generate_msix(vs->vs_pi, msix_idx); else { VS_LOCK(vs); vs->vs_isr |= isr; pci_generate_msi(vs->vs_pi, 0); pci_lintr_assert(vs->vs_pi); VS_UNLOCK(vs); } } /* * Deliver an interrupt to the guest on the given virtual queue (if * possible, or a generic MSI interrupt if not using MSI-X). */ static inline void vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) { vi_interrupt(vs, VIRTIO_PCI_ISR_INTR, vq->vq_msix_idx); } static inline void vq_kick_enable(struct vqueue_info *vq) { vq->vq_used->flags &= ~VRING_USED_F_NO_NOTIFY; /* * Full memory barrier to make sure the store to vq_used->flags * happens before the load from vq_avail->idx, which results from a * subsequent call to vq_has_descs(). */ atomic_thread_fence_seq_cst(); } static inline void vq_kick_disable(struct vqueue_info *vq) { vq->vq_used->flags |= VRING_USED_F_NO_NOTIFY; } struct iovec; /* * Request description returned by vq_getchain. * * Writable iovecs start at iov[req.readable]. */ struct vi_req { int readable; /* num of readable iovecs */ int writable; /* num of writable iovecs */ unsigned int idx; /* ring index */ }; void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, void *dev_softc, struct pci_devinst *pi, struct vqueue_info *queues); int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); void vi_reset_dev(struct virtio_softc *); void vi_set_io_bar(struct virtio_softc *, int); int vq_getchain(struct vqueue_info *vq, struct iovec *iov, int niov, struct vi_req *reqp); void vq_retchains(struct vqueue_info *vq, uint16_t n_chains); void vq_relchain_prepare(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_relchain_publish(struct vqueue_info *vq); void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_endchains(struct vqueue_info *vq, int used_all_avail); -uint64_t vi_pci_read(struct vmctx *ctx, struct pci_devinst *pi, - int baridx, uint64_t offset, int size); -void vi_pci_write(struct vmctx *ctx, struct pci_devinst *pi, - int baridx, uint64_t offset, int size, uint64_t value); +uint64_t vi_pci_read(struct pci_devinst *pi, int baridx, uint64_t offset, + int size); +void vi_pci_write(struct pci_devinst *pi, int baridx, uint64_t offset, + int size, uint64_t value); #ifdef BHYVE_SNAPSHOT int vi_pci_snapshot(struct vm_snapshot_meta *meta); -int vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi); -int vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi); +int vi_pci_pause(struct pci_devinst *pi); +int vi_pci_resume(struct pci_devinst *pi); #endif #endif /* _BHYVE_VIRTIO_H_ */