diff --git a/lib/Makefile b/lib/Makefile
index fdfe198bea10..af0079978075 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -1,234 +1,235 @@
 .include <src.opts.mk>
 
 # The SUBDIR_BOOTSTRAP list is a small set of libraries which are used by many
 # of the other libraries.  These are built first with a .WAIT between them
 # and the main list to avoid needing a SUBDIR_DEPEND line on every library
 # naming just these few items.
 SUBDIR_BOOTSTRAP= \
 	csu \
 	.WAIT \
 	libc \
 	libc_nonshared \
 	libcompiler_rt \
 	${_libclang_rt} \
 	libc++ \
 	libc++experimental \
 	libcxxrt \
 	libdiff \
 	libelf \
 	libssp \
 	libssp_nonshared \
 	libsys \
 	msun
 
 # The main list; please keep these sorted alphabetically.
 # The only exception is sqlite3: we place it at the start of the list since it
 # takes a long time to build and starting it first improves parallelism.
 SUBDIR=	${SUBDIR_BOOTSTRAP} \
 	.WAIT \
 	libsqlite3 \
 	geom \
 	lib9p \
 	libalias \
 	libarchive \
 	libauditd \
 	libbegemot \
 	libblocksruntime \
 	libbsddialog \
 	libbsdstat \
 	libbsm \
 	libbz2 \
 	libcalendar \
 	libcam \
 	libcapsicum \
 	libcasper \
 	libcompat \
 	libcrypt \
 	libdevctl \
 	libdevdctl \
 	libdevinfo \
 	libdevstat \
 	libdl \
 	libdwarf \
 	libedit \
 	libelftc \
 	libevent1 \
 	libexecinfo \
 	libexpat \
 	libfetch \
 	libgcc_eh \
 	libgcc_s \
 	libgeom \
 	libifconfig \
 	libipsec \
 	libiscsiutil \
 	libjail \
 	libkiconv \
 	libkldelf \
 	libkvm \
 	liblua \
 	liblzma \
 	libmemstat \
 	libmd \
 	libmixer \
 	libmt \
 	lib80211 \
 	libnetbsd \
 	libnetmap \
 	libnv \
 	libnvmf \
 	libopenbsd \
 	libpam \
 	libpathconv \
 	libpcap \
 	libpjdlog \
 	libproc \
 	libprocstat \
 	libregex \
 	librpcsvc \
 	librss \
 	librt \
 	librtld_db \
 	libsbuf \
 	libsmb \
 	libstdbuf \
 	libstdthreads \
 	libsysdecode \
 	libtacplus \
 	libthr \
 	libthread_db \
 	libucl \
 	libufs \
 	libugidfw \
 	libulog \
 	libutil \
 	${_libvgl} \
 	libwrap \
 	libxo \
 	liby \
 	libz \
 	libzstd \
 	ncurses \
 	nss_tacplus
 
 # Inter-library dependencies.  When the makefile for a library contains LDADD
 # libraries, those libraries should be listed as build order dependencies here.
 SUBDIR_DEPEND_geom=	libufs
 SUBDIR_DEPEND_googletest= libregex
 SUBDIR_DEPEND_libarchive= libz libbz2 libexpat liblzma libmd libzstd
 SUBDIR_DEPEND_libauditd= libbsm
 SUBDIR_DEPEND_libbsddialog= ncurses
 SUBDIR_DEPEND_libbsnmp= ${_libnetgraph}
 SUBDIR_DEPEND_libc++:= libcxxrt
 # libssp_nonshared doesn't need to be linked into libc on every arch, but it is
 # small enough to build that this bit of serialization is likely insignificant.
 SUBDIR_DEPEND_libc=	libsys libcompiler_rt libssp_nonshared
 SUBDIR_DEPEND_libcam=	libsbuf
 SUBDIR_DEPEND_libcasper= libnv
 SUBDIR_DEPEND_libcrypt= libmd
 SUBDIR_DEPEND_libdevstat= libkvm
 SUBDIR_DEPEND_libdpv=	libfigpar ncurses libutil
 SUBDIR_DEPEND_libedit=	ncurses
 SUBDIR_DEPEND_libgeom=	libexpat libsbuf
 SUBDIR_DEPEND_librpcsec_gss= libgssapi
 SUBDIR_DEPEND_libmagic= libz
 SUBDIR_DEPEND_libmemstat= libkvm
 SUBDIR_DEPEND_libpam=	libcrypt ${_libradius} librpcsvc libtacplus libutil ${_libypclnt} ${_libcom_err}
 SUBDIR_DEPEND_libpjdlog= libutil
 SUBDIR_DEPEND_libprocstat= libkvm libutil
 SUBDIR_DEPEND_libradius= libmd
 SUBDIR_DEPEND_libsmb=	libkiconv
 # See comment above about libssp_nonshared
 SUBDIR_DEPEND_libsys=	libcompiler_rt libssp_nonshared
 SUBDIR_DEPEND_libtacplus= libmd
 SUBDIR_DEPEND_libulog=	libmd
 SUBDIR_DEPEND_libunbound= ${_libldns}
 SUBDIR_DEPEND_liblzma=	libthr
 .if ${MK_OFED} != "no"
 SUBDIR_DEPEND_libpcap=	ofed
 .endif
 SUBDIR_DEPEND_nss_tacplus= libtacplus
 
 # NB: keep these sorted by MK_* knobs
 
 SUBDIR.${MK_BEARSSL}+=	libbearssl libsecureboot
 SUBDIR.${MK_BLACKLIST}+=libblacklist
 SUBDIR.${MK_BLUETOOTH}+=libbluetooth libsdp
 SUBDIR.${MK_BSNMP}+=	libbsnmp
 .if !defined(COMPAT_LIBCOMPAT)
 .if ${MK_CLANG} != "no" || ${MK_LLD} != "no" || \
     ${MK_LLDB} != "no" || ${MK_LLVM_BINUTILS} != "no"
 SUBDIR+=	clang
 .endif
 .endif
 SUBDIR.${MK_CUSE}+=	libcuse
 SUBDIR.${MK_TOOLCHAIN}+=libpe
 SUBDIR.${MK_DIALOG}+=	libdpv libfigpar
 SUBDIR.${MK_FDT}+=	libfdt
 SUBDIR.${MK_FILE}+=	libmagic
 SUBDIR.${MK_GPIO}+=	libgpio
 SUBDIR.${MK_GSSAPI}+=	libgssapi librpcsec_gss
 SUBDIR.${MK_ICONV}+=	libiconv_modules
 SUBDIR.${MK_KERBEROS_SUPPORT}+= libcom_err
 SUBDIR.${MK_LDNS}+=	libldns
 SUBDIR.${MK_STATS}+=	libstats
 
 # The libraries under libclang_rt can only be built by clang.
 .if ${COMPILER_TYPE} == "clang" && ${MK_CLANG} != "no"
 _libclang_rt=	libclang_rt
 .elif (${MK_ASAN} != "no" || ${MK_UBSAN} != "no") && make(all)
 .error Requested build with sanitizers but cannot build runtime libraries!
 .endif
 
 SUBDIR.${MK_EFI}+=	libefivar
 SUBDIR.${MK_GOOGLETEST}+= googletest
 SUBDIR.${MK_NETGRAPH}+= libnetgraph
 SUBDIR.${MK_NIS}+=	libypclnt
 
 .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64"
 _libvgl=	libvgl
 .endif
 
 .if ${MACHINE_CPUARCH} == "aarch64"
 SUBDIR.${MK_PMC}+=	libopencsd
 .endif
 
 .if ${MACHINE_CPUARCH} == "amd64"
 SUBDIR.${MK_PMC}+=	libipt
 .endif
 
-.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "aarch64"
+.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "aarch64" || \
+    ${MACHINE_CPUARCH} == "riscv"
 SUBDIR.${MK_BHYVE}+=	libvmmapi
 .endif
 
 .if ${MACHINE_ARCH} != "powerpc" && ${MACHINE_CPUARCH} != "arm"
 SUBDIR.${MK_OPENMP}+= libomp
 .endif
 
 .if ${MK_USB} != "no"
 SUBDIR.${MK_OPENSSH}+= libcbor libfido2
 .endif
 
 SUBDIR.${MK_OPENSSL}+= libmp
 SUBDIR.${MK_PF}+=	libpfctl
 SUBDIR.${MK_PMC}+=	libpmc libpmcstat
 SUBDIR.${MK_RADIUS_SUPPORT}+= libradius
 SUBDIR.${MK_SENDMAIL}+= libmilter libsm libsmdb libsmutil
 SUBDIR.${MK_TELNET}+=	libtelnet
 SUBDIR.${MK_TESTS_SUPPORT}+= atf
 SUBDIR.${MK_TESTS_SUPPORT}+= liblutok
 SUBDIR.${MK_TESTS}+=	tests
 SUBDIR.${MK_UNBOUND}+= libunbound
 SUBDIR.${MK_USB}+=	libusbhid libusb
 SUBDIR.${MK_OFED}+=	ofed
 SUBDIR.${MK_VERIEXEC}+= libveriexec
 SUBDIR.${MK_ZFS}+=	libbe
 
 .if !make(install)
 SUBDIR_PARALLEL=
 .endif
 
 .include <bsd.subdir.mk>
diff --git a/lib/libvmmapi/riscv/Makefile.inc b/lib/libvmmapi/riscv/Makefile.inc
new file mode 100644
index 000000000000..663ea0ab90a3
--- /dev/null
+++ b/lib/libvmmapi/riscv/Makefile.inc
@@ -0,0 +1 @@
+SRCS+=	vmmapi_machdep.c
diff --git a/lib/libvmmapi/riscv/vmmapi_machdep.c b/lib/libvmmapi/riscv/vmmapi_machdep.c
new file mode 100644
index 000000000000..9c70185942c9
--- /dev/null
+++ b/lib/libvmmapi/riscv/vmmapi_machdep.c
@@ -0,0 +1,117 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include "vmmapi.h"
+#include "internal.h"
+
+const char *vm_capstrmap[] = {
+	[VM_CAP_MAX] = NULL,
+};
+
+#define	VM_MD_IOCTLS	\
+	VM_ATTACH_APLIC,	\
+	VM_ASSERT_IRQ,		\
+	VM_DEASSERT_IRQ,	\
+	VM_RAISE_MSI
+
+const cap_ioctl_t vm_ioctl_cmds[] = {
+	VM_COMMON_IOCTLS,
+	VM_MD_IOCTLS,
+};
+size_t vm_ioctl_ncmds = nitems(vm_ioctl_cmds);
+
+int
+vm_attach_aplic(struct vmctx *ctx, uint64_t mem_start, size_t mem_size)
+{
+	struct vm_aplic_descr aplic;
+
+	bzero(&aplic, sizeof(aplic));
+	aplic.mem_start = mem_start;
+	aplic.mem_size = mem_size;
+
+	return (ioctl(ctx->fd, VM_ATTACH_APLIC, &aplic));
+}
+
+int
+vm_assert_irq(struct vmctx *ctx, uint32_t irq)
+{
+	struct vm_irq vi;
+
+	bzero(&vi, sizeof(vi));
+	vi.irq = irq;
+
+	return (ioctl(ctx->fd, VM_ASSERT_IRQ, &vi));
+}
+
+int
+vm_deassert_irq(struct vmctx *ctx, uint32_t irq)
+{
+	struct vm_irq vi;
+
+	bzero(&vi, sizeof(vi));
+	vi.irq = irq;
+
+	return (ioctl(ctx->fd, VM_DEASSERT_IRQ, &vi));
+}
+
+int
+vm_raise_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg,
+    int bus, int slot, int func)
+{
+	struct vm_msi vmsi;
+
+	bzero(&vmsi, sizeof(vmsi));
+	vmsi.addr = addr;
+	vmsi.msg = msg;
+	vmsi.bus = bus;
+	vmsi.slot = slot;
+	vmsi.func = func;
+
+	return (ioctl(ctx->fd, VM_RAISE_MSI, &vmsi));
+}
+
+int
+vm_inject_exception(struct vcpu *vcpu, uint64_t scause)
+{
+	struct vm_exception vmexc;
+
+	bzero(&vmexc, sizeof(vmexc));
+	vmexc.scause = scause;
+
+	return (vcpu_ioctl(vcpu, VM_INJECT_EXCEPTION, &vmexc));
+}
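For orientation, a minimal, self-contained sketch of how a RISC-V consumer of libvmmapi might drive the entry points defined above. The VM name, APLIC window, and interrupt source number are illustrative assumptions, not values taken from this change:

#include <err.h>

#include <vmmapi.h>

int
main(void)
{
	struct vmctx *ctx;

	/* Assumes a VM named "testvm" has already been created. */
	ctx = vm_open("testvm");
	if (ctx == NULL)
		err(1, "vm_open");

	/* Hypothetical APLIC window; the guest memory map is the caller's choice. */
	if (vm_attach_aplic(ctx, 0x2f000000UL, 0x10000) != 0)
		err(1, "vm_attach_aplic");

	/* Pulse a hypothetical wired interrupt source. */
	if (vm_assert_irq(ctx, 5) != 0 || vm_deassert_irq(ctx, 5) != 0)
		err(1, "vm_assert_irq/vm_deassert_irq");

	vm_close(ctx);
	return (0);
}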
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index d2a217c4d2e9..0ea1d5824271 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -1,295 +1,300 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef _VMMAPI_H_
 #define	_VMMAPI_H_
 
 #include
 #include
 #include
 #include
 #include
 
 /*
  * API version for out-of-tree consumers like grub-bhyve for making compile
  * time decisions.
  */
 #define	VMMAPI_VERSION	0200	/* 2 digit major followed by 2 digit minor */
 
 struct iovec;
 struct vcpu;
 struct vmctx;
 struct vm_snapshot_meta;
 enum x2apic_state;
 
 /*
  * Different styles of mapping the memory assigned to a VM into the address
  * space of the controlling process.
  */
 enum vm_mmap_style {
 	VM_MMAP_NONE,		/* no mapping */
 	VM_MMAP_ALL,		/* fully and statically mapped */
 	VM_MMAP_SPARSE,		/* mappings created on-demand */
 };
 
 /*
  * 'flags' value passed to 'vm_set_memflags()'.
  */
 #define	VM_MEM_F_INCORE	0x01	/* include guest memory in core file */
 #define	VM_MEM_F_WIRED	0x02	/* guest memory is wired */
 
 /*
  * Identifiers for memory segments:
  * - vm_setup_memory() uses VM_SYSMEM for the system memory segment.
  * - the remaining identifiers can be used to create devmem segments.
  */
 enum {
 	VM_SYSMEM,
 	VM_BOOTROM,
 	VM_FRAMEBUFFER,
 	VM_PCIROM,
 };
 
 __BEGIN_DECLS
 /*
  * Get the length and name of the memory segment identified by 'segid'.
  * Note that system memory segments are identified with a nul name.
  *
  * Returns 0 on success and non-zero otherwise.
  */
 int	vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name,
 	    size_t namesiz);
 
 /*
  * Iterate over the guest address space. This function finds an address range
  * that starts at an address >= *gpa.
  *
  * Returns 0 if the next address range was found and non-zero otherwise.
  */
 int	vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
 	    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
 
 int	vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
 	    size_t *lowmem_size, size_t *highmem_size);
 
 /*
  * Create a device memory segment identified by 'segid'.
  *
  * Returns a pointer to the memory segment on success and MAP_FAILED otherwise.
  */
 void	*vm_create_devmem(struct vmctx *ctx, int segid, const char *name,
 	    size_t len);
 
 /*
  * Map the memory segment identified by 'segid' into the guest address space
  * at [gpa,gpa+len) with protection 'prot'.
  */
 int	vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid,
 	    vm_ooffset_t segoff, size_t len, int prot);
 
 int	vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len);
 
 int	vm_create(const char *name);
 struct vmctx *vm_open(const char *name);
 void	vm_close(struct vmctx *ctx);
 void	vm_destroy(struct vmctx *ctx);
 int	vm_limit_rights(struct vmctx *ctx);
 struct vcpu *vm_vcpu_open(struct vmctx *ctx, int vcpuid);
 void	vm_vcpu_close(struct vcpu *vcpu);
 int	vcpu_id(struct vcpu *vcpu);
 int	vm_parse_memsize(const char *optarg, size_t *memsize);
 int	vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
 void	*vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
 /* inverse operation to vm_map_gpa - extract guest address from host pointer */
 vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr);
 #ifdef __amd64__
 int	vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
 int	vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
 	    uint64_t gla, int prot, uint64_t *gpa, int *fault);
 #endif
 int	vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
 	    uint64_t gla, int prot, uint64_t *gpa, int *fault);
 uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
 void	vm_set_memflags(struct vmctx *ctx, int flags);
 int	vm_get_memflags(struct vmctx *ctx);
 const char *vm_get_name(struct vmctx *ctx);
 size_t	vm_get_lowmem_size(struct vmctx *ctx);
 vm_paddr_t vm_get_highmem_base(struct vmctx *ctx);
 size_t	vm_get_highmem_size(struct vmctx *ctx);
 #ifdef __amd64__
 int	vm_set_desc(struct vcpu *vcpu, int reg, uint64_t base, uint32_t limit,
 	    uint32_t access);
 int	vm_get_desc(struct vcpu *vcpu, int reg, uint64_t *base,
 	    uint32_t *limit, uint32_t *access);
 int	vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *seg_desc);
 #endif
 int	vm_set_register(struct vcpu *vcpu, int reg, uint64_t val);
 int	vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval);
 int	vm_set_register_set(struct vcpu *vcpu, unsigned int count,
 	    const int *regnums, uint64_t *regvals);
 int	vm_get_register_set(struct vcpu *vcpu, unsigned int count,
 	    const int *regnums, uint64_t *regvals);
 int	vm_run(struct vcpu *vcpu, struct vm_run *vmrun);
 int	vm_suspend(struct vmctx *ctx, enum vm_suspend_how how);
 int	vm_reinit(struct vmctx *ctx);
 
 int	vm_raise_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg, int bus,
 	    int slot, int func);
 
-#ifdef __aarch64__
+#if defined(__aarch64__)
 int	vm_attach_vgic(struct vmctx *ctx, uint64_t dist_start, size_t dist_size,
 	    uint64_t redist_start, size_t redist_size);
+int	vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far);
+#elif defined(__riscv)
+int	vm_attach_aplic(struct vmctx *ctx, uint64_t mem_start, size_t mem_size);
+int	vm_inject_exception(struct vcpu *vcpu, uint64_t scause);
+#endif
+#if defined(__aarch64__) || defined(__riscv)
 int	vm_assert_irq(struct vmctx *ctx, uint32_t irq);
 int	vm_deassert_irq(struct vmctx *ctx, uint32_t irq);
-int	vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far);
 #endif
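With the interrupt entry points now conditional on architecture, a portable caller selects them at compile time. A small illustrative sketch follows; the helper name and the notion of a generic "line" are assumptions for illustration, not part of this header:

#include <errno.h>
#include <stdint.h>

#include <vmmapi.h>

/* Raise a wired interrupt line from one arch-neutral call site. */
static int
platform_assert_line(struct vmctx *ctx, uint32_t line)
{
#if defined(__aarch64__) || defined(__riscv)
	/* vGIC (arm64) or APLIC (riscv) wired interrupt source. */
	return (vm_assert_irq(ctx, line));
#elif defined(__amd64__)
	/* The closest amd64 equivalent is an I/O APIC pin. */
	return (vm_ioapic_assert_irq(ctx, (int)line));
#else
	(void)ctx;
	(void)line;
	return (ENOSYS);
#endif
}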
 
 #ifdef __amd64__
 int	vm_apicid2vcpu(struct vmctx *ctx, int apicid);
 int	vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid,
 	    uint32_t errcode, int restart_instruction);
 int	vm_lapic_irq(struct vcpu *vcpu, int vector);
 int	vm_lapic_local_irq(struct vcpu *vcpu, int vector);
 int	vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg);
 int	vm_ioapic_assert_irq(struct vmctx *ctx, int irq);
 int	vm_ioapic_deassert_irq(struct vmctx *ctx, int irq);
 int	vm_ioapic_pulse_irq(struct vmctx *ctx, int irq);
 int	vm_ioapic_pincount(struct vmctx *ctx, int *pincount);
 int	vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
 int	vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
 int	vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
 int	vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq,
 	    enum vm_intr_trigger trigger);
 int	vm_inject_nmi(struct vcpu *vcpu);
 int	vm_readwrite_kernemu_device(struct vcpu *vcpu, vm_paddr_t gpa,
 	    bool write, int size, uint64_t *value);
 #endif
 
 int	vm_capability_name2type(const char *capname);
 const char *vm_capability_type2name(int type);
 int	vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval);
 int	vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val);
 int	vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
 int	vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
 int	vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
 	    vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
 int	vm_unmap_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
 	    vm_paddr_t gpa, size_t len);
 int	vm_setup_pptdev_msi(struct vmctx *ctx, int bus, int slot, int func,
 	    uint64_t addr, uint64_t msg, int numvec);
 int	vm_setup_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func,
 	    int idx, uint64_t addr, uint64_t msg, uint32_t vector_control);
 int	vm_disable_pptdev_msix(struct vmctx *ctx, int bus, int slot, int func);
 
 int	vm_get_intinfo(struct vcpu *vcpu, uint64_t *i1, uint64_t *i2);
 int	vm_set_intinfo(struct vcpu *vcpu, uint64_t exit_intinfo);
 
 /*
  * Return a pointer to the statistics buffer. Note that this is not MT-safe.
  */
 uint64_t *vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
 	    int *ret_entries);
 const char *vm_get_stat_desc(struct vmctx *ctx, int index);
 
 #ifdef __amd64__
 int	vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *s);
 int	vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state s);
 
 int	vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);
 
 /*
  * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'.
  * The 'iovcnt' should be big enough to accommodate all GPA segments.
  *
  * retval	fault		Interpretation
  *   0		  0		Success
  *   0		  1		An exception was injected into the guest
  * EFAULT	 N/A		Error
  */
 int	vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *pg,
 	    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
 	    int *fault);
 #endif
 void	vm_copyin(struct iovec *guest_iov, void *host_dst, size_t len);
 void	vm_copyout(const void *host_src, struct iovec *guest_iov, size_t len);
 void	vm_copy_teardown(struct iovec *iov, int iovcnt);
 
 #ifdef __amd64__
 /* RTC */
 int	vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value);
 int	vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval);
 int	vm_rtc_settime(struct vmctx *ctx, time_t secs);
 int	vm_rtc_gettime(struct vmctx *ctx, time_t *secs);
 #endif
 
 /* Reset vcpu register state */
 int	vcpu_reset(struct vcpu *vcpu);
 
 int	vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus);
 int	vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus);
 int	vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus);
 int	vm_activate_cpu(struct vcpu *vcpu);
 int	vm_suspend_all_cpus(struct vmctx *ctx);
 int	vm_suspend_cpu(struct vcpu *vcpu);
 int	vm_resume_all_cpus(struct vmctx *ctx);
 int	vm_resume_cpu(struct vcpu *vcpu);
 int	vm_restart_instruction(struct vcpu *vcpu);
 
 /* CPU topology */
 int	vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores,
 	    uint16_t threads, uint16_t maxcpus);
 int	vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores,
 	    uint16_t *threads, uint16_t *maxcpus);
 
 /*
  * FreeBSD specific APIs
  */
 int	vm_setup_freebsd_registers(struct vcpu *vcpu, uint64_t rip,
 	    uint64_t cr3, uint64_t gdtbase, uint64_t rsp);
 int	vm_setup_freebsd_registers_i386(struct vcpu *vcpu, uint32_t eip,
 	    uint32_t gdtbase, uint32_t esp);
 void	vm_setup_freebsd_gdt(uint64_t *gdtr);
 
 /*
  * Save and restore
  */
 int	vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta);
 int	vm_restore_time(struct vmctx *ctx);
 
 /*
  * Deprecated interfaces, do not use them in new code.
  */
 int	vm_get_device_fd(struct vmctx *ctx);
 const cap_ioctl_t *vm_get_ioctls(size_t *len);
 
 __END_DECLS
 
 #endif	/* _VMMAPI_H_ */
diff --git a/usr.sbin/Makefile.riscv b/usr.sbin/Makefile.riscv
new file mode 100644
index 000000000000..a8897983059c
--- /dev/null
+++ b/usr.sbin/Makefile.riscv
@@ -0,0 +1,2 @@
+SUBDIR+=	bhyve
+SUBDIR+=	bhyvectl
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index b61f29aa6830..2f04a488d9c1 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -1,2804 +1,2804 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 
 #include "acpi.h"
 #include "bhyverun.h"
 #include "bootrom.h"
 #include "config.h"
 #include "debug.h"
 #ifdef __amd64__
 #include "amd64/inout.h"
 #endif
 #include "mem.h"
 #include "pci_emul.h"
 #ifdef __amd64__
 #include "amd64/pci_lpc.h"
 #include "pci_passthru.h"
 #endif
 #include "qemu_fwcfg.h"
 
 #define	CONF1_ADDR_PORT	0x0cf8
 #define	CONF1_DATA_PORT	0x0cfc
 
 #define	CONF1_ENABLE	0x80000000ul
 
 #define	MAXBUSES	(PCI_BUSMAX + 1)
 #define	MAXSLOTS	(PCI_SLOTMAX + 1)
 #define	MAXFUNCS	(PCI_FUNCMAX + 1)
 
 #define	GB	(1024 * 1024 * 1024UL)
 
 struct funcinfo {
 	nvlist_t *fi_config;
 	struct pci_devemu *fi_pde;
 	struct pci_devinst *fi_devi;
 };
 
 struct intxinfo {
 	int	ii_count;
 	struct pci_irq ii_irq;
 };
 
 struct slotinfo {
 	struct intxinfo si_intpins[4];
 	struct funcinfo si_funcs[MAXFUNCS];
 };
 
 struct businfo {
 	uint16_t iobase, iolimit;		/* I/O window */
 	uint32_t membase32, memlimit32;		/* mmio window below 4GB */
 	uint64_t membase64, memlimit64;		/* mmio window above 4GB */
 	struct slotinfo slotinfo[MAXSLOTS];
 };
 
 static struct businfo *pci_businfo[MAXBUSES];
 
 SET_DECLARE(pci_devemu_set, struct pci_devemu);
 
 static uint64_t pci_emul_iobase;
 static uint8_t *pci_emul_rombase;
 static uint64_t pci_emul_romoffset;
 static uint8_t *pci_emul_romlim;
 static uint64_t pci_emul_membase32;
 static uint64_t pci_emul_membase64;
 static uint64_t pci_emul_memlim64;
 
 struct pci_bar_allocation {
 	TAILQ_ENTRY(pci_bar_allocation) chain;
 	struct pci_devinst *pdi;
 	int idx;
 	enum pcibar_type type;
 	uint64_t size;
 };
 
 static TAILQ_HEAD(pci_bar_list, pci_bar_allocation) pci_bars =
     TAILQ_HEAD_INITIALIZER(pci_bars);
 
 struct boot_device {
 	TAILQ_ENTRY(boot_device) boot_device_chain;
 	struct pci_devinst *pdi;
 	int bootindex;
 };
 static TAILQ_HEAD(boot_list, boot_device) boot_devices =
     TAILQ_HEAD_INITIALIZER(boot_devices);
 
 #if defined(__amd64__)
 #define	PCI_EMUL_IOBASE		0x2000
 #define	PCI_EMUL_IOLIMIT	0x10000
 #define	PCI_EMUL_IOMASK		0xffff
 /*
  * OVMF always uses 0xc0000000 as base address for 32 bit PCI MMIO. Don't
  * change this address without changing it in OVMF.
  */
 #define	PCI_EMUL_MEMBASE32	0xc0000000
-#elif defined(__aarch64__)
+#elif defined(__aarch64__) || defined(__riscv)
 #define	PCI_EMUL_IOBASE		0xdf000000UL
 #define	PCI_EMUL_IOLIMIT	0xe0000000UL
 #define	PCI_EMUL_MEMBASE32	0xa0000000UL
 #else
 #error Unsupported platform
 #endif
 
 #define	PCI_EMUL_ROMSIZE	0x10000000
 
 #define	PCI_EMUL_ECFG_BASE	0xE0000000		    /* 3.5GB */
 #define	PCI_EMUL_ECFG_SIZE	(MAXBUSES * 1024 * 1024)    /* 1MB per bus */
 #ifdef __amd64__
 SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE);
 #endif
 
 #define	PCI_EMUL_MEMLIMIT32	PCI_EMUL_ECFG_BASE
 #define	PCI_EMUL_MEMSIZE64	(32*GB)
 
 static void pci_lintr_route(struct pci_devinst *pi);
 static void pci_lintr_update(struct pci_devinst *pi);
 
 static struct pci_devemu *pci_emul_finddev(const char *name);
 static void pci_cfgrw(int in, int bus, int slot, int func, int coff,
     int bytes, uint32_t *val);
 
 static __inline void
 CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes)
 {
 	if (bytes == 1)
 		pci_set_cfgdata8(pi, coff, val);
 	else if (bytes == 2)
 		pci_set_cfgdata16(pi, coff, val);
 	else
 		pci_set_cfgdata32(pi, coff, val);
 }
 
 static __inline uint32_t
 CFGREAD(struct pci_devinst *pi, int coff, int bytes)
 {
 	if (bytes == 1)
 		return (pci_get_cfgdata8(pi, coff));
 	else if (bytes == 2)
 		return (pci_get_cfgdata16(pi, coff));
 	else
 		return (pci_get_cfgdata32(pi, coff));
 }
 
 static int
 is_pcir_bar(int coff)
 {
 	return (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1));
 }
 
 static int
 is_pcir_bios(int coff)
 {
 	return (coff >= PCIR_BIOS && coff < PCIR_BIOS + 4);
 }
 
 /*
  * I/O access
  */
 
 /*
  * Slot options are in the form:
  *
  *  <bus>:<slot>:<func>,<emul>[,<config>]
  *  <slot>[:<func>],<emul>[,<config>]
  *
  *  slot is 0..31
  *  func is 0..7
  *  emul is a string describing the type of PCI device e.g. virtio-net
  *  config is an optional string, depending on the device, that can be
  *  used for configuration.
  *   Examples are:
  *     1,virtio-net,tap0
  *     3:0,dummy
  */
 static void
 pci_parse_slot_usage(char *aopt)
 {
 	EPRINTLN("Invalid PCI slot info field \"%s\"", aopt);
 }
 
 /*
  * Helper function to parse a list of comma-separated options where
  * each option is formatted as "name[=value]". If no value is
  * provided, the option is treated as a boolean and is given a value
  * of true.
  */
 int
 pci_parse_legacy_config(nvlist_t *nvl, const char *opt)
 {
 	char *config, *name, *tofree, *value;
 
 	if (opt == NULL)
 		return (0);
 
 	config = tofree = strdup(opt);
 	while ((name = strsep(&config, ",")) != NULL) {
 		value = strchr(name, '=');
 		if (value != NULL) {
 			*value = '\0';
 			value++;
 			set_config_value_node(nvl, name, value);
 		} else
 			set_config_bool_node(nvl, name, true);
 	}
 	free(tofree);
 	return (0);
 }
 
 /*
  * PCI device configuration is stored in MIBs that encode the device's
  * location:
  *
  * pci.<bus>.<slot>.<func>
  *
  * Where "bus", "slot", and "func" are all decimal values without
  * leading zeroes. Each valid device must have a "device" node which
  * identifies the driver model of the device.
  *
  * Device backends can provide a parser for the "config" string. If
  * a custom parser is not provided, pci_parse_legacy_config() is used
  * to parse the string.
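  *
  * As an illustrative example (not part of this change): for a device that
  * relies on the legacy parser, an option string of "opt1=val,opt2" ends up
  * stored under the device's MIB as if by
  *
  *   set_config_value_node(nvl, "opt1", "val");
  *   set_config_bool_node(nvl, "opt2", true);
  *
  * i.e. a value-less option becomes a boolean set to true.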
*/ int pci_parse_slot(char *opt) { char node_name[sizeof("pci.XXX.XX.X")]; struct pci_devemu *pde; char *emul, *config, *str, *cp; int error, bnum, snum, fnum; nvlist_t *nvl; error = -1; str = strdup(opt); emul = config = NULL; if ((cp = strchr(str, ',')) != NULL) { *cp = '\0'; emul = cp + 1; if ((cp = strchr(emul, ',')) != NULL) { *cp = '\0'; config = cp + 1; } } else { pci_parse_slot_usage(opt); goto done; } /* :: */ if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) { bnum = 0; /* : */ if (sscanf(str, "%d:%d", &snum, &fnum) != 2) { fnum = 0; /* */ if (sscanf(str, "%d", &snum) != 1) { snum = -1; } } } if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS || fnum < 0 || fnum >= MAXFUNCS) { pci_parse_slot_usage(opt); goto done; } pde = pci_emul_finddev(emul); if (pde == NULL) { EPRINTLN("pci slot %d:%d:%d: unknown device \"%s\"", bnum, snum, fnum, emul); goto done; } snprintf(node_name, sizeof(node_name), "pci.%d.%d.%d", bnum, snum, fnum); nvl = find_config_node(node_name); if (nvl != NULL) { EPRINTLN("pci slot %d:%d:%d already occupied!", bnum, snum, fnum); goto done; } nvl = create_config_node(node_name); if (pde->pe_alias != NULL) set_config_value_node(nvl, "device", pde->pe_alias); else set_config_value_node(nvl, "device", pde->pe_emu); if (pde->pe_legacy_config != NULL) error = pde->pe_legacy_config(nvl, config); else error = pci_parse_legacy_config(nvl, config); done: free(str); return (error); } void pci_print_supported_devices(void) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; printf("%s\n", pdp->pe_emu); } } uint32_t pci_config_read_reg(const struct pcisel *const host_sel, nvlist_t *nvl, const uint32_t reg, const uint8_t size, const uint32_t def) { const char *config; const nvlist_t *pci_regs; assert(size == 1 || size == 2 || size == 4); pci_regs = find_relative_config_node(nvl, "pcireg"); if (pci_regs == NULL) { return def; } switch (reg) { case PCIR_DEVICE: config = get_config_value_node(pci_regs, "device"); break; case PCIR_VENDOR: config = get_config_value_node(pci_regs, "vendor"); break; case PCIR_REVID: config = get_config_value_node(pci_regs, "revid"); break; case PCIR_SUBVEND_0: config = get_config_value_node(pci_regs, "subvendor"); break; case PCIR_SUBDEV_0: config = get_config_value_node(pci_regs, "subdevice"); break; default: return (-1); } if (config == NULL) { return def; } else if (host_sel != NULL && strcmp(config, "host") == 0) { #ifdef __amd64__ return pci_host_read_config(host_sel, reg, size); #else errx(1, "cannot fetch host PCI configuration"); #endif } else { return strtol(config, NULL, 16); } } static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { if (offset < pi->pi_msix.pba_offset) return (0); if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { return (0); } return (1); } int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, uint64_t value) { int msix_entry_offset; int tab_index; char *dest; /* support only 4 or 8 byte writes */ if (size != 4 && size != 8) return (-1); /* * Return if table index is beyond what device supports */ tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index >= pi->pi_msix.table_count) return (-1); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned writes */ if ((msix_entry_offset % size) != 0) return (-1); dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 4) *((uint32_t *)dest) = value; else *((uint64_t *)dest) = value; return (0); } uint64_t 
pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) { char *dest; int msix_entry_offset; int tab_index; uint64_t retval = ~0; /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X * table but we also allow 1 byte access to accommodate reads from * ddb. */ if (size != 1 && size != 4 && size != 8) return (retval); msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; /* support only aligned reads */ if ((msix_entry_offset % size) != 0) { return (retval); } tab_index = offset / MSIX_TABLE_ENTRY_SIZE; if (tab_index < pi->pi_msix.table_count) { /* valid MSI-X Table access */ dest = (char *)(pi->pi_msix.table + tab_index); dest += msix_entry_offset; if (size == 1) retval = *((uint8_t *)dest); else if (size == 4) retval = *((uint32_t *)dest); else retval = *((uint64_t *)dest); } else if (pci_valid_pba_offset(pi, offset)) { /* return 0 for PBA access */ retval = 0; } return (retval); } int pci_msix_table_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.table_bar); else return (-1); } int pci_msix_pba_bar(struct pci_devinst *pi) { if (pi->pi_msix.table != NULL) return (pi->pi_msix.pba_bar); else return (-1); } #ifdef __amd64__ static int pci_emul_io_handler(struct vmctx *ctx __unused, int in, int port, int bytes, uint32_t *eax, void *arg) { struct pci_devinst *pdi = arg; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int i; assert(port >= 0); for (i = 0; i <= PCI_BARMAX; i++) { if (pdi->pi_bar[i].type == PCIBAR_IO && (uint64_t)port >= pdi->pi_bar[i].addr && (uint64_t)port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) { offset = port - pdi->pi_bar[i].addr; if (in) *eax = (*pe->pe_barread)(pdi, i, offset, bytes); else (*pe->pe_barwrite)(pdi, i, offset, bytes, *eax); return (0); } } return (-1); } #else static int pci_emul_iomem_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { struct pci_devinst *pdi = arg1; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int bidx = (int)arg2; assert(bidx <= PCI_BARMAX); assert(pdi->pi_bar[bidx].type == PCIBAR_IO); assert(addr >= pdi->pi_bar[bidx].addr && addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); assert(size == 1 || size == 2 || size == 4); offset = addr - pdi->pi_bar[bidx].addr; if (dir == MEM_F_READ) *val = (*pe->pe_barread)(pdi, bidx, offset, size); else (*pe->pe_barwrite)(pdi, bidx, offset, size, *val); return (0); } #endif /* !__amd64__ */ static int pci_emul_mem_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, int size, uint64_t *val, void *arg1, long arg2) { struct pci_devinst *pdi = arg1; struct pci_devemu *pe = pdi->pi_d; uint64_t offset; int bidx = (int)arg2; assert(bidx <= PCI_BARMAX); assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 || pdi->pi_bar[bidx].type == PCIBAR_MEM64); assert(addr >= pdi->pi_bar[bidx].addr && addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size); offset = addr - pdi->pi_bar[bidx].addr; if (dir == MEM_F_WRITE) { if (size == 8) { (*pe->pe_barwrite)(pdi, bidx, offset, 4, *val & 0xffffffff); (*pe->pe_barwrite)(pdi, bidx, offset + 4, 4, *val >> 32); } else { (*pe->pe_barwrite)(pdi, bidx, offset, size, *val); } } else { if (size == 8) { *val = (*pe->pe_barread)(pdi, bidx, offset, 4); *val |= (*pe->pe_barread)(pdi, bidx, offset + 4, 4) << 32; } else { *val = (*pe->pe_barread)(pdi, bidx, offset, size); } } return (0); } static int pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size, uint64_t *addr) { uint64_t base; assert((size & (size - 
1)) == 0); /* must be a power of 2 */ base = roundup2(*baseptr, size); if (base + size <= limit) { *addr = base; *baseptr = base + size; return (0); } else return (-1); } /* * Register (or unregister) the MMIO or I/O region associated with the BAR * register 'idx' of an emulated pci device. */ static void modify_bar_registration(struct pci_devinst *pi, int idx, int registration) { struct pci_devemu *pe; int error; enum pcibar_type type; pe = pi->pi_d; type = pi->pi_bar[idx].type; switch (type) { case PCIBAR_IO: { #ifdef __amd64__ struct inout_port iop; bzero(&iop, sizeof(struct inout_port)); iop.name = pi->pi_name; iop.port = pi->pi_bar[idx].addr; iop.size = pi->pi_bar[idx].size; if (registration) { iop.flags = IOPORT_F_INOUT; iop.handler = pci_emul_io_handler; iop.arg = pi; error = register_inout(&iop); } else error = unregister_inout(&iop); #else struct mem_range mr; bzero(&mr, sizeof(struct mem_range)); mr.name = pi->pi_name; mr.base = pi->pi_bar[idx].addr; mr.size = pi->pi_bar[idx].size; if (registration) { mr.flags = MEM_F_RW; mr.handler = pci_emul_iomem_handler; mr.arg1 = pi; mr.arg2 = idx; error = register_mem(&mr); } else error = unregister_mem(&mr); #endif break; } case PCIBAR_MEM32: case PCIBAR_MEM64: { struct mem_range mr; bzero(&mr, sizeof(struct mem_range)); mr.name = pi->pi_name; mr.base = pi->pi_bar[idx].addr; mr.size = pi->pi_bar[idx].size; if (registration) { mr.flags = MEM_F_RW; mr.handler = pci_emul_mem_handler; mr.arg1 = pi; mr.arg2 = idx; error = register_mem(&mr); } else error = unregister_mem(&mr); break; } case PCIBAR_ROM: error = 0; break; default: error = EINVAL; break; } assert(error == 0); if (pe->pe_baraddr != NULL) (*pe->pe_baraddr)(pi, idx, registration, pi->pi_bar[idx].addr); } static void unregister_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 0); } static void register_bar(struct pci_devinst *pi, int idx) { modify_bar_registration(pi, idx, 1); } /* Is the ROM enabled for the emulated pci device? */ static int romen(struct pci_devinst *pi) { return (pi->pi_bar[PCI_ROM_IDX].lobits & PCIM_BIOS_ENABLE) == PCIM_BIOS_ENABLE; } /* Are we decoding i/o port accesses for the emulated pci device? */ static int porten(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_PORTEN); } /* Are we decoding memory accesses for the emulated pci device? */ static int memen(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (cmd & PCIM_CMD_MEMEN); } /* * Update the MMIO or I/O address that is decoded by the BAR register. * * If the pci device has enabled the address space decoding then intercept * the address range decoded by the BAR register. 
*/ static void update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) { int decode; if (pi->pi_bar[idx].type == PCIBAR_IO) decode = porten(pi); else decode = memen(pi); if (decode) unregister_bar(pi, idx); switch (type) { case PCIBAR_IO: case PCIBAR_MEM32: pi->pi_bar[idx].addr = addr; break; case PCIBAR_MEM64: pi->pi_bar[idx].addr &= ~0xffffffffUL; pi->pi_bar[idx].addr |= addr; break; case PCIBAR_MEMHI64: pi->pi_bar[idx].addr &= 0xffffffff; pi->pi_bar[idx].addr |= addr; break; default: assert(0); } if (decode) register_bar(pi, idx); } int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type, uint64_t size) { assert((type == PCIBAR_ROM) || (idx >= 0 && idx <= PCI_BARMAX)); assert((type != PCIBAR_ROM) || (idx == PCI_ROM_IDX)); if ((size & (size - 1)) != 0) size = 1UL << flsl(size); /* round up to a power of 2 */ /* Enforce minimum BAR sizes required by the PCI standard */ if (type == PCIBAR_IO) { if (size < 4) size = 4; } else if (type == PCIBAR_ROM) { if (size < ~PCIM_BIOS_ADDR_MASK + 1) size = ~PCIM_BIOS_ADDR_MASK + 1; } else { if (size < 16) size = 16; } /* * To reduce fragmentation of the MMIO space, we allocate the BARs by * size. Therefore, don't allocate the BAR yet. We create a list of all * BAR allocation which is sorted by BAR size. When all PCI devices are * initialized, we will assign an address to the BARs. */ /* create a new list entry */ struct pci_bar_allocation *const new_bar = malloc(sizeof(*new_bar)); memset(new_bar, 0, sizeof(*new_bar)); new_bar->pdi = pdi; new_bar->idx = idx; new_bar->type = type; new_bar->size = size; /* * Search for a BAR which size is lower than the size of our newly * allocated BAR. */ struct pci_bar_allocation *bar = NULL; TAILQ_FOREACH(bar, &pci_bars, chain) { if (bar->size < size) { break; } } if (bar == NULL) { /* * Either the list is empty or new BAR is the smallest BAR of * the list. Append it to the end of our list. */ TAILQ_INSERT_TAIL(&pci_bars, new_bar, chain); } else { /* * The found BAR is smaller than our new BAR. For that reason, * insert our new BAR before the found BAR. */ TAILQ_INSERT_BEFORE(bar, new_bar, chain); } /* * Enable PCI BARs only if we don't have a boot ROM, i.e., bhyveload was * used to load the initial guest image. Otherwise, we rely on the boot * ROM to handle this. */ if (!get_config_bool_default("pci.enable_bars", !bootrom_boot())) return (0); /* * pci_passthru devices synchronize their physical and virtual command * register on init. For that reason, the virtual cmd reg should be * updated as early as possible. */ uint16_t enbit = 0; switch (type) { case PCIBAR_IO: enbit = PCIM_CMD_PORTEN; break; case PCIBAR_MEM64: case PCIBAR_MEM32: enbit = PCIM_CMD_MEMEN; break; default: enbit = 0; break; } const uint16_t cmd = pci_get_cfgdata16(pdi, PCIR_COMMAND); pci_set_cfgdata16(pdi, PCIR_COMMAND, cmd | enbit); return (0); } static int pci_emul_assign_bar(struct pci_devinst *const pdi, const int idx, const enum pcibar_type type, const uint64_t size) { int error; uint64_t *baseptr, limit, addr, mask, lobits, bar; switch (type) { case PCIBAR_NONE: baseptr = NULL; addr = mask = lobits = 0; break; case PCIBAR_IO: baseptr = &pci_emul_iobase; limit = PCI_EMUL_IOLIMIT; mask = PCIM_BAR_IO_BASE; lobits = PCIM_BAR_IO_SPACE; break; case PCIBAR_MEM64: /* * XXX * Some drivers do not work well if the 64-bit BAR is allocated * above 4GB. Allow for this by allocating small requests under * 4GB unless then allocation size is larger than some arbitrary * number (128MB currently). 
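	 *
	 * For instance (illustrative numbers, not from this change): a
	 * hypothetical 256MB prefetchable framebuffer BAR would be placed
	 * in the 64-bit window above 4GB, while a 16KB register BAR that
	 * merely happens to be 64-bit capable stays below 4GB, where such
	 * drivers can still program it.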
*/ if (size > 128 * 1024 * 1024) { baseptr = &pci_emul_membase64; limit = pci_emul_memlim64; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 | PCIM_BAR_MEM_PREFETCH; } else { baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64; } break; case PCIBAR_MEM32: baseptr = &pci_emul_membase32; limit = PCI_EMUL_MEMLIMIT32; mask = PCIM_BAR_MEM_BASE; lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32; break; case PCIBAR_ROM: /* do not claim memory for ROM. OVMF will do it for us. */ baseptr = NULL; limit = 0; mask = PCIM_BIOS_ADDR_MASK; lobits = 0; break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); assert(0); } if (baseptr != NULL) { error = pci_emul_alloc_resource(baseptr, limit, size, &addr); if (error != 0) return (error); } else { addr = 0; } pdi->pi_bar[idx].type = type; pdi->pi_bar[idx].addr = addr; pdi->pi_bar[idx].size = size; /* * passthru devices are using same lobits as physical device they set * this property */ if (pdi->pi_bar[idx].lobits != 0) { lobits = pdi->pi_bar[idx].lobits; } else { pdi->pi_bar[idx].lobits = lobits; } /* Initialize the BAR register in config space */ bar = (addr & mask) | lobits; pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar); if (type == PCIBAR_MEM64) { assert(idx + 1 <= PCI_BARMAX); pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } switch (type) { case PCIBAR_IO: if (porten(pdi)) register_bar(pdi, idx); break; case PCIBAR_MEM32: case PCIBAR_MEM64: case PCIBAR_MEMHI64: if (memen(pdi)) register_bar(pdi, idx); break; default: break; } return (0); } int pci_emul_alloc_rom(struct pci_devinst *const pdi, const uint64_t size, void **const addr) { /* allocate ROM space once on first call */ if (pci_emul_rombase == 0) { pci_emul_rombase = vm_create_devmem(pdi->pi_vmctx, VM_PCIROM, "pcirom", PCI_EMUL_ROMSIZE); if (pci_emul_rombase == MAP_FAILED) { warnx("%s: failed to create rom segment", __func__); return (-1); } pci_emul_romlim = pci_emul_rombase + PCI_EMUL_ROMSIZE; pci_emul_romoffset = 0; } /* ROM size should be a power of 2 and greater than 2 KB */ const uint64_t rom_size = MAX(1UL << flsl(size), ~PCIM_BIOS_ADDR_MASK + 1); /* check if ROM fits into ROM space */ if (pci_emul_romoffset + rom_size > PCI_EMUL_ROMSIZE) { warnx("%s: no space left in rom segment:", __func__); warnx("%16lu bytes left", PCI_EMUL_ROMSIZE - pci_emul_romoffset); warnx("%16lu bytes required by %d/%d/%d", rom_size, pdi->pi_bus, pdi->pi_slot, pdi->pi_func); return (-1); } /* allocate ROM BAR */ const int error = pci_emul_alloc_bar(pdi, PCI_ROM_IDX, PCIBAR_ROM, rom_size); if (error) return error; /* return address */ *addr = pci_emul_rombase + pci_emul_romoffset; /* save offset into ROM Space */ pdi->pi_romoffset = pci_emul_romoffset; /* increase offset for next ROM */ pci_emul_romoffset += rom_size; return (0); } int pci_emul_add_boot_device(struct pci_devinst *pi, int bootindex) { struct boot_device *new_device, *device; /* don't permit a negative bootindex */ if (bootindex < 0) { errx(4, "Invalid bootindex %d for %s", bootindex, pi->pi_name); } /* alloc new boot device */ new_device = calloc(1, sizeof(struct boot_device)); if (new_device == NULL) { return (ENOMEM); } new_device->pdi = pi; new_device->bootindex = bootindex; /* search for boot device with higher boot index */ TAILQ_FOREACH(device, &boot_devices, boot_device_chain) { if (device->bootindex == bootindex) { errx(4, "Could not set bootindex %d for %s. 
Bootindex already occupied by %s", bootindex, pi->pi_name, device->pdi->pi_name); } else if (device->bootindex > bootindex) { break; } } /* add boot device to queue */ if (device == NULL) { TAILQ_INSERT_TAIL(&boot_devices, new_device, boot_device_chain); } else { TAILQ_INSERT_BEFORE(device, new_device, boot_device_chain); } return (0); } #define CAP_START_OFFSET 0x40 static int pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen) { int i, capoff, reallen; uint16_t sts; assert(caplen > 0); reallen = roundup2(caplen, 4); /* dword aligned */ sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) == 0) capoff = CAP_START_OFFSET; else capoff = pi->pi_capend + 1; /* Check if we have enough space */ if (capoff + reallen > PCI_REGMAX + 1) return (-1); /* Set the previous capability pointer */ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) { pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff); pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT); } else pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff); /* Copy the capability */ for (i = 0; i < caplen; i++) pci_set_cfgdata8(pi, capoff + i, capdata[i]); /* Set the next capability pointer */ pci_set_cfgdata8(pi, capoff + 1, 0); pi->pi_prevcap = capoff; pi->pi_capend = capoff + reallen - 1; return (0); } static struct pci_devemu * pci_emul_finddev(const char *name) { struct pci_devemu **pdpp, *pdp; SET_FOREACH(pdpp, pci_devemu_set) { pdp = *pdpp; if (!strcmp(pdp->pe_emu, name)) { return (pdp); } } return (NULL); } static int pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot, int func, struct funcinfo *fi) { struct pci_devinst *pdi; int err; pdi = calloc(1, sizeof(struct pci_devinst)); pdi->pi_vmctx = ctx; pdi->pi_bus = bus; pdi->pi_slot = slot; pdi->pi_func = func; pthread_mutex_init(&pdi->pi_lintr.lock, NULL); pdi->pi_lintr.pin = 0; pdi->pi_lintr.state = IDLE; pci_irq_init_irq(&pdi->pi_lintr.irq); pdi->pi_d = pde; snprintf(pdi->pi_name, PI_NAMESZ, "%s@pci.%d.%d.%d", pde->pe_emu, bus, slot, func); /* Disable legacy interrupts */ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255); pci_set_cfgdata8(pdi, PCIR_INTPIN, 0); if (get_config_bool_default("pci.enable_bars", !bootrom_boot())) pci_set_cfgdata8(pdi, PCIR_COMMAND, PCIM_CMD_BUSMASTEREN); err = (*pde->pe_init)(pdi, fi->fi_config); if (err == 0) fi->fi_devi = pdi; else free(pdi); return (err); } void pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); mmc = ffs(msgnum) - 1; bzero(msicap, sizeof(struct msicap)); msicap->capid = PCIY_MSI; msicap->nextptr = nextptr; msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1); } int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum) { struct msicap msicap; pci_populate_msicap(&msicap, msgnum, 0); return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap))); } static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { assert(msix_tab_size % 4096 == 0); bzero(msixcap, sizeof(struct msixcap)); msixcap->capid = PCIY_MSIX; /* * Message Control Register, all fields set to * zero except for the Table Size. 
* Note: Table size N is encoded as N-1 */ msixcap->msgctrl = msgnum - 1; /* * MSI-X BAR setup: * - MSI-X table start at offset 0 * - PBA table starts at a 4K aligned offset after the MSI-X table */ msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK; msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK); } static void pci_msix_table_init(struct pci_devinst *pi, int table_entries) { int i, table_size; assert(table_entries > 0); assert(table_entries <= MAX_MSIX_TABLE_ENTRIES); table_size = table_entries * MSIX_TABLE_ENTRY_SIZE; pi->pi_msix.table = calloc(1, table_size); /* set mask bit of vector control register */ for (i = 0; i < table_entries; i++) pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK; } int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) { uint32_t tab_size; struct msixcap msixcap; assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ tab_size = roundup2(tab_size, 4096); pi->pi_msix.table_bar = barnum; pi->pi_msix.pba_bar = barnum; pi->pi_msix.table_offset = 0; pi->pi_msix.table_count = msgnum; pi->pi_msix.pba_offset = tab_size; pi->pi_msix.pba_size = PBA_SIZE(msgnum); pci_msix_table_init(pi, msgnum); pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size); /* allocate memory for MSI-X Table and PBA */ pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32, tab_size + pi->pi_msix.pba_size); return (pci_emul_add_capability(pi, (u_char *)&msixcap, sizeof(msixcap))); } static void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; int off; off = offset - capoff; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); } CFGWRITE(pi, offset, val, bytes); } static void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask, msgdata, mme; uint32_t addrlo; /* * If guest is writing to the message control register make sure * we do not overwrite read-only fields. */ if ((offset - capoff) == 2 && bytes == 2) { rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE; msgctrl = pci_get_cfgdata16(pi, offset); msgctrl &= ~rwmask; msgctrl |= val & rwmask; val = msgctrl; } CFGWRITE(pi, offset, val, bytes); msgctrl = pci_get_cfgdata16(pi, capoff + 2); addrlo = pci_get_cfgdata32(pi, capoff + 4); if (msgctrl & PCIM_MSICTRL_64BIT) msgdata = pci_get_cfgdata16(pi, capoff + 12); else msgdata = pci_get_cfgdata16(pi, capoff + 8); mme = msgctrl & PCIM_MSICTRL_MME_MASK; pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 
1 : 0; if (pi->pi_msi.enabled) { pi->pi_msi.addr = addrlo; pi->pi_msi.msg_data = msgdata; pi->pi_msi.maxmsgnum = 1 << (mme >> 4); } else { pi->pi_msi.maxmsgnum = 0; } pci_lintr_update(pi); } static void pciecap_cfgwrite(struct pci_devinst *pi, int capoff __unused, int offset, int bytes, uint32_t val) { /* XXX don't write to the readonly parts */ CFGWRITE(pi, offset, val, bytes); } #define PCIECAP_VERSION 0x2 int pci_emul_add_pciecap(struct pci_devinst *pi, int type) { int err; struct pciecap pciecap; bzero(&pciecap, sizeof(pciecap)); /* * Use the integrated endpoint type for endpoints on a root complex bus. * * NB: bhyve currently only supports a single PCI bus that is the root * complex bus, so all endpoints are integrated. */ if ((type == PCIEM_TYPE_ENDPOINT) && (pi->pi_bus == 0)) type = PCIEM_TYPE_ROOT_INT_EP; pciecap.capid = PCIY_EXPRESS; pciecap.pcie_capabilities = PCIECAP_VERSION | type; if (type != PCIEM_TYPE_ROOT_INT_EP) { pciecap.link_capabilities = 0x411; /* gen1, x1 */ pciecap.link_status = 0x11; /* gen1, x1 */ } err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap)); return (err); } /* * This function assumes that 'coff' is in the capabilities region of the * config space. A capoff parameter of zero will force a search for the * offset and type. */ void pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val, uint8_t capoff, int capid) { uint8_t nextoff; /* Do not allow un-aligned writes */ if ((offset & (bytes - 1)) != 0) return; if (capoff == 0) { /* Find the capability that we want to update */ capoff = CAP_START_OFFSET; while (1) { nextoff = pci_get_cfgdata8(pi, capoff + 1); if (nextoff == 0) break; if (offset >= capoff && offset < nextoff) break; capoff = nextoff; } assert(offset >= capoff); capid = pci_get_cfgdata8(pi, capoff); } /* * Capability ID and Next Capability Pointer are readonly. * However, some o/s's do 4-byte writes that include these. * For this case, trim the write back to 2 bytes and adjust * the data. */ if (offset == capoff || offset == capoff + 1) { if (offset == capoff && bytes == 4) { bytes = 2; offset += 2; val >>= 16; } else return; } switch (capid) { case PCIY_MSI: msicap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_MSIX: msixcap_cfgwrite(pi, capoff, offset, bytes, val); break; case PCIY_EXPRESS: pciecap_cfgwrite(pi, capoff, offset, bytes, val); break; default: break; } } static int pci_emul_iscap(struct pci_devinst *pi, int offset) { uint16_t sts; sts = pci_get_cfgdata16(pi, PCIR_STATUS); if ((sts & PCIM_STATUS_CAPPRESENT) != 0) { if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend) return (1); } return (0); } static int pci_emul_fallback_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr __unused, int size __unused, uint64_t *val, void *arg1 __unused, long arg2 __unused) { /* * Ignore writes; return 0xff's for reads. The mem read code * will take care of truncating to the correct size. 
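	 * E.g. a 2-byte read of an unclaimed address thus yields 0xffff,
	 * matching the all-ones value real hardware returns for accesses
	 * to nonexistent devices.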
*/ if (dir == MEM_F_READ) { *val = 0xffffffffffffffff; } return (0); } static int pci_emul_ecfg_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, int bytes, uint64_t *val, void *arg1 __unused, long arg2 __unused) { int bus, slot, func, coff, in; coff = addr & 0xfff; func = (addr >> 12) & 0x7; slot = (addr >> 15) & 0x1f; bus = (addr >> 20) & 0xff; in = (dir == MEM_F_READ); if (in) *val = ~0UL; pci_cfgrw(in, bus, slot, func, coff, bytes, (uint32_t *)val); return (0); } uint64_t pci_ecfg_base(void) { return (PCI_EMUL_ECFG_BASE); } static int init_bootorder(void) { struct boot_device *device; FILE *fp; char *bootorder; size_t bootorder_len; if (TAILQ_EMPTY(&boot_devices)) return (0); fp = open_memstream(&bootorder, &bootorder_len); TAILQ_FOREACH(device, &boot_devices, boot_device_chain) { fprintf(fp, "/pci@i0cf8/pci@%d,%d\n", device->pdi->pi_slot, device->pdi->pi_func); } fclose(fp); return (qemu_fwcfg_add_file("bootorder", bootorder_len, bootorder)); } #define BUSIO_ROUNDUP 32 #define BUSMEM32_ROUNDUP (1024 * 1024) #define BUSMEM64_ROUNDUP (512 * 1024 * 1024) int init_pci(struct vmctx *ctx) { char node_name[sizeof("pci.XXX.XX.X")]; struct mem_range mr; struct pci_devemu *pde; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; nvlist_t *nvl; const char *emul; size_t lowmem; int bus, slot, func; int error; if (vm_get_lowmem_limit(ctx) > PCI_EMUL_MEMBASE32) errx(EX_OSERR, "Invalid lowmem limit"); pci_emul_iobase = PCI_EMUL_IOBASE; pci_emul_membase32 = PCI_EMUL_MEMBASE32; pci_emul_membase64 = vm_get_highmem_base(ctx) + vm_get_highmem_size(ctx); pci_emul_membase64 = roundup2(pci_emul_membase64, PCI_EMUL_MEMSIZE64); pci_emul_memlim64 = pci_emul_membase64 + PCI_EMUL_MEMSIZE64; TAILQ_INIT(&boot_devices); for (bus = 0; bus < MAXBUSES; bus++) { snprintf(node_name, sizeof(node_name), "pci.%d", bus); nvl = find_config_node(node_name); if (nvl == NULL) continue; pci_businfo[bus] = calloc(1, sizeof(struct businfo)); bi = pci_businfo[bus]; /* * Keep track of the i/o and memory resources allocated to * this bus. */ bi->iobase = pci_emul_iobase; bi->membase32 = pci_emul_membase32; bi->membase64 = pci_emul_membase64; /* first run: init devices */ for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; snprintf(node_name, sizeof(node_name), "pci.%d.%d.%d", bus, slot, func); nvl = find_config_node(node_name); if (nvl == NULL) continue; fi->fi_config = nvl; emul = get_config_value_node(nvl, "device"); if (emul == NULL) { EPRINTLN("pci slot %d:%d:%d: missing " "\"device\" value", bus, slot, func); return (EINVAL); } pde = pci_emul_finddev(emul); if (pde == NULL) { EPRINTLN("pci slot %d:%d:%d: unknown " "device \"%s\"", bus, slot, func, emul); return (EINVAL); } if (pde->pe_alias != NULL) { EPRINTLN("pci slot %d:%d:%d: legacy " "device \"%s\", use \"%s\" instead", bus, slot, func, emul, pde->pe_alias); return (EINVAL); } fi->fi_pde = pde; error = pci_emul_init(ctx, pde, bus, slot, func, fi); if (error) return (error); } } /* second run: assign BARs and free list */ struct pci_bar_allocation *bar; struct pci_bar_allocation *bar_tmp; TAILQ_FOREACH_SAFE(bar, &pci_bars, chain, bar_tmp) { pci_emul_assign_bar(bar->pdi, bar->idx, bar->type, bar->size); free(bar); } TAILQ_INIT(&pci_bars); /* * Add some slop to the I/O and memory resources decoded by * this bus to give a guest some flexibility if it wants to * reprogram the BARs. 
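	 *
	 * Worked example (illustrative): with BUSMEM32_ROUNDUP = 1MB, a bus
	 * whose devices consumed 3.2MB of 32-bit MMIO first grows by 1MB of
	 * slop (to 4.2MB) and is then rounded up, so its memlimit32 lands on
	 * the 5MB boundary, leaving about 1.8MB of slack for reprogramming.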
*/ pci_emul_iobase += BUSIO_ROUNDUP; pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP); bi->iolimit = pci_emul_iobase; pci_emul_membase32 += BUSMEM32_ROUNDUP; pci_emul_membase32 = roundup2(pci_emul_membase32, BUSMEM32_ROUNDUP); bi->memlimit32 = pci_emul_membase32; pci_emul_membase64 += BUSMEM64_ROUNDUP; pci_emul_membase64 = roundup2(pci_emul_membase64, BUSMEM64_ROUNDUP); bi->memlimit64 = pci_emul_membase64; } /* * PCI backends are initialized before routing INTx interrupts * so that LPC devices are able to reserve ISA IRQs before * routing PIRQ pins. */ for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; pci_lintr_route(fi->fi_devi); } } } #ifdef __amd64__ lpc_pirq_routed(); #endif if ((error = init_bootorder()) != 0) { warnx("%s: Unable to init bootorder", __func__); return (error); } /* * The guest physical memory map looks like the following on amd64: * [0, lowmem) guest system memory * [lowmem, 0xC0000000) memory hole (may be absent) * [0xC0000000, 0xE0000000) PCI hole (32-bit BAR allocation) * [0xE0000000, 0xF0000000) PCI extended config window * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware * [4GB, 4GB + highmem) guest system memory * [roundup(4GB + highmem, 32GB), ...) PCI 64-bit BAR allocation * * On arm64 the guest physical memory map looks like this: * [0x0DF00000, 0x10000000) PCI I/O memory * [0xA0000000, 0xE0000000) PCI 32-bit BAR allocation * [0xE0000000, 0xF0000000) PCI extended config window * [4GB, 4GB + highmem) guest system memory * [roundup(4GB + highmem, 32GB), ...) PCI 64-bit BAR allocation * * "lowmem" is guest memory below 0xC0000000. amd64 guests provisioned * with less than 3GB of RAM will have no memory above the 4GB boundary. * System memory for arm64 guests is all above the 4GB boundary. */ /* * Accesses to memory addresses that are not allocated to system * memory or PCI devices return 0xff's. */ lowmem = vm_get_lowmem_size(ctx); bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI hole"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = lowmem; mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem; mr.handler = pci_emul_fallback_handler; error = register_mem_fallback(&mr); assert(error == 0); /* PCI extended config space */ bzero(&mr, sizeof(struct mem_range)); mr.name = "PCI ECFG"; mr.flags = MEM_F_RW | MEM_F_IMMUTABLE; mr.base = PCI_EMUL_ECFG_BASE; mr.size = PCI_EMUL_ECFG_SIZE; mr.handler = pci_emul_ecfg_handler; error = register_mem(&mr); assert(error == 0); return (0); } #ifdef __amd64__ static void pci_apic_prt_entry(int bus __unused, int slot, int pin, struct pci_irq *irq, void *arg __unused) { dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" Zero,"); dsdt_line(" 0x%X", irq->ioapic_irq); dsdt_line(" },"); } static void pci_pirq_prt_entry(int bus __unused, int slot, int pin, struct pci_irq *irq, void *arg __unused) { char *name; name = lpc_pirq_name(irq->pirq_pin); if (name == NULL) return; dsdt_line(" Package ()"); dsdt_line(" {"); dsdt_line(" 0x%X,", slot << 16 | 0xffff); dsdt_line(" 0x%02X,", pin - 1); dsdt_line(" %s,", name); dsdt_line(" 0x00"); dsdt_line(" },"); free(name); } #endif /* * A bhyve virtual machine has a flat PCI hierarchy with a root port * corresponding to each PCI bus. 
*/ static void pci_bus_write_dsdt(int bus) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; int func, slot; /* * If there are no devices on this 'bus' then just return. */ if ((bi = pci_businfo[bus]) == NULL) { /* * Bus 0 is special because it decodes the I/O ports used * for PCI config space access even if there are no devices * on it. */ if (bus != 0) return; } dsdt_line(" Device (PC%02X)", bus); dsdt_line(" {"); dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))"); dsdt_line(" Method (_BBN, 0, NotSerialized)"); dsdt_line(" {"); dsdt_line(" Return (0x%08X)", bus); dsdt_line(" }"); dsdt_line(" Name (_CRS, ResourceTemplate ()"); dsdt_line(" {"); dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, " "MaxFixed, PosDecode,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bus); dsdt_line(" 0x%04X, // Range Maximum", bus); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0001, // Length"); dsdt_line(" ,, )"); #ifdef __amd64__ if (bus == 0) { dsdt_indent(3); dsdt_fixed_ioport(0xCF8, 8); dsdt_unindent(3); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0000, // Range Minimum"); dsdt_line(" 0x0CF7, // Range Maximum"); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x0CF8, // Length"); dsdt_line(" ,, , TypeStatic)"); dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x0D00, // Range Minimum"); dsdt_line(" 0x%04X, // Range Maximum", PCI_EMUL_IOBASE - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", PCI_EMUL_IOBASE - 0x0D00); dsdt_line(" ,, , TypeStatic)"); if (bi == NULL) { dsdt_line(" })"); goto done; } } #endif assert(bi != NULL); /* i/o window */ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, " "PosDecode, EntireRange,"); dsdt_line(" 0x0000, // Granularity"); dsdt_line(" 0x%04X, // Range Minimum", bi->iobase); dsdt_line(" 0x%04X, // Range Maximum", bi->iolimit - 1); dsdt_line(" 0x0000, // Translation Offset"); dsdt_line(" 0x%04X, // Length", bi->iolimit - bi->iobase); dsdt_line(" ,, , TypeStatic)"); /* mmio window (32-bit) */ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x00000000, // Granularity"); dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32); dsdt_line(" 0x%08X, // Range Maximum\n", bi->memlimit32 - 1); dsdt_line(" 0x00000000, // Translation Offset"); dsdt_line(" 0x%08X, // Length\n", bi->memlimit32 - bi->membase32); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); /* mmio window (64-bit) */ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, " "MinFixed, MaxFixed, NonCacheable, ReadWrite,"); dsdt_line(" 0x0000000000000000, // Granularity"); dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64); dsdt_line(" 0x%016lX, // Range Maximum\n", bi->memlimit64 - 1); dsdt_line(" 0x0000000000000000, // Translation Offset"); dsdt_line(" 0x%016lX, // Length\n", bi->memlimit64 - bi->membase64); dsdt_line(" ,, , AddressRangeMemory, TypeStatic)"); dsdt_line(" })"); #ifdef __amd64__ if (pci_count_lintr(bus) != 0) { dsdt_indent(2); dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If 
(PICM)"); dsdt_line(" {"); dsdt_line(" Return (APRT)"); dsdt_line(" }"); dsdt_line(" Else"); dsdt_line(" {"); dsdt_line(" Return (PPRT)"); dsdt_line(" }"); dsdt_line("}"); dsdt_unindent(2); } #endif dsdt_indent(2); for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (func = 0; func < MAXFUNCS; func++) { pi = si->si_funcs[func].fi_devi; if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL) pi->pi_d->pe_write_dsdt(pi); } } dsdt_unindent(2); #ifdef __amd64__ done: #endif dsdt_line(" }"); } void pci_write_dsdt(void) { int bus; dsdt_indent(1); dsdt_line("Name (PICM, 0x00)"); dsdt_line("Method (_PIC, 1, NotSerialized)"); dsdt_line("{"); dsdt_line(" Store (Arg0, PICM)"); dsdt_line("}"); dsdt_line(""); dsdt_line("Scope (_SB)"); dsdt_line("{"); for (bus = 0; bus < MAXBUSES; bus++) pci_bus_write_dsdt(bus); dsdt_line("}"); dsdt_unindent(1); } int pci_bus_configured(int bus) { assert(bus >= 0 && bus < MAXBUSES); return (pci_businfo[bus] != NULL); } int pci_msi_enabled(struct pci_devinst *pi) { return (pi->pi_msi.enabled); } int pci_msi_maxmsgnum(struct pci_devinst *pi) { if (pi->pi_msi.enabled) return (pi->pi_msi.maxmsgnum); else return (0); } int pci_msix_enabled(struct pci_devinst *pi) { return (pi->pi_msix.enabled && !pi->pi_msi.enabled); } void pci_generate_msix(struct pci_devinst *pi, int index) { struct msix_table_entry *mte; if (!pci_msix_enabled(pi)) return; if (pi->pi_msix.function_mask) return; if (index >= pi->pi_msix.table_count) return; mte = &pi->pi_msix.table[index]; if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { /* XXX Set PBA bit if interrupt is disabled */ vm_raise_msi(pi->pi_vmctx, mte->addr, mte->msg_data, pi->pi_bus, pi->pi_slot, pi->pi_func); } } void pci_generate_msi(struct pci_devinst *pi, int index) { if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) { vm_raise_msi(pi->pi_vmctx, pi->pi_msi.addr, pi->pi_msi.msg_data + index, pi->pi_bus, pi->pi_slot, pi->pi_func); } } static bool pci_lintr_permitted(struct pci_devinst *pi) { uint16_t cmd; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); return (!(pi->pi_msi.enabled || pi->pi_msix.enabled || (cmd & PCIM_CMD_INTxDIS))); } void pci_lintr_request(struct pci_devinst *pi) { struct businfo *bi; struct slotinfo *si; int bestpin, bestcount, pin; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); /* * Just allocate a pin from our slot. The pin will be * assigned IRQs later when interrupts are routed. 
*/ si = &bi->slotinfo[pi->pi_slot]; bestpin = 0; bestcount = si->si_intpins[0].ii_count; for (pin = 1; pin < 4; pin++) { if (si->si_intpins[pin].ii_count < bestcount) { bestpin = pin; bestcount = si->si_intpins[pin].ii_count; } } si->si_intpins[bestpin].ii_count++; pi->pi_lintr.pin = bestpin + 1; pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1); } static void pci_lintr_route(struct pci_devinst *pi) { struct businfo *bi; struct intxinfo *ii; struct pci_irq *irq; if (pi->pi_lintr.pin == 0) return; bi = pci_businfo[pi->pi_bus]; assert(bi != NULL); ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1]; irq = &ii->ii_irq; pci_irq_route(pi, irq); pi->pi_lintr.irq = *irq; pci_set_cfgdata8(pi, PCIR_INTLINE, pci_irq_intline(irq)); } void pci_lintr_assert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == IDLE) { if (pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } else pi->pi_lintr.state = PENDING; } pthread_mutex_unlock(&pi->pi_lintr.lock); } void pci_lintr_deassert(struct pci_devinst *pi) { assert(pi->pi_lintr.pin > 0); pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED) { pi->pi_lintr.state = IDLE; pci_irq_deassert(pi); } else if (pi->pi_lintr.state == PENDING) pi->pi_lintr.state = IDLE; pthread_mutex_unlock(&pi->pi_lintr.lock); } static void pci_lintr_update(struct pci_devinst *pi) { pthread_mutex_lock(&pi->pi_lintr.lock); if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) { pci_irq_deassert(pi); pi->pi_lintr.state = PENDING; } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) { pi->pi_lintr.state = ASSERTED; pci_irq_assert(pi); } pthread_mutex_unlock(&pi->pi_lintr.lock); } int pci_count_lintr(int bus) { int count, slot, pin; struct slotinfo *slotinfo; count = 0; if (pci_businfo[bus] != NULL) { for (slot = 0; slot < MAXSLOTS; slot++) { slotinfo = &pci_businfo[bus]->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { if (slotinfo->si_intpins[pin].ii_count != 0) count++; } } } return (count); } void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg) { struct businfo *bi; struct slotinfo *si; struct intxinfo *ii; int slot, pin; if ((bi = pci_businfo[bus]) == NULL) return; for (slot = 0; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; for (pin = 0; pin < 4; pin++) { ii = &si->si_intpins[pin]; if (ii->ii_count != 0) cb(bus, slot, pin + 1, &ii->ii_irq, arg); } } } /* * Return 1 if the emulated device in 'slot' is a multi-function device. * Return 0 otherwise. */ static int pci_emul_is_mfdev(int bus, int slot) { struct businfo *bi; struct slotinfo *si; int f, numfuncs; numfuncs = 0; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; for (f = 0; f < MAXFUNCS; f++) { if (si->si_funcs[f].fi_devi != NULL) { numfuncs++; } } } return (numfuncs > 1); } /* * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on * whether or not a multi-function device is being emulated in the pci 'slot'. */ static void pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) { int mfdev; if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) { mfdev = pci_emul_is_mfdev(bus, slot); switch (bytes) { case 1: case 2: *rv &= ~PCIM_MFDEV; if (mfdev) { *rv |= PCIM_MFDEV; } break; case 4: *rv &= ~(PCIM_MFDEV << 16); if (mfdev) { *rv |= (PCIM_MFDEV << 16); } break; } } } /* * Update device state in response to changes to the PCI command * register.
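 *
 * Illustrative sequence (values hypothetical): with decoding initially
 * off, a guest enabling memory space writes PCIM_CMD_MEMEN (0x0002) to
 * PCIR_COMMAND, so
 *
 *   changed = old ^ new = 0x0000 ^ 0x0002 = 0x0002
 *
 * and the loop below registers every MEM32/MEM64 BAR (plus an enabled
 * ROM) with the MMIO handler; clearing the bit later unregisters them
 * again. PCIM_CMD_PORTEN (0x0001) drives the I/O BARs the same way.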
*/ void pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old) { int i; uint16_t changed, new; new = pci_get_cfgdata16(pi, PCIR_COMMAND); changed = old ^ new; /* * If the MMIO or I/O address space decoding has changed then * register/unregister all BARs that decode that address space. */ for (i = 0; i <= PCI_BARMAX_WITH_ROM; i++) { switch (pi->pi_bar[i].type) { case PCIBAR_NONE: case PCIBAR_MEMHI64: break; case PCIBAR_IO: /* I/O address space decoding changed? */ if (changed & PCIM_CMD_PORTEN) { if (new & PCIM_CMD_PORTEN) register_bar(pi, i); else unregister_bar(pi, i); } break; case PCIBAR_ROM: /* skip (un-)register of ROM if it is disabled */ if (!romen(pi)) break; /* fallthrough */ case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ if (changed & PCIM_CMD_MEMEN) { if (new & PCIM_CMD_MEMEN) register_bar(pi, i); else unregister_bar(pi, i); } break; default: assert(0); } } /* * If INTx has been unmasked and is pending, assert the * interrupt. */ pci_lintr_update(pi); } static void pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) { int rshift; uint32_t cmd, old, readonly; cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ /* * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. * * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are * 'write 1 to clear'. However these bits are not set to '1' by * any device emulation so it is simpler to treat them as readonly. */ rshift = (coff & 0x3) * 8; readonly = 0xFFFFF880 >> rshift; old = CFGREAD(pi, coff, bytes); new &= ~readonly; new |= (old & readonly); CFGWRITE(pi, coff, new, bytes); /* update config */ pci_emul_cmd_changed(pi, cmd); } static void pci_cfgrw(int in, int bus, int slot, int func, int coff, int bytes, uint32_t *valp) { struct businfo *bi; struct slotinfo *si; struct pci_devinst *pi; struct pci_devemu *pe; int idx, needcfg; uint64_t addr, bar, mask; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; pi = si->si_funcs[func].fi_devi; } else pi = NULL; /* * Just return if there is no device at this slot:func or if the * guest is doing an un-aligned access. */ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) || (coff & (bytes - 1)) != 0) { if (in) *valp = 0xffffffff; return; } /* * Ignore all writes beyond the standard config space and return all * ones on reads. */ if (coff >= PCI_REGMAX + 1) { if (in) { *valp = 0xffffffff; /* * Extended capabilities begin at offset 256 in config * space. Absence of extended capabilities is signaled * with all 0s in the extended capability header at * offset 256. */ if (coff <= PCI_REGMAX + 4) *valp = 0x00000000; } return; } pe = pi->pi_d; /* * Config read */ if (in) { /* Let the device emulation override the default handler */ if (pe->pe_cfgread != NULL) { needcfg = pe->pe_cfgread(pi, coff, bytes, valp); } else { needcfg = 1; } if (needcfg) *valp = CFGREAD(pi, coff, bytes); pci_emul_hdrtype_fixup(bus, slot, coff, bytes, valp); } else { /* Let the device emulation override the default handler */ if (pe->pe_cfgwrite != NULL && (*pe->pe_cfgwrite)(pi, coff, bytes, *valp) == 0) return; /* * Special handling for write to BAR and ROM registers */ if (is_pcir_bar(coff) || is_pcir_bios(coff)) { /* * Ignore writes to BAR registers that are not * 4-byte aligned.
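 *
 * The mask logic that follows is what makes the standard BAR sizing
 * probe work. A guest sizes a BAR by writing all-ones and reading the
 * register back: the emulation stores *valp & ~(size - 1) plus the
 * low type bits, so for a hypothetical 4KB memory BAR the guest reads
 * back 0xfffff000, clears the type bits, and computes
 * ~0xfffff000 + 1 = 0x1000 as the BAR size.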
*/ if (bytes != 4 || (coff & 0x3) != 0) return; if (is_pcir_bar(coff)) { idx = (coff - PCIR_BAR(0)) / 4; } else if (is_pcir_bios(coff)) { idx = PCI_ROM_IDX; } else { errx(4, "%s: invalid BAR offset %d", __func__, coff); } mask = ~(pi->pi_bar[idx].size - 1); switch (pi->pi_bar[idx].type) { case PCIBAR_NONE: pi->pi_bar[idx].addr = bar = 0; break; case PCIBAR_IO: addr = *valp & mask; #if defined(PCI_EMUL_IOMASK) addr &= PCI_EMUL_IOMASK; #endif bar = addr | pi->pi_bar[idx].lobits; /* * Register the new BAR value for interception */ if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_IO); } break; case PCIBAR_MEM32: addr = bar = *valp & mask; bar |= pi->pi_bar[idx].lobits; if (addr != pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM32); } break; case PCIBAR_MEM64: addr = bar = *valp & mask; bar |= pi->pi_bar[idx].lobits; if (addr != (uint32_t)pi->pi_bar[idx].addr) { update_bar_address(pi, addr, idx, PCIBAR_MEM64); } break; case PCIBAR_MEMHI64: mask = ~(pi->pi_bar[idx - 1].size - 1); addr = ((uint64_t)*valp << 32) & mask; bar = addr >> 32; if (bar != pi->pi_bar[idx - 1].addr >> 32) { update_bar_address(pi, addr, idx - 1, PCIBAR_MEMHI64); } break; case PCIBAR_ROM: addr = bar = *valp & mask; if (memen(pi) && romen(pi)) { unregister_bar(pi, idx); } pi->pi_bar[idx].addr = addr; pi->pi_bar[idx].lobits = *valp & PCIM_BIOS_ENABLE; /* romen could have changed its value */ if (memen(pi) && romen(pi)) { register_bar(pi, idx); } bar |= pi->pi_bar[idx].lobits; break; default: assert(0); } pci_set_cfgdata32(pi, coff, bar); } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *valp, 0, 0); } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { pci_emul_cmdsts_write(pi, coff, *valp, bytes); } else { CFGWRITE(pi, coff, *valp, bytes); } } } #ifdef __amd64__ static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff; static int pci_emul_cfgaddr(struct vmctx *ctx __unused, int in, int port __unused, int bytes, uint32_t *eax, void *arg __unused) { uint32_t x; if (bytes != 4) { if (in) *eax = (bytes == 2) ? 0xffff : 0xff; return (0); } if (in) { x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff; if (cfgenable) x |= CONF1_ENABLE; *eax = x; } else { x = *eax; cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE; cfgoff = (x & PCI_REGMAX) & ~0x03; cfgfunc = (x >> 8) & PCI_FUNCMAX; cfgslot = (x >> 11) & PCI_SLOTMAX; cfgbus = (x >> 16) & PCI_BUSMAX; } return (0); } INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr); static int pci_emul_cfgdata(struct vmctx *ctx __unused, int in, int port, int bytes, uint32_t *eax, void *arg __unused) { int coff; assert(bytes == 1 || bytes == 2 || bytes == 4); coff = cfgoff + (port - CONF1_DATA_PORT); if (cfgenable) { pci_cfgrw(in, cfgbus, cfgslot, cfgfunc, coff, bytes, eax); } else { /* Ignore accesses to cfgdata if not enabled by cfgaddr */ if (in) *eax = 0xffffffff; } return (0); } INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); #endif #ifdef BHYVE_SNAPSHOT /* * Saves/restores PCI device emulated state. Returns 0 on success.
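 *
 * Each SNAPSHOT_*_OR_LEAVE() line below serializes or restores one
 * field and bails out to the 'done' label on error; conceptually (a
 * hedged sketch, not the literal macro expansion) every use behaves
 * like:
 *
 *   ret = vm_snapshot_buf(&field, sizeof(field), meta);
 *   if (ret != 0)
 *       goto done;
 *
 * with 'meta' describing the direction (save vs. restore) and the
 * backing buffer.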
*/ static int pci_snapshot_pci_dev(struct vm_snapshot_meta *meta) { struct pci_devinst *pi; int i; int ret; pi = meta->dev_data; SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done); SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata), meta, ret, done); for (i = 0; i < (int)nitems(pi->pi_bar); i++) { SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done); } /* Restore MSI-X table. */ for (i = 0; i < pi->pi_msix.table_count; i++) { SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data, meta, ret, done); SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control, meta, ret, done); } done: return (ret); } int pci_snapshot(struct vm_snapshot_meta *meta) { struct pci_devemu *pde; struct pci_devinst *pdi; int ret; assert(meta->dev_name != NULL); pdi = meta->dev_data; pde = pdi->pi_d; if (pde->pe_snapshot == NULL) return (ENOTSUP); ret = pci_snapshot_pci_dev(meta); if (ret == 0) ret = (*pde->pe_snapshot)(meta); return (ret); } int pci_pause(struct pci_devinst *pdi) { struct pci_devemu *pde = pdi->pi_d; if (pde->pe_pause == NULL) { /* The pause/resume functionality is optional. */ return (0); } return (*pde->pe_pause)(pdi); } int pci_resume(struct pci_devinst *pdi) { struct pci_devemu *pde = pdi->pi_d; if (pde->pe_resume == NULL) { /* The pause/resume functionality is optional. 
*/ return (0); } return (*pde->pe_resume)(pdi); } #endif #define PCI_EMUL_TEST #ifdef PCI_EMUL_TEST /* * Define a dummy test device */ #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { uint8_t ioregs[DIOSZ]; uint8_t memregs[2][DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 #define PCI_EMUL_MSIX_MSGS 16 static int pci_emul_dinit(struct pci_devinst *pi, nvlist_t *nvl __unused) { int error; struct pci_emul_dsoftc *sc; sc = calloc(1, sizeof(struct pci_emul_dsoftc)); pi->pi_arg = sc; pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001); pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD); pci_set_cfgdata8(pi, PCIR_CLASS, 0x02); error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS); assert(error == 0); error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); assert(error == 0); return (0); } static void pci_emul_diow(struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value) { int i; struct pci_emul_dsoftc *sc = pi->pi_arg; if (baridx == 0) { if (offset + size > DIOSZ) { printf("diow: iow too large, offset %ld size %d\n", offset, size); return; } if (size == 1) { sc->ioregs[offset] = value & 0xff; } else if (size == 2) { *(uint16_t *)&sc->ioregs[offset] = value & 0xffff; } else if (size == 4) { *(uint32_t *)&sc->ioregs[offset] = value; } else { printf("diow: iow unknown size %d\n", size); } /* * Special magic value to generate an interrupt */ if (offset == 4 && size == 4 && pci_msi_enabled(pi)) pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi)); if (value == 0xabcdef) { for (i = 0; i < pci_msi_maxmsgnum(pi); i++) pci_generate_msi(pi, i); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("diow: memw too large, offset %ld size %d\n", offset, size); return; } i = baridx - 1; /* 'memregs' index */ if (size == 1) { sc->memregs[i][offset] = value; } else if (size == 2) { *(uint16_t *)&sc->memregs[i][offset] = value; } else if (size == 4) { *(uint32_t *)&sc->memregs[i][offset] = value; } else if (size == 8) { *(uint64_t *)&sc->memregs[i][offset] = value; } else { printf("diow: memw unknown size %d\n", size); } /* * magic interrupt ?? 
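 *
 * (Nothing is wired up here: only the I/O BAR path above has a magic
 * write. There, a 4-byte write of value V to offset 4 raises MSI
 * message V % pci_msi_maxmsgnum(), and the magic value 0xabcdef fires
 * every configured message in turn - e.g. a guest doing a 32-bit
 * write of 2 to BAR0 offset 4 sees MSI vector 2, assuming MSI has
 * been enabled.)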
*/ } if (baridx > 2 || baridx < 0) { printf("diow: unknown bar idx %d\n", baridx); } } static uint64_t pci_emul_dior(struct pci_devinst *pi, int baridx, uint64_t offset, int size) { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; int i; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } value = 0; if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { value = *(uint16_t *) &sc->ioregs[offset]; } else if (size == 4) { value = *(uint32_t *) &sc->ioregs[offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } i = baridx - 1; /* 'memregs' index */ if (size == 1) { value = sc->memregs[i][offset]; } else if (size == 2) { value = *(uint16_t *) &sc->memregs[i][offset]; } else if (size == 4) { value = *(uint32_t *) &sc->memregs[i][offset]; } else if (size == 8) { value = *(uint64_t *) &sc->memregs[i][offset]; } else { printf("dior: ior unknown size %d\n", size); } } if (baridx > 2 || baridx < 0) { printf("dior: unknown bar idx %d\n", baridx); return (0); } return (value); } #ifdef BHYVE_SNAPSHOT struct pci_devinst * pci_next(const struct pci_devinst *cursor) { unsigned bus = 0, slot = 0, func = 0; struct businfo *bi; struct slotinfo *si; struct funcinfo *fi; bus = cursor ? cursor->pi_bus : 0; slot = cursor ? cursor->pi_slot : 0; func = cursor ? (cursor->pi_func + 1) : 0; for (; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; if (slot >= MAXSLOTS) slot = 0; for (; slot < MAXSLOTS; slot++) { si = &bi->slotinfo[slot]; if (func >= MAXFUNCS) func = 0; for (; func < MAXFUNCS; func++) { fi = &si->si_funcs[func]; if (fi->fi_devi == NULL) continue; return (fi->fi_devi); } } } return (NULL); } static int pci_emul_snapshot(struct vm_snapshot_meta *meta __unused) { return (0); } #endif static const struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, .pe_barwrite = pci_emul_diow, .pe_barread = pci_emul_dior, #ifdef BHYVE_SNAPSHOT .pe_snapshot = pci_emul_snapshot, #endif }; PCI_EMUL_SET(pci_dummy); #endif /* PCI_EMUL_TEST */ diff --git a/usr.sbin/bhyve/pci_irq.h b/usr.sbin/bhyve/pci_irq.h index 8b556ddc91a2..8078583add52 100644 --- a/usr.sbin/bhyve/pci_irq.h +++ b/usr.sbin/bhyve/pci_irq.h @@ -1,47 +1,49 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef __PCI_IRQ_H__ #define __PCI_IRQ_H__ struct pci_devinst; #if defined(__amd64__) #include "amd64/pci_irq_machdep.h" #elif defined(__aarch64__) #include "aarch64/pci_irq_machdep.h" +#elif defined(__riscv) +#include "riscv/pci_irq_machdep.h" #else #error Unsupported platform #endif void pci_irq_assert(struct pci_devinst *pi); void pci_irq_deassert(struct pci_devinst *pi); void pci_irq_route(struct pci_devinst *pi, struct pci_irq *irq); #endif diff --git a/usr.sbin/bhyve/riscv/Makefile.inc b/usr.sbin/bhyve/riscv/Makefile.inc new file mode 100644 index 000000000000..2b57201c6259 --- /dev/null +++ b/usr.sbin/bhyve/riscv/Makefile.inc @@ -0,0 +1,7 @@ +SRCS+= \ + fdt.c + +.PATH: ${BHYVE_SYSDIR}/sys/riscv/vmm +SRCS+= vmm_instruction_emul.c + +BHYVE_FDT_SUPPORT= diff --git a/usr.sbin/bhyve/riscv/bhyverun_machdep.c b/usr.sbin/bhyve/riscv/bhyverun_machdep.c new file mode 100644 index 000000000000..39d6a7cdf231 --- /dev/null +++ b/usr.sbin/bhyve/riscv/bhyverun_machdep.c @@ -0,0 +1,357 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * Copyright (c) 2024 Ruslan Bukin + * + * This software was developed by the University of Cambridge Computer + * Laboratory (Department of Computer Science and Technology) under Innovate + * UK project 105694, "Digital Security by Design (DSbD) Technology Platform + * Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "bhyverun.h" +#include "config.h" +#include "debug.h" +#include "fdt.h" +#include "mem.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "uart_emul.h" +#include "riscv.h" + +#define FDT_SIZE (64 * 1024) +#define FDT_DTB_ALIGN 8 + +/* Start of lowmem + 64K */ +#define UART_MMIO_BASE 0x10000 +#define UART_MMIO_SIZE 0x1000 +#define UART_INTR 1 + +#define APLIC_MEM_BASE 0x2f000000 +#define APLIC_MEM_SIZE 0x10000 + +#define PCIE_INTA 2 +#define PCIE_INTB 3 +#define PCIE_INTC 4 +#define PCIE_INTD 5 + +void +bhyve_init_config(void) +{ + init_config(); + + /* Set default values prior to option parsing. */ + set_config_bool("acpi_tables", false); + set_config_bool("acpi_tables_in_memory", false); + set_config_value("memory.size", "256M"); +} + +void +bhyve_usage(int code) +{ + const char *progname; + + progname = getprogname(); + + fprintf(stderr, + "Usage: %s [-CDHhSW]\n" + " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" + " %*s [-k config_file] [-m mem] [-o var=value]\n" + " %*s [-p vcpu:hostcpu] [-r file] [-s pci] [-U uuid] vmname\n" + " -C: include guest memory in core file\n" + " -c: number of CPUs and/or topology specification\n" + " -D: destroy on power-off\n" + " -h: help\n" + " -k: key=value flat config file\n" + " -m: memory size\n" + " -o: set config 'var' to 'value'\n" + " -p: pin 'vcpu' to 'hostcpu'\n" + " -S: guest memory cannot be swapped\n" + " -s: PCI slot config\n" + " -U: UUID\n" + " -W: force virtio to use single-vector MSI\n", + progname, (int)strlen(progname), "", (int)strlen(progname), "", + (int)strlen(progname), ""); + exit(code); +} + +void +bhyve_optparse(int argc, char **argv) +{ + const char *optstr; + int c; + + optstr = "hCDSWk:f:o:p:c:s:m:U:"; + while ((c = getopt(argc, argv, optstr)) != -1) { + switch (c) { + case 'c': + if (bhyve_topology_parse(optarg) != 0) { + errx(EX_USAGE, "invalid cpu topology '%s'", + optarg); + } + break; + case 'C': + set_config_bool("memory.guest_in_core", true); + break; + case 'D': + set_config_bool("destroy_on_poweroff", true); + break; + case 'k': + bhyve_parse_simple_config_file(optarg); + break; + case 'm': + set_config_value("memory.size", optarg); + break; + case 'o': + if (!bhyve_parse_config_option(optarg)) { + errx(EX_USAGE, + "invalid configuration option '%s'", + optarg); + } + break; + case 'p': + if (bhyve_pincpu_parse(optarg) != 0) { + errx(EX_USAGE, + "invalid vcpu pinning configuration '%s'", + optarg); + } + break; + case 's': + if (strncmp(optarg, "help", strlen(optarg)) == 0) { + pci_print_supported_devices(); + exit(0); + } else if (pci_parse_slot(optarg) != 0) + exit(4); + else + break; + case 'S': + set_config_bool("memory.wired", true); + break; + case 'U': + set_config_value("uuid", optarg); + break; + case 'W': + set_config_bool("virtio_msix", false); + break; + case 'h': + bhyve_usage(0); + default: + bhyve_usage(1); + } + } +} + +void +bhyve_init_vcpu(struct vcpu *vcpu __unused) +{ +} + +void +bhyve_start_vcpu(struct vcpu *vcpu, bool bsp __unused) +{ + int error; + + /* Set hart ID. */ + error = vm_set_register(vcpu, VM_REG_GUEST_A0, vcpu_id(vcpu)); + assert(error == 0); + + fbsdrun_addcpu(vcpu_id(vcpu)); +} + +/* + * Load the specified boot code at the beginning of high memory. 
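+ *
+ * A hypothetical invocation exercising this path (file names are
+ * placeholders; "bootrom" and "console" are the config keys consumed
+ * below and in init_mmio_uart()):
+ *
+ *   bhyve -m 512M -o bootrom=./fw_jump.bin -o console=stdio \
+ *       -s 0:0,hostbridge -s 1:0,virtio-blk,./disk.img myvm
+ *
+ * The image is mapped, copied to the base of guest high memory, and
+ * the initial SEPC is pointed at it.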
+ */ +static void +load_bootrom(struct vmctx *ctx, const char *path, uint64_t *elrp, + uint64_t *lenp) +{ + struct stat sb; + void *data, *gptr; + vm_paddr_t loadaddr; + off_t size; + int fd; + + fd = open(path, O_RDONLY); + if (fd < 0) + err(1, "open(%s)", path); + if (fstat(fd, &sb) != 0) + err(1, "fstat(%s)", path); + + size = sb.st_size; + + loadaddr = vm_get_highmem_base(ctx); + gptr = vm_map_gpa(ctx, loadaddr, round_page(size)); + + data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + if (data == MAP_FAILED) + err(1, "mmap(%s)", path); + (void)close(fd); + memcpy(gptr, data, size); + + if (munmap(data, size) != 0) + err(1, "munmap(%s)", path); + + *elrp = loadaddr; + *lenp = size; +} + +static void +mmio_uart_intr_assert(void *arg) +{ + struct vmctx *ctx = arg; + + vm_assert_irq(ctx, UART_INTR); +} + +static void +mmio_uart_intr_deassert(void *arg) +{ + struct vmctx *ctx = arg; + + vm_deassert_irq(ctx, UART_INTR); +} + +static int +mmio_uart_mem_handler(struct vcpu *vcpu __unused, int dir, uint64_t addr, + int size __unused, uint64_t *val, void *arg1, long arg2) +{ + struct uart_ns16550_softc *sc = arg1; + long reg; + + reg = addr - arg2; + if (dir == MEM_F_WRITE) + uart_ns16550_write(sc, reg, *val); + else + *val = uart_ns16550_read(sc, reg); + + return (0); +} + +static bool +init_mmio_uart(struct vmctx *ctx) +{ + struct uart_ns16550_softc *sc; + struct mem_range mr; + const char *path; + int error; + + path = get_config_value("console"); + if (path == NULL) + return (false); + + sc = uart_ns16550_init(mmio_uart_intr_assert, mmio_uart_intr_deassert, + ctx); + if (uart_ns16550_tty_open(sc, path) != 0) { + EPRINTLN("Unable to initialize backend '%s' for mmio uart", + path); + assert(0); + } + + bzero(&mr, sizeof(struct mem_range)); + mr.name = "uart"; + mr.base = UART_MMIO_BASE; + mr.size = UART_MMIO_SIZE; + mr.flags = MEM_F_RW; + mr.handler = mmio_uart_mem_handler; + mr.arg1 = sc; + mr.arg2 = mr.base; + error = register_mem(&mr); + assert(error == 0); + + return (true); +} + +int +bhyve_init_platform(struct vmctx *ctx, struct vcpu *bsp) +{ + const char *bootrom; + uint64_t elr; + uint64_t len; + int error; + int pcie_intrs[4] = {PCIE_INTA, PCIE_INTB, PCIE_INTC, PCIE_INTD}; + vm_paddr_t fdt_gpa; + + bootrom = get_config_value("bootrom"); + if (bootrom == NULL) { + warnx("no bootrom specified"); + return (ENOENT); + } + load_bootrom(ctx, bootrom, &elr, &len); + error = vm_set_register(bsp, VM_REG_GUEST_SEPC, elr); + if (error != 0) { + warn("vm_set_register(GUEST_SEPC)"); + return (error); + } + + fdt_gpa = vm_get_highmem_base(ctx) + roundup2(len, FDT_DTB_ALIGN); + error = fdt_init(ctx, guest_ncpus, fdt_gpa, FDT_SIZE); + if (error != 0) + return (error); + + /* Set FDT base address to the bootable hart. 
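+ *
+ * After this store, the boot contract seen by the guest is:
+ *
+ *   sepc = vm_get_highmem_base(ctx)             bootrom entry point
+ *   a0   = hart ID (set in bhyve_start_vcpu())
+ *   a1   = highmem base + roundup2(len, 8)      DTB address
+ *
+ * matching the usual RISC-V supervisor boot convention of
+ * "a0 = hartid, a1 = device tree" expected by SBI firmware and the
+ * kernels it loads.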
*/ + error = vm_set_register(bsp, VM_REG_GUEST_A1, fdt_gpa); + assert(error == 0); + + fdt_add_aplic(APLIC_MEM_BASE, APLIC_MEM_SIZE); + error = vm_attach_aplic(ctx, APLIC_MEM_BASE, APLIC_MEM_SIZE); + if (error != 0) { + warn("vm_attach_aplic()"); + return (error); + } + + if (init_mmio_uart(ctx)) + fdt_add_uart(UART_MMIO_BASE, UART_MMIO_SIZE, UART_INTR); + + pci_irq_init(pcie_intrs); + fdt_add_pcie(pcie_intrs); + vmexit_set_bsp(vcpu_id(bsp)); + + return (0); +} + +int +bhyve_init_platform_late(struct vmctx *ctx __unused, struct vcpu *bsp __unused) +{ + + fdt_finalize(); + + return (0); +} diff --git a/usr.sbin/bhyve/riscv/fdt.c b/usr.sbin/bhyve/riscv/fdt.c new file mode 100644 index 000000000000..54b75c68ea76 --- /dev/null +++ b/usr.sbin/bhyve/riscv/fdt.c @@ -0,0 +1,326 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022 The FreeBSD Foundation + * Copyright (c) 2024 Ruslan Bukin + * + * This software was developed by Andrew Turner under sponsorship from + * the FreeBSD Foundation. + * + * This software was developed by the University of Cambridge Computer + * Laboratory (Department of Computer Science and Technology) under Innovate + * UK project 105694, "Digital Security by Design (DSbD) Technology Platform + * Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include + +#include +#include +#include +#include + +#include +#include + +#include "config.h" +#include "bhyverun.h" +#include "fdt.h" + +#define SET_PROP_U32(prop, idx, val) \ + ((uint32_t *)(prop))[(idx)] = cpu_to_fdt32(val) +#define SET_PROP_U64(prop, idx, val) \ + ((uint64_t *)(prop))[(idx)] = cpu_to_fdt64(val) + +#define IRQ_TYPE_LEVEL_HIGH 4 +#define IRQ_TYPE_LEVEL_LOW 8 + +static void *fdtroot; +static uint32_t aplic_phandle = 0; +static uint32_t intc0_phandle = 0; + +static uint32_t +assign_phandle(void *fdt) +{ + static uint32_t next_phandle = 1; + uint32_t phandle; + + phandle = next_phandle; + next_phandle++; + fdt_property_u32(fdt, "phandle", phandle); + + return (phandle); +} + +static void +set_single_reg(void *fdt, uint64_t start, uint64_t len) +{ + void *reg; + + fdt_property_placeholder(fdt, "reg", 2 * sizeof(uint64_t), &reg); + SET_PROP_U64(reg, 0, start); + SET_PROP_U64(reg, 1, len); +} + +static void +add_cpu(void *fdt, int cpuid) +{ + char node_name[16]; + + snprintf(node_name, sizeof(node_name), "cpu@%d", cpuid); + + fdt_begin_node(fdt, node_name); + fdt_property_string(fdt, "device_type", "cpu"); + fdt_property_string(fdt, "compatible", "riscv"); + fdt_property_u32(fdt, "reg", cpuid); + fdt_property_string(fdt, "riscv,isa", "rv64imafdc_sstc"); + fdt_property_string(fdt, "mmu-type", "riscv,sv39"); + fdt_property_string(fdt, "clock-frequency", "1000000000"); + + fdt_begin_node(fdt, "interrupt-controller"); + intc0_phandle = assign_phandle(fdt); + fdt_property_u32(fdt, "#address-cells", 2); + fdt_property_u32(fdt, "#interrupt-cells", 1); + fdt_property(fdt, "interrupt-controller", NULL, 0); + fdt_property_string(fdt, "compatible", "riscv,cpu-intc"); + fdt_end_node(fdt); + + fdt_end_node(fdt); +} + +static void +add_cpus(void *fdt, int ncpu) +{ + int cpuid; + + fdt_begin_node(fdt, "cpus"); + /* XXX: Needed given the root #address-cells? */ + fdt_property_u32(fdt, "#address-cells", 1); + fdt_property_u32(fdt, "#size-cells", 0); + fdt_property_u32(fdt, "timebase-frequency", 10000000); + + for (cpuid = 0; cpuid < ncpu; cpuid++) { + add_cpu(fdt, cpuid); + } + fdt_end_node(fdt); +} + +int +fdt_init(struct vmctx *ctx, int ncpu, vm_paddr_t fdtaddr, vm_size_t fdtsize) +{ + void *fdt; + const char *bootargs; + + fdt = paddr_guest2host(ctx, fdtaddr, fdtsize); + if (fdt == NULL) + return (EFAULT); + + fdt_create(fdt, (int)fdtsize); + + /* Add the memory reserve map (needed even if none is reserved) */ + fdt_finish_reservemap(fdt); + + /* Create the root node */ + fdt_begin_node(fdt, ""); + + fdt_property_string(fdt, "compatible", "freebsd,bhyve"); + fdt_property_u32(fdt, "#address-cells", 2); + fdt_property_u32(fdt, "#size-cells", 2); + + fdt_begin_node(fdt, "chosen"); + fdt_property_string(fdt, "stdout-path", "serial0:115200n8"); + bootargs = get_config_value("fdt.bootargs"); + if (bootargs != NULL) + fdt_property_string(fdt, "bootargs", bootargs); + fdt_end_node(fdt); + + fdt_begin_node(fdt, "memory"); + fdt_property_string(fdt, "device_type", "memory"); + /* There is no lowmem on riscv. */ + assert(vm_get_lowmem_size(ctx) == 0); + set_single_reg(fdt, vm_get_highmem_base(ctx), vm_get_highmem_size(ctx)); + fdt_end_node(fdt); + + add_cpus(fdt, ncpu); + + /* Finalized by fdt_finalize().
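+ *
+ * Rendered as DTS, the tree assembled above looks approximately like
+ * the following sketch (one CPU shown; reg/size values depend on the
+ * VM configuration):
+ *
+ *   / {
+ *       compatible = "freebsd,bhyve";
+ *       #address-cells = <2>;
+ *       #size-cells = <2>;
+ *       chosen { stdout-path = "serial0:115200n8"; };
+ *       memory { device_type = "memory"; reg = <base size>; };
+ *       cpus {
+ *           #address-cells = <1>;
+ *           #size-cells = <0>;
+ *           timebase-frequency = <10000000>;
+ *           cpu@0 {
+ *               device_type = "cpu";
+ *               compatible = "riscv";
+ *               reg = <0>;
+ *               riscv,isa = "rv64imafdc_sstc";
+ *               mmu-type = "riscv,sv39";
+ *               interrupt-controller { ... };
+ *           };
+ *       };
+ *   };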
*/ + fdtroot = fdt; + + return (0); +} + +void +fdt_add_aplic(uint64_t mem_base, uint64_t mem_size) +{ + char node_name[32]; + void *fdt, *prop; + + fdt = fdtroot; + + snprintf(node_name, sizeof(node_name), "interrupt-controller@%lx", + (unsigned long)mem_base); + fdt_begin_node(fdt, node_name); + + aplic_phandle = assign_phandle(fdt); + fdt_property_string(fdt, "compatible", "riscv,aplic"); + fdt_property(fdt, "interrupt-controller", NULL, 0); +#if notyet + fdt_property(fdt, "msi-controller", NULL, 0); +#endif + /* XXX: Needed given the root #address-cells? */ + fdt_property_u32(fdt, "#address-cells", 2); + fdt_property_u32(fdt, "#interrupt-cells", 2); + fdt_property_placeholder(fdt, "reg", 2 * sizeof(uint64_t), &prop); + SET_PROP_U64(prop, 0, mem_base); + SET_PROP_U64(prop, 1, mem_size); + + fdt_property_placeholder(fdt, "interrupts-extended", + 2 * sizeof(uint32_t), &prop); + SET_PROP_U32(prop, 0, intc0_phandle); + SET_PROP_U32(prop, 1, 9); + fdt_property_u32(fdt, "riscv,num-sources", 63); + + fdt_end_node(fdt); + + fdt_property_u32(fdt, "interrupt-parent", aplic_phandle); +} + +void +fdt_add_uart(uint64_t uart_base, uint64_t uart_size, int intr) +{ + void *fdt, *interrupts; + char node_name[32]; + + assert(aplic_phandle != 0); + + fdt = fdtroot; + + snprintf(node_name, sizeof(node_name), "serial@%lx", uart_base); + fdt_begin_node(fdt, node_name); + fdt_property_string(fdt, "compatible", "ns16550"); + set_single_reg(fdt, uart_base, uart_size); + fdt_property_u32(fdt, "interrupt-parent", aplic_phandle); + fdt_property_placeholder(fdt, "interrupts", 2 * sizeof(uint32_t), + &interrupts); + SET_PROP_U32(interrupts, 0, intr); + SET_PROP_U32(interrupts, 1, IRQ_TYPE_LEVEL_HIGH); + + fdt_end_node(fdt); + + snprintf(node_name, sizeof(node_name), "/serial@%lx", uart_base); + fdt_begin_node(fdt, "aliases"); + fdt_property_string(fdt, "serial0", node_name); + fdt_end_node(fdt); +} + +void +fdt_add_pcie(int intrs[static 4]) +{ + void *fdt, *prop; + int slot, pin, intr, i; + + assert(aplic_phandle != 0); + + fdt = fdtroot; + + fdt_begin_node(fdt, "pcie@1f0000000"); + fdt_property_string(fdt, "compatible", "pci-host-ecam-generic"); + fdt_property_u32(fdt, "#address-cells", 3); + fdt_property_u32(fdt, "#size-cells", 2); + fdt_property_string(fdt, "device_type", "pci"); + fdt_property_u64(fdt, "bus-range", (0ul << 32) | 1); + set_single_reg(fdt, 0xe0000000, 0x10000000); + fdt_property_placeholder(fdt, "ranges", + 2 * 7 * sizeof(uint32_t), &prop); + SET_PROP_U32(prop, 0, 0x01000000); + + SET_PROP_U32(prop, 1, 0); + SET_PROP_U32(prop, 2, 0xdf000000); + + SET_PROP_U32(prop, 3, 0); + SET_PROP_U32(prop, 4, 0xdf000000); + + SET_PROP_U32(prop, 5, 0); + SET_PROP_U32(prop, 6, 0x01000000); + + SET_PROP_U32(prop, 7, 0x02000000); + + SET_PROP_U32(prop, 8, 0); + SET_PROP_U32(prop, 9, 0xa0000000); + + SET_PROP_U32(prop, 10, 0); + SET_PROP_U32(prop, 11, 0xa0000000); + + SET_PROP_U32(prop, 12, 0); + SET_PROP_U32(prop, 13, 0x3f000000); + +#if notyet + fdt_property_placeholder(fdt, "msi-map", 4 * sizeof(uint32_t), &prop); + SET_PROP_U32(prop, 0, 0); /* RID base */ + SET_PROP_U32(prop, 1, aplic_phandle); /* MSI parent */ + SET_PROP_U32(prop, 2, 0); /* MSI base */ + SET_PROP_U32(prop, 3, 0x10000); /* RID length */ + fdt_property_u32(fdt, "msi-parent", aplic_phandle); +#endif + + fdt_property_u32(fdt, "#interrupt-cells", 1); + fdt_property_u32(fdt, "interrupt-parent", aplic_phandle); + + /* + * Describe standard swizzled interrupts routing (pins rotated by one + * for each consecutive slot). Must match pci_irq_route(). 
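+ *
+ * Concretely, the entry for (slot, pin) routes to
+ * intrs[(pin - 1 + slot) % 4], with pin 1-4 meaning INTA#-INTD#:
+ *
+ *   slot 0: A->intrs[0]  B->intrs[1]  C->intrs[2]  D->intrs[3]
+ *   slot 1: A->intrs[1]  B->intrs[2]  C->intrs[3]  D->intrs[0]
+ *   slot 2: A->intrs[2]  B->intrs[3]  C->intrs[0]  D->intrs[1]
+ *   slot 3: A->intrs[3]  B->intrs[0]  C->intrs[1]  D->intrs[2]
+ *
+ * Higher slot numbers wrap onto the same four rows via the
+ * interrupt-map-mask (3 << 11) set below.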
+ */ + fdt_property_placeholder(fdt, "interrupt-map-mask", + 4 * sizeof(uint32_t), &prop); + SET_PROP_U32(prop, 0, 3 << 11); + SET_PROP_U32(prop, 1, 0); + SET_PROP_U32(prop, 2, 0); + SET_PROP_U32(prop, 3, 7); + fdt_property_placeholder(fdt, "interrupt-map", + 16 * 9 * sizeof(uint32_t), &prop); + for (i = 0; i < 16; ++i) { + pin = i % 4; + slot = i / 4; + intr = intrs[(pin + slot) % 4]; + SET_PROP_U32(prop, 9 * i + 0, slot << 11); + SET_PROP_U32(prop, 9 * i + 1, 0); + SET_PROP_U32(prop, 9 * i + 2, 0); + SET_PROP_U32(prop, 9 * i + 3, pin + 1); + SET_PROP_U32(prop, 9 * i + 4, aplic_phandle); + SET_PROP_U32(prop, 9 * i + 5, 0); + SET_PROP_U32(prop, 9 * i + 6, 0); + SET_PROP_U32(prop, 9 * i + 7, intr); + SET_PROP_U32(prop, 9 * i + 8, IRQ_TYPE_LEVEL_HIGH); + } + + fdt_end_node(fdt); +} + +void +fdt_finalize(void) +{ + fdt_end_node(fdtroot); + + fdt_finish(fdtroot); +} diff --git a/usr.sbin/bhyve/pci_irq.h b/usr.sbin/bhyve/riscv/fdt.h similarity index 68% copy from usr.sbin/bhyve/pci_irq.h copy to usr.sbin/bhyve/riscv/fdt.h index 8b556ddc91a2..9bebe6ffa29d 100644 --- a/usr.sbin/bhyve/pci_irq.h +++ b/usr.sbin/bhyve/riscv/fdt.h @@ -1,47 +1,45 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2014 Hudson River Trading LLC - * Written by: John H. Baldwin - * All rights reserved. + * Copyright (c) 2022 The FreeBSD Foundation + * + * This software was developed by Andrew Turner under sponsorship from + * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ -#ifndef __PCI_IRQ_H__ -#define __PCI_IRQ_H__ +#ifndef _FDT_H_ +#define _FDT_H_ -struct pci_devinst; +#include -#if defined(__amd64__) -#include "amd64/pci_irq_machdep.h" -#elif defined(__aarch64__) -#include "aarch64/pci_irq_machdep.h" -#else -#error Unsupported platform -#endif +struct vmctx; -void pci_irq_assert(struct pci_devinst *pi); -void pci_irq_deassert(struct pci_devinst *pi); -void pci_irq_route(struct pci_devinst *pi, struct pci_irq *irq); +int fdt_init(struct vmctx *ctx, int ncpu, vm_paddr_t addrp, + vm_size_t size); +void fdt_add_aplic(uint64_t dist_base, uint64_t dist_size); +void fdt_add_pcie(int intrs[static 4]); +void fdt_add_uart(uint64_t uart_base, uint64_t uart_size, int intr); +void fdt_finalize(void); -#endif +#endif /* _FDT_H_ */ diff --git a/usr.sbin/bhyve/pci_irq.h b/usr.sbin/bhyve/riscv/pci_irq.c similarity index 64% copy from usr.sbin/bhyve/pci_irq.h copy to usr.sbin/bhyve/riscv/pci_irq.c index 8b556ddc91a2..dec65ce23196 100644 --- a/usr.sbin/bhyve/pci_irq.h +++ b/usr.sbin/bhyve/riscv/pci_irq.c @@ -1,47 +1,66 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2014 Hudson River Trading LLC - * Written by: John H. Baldwin - * All rights reserved. + * Copyright (c) 2024 Jessica Clarke * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#ifndef __PCI_IRQ_H__ -#define __PCI_IRQ_H__ +#include -struct pci_devinst; +#include "pci_emul.h" +#include "pci_irq.h" -#if defined(__amd64__) -#include "amd64/pci_irq_machdep.h" -#elif defined(__aarch64__) -#include "aarch64/pci_irq_machdep.h" -#else -#error Unsupported platform -#endif +static int aplic_irqs[4]; -void pci_irq_assert(struct pci_devinst *pi); -void pci_irq_deassert(struct pci_devinst *pi); -void pci_irq_route(struct pci_devinst *pi, struct pci_irq *irq); +void +pci_irq_init(int intrs[static 4]) +{ + int i; -#endif + for (i = 0; i < 4; ++i) + aplic_irqs[i] = intrs[i]; +} + +void +pci_irq_assert(struct pci_devinst *pi) +{ + vm_assert_irq(pi->pi_vmctx, pi->pi_lintr.irq.aplic_irq); +} + +void +pci_irq_deassert(struct pci_devinst *pi) +{ + vm_deassert_irq(pi->pi_vmctx, pi->pi_lintr.irq.aplic_irq); +} + +void +pci_irq_route(struct pci_devinst *pi, struct pci_irq *irq) +{ + /* + * Assign swizzled IRQ for this INTx if one is not yet assigned. Must + * match fdt_add_pcie(). 
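+ *
+ * Worked example: a function in slot 2 that was handed INTB#
+ * (pi_lintr.pin == 2) resolves to aplic_irqs[(2 + 2 - 1) % 4], i.e.
+ * aplic_irqs[3] - the same cell that fdt_add_pcie() advertises for
+ * that (slot, pin) pair in the interrupt-map.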
+ */ + if (irq->aplic_irq == 0) + irq->aplic_irq = + aplic_irqs[(pi->pi_slot + pi->pi_lintr.pin - 1) % 4]; +} diff --git a/usr.sbin/bhyve/pci_irq.h b/usr.sbin/bhyve/riscv/pci_irq_machdep.h similarity index 72% copy from usr.sbin/bhyve/pci_irq.h copy to usr.sbin/bhyve/riscv/pci_irq_machdep.h index 8b556ddc91a2..3a951d4413ae 100644 --- a/usr.sbin/bhyve/pci_irq.h +++ b/usr.sbin/bhyve/riscv/pci_irq_machdep.h @@ -1,47 +1,49 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2014 Hudson River Trading LLC - * Written by: John H. Baldwin - * All rights reserved. + * Copyright (c) 2024 Jessica Clarke * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#ifndef __PCI_IRQ_H__ -#define __PCI_IRQ_H__ +#ifndef __PCI_IRQ_MD_H__ +#define __PCI_IRQ_MD_H__ -struct pci_devinst; +struct pci_irq { + int aplic_irq; +}; -#if defined(__amd64__) -#include "amd64/pci_irq_machdep.h" -#elif defined(__aarch64__) -#include "aarch64/pci_irq_machdep.h" -#else -#error Unsupported platform -#endif +void pci_irq_init(int intrs[static 4]); + +static inline void +pci_irq_init_irq(struct pci_irq *irq) +{ + irq->aplic_irq = 0; +} -void pci_irq_assert(struct pci_devinst *pi); -void pci_irq_deassert(struct pci_devinst *pi); -void pci_irq_route(struct pci_devinst *pi, struct pci_irq *irq); +static inline uint8_t +pci_irq_intline(struct pci_irq *irq __unused) +{ + return (255); +} #endif diff --git a/usr.sbin/bhyve/pci_irq.h b/usr.sbin/bhyve/riscv/riscv.h similarity index 64% copy from usr.sbin/bhyve/pci_irq.h copy to usr.sbin/bhyve/riscv/riscv.h index 8b556ddc91a2..7d23c403eb8a 100644 --- a/usr.sbin/bhyve/pci_irq.h +++ b/usr.sbin/bhyve/riscv/riscv.h @@ -1,47 +1,41 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2014 Hudson River Trading LLC - * Written by: John H. Baldwin - * All rights reserved. + * Copyright (c) 2024 Ruslan Bukin + * + * This software was developed by the University of Cambridge Computer + * Laboratory (Department of Computer Science and Technology) under Innovate + * UK project 105694, "Digital Security by Design (DSbD) Technology Platform + * Prototype". * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#ifndef __PCI_IRQ_H__ -#define __PCI_IRQ_H__ - -struct pci_devinst; +#ifndef _RISCV_H_ +#define _RISCV_H_ -#if defined(__amd64__) -#include "amd64/pci_irq_machdep.h" -#elif defined(__aarch64__) -#include "aarch64/pci_irq_machdep.h" -#else -#error Unsupported platform -#endif +#define CPU_TO_HART(x) (x) +#define HART_TO_CPU(x) (x) -void pci_irq_assert(struct pci_devinst *pi); -void pci_irq_deassert(struct pci_devinst *pi); -void pci_irq_route(struct pci_devinst *pi, struct pci_irq *irq); +void vmexit_set_bsp(int hart_id); -#endif +#endif /* !_RISCV_H_ */ diff --git a/usr.sbin/bhyve/riscv/vmexit.c b/usr.sbin/bhyve/riscv/vmexit.c new file mode 100644 index 000000000000..b7d1a272b87e --- /dev/null +++ b/usr.sbin/bhyve/riscv/vmexit.c @@ -0,0 +1,366 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * Copyright (c) 2024 Ruslan Bukin + * + * This software was developed by the University of Cambridge Computer + * Laboratory (Department of Computer Science and Technology) under Innovate + * UK project 105694, "Digital Security by Design (DSbD) Technology Platform + * Prototype". + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "bhyverun.h" +#include "config.h" +#include "debug.h" +#include "mem.h" +#include "vmexit.h" +#include "riscv.h" + +#define BHYVE_VERSION ((uint64_t)__FreeBSD_version) +#define SBI_VERS_MAJOR 2 +#define SBI_VERS_MINOR 0 + +static cpuset_t running_hartmask = CPUSET_T_INITIALIZER(0); + +void +vmexit_set_bsp(int hart_id) +{ + + CPU_SET_ATOMIC(hart_id, &running_hartmask); +} + +static int +vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu, + struct vm_run *vmrun) +{ + struct vm_exit *vme; + struct vie *vie; + int err; + + vme = vmrun->vm_exit; + vie = &vme->u.inst_emul.vie; + + err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie, + &vme->u.inst_emul.paging); + if (err) { + if (err == ESRCH) { + EPRINTLN("Unhandled memory access to 0x%lx\n", + vme->u.inst_emul.gpa); + } + goto fail; + } + + return (VMEXIT_CONTINUE); + +fail: + fprintf(stderr, "Failed to emulate instruction "); + FPRINTLN(stderr, "at 0x%lx", vme->pc); + return (VMEXIT_ABORT); +} + +static int +vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun) +{ + struct vm_exit *vme; + enum vm_suspend_how how; + int vcpuid = vcpu_id(vcpu); + + vme = vmrun->vm_exit; + how = vme->u.suspended.how; + + fbsdrun_deletecpu(vcpuid); + + switch (how) { + case VM_SUSPEND_RESET: + exit(0); + case VM_SUSPEND_POWEROFF: + if (get_config_bool_default("destroy_on_poweroff", false)) + vm_destroy(ctx); + exit(1); + case VM_SUSPEND_HALT: + exit(2); + default: + fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); + exit(100); + } + + /* NOT REACHED. */ + + return (0); +} + +static int +vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, + struct vm_run *vmrun __unused) +{ + + return (VMEXIT_CONTINUE); +} + +static int +vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, + struct vm_run *vmrun __unused) +{ + + return (VMEXIT_CONTINUE); +} + +static int +vmm_sbi_probe_extension(int ext_id) +{ + + switch (ext_id) { + case SBI_EXT_ID_HSM: + case SBI_EXT_ID_TIME: + case SBI_EXT_ID_IPI: + case SBI_EXT_ID_RFNC: + case SBI_EXT_ID_SRST: + case SBI_CONSOLE_PUTCHAR: + case SBI_CONSOLE_GETCHAR: + break; + default: + return (0); + } + + return (1); +} + +static void +vmexit_ecall_time(struct vmctx *ctx __unused, struct vm_exit *vme __unused) +{ + +} + +static void +vmexit_ecall_hsm(struct vmctx *ctx __unused, struct vcpu *vcpu __unused, + struct vm_exit *vme) +{ + struct vcpu *newvcpu; + uint64_t hart_id; + int func_id; + int error; + int ret; + + hart_id = vme->u.ecall.args[0]; + func_id = vme->u.ecall.args[6]; + + ret = -1; + + if (HART_TO_CPU(hart_id) >= (uint64_t)guest_ncpus) + goto done; + + newvcpu = fbsdrun_vcpu(HART_TO_CPU(hart_id)); + assert(newvcpu != NULL); + + switch (func_id) { + case SBI_HSM_HART_START: + if (CPU_ISSET(hart_id, &running_hartmask)) + break; + + /* Set hart ID. 
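+ *
+ * (mirroring the hart ID the guest passed in a0; from the guest's
+ * side this exit is produced by an SBI call shaped roughly like the
+ * sketch below, with a7/a6 selecting the HSM extension and the
+ * HART_START function - register names follow the SBI calling
+ * convention, and args[0..7] above correspond to a0..a7):
+ *
+ *   register uint64_t a0 asm("a0") = hartid;
+ *   register uint64_t a1 asm("a1") = start_addr;
+ *   register uint64_t a2 asm("a2") = opaque;
+ *   register uint64_t a6 asm("a6") = SBI_HSM_HART_START;
+ *   register uint64_t a7 asm("a7") = SBI_EXT_ID_HSM;
+ *   asm volatile ("ecall"
+ *       : "+r" (a0), "+r" (a1)
+ *       : "r" (a2), "r" (a6), "r" (a7)
+ *       : "memory");
+ *   -- a0 now holds the SBI error code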
+
+static void
+vmexit_ecall_base(struct vmctx *ctx __unused, struct vcpu *vcpu,
+    struct vm_exit *vme)
+{
+	int sbi_function_id;
+	int ext_id;
+	int error;
+	uint32_t val;
+	int ret;
+
+	sbi_function_id = vme->u.ecall.args[6];
+
+	ret = 0;
+
+	switch (sbi_function_id) {
+	case SBI_BASE_GET_SPEC_VERSION:
+		val = SBI_VERS_MAJOR << SBI_SPEC_VERS_MAJOR_OFFSET;
+		val |= SBI_VERS_MINOR << SBI_SPEC_VERS_MINOR_OFFSET;
+		break;
+	case SBI_BASE_GET_IMPL_ID:
+		val = SBI_IMPL_ID_BHYVE;
+		break;
+	case SBI_BASE_GET_IMPL_VERSION:
+		val = BHYVE_VERSION;
+		break;
+	case SBI_BASE_PROBE_EXTENSION:
+		ext_id = vme->u.ecall.args[0];
+		val = vmm_sbi_probe_extension(ext_id);
+		break;
+	case SBI_BASE_GET_MVENDORID:
+		val = MVENDORID_UNIMPL;
+		break;
+	case SBI_BASE_GET_MARCHID:
+		val = MARCHID_UNIMPL;
+		break;
+	case SBI_BASE_GET_MIMPID:
+		val = 0;
+		break;
+	default:
+		ret = 1;
+		break;
+	}
+
+	error = vm_set_register(vcpu, VM_REG_GUEST_A0, ret);
+	assert(error == 0);
+
+	if (ret == 0) {
+		error = vm_set_register(vcpu, VM_REG_GUEST_A1, val);
+		assert(error == 0);
+	}
+}
+
+static void
+vmexit_ecall_srst(struct vmctx *ctx, struct vm_exit *vme)
+{
+	enum vm_suspend_how how;
+	int func_id;
+	int type;
+
+	func_id = vme->u.ecall.args[6];
+	type = vme->u.ecall.args[0];
+
+	switch (func_id) {
+	case SBI_SRST_SYSTEM_RESET:
+		switch (type) {
+		case SBI_SRST_TYPE_SHUTDOWN:
+		case SBI_SRST_TYPE_COLD_REBOOT:
+		case SBI_SRST_TYPE_WARM_REBOOT:
+			how = VM_SUSPEND_POWEROFF;
+			vm_suspend(ctx, how);
+			break;
+		default:
+			break;
+		}
+		break;
+	default:
+		break;
+	}
+}
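The base handler above returns its status in a0 and its value in a1, matching the SBI specification. As an illustration of the guest-visible effect, a guest probing for HSM support through vmexit_ecall_base() might look like the sketch below; sbi_call() is an assumed wrapper around the ecall sequence shown earlier, not something this patch provides:

/*
 * Hypothetical guest-side sketch (not part of the patch): probe the
 * HSM extension via SBI_BASE_PROBE_EXTENSION.  sbi_call() is an
 * assumed helper that returns the a0 error code and stores a1 in *val.
 */
long val;

if (sbi_call(SBI_EXT_ID_BASE, SBI_BASE_PROBE_EXTENSION,
    SBI_EXT_ID_HSM, 0, &val) == 0 && val != 0) {
	/* HSM is available: secondary harts can be started and stopped. */
}
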
+
+static int
+vmexit_ecall(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
+{
+	int sbi_extension_id;
+	struct vm_exit *vme;
+
+	vme = vmrun->vm_exit;
+
+	sbi_extension_id = vme->u.ecall.args[7];
+	switch (sbi_extension_id) {
+	case SBI_EXT_ID_SRST:
+		vmexit_ecall_srst(ctx, vme);
+		break;
+	case SBI_EXT_ID_BASE:
+		vmexit_ecall_base(ctx, vcpu, vme);
+		break;
+	case SBI_EXT_ID_TIME:
+		vmexit_ecall_time(ctx, vme);
+		break;
+	case SBI_EXT_ID_HSM:
+		vmexit_ecall_hsm(ctx, vcpu, vme);
+		break;
+	case SBI_CONSOLE_PUTCHAR:
+	case SBI_CONSOLE_GETCHAR:
+	default:
+		/* Unknown SBI extension. */
+		break;
+	}
+
+	return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_hyp(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
+    struct vm_run *vmrun)
+{
+	struct vm_exit *vme;
+
+	vme = vmrun->vm_exit;
+
+	printf("unhandled exception: scause %#lx\n", vme->u.hyp.scause);
+
+	return (VMEXIT_ABORT);
+}
+
+const vmexit_handler_t vmexit_handlers[VM_EXITCODE_MAX] = {
+	[VM_EXITCODE_BOGUS] = vmexit_bogus,
+	[VM_EXITCODE_HYP] = vmexit_hyp,
+	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
+	[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
+	[VM_EXITCODE_DEBUG] = vmexit_debug,
+	[VM_EXITCODE_ECALL] = vmexit_ecall,
+};
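The vmexit_handlers[] table is how the machine-independent vcpu loop reaches this code: the exit code reported by vm_run() indexes the table, and the handler's return value decides whether the vcpu keeps running. A simplified sketch of that dispatch follows; it is not the literal bhyverun.c loop, and it assumes the vm_run() form implied by the struct vm_run argument the handlers take:

/*
 * Simplified dispatch sketch (not the literal bhyverun.c code): run
 * the vcpu, then hand each exit to the matching handler.
 */
struct vm_run vmrun;
int rc;

for (;;) {
	if (vm_run(vcpu, &vmrun) != 0)
		break;
	rc = vmexit_handlers[vmrun.vm_exit->exitcode](ctx, vcpu, &vmrun);
	if (rc != VMEXIT_CONTINUE)
		break;		/* VMEXIT_ABORT terminates the vcpu */
}
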
diff --git a/usr.sbin/bhyvectl/riscv/Makefile.inc b/usr.sbin/bhyvectl/riscv/Makefile.inc
new file mode 100644
index 000000000000..047e8d6c70bc
--- /dev/null
+++ b/usr.sbin/bhyvectl/riscv/Makefile.inc
@@ -0,0 +1 @@
+SRCS+=	bhyvectl_machdep.c
diff --git a/usr.sbin/bhyvectl/riscv/bhyvectl_machdep.c b/usr.sbin/bhyvectl/riscv/bhyvectl_machdep.c
new file mode 100644
index 000000000000..3e25f9a260c6
--- /dev/null
+++ b/usr.sbin/bhyvectl/riscv/bhyvectl_machdep.c
@@ -0,0 +1,82 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Mark Johnston
+ *
+ * This software was developed by the University of Cambridge Computer
+ * Laboratory (Department of Computer Science and Technology) under Innovate
+ * UK project 105694, "Digital Security by Design (DSbD) Technology Platform
+ * Prototype".
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "bhyvectl.h"
+
+void
+bhyvectl_dump_vm_run_exitcode(struct vm_exit *vmexit __unused,
+    int vcpu __unused)
+{
+}
+
+struct option *
+bhyvectl_opts(const struct option *options, size_t count)
+{
+	struct option *all_opts;
+
+	all_opts = calloc(count + 1, sizeof(struct option));
+	if (all_opts == NULL)
+		err(1, "calloc");
+	memcpy(all_opts, options, count * sizeof(struct option));
+	return (all_opts);
+}
+
+void
+bhyvectl_handle_opt(const struct option *opts __unused, int opt __unused)
+{
+}
+
+const char *
+bhyvectl_opt_desc(int opt __unused)
+{
+	/* No riscv-specific options yet. */
+	return ("???");
+}
+
+void
+bhyvectl_md_main(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
+    int vcpuid __unused, bool get_all __unused)
+{
+}
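Note that bhyvectl_opts() allocates count + 1 entries: the extra zero-filled struct option from calloc() serves as the array terminator that getopt_long(3) requires. A hedged sketch of how the machine-independent bhyvectl code might consume it; common_opts and the empty optstring are illustrative assumptions, not taken from this patch:

/*
 * Hypothetical caller sketch (not part of the patch): merge the
 * common option table, then walk the arguments with getopt_long(3).
 * The trailing zeroed entry from calloc() terminates the array.
 */
struct option *opts;
int ch;

opts = bhyvectl_opts(common_opts, nitems(common_opts));
while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1)
	bhyvectl_handle_opt(opts, ch);
free(opts);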