Index: sys/amd64/include/bhyve_hypercall.h
===================================================================
--- /dev/null
+++ sys/amd64/include/bhyve_hypercall.h
@@ -0,0 +1,351 @@
+/*-
+ * Copyright (c) 2016 Domagoj Stolfa
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _MACHINE_HYPERCALL_H_
+#define	_MACHINE_HYPERCALL_H_
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+#define	PUSH(a)		"push %[" #a "]\n"
+#define	PUSH2(a1, a2) \
+	PUSH(a1) \
+	PUSH(a2)
+#define	PUSH4(a1, a2, a3, a4) \
+	PUSH2(a1, a2) \
+	PUSH2(a3, a4)
+#define	PUSH6(a1, a2, a3, a4, a5, a6) \
+	PUSH4(a1, a2, a3, a4) \
+	PUSH2(a5, a6)
+#define	PUSH8(a1, a2, a3, a4, a5, a6, a7, a8) \
+	PUSH6(a1, a2, a3, a4, a5, a6) \
+	PUSH2(a7, a8)
+#define	PUSH10(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
+	PUSH8(a1, a2, a3, a4, a5, a6, a7, a8) \
+	PUSH2(a9, a10)
+#define	PUSH12(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \
+	PUSH10(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
+	PUSH2(a11, a12)
+
+#define	HYPERCALL_RET_NOT_IMPL	-2
+#define	HYPERCALL_RET_ERROR	-1
+#define	HYPERCALL_RET_SUCCESS	0
+
+#define	VMCALL	".byte 0x0f,0x01,0xc1\n"
+#define	VMMCALL	".byte 0x0f,0x01,0xd9\n"
+
+typedef struct hypercall_arg {
+	__uint64_t	len;
+	__uint64_t	val;
+} hc_arg_t;
+
+/*
+ * The hypercalls are named according to the following
+ * convention:
+ *  - Native bhyve hypercalls carry no prefix.
+ *  - Hypercalls used to emulate other hypervisors are
+ *    prefixed with a name that identifies the
+ *    hypervisor being emulated.
+ *  - The name of each hypercall matches the name of
+ *    the function invoked by that hypercall.
+ *
+ * The following defines should be kept in sync with
+ * hc_dispatcher (vmm.c) and ring_plevel (vmm.c).
+ *
+ * Ensure that the HYPERCALL_INDEX_MAX define is
+ * always correct after adding hypercalls.
+ *
+ * Should one hypervisor mode have more hypercalls
+ * than the others, HYPERCALL_INDEX_MAX must
+ * correspond to the highest count and the missing
+ * entries in hc_dispatcher must be left NULL.
+ */ + +#define HYPERCALL_DTRACE_PROBE_CREATE 0 +#define HYPERCALL_DTRACE_PROBE 1 +#define HYPERCALL_DTRACE_RESERVED1 2 +#define HYPERCALL_DTRACE_RESERVED2 3 +#define HYPERCALL_DTRACE_RESERVED3 4 +#define HYPERCALL_DTRACE_RESERVED4 5 + +#define HYPERCALL_INDEX_MAX 6 + +static __inline __int64_t +hypercall0(__uint64_t c) +{ + const __uint64_t nargs = 0; + __int64_t ret; + if (cpu_vendor_id == CPU_VENDOR_INTEL) { + __asm __volatile( + VMCALL + : "=a"(ret) + : "a"(c), "b"(nargs) + : "memory", "rsp"); + } else { + __asm __volatile( + VMMCALL + : "=a"(ret) + : "a"(c), "b"(nargs) + : "memory", "rsp"); + } + return (ret); +} + +static __inline __int64_t +hypercall1(__uint64_t c, hc_arg_t *arg0) +{ + const __uint64_t nargs = 1; + __int64_t ret; + if (cpu_vendor_id == CPU_VENDOR_INTEL) { + __asm __volatile( + PUSH2(val0, len0) + VMCALL + "add $16, %%rsp\n" + : "=a"(ret) + : "a"(c), + [val0] "r"(arg0->val), [len0] "r"(arg0->len), + "b"(nargs) + : "memory", "rsp"); + } else { + __asm __volatile( + PUSH2(val0, len0) + VMMCALL + "add $16, %%rsp\n" + : "=a"(ret) + : "a"(c), + [val0] "r"(arg0->val), [len0] "r"(arg0->len), + "b"(nargs) + : "memory", "rsp"); + } + return (ret); +} + +static __inline __int64_t +hypercall2(__uint64_t c, hc_arg_t *arg0, + hc_arg_t *arg1) +{ + const __uint64_t nargs = 2; + __int64_t ret; + if (cpu_vendor_id == CPU_VENDOR_INTEL) { + __asm __volatile( + PUSH4(val1, len1, val0, len0) + VMCALL + "add $32, %%rsp\n" + : "=a"(ret) + : "a"(c), + [val1] "r"(arg1->val), [len1] "r"(arg1->len), + [val0] "r"(arg0->val), [len0] "r"(arg0->len), + "b"(nargs) + : "memory", "rsp"); + } else { + __asm __volatile( + PUSH4(val1, len1, val0, len0) + VMMCALL + "add $32, %%rsp\n" + : "=a"(ret) + : "a"(c), + [val1] "r"(arg1->val), [len1] "r"(arg1->len), + [val0] "r"(arg0->val), [len0] "r"(arg0->len), + "b"(nargs) + : "memory", "rsp"); + } + return (ret); +} + +static __inline __int64_t +hypercall3(__uint64_t c, hc_arg_t *arg0, + hc_arg_t *arg1, hc_arg_t *arg2) +{ + const __uint64_t nargs = 3; + __int64_t ret; + if (cpu_vendor_id == CPU_VENDOR_INTEL) { + __asm __volatile( + PUSH6(val2, len2, val1, len1, + val0, len0) + VMCALL + "add $48, %%rsp\n" + : "=a"(ret) + : "a"(c), + [val2] "r"(arg2->val), [len2] "r"(arg2->len), + [val1] "r"(arg1->val), [len1] "r"(arg1->len), + [val0] "r"(arg0->val), [len0] "r"(arg0->len), + "b"(nargs) + : "memory", "rsp"); + } else { + __asm __volatile( + PUSH6(val2, len2, val1, len1, + val0, len0) + VMMCALL + "add $48, %%rsp\n" + : "=a"(ret) + : "a"(c), + [val2] "r"(arg2->val), [len2] "r"(arg2->len), + [val1] "r"(arg1->val), [len1] "r"(arg1->len), + [val0] "r"(arg0->val), [len0] "r"(arg0->len), + "b"(nargs) + : "memory", "rsp"); + } + return (ret); +} + +static __inline __int64_t +hypercall4(__uint64_t c, hc_arg_t *arg0, + hc_arg_t *arg1, hc_arg_t *arg2, + hc_arg_t *arg3) +{ + const __uint64_t nargs = 4; + __int64_t ret; + if (cpu_vendor_id == CPU_VENDOR_INTEL) { + __asm __volatile( + PUSH8(val3, len3, val2, len2, + val1, len1, val0, len0) + VMCALL + "add $64, %%rsp\n" + : "=a"(ret) + : "a"(c), + [val3] "r"(arg3->val), [len3] "r"(arg3->len), + [val2] "r"(arg2->val), [len2] "r"(arg2->len), + [val1] "r"(arg1->val), [len1] "r"(arg1->len), + [val0] "r"(arg0->val), [len0] "r"(arg0->len), + "b"(nargs) + : "memory", "rsp"); + } else { + __asm __volatile( + PUSH8(val3, len3, val2, len2, + val1, len1, val0, len0) + VMMCALL + "add $64, %%rsp\n" + : "=a"(ret) + : "a"(c), + [val3] "r"(arg3->val), [len3] "r"(arg3->len), + [val2] "r"(arg2->val), [len2] "r"(arg2->len), + [val1] 
"r"(arg1->val), [len1] "r"(arg1->len), + [val0] "r"(arg0->val), [len0] "r"(arg0->len), + "b"(nargs) + : "memory", "rsp"); + } + return (ret); +} + +static __inline __int64_t +hypercall5(__uint64_t c, hc_arg_t *arg0, + hc_arg_t *arg1, hc_arg_t *arg2, + hc_arg_t *arg3, hc_arg_t *arg4) +{ + const __uint64_t nargs = 5; + __int64_t ret; + if (cpu_vendor_id == CPU_VENDOR_INTEL) { + __asm __volatile( + PUSH10(val4, len4, val3, len3, + val2, len2, val1, len1, + val0, len0) + VMCALL + "add $80, %%rsp\n" + : "=a"(ret) + : "a"(c), + [val4] "r"(arg4->val), [len4] "r"(arg4->len), + [val3] "r"(arg3->val), [len3] "r"(arg3->len), + [val2] "r"(arg2->val), [len2] "r"(arg2->len), + [val1] "r"(arg1->val), [len1] "r"(arg1->len), + [val0] "r"(arg0->val), [len0] "r"(arg0->len), + "b"(nargs) + : "memory", "rsp"); + } else { + __asm __volatile( + PUSH10(val4, len4, val3, len3, + val2, len2, val1, len1, + val0, len0) + VMMCALL + "add $80, %%rsp\n" + : "=a"(ret) + : "a"(c), + [val4] "r"(arg4->val), [len4] "r"(arg4->len), + [val3] "r"(arg3->val), [len3] "r"(arg3->len), + [val2] "r"(arg2->val), [len2] "r"(arg2->len), + [val1] "r"(arg1->val), [len1] "r"(arg1->len), + [val0] "r"(arg0->val), [len0] "r"(arg0->len), + "b"(nargs) + : "memory", "rsp"); + } + return (ret); +} + +static __inline __int64_t +hypercall6(__uint64_t c, hc_arg_t *arg0, + hc_arg_t *arg1, hc_arg_t *arg2, + hc_arg_t *arg3, hc_arg_t *arg4, + hc_arg_t *arg5) +{ + const __uint64_t nargs = 6; + __int64_t ret; + if (cpu_vendor_id == CPU_VENDOR_INTEL) { + __asm __volatile( + PUSH12(val5, len5, val4, len4, + val3, len3, val2, len2, + val1, len1, val0, len0) + VMCALL + "add $96, %%rsp\n" + : "=a"(ret) + : "a"(c), + [val5] "r"(arg5->val), [len5] "r"(arg5->len), + [val4] "r"(arg4->val), [len4] "r"(arg4->len), + [val3] "r"(arg3->val), [len3] "r"(arg3->len), + [val2] "r"(arg2->val), [len2] "r"(arg2->len), + [val1] "r"(arg1->val), [len1] "r"(arg1->len), + [val0] "r"(arg0->val), [len0] "r"(arg0->len), + "b"(nargs) + : "memory", "rsp"); + } else { + __asm __volatile( + PUSH12(val5, len5, val4, len4, + val3, len3, val2, len2, + val1, len1, val0, len0) + VMMCALL + "add $96, %%rsp\n" + : "=a"(ret) + : "a"(c), + [val5] "r"(arg5->val), [len5] "r"(arg5->len), + [val4] "r"(arg4->val), [len4] "r"(arg4->len), + [val3] "r"(arg3->val), [len3] "r"(arg3->len), + [val2] "r"(arg2->val), [len2] "r"(arg2->len), + [val1] "r"(arg1->val), [len1] "r"(arg1->len), + [val0] "r"(arg0->val), [len0] "r"(arg0->len), + "b"(nargs) + : "memory", "rsp"); + } + return (ret); +} + +#endif /* _MACHINE_HYPERCALL_H_ */ Index: sys/amd64/include/vmm.h =================================================================== --- sys/amd64/include/vmm.h +++ sys/amd64/include/vmm.h @@ -105,6 +105,7 @@ #ifdef _KERNEL #define VM_MAX_NAMELEN 32 +#define HV_MAX_NAMELEN 32 struct vm; struct vm_exception; @@ -170,6 +171,12 @@ extern struct vmm_ops vmm_ops_intel; extern struct vmm_ops vmm_ops_amd; +#define BHYVE_MODE 0 +#define VMM_MAX_MODES 1 + +extern int hypervisor_mode; +extern int hypercalls_enabled; + int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); int vm_reinit(struct vm *vm); @@ -533,6 +540,7 @@ VM_EXITCODE_MWAIT, VM_EXITCODE_SVM, VM_EXITCODE_REQIDLE, + VM_EXITCODE_HYPERCALL, VM_EXITCODE_MAX }; @@ -573,6 +581,10 @@ struct vm_guest_paging paging; }; +struct vm_hypercall { + struct vm_guest_paging paging; +}; + struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ @@ -636,7 +648,8 @@ struct { enum vm_suspend_how how; } suspended; - struct 
vm_task_switch task_switch; + struct vm_task_switch task_switch; + struct vm_hypercall hypercall; } u; }; Index: sys/amd64/vmm/amd/svm.c =================================================================== --- sys/amd64/vmm/amd/svm.c +++ sys/amd64/vmm/amd/svm.c @@ -158,7 +158,6 @@ static int svm_cleanup(void) { - smp_rendezvous(NULL, svm_disable, NULL, NULL); return (0); } @@ -469,6 +468,9 @@ svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); + /* Enable VMMCALL to be used for DTrace probes */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMMCALL); + /* * From section "Canonicalization and Consistency Checks" in APMv2 * the VMRUN intercept bit must be set to pass the consistency check. @@ -849,6 +851,29 @@ vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); } +static void +svm_handle_hypercall(struct svm_softc *svm_sc, int vcpu, struct vmcb *vmcb, struct vm_exit *vmexit) +{ + struct vm_guest_paging *paging; + struct vmcb_segment seg; + uint64_t rsp; + int error; + + paging = &vmexit->u.hypercall.paging; + vmexit->exitcode = VM_EXITCODE_HYPERCALL; + + error = vmcb_read(svm_sc, vcpu, VM_REG_GUEST_RSP, + &rsp); + KASSERT(error == 0, ("%s: error %d getting RSP", + __func__, error)); + + error = vmcb_seg(vmcb, VM_REG_GUEST_SS, &seg); + KASSERT(error == 0, ("%s: error %d getting segment SS", + __func__, error)); + + svm_paging_info(vmcb, paging); +} + #ifdef KTR static const char * intrtype_to_str(int intr_type) @@ -1243,6 +1268,12 @@ return ("monitor"); case VMCB_EXIT_MWAIT: return ("mwait"); + case VMCB_EXIT_VMMCALL: + return ("vmmcall"); + case VMCB_EXIT_VMLOAD: + return ("vmload"); + case VMCB_EXIT_VMSAVE: + return ("vmsave"); default: snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason); return (reasonbuf); @@ -1344,6 +1375,15 @@ case VMCB_EXIT_NMI: /* external NMI */ handled = 1; break; + case VMCB_EXIT_VMMCALL: + if (hypercalls_enabled == 0) { + vm_inject_ud(svm_sc->vm, vcpu); + handled = 1; + } + else { + svm_handle_hypercall(svm_sc, vcpu, vmcb, vmexit); + } + break; case 0x40 ... 
0x5F:
 		vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
 		reflect = 1;
Index: sys/amd64/vmm/amd/vmcb.h
===================================================================
--- sys/amd64/vmm/amd/vmcb.h
+++ sys/amd64/vmm/amd/vmcb.h
@@ -139,6 +139,8 @@
 #define	VMCB_EXIT_IO		0x7B
 #define	VMCB_EXIT_MSR		0x7C
 #define	VMCB_EXIT_SHUTDOWN	0x7F
+#define	VMCB_EXIT_VMMCALL	0x81
+#define	VMCB_EXIT_VMLOAD	0x82
 #define	VMCB_EXIT_VMSAVE	0x83
 #define	VMCB_EXIT_MONITOR	0x8A
 #define	VMCB_EXIT_MWAIT		0x8B
Index: sys/amd64/vmm/intel/vmx.c
===================================================================
--- sys/amd64/vmm/intel/vmx.c
+++ sys/amd64/vmm/intel/vmx.c
@@ -2474,6 +2474,15 @@
 	case EXIT_REASON_MWAIT:
 		vmexit->exitcode = VM_EXITCODE_MWAIT;
 		break;
+	case EXIT_REASON_VMCALL:
+		if (hypercalls_enabled == 0) {
+			vm_inject_ud(vmx->vm, vcpu);
+			handled = HANDLED;
+		} else {
+			vmexit->exitcode = VM_EXITCODE_HYPERCALL;
+			vmx_paging_info(&vmexit->u.hypercall.paging);
+		}
+		break;
 	default:
 		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
 		break;
Index: sys/amd64/vmm/vmm.c
===================================================================
--- sys/amd64/vmm/vmm.c
+++ sys/amd64/vmm/vmm.c
@@ -55,6 +55,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -224,10 +225,113 @@
 	&trace_guest_exceptions, 0,
 	"Trap into hypervisor on all guest exceptions and reflect them back");
 
+int hypercalls_enabled = 0;
+SYSCTL_INT(_hw_vmm, OID_AUTO, hypercalls_enabled, CTLFLAG_RWTUN,
+    &hypercalls_enabled, 0,
+    "Enable hypercalls on all guests");
+
+/*
+ * The maximum number of arguments currently supported
+ * by the hypercall functionality in the VMM.  Any
+ * hypercall specifying more than HYPERCALL_MAX_ARGS
+ * arguments is rejected.
+ */
+#define	HYPERCALL_MAX_ARGS	6
+
+typedef int (*hc_handler_t)(uint64_t, struct vm *, int,
+    struct vm_exit *, bool *);
+typedef int64_t (*hc_dispatcher_t)(struct vm *, int,
+    struct hypercall_arg *, struct vm_guest_paging *);
+
+/*
+ * The default hypervisor mode used is BHYVE_MODE.
+ */
+int hypervisor_mode = BHYVE_MODE;
+
+static int bhyve_handle_hypercall(uint64_t hcid, struct vm *vm,
+    int vcpuid, struct vm_exit *vmexit, bool *retu);
+
+/*
+ * Hypercall handlers, selected by hypervisor mode.
+ * Each handler's name should carry a prefix naming the
+ * mode it is bound to.  This should be kept in sync with
+ * the global variable hc_dispatcher (see below).
+ */
+hc_handler_t hc_handler[VMM_MAX_MODES] = {
+	[BHYVE_MODE] = bhyve_handle_hypercall
+};
+
+/*
+ * Each hypervisor mode implements different hypercalls
+ * with differently mapped hypercall numbers.  If a
+ * hypercall is not implemented, its entry should be left
+ * NULL; invoking it then unconditionally generates a #UD
+ * fault in the guest.  Keep in sync with hc_handler (see
+ * above) and ring_plevel (see below).
+ */
+hc_dispatcher_t hc_dispatcher[VMM_MAX_MODES][HYPERCALL_INDEX_MAX] = {
+	[BHYVE_MODE] = {
+		[HYPERCALL_DTRACE_PROBE_CREATE] = NULL,
+		[HYPERCALL_DTRACE_PROBE] = NULL,
+		[HYPERCALL_DTRACE_RESERVED1] = NULL,
+		[HYPERCALL_DTRACE_RESERVED2] = NULL,
+		[HYPERCALL_DTRACE_RESERVED3] = NULL,
+		[HYPERCALL_DTRACE_RESERVED4] = NULL
+	}
+};
+
+/*
+ * Each hypercall can only be issued from a well-defined
+ * protection ring.  The lowest ring number required should
+ * be assigned to each hypercall.  This should be kept in
+ * sync with hc_dispatcher (see above).
+ */ +static int8_t ring_plevel[VMM_MAX_MODES][HYPERCALL_INDEX_MAX] = { + [BHYVE_MODE] = { + [HYPERCALL_DTRACE_PROBE_CREATE] = 0, + [HYPERCALL_DTRACE_PROBE] = 0, + [HYPERCALL_DTRACE_RESERVED1] = 0, + [HYPERCALL_DTRACE_RESERVED2] = 0, + [HYPERCALL_DTRACE_RESERVED3] = 0, + [HYPERCALL_DTRACE_RESERVED4] = 0 + } +}; + static void vm_free_memmap(struct vm *vm, int ident); static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); +static int +sysctl_vmm_hypervisor_mode(SYSCTL_HANDLER_ARGS) +{ + int error; + char buf[HV_MAX_NAMELEN]; + + if (hypervisor_mode == BHYVE_MODE) { + strlcpy(buf, "bhyve", sizeof(buf)); + } else { + strlcpy(buf, "undefined", sizeof(buf)); + } + + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + + if (strcmp(buf, "bhyve") == 0) { + hypervisor_mode = BHYVE_MODE; + } else { + /* + * Disallow undefined data + */ + hypervisor_mode = BHYVE_MODE; + } + + return (0); +} +SYSCTL_PROC(_hw_vmm, OID_AUTO, hv_mode, CTLTYPE_STRING | CTLFLAG_RDTUN, + NULL, 0, sysctl_vmm_hypervisor_mode, "A", NULL); + #ifdef KTR static const char * vcpu_state2str(enum vcpu_state state) @@ -538,8 +642,9 @@ if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) return (ENOMEM); - else + else { return (0); + } } int @@ -1507,6 +1612,168 @@ return (0); } +static __inline int64_t +hypercall_dispatch(uint64_t hcid, struct vm *vm, int vcpuid, + struct hypercall_arg *args, struct vm_guest_paging *paging) +{ + /* + * Do not allow hypercalls that aren't implemented. + * This unconditionally generates an #UD fault in + * the guest. + */ + if (hc_dispatcher[hypervisor_mode][hcid] == NULL) { + vm_inject_ud(vm, vcpuid); + return (0); + } + return (hc_dispatcher[hypervisor_mode][hcid](vm, vcpuid, args, paging)); +} + +static __inline int +hypercall_handle(uint64_t hcid, struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +{ + return (hc_handler[hypervisor_mode](hcid, vm, vcpuid, vmexit, retu)); +} + +static int +hypercall_copy_arg(struct vm *vm, int vcpuid, uint64_t ds_base, + struct hypercall_arg *arg, struct vm_guest_paging *paging, void *dst) +{ + struct vm_copyinfo copyinfo[2]; + uint64_t gla; + int error, fault; + + gla = ds_base + arg->val; + error = vm_copy_setup(vm, vcpuid, paging, gla, arg->len, + PROT_READ, copyinfo, nitems(copyinfo), &fault); + if (error || fault) { + return (error); + } + + vm_copyin(vm, vcpuid, copyinfo, dst, arg->len); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + + return (0); +} + +static int +bhyve_handle_hypercall(uint64_t hcid, struct vm *vm, int vcpuid, + struct vm_exit *vmexit, bool *retu) +{ + struct vm_copyinfo copyinfo[2]; + struct vm_guest_paging *paging; + struct hypercall_arg args[HYPERCALL_MAX_ARGS]; + struct seg_desc ss_desc; + uint64_t nargs, rsp, stack_gla, cr0, rflags; + int64_t retval; + int error, fault, stackaddrsize, size, handled, addrsize; + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RBX, &nargs); + KASSERT(error == 0, ("%s: error %d getting RBX", + __func__, error)); + + if (nargs > HYPERCALL_MAX_ARGS) { + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, HYPERCALL_RET_ERROR); + KASSERT(error == 0, ("%s: error %d setting RAX", + __func__, error)); + return (0); + } + + handled = 0; + paging = &vmexit->u.hypercall.paging; + stackaddrsize = 8; + addrsize = 8; + size = sizeof(struct hypercall_arg); + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + 
KASSERT(error == 0, ("%s: error %d getting CR0", + __func__, error)); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting RFLAGS", + __func__, error)); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); + KASSERT(error == 0, ("%s: error %d getting RSP", + __func__, error)); + + + error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc); + KASSERT(error == 0, ("%s: error %d getting SS descriptor", + __func__, error)); + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc, + rsp, addrsize, stackaddrsize, PROT_READ, &stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_canonical_check(paging->cpu_mode, stack_gla)) { + vm_inject_ss(vm, vcpuid, 0); + return (0); + } + + if (vie_alignment_check(paging->cpl, addrsize, cr0, rflags, stack_gla)) { + vm_inject_ac(vm, vcpuid, 0); + return (0); + } + + error = vm_copy_setup(vm, vcpuid, paging, stack_gla, nargs * size, + PROT_READ, copyinfo, nitems(copyinfo), &fault); + if (error || fault) { + return (error); + } + + vm_copyin(vm, vcpuid, copyinfo, args, nargs * size); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + + /* + * From this point on, all the arguments passed in from the + * guest are contained in the args array. + */ + retval = hypercall_dispatch(hcid, vm, vcpuid, args, paging); + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, retval); + KASSERT(error == 0, ("%s: error %d setting RAX", + __func__, error)); + return (0); +} + +static int +vm_handle_hypercall(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu) +{ + struct seg_desc cs_desc; + uint64_t hcid; + int error; + + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &hcid); + KASSERT(error == 0, ("%s: error %d getting RAX", + __func__, error)); + /* + * Ensure that the hypercall called by the guest never exceed + * the maximum number of hypercalls defined. + */ + if (hcid >= HYPERCALL_INDEX_MAX) { + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, HYPERCALL_RET_ERROR); + KASSERT(error == 0, ("%s: error %d setting RAX", + __func__, error)); + return (0); + } + + error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &cs_desc); + KASSERT(error == 0, ("%s: error %d getting CS descriptor", + __func__, error)); + + /* + * The check ensures that each of the hypercalls that is called + * from the guest is called from the correct protection ring. 
+	 */
+	if (SEG_DESC_DPL(cs_desc.access) != ring_plevel[hypervisor_mode][hcid]) {
+		error = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, HYPERCALL_RET_ERROR);
+		KASSERT(error == 0, ("%s: error %d setting RAX",
+		    __func__, error));
+		return (0);
+	}
+
+	return (hypercall_handle(hcid, vm, vcpuid, vmexit, retu));
+}
+
 int
 vm_suspend(struct vm *vm, enum vm_suspend_how how)
 {
@@ -1675,6 +1942,9 @@
 		case VM_EXITCODE_MWAIT:
 			vm_inject_ud(vm, vcpuid);
 			break;
+		case VM_EXITCODE_HYPERCALL:
+			error = vm_handle_hypercall(vm, vcpuid, vme, &retu);
+			break;
 		default:
 			retu = true;	/* handled in userland */
 			break;
@@ -1855,6 +2125,7 @@
 		/* Handle exceptions serially */
 		*retinfo = info2;
 	}
+
 	return (1);
 }
Index: sys/amd64/vmm/x86.h
===================================================================
--- sys/amd64/vmm/x86.h
+++ sys/amd64/vmm/x86.h
@@ -29,16 +29,16 @@
 #ifndef _X86_H_
 #define	_X86_H_
 
-#define	CPUID_0000_0000		(0x0)
+#define	CPUID_0000_0000		(0x0)
 #define	CPUID_0000_0001		(0x1)
-#define	CPUID_0000_0002		(0x2)
-#define	CPUID_0000_0003		(0x3)
-#define	CPUID_0000_0004		(0x4)
-#define	CPUID_0000_0006		(0x6)
-#define	CPUID_0000_0007		(0x7)
-#define	CPUID_0000_000A		(0xA)
-#define	CPUID_0000_000B		(0xB)
-#define	CPUID_0000_000D		(0xD)
+#define	CPUID_0000_0002		(0x2)
+#define	CPUID_0000_0003		(0x3)
+#define	CPUID_0000_0004		(0x4)
+#define	CPUID_0000_0006		(0x6)
+#define	CPUID_0000_0007		(0x7)
+#define	CPUID_0000_000A		(0xA)
+#define	CPUID_0000_000B		(0xB)
+#define	CPUID_0000_000D		(0xD)
 #define	CPUID_8000_0000		(0x80000000)
 #define	CPUID_8000_0001		(0x80000001)
 #define	CPUID_8000_0002		(0x80000002)
@@ -47,7 +47,8 @@
 #define	CPUID_8000_0006		(0x80000006)
 #define	CPUID_8000_0007		(0x80000007)
 #define	CPUID_8000_0008		(0x80000008)
-
+#define	CPUID_4000_0000		(0x40000000)
+#define	CPUID_4000_0001		(0x40000001)
 /*
  * CPUID instruction Fn0000_0001:
  */
Index: sys/amd64/vmm/x86.c
===================================================================
--- sys/amd64/vmm/x86.c
+++ sys/amd64/vmm/x86.c
@@ -50,9 +50,26 @@
 SYSCTL_DECL(_hw_vmm);
 static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);
 
-#define	CPUID_VM_HIGH		0x40000000
+#define	CPUID_VM_HIGH		0x40000001
+#define	CPUID_HV_SPECIFIC_HIGH	(CPUID_VM_HIGH & 0x000000FF)
+#define	CPUID_HV_SPECIFIC_NUM	(CPUID_HV_SPECIFIC_HIGH + 1)
 
-static const char bhyve_id[12] = "bhyve bhyve ";
+/*
+ * Maps a hypervisor-specific CPUID leaf to an index into
+ * the cpuid_dispatcher jump table.  The CPUID leaves
+ * reserved for hypervisor use in the Intel and AMD manuals
+ * are 0x40000000-0x400000FF.
+ */
+#define	HC_CPUID_ID(id)	((id) & 0x000000FF)
+
+/*
+ * Advertises the appropriate hypervisor identifier based
+ * on the hypervisor's mode of operation.  This should be
+ * kept in sync with the possible hypervisor modes.
+ */
+static const char hypervisor_id[VMM_MAX_MODES][12] = {
+	[BHYVE_MODE] = "bhyve bhyve "
+};
 
 static uint64_t bhyve_xcpuids;
 SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
@@ -77,6 +94,28 @@
  * Round up to the next power of two, if necessary, and then take log2.
  * Returns -1 if argument is zero.
  */
+
+typedef void (*cpuid_dispatcher_t)(unsigned int regs[4]);
+
+static void cpuid_advertise_hw_vendor(unsigned int regs[4]);
+static void cpuid_bhyve_hypercall_enabled(unsigned int regs[4]);
+
+/*
+ * Dispatches the appropriate CPUID handler based on the
+ * index computed by the HC_CPUID_ID macro.  This should be
+ * kept in sync with the allowed hypervisor modes.  Keep
+ * this jump table as generic as possible; when a CPUID leaf
+ * is specific to one hypervisor mode, the naming convention
+ * for the jump table entry is cpuid_<mode>_<functionality>.
+ */
+cpuid_dispatcher_t cpuid_dispatcher[VMM_MAX_MODES][CPUID_HV_SPECIFIC_NUM] = {
+	[BHYVE_MODE] = {
+		[0] = cpuid_advertise_hw_vendor,
+		[1] = cpuid_bhyve_hypercall_enabled
+	}
+};
+
 static __inline int
 log2(u_int x)
 {
@@ -84,6 +123,27 @@
 	return (fls(x << (1 - powerof2(x))) - 1);
 }
 
+static __inline void
+cpuid_dispatch(unsigned int func, unsigned int regs[4])
+{
+	cpuid_dispatcher[hypervisor_mode][HC_CPUID_ID(func)](regs);
+}
+
+static void
+cpuid_advertise_hw_vendor(unsigned int regs[4])
+{
+	regs[0] = CPUID_VM_HIGH;
+	bcopy(hypervisor_id[hypervisor_mode], &regs[1], 4);
+	bcopy(hypervisor_id[hypervisor_mode] + 4, &regs[2], 4);
+	bcopy(hypervisor_id[hypervisor_mode] + 8, &regs[3], 4);
+}
+
+static void
+cpuid_bhyve_hypercall_enabled(unsigned int regs[4])
+{
+	regs[0] = hypercalls_enabled;
+}
+
 int
 x86_emulate_cpuid(struct vm *vm, int vcpu_id,
 		  uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
@@ -463,11 +523,13 @@
 		}
 		break;
 
-	case 0x40000000:
-		regs[0] = CPUID_VM_HIGH;
-		bcopy(bhyve_id, &regs[1], 4);
-		bcopy(bhyve_id + 4, &regs[2], 4);
-		bcopy(bhyve_id + 8, &regs[3], 4);
+	case CPUID_4000_0000:
+	case CPUID_4000_0001:
+		/*
+		 * Each of the hypervisor-specific CPUID leaves should
+		 * be handled by the dispatcher.  No exceptions.
+		 */
+		cpuid_dispatch(func, regs);
 		break;
 
 	default:
Index: sys/kern/subr_param.c
===================================================================
--- sys/kern/subr_param.c
+++ sys/kern/subr_param.c
@@ -149,6 +149,7 @@
 	"hv",
 	"vmware",
 	"kvm",
+	"bhyve",
 	NULL
 };
 CTASSERT(nitems(vm_guest_sysctl_names) - 1 == VM_LAST);
Index: sys/sys/systm.h
===================================================================
--- sys/sys/systm.h
+++ sys/sys/systm.h
@@ -74,7 +74,7 @@
  * Keep in sync with vm_guest_sysctl_names[].
  */
 enum VM_GUEST { VM_GUEST_NO = 0, VM_GUEST_VM, VM_GUEST_XEN, VM_GUEST_HV,
-		VM_GUEST_VMWARE, VM_GUEST_KVM, VM_LAST };
+		VM_GUEST_VMWARE, VM_GUEST_KVM, VM_GUEST_BHYVE, VM_LAST };
 
 #if defined(WITNESS) || defined(INVARIANT_SUPPORT)
 void	kassert_panic(const char *fmt, ...) __printflike(1, 2);
Index: sys/x86/x86/identcpu.c
===================================================================
--- sys/x86/x86/identcpu.c
+++ sys/x86/x86/identcpu.c
@@ -1288,6 +1288,8 @@
 			vm_guest = VM_GUEST_HV;
 		else if (strcmp(hv_vendor, "KVMKVMKVM") == 0)
 			vm_guest = VM_GUEST_KVM;
+		else if (strcmp(hv_vendor, "bhyve bhyve ") == 0)
+			vm_guest = VM_GUEST_BHYVE;
 	}
 	return;
 }
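
Reviewer note on the guest-side calling convention introduced by bhyve_hypercall.h: the hypercall number travels in RAX, the argument count in RBX, each argument is pushed on the stack as a length/value pair that the host reads back as struct hypercall_arg, and the result comes back in RAX. The sketch below is not part of the patch; guest_dtrace_probe_example() and its payload are made up for illustration, while hc_arg_t, hypercall1(), HYPERCALL_DTRACE_PROBE, HYPERCALL_RET_NOT_IMPL and CPUID leaf 0x40000001 come from this change, and do_cpuid() is the stock helper from <machine/cpufunc.h>.

/*
 * Hypothetical guest kernel snippet (not part of this patch) showing how
 * the hypercall ABI above could be exercised once a dispatcher entry for
 * HYPERCALL_DTRACE_PROBE exists.  With this patch alone the bhyve dispatch
 * table is still all NULL, so the host would inject #UD.
 */
#include <sys/types.h>
#include <machine/cpufunc.h>
#include <machine/bhyve_hypercall.h>

static int64_t
guest_dtrace_probe_example(void)
{
	u_int regs[4];
	hc_arg_t arg;
	static char payload[] = "example-probe";	/* made-up probe data */

	/* Leaf 0x40000001 reports hypercalls_enabled in EAX. */
	do_cpuid(0x40000001, regs);
	if (regs[0] == 0)
		return (HYPERCALL_RET_NOT_IMPL);

	/* One { len, val } pair: val is a guest virtual address. */
	arg.val = (__uint64_t)(uintptr_t)payload;
	arg.len = sizeof(payload);

	/* RAX = hypercall number, RBX = nargs, pair pushed on the stack. */
	return (hypercall1(HYPERCALL_DTRACE_PROBE, &arg));
}

Since ring_plevel[BHYVE_MODE][HYPERCALL_DTRACE_PROBE] is 0, such a call would only be accepted when issued from the guest's kernel (ring 0).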
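On the host side, a dispatcher entry would be installed in hc_dispatcher[BHYVE_MODE][...] in vmm.c. The sketch below, again not part of the patch, only shows the shape such an entry could take; bhyve_hc_dtrace_probe() and its buffer handling are hypothetical, while the hc_dispatcher_t signature, hypercall_copy_arg() and the HYPERCALL_RET_* values are the ones added by this change.

/*
 * Hypothetical hc_dispatcher entry (not part of this patch).  It would be
 * registered as hc_dispatcher[BHYVE_MODE][HYPERCALL_DTRACE_PROBE] and runs
 * after bhyve_handle_hypercall() has copied the { len, val } pairs from the
 * guest stack into 'args'.
 */
static int64_t
bhyve_hc_dtrace_probe(struct vm *vm, int vcpuid, struct hypercall_arg *args,
    struct vm_guest_paging *paging)
{
	char buf[128];
	int error;

	if (args[0].len == 0 || args[0].len > sizeof(buf))
		return (HYPERCALL_RET_ERROR);

	/*
	 * args[0].val is a guest virtual address; hypercall_copy_arg()
	 * resolves it through the guest paging state and copies the
	 * buffer in.  A DS base of 0 is assumed for a flat 64-bit guest.
	 */
	error = hypercall_copy_arg(vm, vcpuid, 0, &args[0], paging, buf);
	if (error != 0)
		return (HYPERCALL_RET_ERROR);

	/* Hand 'buf' to the in-kernel DTrace consumer (not shown). */
	return (HYPERCALL_RET_SUCCESS);
}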