Page MenuHomeFreeBSD

No OneTemporary

This file is larger than 256 KB, so syntax highlighting was skipped.
Index: head/lib/libvmmapi/vmmapi.c
===================================================================
--- head/lib/libvmmapi/vmmapi.c (revision 332156)
+++ head/lib/libvmmapi/vmmapi.c (revision 332157)
@@ -1,1519 +1,1551 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>
#include <x86/segments.h>
#include <machine/specialreg.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <libutil.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include "vmmapi.h"
#define MB (1024 * 1024UL)
#define GB (1024 * 1024 * 1024UL)
/*
* Size of the guard region before and after the virtual address space
* mapping the guest physical memory. This must be a multiple of the
* superpage size for performance reasons.
*/
#define VM_MMAP_GUARD_SIZE (4 * MB)
#define PROT_RW (PROT_READ | PROT_WRITE)
#define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC)
struct vmctx {
int fd;
uint32_t lowmem_limit;
int memflags;
size_t lowmem;
size_t highmem;
char *baseaddr;
char *name;
};
#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))
static int
vm_device_open(const char *name)
{
int fd, len;
char *vmfile;
len = strlen("/dev/vmm/") + strlen(name) + 1;
vmfile = malloc(len);
assert(vmfile != NULL);
snprintf(vmfile, len, "/dev/vmm/%s", name);
/* Open the device file */
fd = open(vmfile, O_RDWR, 0);
free(vmfile);
return (fd);
}
int
vm_create(const char *name)
{
return (CREATE((char *)name));
}
struct vmctx *
vm_open(const char *name)
{
struct vmctx *vm;
vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
assert(vm != NULL);
vm->fd = -1;
vm->memflags = 0;
vm->lowmem_limit = 3 * GB;
vm->name = (char *)(vm + 1);
strcpy(vm->name, name);
if ((vm->fd = vm_device_open(vm->name)) < 0)
goto err;
return (vm);
err:
vm_destroy(vm);
return (NULL);
}
void
vm_destroy(struct vmctx *vm)
{
assert(vm != NULL);
if (vm->fd >= 0)
close(vm->fd);
DESTROY(vm->name);
free(vm);
}
int
vm_parse_memsize(const char *optarg, size_t *ret_memsize)
{
char *endptr;
size_t optval;
int error;
optval = strtoul(optarg, &endptr, 0);
if (*optarg != '\0' && *endptr == '\0') {
/*
* For the sake of backward compatibility if the memory size
* specified on the command line is less than a megabyte then
* it is interpreted as being in units of MB.
*/
if (optval < MB)
optval *= MB;
*ret_memsize = optval;
error = 0;
} else
error = expand_number(optarg, ret_memsize);
return (error);
}
uint32_t
vm_get_lowmem_limit(struct vmctx *ctx)
{
return (ctx->lowmem_limit);
}
void
vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit)
{
ctx->lowmem_limit = limit;
}
void
vm_set_memflags(struct vmctx *ctx, int flags)
{
ctx->memflags = flags;
}
int
vm_get_memflags(struct vmctx *ctx)
{
return (ctx->memflags);
}
/*
* Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
*/
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
size_t len, int prot)
{
struct vm_memmap memmap;
int error, flags;
memmap.gpa = gpa;
memmap.segid = segid;
memmap.segoff = off;
memmap.len = len;
memmap.prot = prot;
memmap.flags = 0;
if (ctx->memflags & VM_MEM_F_WIRED)
memmap.flags |= VM_MEMMAP_F_WIRED;
/*
* If this mapping already exists then don't create it again. This
* is the common case for SYSMEM mappings created by bhyveload(8).
*/
error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
if (error == 0 && gpa == memmap.gpa) {
if (segid != memmap.segid || off != memmap.segoff ||
prot != memmap.prot || flags != memmap.flags) {
errno = EEXIST;
return (-1);
} else {
return (0);
}
}
error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
return (error);
}
int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
struct vm_memmap memmap;
int error;
bzero(&memmap, sizeof(struct vm_memmap));
memmap.gpa = *gpa;
error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
if (error == 0) {
*gpa = memmap.gpa;
*segid = memmap.segid;
*segoff = memmap.segoff;
*len = memmap.len;
*prot = memmap.prot;
*flags = memmap.flags;
}
return (error);
}
/*
* Return 0 if the segments are identical and non-zero otherwise.
*
* This is slightly complicated by the fact that only device memory segments
* are named.
*/
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{
if (len == len2) {
if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
return (0);
}
return (-1);
}
static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
struct vm_memseg memseg;
size_t n;
int error;
/*
* If the memory segment has already been created then just return.
* This is the usual case for the SYSMEM segment created by userspace
* loaders like bhyveload(8).
*/
error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
sizeof(memseg.name));
if (error)
return (error);
if (memseg.len != 0) {
if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
errno = EINVAL;
return (-1);
} else {
return (0);
}
}
bzero(&memseg, sizeof(struct vm_memseg));
memseg.segid = segid;
memseg.len = len;
if (name != NULL) {
n = strlcpy(memseg.name, name, sizeof(memseg.name));
if (n >= sizeof(memseg.name)) {
errno = ENAMETOOLONG;
return (-1);
}
}
error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
return (error);
}
int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
size_t bufsize)
{
struct vm_memseg memseg;
size_t n;
int error;
memseg.segid = segid;
error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
if (error == 0) {
*lenp = memseg.len;
n = strlcpy(namebuf, memseg.name, bufsize);
if (n >= bufsize) {
errno = ENAMETOOLONG;
error = -1;
}
}
return (error);
}
static int
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
{
char *ptr;
int error, flags;
/* Map 'len' bytes starting at 'gpa' in the guest address space */
error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
if (error)
return (error);
flags = MAP_SHARED | MAP_FIXED;
if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
flags |= MAP_NOCORE;
/* mmap into the process address space on the host */
ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
if (ptr == MAP_FAILED)
return (-1);
return (0);
}
int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
size_t objsize, len;
vm_paddr_t gpa;
char *baseaddr, *ptr;
int error, flags;
assert(vms == VM_MMAP_ALL);
/*
* If 'memsize' cannot fit entirely in the 'lowmem' segment then
* create another 'highmem' segment above 4GB for the remainder.
*/
if (memsize > ctx->lowmem_limit) {
ctx->lowmem = ctx->lowmem_limit;
ctx->highmem = memsize - ctx->lowmem_limit;
objsize = 4*GB + ctx->highmem;
} else {
ctx->lowmem = memsize;
ctx->highmem = 0;
objsize = ctx->lowmem;
}
error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
if (error)
return (error);
/*
* Stake out a contiguous region covering the guest physical memory
* and the adjoining guard regions.
*/
len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER;
ptr = mmap(NULL, len, PROT_NONE, flags, -1, 0);
if (ptr == MAP_FAILED)
return (-1);
baseaddr = ptr + VM_MMAP_GUARD_SIZE;
if (ctx->highmem > 0) {
gpa = 4*GB;
len = ctx->highmem;
error = setup_memory_segment(ctx, gpa, len, baseaddr);
if (error)
return (error);
}
if (ctx->lowmem > 0) {
gpa = 0;
len = ctx->lowmem;
error = setup_memory_segment(ctx, gpa, len, baseaddr);
if (error)
return (error);
}
ctx->baseaddr = baseaddr;
return (0);
}
/*
* Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
* the lowmem or highmem regions.
*
* In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region.
* The instruction emulation code depends on this behavior.
*/
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
if (ctx->lowmem > 0) {
if (gaddr < ctx->lowmem && len <= ctx->lowmem &&
gaddr + len <= ctx->lowmem)
return (ctx->baseaddr + gaddr);
}
if (ctx->highmem > 0) {
if (gaddr >= 4*GB) {
if (gaddr < 4*GB + ctx->highmem &&
len <= ctx->highmem &&
gaddr + len <= 4*GB + ctx->highmem)
return (ctx->baseaddr + gaddr);
}
}
return (NULL);
}
size_t
vm_get_lowmem_size(struct vmctx *ctx)
{
return (ctx->lowmem);
}
size_t
vm_get_highmem_size(struct vmctx *ctx)
{
return (ctx->highmem);
}
void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
char pathname[MAXPATHLEN];
size_t len2;
char *base, *ptr;
int fd, error, flags;
fd = -1;
ptr = MAP_FAILED;
if (name == NULL || strlen(name) == 0) {
errno = EINVAL;
goto done;
}
error = vm_alloc_memseg(ctx, segid, len, name);
if (error)
goto done;
strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
strlcat(pathname, ctx->name, sizeof(pathname));
strlcat(pathname, ".", sizeof(pathname));
strlcat(pathname, name, sizeof(pathname));
fd = open(pathname, O_RDWR);
if (fd < 0)
goto done;
/*
* Stake out a contiguous region covering the device memory and the
* adjoining guard regions.
*/
len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER;
base = mmap(NULL, len2, PROT_NONE, flags, -1, 0);
if (base == MAP_FAILED)
goto done;
flags = MAP_SHARED | MAP_FIXED;
if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
flags |= MAP_NOCORE;
/* mmap the devmem region in the host address space */
ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
if (fd >= 0)
close(fd);
return (ptr);
}
int
vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t base, uint32_t limit, uint32_t access)
{
int error;
struct vm_seg_desc vmsegdesc;
bzero(&vmsegdesc, sizeof(vmsegdesc));
vmsegdesc.cpuid = vcpu;
vmsegdesc.regnum = reg;
vmsegdesc.desc.base = base;
vmsegdesc.desc.limit = limit;
vmsegdesc.desc.access = access;
error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc);
return (error);
}
int
vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t *base, uint32_t *limit, uint32_t *access)
{
int error;
struct vm_seg_desc vmsegdesc;
bzero(&vmsegdesc, sizeof(vmsegdesc));
vmsegdesc.cpuid = vcpu;
vmsegdesc.regnum = reg;
error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc);
if (error == 0) {
*base = vmsegdesc.desc.base;
*limit = vmsegdesc.desc.limit;
*access = vmsegdesc.desc.access;
}
return (error);
}
int
vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc)
{
int error;
error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit,
&seg_desc->access);
return (error);
}
int
vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
int error;
struct vm_register vmreg;
bzero(&vmreg, sizeof(vmreg));
vmreg.cpuid = vcpu;
vmreg.regnum = reg;
vmreg.regval = val;
error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg);
return (error);
}
int
vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val)
{
int error;
struct vm_register vmreg;
bzero(&vmreg, sizeof(vmreg));
vmreg.cpuid = vcpu;
vmreg.regnum = reg;
error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg);
*ret_val = vmreg.regval;
return (error);
}
int
vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count,
const int *regnums, uint64_t *regvals)
{
int error;
struct vm_register_set vmregset;
bzero(&vmregset, sizeof(vmregset));
vmregset.cpuid = vcpu;
vmregset.count = count;
vmregset.regnums = regnums;
vmregset.regvals = regvals;
error = ioctl(ctx->fd, VM_SET_REGISTER_SET, &vmregset);
return (error);
}
int
vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count,
const int *regnums, uint64_t *regvals)
{
int error;
struct vm_register_set vmregset;
bzero(&vmregset, sizeof(vmregset));
vmregset.cpuid = vcpu;
vmregset.count = count;
vmregset.regnums = regnums;
vmregset.regvals = regvals;
error = ioctl(ctx->fd, VM_GET_REGISTER_SET, &vmregset);
return (error);
}
int
vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit)
{
int error;
struct vm_run vmrun;
bzero(&vmrun, sizeof(vmrun));
vmrun.cpuid = vcpu;
error = ioctl(ctx->fd, VM_RUN, &vmrun);
bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit));
return (error);
}
int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
struct vm_suspend vmsuspend;
bzero(&vmsuspend, sizeof(vmsuspend));
vmsuspend.how = how;
return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}
int
vm_reinit(struct vmctx *ctx)
{
return (ioctl(ctx->fd, VM_REINIT, 0));
}
int
vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid,
uint32_t errcode, int restart_instruction)
{
struct vm_exception exc;
exc.cpuid = vcpu;
exc.vector = vector;
exc.error_code = errcode;
exc.error_code_valid = errcode_valid;
exc.restart_instruction = restart_instruction;
return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc));
}
int
vm_apicid2vcpu(struct vmctx *ctx, int apicid)
{
/*
* The apic id associated with the 'vcpu' has the same numerical value
* as the 'vcpu' itself.
*/
return (apicid);
}
int
vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector)
{
struct vm_lapic_irq vmirq;
bzero(&vmirq, sizeof(vmirq));
vmirq.cpuid = vcpu;
vmirq.vector = vector;
return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq));
}
int
vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector)
{
struct vm_lapic_irq vmirq;
bzero(&vmirq, sizeof(vmirq));
vmirq.cpuid = vcpu;
vmirq.vector = vector;
return (ioctl(ctx->fd, VM_LAPIC_LOCAL_IRQ, &vmirq));
}
int
vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg)
{
struct vm_lapic_msi vmmsi;
bzero(&vmmsi, sizeof(vmmsi));
vmmsi.addr = addr;
vmmsi.msg = msg;
return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi));
}
int
vm_ioapic_assert_irq(struct vmctx *ctx, int irq)
{
struct vm_ioapic_irq ioapic_irq;
bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq));
ioapic_irq.irq = irq;
return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq));
}
int
vm_ioapic_deassert_irq(struct vmctx *ctx, int irq)
{
struct vm_ioapic_irq ioapic_irq;
bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq));
ioapic_irq.irq = irq;
return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq));
}
int
vm_ioapic_pulse_irq(struct vmctx *ctx, int irq)
{
struct vm_ioapic_irq ioapic_irq;
bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq));
ioapic_irq.irq = irq;
return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq));
}
int
vm_ioapic_pincount(struct vmctx *ctx, int *pincount)
{
return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount));
}
int
vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
{
struct vm_isa_irq isa_irq;
bzero(&isa_irq, sizeof(struct vm_isa_irq));
isa_irq.atpic_irq = atpic_irq;
isa_irq.ioapic_irq = ioapic_irq;
return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq));
}
int
vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
{
struct vm_isa_irq isa_irq;
bzero(&isa_irq, sizeof(struct vm_isa_irq));
isa_irq.atpic_irq = atpic_irq;
isa_irq.ioapic_irq = ioapic_irq;
return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq));
}
int
vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
{
struct vm_isa_irq isa_irq;
bzero(&isa_irq, sizeof(struct vm_isa_irq));
isa_irq.atpic_irq = atpic_irq;
isa_irq.ioapic_irq = ioapic_irq;
return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq));
}
int
vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq,
enum vm_intr_trigger trigger)
{
struct vm_isa_irq_trigger isa_irq_trigger;
bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger));
isa_irq_trigger.atpic_irq = atpic_irq;
isa_irq_trigger.trigger = trigger;
return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger));
}
int
vm_inject_nmi(struct vmctx *ctx, int vcpu)
{
struct vm_nmi vmnmi;
bzero(&vmnmi, sizeof(vmnmi));
vmnmi.cpuid = vcpu;
return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi));
}
static struct {
const char *name;
int type;
} capstrmap[] = {
{ "hlt_exit", VM_CAP_HALT_EXIT },
{ "mtrap_exit", VM_CAP_MTRAP_EXIT },
{ "pause_exit", VM_CAP_PAUSE_EXIT },
{ "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST },
{ "enable_invpcid", VM_CAP_ENABLE_INVPCID },
{ 0 }
};
int
vm_capability_name2type(const char *capname)
{
int i;
for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) {
if (strcmp(capstrmap[i].name, capname) == 0)
return (capstrmap[i].type);
}
return (-1);
}
const char *
vm_capability_type2name(int type)
{
int i;
for (i = 0; capstrmap[i].name != NULL; i++) {
if (capstrmap[i].type == type)
return (capstrmap[i].name);
}
return (NULL);
}
int
vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
int *retval)
{
int error;
struct vm_capability vmcap;
bzero(&vmcap, sizeof(vmcap));
vmcap.cpuid = vcpu;
vmcap.captype = cap;
error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap);
*retval = vmcap.capval;
return (error);
}
int
vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val)
{
struct vm_capability vmcap;
bzero(&vmcap, sizeof(vmcap));
vmcap.cpuid = vcpu;
vmcap.captype = cap;
vmcap.capval = val;
return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap));
}
int
vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
{
struct vm_pptdev pptdev;
bzero(&pptdev, sizeof(pptdev));
pptdev.bus = bus;
pptdev.slot = slot;
pptdev.func = func;
return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev));
}
int
vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
{
struct vm_pptdev pptdev;
bzero(&pptdev, sizeof(pptdev));
pptdev.bus = bus;
pptdev.slot = slot;
pptdev.func = func;
return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev));
}
int
vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
struct vm_pptdev_mmio pptmmio;
bzero(&pptmmio, sizeof(pptmmio));
pptmmio.bus = bus;
pptmmio.slot = slot;
pptmmio.func = func;
pptmmio.gpa = gpa;
pptmmio.len = len;
pptmmio.hpa = hpa;
return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio));
}
int
vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
uint64_t addr, uint64_t msg, int numvec)
{
struct vm_pptdev_msi pptmsi;
bzero(&pptmsi, sizeof(pptmsi));
pptmsi.vcpu = vcpu;
pptmsi.bus = bus;
pptmsi.slot = slot;
pptmsi.func = func;
pptmsi.msg = msg;
pptmsi.addr = addr;
pptmsi.numvec = numvec;
return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
}
int
vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
{
struct vm_pptdev_msix pptmsix;
bzero(&pptmsix, sizeof(pptmsix));
pptmsix.vcpu = vcpu;
pptmsix.bus = bus;
pptmsix.slot = slot;
pptmsix.func = func;
pptmsix.idx = idx;
pptmsix.msg = msg;
pptmsix.addr = addr;
pptmsix.vector_control = vector_control;
return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix);
}
uint64_t *
vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
int *ret_entries)
{
int error;
static struct vm_stats vmstats;
vmstats.cpuid = vcpu;
error = ioctl(ctx->fd, VM_STATS, &vmstats);
if (error == 0) {
if (ret_entries)
*ret_entries = vmstats.num_entries;
if (ret_tv)
*ret_tv = vmstats.tv;
return (vmstats.statbuf);
} else
return (NULL);
}
const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
static struct vm_stat_desc statdesc;
statdesc.index = index;
if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
return (statdesc.desc);
else
return (NULL);
}
int
vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state)
{
int error;
struct vm_x2apic x2apic;
bzero(&x2apic, sizeof(x2apic));
x2apic.cpuid = vcpu;
error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic);
*state = x2apic.state;
return (error);
}
int
vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state)
{
int error;
struct vm_x2apic x2apic;
bzero(&x2apic, sizeof(x2apic));
x2apic.cpuid = vcpu;
x2apic.state = state;
error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic);
return (error);
}
/*
* From Intel Vol 3a:
* Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT
*/
int
vcpu_reset(struct vmctx *vmctx, int vcpu)
{
int error;
uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx;
uint32_t desc_access, desc_limit;
uint16_t sel;
zero = 0;
rflags = 0x2;
error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
if (error)
goto done;
rip = 0xfff0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
goto done;
cr0 = CR0_NE;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0)
goto done;
cr4 = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
goto done;
/*
* CS: present, r/w, accessed, 16-bit, byte granularity, usable
*/
desc_base = 0xffff0000;
desc_limit = 0xffff;
desc_access = 0x0093;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
sel = 0xf000;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0)
goto done;
/*
* SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity
*/
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0x0093;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
desc_base, desc_limit, desc_access);
if (error)
goto done;
sel = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0)
goto done;
/* General purpose registers */
rdx = 0xf00;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0)
goto done;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0)
goto done;
/* GDTR, IDTR */
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
desc_base, desc_limit, desc_access);
if (error != 0)
goto done;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR,
desc_base, desc_limit, desc_access);
if (error != 0)
goto done;
/* TR */
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0x0000008b;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
if (error)
goto done;
sel = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0)
goto done;
/* LDTR */
desc_base = 0;
desc_limit = 0xffff;
desc_access = 0x00000082;
error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base,
desc_limit, desc_access);
if (error)
goto done;
sel = 0;
if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
goto done;
/* XXX cr2, debug registers */
error = 0;
done:
return (error);
}
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
int error, i;
struct vm_gpa_pte gpapte;
bzero(&gpapte, sizeof(gpapte));
gpapte.gpa = gpa;
error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);
if (error == 0) {
*num = gpapte.ptenum;
for (i = 0; i < gpapte.ptenum; i++)
pte[i] = gpapte.pte[i];
}
return (error);
}
int
vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities)
{
int error;
struct vm_hpet_cap cap;
bzero(&cap, sizeof(struct vm_hpet_cap));
error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap);
if (capabilities != NULL)
*capabilities = cap.capabilities;
return (error);
}
int
vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
struct vm_gla2gpa gg;
int error;
bzero(&gg, sizeof(struct vm_gla2gpa));
gg.vcpuid = vcpu;
gg.prot = prot;
gg.gla = gla;
gg.paging = *paging;
error = ioctl(ctx->fd, VM_GLA2GPA, &gg);
if (error == 0) {
*fault = gg.fault;
*gpa = gg.gpa;
}
return (error);
}
int
vm_gla2gpa_nofault(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
struct vm_gla2gpa gg;
int error;
bzero(&gg, sizeof(struct vm_gla2gpa));
gg.vcpuid = vcpu;
gg.prot = prot;
gg.gla = gla;
gg.paging = *paging;
error = ioctl(ctx->fd, VM_GLA2GPA_NOFAULT, &gg);
if (error == 0) {
*fault = gg.fault;
*gpa = gg.gpa;
}
return (error);
}
#ifndef min
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif
int
vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
int *fault)
{
void *va;
uint64_t gpa;
int error, i, n, off;
for (i = 0; i < iovcnt; i++) {
iov[i].iov_base = 0;
iov[i].iov_len = 0;
}
while (len) {
assert(iovcnt > 0);
error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault);
if (error || *fault)
return (error);
off = gpa & PAGE_MASK;
n = min(len, PAGE_SIZE - off);
va = vm_map_gpa(ctx, gpa, n);
if (va == NULL)
return (EFAULT);
iov->iov_base = va;
iov->iov_len = n;
iov++;
iovcnt--;
gla += n;
len -= n;
}
return (0);
}
void
vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt)
{
return;
}
void
vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len)
{
const char *src;
char *dst;
size_t n;
dst = vp;
while (len) {
assert(iov->iov_len);
n = min(len, iov->iov_len);
src = iov->iov_base;
bcopy(src, dst, n);
iov++;
dst += n;
len -= n;
}
}
void
vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov,
size_t len)
{
const char *src;
char *dst;
size_t n;
src = vp;
while (len) {
assert(iov->iov_len);
n = min(len, iov->iov_len);
dst = iov->iov_base;
bcopy(src, dst, n);
iov++;
src += n;
len -= n;
}
}
static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
struct vm_cpuset vm_cpuset;
int error;
bzero(&vm_cpuset, sizeof(struct vm_cpuset));
vm_cpuset.which = which;
vm_cpuset.cpusetsize = sizeof(cpuset_t);
vm_cpuset.cpus = cpus;
error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
return (error);
}
int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{
return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}
int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{
return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}
int
+vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
+{
+
+ return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
+}
+
+int
vm_activate_cpu(struct vmctx *ctx, int vcpu)
{
struct vm_activate_cpu ac;
int error;
bzero(&ac, sizeof(struct vm_activate_cpu));
ac.vcpuid = vcpu;
error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac);
return (error);
}
int
+vm_suspend_cpu(struct vmctx *ctx, int vcpu)
+{
+ struct vm_activate_cpu ac;
+ int error;
+
+ bzero(&ac, sizeof(struct vm_activate_cpu));
+ ac.vcpuid = vcpu;
+ error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
+ return (error);
+}
+
+int
+vm_resume_cpu(struct vmctx *ctx, int vcpu)
+{
+ struct vm_activate_cpu ac;
+ int error;
+
+ bzero(&ac, sizeof(struct vm_activate_cpu));
+ ac.vcpuid = vcpu;
+ error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
+ return (error);
+}
+
+int
vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2)
{
struct vm_intinfo vmii;
int error;
bzero(&vmii, sizeof(struct vm_intinfo));
vmii.vcpuid = vcpu;
error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii);
if (error == 0) {
*info1 = vmii.info1;
*info2 = vmii.info2;
}
return (error);
}
int
vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1)
{
struct vm_intinfo vmii;
int error;
bzero(&vmii, sizeof(struct vm_intinfo));
vmii.vcpuid = vcpu;
vmii.info1 = info1;
error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii);
return (error);
}
int
vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value)
{
struct vm_rtc_data rtcdata;
int error;
bzero(&rtcdata, sizeof(struct vm_rtc_data));
rtcdata.offset = offset;
rtcdata.value = value;
error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata);
return (error);
}
int
vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval)
{
struct vm_rtc_data rtcdata;
int error;
bzero(&rtcdata, sizeof(struct vm_rtc_data));
rtcdata.offset = offset;
error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata);
if (error == 0)
*retval = rtcdata.value;
return (error);
}
int
vm_rtc_settime(struct vmctx *ctx, time_t secs)
{
struct vm_rtc_time rtctime;
int error;
bzero(&rtctime, sizeof(struct vm_rtc_time));
rtctime.secs = secs;
error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime);
return (error);
}
int
vm_rtc_gettime(struct vmctx *ctx, time_t *secs)
{
struct vm_rtc_time rtctime;
int error;
bzero(&rtctime, sizeof(struct vm_rtc_time));
error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime);
if (error == 0)
*secs = rtctime.secs;
return (error);
}
int
vm_restart_instruction(void *arg, int vcpu)
{
struct vmctx *ctx = arg;
return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu));
}
int
vm_get_device_fd(struct vmctx *ctx)
{
return (ctx->fd);
}
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
cap_ioctl_t *cmds;
/* keep in sync with machine/vmm_dev.h */
static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT,
VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_MEMSEG,
VM_MMAP_GETNEXT, VM_SET_REGISTER, VM_GET_REGISTER,
VM_SET_SEGMENT_DESCRIPTOR, VM_GET_SEGMENT_DESCRIPTOR,
VM_SET_REGISTER_SET, VM_GET_REGISTER_SET,
VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ,
VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ,
VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ,
VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER,
VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV,
VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI,
VM_PPTDEV_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC,
VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE,
VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA,
VM_GLA2GPA_NOFAULT,
- VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SET_INTINFO, VM_GET_INTINFO,
+ VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU,
+ VM_SET_INTINFO, VM_GET_INTINFO,
VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME,
VM_RESTART_INSTRUCTION };
if (len == NULL) {
cmds = malloc(sizeof(vm_ioctl_cmds));
if (cmds == NULL)
return (NULL);
bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds));
return (cmds);
}
*len = nitems(vm_ioctl_cmds);
return (NULL);
}
Index: head/lib/libvmmapi/vmmapi.h
===================================================================
--- head/lib/libvmmapi/vmmapi.h (revision 332156)
+++ head/lib/libvmmapi/vmmapi.h (revision 332157)
@@ -1,231 +1,234 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMMAPI_H_
#define _VMMAPI_H_
#include <sys/param.h>
#include <sys/cpuset.h>
/*
* API version for out-of-tree consumers like grub-bhyve for making compile
* time decisions.
*/
#define VMMAPI_VERSION 0103 /* 2 digit major followed by 2 digit minor */
struct iovec;
struct vmctx;
enum x2apic_state;
/*
* Different styles of mapping the memory assigned to a VM into the address
* space of the controlling process.
*/
enum vm_mmap_style {
VM_MMAP_NONE, /* no mapping */
VM_MMAP_ALL, /* fully and statically mapped */
VM_MMAP_SPARSE, /* mappings created on-demand */
};
/*
* 'flags' value passed to 'vm_set_memflags()'.
*/
#define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */
#define VM_MEM_F_WIRED 0x02 /* guest memory is wired */
/*
* Identifiers for memory segments:
* - vm_setup_memory() uses VM_SYSMEM for the system memory segment.
* - the remaining identifiers can be used to create devmem segments.
*/
enum {
VM_SYSMEM,
VM_BOOTROM,
VM_FRAMEBUFFER,
};
/*
* Get the length and name of the memory segment identified by 'segid'.
* Note that system memory segments are identified with a nul name.
*
* Returns 0 on success and non-zero otherwise.
*/
int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name,
size_t namesiz);
/*
* Iterate over the guest address space. This function finds an address range
* that starts at an address >= *gpa.
*
* Returns 0 if the next address range was found and non-zero otherwise.
*/
int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
/*
* Create a device memory segment identified by 'segid'.
*
* Returns a pointer to the memory segment on success and MAP_FAILED otherwise.
*/
void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name,
size_t len);
/*
* Map the memory segment identified by 'segid' into the guest address space
* at [gpa,gpa+len) with protection 'prot'.
*/
int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid,
vm_ooffset_t segoff, size_t len, int prot);
int vm_create(const char *name);
int vm_get_device_fd(struct vmctx *ctx);
struct vmctx *vm_open(const char *name);
void vm_destroy(struct vmctx *ctx);
int vm_parse_memsize(const char *optarg, size_t *memsize);
int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging,
uint64_t gla, int prot, uint64_t *gpa, int *fault);
int vm_gla2gpa_nofault(struct vmctx *, int vcpuid,
struct vm_guest_paging *paging, uint64_t gla, int prot,
uint64_t *gpa, int *fault);
uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
void vm_set_memflags(struct vmctx *ctx, int flags);
int vm_get_memflags(struct vmctx *ctx);
size_t vm_get_lowmem_size(struct vmctx *ctx);
size_t vm_get_highmem_size(struct vmctx *ctx);
int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t base, uint32_t limit, uint32_t access);
int vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t *base, uint32_t *limit, uint32_t *access);
int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg,
struct seg_desc *seg_desc);
int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count,
const int *regnums, uint64_t *regvals);
int vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count,
const int *regnums, uint64_t *regvals);
int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit);
int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how);
int vm_reinit(struct vmctx *ctx);
int vm_apicid2vcpu(struct vmctx *ctx, int apicid);
int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector,
int errcode_valid, uint32_t errcode, int restart_instruction);
int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector);
int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg);
int vm_ioapic_assert_irq(struct vmctx *ctx, int irq);
int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq);
int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq);
int vm_ioapic_pincount(struct vmctx *ctx, int *pincount);
int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq,
enum vm_intr_trigger trigger);
int vm_inject_nmi(struct vmctx *ctx, int vcpu);
int vm_capability_name2type(const char *capname);
const char *vm_capability_type2name(int type);
int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
int *retval);
int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
int val);
int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot,
int func, uint64_t addr, uint64_t msg, int numvec);
int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot,
int func, int idx, uint64_t addr, uint64_t msg,
uint32_t vector_control);
int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2);
int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo);
const cap_ioctl_t *vm_get_ioctls(size_t *len);
/*
* Return a pointer to the statistics buffer. Note that this is not MT-safe.
*/
uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
int *ret_entries);
const char *vm_get_stat_desc(struct vmctx *ctx, int index);
int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s);
int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s);
int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);
/*
* Translate the GLA range [gla,gla+len) into GPA segments in 'iov'.
* The 'iovcnt' should be big enough to accommodate all GPA segments.
*
* retval fault Interpretation
* 0 0 Success
* 0 1 An exception was injected into the guest
* EFAULT N/A Error
*/
int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg,
uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
int *fault);
void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
void *host_dst, size_t len);
void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
struct iovec *guest_iov, size_t len);
void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov,
int iovcnt);
/* RTC */
int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value);
int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval);
int vm_rtc_settime(struct vmctx *ctx, time_t secs);
int vm_rtc_gettime(struct vmctx *ctx, time_t *secs);
/* Reset vcpu register state */
int vcpu_reset(struct vmctx *ctx, int vcpu);
int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus);
int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus);
+int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus);
int vm_activate_cpu(struct vmctx *ctx, int vcpu);
+int vm_suspend_cpu(struct vmctx *ctx, int vcpu);
+int vm_resume_cpu(struct vmctx *ctx, int vcpu);
/*
* FreeBSD specific APIs
*/
int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu,
uint64_t rip, uint64_t cr3, uint64_t gdtbase,
uint64_t rsp);
int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu,
uint32_t eip, uint32_t gdtbase,
uint32_t esp);
void vm_setup_freebsd_gdt(uint64_t *gdtr);
#endif /* _VMMAPI_H_ */
Index: head/sys/amd64/include/vmm.h
===================================================================
--- head/sys/amd64/include/vmm.h (revision 332156)
+++ head/sys/amd64/include/vmm.h (revision 332157)
@@ -1,683 +1,690 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_H_
#define _VMM_H_
#include <x86/segments.h>
enum vm_suspend_how {
VM_SUSPEND_NONE,
VM_SUSPEND_RESET,
VM_SUSPEND_POWEROFF,
VM_SUSPEND_HALT,
VM_SUSPEND_TRIPLEFAULT,
VM_SUSPEND_LAST
};
/*
* Identifiers for architecturally defined registers.
*/
enum vm_reg_name {
VM_REG_GUEST_RAX,
VM_REG_GUEST_RBX,
VM_REG_GUEST_RCX,
VM_REG_GUEST_RDX,
VM_REG_GUEST_RSI,
VM_REG_GUEST_RDI,
VM_REG_GUEST_RBP,
VM_REG_GUEST_R8,
VM_REG_GUEST_R9,
VM_REG_GUEST_R10,
VM_REG_GUEST_R11,
VM_REG_GUEST_R12,
VM_REG_GUEST_R13,
VM_REG_GUEST_R14,
VM_REG_GUEST_R15,
VM_REG_GUEST_CR0,
VM_REG_GUEST_CR3,
VM_REG_GUEST_CR4,
VM_REG_GUEST_DR7,
VM_REG_GUEST_RSP,
VM_REG_GUEST_RIP,
VM_REG_GUEST_RFLAGS,
VM_REG_GUEST_ES,
VM_REG_GUEST_CS,
VM_REG_GUEST_SS,
VM_REG_GUEST_DS,
VM_REG_GUEST_FS,
VM_REG_GUEST_GS,
VM_REG_GUEST_LDTR,
VM_REG_GUEST_TR,
VM_REG_GUEST_IDTR,
VM_REG_GUEST_GDTR,
VM_REG_GUEST_EFER,
VM_REG_GUEST_CR2,
VM_REG_GUEST_PDPTE0,
VM_REG_GUEST_PDPTE1,
VM_REG_GUEST_PDPTE2,
VM_REG_GUEST_PDPTE3,
VM_REG_GUEST_INTR_SHADOW,
VM_REG_GUEST_DR0,
VM_REG_GUEST_DR1,
VM_REG_GUEST_DR2,
VM_REG_GUEST_DR3,
VM_REG_GUEST_DR6,
VM_REG_LAST
};
enum x2apic_state {
X2APIC_DISABLED,
X2APIC_ENABLED,
X2APIC_STATE_LAST
};
#define VM_INTINFO_VECTOR(info) ((info) & 0xff)
#define VM_INTINFO_DEL_ERRCODE 0x800
#define VM_INTINFO_RSVD 0x7ffff000
#define VM_INTINFO_VALID 0x80000000
#define VM_INTINFO_TYPE 0x700
#define VM_INTINFO_HWINTR (0 << 8)
#define VM_INTINFO_NMI (2 << 8)
#define VM_INTINFO_HWEXCEPTION (3 << 8)
#define VM_INTINFO_SWINTR (4 << 8)
#ifdef _KERNEL
#define VM_MAX_NAMELEN 32
struct vm;
struct vm_exception;
struct seg_desc;
struct vm_exit;
struct vm_run;
struct vhpet;
struct vioapic;
struct vlapic;
struct vmspace;
struct vm_object;
struct vm_guest_paging;
struct pmap;
struct vm_eventinfo {
void *rptr; /* rendezvous cookie */
int *sptr; /* suspend cookie */
int *iptr; /* reqidle cookie */
};
typedef int (*vmm_init_func_t)(int ipinum);
typedef int (*vmm_cleanup_func_t)(void);
typedef void (*vmm_resume_func_t)(void);
typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
struct pmap *pmap, struct vm_eventinfo *info);
typedef void (*vmi_cleanup_func_t)(void *vmi);
typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
uint64_t *retval);
typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
uint64_t val);
typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
struct seg_desc *desc);
typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
struct seg_desc *desc);
typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu);
typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic);
struct vmm_ops {
vmm_init_func_t init; /* module wide initialization */
vmm_cleanup_func_t cleanup;
vmm_resume_func_t resume;
vmi_init_func_t vminit; /* vm-specific initialization */
vmi_run_func_t vmrun;
vmi_cleanup_func_t vmcleanup;
vmi_get_register_t vmgetreg;
vmi_set_register_t vmsetreg;
vmi_get_desc_t vmgetdesc;
vmi_set_desc_t vmsetdesc;
vmi_get_cap_t vmgetcap;
vmi_set_cap_t vmsetcap;
vmi_vmspace_alloc vmspace_alloc;
vmi_vmspace_free vmspace_free;
vmi_vlapic_init vlapic_init;
vmi_vlapic_cleanup vlapic_cleanup;
};
extern struct vmm_ops vmm_ops_intel;
extern struct vmm_ops vmm_ops_amd;
int vm_create(const char *name, struct vm **retvm);
void vm_destroy(struct vm *vm);
int vm_reinit(struct vm *vm);
const char *vm_name(struct vm *vm);
/*
* APIs that modify the guest memory map require all vcpus to be frozen.
*/
int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off,
size_t len, int prot, int flags);
int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem);
void vm_free_memseg(struct vm *vm, int ident);
int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
/*
* APIs that inspect the guest memory map require only a *single* vcpu to
* be frozen. This acts like a read lock on the guest memory map since any
* modification requires *all* vcpus to be frozen.
*/
int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
struct vm_object **objptr);
void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len,
int prot, void **cookie);
void vm_gpa_release(void *cookie);
bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa);
int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
struct seg_desc *ret_desc);
int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
struct seg_desc *desc);
int vm_run(struct vm *vm, struct vm_run *vmrun);
int vm_suspend(struct vm *vm, enum vm_suspend_how how);
int vm_inject_nmi(struct vm *vm, int vcpu);
int vm_nmi_pending(struct vm *vm, int vcpuid);
void vm_nmi_clear(struct vm *vm, int vcpuid);
int vm_inject_extint(struct vm *vm, int vcpu);
int vm_extint_pending(struct vm *vm, int vcpuid);
void vm_extint_clear(struct vm *vm, int vcpuid);
struct vlapic *vm_lapic(struct vm *vm, int cpu);
struct vioapic *vm_ioapic(struct vm *vm);
struct vhpet *vm_hpet(struct vm *vm);
int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
int vm_apicid2vcpuid(struct vm *vm, int apicid);
int vm_activate_cpu(struct vm *vm, int vcpu);
+int vm_suspend_cpu(struct vm *vm, int vcpu);
+int vm_resume_cpu(struct vm *vm, int vcpu);
struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip);
+void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip);
#ifdef _SYS__CPUSET_H_
/*
* Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'.
* The rendezvous 'func(arg)' is not allowed to do anything that will
* cause the thread to be put to sleep.
*
* If the rendezvous is being initiated from a vcpu context then the
* 'vcpuid' must refer to that vcpu, otherwise it should be set to -1.
*
* The caller cannot hold any locks when initiating the rendezvous.
*
* The implementation of this API may cause vcpus other than those specified
* by 'dest' to be stalled. The caller should not rely on any vcpus making
* forward progress when the rendezvous is in progress.
*/
typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg);
void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
vm_rendezvous_func_t func, void *arg);
cpuset_t vm_active_cpus(struct vm *vm);
+cpuset_t vm_debug_cpus(struct vm *vm);
cpuset_t vm_suspended_cpus(struct vm *vm);
#endif /* _SYS__CPUSET_H_ */
static __inline int
vcpu_rendezvous_pending(struct vm_eventinfo *info)
{
return (*((uintptr_t *)(info->rptr)) != 0);
}
static __inline int
vcpu_suspended(struct vm_eventinfo *info)
{
return (*info->sptr);
}
static __inline int
vcpu_reqidle(struct vm_eventinfo *info)
{
return (*info->iptr);
}
+int vcpu_debugged(struct vm *vm, int vcpuid);
+
/*
* Return 1 if device indicated by bus/slot/func is supposed to be a
* pci passthrough device.
*
* Return 0 otherwise.
*/
int vmm_is_pptdev(int bus, int slot, int func);
void *vm_iommu_domain(struct vm *vm);
enum vcpu_state {
VCPU_IDLE,
VCPU_FROZEN,
VCPU_RUNNING,
VCPU_SLEEPING,
};
int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state,
bool from_idle);
enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu);
static int __inline
vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
{
return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
}
#ifdef _SYS_PROC_H_
static int __inline
vcpu_should_yield(struct vm *vm, int vcpu)
{
if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED))
return (1);
else if (curthread->td_owepreempt)
return (1);
else
return (0);
}
#endif
void *vcpu_stats(struct vm *vm, int vcpu);
void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
struct vmspace *vm_get_vmspace(struct vm *vm);
struct vatpic *vm_atpic(struct vm *vm);
struct vatpit *vm_atpit(struct vm *vm);
struct vpmtmr *vm_pmtmr(struct vm *vm);
struct vrtc *vm_rtc(struct vm *vm);
/*
* Inject exception 'vector' into the guest vcpu. This function returns 0 on
* success and non-zero on failure.
*
* Wrapper functions like 'vm_inject_gp()' should be preferred to calling
* this function directly because they enforce the trap-like or fault-like
* behavior of an exception.
*
* This function should only be called in the context of the thread that is
* executing this vcpu.
*/
int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid,
uint32_t errcode, int restart_instruction);
/*
* This function is called after a VM-exit that occurred during exception or
* interrupt delivery through the IDT. The format of 'intinfo' is described
* in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.
*
* If a VM-exit handler completes the event delivery successfully then it
* should call vm_exit_intinfo() to extinguish the pending event. For e.g.,
* if the task switch emulation is triggered via a task gate then it should
* call this function with 'intinfo=0' to indicate that the external event
* is not pending anymore.
*
* Return value is 0 on success and non-zero on failure.
*/
int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo);
/*
* This function is called before every VM-entry to retrieve a pending
* event that should be injected into the guest. This function combines
* nested events into a double or triple fault.
*
* Returns 0 if there are no events that need to be injected into the guest
* and non-zero otherwise.
*/
int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
enum vm_reg_name vm_segment_name(int seg_encoding);
struct vm_copyinfo {
uint64_t gpa;
size_t len;
void *hva;
void *cookie;
};
/*
* Set up 'copyinfo[]' to copy to/from guest linear address space starting
* at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
* a copyin or PROT_WRITE for a copyout.
*
* retval is_fault Interpretation
* 0 0 Success
* 0 1 An exception was injected into the guest
* EFAULT N/A Unrecoverable error
*
* The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
* the return value is 0. The 'copyinfo[]' resources should be freed by calling
* 'vm_copy_teardown()' after the copy is done.
*/
int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
int num_copyinfo, int *is_fault);
void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
int num_copyinfo);
void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
void *kaddr, size_t len);
void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
struct vm_copyinfo *copyinfo, size_t len);
int vcpu_trace_exceptions(struct vm *vm, int vcpuid);
#endif /* KERNEL */
#define VM_MAXCPU 16 /* maximum virtual cpus */
/*
* Identifiers for optional vmm capabilities
*/
enum vm_cap_type {
VM_CAP_HALT_EXIT,
VM_CAP_MTRAP_EXIT,
VM_CAP_PAUSE_EXIT,
VM_CAP_UNRESTRICTED_GUEST,
VM_CAP_ENABLE_INVPCID,
VM_CAP_MAX
};
enum vm_intr_trigger {
EDGE_TRIGGER,
LEVEL_TRIGGER
};
/*
* The 'access' field has the format specified in Table 21-2 of the Intel
* Architecture Manual vol 3b.
*
* XXX The contents of the 'access' field are architecturally defined except
* bit 16 - Segment Unusable.
*/
struct seg_desc {
uint64_t base;
uint32_t limit;
uint32_t access;
};
#define SEG_DESC_TYPE(access) ((access) & 0x001f)
#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3)
#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0)
#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0)
#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0)
#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0)
enum vm_cpu_mode {
CPU_MODE_REAL,
CPU_MODE_PROTECTED,
CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */
CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */
};
enum vm_paging_mode {
PAGING_MODE_FLAT,
PAGING_MODE_32,
PAGING_MODE_PAE,
PAGING_MODE_64,
};
struct vm_guest_paging {
uint64_t cr3;
int cpl;
enum vm_cpu_mode cpu_mode;
enum vm_paging_mode paging_mode;
};
/*
* The data structures 'vie' and 'vie_op' are meant to be opaque to the
* consumers of instruction decoding. The only reason why their contents
* need to be exposed is because they are part of the 'vm_exit' structure.
*/
struct vie_op {
uint8_t op_byte; /* actual opcode byte */
uint8_t op_type; /* type of operation (e.g. MOV) */
uint16_t op_flags;
};
#define VIE_INST_SIZE 15
struct vie {
uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
uint8_t num_valid; /* size of the instruction */
uint8_t num_processed;
uint8_t addrsize:4, opsize:4; /* address and operand sizes */
uint8_t rex_w:1, /* REX prefix */
rex_r:1,
rex_x:1,
rex_b:1,
rex_present:1,
repz_present:1, /* REP/REPE/REPZ prefix */
repnz_present:1, /* REPNE/REPNZ prefix */
opsize_override:1, /* Operand size override */
addrsize_override:1, /* Address size override */
segment_override:1; /* Segment override */
uint8_t mod:2, /* ModRM byte */
reg:4,
rm:4;
uint8_t ss:2, /* SIB byte */
index:4,
base:4;
uint8_t disp_bytes;
uint8_t imm_bytes;
uint8_t scale;
int base_register; /* VM_REG_GUEST_xyz */
int index_register; /* VM_REG_GUEST_xyz */
int segment_register; /* VM_REG_GUEST_xyz */
int64_t displacement; /* optional addr displacement */
int64_t immediate; /* optional immediate operand */
uint8_t decoded; /* set to 1 if successfully decoded */
struct vie_op op; /* opcode description */
};
enum vm_exitcode {
VM_EXITCODE_INOUT,
VM_EXITCODE_VMX,
VM_EXITCODE_BOGUS,
VM_EXITCODE_RDMSR,
VM_EXITCODE_WRMSR,
VM_EXITCODE_HLT,
VM_EXITCODE_MTRAP,
VM_EXITCODE_PAUSE,
VM_EXITCODE_PAGING,
VM_EXITCODE_INST_EMUL,
VM_EXITCODE_SPINUP_AP,
VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */
VM_EXITCODE_RENDEZVOUS,
VM_EXITCODE_IOAPIC_EOI,
VM_EXITCODE_SUSPENDED,
VM_EXITCODE_INOUT_STR,
VM_EXITCODE_TASK_SWITCH,
VM_EXITCODE_MONITOR,
VM_EXITCODE_MWAIT,
VM_EXITCODE_SVM,
VM_EXITCODE_REQIDLE,
+ VM_EXITCODE_DEBUG,
VM_EXITCODE_MAX
};
struct vm_inout {
uint16_t bytes:3; /* 1 or 2 or 4 */
uint16_t in:1;
uint16_t string:1;
uint16_t rep:1;
uint16_t port;
uint32_t eax; /* valid for out */
};
struct vm_inout_str {
struct vm_inout inout; /* must be the first element */
struct vm_guest_paging paging;
uint64_t rflags;
uint64_t cr0;
uint64_t index;
uint64_t count; /* rep=1 (%rcx), rep=0 (1) */
int addrsize;
enum vm_reg_name seg_name;
struct seg_desc seg_desc;
};
enum task_switch_reason {
TSR_CALL,
TSR_IRET,
TSR_JMP,
TSR_IDT_GATE, /* task gate in IDT */
};
struct vm_task_switch {
uint16_t tsssel; /* new TSS selector */
int ext; /* task switch due to external event */
uint32_t errcode;
int errcode_valid; /* push 'errcode' on the new stack */
enum task_switch_reason reason;
struct vm_guest_paging paging;
};
struct vm_exit {
enum vm_exitcode exitcode;
int inst_length; /* 0 means unknown */
uint64_t rip;
union {
struct vm_inout inout;
struct vm_inout_str inout_str;
struct {
uint64_t gpa;
int fault_type;
} paging;
struct {
uint64_t gpa;
uint64_t gla;
uint64_t cs_base;
int cs_d; /* CS.D */
struct vm_guest_paging paging;
struct vie vie;
} inst_emul;
/*
* VMX specific payload. Used when there is no "better"
* exitcode to represent the VM-exit.
*/
struct {
int status; /* vmx inst status */
/*
* 'exit_reason' and 'exit_qualification' are valid
* only if 'status' is zero.
*/
uint32_t exit_reason;
uint64_t exit_qualification;
/*
* 'inst_error' and 'inst_type' are valid
* only if 'status' is non-zero.
*/
int inst_type;
int inst_error;
} vmx;
/*
* SVM specific payload.
*/
struct {
uint64_t exitcode;
uint64_t exitinfo1;
uint64_t exitinfo2;
} svm;
struct {
uint32_t code; /* ecx value */
uint64_t wval;
} msr;
struct {
int vcpu;
uint64_t rip;
} spinup_ap;
struct {
uint64_t rflags;
uint64_t intr_status;
} hlt;
struct {
int vector;
} ioapic_eoi;
struct {
enum vm_suspend_how how;
} suspended;
struct vm_task_switch task_switch;
} u;
};
/* APIs to inject faults into the guest */
void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid,
int errcode);
static __inline void
vm_inject_ud(void *vm, int vcpuid)
{
vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
}
static __inline void
vm_inject_gp(void *vm, int vcpuid)
{
vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
}
static __inline void
vm_inject_ac(void *vm, int vcpuid, int errcode)
{
vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
}
static __inline void
vm_inject_ss(void *vm, int vcpuid, int errcode)
{
vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
}
void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
int vm_restart_instruction(void *vm, int vcpuid);
#endif /* _VMM_H_ */
Index: head/sys/amd64/include/vmm_dev.h
===================================================================
--- head/sys/amd64/include/vmm_dev.h (revision 332156)
+++ head/sys/amd64/include/vmm_dev.h (revision 332157)
@@ -1,403 +1,410 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _VMM_DEV_H_
#define _VMM_DEV_H_
#ifdef _KERNEL
void vmmdev_init(void);
int vmmdev_cleanup(void);
#endif
struct vm_memmap {
vm_paddr_t gpa;
int segid; /* memory segment */
vm_ooffset_t segoff; /* offset into memory segment */
size_t len; /* mmap length */
int prot; /* RWX */
int flags;
};
#define VM_MEMMAP_F_WIRED 0x01
#define VM_MEMMAP_F_IOMMU 0x02
#define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? (m)->name : NULL)
struct vm_memseg {
int segid;
size_t len;
char name[SPECNAMELEN + 1];
};
struct vm_register {
int cpuid;
int regnum; /* enum vm_reg_name */
uint64_t regval;
};
struct vm_seg_desc { /* data or code segment */
int cpuid;
int regnum; /* enum vm_reg_name */
struct seg_desc desc;
};
struct vm_register_set {
int cpuid;
unsigned int count;
const int *regnums; /* enum vm_reg_name */
uint64_t *regvals;
};
struct vm_run {
int cpuid;
struct vm_exit vm_exit;
};
struct vm_exception {
int cpuid;
int vector;
uint32_t error_code;
int error_code_valid;
int restart_instruction;
};
struct vm_lapic_msi {
uint64_t msg;
uint64_t addr;
};
struct vm_lapic_irq {
int cpuid;
int vector;
};
struct vm_ioapic_irq {
int irq;
};
struct vm_isa_irq {
int atpic_irq;
int ioapic_irq;
};
struct vm_isa_irq_trigger {
int atpic_irq;
enum vm_intr_trigger trigger;
};
struct vm_capability {
int cpuid;
enum vm_cap_type captype;
int capval;
int allcpus;
};
struct vm_pptdev {
int bus;
int slot;
int func;
};
struct vm_pptdev_mmio {
int bus;
int slot;
int func;
vm_paddr_t gpa;
vm_paddr_t hpa;
size_t len;
};
struct vm_pptdev_msi {
int vcpu;
int bus;
int slot;
int func;
int numvec; /* 0 means disabled */
uint64_t msg;
uint64_t addr;
};
struct vm_pptdev_msix {
int vcpu;
int bus;
int slot;
int func;
int idx;
uint64_t msg;
uint32_t vector_control;
uint64_t addr;
};
struct vm_nmi {
int cpuid;
};
#define MAX_VM_STATS 64
struct vm_stats {
int cpuid; /* in */
int num_entries; /* out */
struct timeval tv;
uint64_t statbuf[MAX_VM_STATS];
};
struct vm_stat_desc {
int index; /* in */
char desc[128]; /* out */
};
struct vm_x2apic {
int cpuid;
enum x2apic_state state;
};
struct vm_gpa_pte {
uint64_t gpa; /* in */
uint64_t pte[4]; /* out */
int ptenum;
};
struct vm_hpet_cap {
uint32_t capabilities; /* lower 32 bits of HPET capabilities */
};
struct vm_suspend {
enum vm_suspend_how how;
};
struct vm_gla2gpa {
int vcpuid; /* inputs */
int prot; /* PROT_READ or PROT_WRITE */
uint64_t gla;
struct vm_guest_paging paging;
int fault; /* outputs */
uint64_t gpa;
};
struct vm_activate_cpu {
int vcpuid;
};
struct vm_cpuset {
int which;
int cpusetsize;
cpuset_t *cpus;
};
#define VM_ACTIVE_CPUS 0
#define VM_SUSPENDED_CPUS 1
+#define VM_DEBUG_CPUS 2
struct vm_intinfo {
int vcpuid;
uint64_t info1;
uint64_t info2;
};
struct vm_rtc_time {
time_t secs;
};
struct vm_rtc_data {
int offset;
uint8_t value;
};
enum {
/* general routines */
IOCNUM_ABIVERS = 0,
IOCNUM_RUN = 1,
IOCNUM_SET_CAPABILITY = 2,
IOCNUM_GET_CAPABILITY = 3,
IOCNUM_SUSPEND = 4,
IOCNUM_REINIT = 5,
/* memory apis */
IOCNUM_MAP_MEMORY = 10, /* deprecated */
IOCNUM_GET_MEMORY_SEG = 11, /* deprecated */
IOCNUM_GET_GPA_PMAP = 12,
IOCNUM_GLA2GPA = 13,
IOCNUM_ALLOC_MEMSEG = 14,
IOCNUM_GET_MEMSEG = 15,
IOCNUM_MMAP_MEMSEG = 16,
IOCNUM_MMAP_GETNEXT = 17,
IOCNUM_GLA2GPA_NOFAULT = 18,
/* register/state accessors */
IOCNUM_SET_REGISTER = 20,
IOCNUM_GET_REGISTER = 21,
IOCNUM_SET_SEGMENT_DESCRIPTOR = 22,
IOCNUM_GET_SEGMENT_DESCRIPTOR = 23,
IOCNUM_SET_REGISTER_SET = 24,
IOCNUM_GET_REGISTER_SET = 25,
/* interrupt injection */
IOCNUM_GET_INTINFO = 28,
IOCNUM_SET_INTINFO = 29,
IOCNUM_INJECT_EXCEPTION = 30,
IOCNUM_LAPIC_IRQ = 31,
IOCNUM_INJECT_NMI = 32,
IOCNUM_IOAPIC_ASSERT_IRQ = 33,
IOCNUM_IOAPIC_DEASSERT_IRQ = 34,
IOCNUM_IOAPIC_PULSE_IRQ = 35,
IOCNUM_LAPIC_MSI = 36,
IOCNUM_LAPIC_LOCAL_IRQ = 37,
IOCNUM_IOAPIC_PINCOUNT = 38,
IOCNUM_RESTART_INSTRUCTION = 39,
/* PCI pass-thru */
IOCNUM_BIND_PPTDEV = 40,
IOCNUM_UNBIND_PPTDEV = 41,
IOCNUM_MAP_PPTDEV_MMIO = 42,
IOCNUM_PPTDEV_MSI = 43,
IOCNUM_PPTDEV_MSIX = 44,
/* statistics */
IOCNUM_VM_STATS = 50,
IOCNUM_VM_STAT_DESC = 51,
/* kernel device state */
IOCNUM_SET_X2APIC_STATE = 60,
IOCNUM_GET_X2APIC_STATE = 61,
IOCNUM_GET_HPET_CAPABILITIES = 62,
/* legacy interrupt injection */
IOCNUM_ISA_ASSERT_IRQ = 80,
IOCNUM_ISA_DEASSERT_IRQ = 81,
IOCNUM_ISA_PULSE_IRQ = 82,
IOCNUM_ISA_SET_IRQ_TRIGGER = 83,
/* vm_cpuset */
IOCNUM_ACTIVATE_CPU = 90,
IOCNUM_GET_CPUSET = 91,
+ IOCNUM_SUSPEND_CPU = 92,
+ IOCNUM_RESUME_CPU = 93,
/* RTC */
IOCNUM_RTC_READ = 100,
IOCNUM_RTC_WRITE = 101,
IOCNUM_RTC_SETTIME = 102,
IOCNUM_RTC_GETTIME = 103,
};
#define VM_RUN \
_IOWR('v', IOCNUM_RUN, struct vm_run)
#define VM_SUSPEND \
_IOW('v', IOCNUM_SUSPEND, struct vm_suspend)
#define VM_REINIT \
_IO('v', IOCNUM_REINIT)
#define VM_ALLOC_MEMSEG \
_IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg)
#define VM_GET_MEMSEG \
_IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg)
#define VM_MMAP_MEMSEG \
_IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap)
#define VM_MMAP_GETNEXT \
_IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap)
#define VM_SET_REGISTER \
_IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
#define VM_GET_REGISTER \
_IOWR('v', IOCNUM_GET_REGISTER, struct vm_register)
#define VM_SET_SEGMENT_DESCRIPTOR \
_IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
#define VM_GET_SEGMENT_DESCRIPTOR \
_IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
#define VM_SET_REGISTER_SET \
_IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set)
#define VM_GET_REGISTER_SET \
_IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set)
#define VM_INJECT_EXCEPTION \
_IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception)
#define VM_LAPIC_IRQ \
_IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
#define VM_LAPIC_LOCAL_IRQ \
_IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq)
#define VM_LAPIC_MSI \
_IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi)
#define VM_IOAPIC_ASSERT_IRQ \
_IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq)
#define VM_IOAPIC_DEASSERT_IRQ \
_IOW('v', IOCNUM_IOAPIC_DEASSERT_IRQ, struct vm_ioapic_irq)
#define VM_IOAPIC_PULSE_IRQ \
_IOW('v', IOCNUM_IOAPIC_PULSE_IRQ, struct vm_ioapic_irq)
#define VM_IOAPIC_PINCOUNT \
_IOR('v', IOCNUM_IOAPIC_PINCOUNT, int)
#define VM_ISA_ASSERT_IRQ \
_IOW('v', IOCNUM_ISA_ASSERT_IRQ, struct vm_isa_irq)
#define VM_ISA_DEASSERT_IRQ \
_IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq)
#define VM_ISA_PULSE_IRQ \
_IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq)
#define VM_ISA_SET_IRQ_TRIGGER \
_IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger)
#define VM_SET_CAPABILITY \
_IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability)
#define VM_GET_CAPABILITY \
_IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability)
#define VM_BIND_PPTDEV \
_IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev)
#define VM_UNBIND_PPTDEV \
_IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev)
#define VM_MAP_PPTDEV_MMIO \
_IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
#define VM_PPTDEV_MSI \
_IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
#define VM_PPTDEV_MSIX \
_IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix)
#define VM_INJECT_NMI \
_IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
#define VM_STATS \
_IOWR('v', IOCNUM_VM_STATS, struct vm_stats)
#define VM_STAT_DESC \
_IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc)
#define VM_SET_X2APIC_STATE \
_IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
#define VM_GET_X2APIC_STATE \
_IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
#define VM_GET_HPET_CAPABILITIES \
_IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap)
#define VM_GET_GPA_PMAP \
_IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte)
#define VM_GLA2GPA \
_IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa)
#define VM_GLA2GPA_NOFAULT \
_IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa)
#define VM_ACTIVATE_CPU \
_IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu)
#define VM_GET_CPUS \
_IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset)
+#define VM_SUSPEND_CPU \
+ _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu)
+#define VM_RESUME_CPU \
+ _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu)
#define VM_SET_INTINFO \
_IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo)
#define VM_GET_INTINFO \
_IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo)
#define VM_RTC_WRITE \
_IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data)
#define VM_RTC_READ \
_IOWR('v', IOCNUM_RTC_READ, struct vm_rtc_data)
#define VM_RTC_SETTIME \
_IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time)
#define VM_RTC_GETTIME \
_IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time)
#define VM_RESTART_INSTRUCTION \
_IOW('v', IOCNUM_RESTART_INSTRUCTION, int)
#endif
Index: head/sys/amd64/vmm/amd/svm.c
===================================================================
--- head/sys/amd64/vmm/amd/svm.c (revision 332156)
+++ head/sys/amd64/vmm/amd/svm.c (revision 332157)
@@ -1,2278 +1,2284 @@
/*-
* Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/cpufunc.h>
#include <machine/psl.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_ktr.h"
#include "vmm_ioport.h"
#include "vatpic.h"
#include "vlapic.h"
#include "vlapic_priv.h"
#include "x86.h"
#include "vmcb.h"
#include "svm.h"
#include "svm_softc.h"
#include "svm_msr.h"
#include "npt.h"
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL);
/*
* SVM CPUID function 0x8000_000A, edx bit decoding.
*/
#define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */
#define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */
#define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */
#define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */
#define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */
#define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */
#define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */
#define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */
#define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */
#define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */
#define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */
#define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \
VMCB_CACHE_IOPM | \
VMCB_CACHE_I | \
VMCB_CACHE_TPR | \
VMCB_CACHE_CR2 | \
VMCB_CACHE_CR | \
VMCB_CACHE_DR | \
VMCB_CACHE_DT | \
VMCB_CACHE_SEG | \
VMCB_CACHE_NP)
static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
0, NULL);
static MALLOC_DEFINE(M_SVM, "svm", "svm");
static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");
/* Per-CPU context area. */
extern struct pcpu __pcpu[];
static uint32_t svm_feature = ~0U; /* AMD SVM features. */
SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0,
"SVM features advertised by CPUID.8000000AH:EDX");
static int disable_npf_assist;
SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN,
&disable_npf_assist, 0, NULL);
/* Maximum ASIDs supported by the processor */
static uint32_t nasid;
SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0,
"Number of ASIDs supported by this processor");
/* Current ASID generation for each host cpu */
static struct asid asid[MAXCPU];
/*
* SVM host state saved area of size 4KB for each core.
*/
static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val);
static __inline int
flush_by_asid(void)
{
return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID);
}
static __inline int
decode_assist(void)
{
return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST);
}
static void
svm_disable(void *arg __unused)
{
uint64_t efer;
efer = rdmsr(MSR_EFER);
efer &= ~EFER_SVM;
wrmsr(MSR_EFER, efer);
}
/*
* Disable SVM on all CPUs.
*/
static int
svm_cleanup(void)
{
smp_rendezvous(NULL, svm_disable, NULL, NULL);
return (0);
}
/*
* Verify that all the features required by bhyve are available.
*/
static int
check_svm_features(void)
{
u_int regs[4];
/* CPUID Fn8000_000A is for SVM */
do_cpuid(0x8000000A, regs);
svm_feature &= regs[3];
/*
* The number of ASIDs can be configured to be less than what is
* supported by the hardware but not more.
*/
if (nasid == 0 || nasid > regs[1])
nasid = regs[1];
KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid));
/* bhyve requires the Nested Paging feature */
if (!(svm_feature & AMD_CPUID_SVM_NP)) {
printf("SVM: Nested Paging feature not available.\n");
return (ENXIO);
}
/* bhyve requires the NRIP Save feature */
if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) {
printf("SVM: NRIP Save feature not available.\n");
return (ENXIO);
}
return (0);
}
static void
svm_enable(void *arg __unused)
{
uint64_t efer;
efer = rdmsr(MSR_EFER);
efer |= EFER_SVM;
wrmsr(MSR_EFER, efer);
wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu]));
}
/*
* Return 1 if SVM is enabled on this processor and 0 otherwise.
*/
static int
svm_available(void)
{
uint64_t msr;
/* Section 15.4 Enabling SVM from APM2. */
if ((amd_feature2 & AMDID2_SVM) == 0) {
printf("SVM: not available.\n");
return (0);
}
msr = rdmsr(MSR_VM_CR);
if ((msr & VM_CR_SVMDIS) != 0) {
printf("SVM: disabled by BIOS.\n");
return (0);
}
return (1);
}
static int
svm_init(int ipinum)
{
int error, cpu;
if (!svm_available())
return (ENXIO);
error = check_svm_features();
if (error)
return (error);
vmcb_clean &= VMCB_CACHE_DEFAULT;
for (cpu = 0; cpu < MAXCPU; cpu++) {
/*
* Initialize the host ASIDs to their "highest" valid values.
*
* The next ASID allocation will rollover both 'gen' and 'num'
* and start off the sequence at {1,1}.
*/
asid[cpu].gen = ~0UL;
asid[cpu].num = nasid - 1;
}
svm_msr_init();
svm_npt_init(ipinum);
/* Enable SVM on all CPUs */
smp_rendezvous(NULL, svm_enable, NULL, NULL);
return (0);
}
static void
svm_restore(void)
{
svm_enable(NULL);
}
/* Pentium compatible MSRs */
#define MSR_PENTIUM_START 0
#define MSR_PENTIUM_END 0x1FFF
/* AMD 6th generation and Intel compatible MSRs */
#define MSR_AMD6TH_START 0xC0000000UL
#define MSR_AMD6TH_END 0xC0001FFFUL
/* AMD 7th and 8th generation compatible MSRs */
#define MSR_AMD7TH_START 0xC0010000UL
#define MSR_AMD7TH_END 0xC0011FFFUL
/*
* Get the index and bit position for a MSR in permission bitmap.
* Two bits are used for each MSR: lower bit for read and higher bit for write.
*/
static int
svm_msr_index(uint64_t msr, int *index, int *bit)
{
uint32_t base, off;
*index = -1;
*bit = (msr % 4) * 2;
base = 0;
if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) {
*index = msr / 4;
return (0);
}
base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1);
if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
off = (msr - MSR_AMD6TH_START);
*index = (off + base) / 4;
return (0);
}
base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
off = (msr - MSR_AMD7TH_START);
*index = (off + base) / 4;
return (0);
}
return (EINVAL);
}
/*
* Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
*/
static void
svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
{
int index, bit, error;
error = svm_msr_index(msr, &index, &bit);
KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr));
KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
("%s: invalid index %d for msr %#lx", __func__, index, msr));
KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
"msr %#lx", __func__, bit, msr));
if (read)
perm_bitmap[index] &= ~(1UL << bit);
if (write)
perm_bitmap[index] &= ~(2UL << bit);
}
static void
svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
{
svm_msr_perm(perm_bitmap, msr, true, true);
}
static void
svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
{
svm_msr_perm(perm_bitmap, msr, true, false);
}
static __inline int
svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask)
{
struct vmcb_ctrl *ctrl;
KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
ctrl = svm_get_vmcb_ctrl(sc, vcpu);
return (ctrl->intercept[idx] & bitmask ? 1 : 0);
}
static __inline void
svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask,
int enabled)
{
struct vmcb_ctrl *ctrl;
uint32_t oldval;
KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
ctrl = svm_get_vmcb_ctrl(sc, vcpu);
oldval = ctrl->intercept[idx];
if (enabled)
ctrl->intercept[idx] |= bitmask;
else
ctrl->intercept[idx] &= ~bitmask;
if (ctrl->intercept[idx] != oldval) {
svm_set_dirty(sc, vcpu, VMCB_CACHE_I);
VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified "
"from %#x to %#x", idx, oldval, ctrl->intercept[idx]);
}
}
static __inline void
svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
{
svm_set_intercept(sc, vcpu, off, bitmask, 0);
}
static __inline void
svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask)
{
svm_set_intercept(sc, vcpu, off, bitmask, 1);
}
static void
vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa,
uint64_t msrpm_base_pa, uint64_t np_pml4)
{
struct vmcb_ctrl *ctrl;
struct vmcb_state *state;
uint32_t mask;
int n;
ctrl = svm_get_vmcb_ctrl(sc, vcpu);
state = svm_get_vmcb_state(sc, vcpu);
ctrl->iopm_base_pa = iopm_base_pa;
ctrl->msrpm_base_pa = msrpm_base_pa;
/* Enable nested paging */
ctrl->np_enable = 1;
ctrl->n_cr3 = np_pml4;
/*
* Intercept accesses to the control registers that are not shadowed
* in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
*/
for (n = 0; n < 16; n++) {
mask = (BIT(n) << 16) | BIT(n);
if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
else
svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask);
}
/*
* Intercept everything when tracing guest exceptions otherwise
* just intercept machine check exception.
*/
if (vcpu_trace_exceptions(sc->vm, vcpu)) {
for (n = 0; n < 32; n++) {
/*
* Skip unimplemented vectors in the exception bitmap.
*/
if (n == 2 || n == 9) {
continue;
}
svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n));
}
} else {
svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
}
/* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */
svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
VMCB_INTCPT_FERR_FREEZE);
svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
/*
* From section "Canonicalization and Consistency Checks" in APMv2
* the VMRUN intercept bit must be set to pass the consistency check.
*/
svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
/*
* The ASID will be set to a non-zero value just before VMRUN.
*/
ctrl->asid = 0;
/*
* Section 15.21.1, Interrupt Masking in EFLAGS
* Section 15.21.2, Virtualizing APIC.TPR
*
* This must be set for %rflag and %cr8 isolation of guest and host.
*/
ctrl->v_intr_masking = 1;
/* Enable Last Branch Record aka LBR for debugging */
ctrl->lbr_virt_en = 1;
state->dbgctl = BIT(0);
/* EFER_SVM must always be set when the guest is executing */
state->efer = EFER_SVM;
/* Set up the PAT to power-on state */
state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) |
PAT_VALUE(1, PAT_WRITE_THROUGH) |
PAT_VALUE(2, PAT_UNCACHED) |
PAT_VALUE(3, PAT_UNCACHEABLE) |
PAT_VALUE(4, PAT_WRITE_BACK) |
PAT_VALUE(5, PAT_WRITE_THROUGH) |
PAT_VALUE(6, PAT_UNCACHED) |
PAT_VALUE(7, PAT_UNCACHEABLE);
/* Set up DR6/7 to power-on state */
state->dr6 = 0xffff0ff0;
state->dr7 = 0x400;
}
/*
* Initialize a virtual machine.
*/
static void *
svm_vminit(struct vm *vm, pmap_t pmap)
{
struct svm_softc *svm_sc;
struct svm_vcpu *vcpu;
vm_paddr_t msrpm_pa, iopm_pa, pml4_pa;
int i;
svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO);
if (((uintptr_t)svm_sc & PAGE_MASK) != 0)
panic("malloc of svm_softc not aligned on page boundary");
svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM,
M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
if (svm_sc->msr_bitmap == NULL)
panic("contigmalloc of SVM MSR bitmap failed");
svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM,
M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
if (svm_sc->iopm_bitmap == NULL)
panic("contigmalloc of SVM IO bitmap failed");
svm_sc->vm = vm;
svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);
/*
* Intercept read and write accesses to all MSRs.
*/
memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);
/*
* Access to the following MSRs is redirected to the VMCB when the
* guest is executing. Therefore it is safe to allow the guest to
* read/write these MSRs directly without hypervisor involvement.
*/
svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
/*
* Intercept writes to make sure that the EFER_SVM bit is not cleared.
*/
svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
/* Intercept access to all I/O ports. */
memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);
iopm_pa = vtophys(svm_sc->iopm_bitmap);
msrpm_pa = vtophys(svm_sc->msr_bitmap);
pml4_pa = svm_sc->nptp;
for (i = 0; i < VM_MAXCPU; i++) {
vcpu = svm_get_vcpu(svm_sc, i);
vcpu->nextrip = ~0;
vcpu->lastcpu = NOCPU;
vcpu->vmcb_pa = vtophys(&vcpu->vmcb);
vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa);
svm_msr_guest_init(svm_sc, i);
}
return (svm_sc);
}
/*
* Collateral for a generic SVM VM-exit.
*/
static void
vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
{
vme->exitcode = VM_EXITCODE_SVM;
vme->u.svm.exitcode = code;
vme->u.svm.exitinfo1 = info1;
vme->u.svm.exitinfo2 = info2;
}
static int
svm_cpl(struct vmcb_state *state)
{
/*
* From APMv2:
* "Retrieve the CPL from the CPL field in the VMCB, not
* from any segment DPL"
*/
return (state->cpl);
}
static enum vm_cpu_mode
svm_vcpu_mode(struct vmcb *vmcb)
{
struct vmcb_segment seg;
struct vmcb_state *state;
int error;
state = &vmcb->state;
if (state->efer & EFER_LMA) {
error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__,
error));
/*
* Section 4.8.1 for APM2, check if Code Segment has
* Long attribute set in descriptor.
*/
if (seg.attrib & VMCB_CS_ATTRIB_L)
return (CPU_MODE_64BIT);
else
return (CPU_MODE_COMPATIBILITY);
} else if (state->cr0 & CR0_PE) {
return (CPU_MODE_PROTECTED);
} else {
return (CPU_MODE_REAL);
}
}
static enum vm_paging_mode
svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
{
if ((cr0 & CR0_PG) == 0)
return (PAGING_MODE_FLAT);
if ((cr4 & CR4_PAE) == 0)
return (PAGING_MODE_32);
if (efer & EFER_LME)
return (PAGING_MODE_64);
else
return (PAGING_MODE_PAE);
}
/*
* ins/outs utility routines
*/
static uint64_t
svm_inout_str_index(struct svm_regctx *regs, int in)
{
uint64_t val;
val = in ? regs->sctx_rdi : regs->sctx_rsi;
return (val);
}
static uint64_t
svm_inout_str_count(struct svm_regctx *regs, int rep)
{
uint64_t val;
val = rep ? regs->sctx_rcx : 1;
return (val);
}
static void
svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1,
int in, struct vm_inout_str *vis)
{
int error, s;
if (in) {
vis->seg_name = VM_REG_GUEST_ES;
} else {
/* The segment field has standard encoding */
s = (info1 >> 10) & 0x7;
vis->seg_name = vm_segment_name(s);
}
error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc);
KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error));
}
static int
svm_inout_str_addrsize(uint64_t info1)
{
uint32_t size;
size = (info1 >> 7) & 0x7;
switch (size) {
case 1:
return (2); /* 16 bit */
case 2:
return (4); /* 32 bit */
case 4:
return (8); /* 64 bit */
default:
panic("%s: invalid size encoding %d", __func__, size);
}
}
static void
svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
{
struct vmcb_state *state;
state = &vmcb->state;
paging->cr3 = state->cr3;
paging->cpl = svm_cpl(state);
paging->cpu_mode = svm_vcpu_mode(vmcb);
paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
state->efer);
}
#define UNHANDLED 0
/*
* Handle guest I/O intercept.
*/
static int
svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
struct vmcb_ctrl *ctrl;
struct vmcb_state *state;
struct svm_regctx *regs;
struct vm_inout_str *vis;
uint64_t info1;
int inout_string;
state = svm_get_vmcb_state(svm_sc, vcpu);
ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
regs = svm_get_guest_regctx(svm_sc, vcpu);
info1 = ctrl->exitinfo1;
inout_string = info1 & BIT(2) ? 1 : 0;
/*
* The effective segment number in EXITINFO1[12:10] is populated
* only if the processor has the DecodeAssist capability.
*
* XXX this is not specified explicitly in APMv2 but can be verified
* empirically.
*/
if (inout_string && !decode_assist())
return (UNHANDLED);
vmexit->exitcode = VM_EXITCODE_INOUT;
vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0;
vmexit->u.inout.string = inout_string;
vmexit->u.inout.rep = (info1 & BIT(3)) ? 1 : 0;
vmexit->u.inout.bytes = (info1 >> 4) & 0x7;
vmexit->u.inout.port = (uint16_t)(info1 >> 16);
vmexit->u.inout.eax = (uint32_t)(state->rax);
if (inout_string) {
vmexit->exitcode = VM_EXITCODE_INOUT_STR;
vis = &vmexit->u.inout_str;
svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging);
vis->rflags = state->rflags;
vis->cr0 = state->cr0;
vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
vis->addrsize = svm_inout_str_addrsize(info1);
svm_inout_str_seginfo(svm_sc, vcpu, info1,
vmexit->u.inout.in, vis);
}
return (UNHANDLED);
}
static int
npf_fault_type(uint64_t exitinfo1)
{
if (exitinfo1 & VMCB_NPF_INFO1_W)
return (VM_PROT_WRITE);
else if (exitinfo1 & VMCB_NPF_INFO1_ID)
return (VM_PROT_EXECUTE);
else
return (VM_PROT_READ);
}
static bool
svm_npf_emul_fault(uint64_t exitinfo1)
{
if (exitinfo1 & VMCB_NPF_INFO1_ID) {
return (false);
}
if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
return (false);
}
if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
return (false);
}
return (true);
}
static void
svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
{
struct vm_guest_paging *paging;
struct vmcb_segment seg;
struct vmcb_ctrl *ctrl;
char *inst_bytes;
int error, inst_len;
ctrl = &vmcb->ctrl;
paging = &vmexit->u.inst_emul.paging;
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
vmexit->u.inst_emul.gpa = gpa;
vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
svm_paging_info(vmcb, paging);
error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));
switch(paging->cpu_mode) {
case CPU_MODE_REAL:
vmexit->u.inst_emul.cs_base = seg.base;
vmexit->u.inst_emul.cs_d = 0;
break;
case CPU_MODE_PROTECTED:
case CPU_MODE_COMPATIBILITY:
vmexit->u.inst_emul.cs_base = seg.base;
/*
* Section 4.8.1 of APM2, Default Operand Size or D bit.
*/
vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ?
1 : 0;
break;
default:
vmexit->u.inst_emul.cs_base = 0;
vmexit->u.inst_emul.cs_d = 0;
break;
}
/*
* Copy the instruction bytes into 'vie' if available.
*/
if (decode_assist() && !disable_npf_assist) {
inst_len = ctrl->inst_len;
inst_bytes = ctrl->inst_bytes;
} else {
inst_len = 0;
inst_bytes = NULL;
}
vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len);
}
#ifdef KTR
static const char *
intrtype_to_str(int intr_type)
{
switch (intr_type) {
case VMCB_EVENTINJ_TYPE_INTR:
return ("hwintr");
case VMCB_EVENTINJ_TYPE_NMI:
return ("nmi");
case VMCB_EVENTINJ_TYPE_INTn:
return ("swintr");
case VMCB_EVENTINJ_TYPE_EXCEPTION:
return ("exception");
default:
panic("%s: unknown intr_type %d", __func__, intr_type);
}
}
#endif
/*
* Inject an event to vcpu as described in section 15.20, "Event injection".
*/
static void
svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector,
uint32_t error, bool ec_valid)
{
struct vmcb_ctrl *ctrl;
ctrl = svm_get_vmcb_ctrl(sc, vcpu);
KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0,
("%s: event already pending %#lx", __func__, ctrl->eventinj));
KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d",
__func__, vector));
switch (intr_type) {
case VMCB_EVENTINJ_TYPE_INTR:
case VMCB_EVENTINJ_TYPE_NMI:
case VMCB_EVENTINJ_TYPE_INTn:
break;
case VMCB_EVENTINJ_TYPE_EXCEPTION:
if (vector >= 0 && vector <= 31 && vector != 2)
break;
/* FALLTHROUGH */
default:
panic("%s: invalid intr_type/vector: %d/%d", __func__,
intr_type, vector);
}
ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID;
if (ec_valid) {
ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
ctrl->eventinj |= (uint64_t)error << 32;
VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x",
intrtype_to_str(intr_type), vector, error);
} else {
VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d",
intrtype_to_str(intr_type), vector);
}
}
static void
svm_update_virqinfo(struct svm_softc *sc, int vcpu)
{
struct vm *vm;
struct vlapic *vlapic;
struct vmcb_ctrl *ctrl;
vm = sc->vm;
vlapic = vm_lapic(vm, vcpu);
ctrl = svm_get_vmcb_ctrl(sc, vcpu);
/* Update %cr8 in the emulated vlapic */
vlapic_set_cr8(vlapic, ctrl->v_tpr);
/* Virtual interrupt injection is not used. */
KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
"v_intr_vector %d", __func__, ctrl->v_intr_vector));
}
static void
svm_save_intinfo(struct svm_softc *svm_sc, int vcpu)
{
struct vmcb_ctrl *ctrl;
uint64_t intinfo;
ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
intinfo = ctrl->exitintinfo;
if (!VMCB_EXITINTINFO_VALID(intinfo))
return;
/*
* From APMv2, Section "Intercepts during IDT interrupt delivery"
*
* If a #VMEXIT happened during event delivery then record the event
* that was being delivered.
*/
VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
}
#ifdef INVARIANTS
static __inline int
vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
{
return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
VMCB_INTCPT_VINTR));
}
#endif
static __inline void
enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
{
struct vmcb_ctrl *ctrl;
ctrl = svm_get_vmcb_ctrl(sc, vcpu);
if (ctrl->v_irq && ctrl->v_intr_vector == 0) {
KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
KASSERT(vintr_intercept_enabled(sc, vcpu),
("%s: vintr intercept should be enabled", __func__));
return;
}
VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
ctrl->v_irq = 1;
ctrl->v_ign_tpr = 1;
ctrl->v_intr_vector = 0;
svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
}
static __inline void
disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
{
struct vmcb_ctrl *ctrl;
ctrl = svm_get_vmcb_ctrl(sc, vcpu);
if (!ctrl->v_irq && ctrl->v_intr_vector == 0) {
KASSERT(!vintr_intercept_enabled(sc, vcpu),
("%s: vintr intercept should be disabled", __func__));
return;
}
VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
ctrl->v_irq = 0;
ctrl->v_intr_vector = 0;
svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
}
static int
svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val)
{
struct vmcb_ctrl *ctrl;
int oldval, newval;
ctrl = svm_get_vmcb_ctrl(sc, vcpu);
oldval = ctrl->intr_shadow;
newval = val ? 1 : 0;
if (newval != oldval) {
ctrl->intr_shadow = newval;
VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval);
}
return (0);
}
static int
svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val)
{
struct vmcb_ctrl *ctrl;
ctrl = svm_get_vmcb_ctrl(sc, vcpu);
*val = ctrl->intr_shadow;
return (0);
}
/*
* Once an NMI is injected it blocks delivery of further NMIs until the handler
* executes an IRET. The IRET intercept is enabled when an NMI is injected to
* to track when the vcpu is done handling the NMI.
*/
static int
nmi_blocked(struct svm_softc *sc, int vcpu)
{
int blocked;
blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
VMCB_INTCPT_IRET);
return (blocked);
}
static void
enable_nmi_blocking(struct svm_softc *sc, int vcpu)
{
KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked"));
VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled");
svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
}
static void
clear_nmi_blocking(struct svm_softc *sc, int vcpu)
{
int error;
KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");
/*
* When the IRET intercept is cleared the vcpu will attempt to execute
* the "iret" when it runs next. However, it is possible to inject
* another NMI into the vcpu before the "iret" has actually executed.
*
* For e.g. if the "iret" encounters a #NPF when accessing the stack
* it will trap back into the hypervisor. If an NMI is pending for
* the vcpu it will be injected into the guest.
*
* XXX this needs to be fixed
*/
svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
/*
* Set 'intr_shadow' to prevent an NMI from being injected on the
* immediate VMRUN.
*/
error = svm_modify_intr_shadow(sc, vcpu, 1);
KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error));
}
#define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL
static int
svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu)
{
struct vm_exit *vme;
struct vmcb_state *state;
uint64_t changed, lma, oldval;
int error;
state = svm_get_vmcb_state(sc, vcpu);
oldval = state->efer;
VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval);
newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */
changed = oldval ^ newval;
if (newval & EFER_MBZ_BITS)
goto gpf;
/* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
if (changed & EFER_LME) {
if (state->cr0 & CR0_PG)
goto gpf;
}
/* EFER.LMA = EFER.LME & CR0.PG */
if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0)
lma = EFER_LMA;
else
lma = 0;
if ((newval & EFER_LMA) != lma)
goto gpf;
if (newval & EFER_NXE) {
if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE))
goto gpf;
}
/*
* XXX bhyve does not enforce segment limits in 64-bit mode. Until
* this is fixed flag guest attempt to set EFER_LMSLE as an error.
*/
if (newval & EFER_LMSLE) {
vme = vm_exitinfo(sc->vm, vcpu);
vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0);
*retu = true;
return (0);
}
if (newval & EFER_FFXSR) {
if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR))
goto gpf;
}
if (newval & EFER_TCE) {
if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE))
goto gpf;
}
error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval);
KASSERT(error == 0, ("%s: error %d updating efer", __func__, error));
return (0);
gpf:
vm_inject_gp(sc->vm, vcpu);
return (0);
}
static int
emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val,
bool *retu)
{
int error;
if (lapic_msr(num))
error = lapic_wrmsr(sc->vm, vcpu, num, val, retu);
else if (num == MSR_EFER)
error = svm_write_efer(sc, vcpu, val, retu);
else
error = svm_wrmsr(sc, vcpu, num, val, retu);
return (error);
}
static int
emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu)
{
struct vmcb_state *state;
struct svm_regctx *ctx;
uint64_t result;
int error;
if (lapic_msr(num))
error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu);
else
error = svm_rdmsr(sc, vcpu, num, &result, retu);
if (error == 0) {
state = svm_get_vmcb_state(sc, vcpu);
ctx = svm_get_guest_regctx(sc, vcpu);
state->rax = result & 0xffffffff;
ctx->sctx_rdx = result >> 32;
}
return (error);
}
#ifdef KTR
static const char *
exit_reason_to_str(uint64_t reason)
{
static char reasonbuf[32];
switch (reason) {
case VMCB_EXIT_INVALID:
return ("invalvmcb");
case VMCB_EXIT_SHUTDOWN:
return ("shutdown");
case VMCB_EXIT_NPF:
return ("nptfault");
case VMCB_EXIT_PAUSE:
return ("pause");
case VMCB_EXIT_HLT:
return ("hlt");
case VMCB_EXIT_CPUID:
return ("cpuid");
case VMCB_EXIT_IO:
return ("inout");
case VMCB_EXIT_MC:
return ("mchk");
case VMCB_EXIT_INTR:
return ("extintr");
case VMCB_EXIT_NMI:
return ("nmi");
case VMCB_EXIT_VINTR:
return ("vintr");
case VMCB_EXIT_MSR:
return ("msr");
case VMCB_EXIT_IRET:
return ("iret");
case VMCB_EXIT_MONITOR:
return ("monitor");
case VMCB_EXIT_MWAIT:
return ("mwait");
default:
snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason);
return (reasonbuf);
}
}
#endif /* KTR */
/*
* From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
* that are due to instruction intercepts as well as MSR and IOIO intercepts
* and exceptions caused by INT3, INTO and BOUND instructions.
*
* Return 1 if the nRIP is valid and 0 otherwise.
*/
static int
nrip_valid(uint64_t exitcode)
{
switch (exitcode) {
case 0x00 ... 0x0F: /* read of CR0 through CR15 */
case 0x10 ... 0x1F: /* write of CR0 through CR15 */
case 0x20 ... 0x2F: /* read of DR0 through DR15 */
case 0x30 ... 0x3F: /* write of DR0 through DR15 */
case 0x43: /* INT3 */
case 0x44: /* INTO */
case 0x45: /* BOUND */
case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */
return (1);
default:
return (0);
}
}
static int
svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit)
{
struct vmcb *vmcb;
struct vmcb_state *state;
struct vmcb_ctrl *ctrl;
struct svm_regctx *ctx;
uint64_t code, info1, info2, val;
uint32_t eax, ecx, edx;
int error, errcode_valid, handled, idtvec, reflect;
bool retu;
ctx = svm_get_guest_regctx(svm_sc, vcpu);
vmcb = svm_get_vmcb(svm_sc, vcpu);
state = &vmcb->state;
ctrl = &vmcb->ctrl;
handled = 0;
code = ctrl->exitcode;
info1 = ctrl->exitinfo1;
info2 = ctrl->exitinfo2;
vmexit->exitcode = VM_EXITCODE_BOGUS;
vmexit->rip = state->rip;
vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1);
/*
* #VMEXIT(INVALID) needs to be handled early because the VMCB is
* in an inconsistent state and can trigger assertions that would
* never happen otherwise.
*/
if (code == VMCB_EXIT_INVALID) {
vm_exit_svm(vmexit, code, info1, info2);
return (0);
}
KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
"injection valid bit is set %#lx", __func__, ctrl->eventinj));
KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)",
vmexit->inst_length, code, info1, info2));
svm_update_virqinfo(svm_sc, vcpu);
svm_save_intinfo(svm_sc, vcpu);
switch (code) {
case VMCB_EXIT_IRET:
/*
* Restart execution at "iret" but with the intercept cleared.
*/
vmexit->inst_length = 0;
clear_nmi_blocking(svm_sc, vcpu);
handled = 1;
break;
case VMCB_EXIT_VINTR: /* interrupt window exiting */
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1);
handled = 1;
break;
case VMCB_EXIT_INTR: /* external interrupt */
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1);
handled = 1;
break;
case VMCB_EXIT_NMI: /* external NMI */
handled = 1;
break;
case 0x40 ... 0x5F:
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1);
reflect = 1;
idtvec = code - 0x40;
switch (idtvec) {
case IDT_MC:
/*
* Call the machine check handler by hand. Also don't
* reflect the machine check back into the guest.
*/
reflect = 0;
VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler");
__asm __volatile("int $18");
break;
case IDT_PF:
error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2,
info2);
KASSERT(error == 0, ("%s: error %d updating cr2",
__func__, error));
/* fallthru */
case IDT_NP:
case IDT_SS:
case IDT_GP:
case IDT_AC:
case IDT_TS:
errcode_valid = 1;
break;
case IDT_DF:
errcode_valid = 1;
info1 = 0;
break;
case IDT_BP:
case IDT_OF:
case IDT_BR:
/*
* The 'nrip' field is populated for INT3, INTO and
* BOUND exceptions and this also implies that
* 'inst_length' is non-zero.
*
* Reset 'inst_length' to zero so the guest %rip at
* event injection is identical to what it was when
* the exception originally happened.
*/
VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d "
"to zero before injecting exception %d",
vmexit->inst_length, idtvec);
vmexit->inst_length = 0;
/* fallthru */
default:
errcode_valid = 0;
info1 = 0;
break;
}
KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) "
"when reflecting exception %d into guest",
vmexit->inst_length, idtvec));
if (reflect) {
/* Reflect the exception back into the guest */
VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception "
"%d/%#x into the guest", idtvec, (int)info1);
error = vm_inject_exception(svm_sc->vm, vcpu, idtvec,
errcode_valid, info1, 0);
KASSERT(error == 0, ("%s: vm_inject_exception error %d",
__func__, error));
}
handled = 1;
break;
case VMCB_EXIT_MSR: /* MSR access. */
eax = state->rax;
ecx = ctx->sctx_rcx;
edx = ctx->sctx_rdx;
retu = false;
if (info1) {
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1);
val = (uint64_t)edx << 32 | eax;
VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx",
ecx, val);
if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) {
vmexit->exitcode = VM_EXITCODE_WRMSR;
vmexit->u.msr.code = ecx;
vmexit->u.msr.wval = val;
} else if (!retu) {
handled = 1;
} else {
KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
("emulate_wrmsr retu with bogus exitcode"));
}
} else {
VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx);
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1);
if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) {
vmexit->exitcode = VM_EXITCODE_RDMSR;
vmexit->u.msr.code = ecx;
} else if (!retu) {
handled = 1;
} else {
KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
("emulate_rdmsr retu with bogus exitcode"));
}
}
break;
case VMCB_EXIT_IO:
handled = svm_handle_io(svm_sc, vcpu, vmexit);
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1);
break;
case VMCB_EXIT_CPUID:
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1);
handled = x86_emulate_cpuid(svm_sc->vm, vcpu,
(uint32_t *)&state->rax,
(uint32_t *)&ctx->sctx_rbx,
(uint32_t *)&ctx->sctx_rcx,
(uint32_t *)&ctx->sctx_rdx);
break;
case VMCB_EXIT_HLT:
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1);
vmexit->exitcode = VM_EXITCODE_HLT;
vmexit->u.hlt.rflags = state->rflags;
break;
case VMCB_EXIT_PAUSE:
vmexit->exitcode = VM_EXITCODE_PAUSE;
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1);
break;
case VMCB_EXIT_NPF:
/* EXITINFO2 contains the faulting guest physical address */
if (info1 & VMCB_NPF_INFO1_RSV) {
VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with "
"reserved bits set: info1(%#lx) info2(%#lx)",
info1, info2);
} else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) {
vmexit->exitcode = VM_EXITCODE_PAGING;
vmexit->u.paging.gpa = info2;
vmexit->u.paging.fault_type = npf_fault_type(info1);
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault "
"on gpa %#lx/%#lx at rip %#lx",
info2, info1, state->rip);
} else if (svm_npf_emul_fault(info1)) {
svm_handle_inst_emul(vmcb, info2, vmexit);
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1);
VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault "
"for gpa %#lx/%#lx at rip %#lx",
info2, info1, state->rip);
}
break;
case VMCB_EXIT_MONITOR:
vmexit->exitcode = VM_EXITCODE_MONITOR;
break;
case VMCB_EXIT_MWAIT:
vmexit->exitcode = VM_EXITCODE_MWAIT;
break;
default:
vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1);
break;
}
VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d",
handled ? "handled" : "unhandled", exit_reason_to_str(code),
vmexit->rip, vmexit->inst_length);
if (handled) {
vmexit->rip += vmexit->inst_length;
vmexit->inst_length = 0;
state->rip = vmexit->rip;
} else {
if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
/*
* If this VM exit was not claimed by anybody then
* treat it as a generic SVM exit.
*/
vm_exit_svm(vmexit, code, info1, info2);
} else {
/*
* The exitcode and collateral have been populated.
* The VM exit will be processed further in userland.
*/
}
}
return (handled);
}
static void
svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu)
{
uint64_t intinfo;
if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo))
return;
KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not "
"valid: %#lx", __func__, intinfo));
svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo),
VMCB_EXITINTINFO_VECTOR(intinfo),
VMCB_EXITINTINFO_EC(intinfo),
VMCB_EXITINTINFO_EC_VALID(intinfo));
vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1);
VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo);
}
/*
* Inject event to virtual cpu.
*/
static void
svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic)
{
struct vmcb_ctrl *ctrl;
struct vmcb_state *state;
struct svm_vcpu *vcpustate;
uint8_t v_tpr;
int vector, need_intr_window;
int extint_pending;
state = svm_get_vmcb_state(sc, vcpu);
ctrl = svm_get_vmcb_ctrl(sc, vcpu);
vcpustate = svm_get_vcpu(sc, vcpu);
need_intr_window = 0;
if (vcpustate->nextrip != state->rip) {
ctrl->intr_shadow = 0;
VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking "
"cleared due to rip change: %#lx/%#lx",
vcpustate->nextrip, state->rip);
}
/*
* Inject pending events or exceptions for this vcpu.
*
* An event might be pending because the previous #VMEXIT happened
* during event delivery (i.e. ctrl->exitintinfo).
*
* An event might also be pending because an exception was injected
* by the hypervisor (e.g. #PF during instruction emulation).
*/
svm_inj_intinfo(sc, vcpu);
/* NMI event has priority over interrupts. */
if (vm_nmi_pending(sc->vm, vcpu)) {
if (nmi_blocked(sc, vcpu)) {
/*
* Can't inject another NMI if the guest has not
* yet executed an "iret" after the last NMI.
*/
VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due "
"to NMI-blocking");
} else if (ctrl->intr_shadow) {
/*
* Can't inject an NMI if the vcpu is in an intr_shadow.
*/
VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to "
"interrupt shadow");
need_intr_window = 1;
goto done;
} else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
/*
* If there is already an exception/interrupt pending
* then defer the NMI until after that.
*/
VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to "
"eventinj %#lx", ctrl->eventinj);
/*
* Use self-IPI to trigger a VM-exit as soon as
* possible after the event injection is completed.
*
* This works only if the external interrupt exiting
* is at a lower priority than the event injection.
*
* Although not explicitly specified in APMv2 the
* relative priorities were verified empirically.
*/
ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? */
} else {
vm_nmi_clear(sc->vm, vcpu);
/* Inject NMI, vector number is not used */
svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI,
IDT_NMI, 0, false);
/* virtual NMI blocking is now in effect */
enable_nmi_blocking(sc, vcpu);
VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI");
}
}
extint_pending = vm_extint_pending(sc->vm, vcpu);
if (!extint_pending) {
if (!vlapic_pending_intr(vlapic, &vector))
goto done;
KASSERT(vector >= 16 && vector <= 255,
("invalid vector %d from local APIC", vector));
} else {
/* Ask the legacy pic for a vector to inject */
vatpic_pending_intr(sc->vm, &vector);
KASSERT(vector >= 0 && vector <= 255,
("invalid vector %d from INTR", vector));
}
/*
* If the guest has disabled interrupts or is in an interrupt shadow
* then we cannot inject the pending interrupt.
*/
if ((state->rflags & PSL_I) == 0) {
VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
"rflags %#lx", vector, state->rflags);
need_intr_window = 1;
goto done;
}
if (ctrl->intr_shadow) {
VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to "
"interrupt shadow", vector);
need_intr_window = 1;
goto done;
}
if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to "
"eventinj %#lx", vector, ctrl->eventinj);
need_intr_window = 1;
goto done;
}
svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false);
if (!extint_pending) {
vlapic_intr_accepted(vlapic, vector);
} else {
vm_extint_clear(sc->vm, vcpu);
vatpic_intr_accepted(sc->vm, vector);
}
/*
* Force a VM-exit as soon as the vcpu is ready to accept another
* interrupt. This is done because the PIC might have another vector
* that it wants to inject. Also, if the APIC has a pending interrupt
* that was preempted by the ExtInt then it allows us to inject the
* APIC vector as soon as possible.
*/
need_intr_window = 1;
done:
/*
* The guest can modify the TPR by writing to %CR8. In guest mode
* the processor reflects this write to V_TPR without hypervisor
* intervention.
*
* The guest can also modify the TPR by writing to it via the memory
* mapped APIC page. In this case, the write will be emulated by the
* hypervisor. For this reason V_TPR must be updated before every
* VMRUN.
*/
v_tpr = vlapic_get_cr8(vlapic);
KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr));
if (ctrl->v_tpr != v_tpr) {
VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x",
ctrl->v_tpr, v_tpr);
ctrl->v_tpr = v_tpr;
svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
}
if (need_intr_window) {
/*
* We use V_IRQ in conjunction with the VINTR intercept to
* trap into the hypervisor as soon as a virtual interrupt
* can be delivered.
*
* Since injected events are not subject to intercept checks
* we need to ensure that the V_IRQ is not actually going to
* be delivered on VM entry. The KASSERT below enforces this.
*/
KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
(state->rflags & PSL_I) == 0 || ctrl->intr_shadow,
("Bogus intr_window_exiting: eventinj (%#lx), "
"intr_shadow (%u), rflags (%#lx)",
ctrl->eventinj, ctrl->intr_shadow, state->rflags));
enable_intr_window_exiting(sc, vcpu);
} else {
disable_intr_window_exiting(sc, vcpu);
}
}
static __inline void
restore_host_tss(void)
{
struct system_segment_descriptor *tss_sd;
/*
* The TSS descriptor was in use prior to launching the guest so it
* has been marked busy.
*
* 'ltr' requires the descriptor to be marked available so change the
* type to "64-bit available TSS".
*/
tss_sd = PCPU_GET(tss);
tss_sd->sd_type = SDT_SYSTSS;
ltr(GSEL(GPROC0_SEL, SEL_KPL));
}
static void
check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu)
{
struct svm_vcpu *vcpustate;
struct vmcb_ctrl *ctrl;
long eptgen;
bool alloc_asid;
KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not "
"active on cpu %u", __func__, thiscpu));
vcpustate = svm_get_vcpu(sc, vcpuid);
ctrl = svm_get_vmcb_ctrl(sc, vcpuid);
/*
* The TLB entries associated with the vcpu's ASID are not valid
* if either of the following conditions is true:
*
* 1. The vcpu's ASID generation is different than the host cpu's
* ASID generation. This happens when the vcpu migrates to a new
* host cpu. It can also happen when the number of vcpus executing
* on a host cpu is greater than the number of ASIDs available.
*
* 2. The pmap generation number is different than the value cached in
* the 'vcpustate'. This happens when the host invalidates pages
* belonging to the guest.
*
* asidgen eptgen Action
* mismatch mismatch
* 0 0 (a)
* 0 1 (b1) or (b2)
* 1 0 (c)
* 1 1 (d)
*
* (a) There is no mismatch in eptgen or ASID generation and therefore
* no further action is needed.
*
* (b1) If the cpu supports FlushByAsid then the vcpu's ASID is
* retained and the TLB entries associated with this ASID
* are flushed by VMRUN.
*
* (b2) If the cpu does not support FlushByAsid then a new ASID is
* allocated.
*
* (c) A new ASID is allocated.
*
* (d) A new ASID is allocated.
*/
alloc_asid = false;
eptgen = pmap->pm_eptgen;
ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;
if (vcpustate->asid.gen != asid[thiscpu].gen) {
alloc_asid = true; /* (c) and (d) */
} else if (vcpustate->eptgen != eptgen) {
if (flush_by_asid())
ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */
else
alloc_asid = true; /* (b2) */
} else {
/*
* This is the common case (a).
*/
KASSERT(!alloc_asid, ("ASID allocation not necessary"));
KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl));
}
if (alloc_asid) {
if (++asid[thiscpu].num >= nasid) {
asid[thiscpu].num = 1;
if (++asid[thiscpu].gen == 0)
asid[thiscpu].gen = 1;
/*
* If this cpu does not support "flush-by-asid"
* then flush the entire TLB on a generation
* bump. Subsequent ASID allocation in this
* generation can be done without a TLB flush.
*/
if (!flush_by_asid())
ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
}
vcpustate->asid.gen = asid[thiscpu].gen;
vcpustate->asid.num = asid[thiscpu].num;
ctrl->asid = vcpustate->asid.num;
svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID);
/*
* If this cpu supports "flush-by-asid" then the TLB
* was not flushed after the generation bump. The TLB
* is flushed selectively after every new ASID allocation.
*/
if (flush_by_asid())
ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
}
vcpustate->eptgen = eptgen;
KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
KASSERT(ctrl->asid == vcpustate->asid.num,
("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num));
}
static __inline void
disable_gintr(void)
{
__asm __volatile("clgi");
}
static __inline void
enable_gintr(void)
{
__asm __volatile("stgi");
}
static __inline void
svm_dr_enter_guest(struct svm_regctx *gctx)
{
/* Save host control debug registers. */
gctx->host_dr7 = rdr7();
gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
/*
* Disable debugging in DR7 and DEBUGCTL to avoid triggering
* exceptions in the host based on the guest DRx values. The
* guest DR6, DR7, and DEBUGCTL are saved/restored in the
* VMCB.
*/
load_dr7(0);
wrmsr(MSR_DEBUGCTLMSR, 0);
/* Save host debug registers. */
gctx->host_dr0 = rdr0();
gctx->host_dr1 = rdr1();
gctx->host_dr2 = rdr2();
gctx->host_dr3 = rdr3();
gctx->host_dr6 = rdr6();
/* Restore guest debug registers. */
load_dr0(gctx->sctx_dr0);
load_dr1(gctx->sctx_dr1);
load_dr2(gctx->sctx_dr2);
load_dr3(gctx->sctx_dr3);
}
static __inline void
svm_dr_leave_guest(struct svm_regctx *gctx)
{
/* Save guest debug registers. */
gctx->sctx_dr0 = rdr0();
gctx->sctx_dr1 = rdr1();
gctx->sctx_dr2 = rdr2();
gctx->sctx_dr3 = rdr3();
/*
* Restore host debug registers. Restore DR7 and DEBUGCTL
* last.
*/
load_dr0(gctx->host_dr0);
load_dr1(gctx->host_dr1);
load_dr2(gctx->host_dr2);
load_dr3(gctx->host_dr3);
load_dr6(gctx->host_dr6);
wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
load_dr7(gctx->host_dr7);
}
/*
* Start vcpu with specified RIP.
*/
static int
svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
struct vm_eventinfo *evinfo)
{
struct svm_regctx *gctx;
struct svm_softc *svm_sc;
struct svm_vcpu *vcpustate;
struct vmcb_state *state;
struct vmcb_ctrl *ctrl;
struct vm_exit *vmexit;
struct vlapic *vlapic;
struct vm *vm;
uint64_t vmcb_pa;
int handled;
svm_sc = arg;
vm = svm_sc->vm;
vcpustate = svm_get_vcpu(svm_sc, vcpu);
state = svm_get_vmcb_state(svm_sc, vcpu);
ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
vmexit = vm_exitinfo(vm, vcpu);
vlapic = vm_lapic(vm, vcpu);
gctx = svm_get_guest_regctx(svm_sc, vcpu);
vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
if (vcpustate->lastcpu != curcpu) {
/*
* Force new ASID allocation by invalidating the generation.
*/
vcpustate->asid.gen = 0;
/*
* Invalidate the VMCB state cache by marking all fields dirty.
*/
svm_set_dirty(svm_sc, vcpu, 0xffffffff);
/*
* XXX
* Setting 'vcpustate->lastcpu' here is bit premature because
* we may return from this function without actually executing
* the VMRUN instruction. This could happen if a rendezvous
* or an AST is pending on the first time through the loop.
*
* This works for now but any new side-effects of vcpu
* migration should take this case into account.
*/
vcpustate->lastcpu = curcpu;
vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
}
svm_msr_guest_enter(svm_sc, vcpu);
/* Update Guest RIP */
state->rip = rip;
do {
/*
* Disable global interrupts to guarantee atomicity during
* loading of guest state. This includes not only the state
* loaded by the "vmrun" instruction but also software state
* maintained by the hypervisor: suspended and rendezvous
* state, NPT generation number, vlapic interrupts etc.
*/
disable_gintr();
if (vcpu_suspended(evinfo)) {
enable_gintr();
vm_exit_suspended(vm, vcpu, state->rip);
break;
}
if (vcpu_rendezvous_pending(evinfo)) {
enable_gintr();
vm_exit_rendezvous(vm, vcpu, state->rip);
break;
}
if (vcpu_reqidle(evinfo)) {
enable_gintr();
vm_exit_reqidle(vm, vcpu, state->rip);
break;
}
/* We are asked to give the cpu by scheduler. */
if (vcpu_should_yield(vm, vcpu)) {
enable_gintr();
vm_exit_astpending(vm, vcpu, state->rip);
break;
}
+ if (vcpu_debugged(vm, vcpu)) {
+ enable_gintr();
+ vm_exit_debug(vm, vcpu, state->rip);
+ break;
+ }
+
svm_inj_interrupts(svm_sc, vcpu, vlapic);
/* Activate the nested pmap on 'curcpu' */
CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active);
/*
* Check the pmap generation and the ASID generation to
* ensure that the vcpu does not use stale TLB mappings.
*/
check_asid(svm_sc, vcpu, pmap, curcpu);
ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty;
vcpustate->dirty = 0;
VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean);
/* Launch Virtual Machine. */
VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip);
svm_dr_enter_guest(gctx);
svm_launch(vmcb_pa, gctx, &__pcpu[curcpu]);
svm_dr_leave_guest(gctx);
CPU_CLR_ATOMIC(curcpu, &pmap->pm_active);
/*
* The host GDTR and IDTR is saved by VMRUN and restored
* automatically on #VMEXIT. However, the host TSS needs
* to be restored explicitly.
*/
restore_host_tss();
/* #VMEXIT disables interrupts so re-enable them here. */
enable_gintr();
/* Update 'nextrip' */
vcpustate->nextrip = state->rip;
/* Handle #VMEXIT and if required return to user space. */
handled = svm_vmexit(svm_sc, vcpu, vmexit);
} while (handled);
svm_msr_guest_exit(svm_sc, vcpu);
return (0);
}
static void
svm_vmcleanup(void *arg)
{
struct svm_softc *sc = arg;
contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM);
contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM);
free(sc, M_SVM);
}
static register_t *
swctx_regptr(struct svm_regctx *regctx, int reg)
{
switch (reg) {
case VM_REG_GUEST_RBX:
return (&regctx->sctx_rbx);
case VM_REG_GUEST_RCX:
return (&regctx->sctx_rcx);
case VM_REG_GUEST_RDX:
return (&regctx->sctx_rdx);
case VM_REG_GUEST_RDI:
return (&regctx->sctx_rdi);
case VM_REG_GUEST_RSI:
return (&regctx->sctx_rsi);
case VM_REG_GUEST_RBP:
return (&regctx->sctx_rbp);
case VM_REG_GUEST_R8:
return (&regctx->sctx_r8);
case VM_REG_GUEST_R9:
return (&regctx->sctx_r9);
case VM_REG_GUEST_R10:
return (&regctx->sctx_r10);
case VM_REG_GUEST_R11:
return (&regctx->sctx_r11);
case VM_REG_GUEST_R12:
return (&regctx->sctx_r12);
case VM_REG_GUEST_R13:
return (&regctx->sctx_r13);
case VM_REG_GUEST_R14:
return (&regctx->sctx_r14);
case VM_REG_GUEST_R15:
return (&regctx->sctx_r15);
case VM_REG_GUEST_DR0:
return (&regctx->sctx_dr0);
case VM_REG_GUEST_DR1:
return (&regctx->sctx_dr1);
case VM_REG_GUEST_DR2:
return (&regctx->sctx_dr2);
case VM_REG_GUEST_DR3:
return (&regctx->sctx_dr3);
default:
return (NULL);
}
}
static int
svm_getreg(void *arg, int vcpu, int ident, uint64_t *val)
{
struct svm_softc *svm_sc;
register_t *reg;
svm_sc = arg;
if (ident == VM_REG_GUEST_INTR_SHADOW) {
return (svm_get_intr_shadow(svm_sc, vcpu, val));
}
if (vmcb_read(svm_sc, vcpu, ident, val) == 0) {
return (0);
}
reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
if (reg != NULL) {
*val = *reg;
return (0);
}
VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident);
return (EINVAL);
}
static int
svm_setreg(void *arg, int vcpu, int ident, uint64_t val)
{
struct svm_softc *svm_sc;
register_t *reg;
svm_sc = arg;
if (ident == VM_REG_GUEST_INTR_SHADOW) {
return (svm_modify_intr_shadow(svm_sc, vcpu, val));
}
if (vmcb_write(svm_sc, vcpu, ident, val) == 0) {
return (0);
}
reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident);
if (reg != NULL) {
*reg = val;
return (0);
}
/*
* XXX deal with CR3 and invalidate TLB entries tagged with the
* vcpu's ASID. This needs to be treated differently depending on
* whether 'running' is true/false.
*/
VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident);
return (EINVAL);
}
static int
svm_setcap(void *arg, int vcpu, int type, int val)
{
struct svm_softc *sc;
int error;
sc = arg;
error = 0;
switch (type) {
case VM_CAP_HALT_EXIT:
svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
VMCB_INTCPT_HLT, val);
break;
case VM_CAP_PAUSE_EXIT:
svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
VMCB_INTCPT_PAUSE, val);
break;
case VM_CAP_UNRESTRICTED_GUEST:
/* Unrestricted guest execution cannot be disabled in SVM */
if (val == 0)
error = EINVAL;
break;
default:
error = ENOENT;
break;
}
return (error);
}
static int
svm_getcap(void *arg, int vcpu, int type, int *retval)
{
struct svm_softc *sc;
int error;
sc = arg;
error = 0;
switch (type) {
case VM_CAP_HALT_EXIT:
*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
VMCB_INTCPT_HLT);
break;
case VM_CAP_PAUSE_EXIT:
*retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
VMCB_INTCPT_PAUSE);
break;
case VM_CAP_UNRESTRICTED_GUEST:
*retval = 1; /* unrestricted guest is always enabled */
break;
default:
error = ENOENT;
break;
}
return (error);
}
static struct vlapic *
svm_vlapic_init(void *arg, int vcpuid)
{
struct svm_softc *svm_sc;
struct vlapic *vlapic;
svm_sc = arg;
vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO);
vlapic->vm = svm_sc->vm;
vlapic->vcpuid = vcpuid;
vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid];
vlapic_init(vlapic);
return (vlapic);
}
static void
svm_vlapic_cleanup(void *arg, struct vlapic *vlapic)
{
vlapic_cleanup(vlapic);
free(vlapic, M_SVM_VLAPIC);
}
struct vmm_ops vmm_ops_amd = {
svm_init,
svm_cleanup,
svm_restore,
svm_vminit,
svm_vmrun,
svm_vmcleanup,
svm_getreg,
svm_setreg,
vmcb_getdesc,
vmcb_setdesc,
svm_getcap,
svm_setcap,
svm_npt_alloc,
svm_npt_free,
svm_vlapic_init,
svm_vlapic_cleanup
};
Index: head/sys/amd64/vmm/intel/vmx.c
===================================================================
--- head/sys/amd64/vmm/intel/vmx.c (revision 332156)
+++ head/sys/amd64/vmm/intel/vmx.c (revision 332157)
@@ -1,3553 +1,3559 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/smp.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/psl.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/segments.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include "vmm_lapic.h"
#include "vmm_host.h"
#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_stat.h"
#include "vatpic.h"
#include "vlapic.h"
#include "vlapic_priv.h"
#include "ept.h"
#include "vmx_cpufunc.h"
#include "vmx.h"
#include "vmx_msr.h"
#include "x86.h"
#include "vmx_controls.h"
#define PINBASED_CTLS_ONE_SETTING \
(PINBASED_EXTINT_EXITING | \
PINBASED_NMI_EXITING | \
PINBASED_VIRTUAL_NMI)
#define PINBASED_CTLS_ZERO_SETTING 0
#define PROCBASED_CTLS_WINDOW_SETTING \
(PROCBASED_INT_WINDOW_EXITING | \
PROCBASED_NMI_WINDOW_EXITING)
#define PROCBASED_CTLS_ONE_SETTING \
(PROCBASED_SECONDARY_CONTROLS | \
PROCBASED_MWAIT_EXITING | \
PROCBASED_MONITOR_EXITING | \
PROCBASED_IO_EXITING | \
PROCBASED_MSR_BITMAPS | \
PROCBASED_CTLS_WINDOW_SETTING | \
PROCBASED_CR8_LOAD_EXITING | \
PROCBASED_CR8_STORE_EXITING)
#define PROCBASED_CTLS_ZERO_SETTING \
(PROCBASED_CR3_LOAD_EXITING | \
PROCBASED_CR3_STORE_EXITING | \
PROCBASED_IO_BITMAPS)
#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT
#define PROCBASED_CTLS2_ZERO_SETTING 0
#define VM_EXIT_CTLS_ONE_SETTING \
(VM_EXIT_SAVE_DEBUG_CONTROLS | \
VM_EXIT_HOST_LMA | \
VM_EXIT_SAVE_EFER | \
VM_EXIT_LOAD_EFER | \
VM_EXIT_ACKNOWLEDGE_INTERRUPT)
#define VM_EXIT_CTLS_ZERO_SETTING 0
#define VM_ENTRY_CTLS_ONE_SETTING \
(VM_ENTRY_LOAD_DEBUG_CONTROLS | \
VM_ENTRY_LOAD_EFER)
#define VM_ENTRY_CTLS_ZERO_SETTING \
(VM_ENTRY_INTO_SMM | \
VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
#define HANDLED 1
#define UNHANDLED 0
static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
int vmxon_enabled[MAXCPU];
static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
static uint32_t exit_ctls, entry_ctls;
static uint64_t cr0_ones_mask, cr0_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
&cr0_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
&cr0_zeros_mask, 0, NULL);
static uint64_t cr4_ones_mask, cr4_zeros_mask;
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
&cr4_ones_mask, 0, NULL);
SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
&cr4_zeros_mask, 0, NULL);
static int vmx_initialized;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
&vmx_initialized, 0, "Intel VMX initialized");
/*
* Optional capabilities
*/
static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
static int cap_halt_exit;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
"HLT triggers a VM-exit");
static int cap_pause_exit;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
0, "PAUSE triggers a VM-exit");
static int cap_unrestricted_guest;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
&cap_unrestricted_guest, 0, "Unrestricted guests");
static int cap_monitor_trap;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
&cap_monitor_trap, 0, "Monitor trap flag");
static int cap_invpcid;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
0, "Guests are allowed to use INVPCID");
static int virtual_interrupt_delivery;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
&virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
static int posted_interrupts;
SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
&posted_interrupts, 0, "APICv posted interrupt support");
static int pirvec = -1;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
&pirvec, 0, "APICv posted interrupt vector");
static struct unrhdr *vpid_unr;
static u_int vpid_alloc_failed;
SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
&vpid_alloc_failed, 0, NULL);
/*
* Use the last page below 4GB as the APIC access address. This address is
* occupied by the boot firmware so it is guaranteed that it will not conflict
* with a page in system memory.
*/
#define APIC_ACCESS_ADDRESS 0xFFFFF000
static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
static void vmx_inject_pir(struct vlapic *vlapic);
#ifdef KTR
static const char *
exit_reason_to_str(int reason)
{
static char reasonbuf[32];
switch (reason) {
case EXIT_REASON_EXCEPTION:
return "exception";
case EXIT_REASON_EXT_INTR:
return "extint";
case EXIT_REASON_TRIPLE_FAULT:
return "triplefault";
case EXIT_REASON_INIT:
return "init";
case EXIT_REASON_SIPI:
return "sipi";
case EXIT_REASON_IO_SMI:
return "iosmi";
case EXIT_REASON_SMI:
return "smi";
case EXIT_REASON_INTR_WINDOW:
return "intrwindow";
case EXIT_REASON_NMI_WINDOW:
return "nmiwindow";
case EXIT_REASON_TASK_SWITCH:
return "taskswitch";
case EXIT_REASON_CPUID:
return "cpuid";
case EXIT_REASON_GETSEC:
return "getsec";
case EXIT_REASON_HLT:
return "hlt";
case EXIT_REASON_INVD:
return "invd";
case EXIT_REASON_INVLPG:
return "invlpg";
case EXIT_REASON_RDPMC:
return "rdpmc";
case EXIT_REASON_RDTSC:
return "rdtsc";
case EXIT_REASON_RSM:
return "rsm";
case EXIT_REASON_VMCALL:
return "vmcall";
case EXIT_REASON_VMCLEAR:
return "vmclear";
case EXIT_REASON_VMLAUNCH:
return "vmlaunch";
case EXIT_REASON_VMPTRLD:
return "vmptrld";
case EXIT_REASON_VMPTRST:
return "vmptrst";
case EXIT_REASON_VMREAD:
return "vmread";
case EXIT_REASON_VMRESUME:
return "vmresume";
case EXIT_REASON_VMWRITE:
return "vmwrite";
case EXIT_REASON_VMXOFF:
return "vmxoff";
case EXIT_REASON_VMXON:
return "vmxon";
case EXIT_REASON_CR_ACCESS:
return "craccess";
case EXIT_REASON_DR_ACCESS:
return "draccess";
case EXIT_REASON_INOUT:
return "inout";
case EXIT_REASON_RDMSR:
return "rdmsr";
case EXIT_REASON_WRMSR:
return "wrmsr";
case EXIT_REASON_INVAL_VMCS:
return "invalvmcs";
case EXIT_REASON_INVAL_MSR:
return "invalmsr";
case EXIT_REASON_MWAIT:
return "mwait";
case EXIT_REASON_MTF:
return "mtf";
case EXIT_REASON_MONITOR:
return "monitor";
case EXIT_REASON_PAUSE:
return "pause";
case EXIT_REASON_MCE_DURING_ENTRY:
return "mce-during-entry";
case EXIT_REASON_TPR:
return "tpr";
case EXIT_REASON_APIC_ACCESS:
return "apic-access";
case EXIT_REASON_GDTR_IDTR:
return "gdtridtr";
case EXIT_REASON_LDTR_TR:
return "ldtrtr";
case EXIT_REASON_EPT_FAULT:
return "eptfault";
case EXIT_REASON_EPT_MISCONFIG:
return "eptmisconfig";
case EXIT_REASON_INVEPT:
return "invept";
case EXIT_REASON_RDTSCP:
return "rdtscp";
case EXIT_REASON_VMX_PREEMPT:
return "vmxpreempt";
case EXIT_REASON_INVVPID:
return "invvpid";
case EXIT_REASON_WBINVD:
return "wbinvd";
case EXIT_REASON_XSETBV:
return "xsetbv";
case EXIT_REASON_APIC_WRITE:
return "apic-write";
default:
snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
return (reasonbuf);
}
}
#endif /* KTR */
static int
vmx_allow_x2apic_msrs(struct vmx *vmx)
{
int i, error;
error = 0;
/*
* Allow readonly access to the following x2APIC MSRs from the guest.
*/
error += guest_msr_ro(vmx, MSR_APIC_ID);
error += guest_msr_ro(vmx, MSR_APIC_VERSION);
error += guest_msr_ro(vmx, MSR_APIC_LDR);
error += guest_msr_ro(vmx, MSR_APIC_SVR);
for (i = 0; i < 8; i++)
error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
for (i = 0; i < 8; i++)
error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
for (i = 0; i < 8; i++)
error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
error += guest_msr_ro(vmx, MSR_APIC_ESR);
error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
error += guest_msr_ro(vmx, MSR_APIC_ICR);
/*
* Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
*
* These registers get special treatment described in the section
* "Virtualizing MSR-Based APIC Accesses".
*/
error += guest_msr_rw(vmx, MSR_APIC_TPR);
error += guest_msr_rw(vmx, MSR_APIC_EOI);
error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
return (error);
}
u_long
vmx_fix_cr0(u_long cr0)
{
return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
}
u_long
vmx_fix_cr4(u_long cr4)
{
return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
}
static void
vpid_free(int vpid)
{
if (vpid < 0 || vpid > 0xffff)
panic("vpid_free: invalid vpid %d", vpid);
/*
* VPIDs [0,VM_MAXCPU] are special and are not allocated from
* the unit number allocator.
*/
if (vpid > VM_MAXCPU)
free_unr(vpid_unr, vpid);
}
static void
vpid_alloc(uint16_t *vpid, int num)
{
int i, x;
if (num <= 0 || num > VM_MAXCPU)
panic("invalid number of vpids requested: %d", num);
/*
* If the "enable vpid" execution control is not enabled then the
* VPID is required to be 0 for all vcpus.
*/
if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
for (i = 0; i < num; i++)
vpid[i] = 0;
return;
}
/*
* Allocate a unique VPID for each vcpu from the unit number allocator.
*/
for (i = 0; i < num; i++) {
x = alloc_unr(vpid_unr);
if (x == -1)
break;
else
vpid[i] = x;
}
if (i < num) {
atomic_add_int(&vpid_alloc_failed, 1);
/*
* If the unit number allocator does not have enough unique
* VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
*
* These VPIDs are not be unique across VMs but this does not
* affect correctness because the combined mappings are also
* tagged with the EP4TA which is unique for each VM.
*
* It is still sub-optimal because the invvpid will invalidate
* combined mappings for a particular VPID across all EP4TAs.
*/
while (i-- > 0)
vpid_free(vpid[i]);
for (i = 0; i < num; i++)
vpid[i] = i + 1;
}
}
static void
vpid_init(void)
{
/*
* VPID 0 is required when the "enable VPID" execution control is
* disabled.
*
* VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
* unit number allocator does not have sufficient unique VPIDs to
* satisfy the allocation.
*
* The remaining VPIDs are managed by the unit number allocator.
*/
vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
}
static void
vmx_disable(void *arg __unused)
{
struct invvpid_desc invvpid_desc = { 0 };
struct invept_desc invept_desc = { 0 };
if (vmxon_enabled[curcpu]) {
/*
* See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
*
* VMXON or VMXOFF are not required to invalidate any TLB
* caching structures. This prevents potential retention of
* cached information in the TLB between distinct VMX episodes.
*/
invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
vmxoff();
}
load_cr4(rcr4() & ~CR4_VMXE);
}
static int
vmx_cleanup(void)
{
if (pirvec >= 0)
lapic_ipi_free(pirvec);
if (vpid_unr != NULL) {
delete_unrhdr(vpid_unr);
vpid_unr = NULL;
}
smp_rendezvous(NULL, vmx_disable, NULL, NULL);
return (0);
}
static void
vmx_enable(void *arg __unused)
{
int error;
uint64_t feature_control;
feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
(feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
wrmsr(MSR_IA32_FEATURE_CONTROL,
feature_control | IA32_FEATURE_CONTROL_VMX_EN |
IA32_FEATURE_CONTROL_LOCK);
}
load_cr4(rcr4() | CR4_VMXE);
*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
error = vmxon(vmxon_region[curcpu]);
if (error == 0)
vmxon_enabled[curcpu] = 1;
}
static void
vmx_restore(void)
{
if (vmxon_enabled[curcpu])
vmxon(vmxon_region[curcpu]);
}
static int
vmx_init(int ipinum)
{
int error, use_tpr_shadow;
uint64_t basic, fixed0, fixed1, feature_control;
uint32_t tmp, procbased2_vid_bits;
/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
if (!(cpu_feature2 & CPUID2_VMX)) {
printf("vmx_init: processor does not support VMX operation\n");
return (ENXIO);
}
/*
* Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
* are set (bits 0 and 2 respectively).
*/
feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
(feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
printf("vmx_init: VMX operation disabled by BIOS\n");
return (ENXIO);
}
/*
* Verify capabilities MSR_VMX_BASIC:
* - bit 54 indicates support for INS/OUTS decoding
*/
basic = rdmsr(MSR_VMX_BASIC);
if ((basic & (1UL << 54)) == 0) {
printf("vmx_init: processor does not support desired basic "
"capabilities\n");
return (EINVAL);
}
/* Check support for primary processor-based VM-execution controls */
error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
MSR_VMX_TRUE_PROCBASED_CTLS,
PROCBASED_CTLS_ONE_SETTING,
PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
if (error) {
printf("vmx_init: processor does not support desired primary "
"processor-based controls\n");
return (error);
}
/* Clear the processor-based ctl bits that are set on demand */
procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
/* Check support for secondary processor-based VM-execution controls */
error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
MSR_VMX_PROCBASED_CTLS2,
PROCBASED_CTLS2_ONE_SETTING,
PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
if (error) {
printf("vmx_init: processor does not support desired secondary "
"processor-based controls\n");
return (error);
}
/* Check support for VPID */
error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
PROCBASED2_ENABLE_VPID, 0, &tmp);
if (error == 0)
procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
/* Check support for pin-based VM-execution controls */
error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
MSR_VMX_TRUE_PINBASED_CTLS,
PINBASED_CTLS_ONE_SETTING,
PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
if (error) {
printf("vmx_init: processor does not support desired "
"pin-based controls\n");
return (error);
}
/* Check support for VM-exit controls */
error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
VM_EXIT_CTLS_ONE_SETTING,
VM_EXIT_CTLS_ZERO_SETTING,
&exit_ctls);
if (error) {
printf("vmx_init: processor does not support desired "
"exit controls\n");
return (error);
}
/* Check support for VM-entry controls */
error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
&entry_ctls);
if (error) {
printf("vmx_init: processor does not support desired "
"entry controls\n");
return (error);
}
/*
* Check support for optional features by testing them
* as individual bits
*/
cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
MSR_VMX_TRUE_PROCBASED_CTLS,
PROCBASED_HLT_EXITING, 0,
&tmp) == 0);
cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
MSR_VMX_PROCBASED_CTLS,
PROCBASED_MTF, 0,
&tmp) == 0);
cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
MSR_VMX_TRUE_PROCBASED_CTLS,
PROCBASED_PAUSE_EXITING, 0,
&tmp) == 0);
cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
MSR_VMX_PROCBASED_CTLS2,
PROCBASED2_UNRESTRICTED_GUEST, 0,
&tmp) == 0);
cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
&tmp) == 0);
/*
* Check support for virtual interrupt delivery.
*/
procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
PROCBASED2_VIRTUALIZE_X2APIC_MODE |
PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
&tmp) == 0);
error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
procbased2_vid_bits, 0, &tmp);
if (error == 0 && use_tpr_shadow) {
virtual_interrupt_delivery = 1;
TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
&virtual_interrupt_delivery);
}
if (virtual_interrupt_delivery) {
procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
procbased_ctls2 |= procbased2_vid_bits;
procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
/*
* No need to emulate accesses to %CR8 if virtual
* interrupt delivery is enabled.
*/
procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
/*
* Check for Posted Interrupts only if Virtual Interrupt
* Delivery is enabled.
*/
error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
&tmp);
if (error == 0) {
pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
&IDTVEC(justreturn));
if (pirvec < 0) {
if (bootverbose) {
printf("vmx_init: unable to allocate "
"posted interrupt vector\n");
}
} else {
posted_interrupts = 1;
TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
&posted_interrupts);
}
}
}
if (posted_interrupts)
pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
/* Initialize EPT */
error = ept_init(ipinum);
if (error) {
printf("vmx_init: ept initialization failed (%d)\n", error);
return (error);
}
/*
* Stash the cr0 and cr4 bits that must be fixed to 0 or 1
*/
fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
cr0_ones_mask = fixed0 & fixed1;
cr0_zeros_mask = ~fixed0 & ~fixed1;
/*
* CR0_PE and CR0_PG can be set to zero in VMX non-root operation
* if unrestricted guest execution is allowed.
*/
if (cap_unrestricted_guest)
cr0_ones_mask &= ~(CR0_PG | CR0_PE);
/*
* Do not allow the guest to set CR0_NW or CR0_CD.
*/
cr0_zeros_mask |= (CR0_NW | CR0_CD);
fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
cr4_ones_mask = fixed0 & fixed1;
cr4_zeros_mask = ~fixed0 & ~fixed1;
vpid_init();
vmx_msr_init();
/* enable VMX operation */
smp_rendezvous(NULL, vmx_enable, NULL, NULL);
vmx_initialized = 1;
return (0);
}
static void
vmx_trigger_hostintr(int vector)
{
uintptr_t func;
struct gate_descriptor *gd;
gd = &idt[vector];
KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
"invalid vector %d", vector));
KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
vector));
KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
"has invalid type %d", vector, gd->gd_type));
KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
"has invalid dpl %d", vector, gd->gd_dpl));
KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
"for vector %d has invalid selector %d", vector, gd->gd_selector));
KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
"IST %d", vector, gd->gd_ist));
func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
vmx_call_isr(func);
}
static int
vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
{
int error, mask_ident, shadow_ident;
uint64_t mask_value;
if (which != 0 && which != 4)
panic("vmx_setup_cr_shadow: unknown cr%d", which);
if (which == 0) {
mask_ident = VMCS_CR0_MASK;
mask_value = cr0_ones_mask | cr0_zeros_mask;
shadow_ident = VMCS_CR0_SHADOW;
} else {
mask_ident = VMCS_CR4_MASK;
mask_value = cr4_ones_mask | cr4_zeros_mask;
shadow_ident = VMCS_CR4_SHADOW;
}
error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
if (error)
return (error);
error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
if (error)
return (error);
return (0);
}
#define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
static void *
vmx_vminit(struct vm *vm, pmap_t pmap)
{
uint16_t vpid[VM_MAXCPU];
int i, error;
struct vmx *vmx;
struct vmcs *vmcs;
uint32_t exc_bitmap;
vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
if ((uintptr_t)vmx & PAGE_MASK) {
panic("malloc of struct vmx not aligned on %d byte boundary",
PAGE_SIZE);
}
vmx->vm = vm;
vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
/*
* Clean up EPTP-tagged guest physical and combined mappings
*
* VMX transitions are not required to invalidate any guest physical
* mappings. So, it may be possible for stale guest physical mappings
* to be present in the processor TLBs.
*
* Combined mappings for this EP4TA are also invalidated for all VPIDs.
*/
ept_invalidate_mappings(vmx->eptp);
msr_bitmap_initialize(vmx->msr_bitmap);
/*
* It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
* The guest FSBASE and GSBASE are saved and restored during
* vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
* always restored from the vmcs host state area on vm-exit.
*
* The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
* how they are saved/restored so can be directly accessed by the
* guest.
*
* MSR_EFER is saved and restored in the guest VMCS area on a
* VM exit and entry respectively. It is also restored from the
* host VMCS area on a VM exit.
*
* The TSC MSR is exposed read-only. Writes are disallowed as
* that will impact the host TSC. If the guest does a write
* the "use TSC offsetting" execution control is enabled and the
* difference between the host TSC and the guest TSC is written
* into the TSC offset in the VMCS.
*/
if (guest_msr_rw(vmx, MSR_GSBASE) ||
guest_msr_rw(vmx, MSR_FSBASE) ||
guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
guest_msr_rw(vmx, MSR_EFER) ||
guest_msr_ro(vmx, MSR_TSC))
panic("vmx_vminit: error setting guest msr access");
vpid_alloc(vpid, VM_MAXCPU);
if (virtual_interrupt_delivery) {
error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
APIC_ACCESS_ADDRESS);
/* XXX this should really return an error to the caller */
KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
}
for (i = 0; i < VM_MAXCPU; i++) {
vmcs = &vmx->vmcs[i];
vmcs->identifier = vmx_revision();
error = vmclear(vmcs);
if (error != 0) {
panic("vmx_vminit: vmclear error %d on vcpu %d\n",
error, i);
}
vmx_msr_guest_init(vmx, i);
error = vmcs_init(vmcs);
KASSERT(error == 0, ("vmcs_init error %d", error));
VMPTRLD(vmcs);
error = 0;
error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
error += vmwrite(VMCS_EPTP, vmx->eptp);
error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
error += vmwrite(VMCS_VPID, vpid[i]);
/* exception bitmap */
if (vcpu_trace_exceptions(vm, i))
exc_bitmap = 0xffffffff;
else
exc_bitmap = 1 << IDT_MC;
error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap);
vmx->ctx[i].guest_dr6 = 0xffff0ff0;
error += vmwrite(VMCS_GUEST_DR7, 0x400);
if (virtual_interrupt_delivery) {
error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
error += vmwrite(VMCS_VIRTUAL_APIC,
vtophys(&vmx->apic_page[i]));
error += vmwrite(VMCS_EOI_EXIT0, 0);
error += vmwrite(VMCS_EOI_EXIT1, 0);
error += vmwrite(VMCS_EOI_EXIT2, 0);
error += vmwrite(VMCS_EOI_EXIT3, 0);
}
if (posted_interrupts) {
error += vmwrite(VMCS_PIR_VECTOR, pirvec);
error += vmwrite(VMCS_PIR_DESC,
vtophys(&vmx->pir_desc[i]));
}
VMCLEAR(vmcs);
KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
vmx->cap[i].set = 0;
vmx->cap[i].proc_ctls = procbased_ctls;
vmx->cap[i].proc_ctls2 = procbased_ctls2;
vmx->state[i].nextrip = ~0;
vmx->state[i].lastcpu = NOCPU;
vmx->state[i].vpid = vpid[i];
/*
* Set up the CR0/4 shadows, and init the read shadow
* to the power-on register value from the Intel Sys Arch.
* CR0 - 0x60000010
* CR4 - 0
*/
error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
if (error != 0)
panic("vmx_setup_cr0_shadow %d", error);
error = vmx_setup_cr4_shadow(vmcs, 0);
if (error != 0)
panic("vmx_setup_cr4_shadow %d", error);
vmx->ctx[i].pmap = pmap;
}
return (vmx);
}
static int
vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
{
int handled, func;
func = vmxctx->guest_rax;
handled = x86_emulate_cpuid(vm, vcpu,
(uint32_t*)(&vmxctx->guest_rax),
(uint32_t*)(&vmxctx->guest_rbx),
(uint32_t*)(&vmxctx->guest_rcx),
(uint32_t*)(&vmxctx->guest_rdx));
return (handled);
}
static __inline void
vmx_run_trace(struct vmx *vmx, int vcpu)
{
#ifdef KTR
VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
#endif
}
static __inline void
vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
int handled)
{
#ifdef KTR
VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
handled ? "handled" : "unhandled",
exit_reason_to_str(exit_reason), rip);
#endif
}
static __inline void
vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
{
#ifdef KTR
VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
#endif
}
static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
/*
* Invalidate guest mappings identified by its vpid from the TLB.
*/
static __inline void
vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
{
struct vmxstate *vmxstate;
struct invvpid_desc invvpid_desc;
vmxstate = &vmx->state[vcpu];
if (vmxstate->vpid == 0)
return;
if (!running) {
/*
* Set the 'lastcpu' to an invalid host cpu.
*
* This will invalidate TLB entries tagged with the vcpu's
* vpid the next time it runs via vmx_set_pcpu_defaults().
*/
vmxstate->lastcpu = NOCPU;
return;
}
KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
"critical section", __func__, vcpu));
/*
* Invalidate all mappings tagged with 'vpid'
*
* We do this because this vcpu was executing on a different host
* cpu when it last ran. We do not track whether it invalidated
* mappings associated with its 'vpid' during that run. So we must
* assume that the mappings associated with 'vpid' on 'curcpu' are
* stale and invalidate them.
*
* Note that we incur this penalty only when the scheduler chooses to
* move the thread associated with this vcpu between host cpus.
*
* Note also that this will invalidate mappings tagged with 'vpid'
* for "all" EP4TAs.
*/
if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
invvpid_desc._res1 = 0;
invvpid_desc._res2 = 0;
invvpid_desc.vpid = vmxstate->vpid;
invvpid_desc.linear_addr = 0;
invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
} else {
/*
* The invvpid can be skipped if an invept is going to
* be performed before entering the guest. The invept
* will invalidate combined mappings tagged with
* 'vmx->eptp' for all vpids.
*/
vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
}
}
static void
vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
{
struct vmxstate *vmxstate;
vmxstate = &vmx->state[vcpu];
if (vmxstate->lastcpu == curcpu)
return;
vmxstate->lastcpu = curcpu;
vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
vmx_invvpid(vmx, vcpu, pmap, 1);
}
/*
* We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
*/
CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
static void __inline
vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
{
if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
}
}
static void __inline
vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
{
KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
}
static void __inline
vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
}
}
static void __inline
vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
{
KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
}
int
vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
{
int error;
if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) {
vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET;
vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting");
}
error = vmwrite(VMCS_TSC_OFFSET, offset);
return (error);
}
#define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \
VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
#define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \
VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
static void
vmx_inject_nmi(struct vmx *vmx, int vcpu)
{
uint32_t gi, info;
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
"interruptibility-state %#x", gi));
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
"VM-entry interruption information %#x", info));
/*
* Inject the virtual NMI. The vector must be the NMI IDT entry
* or the VMCS entry check will fail.
*/
info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
vmcs_write(VMCS_ENTRY_INTR_INFO, info);
VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
/* Clear the request */
vm_nmi_clear(vmx->vm, vcpu);
}
static void
vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
uint64_t guestrip)
{
int vector, need_nmi_exiting, extint_pending;
uint64_t rflags, entryinfo;
uint32_t gi, info;
if (vmx->state[vcpu].nextrip != guestrip) {
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
if (gi & HWINTR_BLOCKING) {
VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
"cleared due to rip change: %#lx/%#lx",
vmx->state[vcpu].nextrip, guestrip);
gi &= ~HWINTR_BLOCKING;
vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
}
}
if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
"intinfo is not valid: %#lx", __func__, entryinfo));
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
"pending exception: %#lx/%#x", __func__, entryinfo, info));
info = entryinfo;
vector = info & 0xff;
if (vector == IDT_BP || vector == IDT_OF) {
/*
* VT-x requires #BP and #OF to be injected as software
* exceptions.
*/
info &= ~VMCS_INTR_T_MASK;
info |= VMCS_INTR_T_SWEXCEPTION;
}
if (info & VMCS_INTR_DEL_ERRCODE)
vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
vmcs_write(VMCS_ENTRY_INTR_INFO, info);
}
if (vm_nmi_pending(vmx->vm, vcpu)) {
/*
* If there are no conditions blocking NMI injection then
* inject it directly here otherwise enable "NMI window
* exiting" to inject it as soon as we can.
*
* We also check for STI_BLOCKING because some implementations
* don't allow NMI injection in this case. If we are running
* on a processor that doesn't have this restriction it will
* immediately exit and the NMI will be injected in the
* "NMI window exiting" handler.
*/
need_nmi_exiting = 1;
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
if ((info & VMCS_INTR_VALID) == 0) {
vmx_inject_nmi(vmx, vcpu);
need_nmi_exiting = 0;
} else {
VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
"due to VM-entry intr info %#x", info);
}
} else {
VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
"Guest Interruptibility-state %#x", gi);
}
if (need_nmi_exiting)
vmx_set_nmi_window_exiting(vmx, vcpu);
}
extint_pending = vm_extint_pending(vmx->vm, vcpu);
if (!extint_pending && virtual_interrupt_delivery) {
vmx_inject_pir(vlapic);
return;
}
/*
* If interrupt-window exiting is already in effect then don't bother
* checking for pending interrupts. This is just an optimization and
* not needed for correctness.
*/
if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
"pending int_window_exiting");
return;
}
if (!extint_pending) {
/* Ask the local apic for a vector to inject */
if (!vlapic_pending_intr(vlapic, &vector))
return;
/*
* From the Intel SDM, Volume 3, Section "Maskable
* Hardware Interrupts":
* - maskable interrupt vectors [16,255] can be delivered
* through the local APIC.
*/
KASSERT(vector >= 16 && vector <= 255,
("invalid vector %d from local APIC", vector));
} else {
/* Ask the legacy pic for a vector to inject */
vatpic_pending_intr(vmx->vm, &vector);
/*
* From the Intel SDM, Volume 3, Section "Maskable
* Hardware Interrupts":
* - maskable interrupt vectors [0,255] can be delivered
* through the INTR pin.
*/
KASSERT(vector >= 0 && vector <= 255,
("invalid vector %d from INTR", vector));
}
/* Check RFLAGS.IF and the interruptibility state of the guest */
rflags = vmcs_read(VMCS_GUEST_RFLAGS);
if ((rflags & PSL_I) == 0) {
VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
"rflags %#lx", vector, rflags);
goto cantinject;
}
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
if (gi & HWINTR_BLOCKING) {
VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
"Guest Interruptibility-state %#x", vector, gi);
goto cantinject;
}
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
if (info & VMCS_INTR_VALID) {
/*
* This is expected and could happen for multiple reasons:
* - A vectoring VM-entry was aborted due to astpending
* - A VM-exit happened during event injection.
* - An exception was injected above.
* - An NMI was injected above or after "NMI window exiting"
*/
VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
"VM-entry intr info %#x", vector, info);
goto cantinject;
}
/* Inject the interrupt */
info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
info |= vector;
vmcs_write(VMCS_ENTRY_INTR_INFO, info);
if (!extint_pending) {
/* Update the Local APIC ISR */
vlapic_intr_accepted(vlapic, vector);
} else {
vm_extint_clear(vmx->vm, vcpu);
vatpic_intr_accepted(vmx->vm, vector);
/*
* After we accepted the current ExtINT the PIC may
* have posted another one. If that is the case, set
* the Interrupt Window Exiting execution control so
* we can inject that one too.
*
* Also, interrupt window exiting allows us to inject any
* pending APIC vector that was preempted by the ExtINT
* as soon as possible. This applies both for the software
* emulated vlapic and the hardware assisted virtual APIC.
*/
vmx_set_int_window_exiting(vmx, vcpu);
}
VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
return;
cantinject:
/*
* Set the Interrupt Window Exiting execution control so we can inject
* the interrupt as soon as blocking condition goes away.
*/
vmx_set_int_window_exiting(vmx, vcpu);
}
/*
* If the Virtual NMIs execution control is '1' then the logical processor
* tracks virtual-NMI blocking in the Guest Interruptibility-state field of
* the VMCS. An IRET instruction in VMX non-root operation will remove any
* virtual-NMI blocking.
*
* This unblocking occurs even if the IRET causes a fault. In this case the
* hypervisor needs to restore virtual-NMI blocking before resuming the guest.
*/
static void
vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
{
uint32_t gi;
VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
}
static void
vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
{
uint32_t gi;
VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
}
static void
vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
{
uint32_t gi;
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
("NMI blocking is not in effect %#x", gi));
}
static int
vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
struct vmxctx *vmxctx;
uint64_t xcrval;
const struct xsave_limits *limits;
vmxctx = &vmx->ctx[vcpu];
limits = vmm_get_xsave_limits();
/*
* Note that the processor raises a GP# fault on its own if
* xsetbv is executed for CPL != 0, so we do not have to
* emulate that fault here.
*/
/* Only xcr0 is supported. */
if (vmxctx->guest_rcx != 0) {
vm_inject_gp(vmx->vm, vcpu);
return (HANDLED);
}
/* We only handle xcr0 if both the host and guest have XSAVE enabled. */
if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
vm_inject_ud(vmx->vm, vcpu);
return (HANDLED);
}
xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
if ((xcrval & ~limits->xcr0_allowed) != 0) {
vm_inject_gp(vmx->vm, vcpu);
return (HANDLED);
}
if (!(xcrval & XFEATURE_ENABLED_X87)) {
vm_inject_gp(vmx->vm, vcpu);
return (HANDLED);
}
/* AVX (YMM_Hi128) requires SSE. */
if (xcrval & XFEATURE_ENABLED_AVX &&
(xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
vm_inject_gp(vmx->vm, vcpu);
return (HANDLED);
}
/*
* AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
* ZMM_Hi256, and Hi16_ZMM.
*/
if (xcrval & XFEATURE_AVX512 &&
(xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
(XFEATURE_AVX512 | XFEATURE_AVX)) {
vm_inject_gp(vmx->vm, vcpu);
return (HANDLED);
}
/*
* Intel MPX requires both bound register state flags to be
* set.
*/
if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
vm_inject_gp(vmx->vm, vcpu);
return (HANDLED);
}
/*
* This runs "inside" vmrun() with the guest's FPU state, so
* modifying xcr0 directly modifies the guest's xcr0, not the
* host's.
*/
load_xcr(0, xcrval);
return (HANDLED);
}
static uint64_t
vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
{
const struct vmxctx *vmxctx;
vmxctx = &vmx->ctx[vcpu];
switch (ident) {
case 0:
return (vmxctx->guest_rax);
case 1:
return (vmxctx->guest_rcx);
case 2:
return (vmxctx->guest_rdx);
case 3:
return (vmxctx->guest_rbx);
case 4:
return (vmcs_read(VMCS_GUEST_RSP));
case 5:
return (vmxctx->guest_rbp);
case 6:
return (vmxctx->guest_rsi);
case 7:
return (vmxctx->guest_rdi);
case 8:
return (vmxctx->guest_r8);
case 9:
return (vmxctx->guest_r9);
case 10:
return (vmxctx->guest_r10);
case 11:
return (vmxctx->guest_r11);
case 12:
return (vmxctx->guest_r12);
case 13:
return (vmxctx->guest_r13);
case 14:
return (vmxctx->guest_r14);
case 15:
return (vmxctx->guest_r15);
default:
panic("invalid vmx register %d", ident);
}
}
static void
vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
{
struct vmxctx *vmxctx;
vmxctx = &vmx->ctx[vcpu];
switch (ident) {
case 0:
vmxctx->guest_rax = regval;
break;
case 1:
vmxctx->guest_rcx = regval;
break;
case 2:
vmxctx->guest_rdx = regval;
break;
case 3:
vmxctx->guest_rbx = regval;
break;
case 4:
vmcs_write(VMCS_GUEST_RSP, regval);
break;
case 5:
vmxctx->guest_rbp = regval;
break;
case 6:
vmxctx->guest_rsi = regval;
break;
case 7:
vmxctx->guest_rdi = regval;
break;
case 8:
vmxctx->guest_r8 = regval;
break;
case 9:
vmxctx->guest_r9 = regval;
break;
case 10:
vmxctx->guest_r10 = regval;
break;
case 11:
vmxctx->guest_r11 = regval;
break;
case 12:
vmxctx->guest_r12 = regval;
break;
case 13:
vmxctx->guest_r13 = regval;
break;
case 14:
vmxctx->guest_r14 = regval;
break;
case 15:
vmxctx->guest_r15 = regval;
break;
default:
panic("invalid vmx register %d", ident);
}
}
static int
vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
uint64_t crval, regval;
/* We only handle mov to %cr0 at this time */
if ((exitqual & 0xf0) != 0x00)
return (UNHANDLED);
regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
vmcs_write(VMCS_CR0_SHADOW, regval);
crval = regval | cr0_ones_mask;
crval &= ~cr0_zeros_mask;
vmcs_write(VMCS_GUEST_CR0, crval);
if (regval & CR0_PG) {
uint64_t efer, entry_ctls;
/*
* If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
* the "IA-32e mode guest" bit in VM-entry control must be
* equal.
*/
efer = vmcs_read(VMCS_GUEST_IA32_EFER);
if (efer & EFER_LME) {
efer |= EFER_LMA;
vmcs_write(VMCS_GUEST_IA32_EFER, efer);
entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
entry_ctls |= VM_ENTRY_GUEST_LMA;
vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
}
}
return (HANDLED);
}
static int
vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
uint64_t crval, regval;
/* We only handle mov to %cr4 at this time */
if ((exitqual & 0xf0) != 0x00)
return (UNHANDLED);
regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
vmcs_write(VMCS_CR4_SHADOW, regval);
crval = regval | cr4_ones_mask;
crval &= ~cr4_zeros_mask;
vmcs_write(VMCS_GUEST_CR4, crval);
return (HANDLED);
}
static int
vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
{
struct vlapic *vlapic;
uint64_t cr8;
int regnum;
/* We only handle mov %cr8 to/from a register at this time. */
if ((exitqual & 0xe0) != 0x00) {
return (UNHANDLED);
}
vlapic = vm_lapic(vmx->vm, vcpu);
regnum = (exitqual >> 8) & 0xf;
if (exitqual & 0x10) {
cr8 = vlapic_get_cr8(vlapic);
vmx_set_guest_reg(vmx, vcpu, regnum, cr8);
} else {
cr8 = vmx_get_guest_reg(vmx, vcpu, regnum);
vlapic_set_cr8(vlapic, cr8);
}
return (HANDLED);
}
/*
* From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
*/
static int
vmx_cpl(void)
{
uint32_t ssar;
ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
return ((ssar >> 5) & 0x3);
}
static enum vm_cpu_mode
vmx_cpu_mode(void)
{
uint32_t csar;
if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
if (csar & 0x2000)
return (CPU_MODE_64BIT); /* CS.L = 1 */
else
return (CPU_MODE_COMPATIBILITY);
} else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
return (CPU_MODE_PROTECTED);
} else {
return (CPU_MODE_REAL);
}
}
static enum vm_paging_mode
vmx_paging_mode(void)
{
if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
return (PAGING_MODE_FLAT);
if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
return (PAGING_MODE_32);
if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
return (PAGING_MODE_64);
else
return (PAGING_MODE_PAE);
}
static uint64_t
inout_str_index(struct vmx *vmx, int vcpuid, int in)
{
uint64_t val;
int error;
enum vm_reg_name reg;
reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
error = vmx_getreg(vmx, vcpuid, reg, &val);
KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
return (val);
}
static uint64_t
inout_str_count(struct vmx *vmx, int vcpuid, int rep)
{
uint64_t val;
int error;
if (rep) {
error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
} else {
val = 1;
}
return (val);
}
static int
inout_str_addrsize(uint32_t inst_info)
{
uint32_t size;
size = (inst_info >> 7) & 0x7;
switch (size) {
case 0:
return (2); /* 16 bit */
case 1:
return (4); /* 32 bit */
case 2:
return (8); /* 64 bit */
default:
panic("%s: invalid size encoding %d", __func__, size);
}
}
static void
inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
struct vm_inout_str *vis)
{
int error, s;
if (in) {
vis->seg_name = VM_REG_GUEST_ES;
} else {
s = (inst_info >> 15) & 0x7;
vis->seg_name = vm_segment_name(s);
}
error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
}
static void
vmx_paging_info(struct vm_guest_paging *paging)
{
paging->cr3 = vmcs_guest_cr3();
paging->cpl = vmx_cpl();
paging->cpu_mode = vmx_cpu_mode();
paging->paging_mode = vmx_paging_mode();
}
static void
vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
{
struct vm_guest_paging *paging;
uint32_t csar;
paging = &vmexit->u.inst_emul.paging;
vmexit->exitcode = VM_EXITCODE_INST_EMUL;
vmexit->inst_length = 0;
vmexit->u.inst_emul.gpa = gpa;
vmexit->u.inst_emul.gla = gla;
vmx_paging_info(paging);
switch (paging->cpu_mode) {
case CPU_MODE_REAL:
vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
vmexit->u.inst_emul.cs_d = 0;
break;
case CPU_MODE_PROTECTED:
case CPU_MODE_COMPATIBILITY:
vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
break;
default:
vmexit->u.inst_emul.cs_base = 0;
vmexit->u.inst_emul.cs_d = 0;
break;
}
vie_init(&vmexit->u.inst_emul.vie, NULL, 0);
}
static int
ept_fault_type(uint64_t ept_qual)
{
int fault_type;
if (ept_qual & EPT_VIOLATION_DATA_WRITE)
fault_type = VM_PROT_WRITE;
else if (ept_qual & EPT_VIOLATION_INST_FETCH)
fault_type = VM_PROT_EXECUTE;
else
fault_type= VM_PROT_READ;
return (fault_type);
}
static boolean_t
ept_emulation_fault(uint64_t ept_qual)
{
int read, write;
/* EPT fault on an instruction fetch doesn't make sense here */
if (ept_qual & EPT_VIOLATION_INST_FETCH)
return (FALSE);
/* EPT fault must be a read fault or a write fault */
read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
if ((read | write) == 0)
return (FALSE);
/*
* The EPT violation must have been caused by accessing a
* guest-physical address that is a translation of a guest-linear
* address.
*/
if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
(ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
return (FALSE);
}
return (TRUE);
}
static __inline int
apic_access_virtualization(struct vmx *vmx, int vcpuid)
{
uint32_t proc_ctls2;
proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0);
}
static __inline int
x2apic_virtualization(struct vmx *vmx, int vcpuid)
{
uint32_t proc_ctls2;
proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0);
}
static int
vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic,
uint64_t qual)
{
int error, handled, offset;
uint32_t *apic_regs, vector;
bool retu;
handled = HANDLED;
offset = APIC_WRITE_OFFSET(qual);
if (!apic_access_virtualization(vmx, vcpuid)) {
/*
* In general there should not be any APIC write VM-exits
* unless APIC-access virtualization is enabled.
*
* However self-IPI virtualization can legitimately trigger
* an APIC-write VM-exit so treat it specially.
*/
if (x2apic_virtualization(vmx, vcpuid) &&
offset == APIC_OFFSET_SELF_IPI) {
apic_regs = (uint32_t *)(vlapic->apic_page);
vector = apic_regs[APIC_OFFSET_SELF_IPI / 4];
vlapic_self_ipi_handler(vlapic, vector);
return (HANDLED);
} else
return (UNHANDLED);
}
switch (offset) {
case APIC_OFFSET_ID:
vlapic_id_write_handler(vlapic);
break;
case APIC_OFFSET_LDR:
vlapic_ldr_write_handler(vlapic);
break;
case APIC_OFFSET_DFR:
vlapic_dfr_write_handler(vlapic);
break;
case APIC_OFFSET_SVR:
vlapic_svr_write_handler(vlapic);
break;
case APIC_OFFSET_ESR:
vlapic_esr_write_handler(vlapic);
break;
case APIC_OFFSET_ICR_LOW:
retu = false;
error = vlapic_icrlo_write_handler(vlapic, &retu);
if (error != 0 || retu)
handled = UNHANDLED;
break;
case APIC_OFFSET_CMCI_LVT:
case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
vlapic_lvt_write_handler(vlapic, offset);
break;
case APIC_OFFSET_TIMER_ICR:
vlapic_icrtmr_write_handler(vlapic);
break;
case APIC_OFFSET_TIMER_DCR:
vlapic_dcr_write_handler(vlapic);
break;
default:
handled = UNHANDLED;
break;
}
return (handled);
}
static bool
apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
{
if (apic_access_virtualization(vmx, vcpuid) &&
(gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
return (true);
else
return (false);
}
static int
vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
{
uint64_t qual;
int access_type, offset, allowed;
if (!apic_access_virtualization(vmx, vcpuid))
return (UNHANDLED);
qual = vmexit->u.vmx.exit_qualification;
access_type = APIC_ACCESS_TYPE(qual);
offset = APIC_ACCESS_OFFSET(qual);
allowed = 0;
if (access_type == 0) {
/*
* Read data access to the following registers is expected.
*/
switch (offset) {
case APIC_OFFSET_APR:
case APIC_OFFSET_PPR:
case APIC_OFFSET_RRR:
case APIC_OFFSET_CMCI_LVT:
case APIC_OFFSET_TIMER_CCR:
allowed = 1;
break;
default:
break;
}
} else if (access_type == 1) {
/*
* Write data access to the following registers is expected.
*/
switch (offset) {
case APIC_OFFSET_VER:
case APIC_OFFSET_APR:
case APIC_OFFSET_PPR:
case APIC_OFFSET_RRR:
case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
case APIC_OFFSET_CMCI_LVT:
case APIC_OFFSET_TIMER_CCR:
allowed = 1;
break;
default:
break;
}
}
if (allowed) {
vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset,
VIE_INVALID_GLA);
}
/*
* Regardless of whether the APIC-access is allowed this handler
* always returns UNHANDLED:
* - if the access is allowed then it is handled by emulating the
* instruction that caused the VM-exit (outside the critical section)
* - if the access is not allowed then it will be converted to an
* exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
*/
return (UNHANDLED);
}
static enum task_switch_reason
vmx_task_switch_reason(uint64_t qual)
{
int reason;
reason = (qual >> 30) & 0x3;
switch (reason) {
case 0:
return (TSR_CALL);
case 1:
return (TSR_IRET);
case 2:
return (TSR_JMP);
case 3:
return (TSR_IDT_GATE);
default:
panic("%s: invalid reason %d", __func__, reason);
}
}
static int
emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
{
int error;
if (lapic_msr(num))
error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu);
else
error = vmx_wrmsr(vmx, vcpuid, num, val, retu);
return (error);
}
static int
emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu)
{
struct vmxctx *vmxctx;
uint64_t result;
uint32_t eax, edx;
int error;
if (lapic_msr(num))
error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu);
else
error = vmx_rdmsr(vmx, vcpuid, num, &result, retu);
if (error == 0) {
eax = result;
vmxctx = &vmx->ctx[vcpuid];
error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax);
KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error));
edx = result >> 32;
error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx);
KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error));
}
return (error);
}
static int
vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
{
int error, errcode, errcode_valid, handled, in;
struct vmxctx *vmxctx;
struct vlapic *vlapic;
struct vm_inout_str *vis;
struct vm_task_switch *ts;
uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
uint32_t intr_type, intr_vec, reason;
uint64_t exitintinfo, qual, gpa;
bool retu;
CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
handled = UNHANDLED;
vmxctx = &vmx->ctx[vcpu];
qual = vmexit->u.vmx.exit_qualification;
reason = vmexit->u.vmx.exit_reason;
vmexit->exitcode = VM_EXITCODE_BOGUS;
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
/*
* VM-entry failures during or after loading guest state.
*
* These VM-exits are uncommon but must be handled specially
* as most VM-exit fields are not populated as usual.
*/
if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) {
VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry");
__asm __volatile("int $18");
return (1);
}
/*
* VM exits that can be triggered during event delivery need to
* be handled specially by re-injecting the event if the IDT
* vectoring information field's valid bit is set.
*
* See "Information for VM Exits During Event Delivery" in Intel SDM
* for details.
*/
idtvec_info = vmcs_idt_vectoring_info();
if (idtvec_info & VMCS_IDT_VEC_VALID) {
idtvec_info &= ~(1 << 12); /* clear undefined bit */
exitintinfo = idtvec_info;
if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
idtvec_err = vmcs_idt_vectoring_err();
exitintinfo |= (uint64_t)idtvec_err << 32;
}
error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
__func__, error));
/*
* If 'virtual NMIs' are being used and the VM-exit
* happened while injecting an NMI during the previous
* VM-entry, then clear "blocking by NMI" in the
* Guest Interruptibility-State so the NMI can be
* reinjected on the subsequent VM-entry.
*
* However, if the NMI was being delivered through a task
* gate, then the new task must start execution with NMIs
* blocked so don't clear NMI blocking in this case.
*/
intr_type = idtvec_info & VMCS_INTR_T_MASK;
if (intr_type == VMCS_INTR_T_NMI) {
if (reason != EXIT_REASON_TASK_SWITCH)
vmx_clear_nmi_blocking(vmx, vcpu);
else
vmx_assert_nmi_blocking(vmx, vcpu);
}
/*
* Update VM-entry instruction length if the event being
* delivered was a software interrupt or software exception.
*/
if (intr_type == VMCS_INTR_T_SWINTR ||
intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
intr_type == VMCS_INTR_T_SWEXCEPTION) {
vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
}
}
switch (reason) {
case EXIT_REASON_TASK_SWITCH:
ts = &vmexit->u.task_switch;
ts->tsssel = qual & 0xffff;
ts->reason = vmx_task_switch_reason(qual);
ts->ext = 0;
ts->errcode_valid = 0;
vmx_paging_info(&ts->paging);
/*
* If the task switch was due to a CALL, JMP, IRET, software
* interrupt (INT n) or software exception (INT3, INTO),
* then the saved %rip references the instruction that caused
* the task switch. The instruction length field in the VMCS
* is valid in this case.
*
* In all other cases (e.g., NMI, hardware exception) the
* saved %rip is one that would have been saved in the old TSS
* had the task switch completed normally so the instruction
* length field is not needed in this case and is explicitly
* set to 0.
*/
if (ts->reason == TSR_IDT_GATE) {
KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
("invalid idtvec_info %#x for IDT task switch",
idtvec_info));
intr_type = idtvec_info & VMCS_INTR_T_MASK;
if (intr_type != VMCS_INTR_T_SWINTR &&
intr_type != VMCS_INTR_T_SWEXCEPTION &&
intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
/* Task switch triggered by external event */
ts->ext = 1;
vmexit->inst_length = 0;
if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
ts->errcode_valid = 1;
ts->errcode = vmcs_idt_vectoring_err();
}
}
}
vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
"%s errcode 0x%016lx", ts->reason, ts->tsssel,
ts->ext ? "external" : "internal",
((uint64_t)ts->errcode << 32) | ts->errcode_valid);
break;
case EXIT_REASON_CR_ACCESS:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
switch (qual & 0xf) {
case 0:
handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
break;
case 4:
handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
break;
case 8:
handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
break;
}
break;
case EXIT_REASON_RDMSR:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
retu = false;
ecx = vmxctx->guest_rcx;
VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
error = emulate_rdmsr(vmx, vcpu, ecx, &retu);
if (error) {
vmexit->exitcode = VM_EXITCODE_RDMSR;
vmexit->u.msr.code = ecx;
} else if (!retu) {
handled = HANDLED;
} else {
/* Return to userspace with a valid exitcode */
KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
("emulate_rdmsr retu with bogus exitcode"));
}
break;
case EXIT_REASON_WRMSR:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
retu = false;
eax = vmxctx->guest_rax;
ecx = vmxctx->guest_rcx;
edx = vmxctx->guest_rdx;
VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
ecx, (uint64_t)edx << 32 | eax);
error = emulate_wrmsr(vmx, vcpu, ecx,
(uint64_t)edx << 32 | eax, &retu);
if (error) {
vmexit->exitcode = VM_EXITCODE_WRMSR;
vmexit->u.msr.code = ecx;
vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
} else if (!retu) {
handled = HANDLED;
} else {
/* Return to userspace with a valid exitcode */
KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
("emulate_wrmsr retu with bogus exitcode"));
}
break;
case EXIT_REASON_HLT:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
vmexit->exitcode = VM_EXITCODE_HLT;
vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
if (virtual_interrupt_delivery)
vmexit->u.hlt.intr_status =
vmcs_read(VMCS_GUEST_INTR_STATUS);
else
vmexit->u.hlt.intr_status = 0;
break;
case EXIT_REASON_MTF:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
vmexit->exitcode = VM_EXITCODE_MTRAP;
vmexit->inst_length = 0;
break;
case EXIT_REASON_PAUSE:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
vmexit->exitcode = VM_EXITCODE_PAUSE;
break;
case EXIT_REASON_INTR_WINDOW:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
vmx_clear_int_window_exiting(vmx, vcpu);
return (1);
case EXIT_REASON_EXT_INTR:
/*
* External interrupts serve only to cause VM exits and allow
* the host interrupt handler to run.
*
* If this external interrupt triggers a virtual interrupt
* to a VM, then that state will be recorded by the
* host interrupt handler in the VM's softc. We will inject
* this virtual interrupt during the subsequent VM enter.
*/
intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
/*
* XXX: Ignore this exit if VMCS_INTR_VALID is not set.
* This appears to be a bug in VMware Fusion?
*/
if (!(intr_info & VMCS_INTR_VALID))
return (1);
KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
(intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
("VM exit interruption info invalid: %#x", intr_info));
vmx_trigger_hostintr(intr_info & 0xff);
/*
* This is special. We want to treat this as an 'handled'
* VM-exit but not increment the instruction pointer.
*/
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
return (1);
case EXIT_REASON_NMI_WINDOW:
/* Exit to allow the pending virtual NMI to be injected */
if (vm_nmi_pending(vmx->vm, vcpu))
vmx_inject_nmi(vmx, vcpu);
vmx_clear_nmi_window_exiting(vmx, vcpu);
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
return (1);
case EXIT_REASON_INOUT:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
vmexit->exitcode = VM_EXITCODE_INOUT;
vmexit->u.inout.bytes = (qual & 0x7) + 1;
vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
vmexit->u.inout.port = (uint16_t)(qual >> 16);
vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
if (vmexit->u.inout.string) {
inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
vmexit->exitcode = VM_EXITCODE_INOUT_STR;
vis = &vmexit->u.inout_str;
vmx_paging_info(&vis->paging);
vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
vis->index = inout_str_index(vmx, vcpu, in);
vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
vis->addrsize = inout_str_addrsize(inst_info);
inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
}
break;
case EXIT_REASON_CPUID:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
break;
case EXIT_REASON_EXCEPTION:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
KASSERT((intr_info & VMCS_INTR_VALID) != 0,
("VM exit interruption info invalid: %#x", intr_info));
intr_vec = intr_info & 0xff;
intr_type = intr_info & VMCS_INTR_T_MASK;
/*
* If Virtual NMIs control is 1 and the VM-exit is due to a
* fault encountered during the execution of IRET then we must
* restore the state of "virtual-NMI blocking" before resuming
* the guest.
*
* See "Resuming Guest Software after Handling an Exception".
* See "Information for VM Exits Due to Vectored Events".
*/
if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
(intr_vec != IDT_DF) &&
(intr_info & EXIT_QUAL_NMIUDTI) != 0)
vmx_restore_nmi_blocking(vmx, vcpu);
/*
* The NMI has already been handled in vmx_exit_handle_nmi().
*/
if (intr_type == VMCS_INTR_T_NMI)
return (1);
/*
* Call the machine check handler by hand. Also don't reflect
* the machine check back into the guest.
*/
if (intr_vec == IDT_MC) {
VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler");
__asm __volatile("int $18");
return (1);
}
if (intr_vec == IDT_PF) {
error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual);
KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d",
__func__, error));
}
/*
* Software exceptions exhibit trap-like behavior. This in
* turn requires populating the VM-entry instruction length
* so that the %rip in the trap frame is past the INT3/INTO
* instruction.
*/
if (intr_type == VMCS_INTR_T_SWEXCEPTION)
vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
/* Reflect all other exceptions back into the guest */
errcode_valid = errcode = 0;
if (intr_info & VMCS_INTR_DEL_ERRCODE) {
errcode_valid = 1;
errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
}
VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into "
"the guest", intr_vec, errcode);
error = vm_inject_exception(vmx->vm, vcpu, intr_vec,
errcode_valid, errcode, 0);
KASSERT(error == 0, ("%s: vm_inject_exception error %d",
__func__, error));
return (1);
case EXIT_REASON_EPT_FAULT:
/*
* If 'gpa' lies within the address space allocated to
* memory then this must be a nested page fault otherwise
* this must be an instruction that accesses MMIO space.
*/
gpa = vmcs_gpa();
if (vm_mem_allocated(vmx->vm, vcpu, gpa) ||
apic_access_fault(vmx, vcpu, gpa)) {
vmexit->exitcode = VM_EXITCODE_PAGING;
vmexit->inst_length = 0;
vmexit->u.paging.gpa = gpa;
vmexit->u.paging.fault_type = ept_fault_type(qual);
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
} else if (ept_emulation_fault(qual)) {
vmexit_inst_emul(vmexit, gpa, vmcs_gla());
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
}
/*
* If Virtual NMIs control is 1 and the VM-exit is due to an
* EPT fault during the execution of IRET then we must restore
* the state of "virtual-NMI blocking" before resuming.
*
* See description of "NMI unblocking due to IRET" in
* "Exit Qualification for EPT Violations".
*/
if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
(qual & EXIT_QUAL_NMIUDTI) != 0)
vmx_restore_nmi_blocking(vmx, vcpu);
break;
case EXIT_REASON_VIRTUALIZED_EOI:
vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
vmexit->u.ioapic_eoi.vector = qual & 0xFF;
vmexit->inst_length = 0; /* trap-like */
break;
case EXIT_REASON_APIC_ACCESS:
handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
break;
case EXIT_REASON_APIC_WRITE:
/*
* APIC-write VM exit is trap-like so the %rip is already
* pointing to the next instruction.
*/
vmexit->inst_length = 0;
vlapic = vm_lapic(vmx->vm, vcpu);
handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
break;
case EXIT_REASON_XSETBV:
handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
break;
case EXIT_REASON_MONITOR:
vmexit->exitcode = VM_EXITCODE_MONITOR;
break;
case EXIT_REASON_MWAIT:
vmexit->exitcode = VM_EXITCODE_MWAIT;
break;
default:
vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
break;
}
if (handled) {
/*
* It is possible that control is returned to userland
* even though we were able to handle the VM exit in the
* kernel.
*
* In such a case we want to make sure that the userland
* restarts guest execution at the instruction *after*
* the one we just processed. Therefore we update the
* guest rip in the VMCS and in 'vmexit'.
*/
vmexit->rip += vmexit->inst_length;
vmexit->inst_length = 0;
vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
} else {
if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
/*
* If this VM exit was not claimed by anybody then
* treat it as a generic VMX exit.
*/
vmexit->exitcode = VM_EXITCODE_VMX;
vmexit->u.vmx.status = VM_SUCCESS;
vmexit->u.vmx.inst_type = 0;
vmexit->u.vmx.inst_error = 0;
} else {
/*
* The exitcode and collateral have been populated.
* The VM exit will be processed further in userland.
*/
}
}
return (handled);
}
static __inline void
vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
{
KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
("vmx_exit_inst_error: invalid inst_fail_status %d",
vmxctx->inst_fail_status));
vmexit->inst_length = 0;
vmexit->exitcode = VM_EXITCODE_VMX;
vmexit->u.vmx.status = vmxctx->inst_fail_status;
vmexit->u.vmx.inst_error = vmcs_instruction_error();
vmexit->u.vmx.exit_reason = ~0;
vmexit->u.vmx.exit_qualification = ~0;
switch (rc) {
case VMX_VMRESUME_ERROR:
case VMX_VMLAUNCH_ERROR:
case VMX_INVEPT_ERROR:
vmexit->u.vmx.inst_type = rc;
break;
default:
panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
}
}
/*
* If the NMI-exiting VM execution control is set to '1' then an NMI in
* non-root operation causes a VM-exit. NMI blocking is in effect so it is
* sufficient to simply vector to the NMI handler via a software interrupt.
* However, this must be done before maskable interrupts are enabled
* otherwise the "iret" issued by an interrupt handler will incorrectly
* clear NMI blocking.
*/
static __inline void
vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
{
uint32_t intr_info;
KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
return;
intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
KASSERT((intr_info & VMCS_INTR_VALID) != 0,
("VM exit interruption info invalid: %#x", intr_info));
if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
"to NMI has invalid vector: %#x", intr_info));
VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
__asm __volatile("int $2");
}
}
static __inline void
vmx_dr_enter_guest(struct vmxctx *vmxctx)
{
register_t rflags;
/* Save host control debug registers. */
vmxctx->host_dr7 = rdr7();
vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
/*
* Disable debugging in DR7 and DEBUGCTL to avoid triggering
* exceptions in the host based on the guest DRx values. The
* guest DR7 and DEBUGCTL are saved/restored in the VMCS.
*/
load_dr7(0);
wrmsr(MSR_DEBUGCTLMSR, 0);
/*
* Disable single stepping the kernel to avoid corrupting the
* guest DR6. A debugger might still be able to corrupt the
* guest DR6 by setting a breakpoint after this point and then
* single stepping.
*/
rflags = read_rflags();
vmxctx->host_tf = rflags & PSL_T;
write_rflags(rflags & ~PSL_T);
/* Save host debug registers. */
vmxctx->host_dr0 = rdr0();
vmxctx->host_dr1 = rdr1();
vmxctx->host_dr2 = rdr2();
vmxctx->host_dr3 = rdr3();
vmxctx->host_dr6 = rdr6();
/* Restore guest debug registers. */
load_dr0(vmxctx->guest_dr0);
load_dr1(vmxctx->guest_dr1);
load_dr2(vmxctx->guest_dr2);
load_dr3(vmxctx->guest_dr3);
load_dr6(vmxctx->guest_dr6);
}
static __inline void
vmx_dr_leave_guest(struct vmxctx *vmxctx)
{
/* Save guest debug registers. */
vmxctx->guest_dr0 = rdr0();
vmxctx->guest_dr1 = rdr1();
vmxctx->guest_dr2 = rdr2();
vmxctx->guest_dr3 = rdr3();
vmxctx->guest_dr6 = rdr6();
/*
* Restore host debug registers. Restore DR7, DEBUGCTL, and
* PSL_T last.
*/
load_dr0(vmxctx->host_dr0);
load_dr1(vmxctx->host_dr1);
load_dr2(vmxctx->host_dr2);
load_dr3(vmxctx->host_dr3);
load_dr6(vmxctx->host_dr6);
wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl);
load_dr7(vmxctx->host_dr7);
write_rflags(read_rflags() | vmxctx->host_tf);
}
static int
vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
struct vm_eventinfo *evinfo)
{
int rc, handled, launched;
struct vmx *vmx;
struct vm *vm;
struct vmxctx *vmxctx;
struct vmcs *vmcs;
struct vm_exit *vmexit;
struct vlapic *vlapic;
uint32_t exit_reason;
vmx = arg;
vm = vmx->vm;
vmcs = &vmx->vmcs[vcpu];
vmxctx = &vmx->ctx[vcpu];
vlapic = vm_lapic(vm, vcpu);
vmexit = vm_exitinfo(vm, vcpu);
launched = 0;
KASSERT(vmxctx->pmap == pmap,
("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
vmx_msr_guest_enter(vmx, vcpu);
VMPTRLD(vmcs);
/*
* XXX
* We do this every time because we may setup the virtual machine
* from a different process than the one that actually runs it.
*
* If the life of a virtual machine was spent entirely in the context
* of a single process we could do this once in vmx_vminit().
*/
vmcs_write(VMCS_HOST_CR3, rcr3());
vmcs_write(VMCS_GUEST_RIP, rip);
vmx_set_pcpu_defaults(vmx, vcpu, pmap);
do {
KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch "
"%#lx/%#lx", __func__, vmcs_guest_rip(), rip));
handled = UNHANDLED;
/*
* Interrupts are disabled from this point on until the
* guest starts executing. This is done for the following
* reasons:
*
* If an AST is asserted on this thread after the check below,
* then the IPI_AST notification will not be lost, because it
* will cause a VM exit due to external interrupt as soon as
* the guest state is loaded.
*
* A posted interrupt after 'vmx_inject_interrupts()' will
* not be "lost" because it will be held pending in the host
* APIC because interrupts are disabled. The pending interrupt
* will be recognized as soon as the guest state is loaded.
*
* The same reasoning applies to the IPI generated by
* pmap_invalidate_ept().
*/
disable_intr();
vmx_inject_interrupts(vmx, vcpu, vlapic, rip);
/*
* Check for vcpu suspension after injecting events because
* vmx_inject_interrupts() can suspend the vcpu due to a
* triple fault.
*/
if (vcpu_suspended(evinfo)) {
enable_intr();
vm_exit_suspended(vmx->vm, vcpu, rip);
break;
}
if (vcpu_rendezvous_pending(evinfo)) {
enable_intr();
vm_exit_rendezvous(vmx->vm, vcpu, rip);
break;
}
if (vcpu_reqidle(evinfo)) {
enable_intr();
vm_exit_reqidle(vmx->vm, vcpu, rip);
break;
}
if (vcpu_should_yield(vm, vcpu)) {
enable_intr();
vm_exit_astpending(vmx->vm, vcpu, rip);
vmx_astpending_trace(vmx, vcpu, rip);
handled = HANDLED;
break;
}
+ if (vcpu_debugged(vm, vcpu)) {
+ enable_intr();
+ vm_exit_debug(vmx->vm, vcpu, rip);
+ break;
+ }
+
vmx_run_trace(vmx, vcpu);
vmx_dr_enter_guest(vmxctx);
rc = vmx_enter_guest(vmxctx, vmx, launched);
vmx_dr_leave_guest(vmxctx);
/* Collect some information for VM exit processing */
vmexit->rip = rip = vmcs_guest_rip();
vmexit->inst_length = vmexit_instruction_length();
vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
/* Update 'nextrip' */
vmx->state[vcpu].nextrip = rip;
if (rc == VMX_GUEST_VMEXIT) {
vmx_exit_handle_nmi(vmx, vcpu, vmexit);
enable_intr();
handled = vmx_exit_process(vmx, vcpu, vmexit);
} else {
enable_intr();
vmx_exit_inst_error(vmxctx, rc, vmexit);
}
launched = 1;
vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
rip = vmexit->rip;
} while (handled);
/*
* If a VM exit has been handled then the exitcode must be BOGUS
* If a VM exit is not handled then the exitcode must not be BOGUS
*/
if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
(!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
panic("Mismatch between handled (%d) and exitcode (%d)",
handled, vmexit->exitcode);
}
if (!handled)
vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
vmexit->exitcode);
VMCLEAR(vmcs);
vmx_msr_guest_exit(vmx, vcpu);
return (0);
}
static void
vmx_vmcleanup(void *arg)
{
int i;
struct vmx *vmx = arg;
if (apic_access_virtualization(vmx, 0))
vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
for (i = 0; i < VM_MAXCPU; i++)
vpid_free(vmx->state[i].vpid);
free(vmx, M_VMX);
return;
}
static register_t *
vmxctx_regptr(struct vmxctx *vmxctx, int reg)
{
switch (reg) {
case VM_REG_GUEST_RAX:
return (&vmxctx->guest_rax);
case VM_REG_GUEST_RBX:
return (&vmxctx->guest_rbx);
case VM_REG_GUEST_RCX:
return (&vmxctx->guest_rcx);
case VM_REG_GUEST_RDX:
return (&vmxctx->guest_rdx);
case VM_REG_GUEST_RSI:
return (&vmxctx->guest_rsi);
case VM_REG_GUEST_RDI:
return (&vmxctx->guest_rdi);
case VM_REG_GUEST_RBP:
return (&vmxctx->guest_rbp);
case VM_REG_GUEST_R8:
return (&vmxctx->guest_r8);
case VM_REG_GUEST_R9:
return (&vmxctx->guest_r9);
case VM_REG_GUEST_R10:
return (&vmxctx->guest_r10);
case VM_REG_GUEST_R11:
return (&vmxctx->guest_r11);
case VM_REG_GUEST_R12:
return (&vmxctx->guest_r12);
case VM_REG_GUEST_R13:
return (&vmxctx->guest_r13);
case VM_REG_GUEST_R14:
return (&vmxctx->guest_r14);
case VM_REG_GUEST_R15:
return (&vmxctx->guest_r15);
case VM_REG_GUEST_CR2:
return (&vmxctx->guest_cr2);
case VM_REG_GUEST_DR0:
return (&vmxctx->guest_dr0);
case VM_REG_GUEST_DR1:
return (&vmxctx->guest_dr1);
case VM_REG_GUEST_DR2:
return (&vmxctx->guest_dr2);
case VM_REG_GUEST_DR3:
return (&vmxctx->guest_dr3);
case VM_REG_GUEST_DR6:
return (&vmxctx->guest_dr6);
default:
break;
}
return (NULL);
}
static int
vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
{
register_t *regp;
if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
*retval = *regp;
return (0);
} else
return (EINVAL);
}
static int
vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
{
register_t *regp;
if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
*regp = val;
return (0);
} else
return (EINVAL);
}
static int
vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval)
{
uint64_t gi;
int error;
error = vmcs_getreg(&vmx->vmcs[vcpu], running,
VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);
*retval = (gi & HWINTR_BLOCKING) ? 1 : 0;
return (error);
}
static int
vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val)
{
struct vmcs *vmcs;
uint64_t gi;
int error, ident;
/*
* Forcing the vcpu into an interrupt shadow is not supported.
*/
if (val) {
error = EINVAL;
goto done;
}
vmcs = &vmx->vmcs[vcpu];
ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY);
error = vmcs_getreg(vmcs, running, ident, &gi);
if (error == 0) {
gi &= ~HWINTR_BLOCKING;
error = vmcs_setreg(vmcs, running, ident, gi);
}
done:
VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val,
error ? "failed" : "succeeded");
return (error);
}
static int
vmx_shadow_reg(int reg)
{
int shreg;
shreg = -1;
switch (reg) {
case VM_REG_GUEST_CR0:
shreg = VMCS_CR0_SHADOW;
break;
case VM_REG_GUEST_CR4:
shreg = VMCS_CR4_SHADOW;
break;
default:
break;
}
return (shreg);
}
static int
vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
{
int running, hostcpu;
struct vmx *vmx = arg;
running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
if (running && hostcpu != curcpu)
panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
if (reg == VM_REG_GUEST_INTR_SHADOW)
return (vmx_get_intr_shadow(vmx, vcpu, running, retval));
if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
return (0);
return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
}
static int
vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
{
int error, hostcpu, running, shadow;
uint64_t ctls;
pmap_t pmap;
struct vmx *vmx = arg;
running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
if (running && hostcpu != curcpu)
panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
if (reg == VM_REG_GUEST_INTR_SHADOW)
return (vmx_modify_intr_shadow(vmx, vcpu, running, val));
if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
return (0);
error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
if (error == 0) {
/*
* If the "load EFER" VM-entry control is 1 then the
* value of EFER.LMA must be identical to "IA-32e mode guest"
* bit in the VM-entry control.
*/
if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
(reg == VM_REG_GUEST_EFER)) {
vmcs_getreg(&vmx->vmcs[vcpu], running,
VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
if (val & EFER_LMA)
ctls |= VM_ENTRY_GUEST_LMA;
else
ctls &= ~VM_ENTRY_GUEST_LMA;
vmcs_setreg(&vmx->vmcs[vcpu], running,
VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
}
shadow = vmx_shadow_reg(reg);
if (shadow > 0) {
/*
* Store the unmodified value in the shadow
*/
error = vmcs_setreg(&vmx->vmcs[vcpu], running,
VMCS_IDENT(shadow), val);
}
if (reg == VM_REG_GUEST_CR3) {
/*
* Invalidate the guest vcpu's TLB mappings to emulate
* the behavior of updating %cr3.
*
* XXX the processor retains global mappings when %cr3
* is updated but vmx_invvpid() does not.
*/
pmap = vmx->ctx[vcpu].pmap;
vmx_invvpid(vmx, vcpu, pmap, running);
}
}
return (error);
}
static int
vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
int hostcpu, running;
struct vmx *vmx = arg;
running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
if (running && hostcpu != curcpu)
panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);
return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc));
}
static int
vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
{
int hostcpu, running;
struct vmx *vmx = arg;
running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
if (running && hostcpu != curcpu)
panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);
return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc));
}
static int
vmx_getcap(void *arg, int vcpu, int type, int *retval)
{
struct vmx *vmx = arg;
int vcap;
int ret;
ret = ENOENT;
vcap = vmx->cap[vcpu].set;
switch (type) {
case VM_CAP_HALT_EXIT:
if (cap_halt_exit)
ret = 0;
break;
case VM_CAP_PAUSE_EXIT:
if (cap_pause_exit)
ret = 0;
break;
case VM_CAP_MTRAP_EXIT:
if (cap_monitor_trap)
ret = 0;
break;
case VM_CAP_UNRESTRICTED_GUEST:
if (cap_unrestricted_guest)
ret = 0;
break;
case VM_CAP_ENABLE_INVPCID:
if (cap_invpcid)
ret = 0;
break;
default:
break;
}
if (ret == 0)
*retval = (vcap & (1 << type)) ? 1 : 0;
return (ret);
}
static int
vmx_setcap(void *arg, int vcpu, int type, int val)
{
struct vmx *vmx = arg;
struct vmcs *vmcs = &vmx->vmcs[vcpu];
uint32_t baseval;
uint32_t *pptr;
int error;
int flag;
int reg;
int retval;
retval = ENOENT;
pptr = NULL;
switch (type) {
case VM_CAP_HALT_EXIT:
if (cap_halt_exit) {
retval = 0;
pptr = &vmx->cap[vcpu].proc_ctls;
baseval = *pptr;
flag = PROCBASED_HLT_EXITING;
reg = VMCS_PRI_PROC_BASED_CTLS;
}
break;
case VM_CAP_MTRAP_EXIT:
if (cap_monitor_trap) {
retval = 0;
pptr = &vmx->cap[vcpu].proc_ctls;
baseval = *pptr;
flag = PROCBASED_MTF;
reg = VMCS_PRI_PROC_BASED_CTLS;
}
break;
case VM_CAP_PAUSE_EXIT:
if (cap_pause_exit) {
retval = 0;
pptr = &vmx->cap[vcpu].proc_ctls;
baseval = *pptr;
flag = PROCBASED_PAUSE_EXITING;
reg = VMCS_PRI_PROC_BASED_CTLS;
}
break;
case VM_CAP_UNRESTRICTED_GUEST:
if (cap_unrestricted_guest) {
retval = 0;
pptr = &vmx->cap[vcpu].proc_ctls2;
baseval = *pptr;
flag = PROCBASED2_UNRESTRICTED_GUEST;
reg = VMCS_SEC_PROC_BASED_CTLS;
}
break;
case VM_CAP_ENABLE_INVPCID:
if (cap_invpcid) {
retval = 0;
pptr = &vmx->cap[vcpu].proc_ctls2;
baseval = *pptr;
flag = PROCBASED2_ENABLE_INVPCID;
reg = VMCS_SEC_PROC_BASED_CTLS;
}
break;
default:
break;
}
if (retval == 0) {
if (val) {
baseval |= flag;
} else {
baseval &= ~flag;
}
VMPTRLD(vmcs);
error = vmwrite(reg, baseval);
VMCLEAR(vmcs);
if (error) {
retval = error;
} else {
/*
* Update optional stored flags, and record
* setting
*/
if (pptr != NULL) {
*pptr = baseval;
}
if (val) {
vmx->cap[vcpu].set |= (1 << type);
} else {
vmx->cap[vcpu].set &= ~(1 << type);
}
}
}
return (retval);
}
struct vlapic_vtx {
struct vlapic vlapic;
struct pir_desc *pir_desc;
struct vmx *vmx;
};
#define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \
do { \
VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \
level ? "level" : "edge", vector); \
VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \
VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \
VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \
VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \
VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
} while (0)
/*
* vlapic->ops handlers that utilize the APICv hardware assist described in
* Chapter 29 of the Intel SDM.
*/
static int
vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
{
struct vlapic_vtx *vlapic_vtx;
struct pir_desc *pir_desc;
uint64_t mask;
int idx, notify;
vlapic_vtx = (struct vlapic_vtx *)vlapic;
pir_desc = vlapic_vtx->pir_desc;
/*
* Keep track of interrupt requests in the PIR descriptor. This is
* because the virtual APIC page pointed to by the VMCS cannot be
* modified if the vcpu is running.
*/
idx = vector / 64;
mask = 1UL << (vector % 64);
atomic_set_long(&pir_desc->pir[idx], mask);
notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
level, "vmx_set_intr_ready");
return (notify);
}
static int
vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
{
struct vlapic_vtx *vlapic_vtx;
struct pir_desc *pir_desc;
struct LAPIC *lapic;
uint64_t pending, pirval;
uint32_t ppr, vpr;
int i;
/*
* This function is only expected to be called from the 'HLT' exit
* handler which does not care about the vector that is pending.
*/
KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
vlapic_vtx = (struct vlapic_vtx *)vlapic;
pir_desc = vlapic_vtx->pir_desc;
pending = atomic_load_acq_long(&pir_desc->pending);
if (!pending) {
/*
* While a virtual interrupt may have already been
* processed the actual delivery maybe pending the
* interruptibility of the guest. Recognize a pending
* interrupt by reevaluating virtual interrupts
* following Section 29.2.1 in the Intel SDM Volume 3.
*/
struct vm_exit *vmexit;
uint8_t rvi, ppr;
vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
KASSERT(vmexit->exitcode == VM_EXITCODE_HLT,
("vmx_pending_intr: exitcode not 'HLT'"));
rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT;
lapic = vlapic->apic_page;
ppr = lapic->ppr & APIC_TPR_INT;
if (rvi > ppr) {
return (1);
}
return (0);
}
/*
* If there is an interrupt pending then it will be recognized only
* if its priority is greater than the processor priority.
*
* Special case: if the processor priority is zero then any pending
* interrupt will be recognized.
*/
lapic = vlapic->apic_page;
ppr = lapic->ppr & APIC_TPR_INT;
if (ppr == 0)
return (1);
VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
lapic->ppr);
for (i = 3; i >= 0; i--) {
pirval = pir_desc->pir[i];
if (pirval != 0) {
vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT;
return (vpr > ppr);
}
}
return (0);
}
static void
vmx_intr_accepted(struct vlapic *vlapic, int vector)
{
panic("vmx_intr_accepted: not expected to be called");
}
static void
vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
{
struct vlapic_vtx *vlapic_vtx;
struct vmx *vmx;
struct vmcs *vmcs;
uint64_t mask, val;
KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
("vmx_set_tmr: vcpu cannot be running"));
vlapic_vtx = (struct vlapic_vtx *)vlapic;
vmx = vlapic_vtx->vmx;
vmcs = &vmx->vmcs[vlapic->vcpuid];
mask = 1UL << (vector % 64);
VMPTRLD(vmcs);
val = vmcs_read(VMCS_EOI_EXIT(vector));
if (level)
val |= mask;
else
val &= ~mask;
vmcs_write(VMCS_EOI_EXIT(vector), val);
VMCLEAR(vmcs);
}
static void
vmx_enable_x2apic_mode(struct vlapic *vlapic)
{
struct vmx *vmx;
struct vmcs *vmcs;
uint32_t proc_ctls2;
int vcpuid, error;
vcpuid = vlapic->vcpuid;
vmx = ((struct vlapic_vtx *)vlapic)->vmx;
vmcs = &vmx->vmcs[vcpuid];
proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));
proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
vmx->cap[vcpuid].proc_ctls2 = proc_ctls2;
VMPTRLD(vmcs);
vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
VMCLEAR(vmcs);
if (vlapic->vcpuid == 0) {
/*
* The nested page table mappings are shared by all vcpus
* so unmap the APIC access page just once.
*/
error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
__func__, error));
/*
* The MSR bitmap is shared by all vcpus so modify it only
* once in the context of vcpu 0.
*/
error = vmx_allow_x2apic_msrs(vmx);
KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
__func__, error));
}
}
static void
vmx_post_intr(struct vlapic *vlapic, int hostcpu)
{
ipi_cpu(hostcpu, pirvec);
}
/*
* Transfer the pending interrupts in the PIR descriptor to the IRR
* in the virtual APIC page.
*/
static void
vmx_inject_pir(struct vlapic *vlapic)
{
struct vlapic_vtx *vlapic_vtx;
struct pir_desc *pir_desc;
struct LAPIC *lapic;
uint64_t val, pirval;
int rvi, pirbase = -1;
uint16_t intr_status_old, intr_status_new;
vlapic_vtx = (struct vlapic_vtx *)vlapic;
pir_desc = vlapic_vtx->pir_desc;
if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
"no posted interrupt pending");
return;
}
pirval = 0;
pirbase = -1;
lapic = vlapic->apic_page;
val = atomic_readandclear_long(&pir_desc->pir[0]);
if (val != 0) {
lapic->irr0 |= val;
lapic->irr1 |= val >> 32;
pirbase = 0;
pirval = val;
}
val = atomic_readandclear_long(&pir_desc->pir[1]);
if (val != 0) {
lapic->irr2 |= val;
lapic->irr3 |= val >> 32;
pirbase = 64;
pirval = val;
}
val = atomic_readandclear_long(&pir_desc->pir[2]);
if (val != 0) {
lapic->irr4 |= val;
lapic->irr5 |= val >> 32;
pirbase = 128;
pirval = val;
}
val = atomic_readandclear_long(&pir_desc->pir[3]);
if (val != 0) {
lapic->irr6 |= val;
lapic->irr7 |= val >> 32;
pirbase = 192;
pirval = val;
}
VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
/*
* Update RVI so the processor can evaluate pending virtual
* interrupts on VM-entry.
*
* It is possible for pirval to be 0 here, even though the
* pending bit has been set. The scenario is:
* CPU-Y is sending a posted interrupt to CPU-X, which
* is running a guest and processing posted interrupts in h/w.
* CPU-X will eventually exit and the state seen in s/w is
* the pending bit set, but no PIR bits set.
*
* CPU-X CPU-Y
* (vm running) (host running)
* rx posted interrupt
* CLEAR pending bit
* SET PIR bit
* READ/CLEAR PIR bits
* SET pending bit
* (vm exit)
* pending bit set, PIR 0
*/
if (pirval != 0) {
rvi = pirbase + flsl(pirval) - 1;
intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
intr_status_new = (intr_status_old & 0xFF00) | rvi;
if (intr_status_new > intr_status_old) {
vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
"guest_intr_status changed from 0x%04x to 0x%04x",
intr_status_old, intr_status_new);
}
}
}
static struct vlapic *
vmx_vlapic_init(void *arg, int vcpuid)
{
struct vmx *vmx;
struct vlapic *vlapic;
struct vlapic_vtx *vlapic_vtx;
vmx = arg;
vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
vlapic->vm = vmx->vm;
vlapic->vcpuid = vcpuid;
vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
vlapic_vtx = (struct vlapic_vtx *)vlapic;
vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
vlapic_vtx->vmx = vmx;
if (virtual_interrupt_delivery) {
vlapic->ops.set_intr_ready = vmx_set_intr_ready;
vlapic->ops.pending_intr = vmx_pending_intr;
vlapic->ops.intr_accepted = vmx_intr_accepted;
vlapic->ops.set_tmr = vmx_set_tmr;
vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode;
}
if (posted_interrupts)
vlapic->ops.post_intr = vmx_post_intr;
vlapic_init(vlapic);
return (vlapic);
}
static void
vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
{
vlapic_cleanup(vlapic);
free(vlapic, M_VLAPIC);
}
struct vmm_ops vmm_ops_intel = {
vmx_init,
vmx_cleanup,
vmx_restore,
vmx_vminit,
vmx_run,
vmx_vmcleanup,
vmx_getreg,
vmx_setreg,
vmx_getdesc,
vmx_setdesc,
vmx_getcap,
vmx_setcap,
ept_vmspace_alloc,
ept_vmspace_free,
vmx_vlapic_init,
vmx_vlapic_cleanup,
};
Index: head/sys/amd64/vmm/vmm.c
===================================================================
--- head/sys/amd64/vmm/vmm.c (revision 332156)
+++ head/sys/amd64/vmm/vmm.c (revision 332157)
@@ -1,2594 +1,2666 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <machine/cpu.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"
#include "io/ppt.h"
#include "io/iommu.h"
struct vlapic;
/*
* Initialization:
* (a) allocated when vcpu is created
* (i) initialized when vcpu is created and when it is reinitialized
* (o) initialized the first time the vcpu is created
* (x) initialized before use
*/
struct vcpu {
struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */
enum vcpu_state state; /* (o) vcpu state */
int hostcpu; /* (o) vcpu's host cpu */
int reqidle; /* (i) request vcpu to idle */
struct vlapic *vlapic; /* (i) APIC device model */
enum x2apic_state x2apic_state; /* (i) APIC mode */
uint64_t exitintinfo; /* (i) events pending at VM exit */
int nmi_pending; /* (i) NMI pending */
int extint_pending; /* (i) INTR pending */
int exception_pending; /* (i) exception pending */
int exc_vector; /* (x) exception collateral */
int exc_errcode_valid;
uint32_t exc_errcode;
struct savefpu *guestfpu; /* (a,i) guest fpu state */
uint64_t guest_xcr0; /* (i) guest %xcr0 register */
void *stats; /* (a,i) statistics */
struct vm_exit exitinfo; /* (x) exit reason and collateral */
uint64_t nextrip; /* (x) next instruction to execute */
};
#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
struct mem_seg {
size_t len;
bool sysmem;
struct vm_object *object;
};
#define VM_MAX_MEMSEGS 3
struct mem_map {
vm_paddr_t gpa;
size_t len;
vm_ooffset_t segoff;
int segid;
int prot;
int flags;
};
#define VM_MAX_MEMMAPS 4
/*
* Initialization:
* (o) initialized the first time the VM is created
* (i) initialized when VM is created and when it is reinitialized
* (x) initialized before use
*/
struct vm {
void *cookie; /* (i) cpu-specific data */
void *iommu; /* (x) iommu-specific data */
struct vhpet *vhpet; /* (i) virtual HPET */
struct vioapic *vioapic; /* (i) virtual ioapic */
struct vatpic *vatpic; /* (i) virtual atpic */
struct vatpit *vatpit; /* (i) virtual atpit */
struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
struct vrtc *vrtc; /* (o) virtual RTC */
volatile cpuset_t active_cpus; /* (i) active vcpus */
+ volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */
int suspend; /* (i) stop VM execution */
volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */
cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */
void *rendezvous_arg; /* (x) rendezvous func/arg */
vm_rendezvous_func_t rendezvous_func;
struct mtx rendezvous_mtx; /* (o) rendezvous lock */
struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
struct vmspace *vmspace; /* (o) guest's address space */
char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
};
static int vmm_initialized;
static struct vmm_ops *ops;
#define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0)
#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
#define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0)
#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define VMRUN(vmi, vcpu, rip, pmap, evinfo) \
(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO)
#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define VMSPACE_ALLOC(min, max) \
(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define VMSPACE_FREE(vmspace) \
(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define VMGETREG(vmi, vcpu, num, retval) \
(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETREG(vmi, vcpu, num, val) \
(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define VMGETDESC(vmi, vcpu, num, desc) \
(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define VMSETDESC(vmi, vcpu, num, desc) \
(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define VMGETCAP(vmi, vcpu, num, retval) \
(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define VMSETCAP(vmi, vcpu, num, val) \
(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
#define VLAPIC_INIT(vmi, vcpu) \
(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define VLAPIC_CLEANUP(vmi, vlapic) \
(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
#define fpu_stop_emulating() clts()
static MALLOC_DEFINE(M_VM, "vm", "vm");
/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
/*
* Halt the guest if all vcpus are executing a HLT instruction with
* interrupts disabled.
*/
static int halt_detection_enabled = 1;
SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
&halt_detection_enabled, 0,
"Halt VM if all vcpus execute HLT with interrupts disabled");
static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
"IPI vector used for vcpu notifications");
static int trace_guest_exceptions;
SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
&trace_guest_exceptions, 0,
"Trap into hypervisor on all guest exceptions and reflect them back");
static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
#ifdef KTR
static const char *
vcpu_state2str(enum vcpu_state state)
{
switch (state) {
case VCPU_IDLE:
return ("idle");
case VCPU_FROZEN:
return ("frozen");
case VCPU_RUNNING:
return ("running");
case VCPU_SLEEPING:
return ("sleeping");
default:
return ("unknown");
}
}
#endif
static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
struct vcpu *vcpu = &vm->vcpu[i];
VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
if (destroy) {
vmm_stat_free(vcpu->stats);
fpu_save_area_free(vcpu->guestfpu);
}
}
static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
struct vcpu *vcpu;
KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
("vcpu_init: invalid vcpu %d", vcpu_id));
vcpu = &vm->vcpu[vcpu_id];
if (create) {
KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
"initialized", vcpu_id));
vcpu_lock_init(vcpu);
vcpu->state = VCPU_IDLE;
vcpu->hostcpu = NOCPU;
vcpu->guestfpu = fpu_save_area_alloc();
vcpu->stats = vmm_stat_alloc();
}
vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
vcpu->reqidle = 0;
vcpu->exitintinfo = 0;
vcpu->nmi_pending = 0;
vcpu->extint_pending = 0;
vcpu->exception_pending = 0;
vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
fpu_save_area_reset(vcpu->guestfpu);
vmm_stat_init(vcpu->stats);
}
int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{
return (trace_guest_exceptions);
}
struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
struct vcpu *vcpu;
if (cpuid < 0 || cpuid >= VM_MAXCPU)
panic("vm_exitinfo: invalid cpuid %d", cpuid);
vcpu = &vm->vcpu[cpuid];
return (&vcpu->exitinfo);
}
static void
vmm_resume(void)
{
VMM_RESUME();
}
static int
vmm_init(void)
{
int error;
vmm_host_state_init();
vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
&IDTVEC(justreturn));
if (vmm_ipinum < 0)
vmm_ipinum = IPI_AST;
error = vmm_mem_init();
if (error)
return (error);
if (vmm_is_intel())
ops = &vmm_ops_intel;
else if (vmm_is_amd())
ops = &vmm_ops_amd;
else
return (ENXIO);
vmm_resume_p = vmm_resume;
return (VMM_INIT(vmm_ipinum));
}
static int
vmm_handler(module_t mod, int what, void *arg)
{
int error;
switch (what) {
case MOD_LOAD:
vmmdev_init();
error = vmm_init();
if (error == 0)
vmm_initialized = 1;
break;
case MOD_UNLOAD:
error = vmmdev_cleanup();
if (error == 0) {
vmm_resume_p = NULL;
iommu_cleanup();
if (vmm_ipinum != IPI_AST)
lapic_ipi_free(vmm_ipinum);
error = VMM_CLEANUP();
/*
* Something bad happened - prevent new
* VMs from being created
*/
if (error)
vmm_initialized = 0;
}
break;
default:
error = 0;
break;
}
return (error);
}
static moduledata_t vmm_kmod = {
"vmm",
vmm_handler,
NULL
};
/*
* vmm initialization has the following dependencies:
*
* - VT-x initialization requires smp_rendezvous() and therefore must happen
* after SMP is fully functional (after SI_SUB_SMP).
*/
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
static void
vm_init(struct vm *vm, bool create)
{
int i;
vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
vm->iommu = NULL;
vm->vioapic = vioapic_init(vm);
vm->vhpet = vhpet_init(vm);
vm->vatpic = vatpic_init(vm);
vm->vatpit = vatpit_init(vm);
vm->vpmtmr = vpmtmr_init(vm);
if (create)
vm->vrtc = vrtc_init(vm);
CPU_ZERO(&vm->active_cpus);
+ CPU_ZERO(&vm->debug_cpus);
vm->suspend = 0;
CPU_ZERO(&vm->suspended_cpus);
for (i = 0; i < VM_MAXCPU; i++)
vcpu_init(vm, i, create);
}
int
vm_create(const char *name, struct vm **retvm)
{
struct vm *vm;
struct vmspace *vmspace;
/*
* If vmm.ko could not be successfully initialized then don't attempt
* to create the virtual machine.
*/
if (!vmm_initialized)
return (ENXIO);
if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
return (EINVAL);
vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
if (vmspace == NULL)
return (ENOMEM);
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
vm->vmspace = vmspace;
mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
vm_init(vm, true);
*retvm = vm;
return (0);
}
static void
vm_cleanup(struct vm *vm, bool destroy)
{
struct mem_map *mm;
int i;
ppt_unassign_all(vm);
if (vm->iommu != NULL)
iommu_destroy_domain(vm->iommu);
if (destroy)
vrtc_cleanup(vm->vrtc);
else
vrtc_reset(vm->vrtc);
vpmtmr_cleanup(vm->vpmtmr);
vatpit_cleanup(vm->vatpit);
vhpet_cleanup(vm->vhpet);
vatpic_cleanup(vm->vatpic);
vioapic_cleanup(vm->vioapic);
for (i = 0; i < VM_MAXCPU; i++)
vcpu_cleanup(vm, i, destroy);
VMCLEANUP(vm->cookie);
/*
* System memory is removed from the guest address space only when
* the VM is destroyed. This is because the mapping remains the same
* across VM reset.
*
* Device memory can be relocated by the guest (e.g. using PCI BARs)
* so those mappings are removed on a VM reset.
*/
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
mm = &vm->mem_maps[i];
if (destroy || !sysmem_mapping(vm, mm))
vm_free_memmap(vm, i);
}
if (destroy) {
for (i = 0; i < VM_MAX_MEMSEGS; i++)
vm_free_memseg(vm, i);
VMSPACE_FREE(vm->vmspace);
vm->vmspace = NULL;
}
}
void
vm_destroy(struct vm *vm)
{
vm_cleanup(vm, true);
free(vm, M_VM);
}
int
vm_reinit(struct vm *vm)
{
int error;
/*
* A virtual machine can be reset only if all vcpus are suspended.
*/
if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
vm_cleanup(vm, false);
vm_init(vm, false);
error = 0;
} else {
error = EBUSY;
}
return (error);
}
const char *
vm_name(struct vm *vm)
{
return (vm->name);
}
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
vm_object_t obj;
if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
return (ENOMEM);
else
return (0);
}
int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
vmm_mmio_free(vm->vmspace, gpa, len);
return (0);
}
/*
* Return 'true' if 'gpa' is allocated in the guest address space.
*
* This function is called in the context of a running vcpu which acts as
* an implicit lock on 'vm->mem_maps[]'.
*/
bool
vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
{
struct mem_map *mm;
int i;
#ifdef INVARIANTS
int hostcpu, state;
state = vcpu_get_state(vm, vcpuid, &hostcpu);
KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
#endif
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
mm = &vm->mem_maps[i];
if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
return (true); /* 'gpa' is sysmem or devmem */
}
if (ppt_is_mmio(vm, gpa))
return (true); /* 'gpa' is pci passthru mmio */
return (false);
}
int
vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
{
struct mem_seg *seg;
vm_object_t obj;
if (ident < 0 || ident >= VM_MAX_MEMSEGS)
return (EINVAL);
if (len == 0 || (len & PAGE_MASK))
return (EINVAL);
seg = &vm->mem_segs[ident];
if (seg->object != NULL) {
if (seg->len == len && seg->sysmem == sysmem)
return (EEXIST);
else
return (EINVAL);
}
obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
if (obj == NULL)
return (ENOMEM);
seg->len = len;
seg->object = obj;
seg->sysmem = sysmem;
return (0);
}
int
vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
vm_object_t *objptr)
{
struct mem_seg *seg;
if (ident < 0 || ident >= VM_MAX_MEMSEGS)
return (EINVAL);
seg = &vm->mem_segs[ident];
if (len)
*len = seg->len;
if (sysmem)
*sysmem = seg->sysmem;
if (objptr)
*objptr = seg->object;
return (0);
}
void
vm_free_memseg(struct vm *vm, int ident)
{
struct mem_seg *seg;
KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
("%s: invalid memseg ident %d", __func__, ident));
seg = &vm->mem_segs[ident];
if (seg->object != NULL) {
vm_object_deallocate(seg->object);
bzero(seg, sizeof(struct mem_seg));
}
}
int
vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
size_t len, int prot, int flags)
{
struct mem_seg *seg;
struct mem_map *m, *map;
vm_ooffset_t last;
int i, error;
if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
return (EINVAL);
if (flags & ~VM_MEMMAP_F_WIRED)
return (EINVAL);
if (segid < 0 || segid >= VM_MAX_MEMSEGS)
return (EINVAL);
seg = &vm->mem_segs[segid];
if (seg->object == NULL)
return (EINVAL);
last = first + len;
if (first < 0 || first >= last || last > seg->len)
return (EINVAL);
if ((gpa | first | last) & PAGE_MASK)
return (EINVAL);
map = NULL;
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
m = &vm->mem_maps[i];
if (m->len == 0) {
map = m;
break;
}
}
if (map == NULL)
return (ENOSPC);
error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
len, 0, VMFS_NO_SPACE, prot, prot, 0);
if (error != KERN_SUCCESS)
return (EFAULT);
vm_object_reference(seg->object);
if (flags & VM_MEMMAP_F_WIRED) {
error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
if (error != KERN_SUCCESS) {
vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
return (EFAULT);
}
}
map->gpa = gpa;
map->len = len;
map->segoff = first;
map->segid = segid;
map->prot = prot;
map->flags = flags;
return (0);
}
int
vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
struct mem_map *mm, *mmnext;
int i;
mmnext = NULL;
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
mm = &vm->mem_maps[i];
if (mm->len == 0 || mm->gpa < *gpa)
continue;
if (mmnext == NULL || mm->gpa < mmnext->gpa)
mmnext = mm;
}
if (mmnext != NULL) {
*gpa = mmnext->gpa;
if (segid)
*segid = mmnext->segid;
if (segoff)
*segoff = mmnext->segoff;
if (len)
*len = mmnext->len;
if (prot)
*prot = mmnext->prot;
if (flags)
*flags = mmnext->flags;
return (0);
} else {
return (ENOENT);
}
}
static void
vm_free_memmap(struct vm *vm, int ident)
{
struct mem_map *mm;
int error;
mm = &vm->mem_maps[ident];
if (mm->len) {
error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
mm->gpa + mm->len);
KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
__func__, error));
bzero(mm, sizeof(struct mem_map));
}
}
static __inline bool
sysmem_mapping(struct vm *vm, struct mem_map *mm)
{
if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
return (true);
else
return (false);
}
static vm_paddr_t
sysmem_maxaddr(struct vm *vm)
{
struct mem_map *mm;
vm_paddr_t maxaddr;
int i;
maxaddr = 0;
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
mm = &vm->mem_maps[i];
if (sysmem_mapping(vm, mm)) {
if (maxaddr < mm->gpa + mm->len)
maxaddr = mm->gpa + mm->len;
}
}
return (maxaddr);
}
static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
int i, sz;
vm_paddr_t gpa, hpa;
struct mem_map *mm;
void *vp, *cookie, *host_domain;
sz = PAGE_SIZE;
host_domain = iommu_host_domain();
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
mm = &vm->mem_maps[i];
if (!sysmem_mapping(vm, mm))
continue;
if (map) {
KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
("iommu map found invalid memmap %#lx/%#lx/%#x",
mm->gpa, mm->len, mm->flags));
if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
continue;
mm->flags |= VM_MEMMAP_F_IOMMU;
} else {
if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
continue;
mm->flags &= ~VM_MEMMAP_F_IOMMU;
KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
("iommu unmap found invalid memmap %#lx/%#lx/%#x",
mm->gpa, mm->len, mm->flags));
}
gpa = mm->gpa;
while (gpa < mm->gpa + mm->len) {
vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE,
&cookie);
KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
vm_name(vm), gpa));
vm_gpa_release(cookie);
hpa = DMAP_TO_PHYS((uintptr_t)vp);
if (map) {
iommu_create_mapping(vm->iommu, gpa, hpa, sz);
iommu_remove_mapping(host_domain, hpa, sz);
} else {
iommu_remove_mapping(vm->iommu, gpa, sz);
iommu_create_mapping(host_domain, hpa, hpa, sz);
}
gpa += PAGE_SIZE;
}
}
/*
* Invalidate the cached translations associated with the domain
* from which pages were removed.
*/
if (map)
iommu_invalidate_tlb(host_domain);
else
iommu_invalidate_tlb(vm->iommu);
}
#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE)
#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE)
int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
int error;
error = ppt_unassign_device(vm, bus, slot, func);
if (error)
return (error);
if (ppt_assigned_devices(vm) == 0)
vm_iommu_unmap(vm);
return (0);
}
int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
int error;
vm_paddr_t maxaddr;
/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
if (ppt_assigned_devices(vm) == 0) {
KASSERT(vm->iommu == NULL,
("vm_assign_pptdev: iommu must be NULL"));
maxaddr = sysmem_maxaddr(vm);
vm->iommu = iommu_create_domain(maxaddr);
if (vm->iommu == NULL)
return (ENXIO);
vm_iommu_map(vm);
}
error = ppt_assign_device(vm, bus, slot, func);
return (error);
}
void *
vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
void **cookie)
{
int i, count, pageoff;
struct mem_map *mm;
vm_page_t m;
#ifdef INVARIANTS
/*
* All vcpus are frozen by ioctls that modify the memory map
* (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
* guaranteed if at least one vcpu is in the VCPU_FROZEN state.
*/
int state;
KASSERT(vcpuid >= -1 && vcpuid < VM_MAXCPU, ("%s: invalid vcpuid %d",
__func__, vcpuid));
for (i = 0; i < VM_MAXCPU; i++) {
if (vcpuid != -1 && vcpuid != i)
continue;
state = vcpu_get_state(vm, i, NULL);
KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
__func__, state));
}
#endif
pageoff = gpa & PAGE_MASK;
if (len > PAGE_SIZE - pageoff)
panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
count = 0;
for (i = 0; i < VM_MAX_MEMMAPS; i++) {
mm = &vm->mem_maps[i];
if (sysmem_mapping(vm, mm) && gpa >= mm->gpa &&
gpa < mm->gpa + mm->len) {
count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
break;
}
}
if (count == 1) {
*cookie = m;
return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
} else {
*cookie = NULL;
return (NULL);
}
}
void
vm_gpa_release(void *cookie)
{
vm_page_t m = cookie;
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unlock(m);
}
int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{
if (vcpu < 0 || vcpu >= VM_MAXCPU)
return (EINVAL);
if (reg >= VM_REG_LAST)
return (EINVAL);
return (VMGETREG(vm->cookie, vcpu, reg, retval));
}
int
vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
struct vcpu *vcpu;
int error;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
if (reg >= VM_REG_LAST)
return (EINVAL);
error = VMSETREG(vm->cookie, vcpuid, reg, val);
if (error || reg != VM_REG_GUEST_RIP)
return (error);
/* Set 'nextrip' to match the value of %rip */
VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
vcpu = &vm->vcpu[vcpuid];
vcpu->nextrip = val;
return (0);
}
static boolean_t
is_descriptor_table(int reg)
{
switch (reg) {
case VM_REG_GUEST_IDTR:
case VM_REG_GUEST_GDTR:
return (TRUE);
default:
return (FALSE);
}
}
static boolean_t
is_segment_register(int reg)
{
switch (reg) {
case VM_REG_GUEST_ES:
case VM_REG_GUEST_CS:
case VM_REG_GUEST_SS:
case VM_REG_GUEST_DS:
case VM_REG_GUEST_FS:
case VM_REG_GUEST_GS:
case VM_REG_GUEST_TR:
case VM_REG_GUEST_LDTR:
return (TRUE);
default:
return (FALSE);
}
}
int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
struct seg_desc *desc)
{
if (vcpu < 0 || vcpu >= VM_MAXCPU)
return (EINVAL);
if (!is_segment_register(reg) && !is_descriptor_table(reg))
return (EINVAL);
return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}
int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
struct seg_desc *desc)
{
if (vcpu < 0 || vcpu >= VM_MAXCPU)
return (EINVAL);
if (!is_segment_register(reg) && !is_descriptor_table(reg))
return (EINVAL);
return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}
static void
restore_guest_fpustate(struct vcpu *vcpu)
{
/* flush host state to the pcb */
fpuexit(curthread);
/* restore guest FPU state */
fpu_stop_emulating();
fpurestore(vcpu->guestfpu);
/* restore guest XCR0 if XSAVE is enabled in the host */
if (rcr4() & CR4_XSAVE)
load_xcr(0, vcpu->guest_xcr0);
/*
* The FPU is now "dirty" with the guest's state so turn on emulation
* to trap any access to the FPU by the host.
*/
fpu_start_emulating();
}
static void
save_guest_fpustate(struct vcpu *vcpu)
{
if ((rcr0() & CR0_TS) == 0)
panic("fpu emulation not enabled in host!");
/* save guest XCR0 and restore host XCR0 */
if (rcr4() & CR4_XSAVE) {
vcpu->guest_xcr0 = rxcr(0);
load_xcr(0, vmm_get_host_xcr0());
}
/* save guest FPU state */
fpu_stop_emulating();
fpusave(vcpu->guestfpu);
fpu_start_emulating();
}
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
static int
vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
bool from_idle)
{
struct vcpu *vcpu;
int error;
vcpu = &vm->vcpu[vcpuid];
vcpu_assert_locked(vcpu);
/*
* State transitions from the vmmdev_ioctl() must always begin from
* the VCPU_IDLE state. This guarantees that there is only a single
* ioctl() operating on a vcpu at any point.
*/
if (from_idle) {
while (vcpu->state != VCPU_IDLE) {
vcpu->reqidle = 1;
vcpu_notify_event_locked(vcpu, false);
VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
"idle requested", vcpu_state2str(vcpu->state));
msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
}
} else {
KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
"vcpu idle state"));
}
if (vcpu->state == VCPU_RUNNING) {
KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
"mismatch for running vcpu", curcpu, vcpu->hostcpu));
} else {
KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
"vcpu that is not running", vcpu->hostcpu));
}
/*
* The following state transitions are allowed:
* IDLE -> FROZEN -> IDLE
* FROZEN -> RUNNING -> FROZEN
* FROZEN -> SLEEPING -> FROZEN
*/
switch (vcpu->state) {
case VCPU_IDLE:
case VCPU_RUNNING:
case VCPU_SLEEPING:
error = (newstate != VCPU_FROZEN);
break;
case VCPU_FROZEN:
error = (newstate == VCPU_FROZEN);
break;
default:
error = 1;
break;
}
if (error)
return (EBUSY);
VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
vcpu->state = newstate;
if (newstate == VCPU_RUNNING)
vcpu->hostcpu = curcpu;
else
vcpu->hostcpu = NOCPU;
if (newstate == VCPU_IDLE)
wakeup(&vcpu->state);
return (0);
}
static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
panic("Error %d setting state to %d\n", error, newstate);
}
static void
vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
panic("Error %d setting state to %d", error, newstate);
}
static void
vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
{
KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
/*
* Update 'rendezvous_func' and execute a write memory barrier to
* ensure that it is visible across all host cpus. This is not needed
* for correctness but it does ensure that all the vcpus will notice
* that the rendezvous is requested immediately.
*/
vm->rendezvous_func = func;
wmb();
}
#define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \
do { \
if (vcpuid >= 0) \
VCPU_CTR0(vm, vcpuid, fmt); \
else \
VM_CTR0(vm, fmt); \
} while (0)
static void
vm_handle_rendezvous(struct vm *vm, int vcpuid)
{
KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
mtx_lock(&vm->rendezvous_mtx);
while (vm->rendezvous_func != NULL) {
/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
if (vcpuid != -1 &&
CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
!CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
}
if (CPU_CMP(&vm->rendezvous_req_cpus,
&vm->rendezvous_done_cpus) == 0) {
VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
vm_set_rendezvous_func(vm, NULL);
wakeup(&vm->rendezvous_func);
break;
}
RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
"vmrndv", 0);
}
mtx_unlock(&vm->rendezvous_mtx);
}
/*
* Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
*/
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
struct vcpu *vcpu;
const char *wmesg;
int t, vcpu_halted, vm_halted;
KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
vcpu = &vm->vcpu[vcpuid];
vcpu_halted = 0;
vm_halted = 0;
vcpu_lock(vcpu);
while (1) {
/*
* Do a final check for pending NMI or interrupts before
* really putting this thread to sleep. Also check for
* software events that would cause this vcpu to wakeup.
*
* These interrupts/events could have happened after the
* vcpu returned from VMRUN() and before it acquired the
* vcpu lock above.
*/
if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
break;
if (vm_nmi_pending(vm, vcpuid))
break;
if (!intr_disabled) {
if (vm_extint_pending(vm, vcpuid) ||
vlapic_pending_intr(vcpu->vlapic, NULL)) {
break;
}
}
/* Don't go to sleep if the vcpu thread needs to yield */
if (vcpu_should_yield(vm, vcpuid))
break;
+ if (vcpu_debugged(vm, vcpuid))
+ break;
+
/*
* Some Linux guests implement "halt" by having all vcpus
* execute HLT with interrupts disabled. 'halted_cpus' keeps
* track of the vcpus that have entered this state. When all
* vcpus enter the halted state the virtual machine is halted.
*/
if (intr_disabled) {
wmesg = "vmhalt";
VCPU_CTR0(vm, vcpuid, "Halted");
if (!vcpu_halted && halt_detection_enabled) {
vcpu_halted = 1;
CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
}
if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
vm_halted = 1;
break;
}
} else {
wmesg = "vmidle";
}
t = ticks;
vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
/*
* XXX msleep_spin() cannot be interrupted by signals so
* wake up periodically to check pending signals.
*/
msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
}
if (vcpu_halted)
CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
vcpu_unlock(vcpu);
if (vm_halted)
vm_suspend(vm, VM_SUSPEND_HALT);
return (0);
}
static int
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
{
int rv, ftype;
struct vm_map *map;
struct vcpu *vcpu;
struct vm_exit *vme;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
__func__, vme->inst_length));
ftype = vme->u.paging.fault_type;
KASSERT(ftype == VM_PROT_READ ||
ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
("vm_handle_paging: invalid fault_type %d", ftype));
if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
vme->u.paging.gpa, ftype);
if (rv == 0) {
VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
ftype == VM_PROT_READ ? "accessed" : "dirty",
vme->u.paging.gpa);
goto done;
}
}
map = &vm->vmspace->vm_map;
rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
"ftype = %d", rv, vme->u.paging.gpa, ftype);
if (rv != KERN_SUCCESS)
return (EFAULT);
done:
return (0);
}
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
{
struct vie *vie;
struct vcpu *vcpu;
struct vm_exit *vme;
uint64_t gla, gpa, cs_base;
struct vm_guest_paging *paging;
mem_region_read_t mread;
mem_region_write_t mwrite;
enum vm_cpu_mode cpu_mode;
int cs_d, error, fault;
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
__func__, vme->inst_length));
gla = vme->u.inst_emul.gla;
gpa = vme->u.inst_emul.gpa;
cs_base = vme->u.inst_emul.cs_base;
cs_d = vme->u.inst_emul.cs_d;
vie = &vme->u.inst_emul.vie;
paging = &vme->u.inst_emul.paging;
cpu_mode = paging->cpu_mode;
VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);
/* Fetch, decode and emulate the faulting instruction */
if (vie->num_valid == 0) {
error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
cs_base, VIE_INST_SIZE, vie, &fault);
} else {
/*
* The instruction bytes have already been copied into 'vie'
*/
error = fault = 0;
}
if (error || fault)
return (error);
if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) {
VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx",
vme->rip + cs_base);
*retu = true; /* dump instruction bytes in userspace */
return (0);
}
/*
* Update 'nextrip' based on the length of the emulated instruction.
*/
vme->inst_length = vie->num_processed;
vcpu->nextrip += vie->num_processed;
VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction "
"decoding", vcpu->nextrip);
/* return to userland unless this is an in-kernel emulated device */
if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
mread = lapic_mmio_read;
mwrite = lapic_mmio_write;
} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
mread = vioapic_mmio_read;
mwrite = vioapic_mmio_write;
} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
mread = vhpet_mmio_read;
mwrite = vhpet_mmio_write;
} else {
*retu = true;
return (0);
}
error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
mread, mwrite, retu);
return (error);
}
static int
vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
{
int i, done;
struct vcpu *vcpu;
done = 0;
vcpu = &vm->vcpu[vcpuid];
CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
/*
* Wait until all 'active_cpus' have suspended themselves.
*
* Since a VM may be suspended at any time including when one or
* more vcpus are doing a rendezvous we need to call the rendezvous
* handler while we are waiting to prevent a deadlock.
*/
vcpu_lock(vcpu);
while (1) {
if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
break;
}
if (vm->rendezvous_func == NULL) {
VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
} else {
VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
vcpu_unlock(vcpu);
vm_handle_rendezvous(vm, vcpuid);
vcpu_lock(vcpu);
}
}
vcpu_unlock(vcpu);
/*
* Wakeup the other sleeping vcpus and return to userspace.
*/
for (i = 0; i < VM_MAXCPU; i++) {
if (CPU_ISSET(i, &vm->suspended_cpus)) {
vcpu_notify_event(vm, i, false);
}
}
*retu = true;
return (0);
}
static int
vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu)
{
struct vcpu *vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
vcpu->reqidle = 0;
vcpu_unlock(vcpu);
*retu = true;
return (0);
}
int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
int i;
if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
return (EINVAL);
if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
VM_CTR2(vm, "virtual machine already suspended %d/%d",
vm->suspend, how);
return (EALREADY);
}
VM_CTR1(vm, "virtual machine successfully suspended %d", how);
/*
* Notify all active vcpus that they are now suspended.
*/
for (i = 0; i < VM_MAXCPU; i++) {
if (CPU_ISSET(i, &vm->active_cpus))
vcpu_notify_event(vm, i, false);
}
return (0);
}
void
vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
{
struct vm_exit *vmexit;
KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
("vm_exit_suspended: invalid suspend type %d", vm->suspend));
vmexit = vm_exitinfo(vm, vcpuid);
vmexit->rip = rip;
vmexit->inst_length = 0;
vmexit->exitcode = VM_EXITCODE_SUSPENDED;
vmexit->u.suspended.how = vm->suspend;
}
void
+vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
+{
+ struct vm_exit *vmexit;
+
+ vmexit = vm_exitinfo(vm, vcpuid);
+ vmexit->rip = rip;
+ vmexit->inst_length = 0;
+ vmexit->exitcode = VM_EXITCODE_DEBUG;
+}
+
+void
vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
{
struct vm_exit *vmexit;
KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
vmexit = vm_exitinfo(vm, vcpuid);
vmexit->rip = rip;
vmexit->inst_length = 0;
vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
}
void
vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
{
struct vm_exit *vmexit;
vmexit = vm_exitinfo(vm, vcpuid);
vmexit->rip = rip;
vmexit->inst_length = 0;
vmexit->exitcode = VM_EXITCODE_REQIDLE;
vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
}
void
vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
{
struct vm_exit *vmexit;
vmexit = vm_exitinfo(vm, vcpuid);
vmexit->rip = rip;
vmexit->inst_length = 0;
vmexit->exitcode = VM_EXITCODE_BOGUS;
vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
}
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
struct vm_eventinfo evinfo;
int error, vcpuid;
struct vcpu *vcpu;
struct pcb *pcb;
uint64_t tscval;
struct vm_exit *vme;
bool retu, intr_disabled;
pmap_t pmap;
vcpuid = vmrun->cpuid;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
if (!CPU_ISSET(vcpuid, &vm->active_cpus))
return (EINVAL);
if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
return (EINVAL);
pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
evinfo.rptr = &vm->rendezvous_func;
evinfo.sptr = &vm->suspend;
evinfo.iptr = &vcpu->reqidle;
restart:
critical_enter();
KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
("vm_run: absurd pm_active"));
tscval = rdtsc();
pcb = PCPU_GET(curpcb);
set_pcb_flags(pcb, PCB_FULL_IRET);
restore_guest_fpustate(vcpu);
vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
save_guest_fpustate(vcpu);
vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
critical_exit();
if (error == 0) {
retu = false;
vcpu->nextrip = vme->rip + vme->inst_length;
switch (vme->exitcode) {
case VM_EXITCODE_REQIDLE:
error = vm_handle_reqidle(vm, vcpuid, &retu);
break;
case VM_EXITCODE_SUSPENDED:
error = vm_handle_suspend(vm, vcpuid, &retu);
break;
case VM_EXITCODE_IOAPIC_EOI:
vioapic_process_eoi(vm, vcpuid,
vme->u.ioapic_eoi.vector);
break;
case VM_EXITCODE_RENDEZVOUS:
vm_handle_rendezvous(vm, vcpuid);
error = 0;
break;
case VM_EXITCODE_HLT:
intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
break;
case VM_EXITCODE_PAGING:
error = vm_handle_paging(vm, vcpuid, &retu);
break;
case VM_EXITCODE_INST_EMUL:
error = vm_handle_inst_emul(vm, vcpuid, &retu);
break;
case VM_EXITCODE_INOUT:
case VM_EXITCODE_INOUT_STR:
error = vm_handle_inout(vm, vcpuid, vme, &retu);
break;
case VM_EXITCODE_MONITOR:
case VM_EXITCODE_MWAIT:
vm_inject_ud(vm, vcpuid);
break;
default:
retu = true; /* handled in userland */
break;
}
}
if (error == 0 && retu == false)
goto restart;
VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
/* copy the exit information */
bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
return (error);
}
int
vm_restart_instruction(void *arg, int vcpuid)
{
struct vm *vm;
struct vcpu *vcpu;
enum vcpu_state state;
uint64_t rip;
int error;
vm = arg;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
state = vcpu_get_state(vm, vcpuid, NULL);
if (state == VCPU_RUNNING) {
/*
* When a vcpu is "running" the next instruction is determined
* by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
* Thus setting 'inst_length' to zero will cause the current
* instruction to be restarted.
*/
vcpu->exitinfo.inst_length = 0;
VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
"setting inst_length to zero", vcpu->exitinfo.rip);
} else if (state == VCPU_FROZEN) {
/*
* When a vcpu is "frozen" it is outside the critical section
* around VMRUN() and 'nextrip' points to the next instruction.
* Thus instruction restart is achieved by setting 'nextrip'
* to the vcpu's %rip.
*/
error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
KASSERT(!error, ("%s: error %d getting rip", __func__, error));
VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
"nextrip from %#lx to %#lx", vcpu->nextrip, rip);
vcpu->nextrip = rip;
} else {
panic("%s: invalid state %d", __func__, state);
}
return (0);
}
int
vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
{
struct vcpu *vcpu;
int type, vector;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
if (info & VM_INTINFO_VALID) {
type = info & VM_INTINFO_TYPE;
vector = info & 0xff;
if (type == VM_INTINFO_NMI && vector != IDT_NMI)
return (EINVAL);
if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
return (EINVAL);
if (info & VM_INTINFO_RSVD)
return (EINVAL);
} else {
info = 0;
}
VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
vcpu->exitintinfo = info;
return (0);
}
enum exc_class {
EXC_BENIGN,
EXC_CONTRIBUTORY,
EXC_PAGEFAULT
};
#define IDT_VE 20 /* Virtualization Exception (Intel specific) */
static enum exc_class
exception_class(uint64_t info)
{
int type, vector;
KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
type = info & VM_INTINFO_TYPE;
vector = info & 0xff;
/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
switch (type) {
case VM_INTINFO_HWINTR:
case VM_INTINFO_SWINTR:
case VM_INTINFO_NMI:
return (EXC_BENIGN);
default:
/*
* Hardware exception.
*
* SVM and VT-x use identical type values to represent NMI,
* hardware interrupt and software interrupt.
*
* SVM uses type '3' for all exceptions. VT-x uses type '3'
* for exceptions except #BP and #OF. #BP and #OF use a type
* value of '5' or '6'. Therefore we don't check for explicit
* values of 'type' to classify 'intinfo' into a hardware
* exception.
*/
break;
}
switch (vector) {
case IDT_PF:
case IDT_VE:
return (EXC_PAGEFAULT);
case IDT_DE:
case IDT_TS:
case IDT_NP:
case IDT_SS:
case IDT_GP:
return (EXC_CONTRIBUTORY);
default:
return (EXC_BENIGN);
}
}
static int
nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
uint64_t *retinfo)
{
enum exc_class exc1, exc2;
int type1, vector1;
KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
/*
* If an exception occurs while attempting to call the double-fault
* handler the processor enters shutdown mode (aka triple fault).
*/
type1 = info1 & VM_INTINFO_TYPE;
vector1 = info1 & 0xff;
if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
info1, info2);
vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
*retinfo = 0;
return (0);
}
/*
* Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
*/
exc1 = exception_class(info1);
exc2 = exception_class(info2);
if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
(exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
/* Convert nested fault into a double fault. */
*retinfo = IDT_DF;
*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
*retinfo |= VM_INTINFO_DEL_ERRCODE;
} else {
/* Handle exceptions serially */
*retinfo = info2;
}
return (1);
}
static uint64_t
vcpu_exception_intinfo(struct vcpu *vcpu)
{
uint64_t info = 0;
if (vcpu->exception_pending) {
info = vcpu->exc_vector & 0xff;
info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
if (vcpu->exc_errcode_valid) {
info |= VM_INTINFO_DEL_ERRCODE;
info |= (uint64_t)vcpu->exc_errcode << 32;
}
}
return (info);
}
int
vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
{
struct vcpu *vcpu;
uint64_t info1, info2;
int valid;
KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
vcpu = &vm->vcpu[vcpuid];
info1 = vcpu->exitintinfo;
vcpu->exitintinfo = 0;
info2 = 0;
if (vcpu->exception_pending) {
info2 = vcpu_exception_intinfo(vcpu);
vcpu->exception_pending = 0;
VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
vcpu->exc_vector, info2);
}
if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
} else if (info1 & VM_INTINFO_VALID) {
*retinfo = info1;
valid = 1;
} else if (info2 & VM_INTINFO_VALID) {
*retinfo = info2;
valid = 1;
} else {
valid = 0;
}
if (valid) {
VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
"retinfo(%#lx)", __func__, info1, info2, *retinfo);
}
return (valid);
}
int
vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
*info1 = vcpu->exitintinfo;
*info2 = vcpu_exception_intinfo(vcpu);
return (0);
}
int
vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
uint32_t errcode, int restart_instruction)
{
struct vcpu *vcpu;
uint64_t regval;
int error;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
if (vector < 0 || vector >= 32)
return (EINVAL);
/*
* A double fault exception should never be injected directly into
* the guest. It is a derived exception that results from specific
* combinations of nested faults.
*/
if (vector == IDT_DF)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
if (vcpu->exception_pending) {
VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
"pending exception %d", vector, vcpu->exc_vector);
return (EBUSY);
}
if (errcode_valid) {
/*
* Exceptions don't deliver an error code in real mode.
*/
error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
if (!(regval & CR0_PE))
errcode_valid = 0;
}
/*
* From section 26.6.1 "Interruptibility State" in Intel SDM:
*
* Event blocking by "STI" or "MOV SS" is cleared after guest executes
* one instruction or incurs an exception.
*/
error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
__func__, error));
if (restart_instruction)
vm_restart_instruction(vm, vcpuid);
vcpu->exception_pending = 1;
vcpu->exc_vector = vector;
vcpu->exc_errcode = errcode;
vcpu->exc_errcode_valid = errcode_valid;
VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
return (0);
}
void
vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
int errcode)
{
struct vm *vm;
int error, restart_instruction;
vm = vmarg;
restart_instruction = 1;
error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
errcode, restart_instruction);
KASSERT(error == 0, ("vm_inject_exception error %d", error));
}
void
vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
{
struct vm *vm;
int error;
vm = vmarg;
VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
error_code, cr2);
error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
}
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
vcpu->nmi_pending = 1;
vcpu_notify_event(vm, vcpuid, false);
return (0);
}
int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
return (vcpu->nmi_pending);
}
void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
if (vcpu->nmi_pending == 0)
panic("vm_nmi_clear: inconsistent nmi_pending state");
vcpu->nmi_pending = 0;
vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}
static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
int
vm_inject_extint(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
vcpu = &vm->vcpu[vcpuid];
vcpu->extint_pending = 1;
vcpu_notify_event(vm, vcpuid, false);
return (0);
}
int
vm_extint_pending(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
return (vcpu->extint_pending);
}
void
vm_extint_clear(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
if (vcpu->extint_pending == 0)
panic("vm_extint_clear: inconsistent extint_pending state");
vcpu->extint_pending = 0;
vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
}
int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
if (vcpu < 0 || vcpu >= VM_MAXCPU)
return (EINVAL);
if (type < 0 || type >= VM_CAP_MAX)
return (EINVAL);
return (VMGETCAP(vm->cookie, vcpu, type, retval));
}
int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
if (vcpu < 0 || vcpu >= VM_MAXCPU)
return (EINVAL);
if (type < 0 || type >= VM_CAP_MAX)
return (EINVAL);
return (VMSETCAP(vm->cookie, vcpu, type, val));
}
struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
return (vm->vcpu[cpu].vlapic);
}
struct vioapic *
vm_ioapic(struct vm *vm)
{
return (vm->vioapic);
}
struct vhpet *
vm_hpet(struct vm *vm)
{
return (vm->vhpet);
}
boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
int found, i, n;
int b, s, f;
char *val, *cp, *cp2;
/*
* XXX
* The length of an environment variable is limited to 128 bytes which
* puts an upper limit on the number of passthru devices that may be
* specified using a single environment variable.
*
* Work around this by scanning multiple environment variable
* names instead of a single one - yuck!
*/
const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
found = 0;
for (i = 0; names[i] != NULL && !found; i++) {
cp = val = kern_getenv(names[i]);
while (cp != NULL && *cp != '\0') {
if ((cp2 = strchr(cp, ' ')) != NULL)
*cp2 = '\0';
n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
if (n == 3 && bus == b && slot == s && func == f) {
found = 1;
break;
}
if (cp2 != NULL)
*cp2++ = ' ';
cp = cp2;
}
freeenv(val);
}
return (found);
}
void *
vm_iommu_domain(struct vm *vm)
{
return (vm->iommu);
}
int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
bool from_idle)
{
int error;
struct vcpu *vcpu;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
vcpu_unlock(vcpu);
return (error);
}
enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
struct vcpu *vcpu;
enum vcpu_state state;
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
state = vcpu->state;
if (hostcpu != NULL)
*hostcpu = vcpu->hostcpu;
vcpu_unlock(vcpu);
return (state);
}
int
vm_activate_cpu(struct vm *vm, int vcpuid)
{
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
if (CPU_ISSET(vcpuid, &vm->active_cpus))
return (EBUSY);
VCPU_CTR0(vm, vcpuid, "activated");
CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
return (0);
}
+int
+vm_suspend_cpu(struct vm *vm, int vcpuid)
+{
+ int i;
+
+ if (vcpuid < -1 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (vcpuid == -1) {
+ vm->debug_cpus = vm->active_cpus;
+ for (i = 0; i < VM_MAXCPU; i++) {
+ if (CPU_ISSET(i, &vm->active_cpus))
+ vcpu_notify_event(vm, i, false);
+ }
+ } else {
+ if (!CPU_ISSET(vcpuid, &vm->active_cpus))
+ return (EINVAL);
+
+ CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
+ vcpu_notify_event(vm, vcpuid, false);
+ }
+ return (0);
+}
+
+int
+vm_resume_cpu(struct vm *vm, int vcpuid)
+{
+
+ if (vcpuid < -1 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (vcpuid == -1) {
+ CPU_ZERO(&vm->debug_cpus);
+ } else {
+ if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
+ return (EINVAL);
+
+ CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
+ }
+ return (0);
+}
+
+int
+vcpu_debugged(struct vm *vm, int vcpuid)
+{
+
+ return (CPU_ISSET(vcpuid, &vm->debug_cpus));
+}
+
cpuset_t
vm_active_cpus(struct vm *vm)
{
return (vm->active_cpus);
+}
+
+cpuset_t
+vm_debug_cpus(struct vm *vm)
+{
+
+ return (vm->debug_cpus);
}
cpuset_t
vm_suspended_cpus(struct vm *vm)
{
return (vm->suspended_cpus);
}
void *
vcpu_stats(struct vm *vm, int vcpuid)
{
return (vm->vcpu[vcpuid].stats);
}
int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
*state = vm->vcpu[vcpuid].x2apic_state;
return (0);
}
int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
return (EINVAL);
if (state >= X2APIC_STATE_LAST)
return (EINVAL);
vm->vcpu[vcpuid].x2apic_state = state;
vlapic_set_x2apic_state(vm, vcpuid, state);
return (0);
}
/*
* This function is called to ensure that a vcpu "sees" a pending event
* as soon as possible:
* - If the vcpu thread is sleeping then it is woken up.
* - If the vcpu is running on a different host_cpu then an IPI will be directed
* to the host_cpu to cause the vcpu to trap into the hypervisor.
*/
static void
vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
{
int hostcpu;
hostcpu = vcpu->hostcpu;
if (vcpu->state == VCPU_RUNNING) {
KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
if (hostcpu != curcpu) {
if (lapic_intr) {
vlapic_post_intr(vcpu->vlapic, hostcpu,
vmm_ipinum);
} else {
ipi_cpu(hostcpu, vmm_ipinum);
}
} else {
/*
* If the 'vcpu' is running on 'curcpu' then it must
* be sending a notification to itself (e.g. SELF_IPI).
* The pending event will be picked up when the vcpu
* transitions back to guest context.
*/
}
} else {
KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
"with hostcpu %d", vcpu->state, hostcpu));
if (vcpu->state == VCPU_SLEEPING)
wakeup_one(vcpu);
}
}
void
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
{
struct vcpu *vcpu = &vm->vcpu[vcpuid];
vcpu_lock(vcpu);
vcpu_notify_event_locked(vcpu, lapic_intr);
vcpu_unlock(vcpu);
}
struct vmspace *
vm_get_vmspace(struct vm *vm)
{
return (vm->vmspace);
}
int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
/*
* XXX apic id is assumed to be numerically identical to vcpu id
*/
return (apicid);
}
void
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
vm_rendezvous_func_t func, void *arg)
{
int i;
/*
* Enforce that this function is called without any locks
*/
WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
restart:
mtx_lock(&vm->rendezvous_mtx);
if (vm->rendezvous_func != NULL) {
/*
* If a rendezvous is already in progress then we need to
* call the rendezvous handler in case this 'vcpuid' is one
* of the targets of the rendezvous.
*/
RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
mtx_unlock(&vm->rendezvous_mtx);
vm_handle_rendezvous(vm, vcpuid);
goto restart;
}
KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
"rendezvous is still in progress"));
RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
vm->rendezvous_req_cpus = dest;
CPU_ZERO(&vm->rendezvous_done_cpus);
vm->rendezvous_arg = arg;
vm_set_rendezvous_func(vm, func);
mtx_unlock(&vm->rendezvous_mtx);
/*
* Wake up any sleeping vcpus and trigger a VM-exit in any running
* vcpus so they handle the rendezvous as soon as possible.
*/
for (i = 0; i < VM_MAXCPU; i++) {
if (CPU_ISSET(i, &dest))
vcpu_notify_event(vm, i, false);
}
vm_handle_rendezvous(vm, vcpuid);
}
struct vatpic *
vm_atpic(struct vm *vm)
{
return (vm->vatpic);
}
struct vatpit *
vm_atpit(struct vm *vm)
{
return (vm->vatpit);
}
struct vpmtmr *
vm_pmtmr(struct vm *vm)
{
return (vm->vpmtmr);
}
struct vrtc *
vm_rtc(struct vm *vm)
{
return (vm->vrtc);
}
enum vm_reg_name
vm_segment_name(int seg)
{
static enum vm_reg_name seg_names[] = {
VM_REG_GUEST_ES,
VM_REG_GUEST_CS,
VM_REG_GUEST_SS,
VM_REG_GUEST_DS,
VM_REG_GUEST_FS,
VM_REG_GUEST_GS
};
KASSERT(seg >= 0 && seg < nitems(seg_names),
("%s: invalid segment encoding %d", __func__, seg));
return (seg_names[seg]);
}
void
vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
int num_copyinfo)
{
int idx;
for (idx = 0; idx < num_copyinfo; idx++) {
if (copyinfo[idx].cookie != NULL)
vm_gpa_release(copyinfo[idx].cookie);
}
bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
}
int
vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
int num_copyinfo, int *fault)
{
int error, idx, nused;
size_t n, off, remaining;
void *hva, *cookie;
uint64_t gpa;
bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
nused = 0;
remaining = len;
while (remaining > 0) {
KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
if (error || *fault)
return (error);
off = gpa & PAGE_MASK;
n = min(remaining, PAGE_SIZE - off);
copyinfo[nused].gpa = gpa;
copyinfo[nused].len = n;
remaining -= n;
gla += n;
nused++;
}
for (idx = 0; idx < nused; idx++) {
hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
copyinfo[idx].len, prot, &cookie);
if (hva == NULL)
break;
copyinfo[idx].hva = hva;
copyinfo[idx].cookie = cookie;
}
if (idx != nused) {
vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
return (EFAULT);
} else {
*fault = 0;
return (0);
}
}
void
vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
size_t len)
{
char *dst;
int idx;
dst = kaddr;
idx = 0;
while (len > 0) {
bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
len -= copyinfo[idx].len;
dst += copyinfo[idx].len;
idx++;
}
}
void
vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
struct vm_copyinfo *copyinfo, size_t len)
{
const char *src;
int idx;
src = kaddr;
idx = 0;
while (len > 0) {
bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
len -= copyinfo[idx].len;
src += copyinfo[idx].len;
idx++;
}
}
/*
* Return the amount of in-use and wired memory for the VM. Since
* these are global stats, only return the values with for vCPU 0
*/
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
VMM_STAT_DECLARE(VMM_MEM_WIRED);
static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{
if (vcpu == 0) {
vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
PAGE_SIZE * vmspace_resident_count(vm->vmspace));
}
}
static void
vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{
if (vcpu == 0) {
vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
}
}
VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
Index: head/sys/amd64/vmm/vmm_dev.c
===================================================================
--- head/sys/amd64/vmm/vmm_dev.c (revision 332156)
+++ head/sys/amd64/vmm/vmm_dev.c (revision 332157)
@@ -1,1070 +1,1080 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/libkern.h>
#include <sys/ioccom.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_dev.h>
#include "vmm_lapic.h"
#include "vmm_stat.h"
#include "vmm_mem.h"
#include "io/ppt.h"
#include "io/vatpic.h"
#include "io/vioapic.h"
#include "io/vhpet.h"
#include "io/vrtc.h"
struct devmem_softc {
int segid;
char *name;
struct cdev *cdev;
struct vmmdev_softc *sc;
SLIST_ENTRY(devmem_softc) link;
};
struct vmmdev_softc {
struct vm *vm; /* vm instance cookie */
struct cdev *cdev;
SLIST_ENTRY(vmmdev_softc) link;
SLIST_HEAD(, devmem_softc) devmem;
int flags;
};
#define VSC_LINKED 0x01
static SLIST_HEAD(, vmmdev_softc) head;
static struct mtx vmmdev_mtx;
static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
SYSCTL_DECL(_hw_vmm);
static int devmem_create_cdev(const char *vmname, int id, char *devmem);
static void devmem_destroy(void *arg);
static int
vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
{
int error;
if (vcpu < 0 || vcpu >= VM_MAXCPU)
return (EINVAL);
error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
return (error);
}
static void
vcpu_unlock_one(struct vmmdev_softc *sc, int vcpu)
{
enum vcpu_state state;
state = vcpu_get_state(sc->vm, vcpu, NULL);
if (state != VCPU_FROZEN) {
panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
vcpu, state);
}
vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
}
static int
vcpu_lock_all(struct vmmdev_softc *sc)
{
int error, vcpu;
for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
error = vcpu_lock_one(sc, vcpu);
if (error)
break;
}
if (error) {
while (--vcpu >= 0)
vcpu_unlock_one(sc, vcpu);
}
return (error);
}
static void
vcpu_unlock_all(struct vmmdev_softc *sc)
{
int vcpu;
for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
vcpu_unlock_one(sc, vcpu);
}
static struct vmmdev_softc *
vmmdev_lookup(const char *name)
{
struct vmmdev_softc *sc;
#ifdef notyet /* XXX kernel is not compiled with invariants */
mtx_assert(&vmmdev_mtx, MA_OWNED);
#endif
SLIST_FOREACH(sc, &head, link) {
if (strcmp(name, vm_name(sc->vm)) == 0)
break;
}
return (sc);
}
static struct vmmdev_softc *
vmmdev_lookup2(struct cdev *cdev)
{
return (cdev->si_drv1);
}
static int
vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
{
int error, off, c, prot;
vm_paddr_t gpa;
void *hpa, *cookie;
struct vmmdev_softc *sc;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
return (ENXIO);
/*
* Get a read lock on the guest memory map by freezing any vcpu.
*/
error = vcpu_lock_one(sc, VM_MAXCPU - 1);
if (error)
return (error);
prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
while (uio->uio_resid > 0 && error == 0) {
gpa = uio->uio_offset;
off = gpa & PAGE_MASK;
c = min(uio->uio_resid, PAGE_SIZE - off);
/*
* The VM has a hole in its physical memory map. If we want to
* use 'dd' to inspect memory beyond the hole we need to
* provide bogus data for memory that lies in the hole.
*
* Since this device does not support lseek(2), dd(1) will
* read(2) blocks of data to simulate the lseek(2).
*/
hpa = vm_gpa_hold(sc->vm, VM_MAXCPU - 1, gpa, c, prot, &cookie);
if (hpa == NULL) {
if (uio->uio_rw == UIO_READ)
error = uiomove(__DECONST(void *, zero_region),
c, uio);
else
error = EFAULT;
} else {
error = uiomove(hpa, c, uio);
vm_gpa_release(cookie);
}
}
vcpu_unlock_one(sc, VM_MAXCPU - 1);
return (error);
}
CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= SPECNAMELEN + 1);
static int
get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
struct devmem_softc *dsc;
int error;
bool sysmem;
error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
if (error || mseg->len == 0)
return (error);
if (!sysmem) {
SLIST_FOREACH(dsc, &sc->devmem, link) {
if (dsc->segid == mseg->segid)
break;
}
KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
__func__, mseg->segid));
error = copystr(dsc->name, mseg->name, SPECNAMELEN + 1, NULL);
} else {
bzero(mseg->name, sizeof(mseg->name));
}
return (error);
}
static int
alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg)
{
char *name;
int error;
bool sysmem;
error = 0;
name = NULL;
sysmem = true;
if (VM_MEMSEG_NAME(mseg)) {
sysmem = false;
name = malloc(SPECNAMELEN + 1, M_VMMDEV, M_WAITOK);
error = copystr(mseg->name, name, SPECNAMELEN + 1, 0);
if (error)
goto done;
}
error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
if (error)
goto done;
if (VM_MEMSEG_NAME(mseg)) {
error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
if (error)
vm_free_memseg(sc->vm, mseg->segid);
else
name = NULL; /* freed when 'cdev' is destroyed */
}
done:
free(name, M_VMMDEV);
return (error);
}
static int
vm_get_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
uint64_t *regval)
{
int error, i;
error = 0;
for (i = 0; i < count; i++) {
error = vm_get_register(vm, vcpu, regnum[i], &regval[i]);
if (error)
break;
}
return (error);
}
static int
vm_set_register_set(struct vm *vm, int vcpu, unsigned int count, int *regnum,
uint64_t *regval)
{
int error, i;
error = 0;
for (i = 0; i < count; i++) {
error = vm_set_register(vm, vcpu, regnum[i], regval[i]);
if (error)
break;
}
return (error);
}
static int
vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
int error, vcpu, state_changed, size;
cpuset_t *cpuset;
struct vmmdev_softc *sc;
struct vm_register *vmreg;
struct vm_seg_desc *vmsegdesc;
struct vm_register_set *vmregset;
struct vm_run *vmrun;
struct vm_exception *vmexc;
struct vm_lapic_irq *vmirq;
struct vm_lapic_msi *vmmsi;
struct vm_ioapic_irq *ioapic_irq;
struct vm_isa_irq *isa_irq;
struct vm_isa_irq_trigger *isa_irq_trigger;
struct vm_capability *vmcap;
struct vm_pptdev *pptdev;
struct vm_pptdev_mmio *pptmmio;
struct vm_pptdev_msi *pptmsi;
struct vm_pptdev_msix *pptmsix;
struct vm_nmi *vmnmi;
struct vm_stats *vmstats;
struct vm_stat_desc *statdesc;
struct vm_x2apic *x2apic;
struct vm_gpa_pte *gpapte;
struct vm_suspend *vmsuspend;
struct vm_gla2gpa *gg;
struct vm_activate_cpu *vac;
struct vm_cpuset *vm_cpuset;
struct vm_intinfo *vmii;
struct vm_rtc_time *rtctime;
struct vm_rtc_data *rtcdata;
struct vm_memmap *mm;
uint64_t *regvals;
int *regnums;
sc = vmmdev_lookup2(cdev);
if (sc == NULL)
return (ENXIO);
error = 0;
vcpu = -1;
state_changed = 0;
/*
* Some VMM ioctls can operate only on vcpus that are not running.
*/
switch (cmd) {
case VM_RUN:
case VM_GET_REGISTER:
case VM_SET_REGISTER:
case VM_GET_SEGMENT_DESCRIPTOR:
case VM_SET_SEGMENT_DESCRIPTOR:
case VM_GET_REGISTER_SET:
case VM_SET_REGISTER_SET:
case VM_INJECT_EXCEPTION:
case VM_GET_CAPABILITY:
case VM_SET_CAPABILITY:
case VM_PPTDEV_MSI:
case VM_PPTDEV_MSIX:
case VM_SET_X2APIC_STATE:
case VM_GLA2GPA:
case VM_GLA2GPA_NOFAULT:
case VM_ACTIVATE_CPU:
case VM_SET_INTINFO:
case VM_GET_INTINFO:
case VM_RESTART_INSTRUCTION:
/*
* XXX fragile, handle with care
* Assumes that the first field of the ioctl data is the vcpu.
*/
vcpu = *(int *)data;
error = vcpu_lock_one(sc, vcpu);
if (error)
goto done;
state_changed = 1;
break;
case VM_MAP_PPTDEV_MMIO:
case VM_BIND_PPTDEV:
case VM_UNBIND_PPTDEV:
case VM_ALLOC_MEMSEG:
case VM_MMAP_MEMSEG:
case VM_REINIT:
/*
* ioctls that operate on the entire virtual machine must
* prevent all vcpus from running.
*/
error = vcpu_lock_all(sc);
if (error)
goto done;
state_changed = 2;
break;
case VM_GET_MEMSEG:
case VM_MMAP_GETNEXT:
/*
* Lock a vcpu to make sure that the memory map cannot be
* modified while it is being inspected.
*/
vcpu = VM_MAXCPU - 1;
error = vcpu_lock_one(sc, vcpu);
if (error)
goto done;
state_changed = 1;
break;
default:
break;
}
switch(cmd) {
case VM_RUN:
vmrun = (struct vm_run *)data;
error = vm_run(sc->vm, vmrun);
break;
case VM_SUSPEND:
vmsuspend = (struct vm_suspend *)data;
error = vm_suspend(sc->vm, vmsuspend->how);
break;
case VM_REINIT:
error = vm_reinit(sc->vm);
break;
case VM_STAT_DESC: {
statdesc = (struct vm_stat_desc *)data;
error = vmm_stat_desc_copy(statdesc->index,
statdesc->desc, sizeof(statdesc->desc));
break;
}
case VM_STATS: {
CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
vmstats = (struct vm_stats *)data;
getmicrotime(&vmstats->tv);
error = vmm_stat_copy(sc->vm, vmstats->cpuid,
&vmstats->num_entries, vmstats->statbuf);
break;
}
case VM_PPTDEV_MSI:
pptmsi = (struct vm_pptdev_msi *)data;
error = ppt_setup_msi(sc->vm, pptmsi->vcpu,
pptmsi->bus, pptmsi->slot, pptmsi->func,
pptmsi->addr, pptmsi->msg,
pptmsi->numvec);
break;
case VM_PPTDEV_MSIX:
pptmsix = (struct vm_pptdev_msix *)data;
error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
pptmsix->bus, pptmsix->slot,
pptmsix->func, pptmsix->idx,
pptmsix->addr, pptmsix->msg,
pptmsix->vector_control);
break;
case VM_MAP_PPTDEV_MMIO:
pptmmio = (struct vm_pptdev_mmio *)data;
error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
pptmmio->func, pptmmio->gpa, pptmmio->len,
pptmmio->hpa);
break;
case VM_BIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
break;
case VM_UNBIND_PPTDEV:
pptdev = (struct vm_pptdev *)data;
error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
pptdev->func);
break;
case VM_INJECT_EXCEPTION:
vmexc = (struct vm_exception *)data;
error = vm_inject_exception(sc->vm, vmexc->cpuid,
vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
vmexc->restart_instruction);
break;
case VM_INJECT_NMI:
vmnmi = (struct vm_nmi *)data;
error = vm_inject_nmi(sc->vm, vmnmi->cpuid);
break;
case VM_LAPIC_IRQ:
vmirq = (struct vm_lapic_irq *)data;
error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector);
break;
case VM_LAPIC_LOCAL_IRQ:
vmirq = (struct vm_lapic_irq *)data;
error = lapic_set_local_intr(sc->vm, vmirq->cpuid,
vmirq->vector);
break;
case VM_LAPIC_MSI:
vmmsi = (struct vm_lapic_msi *)data;
error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
break;
case VM_IOAPIC_ASSERT_IRQ:
ioapic_irq = (struct vm_ioapic_irq *)data;
error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
break;
case VM_IOAPIC_DEASSERT_IRQ:
ioapic_irq = (struct vm_ioapic_irq *)data;
error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
break;
case VM_IOAPIC_PULSE_IRQ:
ioapic_irq = (struct vm_ioapic_irq *)data;
error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
break;
case VM_IOAPIC_PINCOUNT:
*(int *)data = vioapic_pincount(sc->vm);
break;
case VM_ISA_ASSERT_IRQ:
isa_irq = (struct vm_isa_irq *)data;
error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
if (error == 0 && isa_irq->ioapic_irq != -1)
error = vioapic_assert_irq(sc->vm,
isa_irq->ioapic_irq);
break;
case VM_ISA_DEASSERT_IRQ:
isa_irq = (struct vm_isa_irq *)data;
error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
if (error == 0 && isa_irq->ioapic_irq != -1)
error = vioapic_deassert_irq(sc->vm,
isa_irq->ioapic_irq);
break;
case VM_ISA_PULSE_IRQ:
isa_irq = (struct vm_isa_irq *)data;
error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
if (error == 0 && isa_irq->ioapic_irq != -1)
error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
break;
case VM_ISA_SET_IRQ_TRIGGER:
isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
error = vatpic_set_irq_trigger(sc->vm,
isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
break;
case VM_MMAP_GETNEXT:
mm = (struct vm_memmap *)data;
error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
&mm->segoff, &mm->len, &mm->prot, &mm->flags);
break;
case VM_MMAP_MEMSEG:
mm = (struct vm_memmap *)data;
error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
mm->len, mm->prot, mm->flags);
break;
case VM_ALLOC_MEMSEG:
error = alloc_memseg(sc, (struct vm_memseg *)data);
break;
case VM_GET_MEMSEG:
error = get_memseg(sc, (struct vm_memseg *)data);
break;
case VM_GET_REGISTER:
vmreg = (struct vm_register *)data;
error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum,
&vmreg->regval);
break;
case VM_SET_REGISTER:
vmreg = (struct vm_register *)data;
error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum,
vmreg->regval);
break;
case VM_SET_SEGMENT_DESCRIPTOR:
vmsegdesc = (struct vm_seg_desc *)data;
error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid,
vmsegdesc->regnum,
&vmsegdesc->desc);
break;
case VM_GET_SEGMENT_DESCRIPTOR:
vmsegdesc = (struct vm_seg_desc *)data;
error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid,
vmsegdesc->regnum,
&vmsegdesc->desc);
break;
case VM_GET_REGISTER_SET:
vmregset = (struct vm_register_set *)data;
if (vmregset->count > VM_REG_LAST) {
error = EINVAL;
break;
}
regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
M_WAITOK);
regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
M_WAITOK);
error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
vmregset->count);
if (error == 0)
error = vm_get_register_set(sc->vm, vmregset->cpuid,
vmregset->count, regnums, regvals);
if (error == 0)
error = copyout(regvals, vmregset->regvals,
sizeof(regvals[0]) * vmregset->count);
free(regvals, M_VMMDEV);
free(regnums, M_VMMDEV);
break;
case VM_SET_REGISTER_SET:
vmregset = (struct vm_register_set *)data;
if (vmregset->count > VM_REG_LAST) {
error = EINVAL;
break;
}
regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
M_WAITOK);
regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
M_WAITOK);
error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
vmregset->count);
if (error == 0)
error = copyin(vmregset->regvals, regvals,
sizeof(regvals[0]) * vmregset->count);
if (error == 0)
error = vm_set_register_set(sc->vm, vmregset->cpuid,
vmregset->count, regnums, regvals);
free(regvals, M_VMMDEV);
free(regnums, M_VMMDEV);
break;
case VM_GET_CAPABILITY:
vmcap = (struct vm_capability *)data;
error = vm_get_capability(sc->vm, vmcap->cpuid,
vmcap->captype,
&vmcap->capval);
break;
case VM_SET_CAPABILITY:
vmcap = (struct vm_capability *)data;
error = vm_set_capability(sc->vm, vmcap->cpuid,
vmcap->captype,
vmcap->capval);
break;
case VM_SET_X2APIC_STATE:
x2apic = (struct vm_x2apic *)data;
error = vm_set_x2apic_state(sc->vm,
x2apic->cpuid, x2apic->state);
break;
case VM_GET_X2APIC_STATE:
x2apic = (struct vm_x2apic *)data;
error = vm_get_x2apic_state(sc->vm,
x2apic->cpuid, &x2apic->state);
break;
case VM_GET_GPA_PMAP:
gpapte = (struct vm_gpa_pte *)data;
pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
gpapte->gpa, gpapte->pte, &gpapte->ptenum);
error = 0;
break;
case VM_GET_HPET_CAPABILITIES:
error = vhpet_getcap((struct vm_hpet_cap *)data);
break;
case VM_GLA2GPA: {
CTASSERT(PROT_READ == VM_PROT_READ);
CTASSERT(PROT_WRITE == VM_PROT_WRITE);
CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
gg = (struct vm_gla2gpa *)data;
error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
gg->prot, &gg->gpa, &gg->fault);
KASSERT(error == 0 || error == EFAULT,
("%s: vm_gla2gpa unknown error %d", __func__, error));
break;
}
case VM_GLA2GPA_NOFAULT:
gg = (struct vm_gla2gpa *)data;
error = vm_gla2gpa_nofault(sc->vm, gg->vcpuid, &gg->paging,
gg->gla, gg->prot, &gg->gpa, &gg->fault);
KASSERT(error == 0 || error == EFAULT,
("%s: vm_gla2gpa unknown error %d", __func__, error));
break;
case VM_ACTIVATE_CPU:
vac = (struct vm_activate_cpu *)data;
error = vm_activate_cpu(sc->vm, vac->vcpuid);
break;
case VM_GET_CPUS:
error = 0;
vm_cpuset = (struct vm_cpuset *)data;
size = vm_cpuset->cpusetsize;
if (size < sizeof(cpuset_t) || size > CPU_MAXSIZE / NBBY) {
error = ERANGE;
break;
}
cpuset = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
if (vm_cpuset->which == VM_ACTIVE_CPUS)
*cpuset = vm_active_cpus(sc->vm);
else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
*cpuset = vm_suspended_cpus(sc->vm);
+ else if (vm_cpuset->which == VM_DEBUG_CPUS)
+ *cpuset = vm_debug_cpus(sc->vm);
else
error = EINVAL;
if (error == 0)
error = copyout(cpuset, vm_cpuset->cpus, size);
free(cpuset, M_TEMP);
+ break;
+ case VM_SUSPEND_CPU:
+ vac = (struct vm_activate_cpu *)data;
+ error = vm_suspend_cpu(sc->vm, vac->vcpuid);
+ break;
+ case VM_RESUME_CPU:
+ vac = (struct vm_activate_cpu *)data;
+ error = vm_resume_cpu(sc->vm, vac->vcpuid);
break;
case VM_SET_INTINFO:
vmii = (struct vm_intinfo *)data;
error = vm_exit_intinfo(sc->vm, vmii->vcpuid, vmii->info1);
break;
case VM_GET_INTINFO:
vmii = (struct vm_intinfo *)data;
error = vm_get_intinfo(sc->vm, vmii->vcpuid, &vmii->info1,
&vmii->info2);
break;
case VM_RTC_WRITE:
rtcdata = (struct vm_rtc_data *)data;
error = vrtc_nvram_write(sc->vm, rtcdata->offset,
rtcdata->value);
break;
case VM_RTC_READ:
rtcdata = (struct vm_rtc_data *)data;
error = vrtc_nvram_read(sc->vm, rtcdata->offset,
&rtcdata->value);
break;
case VM_RTC_SETTIME:
rtctime = (struct vm_rtc_time *)data;
error = vrtc_set_time(sc->vm, rtctime->secs);
break;
case VM_RTC_GETTIME:
error = 0;
rtctime = (struct vm_rtc_time *)data;
rtctime->secs = vrtc_get_time(sc->vm);
break;
case VM_RESTART_INSTRUCTION:
error = vm_restart_instruction(sc->vm, vcpu);
break;
default:
error = ENOTTY;
break;
}
if (state_changed == 1)
vcpu_unlock_one(sc, vcpu);
else if (state_changed == 2)
vcpu_unlock_all(sc);
done:
/* Make sure that no handler returns a bogus value like ERESTART */
KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
return (error);
}
static int
vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
struct vm_object **objp, int nprot)
{
struct vmmdev_softc *sc;
vm_paddr_t gpa;
size_t len;
vm_ooffset_t segoff, first, last;
int error, found, segid;
bool sysmem;
first = *offset;
last = first + mapsize;
if ((nprot & PROT_EXEC) || first < 0 || first >= last)
return (EINVAL);
sc = vmmdev_lookup2(cdev);
if (sc == NULL) {
/* virtual machine is in the process of being created */
return (EINVAL);
}
/*
* Get a read lock on the guest memory map by freezing any vcpu.
*/
error = vcpu_lock_one(sc, VM_MAXCPU - 1);
if (error)
return (error);
gpa = 0;
found = 0;
while (!found) {
error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
NULL, NULL);
if (error)
break;
if (first >= gpa && last <= gpa + len)
found = 1;
else
gpa += len;
}
if (found) {
error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
KASSERT(error == 0 && *objp != NULL,
("%s: invalid memory segment %d", __func__, segid));
if (sysmem) {
vm_object_reference(*objp);
*offset = segoff + (first - gpa);
} else {
error = EINVAL;
}
}
vcpu_unlock_one(sc, VM_MAXCPU - 1);
return (error);
}
static void
vmmdev_destroy(void *arg)
{
struct vmmdev_softc *sc = arg;
struct devmem_softc *dsc;
int error;
error = vcpu_lock_all(sc);
KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
SLIST_REMOVE_HEAD(&sc->devmem, link);
free(dsc->name, M_VMMDEV);
free(dsc, M_VMMDEV);
}
if (sc->cdev != NULL)
destroy_dev(sc->cdev);
if (sc->vm != NULL)
vm_destroy(sc->vm);
if ((sc->flags & VSC_LINKED) != 0) {
mtx_lock(&vmmdev_mtx);
SLIST_REMOVE(&head, sc, vmmdev_softc, link);
mtx_unlock(&vmmdev_mtx);
}
free(sc, M_VMMDEV);
}
static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
{
int error;
char buf[VM_MAX_NAMELEN];
struct devmem_softc *dsc;
struct vmmdev_softc *sc;
struct cdev *cdev;
strlcpy(buf, "beavis", sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
mtx_lock(&vmmdev_mtx);
sc = vmmdev_lookup(buf);
if (sc == NULL || sc->cdev == NULL) {
mtx_unlock(&vmmdev_mtx);
return (EINVAL);
}
/*
* The 'cdev' will be destroyed asynchronously when 'si_threadcount'
* goes down to 0 so we should not do it again in the callback.
*
* Setting 'sc->cdev' to NULL is also used to indicate that the VM
* is scheduled for destruction.
*/
cdev = sc->cdev;
sc->cdev = NULL;
mtx_unlock(&vmmdev_mtx);
/*
* Schedule all cdevs to be destroyed:
*
* - any new operations on the 'cdev' will return an error (ENXIO).
*
* - when the 'si_threadcount' dwindles down to zero the 'cdev' will
* be destroyed and the callback will be invoked in a taskqueue
* context.
*
* - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
*/
SLIST_FOREACH(dsc, &sc->devmem, link) {
KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
destroy_dev_sched_cb(dsc->cdev, devmem_destroy, dsc);
}
destroy_dev_sched_cb(cdev, vmmdev_destroy, sc);
return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW,
NULL, 0, sysctl_vmm_destroy, "A", NULL);
static struct cdevsw vmmdevsw = {
.d_name = "vmmdev",
.d_version = D_VERSION,
.d_ioctl = vmmdev_ioctl,
.d_mmap_single = vmmdev_mmap_single,
.d_read = vmmdev_rw,
.d_write = vmmdev_rw,
};
static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
{
int error;
struct vm *vm;
struct cdev *cdev;
struct vmmdev_softc *sc, *sc2;
char buf[VM_MAX_NAMELEN];
strlcpy(buf, "beavis", sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
mtx_lock(&vmmdev_mtx);
sc = vmmdev_lookup(buf);
mtx_unlock(&vmmdev_mtx);
if (sc != NULL)
return (EEXIST);
error = vm_create(buf, &vm);
if (error != 0)
return (error);
sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
sc->vm = vm;
SLIST_INIT(&sc->devmem);
/*
* Lookup the name again just in case somebody sneaked in when we
* dropped the lock.
*/
mtx_lock(&vmmdev_mtx);
sc2 = vmmdev_lookup(buf);
if (sc2 == NULL) {
SLIST_INSERT_HEAD(&head, sc, link);
sc->flags |= VSC_LINKED;
}
mtx_unlock(&vmmdev_mtx);
if (sc2 != NULL) {
vmmdev_destroy(sc);
return (EEXIST);
}
error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL,
UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
if (error != 0) {
vmmdev_destroy(sc);
return (error);
}
mtx_lock(&vmmdev_mtx);
sc->cdev = cdev;
sc->cdev->si_drv1 = sc;
mtx_unlock(&vmmdev_mtx);
return (0);
}
SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW,
NULL, 0, sysctl_vmm_create, "A", NULL);
void
vmmdev_init(void)
{
mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF);
}
int
vmmdev_cleanup(void)
{
int error;
if (SLIST_EMPTY(&head))
error = 0;
else
error = EBUSY;
return (error);
}
static int
devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
struct vm_object **objp, int nprot)
{
struct devmem_softc *dsc;
vm_ooffset_t first, last;
size_t seglen;
int error;
bool sysmem;
dsc = cdev->si_drv1;
if (dsc == NULL) {
/* 'cdev' has been created but is not ready for use */
return (ENXIO);
}
first = *offset;
last = *offset + len;
if ((nprot & PROT_EXEC) || first < 0 || first >= last)
return (EINVAL);
error = vcpu_lock_one(dsc->sc, VM_MAXCPU - 1);
if (error)
return (error);
error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
KASSERT(error == 0 && !sysmem && *objp != NULL,
("%s: invalid devmem segment %d", __func__, dsc->segid));
vcpu_unlock_one(dsc->sc, VM_MAXCPU - 1);
if (seglen >= last) {
vm_object_reference(*objp);
return (0);
} else {
return (EINVAL);
}
}
static struct cdevsw devmemsw = {
.d_name = "devmem",
.d_version = D_VERSION,
.d_mmap_single = devmem_mmap_single,
};
static int
devmem_create_cdev(const char *vmname, int segid, char *devname)
{
struct devmem_softc *dsc;
struct vmmdev_softc *sc;
struct cdev *cdev;
int error;
error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
if (error)
return (error);
dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
mtx_lock(&vmmdev_mtx);
sc = vmmdev_lookup(vmname);
KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
if (sc->cdev == NULL) {
/* virtual machine is being created or destroyed */
mtx_unlock(&vmmdev_mtx);
free(dsc, M_VMMDEV);
destroy_dev_sched_cb(cdev, NULL, 0);
return (ENODEV);
}
dsc->segid = segid;
dsc->name = devname;
dsc->cdev = cdev;
dsc->sc = sc;
SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
mtx_unlock(&vmmdev_mtx);
/* The 'cdev' is ready for use after 'si_drv1' is initialized */
cdev->si_drv1 = dsc;
return (0);
}
static void
devmem_destroy(void *arg)
{
struct devmem_softc *dsc = arg;
KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
dsc->cdev = NULL;
dsc->sc = NULL;
}

File Metadata

Mime Type
text/x-diff
Expires
Wed, Nov 12, 11:44 PM (1 h, 3 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
25184755
Default Alt Text
(321 KB)

Event Timeline