Index: lib/libvmmapi/vmmapi.h =================================================================== --- lib/libvmmapi/vmmapi.h +++ lib/libvmmapi/vmmapi.h @@ -33,6 +33,7 @@ #include #include +#include /* * API version for out-of-tree consumers like grub-bhyve for making compile @@ -42,6 +43,7 @@ struct iovec; struct vmctx; +struct vm_snapshot_meta; enum x2apic_state; /* @@ -88,6 +90,10 @@ */ int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); + +int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, + size_t *lowmem_size, size_t *highmem_size); + /* * Create a device memory segment identified by 'segid'. * @@ -110,6 +116,8 @@ int vm_parse_memsize(const char *optarg, size_t *memsize); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); +/* inverse operation to vm_map_gpa - extract guest address from host pointer */ +vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr); int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); @@ -120,6 +128,7 @@ void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); void vm_set_memflags(struct vmctx *ctx, int flags); int vm_get_memflags(struct vmctx *ctx); +int vm_get_name(struct vmctx *ctx, char *buffer, size_t max_len); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, @@ -237,4 +246,25 @@ uint32_t eip, uint32_t gdtbase, uint32_t esp); void vm_setup_freebsd_gdt(uint64_t *gdtr); + +/* + * Save and restore + */ + +#define MAX_SNAPSHOT_VMNAME 100 + +enum checkpoint_opcodes { + START_CHECKPOINT = 0, + START_SUSPEND = 1, +}; + +struct checkpoint_op { + unsigned int op; + char snapshot_filename[MAX_SNAPSHOT_VMNAME]; 
+}; + +int vm_snapshot_req(struct vm_snapshot_meta *meta); +int vm_restore_time(struct vmctx *ctx); +int vm_restore_mem(struct vmctx *ctx, int vmmem_fd, size_t size); + #endif /* _VMMAPI_H_ */ Index: lib/libvmmapi/vmmapi.c =================================================================== --- lib/libvmmapi/vmmapi.c +++ lib/libvmmapi/vmmapi.c @@ -42,6 +42,7 @@ #include #include +#include #include #include #include @@ -51,8 +52,10 @@ #include +#include #include #include +#include #include "vmmapi.h" @@ -233,6 +236,17 @@ return (error); } +int +vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, + size_t *lowmem_size, size_t *highmem_size) +{ + + *guest_baseaddr = ctx->baseaddr; + *lowmem_size = ctx->lowmem; + *highmem_size = ctx->highmem; + return (0); +} + int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) @@ -444,6 +458,34 @@ return (NULL); } +vm_paddr_t +vm_rev_map_gpa(struct vmctx *ctx, void *addr) +{ + vm_paddr_t offaddr; + + offaddr = (char *)addr - ctx->baseaddr; + + if (ctx->lowmem > 0) /* valid offsets: [0, lowmem) */ + if (offaddr >= 0 && offaddr < ctx->lowmem) + return (offaddr); + + if (ctx->highmem > 0) + if (offaddr >= 4*GB && offaddr < 4*GB + ctx->highmem) + return (offaddr); + + return ((vm_paddr_t)-1); +} + +/* TODO: maximum size for vmname */ +int +vm_get_name(struct vmctx *ctx, char *buf, size_t max_len) +{ + + if (strlcpy(buf, ctx->name, max_len) >= max_len) + return (EINVAL); + return (0); +} + size_t vm_get_lowmem_size(struct vmctx *ctx) { @@ -1504,6 +1546,100 @@ return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); } +int +vm_snapshot_req(struct vm_snapshot_meta *meta) +{ + + if (ioctl(meta->ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) { +#ifdef SNAPSHOT_DEBUG + fprintf(stderr, "%s: snapshot failed for %s: %d\r\n", + __func__, meta->dev_name, errno); +#endif + return (-1); + } + return (0); +} + +static bool +vm_mem_read_from_file(int fd, void *dest, size_t file_offset, size_t len) +{ + 
char *p; + ssize_t cnt_read; + + if (lseek(fd, file_offset, SEEK_SET) == -1) { +#ifdef SNAPSHOT_DEBUG + fprintf(stderr, + "%s: Could not change file offset errno = %d\r\n", + __func__, errno); +#endif + return (false); + } + + p = dest; + while (len > 0) { + cnt_read = read(fd, p, len); + if (cnt_read == 0) { + errno = ENOSPC; +#ifdef SNAPSHOT_DEBUG + fprintf(stderr, "%s: short read\r\n", __func__); +#endif + return (false); + } else if (cnt_read < 0) { +#ifdef SNAPSHOT_DEBUG + fprintf(stderr, "%s: read error: %d\r\n", + __func__, errno); +#endif + return (false); + } + p += cnt_read; + len -= cnt_read; + } + return (true); +} + +int +vm_restore_mem(struct vmctx *ctx, int vmmem_fd, size_t size) +{ + + if (ctx->lowmem + ctx->highmem != size) { +#ifdef SNAPSHOT_DEBUG + fprintf(stderr, "%s: mem size mismatch: %zu vs %zu\n", + __func__, ctx->lowmem + ctx->highmem, size); +#endif + return (-1); + } + + if (!vm_mem_read_from_file(vmmem_fd, ctx->baseaddr, 0, ctx->lowmem)) { +#ifdef SNAPSHOT_DEBUG + fprintf(stderr, + "%s: Could not read lowmem from file\r\n", __func__); +#endif + return (-1); + } + + if (ctx->highmem > 0) { + if (!vm_mem_read_from_file(vmmem_fd, ctx->baseaddr + 4*GB, + ctx->lowmem, ctx->highmem)) { +#ifdef SNAPSHOT_DEBUG + fprintf(stderr, + "%s: Could not read highmem from file\r\n", + __func__); +#endif + return (-1); + } + } + return (0); +} + +int +vm_restore_time(struct vmctx *ctx) +{ + int dummy; + + dummy = 0; + return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy)); +} + int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) Index: share/man/man5/src.conf.5 =================================================================== --- share/man/man5/src.conf.5 +++ share/man/man5/src.conf.5 @@ -170,6 +170,13 @@ associated utilities, and examples. .Pp This option only affects amd64/amd64. +.It Va WITH_BHYVE_SNAPSHOT +Set to include support for save and restore (snapshots) in +.Xr bhyve 8 +and +.Xr bhyvectl 8 . 
+.Pp +This option only affects amd64/amd64. .It Va WITH_BIND_NOW Build all binaries with the .Dv DF_BIND_NOW Index: share/mk/kmod.opts.mk =================================================================== --- /dev/null +++ share/mk/kmod.opts.mk @@ -0,0 +1,20 @@ +# $FreeBSD$ + +# Search for kernel source tree in standard places. +.if empty(KERNBUILDDIR) +.if !defined(SYSDIR) +.for _dir in ${SRCTOP:D${SRCTOP}/sys} \ + ${.CURDIR}/../.. ${.CURDIR}/../../.. ${.CURDIR}/../../../.. \ + ${.CURDIR}/../../../../.. /sys /usr/src/sys +.if !defined(SYSDIR) && exists(${_dir}/kern/) && exists(${_dir}/conf/kmod.opts.mk) +SYSDIR= ${_dir:tA} +.endif +.endfor +.endif +.if !defined(SYSDIR) || !exists(${SYSDIR}/kern/) || \ + !exists(${SYSDIR}/conf/kmod.opts.mk) +.error Unable to locate the kernel source tree. Set SYSDIR to override. +.endif +.endif + +.include "${SYSDIR}/conf/kmod.opts.mk" Index: share/mk/src.opts.mk =================================================================== --- share/mk/src.opts.mk +++ share/mk/src.opts.mk @@ -193,6 +193,7 @@ __DEFAULT_NO_OPTIONS = \ BEARSSL \ + BHYVE_SNAPSHOT \ BSD_GREP \ CLANG_EXTRAS \ DTRACE_TESTS \ Index: sys/amd64/conf/GENERIC =================================================================== --- sys/amd64/conf/GENERIC +++ sys/amd64/conf/GENERIC @@ -356,6 +356,9 @@ device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device +# bhyve snapshot +option BHYVE_SNAPSHOT + # HyperV drivers and enhancement support device hyperv # HyperV drivers Index: sys/amd64/include/vmm.h =================================================================== --- sys/amd64/include/vmm.h +++ sys/amd64/include/vmm.h @@ -34,6 +34,8 @@ #include #include +struct vm_snapshot_meta; + #ifdef _KERNEL SDT_PROVIDER_DECLARE(vmm); #endif @@ -151,6 +153,7 @@ struct vm_object; struct vm_guest_paging; struct pmap; +enum snapshot_req; struct vm_eventinfo { void *rptr; /* rendezvous cookie */ @@ -179,6 +182,10 @@ typedef void 
(*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); +typedef int (*vmi_snapshot_t)(void *vmi, struct vm_snapshot_meta *meta); +typedef int (*vmi_snapshot_vmcx_t)(void *vmi, struct vm_snapshot_meta *meta, + int vcpu); +typedef int (*vmi_restore_tsc_t)(void *vmi, int vcpuid, uint64_t now); struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ @@ -198,6 +205,11 @@ vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; + + /* checkpoint operations */ + vmi_snapshot_t vmsnapshot; + vmi_snapshot_vmcx_t vmcx_snapshot; + vmi_restore_tsc_t vm_restore_tsc; }; extern struct vmm_ops vmm_ops_intel; @@ -271,6 +283,9 @@ void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); +int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta); +int vm_restore_time(struct vm *vm); + #ifdef _SYS__CPUSET_H_ /* @@ -408,6 +423,15 @@ int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); +/* + * Function used to keep track of the guest's TSC offset. The + * offset is used by the virtualization extensions to provide a consistent + * value for the Time Stamp Counter to the guest. + * + * Return value is 0 on success and non-zero on failure. 
+ */ +int vm_set_tsc_offset(struct vm *vm, int vcpu_id, uint64_t offset); + enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { Index: sys/amd64/include/vmm_dev.h =================================================================== --- sys/amd64/include/vmm_dev.h +++ sys/amd64/include/vmm_dev.h @@ -31,6 +31,8 @@ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ +struct vm_snapshot_meta; + #ifdef _KERNEL void vmmdev_init(void); int vmmdev_cleanup(void); @@ -312,6 +314,11 @@ IOCNUM_RTC_WRITE = 101, IOCNUM_RTC_SETTIME = 102, IOCNUM_RTC_GETTIME = 103, + + /* checkpoint */ + IOCNUM_SNAPSHOT_REQ = 113, + + IOCNUM_RESTORE_TIME = 115 }; #define VM_RUN \ @@ -422,4 +429,8 @@ _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time) #define VM_RESTART_INSTRUCTION \ _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) +#define VM_SNAPSHOT_REQ \ + _IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta) +#define VM_RESTORE_TIME \ + _IOWR('v', IOCNUM_RESTORE_TIME, int) #endif Index: sys/amd64/include/vmm_snapshot.h =================================================================== --- /dev/null +++ sys/amd64/include/vmm_snapshot.h @@ -0,0 +1,119 @@ +#ifndef _VMM_SNAPSHOT_ +#define _VMM_SNAPSHOT_ + +#include +#include +#ifndef _KERNEL +#include +#endif + +struct vmctx; + +enum snapshot_req { + STRUCT_VMX, + STRUCT_VIOAPIC, + STRUCT_VM, + STRUCT_VLAPIC, + VM_MEM, + STRUCT_VHPET, + STRUCT_VMCX, + STRUCT_VATPIC, + STRUCT_VATPIT, + STRUCT_VPMTMR, + STRUCT_VRTC, +}; + +struct vm_snapshot_buffer { + /* + * R/O for device-specific functions; + * written by generic snapshot functions. + */ + uint8_t *const buf_start; + const size_t buf_size; + + /* + * R/W for device-specific functions used to keep track of buffer + * current position and remaining size. + */ + uint8_t *buf; + size_t buf_rem; + + /* + * Length of the snapshot is either determined as (buf_size - buf_rem) + * or (buf - buf_start) -- the second variation returns a signed value + * so it may not be appropriate. 
+ * + * Use vm_get_snapshot_size(meta). + */ +}; + +enum vm_snapshot_op { + VM_SNAPSHOT_SAVE, + VM_SNAPSHOT_RESTORE, +}; + +struct vm_snapshot_meta { + struct vmctx *ctx; + void *dev_data; + const char *dev_name; /* identify userspace devices */ + enum snapshot_req dev_req; /* identify kernel structs */ + + struct vm_snapshot_buffer buffer; + + enum vm_snapshot_op op; +}; + + +void vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op); +int vm_snapshot_buf(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta); +size_t vm_get_snapshot_size(struct vm_snapshot_meta *meta); +int vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null, + struct vm_snapshot_meta *meta); +int vm_snapshot_buf_cmp(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta); + +#define SNAPSHOT_BUF_OR_LEAVE(DATA, LEN, META, RES, LABEL) \ +do { \ + (RES) = vm_snapshot_buf((DATA), (LEN), (META)); \ + if ((RES) != 0) { \ + vm_snapshot_buf_err(#DATA, (META)->op); \ + goto LABEL; \ + } \ +} while (0) + +#define SNAPSHOT_VAR_OR_LEAVE(DATA, META, RES, LABEL) \ + SNAPSHOT_BUF_OR_LEAVE(&(DATA), sizeof(DATA), (META), (RES), LABEL) + +/* + * Address variables are pointers to guest memory. + * + * When RNULL != 0, do not enforce invalid address checks; instead, make the + * pointer NULL at restore time. 
+ */ +#define SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ADDR, LEN, RNULL, META, RES, LABEL) \ +do { \ + (RES) = vm_snapshot_guest2host_addr((void **)&(ADDR), (LEN), (RNULL), \ + (META)); \ + if ((RES) != 0) { \ + if ((RES) == EFAULT) \ + fprintf(stderr, "%s: invalid address: %s\r\n", \ + __func__, #ADDR); \ + goto LABEL; \ + } \ +} while (0) + +/* compare the value in the meta buffer with the data */ +#define SNAPSHOT_BUF_CMP_OR_LEAVE(DATA, LEN, META, RES, LABEL) \ +do { \ + (RES) = vm_snapshot_buf_cmp((DATA), (LEN), (META)); \ + if ((RES) != 0) { \ + vm_snapshot_buf_err(#DATA, (META)->op); \ + goto LABEL; \ + } \ +} while (0) + +#define SNAPSHOT_VAR_CMP_OR_LEAVE(DATA, META, RES, LABEL) \ + SNAPSHOT_BUF_CMP_OR_LEAVE(&(DATA), sizeof(DATA), (META), (RES), LABEL) + +#endif Index: sys/amd64/vmm/amd/svm.h =================================================================== --- sys/amd64/vmm/amd/svm.h +++ sys/amd64/vmm/amd/svm.h @@ -32,6 +32,7 @@ #define _SVM_H_ struct pcpu; +struct svm_softc; /* * Guest register state that is saved outside the VMCB. 
@@ -66,5 +67,8 @@ }; void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct pcpu *pcpu); +#ifdef BHYVE_SNAPSHOT +int svm_set_tsc_offset(struct svm_softc *sc, int vcpu, uint64_t offset); +#endif #endif /* _SVM_H_ */ Index: sys/amd64/vmm/amd/svm.c =================================================================== --- sys/amd64/vmm/amd/svm.c +++ sys/amd64/vmm/amd/svm.c @@ -29,6 +29,8 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include @@ -50,6 +52,7 @@ #include #include #include +#include #include "vmm_lapic.h" #include "vmm_stat.h" @@ -275,6 +278,25 @@ svm_enable(NULL); } +#ifdef BHYVE_SNAPSHOT +int +svm_set_tsc_offset(struct svm_softc *sc, int vcpu, uint64_t offset) +{ + int error; + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + ctrl->tsc_offset = offset; + + svm_set_dirty(sc, vcpu, VMCB_CACHE_I); + VCPU_CTR1(sc->vm, vcpu, "tsc offset changed to %#lx", offset); + + error = vm_set_tsc_offset(sc->vm, vcpu, offset); + + return (error); +} +#endif + /* Pentium compatible MSRs */ #define MSR_PENTIUM_START 0 #define MSR_PENTIUM_END 0x1FFF @@ -2197,6 +2219,36 @@ return (EINVAL); } +#ifdef BHYVE_SNAPSHOT +static int +svm_snapshot_reg(void *arg, int vcpu, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = svm_getreg(arg, vcpu, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = svm_setreg(arg, vcpu, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} +#endif + static int svm_setcap(void *arg, int vcpu, int type, int val) { @@ -2279,6 +2331,306 @@ free(vlapic, M_SVM_VLAPIC); } +#ifdef BHYVE_SNAPSHOT +static int +svm_snapshot_vmi(void *arg, struct vm_snapshot_meta *meta) +{ + /* struct svm_softc is AMD's representation for SVM softc 
*/ + struct svm_softc *sc; + struct svm_vcpu *vcpu; + struct vmcb *vmcb; + uint64_t val; + int i; + int ret; + + sc = arg; + + KASSERT(sc != NULL, ("%s: arg was NULL", __func__)); + + SNAPSHOT_VAR_OR_LEAVE(sc->nptp, meta, ret, done); + + for (i = 0; i < VM_MAXCPU; i++) { + vcpu = &sc->vcpu[i]; + vmcb = &vcpu->vmcb; + + /* VMCB fields for virtual cpu i */ + SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.v_tpr, meta, ret, done); + val = vmcb->ctrl.v_tpr; + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + vmcb->ctrl.v_tpr = val; + + SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.asid, meta, ret, done); + val = vmcb->ctrl.np_enable; + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + vmcb->ctrl.np_enable = val; + + val = vmcb->ctrl.intr_shadow; + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + vmcb->ctrl.intr_shadow = val; + SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.tlb_ctrl, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad1, + sizeof(vmcb->state.pad1), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cpl, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad2, + sizeof(vmcb->state.pad2), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.efer, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad3, + sizeof(vmcb->state.pad3), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr4, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dr7, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dr6, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rflags, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rip, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad4, + sizeof(vmcb->state.pad4), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rsp, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad5, + sizeof(vmcb->state.pad5), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rax, meta, ret, done); + 
SNAPSHOT_VAR_OR_LEAVE(vmcb->state.star, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.lstar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cstar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sfmask, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.kernelgsbase, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_cs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_esp, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_eip, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr2, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad6, + sizeof(vmcb->state.pad6), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.g_pat, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dbgctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.br_from, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.br_to, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.int_from, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.int_to, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad7, + sizeof(vmcb->state.pad7), + meta, ret, done); + + /* Snapshot swctx for virtual cpu i */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbp, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rcx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rsi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r8, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r9, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r10, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r11, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r12, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r13, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r14, meta, ret, 
done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r15, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr3, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr6, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr7, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_debugctl, meta, ret, + done); + + /* Save/restore other svm_vcpu struct fields */ + + /* Save/restore NEXTRIP field */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); + + /* Save/restore lastcpu field */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->lastcpu, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->dirty, meta, ret, done); + + /* Save/restore EPTGEN field - EPT is Extended Page Table */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->eptgen, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.gen, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.num, meta, ret, done); + + /* Set all caches dirty */ + if (meta->op == VM_SNAPSHOT_RESTORE) { + svm_set_dirty(sc, i, VMCB_CACHE_ASID); + svm_set_dirty(sc, i, VMCB_CACHE_IOPM); + svm_set_dirty(sc, i, VMCB_CACHE_I); + svm_set_dirty(sc, i, VMCB_CACHE_TPR); + svm_set_dirty(sc, i, VMCB_CACHE_CR2); + svm_set_dirty(sc, i, VMCB_CACHE_CR); + svm_set_dirty(sc, i, VMCB_CACHE_DT); + svm_set_dirty(sc, i, VMCB_CACHE_SEG); + svm_set_dirty(sc, i, VMCB_CACHE_NP); + } + } + + if (meta->op == VM_SNAPSHOT_RESTORE) + flush_by_asid(); + +done: + return (ret); +} + +static int +svm_snapshot_vmcx(void *arg, struct vm_snapshot_meta *meta, int vcpu) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + int err, running, hostcpu; + + sc = 
(struct svm_softc *)arg; + err = 0; + + KASSERT(arg != NULL, ("%s: arg was NULL", __func__)); + vmcb = svm_get_vmcb(sc, vcpu); + + running = vcpu_is_running(sc->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) { + printf("%s: %s%d is running\n", __func__, vm_name(sc->vm), vcpu); + return (EINVAL); + } + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR0, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR2, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR3, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR4, meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_DR7, meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RAX, meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RSP, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RIP, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RFLAGS, meta); + + /* Guest segments */ + /* ES */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_ES, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_ES, meta); + + /* CS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_CS, meta); + + /* SS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_SS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_SS, meta); + + /* DS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_DS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_DS, meta); + + /* FS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_FS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_FS, meta); + + /* GS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_GS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_GS, meta); + + /* TR */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_TR, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_TR, meta); + + /* LDTR */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_LDTR, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_LDTR, meta); + + /* EFER */ + err 
+= svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_EFER, meta); + + /* IDTR and GDTR */ + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_IDTR, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_GDTR, meta); + + /* Specific AMD registers */ + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_SYSENTER_CS, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_SYSENTER_ESP, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_SYSENTER_EIP, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_NPT_BASE, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_CR_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_DR_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXC_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_INST1_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_INST2_INTERCEPT, 4), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_TLB_CTRL, 4), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINFO1, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINFO2, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINTINFO, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_VIRQ, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_GUEST_PAT, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_BAR, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_PAGE, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_LT, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_PT, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_IO_PERM, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_MSR_PERM, 8), meta); + + 
err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_ASID, 4), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXIT_REASON, 8), meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_INTR_SHADOW, meta); + + return (err); +} + +static int +svm_restore_tsc(void *arg, int vcpu, uint64_t offset) +{ + int err; + + err = svm_set_tsc_offset(arg, vcpu, offset); + + return (err); +} +#endif + struct vmm_ops vmm_ops_amd = { .init = svm_init, .cleanup = svm_cleanup, @@ -2296,4 +2648,9 @@ .vmspace_free = svm_npt_free, .vlapic_init = svm_vlapic_init, .vlapic_cleanup = svm_vlapic_cleanup, +#ifdef BHYVE_SNAPSHOT + .vmsnapshot = svm_snapshot_vmi, + .vmcx_snapshot = svm_snapshot_vmcx, + .vm_restore_tsc = svm_restore_tsc, +#endif }; Index: sys/amd64/vmm/amd/svm_msr.c =================================================================== --- sys/amd64/vmm/amd/svm_msr.c +++ sys/amd64/vmm/amd/svm_msr.c @@ -29,6 +29,8 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include @@ -162,6 +164,11 @@ * Ignore writes to microcode update register. 
*/ break; +#ifdef BHYVE_SNAPSHOT + case MSR_TSC: + error = svm_set_tsc_offset(sc, vcpu, val - rdtsc()); + break; +#endif case MSR_EXTFEATURES: break; default: Index: sys/amd64/vmm/amd/vmcb.h =================================================================== --- sys/amd64/vmm/amd/vmcb.h +++ sys/amd64/vmm/amd/vmcb.h @@ -31,8 +31,6 @@ #ifndef _VMCB_H_ #define _VMCB_H_ -struct svm_softc; - #define BIT(n) (1ULL << n) /* @@ -209,6 +207,10 @@ #define VMCB_ACCESS_OFFSET(v) ((v) & 0xFFF) #ifdef _KERNEL + +struct svm_softc; +struct vm_snapshot_meta; + /* VMCB save state area segment format */ struct vmcb_segment { uint16_t selector; @@ -331,6 +333,14 @@ int vmcb_setdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); int vmcb_getdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg); +#ifdef BHYVE_SNAPSHOT +int vmcb_getany(struct svm_softc *sc, int vcpu, int ident, uint64_t *val); +int vmcb_setany(struct svm_softc *sc, int vcpu, int ident, uint64_t val); +int vmcb_snapshot_desc(void *arg, int vcpu, int reg, + struct vm_snapshot_meta *meta); +int vmcb_snapshot_any(struct svm_softc *sc, int vcpu, int ident, + struct vm_snapshot_meta *meta); +#endif #endif /* _KERNEL */ #endif /* _VMCB_H_ */ Index: sys/amd64/vmm/amd/vmcb.c =================================================================== --- sys/amd64/vmm/amd/vmcb.c +++ sys/amd64/vmm/amd/vmcb.c @@ -29,12 +29,15 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_bhyve_snapshot.h" + #include #include #include #include #include +#include #include "vmm_ktr.h" @@ -452,3 +455,106 @@ return (0); } + +#ifdef BHYVE_SNAPSHOT +int +vmcb_getany(struct svm_softc *sc, int vcpu, int ident, uint64_t *val) +{ + int error = 0; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto err; + } + + if (ident >= VM_REG_LAST) { + error = EINVAL; + goto err; + } + + error = vm_get_register(sc->vm, vcpu, ident, val); + +err: + return (error); +} + +int 
+vmcb_setany(struct svm_softc *sc, int vcpu, int ident, uint64_t val) +{ + int error = 0; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto err; + } + + if (ident >= VM_REG_LAST) { + error = EINVAL; + goto err; + } + + error = vm_set_register(sc->vm, vcpu, ident, val); + +err: + return (error); +} + +int +vmcb_snapshot_desc(void *arg, int vcpu, int reg, struct vm_snapshot_meta *meta) +{ + int ret; + struct seg_desc desc; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcb_getdesc(arg, vcpu, reg, &desc); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + + ret = vmcb_setdesc(arg, vcpu, reg, &desc); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + +int +vmcb_snapshot_any(struct svm_softc *sc, int vcpu, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcb_getany(sc, vcpu, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = vmcb_setany(sc, vcpu, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} +#endif Index: sys/amd64/vmm/intel/vmcs.h =================================================================== --- sys/amd64/vmm/intel/vmcs.h +++ sys/amd64/vmm/intel/vmcs.h @@ -32,6 +32,9 @@ #define _VMCS_H_ #ifdef _KERNEL + +struct vm_snapshot_meta; + struct vmcs { uint32_t identifier; uint32_t abort_code; @@ -55,6 +58,16 @@ struct seg_desc *desc); int vmcs_setdesc(struct vmcs *vmcs, 
int running, int ident,
 		     struct seg_desc *desc);
+#ifdef BHYVE_SNAPSHOT
+int vmcs_getany(struct vmcs *vmcs, int running, int ident, uint64_t *val);
+int vmcs_setany(struct vmcs *vmcs, int running, int ident, uint64_t val);
+int vmcs_snapshot_reg(struct vmcs *vmcs, int running, int ident,
+		      struct vm_snapshot_meta *meta);
+int vmcs_snapshot_desc(struct vmcs *vmcs, int running, int seg,
+		       struct vm_snapshot_meta *meta);
+int vmcs_snapshot_any(struct vmcs *vmcs, int running, int ident,
+		      struct vm_snapshot_meta *meta);
+#endif
 /*
  * Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h
Index: sys/amd64/vmm/intel/vmcs.c
===================================================================
--- sys/amd64/vmm/intel/vmcs.c
+++ sys/amd64/vmm/intel/vmcs.c
@@ -28,6 +28,7 @@
  * $FreeBSD$
  */
+#include "opt_bhyve_snapshot.h"
 #include "opt_ddb.h"
 #include
@@ -43,6 +44,7 @@
 #include
 #include
+#include
 #include "vmm_host.h"
 #include "vmx_cpufunc.h"
 #include "vmcs.h"
@@ -428,6 +430,153 @@
 	return (error);
 }
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Read the VMCS field 'ident' into '*val' with vmread().  When 'running' is
+ * zero the VMCS is first loaded with VMPTRLD() and released with VMCLEAR()
+ * once the access is done.  Returns the status of vmread().
+ */
+int
+vmcs_getany(struct vmcs *vmcs, int running, int ident, uint64_t *val)
+{
+	int error;
+
+	if (!running)
+		VMPTRLD(vmcs);
+
+	error = vmread(ident, val);
+
+	if (!running)
+		VMCLEAR(vmcs);
+
+	return (error);
+}
+
+/*
+ * Write 'val' to the VMCS field 'ident' with vmwrite(), bracketing the
+ * access with VMPTRLD()/VMCLEAR() when 'running' is zero.  Returns the
+ * status of vmwrite().
+ */
+int
+vmcs_setany(struct vmcs *vmcs, int running, int ident, uint64_t val)
+{
+	int error;
+
+	if (!running)
+		VMPTRLD(vmcs);
+
+	error = vmwrite(ident, val);
+
+	if (!running)
+		VMCLEAR(vmcs);
+
+	return (error);
+}
+
+/*
+ * Save or restore one guest register selected by 'ident'.  On save the
+ * value is read with vmcs_getreg() and handed to SNAPSHOT_VAR_OR_LEAVE(); on
+ * restore it is obtained from the macro and applied with vmcs_setreg().
+ * Any other 'meta->op' yields EINVAL.
+ */
+int
+vmcs_snapshot_reg(struct vmcs *vmcs, int running, int ident,
+		  struct vm_snapshot_meta *meta)
+{
+	int ret;
+	uint64_t val;
+
+	if (meta->op == VM_SNAPSHOT_SAVE) {
+		ret = vmcs_getreg(vmcs, running, ident, &val);
+		if (ret != 0)
+			goto done;
+
+		SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
+	} else if (meta->op == VM_SNAPSHOT_RESTORE) {
+		SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
+
+		ret = vmcs_setreg(vmcs, running, ident, val);
+		if (ret != 0)
+			goto done;
+	} else {
+		ret = EINVAL;
+		goto done;
+	}
+
+done:
+	return (ret);
+}
+
+/*
+ * Save or restore the base, limit and access fields of the segment
+ * descriptor 'seg' through vmcs_getdesc()/vmcs_setdesc().  Any other
+ * 'meta->op' yields EINVAL.
+ */
+int
+vmcs_snapshot_desc(struct vmcs *vmcs, int running, int seg,
+		   struct vm_snapshot_meta *meta)
+{
+	int ret;
+	struct seg_desc desc;
+
+	if (meta->op == VM_SNAPSHOT_SAVE) {
+		ret = vmcs_getdesc(vmcs, running, seg, &desc);
+		if (ret != 0)
+			goto done;
+
+		SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done);
+	} else if (meta->op == VM_SNAPSHOT_RESTORE) {
+		SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done);
+
+		ret = vmcs_setdesc(vmcs, running, seg, &desc);
+		if (ret != 0)
+			goto done;
+	} else {
+		ret = EINVAL;
+		goto done;
+	}
+
+done:
+	return (ret);
+}
+
+/*
+ * Save or restore one VMCS field through vmcs_getany()/vmcs_setany().
+ * Any other 'meta->op' yields EINVAL.
+ */
+int
+vmcs_snapshot_any(struct vmcs *vmcs, int running, int ident,
+		  struct vm_snapshot_meta *meta)
+{
+	int ret;
+	uint64_t val;
+
+	if (meta->op == VM_SNAPSHOT_SAVE) {
+		ret = vmcs_getany(vmcs, running, ident, &val);
+		if (ret != 0)
+			goto done;
+
+		SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
+	} else if (meta->op == VM_SNAPSHOT_RESTORE) {
+		SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
+
+		ret = vmcs_setany(vmcs, running, ident, val);
+		if (ret != 0)
+			goto done;
+	} else {
+		ret = EINVAL;
+		goto done;
+	}
+
+done:
+	return (ret);
+}
+#endif
+
 #ifdef DDB
 extern int vmxon_enabled[];
Index: sys/amd64/vmm/intel/vmx.c
===================================================================
--- sys/amd64/vmm/intel/vmx.c
+++ sys/amd64/vmm/intel/vmx.c
@@ -32,6 +32,8 @@
 #include
 __FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
 #include
 #include
 #include
@@ -56,6 +58,8 @@
 #include
 #include
 #include
+#include
+
 #include "vmm_lapic.h"
 #include "vmm_host.h"
 #include "vmm_ioport.h"
@@ -288,6 +292,9 @@
 static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
 static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
 static void vmx_inject_pir(struct
vlapic *vlapic);
+#ifdef BHYVE_SNAPSHOT
+static int vmx_restore_tsc(void *arg, int vcpu, uint64_t offset);
+#endif
 #ifdef KTR
 static const char *
@@ -1281,7 +1288,10 @@
 	}
 	error = vmwrite(VMCS_TSC_OFFSET, offset);
-
+#ifdef BHYVE_SNAPSHOT
+	if (error == 0)
+		error = vm_set_tsc_offset(vmx->vm, vcpu, offset);
+#endif
 	return (error);
 }
@@ -3789,6 +3799,186 @@
 	free(vlapic, M_VLAPIC);
 }
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the software-managed guest state kept in 'struct vmx':
+ * for each of the VM_MAXCPU slots, the guest MSR array and the registers
+ * stored in 'struct vmxctx'.  'arg' is the vmx softc.  The direction comes
+ * from 'meta'; the SNAPSHOT_*_OR_LEAVE() macros presumably store their
+ * status in 'ret' and jump to 'done' on failure.
+ */
+static int
+vmx_snapshot_vmi(void *arg, struct vm_snapshot_meta *meta)
+{
+	struct vmx *vmx;
+	struct vmxctx *vmxctx;
+	int i;
+	int ret;
+
+	vmx = arg;
+
+	KASSERT(vmx != NULL, ("%s: arg was NULL", __func__));
+
+	for (i = 0; i < VM_MAXCPU; i++) {
+		SNAPSHOT_BUF_OR_LEAVE(vmx->guest_msrs[i],
+		      sizeof(vmx->guest_msrs[i]), meta, ret, done);
+
+		vmxctx = &vmx->ctx[i];
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdi, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rsi, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdx, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rcx, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r8, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r9, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rax, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbx, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbp, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r10, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r11, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r12, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r13, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r14, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r15, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_cr2, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr0, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr1, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr2, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr3, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr6, meta, ret, done);
+	}
+
+done:
+	return (ret);
+}
+
+/*
+ * Run one vmcs_snapshot_*() helper for vmx_snapshot_vmcx() and leave through
+ * 'done' on the first failure, so the caller receives that helper's errno
+ * instead of a sum of several error codes.
+ */
+#define	VMCS_SNAP_OR_LEAVE(func, ident) do {				\
+	err = func(vmcs, run, (ident), meta);				\
+	if (err != 0)							\
+		goto done;						\
+} while (0)
+
+/*
+ * Save or restore the guest state of 'vcpu' that is held in its VMCS:
+ * control and general registers, segment selectors and descriptors, PDPTEs
+ * and a few raw VMCS fields.  Fails with EINVAL when the vCPU is running on
+ * another host CPU.  The call order below is the snapshot layout.
+ */
+static int
+vmx_snapshot_vmcx(void *arg, struct vm_snapshot_meta *meta, int vcpu)
+{
+	struct vmcs *vmcs;
+	struct vmx *vmx;
+	int err, run, hostcpu;
+
+	vmx = (struct vmx *)arg;
+	err = 0;
+
+	KASSERT(arg != NULL, ("%s: arg was NULL", __func__));
+	vmcs = &vmx->vmcs[vcpu];
+
+	run = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+	if (run && hostcpu != curcpu) {
+		printf("%s: %s%d is running\n", __func__,
+		    vm_name(vmx->vm), vcpu);
+		return (EINVAL);
+	}
+
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_CR0);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_CR3);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_CR4);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_DR7);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_RSP);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_RIP);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_RFLAGS);
+
+	/* Guest segments */
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_ES);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_desc, VM_REG_GUEST_ES);
+
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_CS);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_desc, VM_REG_GUEST_CS);
+
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_SS);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_desc, VM_REG_GUEST_SS);
+
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_DS);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_desc, VM_REG_GUEST_DS);
+
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_FS);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_desc, VM_REG_GUEST_FS);
+
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_GS);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_desc, VM_REG_GUEST_GS);
+
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_TR);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_desc, VM_REG_GUEST_TR);
+
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_LDTR);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_desc, VM_REG_GUEST_LDTR);
+
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_EFER);
+
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_desc, VM_REG_GUEST_IDTR);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_desc, VM_REG_GUEST_GDTR);
+
+	/* Guest page tables */
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_PDPTE0);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_PDPTE1);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_PDPTE2);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_reg, VM_REG_GUEST_PDPTE3);
+
+	/* Other guest state */
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_any, VMCS_GUEST_IA32_SYSENTER_CS);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_any, VMCS_GUEST_IA32_SYSENTER_ESP);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_any, VMCS_GUEST_IA32_SYSENTER_EIP);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_any, VMCS_GUEST_INTERRUPTIBILITY);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_any, VMCS_GUEST_ACTIVITY);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_any, VMCS_ENTRY_CTLS);
+	VMCS_SNAP_OR_LEAVE(vmcs_snapshot_any, VMCS_EXIT_CTLS);
+
+done:
+	return (err);
+}
+#undef VMCS_SNAP_OR_LEAVE
+
+/*
+ * Install 'offset' as the TSC offset of 'vcpu' via vmx_set_tsc_offset().
+ * If the vCPU is not running, its VMCS is loaded with VMPTRLD() for the
+ * update and released with VMCLEAR() afterwards; a vCPU running on another
+ * host CPU is rejected with EINVAL.
+ */
+static int
+vmx_restore_tsc(void *arg, int vcpu, uint64_t offset)
+{
+	struct vmcs *vmcs;
+	struct vmx *vmx = (struct vmx *)arg;
+	int error, running, hostcpu;
+
+	KASSERT(arg != NULL, ("%s: arg was NULL", __func__));
+	vmcs = &vmx->vmcs[vcpu];
+
+	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+	if (running && hostcpu != curcpu) {
+		printf("%s: %s%d is running\n", __func__,
+		    vm_name(vmx->vm), vcpu);
+		return (EINVAL);
+	}
+
+	if (!running)
+		VMPTRLD(vmcs);
+
+	error = vmx_set_tsc_offset(vmx, vcpu, offset);
+
+	if (!running)
+		VMCLEAR(vmcs);
+	return (error);
+}
+#endif
+
 struct vmm_ops vmm_ops_intel = {
 	.init = vmx_init,
 	.cleanup = vmx_cleanup,
@@ -3806,4 +3996,9 @@
 	.vmspace_free = ept_vmspace_free,
 	.vlapic_init = vmx_vlapic_init,
 	.vlapic_cleanup = vmx_vlapic_cleanup,
+#ifdef BHYVE_SNAPSHOT
+	.vmsnapshot = vmx_snapshot_vmi,
+	.vmcx_snapshot = vmx_snapshot_vmcx,
+	.vm_restore_tsc = vmx_restore_tsc,
+#endif
 };
Index: sys/amd64/vmm/io/vatpic.h
===================================================================
--- sys/amd64/vmm/io/vatpic.h
+++ sys/amd64/vmm/io/vatpic.h
@@ -36,6 +36,8 @@
 #define IO_ELCR1 0x4d0
 #define IO_ELCR2 0x4d1
+struct vm_snapshot_meta;
+
 struct vatpic *vatpic_init(struct vm *vm);
 void vatpic_cleanup(struct vatpic *vatpic);
@@ -54,4 +56,8 @@
 void vatpic_pending_intr(struct vm *vm, int *vecptr);
 void vatpic_intr_accepted(struct vm *vm, int vector);
+#ifdef BHYVE_SNAPSHOT
+int vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta);
+#endif
+
 #endif /* _VATPIC_H_ */
Index: sys/amd64/vmm/io/vatpic.c
===================================================================
--- sys/amd64/vmm/io/vatpic.c
+++ sys/amd64/vmm/io/vatpic.c
@@ -29,6 +29,8 @@
 #include
 __FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
 #include
 #include
 #include
@@ -42,6 +44,7 @@
 #include
 #include
+#include
 #include "vmm_ktr.h"
 #include "vmm_lapic.h"
@@ -808,3 +811,48 @@
 {
 	free(vatpic, M_VATPIC);
 }
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore every 'struct atpic' in 'vatpic->atpic' field by field,
+ * followed by the 'elc' array.  The SNAPSHOT_*_OR_LEAVE() macros presumably
+ * store their status in 'ret' and jump to 'done' on failure.
+ */
+int
+vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta)
+{
+	int ret;
+	int i;
+	struct atpic *atpic;
+
+	for (i = 0; i < nitems(vatpic->atpic); i++) {
+		atpic = &vatpic->atpic[i];
+
+		SNAPSHOT_VAR_OR_LEAVE(atpic->ready, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->icw_num, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->rd_cmd_reg, meta, ret, done);
+
+		SNAPSHOT_VAR_OR_LEAVE(atpic->aeoi, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->poll, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->rotate, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->sfn, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->irq_base, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->request, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->service, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->mask, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->smm, meta, ret, done);
+
+		SNAPSHOT_BUF_OR_LEAVE(atpic->acnt, sizeof(atpic->acnt),
+			meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->lowprio, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(atpic->intr_raised, meta, ret, done);
+
+	}
+
+	SNAPSHOT_BUF_OR_LEAVE(vatpic->elc, sizeof(vatpic->elc),
+		meta, ret, done);
+
+done:
+	return (ret);
+}
+#endif
Index: sys/amd64/vmm/io/vatpit.h
===================================================================
--- sys/amd64/vmm/io/vatpit.h
+++ sys/amd64/vmm/io/vatpit.h
@@ -36,6 +36,8 @@
 #define NMISC_PORT 0x61
+struct vm_snapshot_meta;
+
 struct vatpit *vatpit_init(struct vm *vm);
 void vatpit_cleanup(struct vatpit *vatpit);
@@ -43,5 +45,8 @@
     uint32_t *eax);
 int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port,
     int bytes, uint32_t *eax);
+#ifdef BHYVE_SNAPSHOT
+int vatpit_snapshot(struct vatpit *vatpit, struct vm_snapshot_meta *meta);
+#endif
 #endif /* _VATPIT_H_ */
Index: sys/amd64/vmm/io/vatpit.c
===================================================================
--- sys/amd64/vmm/io/vatpit.c
+++ sys/amd64/vmm/io/vatpit.c
@@ -29,6 +29,8 @@
 #include
 __FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
 #include
 #include
 #include
@@ -39,6 +41,7 @@
 #include
 #include
+#include
 #include "vmm_ktr.h"
 #include "vatpic.h"
@@ -472,3 +475,47 @@
 	free(vatpit, M_VATPIT);
 }
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the PIT frequency and the per-channel state in
+ * 'vatpit->channel'.  As the message printed below warns, nothing in this
+ * function re-arms the channel callouts after a restore.
+ */
+int
+vatpit_snapshot(struct vatpit *vatpit, struct vm_snapshot_meta *meta)
+{
+	int ret;
+	int i;
+	struct channel *channel;
+
+	SNAPSHOT_VAR_OR_LEAVE(vatpit->freq_bt.sec, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vatpit->freq_bt.frac, meta, ret, done);
+
+	/* TODO: properly restore timers; they will NOT work currently */
+	printf("%s: snapshot restore does not reset timers!\r\n", __func__);
+
+	for (i = 0; i < nitems(vatpit->channel); i++) {
+		channel = &vatpit->channel[i];
+
+		SNAPSHOT_VAR_OR_LEAVE(channel->mode, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(channel->initial, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(channel->now_bt.sec, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(channel->now_bt.frac, meta, ret, done);
+		SNAPSHOT_BUF_OR_LEAVE(channel->cr, sizeof(channel->cr),
+			meta, ret, done);
+		SNAPSHOT_BUF_OR_LEAVE(channel->ol, sizeof(channel->ol),
+			meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(channel->slatched, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(channel->status, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(channel->crbyte, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(channel->frbyte, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(channel->callout_bt.sec, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(channel->callout_bt.frac, meta, ret,
+			done);
+	}
+
+done:
+	return (ret);
+}
+#endif
Index: sys/amd64/vmm/io/vhpet.h
===================================================================
--- sys/amd64/vmm/io/vhpet.h
+++ sys/amd64/vmm/io/vhpet.h
@@ -35,6 +35,8 @@
 #define VHPET_BASE 0xfed00000
 #define VHPET_SIZE 1024
+struct vm_snapshot_meta;
+
 struct vhpet *vhpet_init(struct vm *vm);
 void vhpet_cleanup(struct vhpet *vhpet);
 int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val,
@@ -42,5 +44,9 @@
 int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val,
     int size, void *arg);
 int vhpet_getcap(struct vm_hpet_cap *cap);
+#ifdef BHYVE_SNAPSHOT
+int vhpet_snapshot(struct vhpet *vhpet, struct vm_snapshot_meta *meta);
+int vhpet_restore_time(struct vhpet *vhpet);
+#endif
 #endif /* _VHPET_H_ */
Index: sys/amd64/vmm/io/vhpet.c
===================================================================
--- sys/amd64/vmm/io/vhpet.c
+++ sys/amd64/vmm/io/vhpet.c
@@ -32,6 +32,8 @@
 #include
 __FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
 #include
 #include
 #include
@@ -43,6 +45,7 @@
 #include
 #include
+#include
 #include "vmm_lapic.h"
 #include "vatpic.h"
@@ -761,3 +764,58 @@
 	cap->capabilities = vhpet_capabilities();
 	return (0);
 }
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the HPET registers and per-timer state.  The main counter
+ * is exchanged as the value computed by vhpet_counter() at save time and is
+ * written back to 'vhpet->countbase' on restore.
+ */
+int
+vhpet_snapshot(struct vhpet *vhpet, struct vm_snapshot_meta *meta)
+{
+	int i, ret;
+	uint32_t countbase;
+
+	SNAPSHOT_VAR_OR_LEAVE(vhpet->freq_sbt, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vhpet->config, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vhpet->isr, meta, ret, done);
+
+	/* at restore time the countbase should have the value it had when the
+	 * snapshot was created; since the value is not directly kept in
+	 * vhpet->countbase, but rather computed relative to the current system
+	 * uptime using countbase_sbt, save the value returned by vhpet_counter
+	 */
+	if (meta->op == VM_SNAPSHOT_SAVE)
+		countbase = vhpet_counter(vhpet, NULL);
+	SNAPSHOT_VAR_OR_LEAVE(countbase, meta, ret, done);
+	if (meta->op == VM_SNAPSHOT_RESTORE)
+		vhpet->countbase = countbase;
+
+	for (i = 0; i < nitems(vhpet->timer); i++) {
+		SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].cap_config,
+				      meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].msireg, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].compval, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].comprate, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].callout_sbt,
+				      meta, ret, done);
+	}
+
+done:
+	return (ret);
+}
+
+/*
+ * Restart the main counter with vhpet_start_counting() if it is enabled.
+ * Always returns 0.
+ */
+int
+vhpet_restore_time(struct vhpet *vhpet)
+{
+	if (vhpet_counter_enabled(vhpet))
+		vhpet_start_counting(vhpet);
+
+	return (0);
+}
+#endif
Index: sys/amd64/vmm/io/vioapic.h
===================================================================
--- sys/amd64/vmm/io/vioapic.h
+++ sys/amd64/vmm/io/vioapic.h
@@ -32,6 +32,8 @@
 #ifndef _VIOAPIC_H_
 #define _VIOAPIC_H_
+struct vm_snapshot_meta;
+
 #define VIOAPIC_BASE 0xFEC00000
 #define VIOAPIC_SIZE 4096
@@ -49,4 +51,9 @@
 int vioapic_pincount(struct vm *vm);
 void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector);
+#ifdef BHYVE_SNAPSHOT
+int vioapic_snapshot(struct vioapic *vioapic,
+		     struct vm_snapshot_meta *meta);
+#endif
+
 #endif
Index: sys/amd64/vmm/io/vioapic.c
===================================================================
--- sys/amd64/vmm/io/vioapic.c
+++ sys/amd64/vmm/io/vioapic.c
@@ -32,6 +32,8 @@
 #include
 __FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
 #include
 #include
 #include
@@ -42,6 +44,7 @@
 #include
 #include
+#include
 #include
"vmm_ktr.h"
 #include "vmm_lapic.h"
@@ -499,3 +502,26 @@
 	return (REDIR_ENTRIES);
 }
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the I/O APIC register select value and the 'reg' and
+ * 'acnt' members of every redirection table entry.
+ */
+int
+vioapic_snapshot(struct vioapic *vioapic, struct vm_snapshot_meta *meta)
+{
+	int ret;
+	int i;
+
+	SNAPSHOT_VAR_OR_LEAVE(vioapic->ioregsel, meta, ret, done);
+
+	for (i = 0; i < nitems(vioapic->rtbl); i++) {
+		SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].reg, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].acnt, meta, ret, done);
+	}
+
+done:
+	return (ret);
+}
+#endif
Index: sys/amd64/vmm/io/vlapic.h
===================================================================
--- sys/amd64/vmm/io/vlapic.h
+++ sys/amd64/vmm/io/vlapic.h
@@ -32,6 +32,7 @@
 #define _VLAPIC_H_
 struct vm;
+struct vm_snapshot_meta;
 enum x2apic_state;
 int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset,
@@ -107,4 +108,9 @@
 void vlapic_dcr_write_handler(struct vlapic *vlapic);
 void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset);
 void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val);
+
+#ifdef BHYVE_SNAPSHOT
+int vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta);
+#endif
+
 #endif /* _VLAPIC_H_ */
Index: sys/amd64/vmm/io/vlapic.c
===================================================================
--- sys/amd64/vmm/io/vlapic.c
+++ sys/amd64/vmm/io/vlapic.c
@@ -32,6 +32,8 @@
 #include
 __FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
 #include
 #include
 #include
@@ -47,6 +49,7 @@
 #include
 #include
+#include
 #include "vmm_lapic.h"
 #include "vmm_ktr.h"
@@ -1643,3 +1646,116 @@
 	VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector);
 	vlapic_set_tmr(vlapic, vector, true);
 }
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Re-arm the local APIC timer callout after a restore.  'ccr' is the
+ * current count register value captured at save time; when it is zero the
+ * callout is re-armed only for periodic timers, using the full period.
+ */
+static void
+vlapic_reset_callout(struct vlapic *vlapic, uint32_t ccr)
+{
+	/* The implementation is similar to the one in the
+	 * `vlapic_icrtmr_write_handler` function
+	 */
+	sbintime_t sbt;
+	struct bintime bt;
+
+	VLAPIC_TIMER_LOCK(vlapic);
+
+	bt = vlapic->timer_freq_bt;
+	bintime_mul(&bt, ccr);
+
+	if (ccr != 0) {
+		binuptime(&vlapic->timer_fire_bt);
+		bintime_add(&vlapic->timer_fire_bt, &bt);
+
+		sbt = bttosbt(bt);
+		callout_reset_sbt(&vlapic->callout, sbt, 0,
+		    vlapic_callout_handler, vlapic, 0);
+	} else {
+		/* even if the CCR was 0, periodic timers should be reset */
+		if (vlapic_periodic_timer(vlapic)) {
+			binuptime(&vlapic->timer_fire_bt);
+			bintime_add(&vlapic->timer_fire_bt,
+				    &vlapic->timer_period_bt);
+			sbt = bttosbt(vlapic->timer_period_bt);
+
+			callout_stop(&vlapic->callout);
+			callout_reset_sbt(&vlapic->callout, sbt, 0,
+					  vlapic_callout_handler, vlapic, 0);
+		}
+	}
+
+	VLAPIC_TIMER_UNLOCK(vlapic);
+}
+
+/*
+ * Save or restore the local APIC of every vCPU: the APIC page, the software
+ * state in 'struct vlapic' and the current count register.  On restore the
+ * timer period is recomputed and the timer callout is re-armed.
+ */
+int
+vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta)
+{
+	int i, ret;
+	struct vlapic *vlapic;
+	struct LAPIC *lapic;
+	uint32_t ccr;
+
+	KASSERT(vm != NULL, ("%s: arg was NULL", __func__));
+
+	ret = 0;
+
+	for (i = 0; i < VM_MAXCPU; i++) {
+		vlapic = vm_lapic(vm, i);
+
+		/* snapshot the page first; timer period depends on icr_timer */
+		lapic = vlapic->apic_page;
+		SNAPSHOT_BUF_OR_LEAVE(lapic, PAGE_SIZE, meta, ret, done);
+
+		SNAPSHOT_VAR_OR_LEAVE(vlapic->esr_pending, meta, ret, done);
+
+		SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.sec,
+				      meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.frac,
+				      meta, ret, done);
+
+		/*
+		 * Timer period is equal to 'icr_timer' ticks at a frequency of
+		 * 'timer_freq_bt'.
+		 */
+		if (meta->op == VM_SNAPSHOT_RESTORE) {
+			vlapic->timer_period_bt = vlapic->timer_freq_bt;
+			bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer);
+		}
+
+		SNAPSHOT_BUF_OR_LEAVE(vlapic->isrvec_stk,
+				      sizeof(vlapic->isrvec_stk),
+				      meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vlapic->isrvec_stk_top, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vlapic->boot_state, meta, ret, done);
+
+		SNAPSHOT_BUF_OR_LEAVE(vlapic->lvt_last,
+				      sizeof(vlapic->lvt_last),
+				      meta, ret, done);
+
+		if (meta->op == VM_SNAPSHOT_SAVE)
+			ccr = vlapic_get_ccr(vlapic);
+
+		SNAPSHOT_VAR_OR_LEAVE(ccr, meta, ret, done);
+
+		if (meta->op == VM_SNAPSHOT_RESTORE) {
+			/* Reset the value of the 'timer_fire_bt' and the vlapic
+			 * callout based on the value of the current count
+			 * register saved when the VM snapshot was created
+			 */
+			vlapic_reset_callout(vlapic, ccr);
+		}
+	}
+
+done:
+	return (ret);
+}
+#endif
Index: sys/amd64/vmm/io/vpmtmr.h
===================================================================
--- sys/amd64/vmm/io/vpmtmr.h
+++ sys/amd64/vmm/io/vpmtmr.h
@@ -34,6 +34,7 @@
 #define IO_PMTMR 0x408
 struct vpmtmr;
+struct vm_snapshot_meta;
 struct vpmtmr *vpmtmr_init(struct vm *vm);
 void vpmtmr_cleanup(struct vpmtmr *pmtmr);
@@ -41,4 +42,8 @@
 int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
     uint32_t *val);
+#ifdef BHYVE_SNAPSHOT
+int vpmtmr_snapshot(struct vpmtmr *vpmtmr, struct vm_snapshot_meta *meta);
+#endif
+
 #endif
Index: sys/amd64/vmm/io/vpmtmr.c
===================================================================
--- sys/amd64/vmm/io/vpmtmr.c
+++ sys/amd64/vmm/io/vpmtmr.c
@@ -29,6 +29,8 @@
 #include
 __FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
 #include
 #include
 #include
@@ -36,6 +38,7 @@
 #include
 #include
+#include
 #include "vpmtmr.h"
@@ -103,3 +106,17 @@
 	return (0);
 }
+
+#ifdef BHYVE_SNAPSHOT
+/* Save or restore 'vpmtmr->baseval'. */
+int
+vpmtmr_snapshot(struct vpmtmr *vpmtmr, struct vm_snapshot_meta *meta)
+{
+	int ret;
+
+	SNAPSHOT_VAR_OR_LEAVE(vpmtmr->baseval, meta, ret, done);
+
+done:
+	return (ret);
+}
+#endif
Index: sys/amd64/vmm/io/vrtc.h
===================================================================
--- sys/amd64/vmm/io/vrtc.h
+++ sys/amd64/vmm/io/vrtc.h
@@ -34,6 +34,7 @@
 #include
 struct vrtc;
+struct vm_snapshot_meta;
 struct vrtc *vrtc_init(struct vm *vm);
 void vrtc_cleanup(struct vrtc *vrtc);
@@ -49,4 +50,8 @@
 int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
     uint32_t *val);
+#ifdef BHYVE_SNAPSHOT
+int vrtc_snapshot(struct vrtc *vrtc, struct vm_snapshot_meta *meta);
+#endif
+
 #endif
Index: sys/amd64/vmm/io/vrtc.c
===================================================================
--- sys/amd64/vmm/io/vrtc.c
+++ sys/amd64/vmm/io/vrtc.c
@@ -29,6 +29,8 @@
 #include
 __FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
 #include
 #include
 #include
@@ -40,6 +42,7 @@
 #include
 #include
+#include
 #include
@@ -1019,3 +1022,52 @@
 	callout_drain(&vrtc->callout);
 	free(vrtc, M_VRTC);
 }
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the RTC state while holding VRTC_LOCK.  On restore
+ * 'base_uptime' is reset to the current uptime.  After a successful pass
+ * the RTC callout is re-armed.  The lock is dropped on every exit path.
+ * NOTE(review): the snapshot macros appear to copy to/from userspace while
+ * the lock is held; confirm that is permitted for this lock type.
+ */
+int
+vrtc_snapshot(struct vrtc *vrtc, struct vm_snapshot_meta *meta)
+{
+	int ret;
+
+	VRTC_LOCK(vrtc);
+
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->addr, meta, ret, done);
+	if (meta->op == VM_SNAPSHOT_RESTORE)
+		vrtc->base_uptime = sbinuptime();
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->base_rtctime, meta, ret, done);
+
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.sec, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_sec, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.min, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_min, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.hour, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_hour, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_week, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_month, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.month, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.year, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_a, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_b, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_c, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_d, meta, ret, done);
+	SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram, sizeof(vrtc->rtcdev.nvram),
+			      meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.century, meta, ret, done);
+	SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram2, sizeof(vrtc->rtcdev.nvram2),
+			      meta, ret, done);
+
+	vrtc_callout_reset(vrtc, vrtc_freq(vrtc));
+
+done:
+	/* Reached on success and from every early exit: always drop the lock. */
+	VRTC_UNLOCK(vrtc);
+	return (ret);
+}
+#endif
Index: sys/amd64/vmm/vmm.c
===================================================================
--- sys/amd64/vmm/vmm.c
+++ sys/amd64/vmm/vmm.c
@@ -31,6 +31,8 @@
 #include
 __FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
 #include
 #include
 #include
@@ -44,7 +46,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
@@ -53,6 +55,11 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
+#include
 #include
 #include
@@ -64,6 +71,7 @@
 #include
 #include
 #include
+#include
 #include "vmm_ioport.h"
 #include "vmm_ktr.h"
@@ -111,6 +119,7 @@
 	void *stats; /* (a,i) statistics */
 	struct vm_exit exitinfo; /* (x) exit reason and collateral */
 	uint64_t nextrip; /* (x) next instruction to execute */
+	uint64_t tsc_offset; /* (o) TSC offsetting */
 };
 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
@@ -204,6 +213,14 @@
 	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
 #define VLAPIC_CLEANUP(vmi, vlapic) \
 	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
+#ifdef BHYVE_SNAPSHOT
+#define VM_SNAPSHOT_VMI(vmi, meta) \
+	(ops != NULL ? (*ops->vmsnapshot)(vmi, meta) : ENXIO)
+#define VM_SNAPSHOT_VMCX(vmi, meta, vcpuid) \
+	(ops != NULL ? (*ops->vmcx_snapshot)(vmi, meta, vcpuid) : ENXIO)
+#define VM_RESTORE_TSC(vmi, vcpuid, offset) \
+	(ops != NULL ?
(*ops->vm_restore_tsc)(vmi, vcpuid, offset) : ENXIO)
+#endif
 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
 #define fpu_stop_emulating() clts()
@@ -289,6 +306,7 @@
 		vcpu->hostcpu = NOCPU;
 		vcpu->guestfpu = fpu_save_area_alloc();
 		vcpu->stats = vmm_stat_alloc();
+		vcpu->tsc_offset = 0;
 	}
 	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
@@ -2715,3 +2733,202 @@
 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
+
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Save or restore the per-vCPU state kept in 'struct vcpu' for every vCPU
+ * slot.  'tsc_offset' is exchanged as-is; see the comment at its use.
+ */
+static int
+vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta)
+{
+	int ret;
+	int i;
+	struct vcpu *vcpu;
+
+	for (i = 0; i < VM_MAXCPU; i++) {
+		vcpu = &vm->vcpu[i];
+
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done);
+		/* XXX we're cheating here, since the value of tsc_offset as
+		 * saved here is actually the value of the guest's TSC value.
+		 *
+		 * It will be turned back into an actual offset when the
+		 * TSC restore function is called
+		 */
+		SNAPSHOT_VAR_OR_LEAVE(vcpu->tsc_offset, meta, ret, done);
+	}
+
+done:
+	return (ret);
+}
+
+/*
+ * Save or restore the generic per-vCPU state.  For a save, every
+ * 'tsc_offset' is temporarily biased by the current host TSC so that the
+ * snapshot carries the guest-visible TSC value; the bias is removed again
+ * before returning, whether or not the copy succeeded.
+ */
+static int
+vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta)
+{
+	int ret;
+	int i;
+	uint64_t now;
+
+	now = rdtsc();
+
+	if (meta->op == VM_SNAPSHOT_SAVE) {
+		/* XXX make tsc_offset take the value TSC proper as seen by the
+		 * guest
+		 */
+		for (i = 0; i < VM_MAXCPU; i++)
+			vm->vcpu[i].tsc_offset += now;
+	}
+
+	ret = vm_snapshot_vcpus(vm, meta);
+
+	if (meta->op == VM_SNAPSHOT_SAVE) {
+		/* XXX turn tsc_offset back into an offset; actual value is only
+		 * required for restore; using it otherwise would be wrong.
+		 * This must also happen when the copy failed, otherwise the
+		 * live VM is left with biased offsets.
+		 */
+		for (i = 0; i < VM_MAXCPU; i++)
+			vm->vcpu[i].tsc_offset -= now;
+	}
+
+	if (ret != 0)
+		printf("%s: failed to copy vm data to user buffer\n",
+		    __func__);
+
+	return (ret);
+}
+
+/*
+ * Save or restore the VMCS/VMCB-backed state of every vCPU through the
+ * backend's vmcx_snapshot hook, stopping at the first error.
+ */
+static int
+vm_snapshot_vmcx(struct vm *vm, struct vm_snapshot_meta *meta)
+{
+	int i, error;
+
+	error = 0;
+
+	for (i = 0; i < VM_MAXCPU; i++) {
+		error = VM_SNAPSHOT_VMCX(vm->cookie, meta, i);
+		if (error != 0) {
+			printf("%s: failed to snapshot vmcs/vmcb data for "
+				"vCPU: %d; error: %d\n", __func__, i, error);
+			goto done;
+		}
+	}
+
+done:
+	return (error);
+}
+
+/*
+ * Save kernel-side structures to user-space for snapshotting.
+ * Dispatches on 'meta->dev_req' to the matching snapshot routine and returns
+ * EINVAL for an unknown request.
+ */
+int
+vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta)
+{
+	int ret = 0;
+
+	switch (meta->dev_req) {
+	case STRUCT_VMX:
+		ret = VM_SNAPSHOT_VMI(vm->cookie, meta);
+		break;
+	case STRUCT_VMCX:
+		ret = vm_snapshot_vmcx(vm, meta);
+		break;
+	case STRUCT_VM:
+		ret = vm_snapshot_vm(vm, meta);
+		break;
+	case STRUCT_VIOAPIC:
+		ret = vioapic_snapshot(vm_ioapic(vm), meta);
+		break;
+	case STRUCT_VLAPIC:
+		ret = vlapic_snapshot(vm, meta);
+		break;
+	case STRUCT_VHPET:
+		ret = vhpet_snapshot(vm_hpet(vm), meta);
+		break;
+	case STRUCT_VATPIC:
+		ret = vatpic_snapshot(vm_atpic(vm), meta);
+		break;
+	case STRUCT_VATPIT:
+		ret = vatpit_snapshot(vm_atpit(vm), meta);
+		break;
+	case STRUCT_VPMTMR:
+		ret = vpmtmr_snapshot(vm_pmtmr(vm), meta);
+		break;
+	case STRUCT_VRTC:
+		ret = vrtc_snapshot(vm_rtc(vm), meta);
+		break;
+	default:
+		printf("%s: failed to find the requested type %#x\n",
+		       __func__, meta->dev_req);
+		ret = (EINVAL);
+	}
+	return (ret);
+}
+
+/*
+ * Record 'offset' as the TSC offset of 'vcpuid' in the generic vCPU state.
+ * Returns EINVAL for an out-of-range vCPU id.
+ */
+int
+vm_set_tsc_offset(struct vm *vm, int vcpuid, uint64_t offset)
+{
+	struct vcpu *vcpu;
+
+	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+		return (EINVAL);
+
+	vcpu = &vm->vcpu[vcpuid];
+	vcpu->tsc_offset = offset;
+
+	return (0);
+}
+
+/*
+ * Re-establish guest time after a restore: restart the HPET counter, then
+ * give every vCPU a TSC offset of (saved 'tsc_offset' - current host TSC),
+ * where the saved value is the guest TSC captured by vm_snapshot_vm().
+ */
+int
+vm_restore_time(struct vm *vm)
+{
+	int error, i;
+	uint64_t now;
+	struct vcpu *vcpu;
+
+	now = rdtsc();
+
+	error = vhpet_restore_time(vm_hpet(vm));
+	if (error)
+		return (error);
+
+	for (i = 0; i < nitems(vm->vcpu); i++) {
+		vcpu = &vm->vcpu[i];
+
+		error = VM_RESTORE_TSC(vm->cookie, i, vcpu->tsc_offset - now);
+		if (error)
+			return (error);
+	}
+
+	return (0);
+}
+#endif
Index: sys/amd64/vmm/vmm_dev.c
===================================================================
--- sys/amd64/vmm/vmm_dev.c
+++ sys/amd64/vmm/vmm_dev.c
@@ -31,6 +31,8 @@
 #include
 __FBSDID("$FreeBSD$");
+#include "opt_bhyve_snapshot.h"
+
 #include
 #include
 #include
@@ -53,8 +55,9 @@
 #include
 #include
-#include
 #include
+#include
+#include
 #include "vmm_lapic.h"
 #include "vmm_stat.h"
@@ -381,6 +384,9 @@
struct vm_cpu_topology *topology; uint64_t *regvals; int *regnums; +#ifdef BHYVE_SNAPSHOT + struct vm_snapshot_meta *snapshot_meta; +#endif error = vmm_priv_check(curthread->td_ucred); if (error)
@@ -784,6 +790,15 @@ &topology->threads, &topology->maxcpus); error = 0; break; +#ifdef BHYVE_SNAPSHOT + case VM_SNAPSHOT_REQ: + snapshot_meta = (struct vm_snapshot_meta *)data; + error = vm_snapshot_req(sc->vm, snapshot_meta); + break; + case VM_RESTORE_TIME: + error = vm_restore_time(sc->vm); + break; +#endif default: error = ENOTTY; break;
Index: sys/amd64/vmm/vmm_snapshot.c
===================================================================
--- /dev/null
+++ sys/amd64/vmm/vmm_snapshot.c
@@ -0,0 +1,140 @@
+#include
+#include
+
+#include
+
+/*
+ * Log a failed snapshot transfer of 'bufname', naming the operation in
+ * progress ("save", "restore", or "unknown" for any other 'op').
+ */
+void
+vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op)
+{
+	const char *opstr;
+
+	if (op == VM_SNAPSHOT_SAVE)
+		opstr = "save";
+	else if (op == VM_SNAPSHOT_RESTORE)
+		opstr = "restore";
+	else
+		opstr = "unknown";
+
+	printf("%s: snapshot-%s failed for %s\r\n", __func__, opstr, bufname);
+}
+
+/*
+ * Transfer 'data_size' bytes between 'data' and the snapshot buffer in
+ * 'meta': copyout() to the buffer on save, copyin() from it on restore.
+ * The buffer cursor (buf, buf_rem) advances only after a successful copy.
+ *
+ * Returns 0 on success, E2BIG if fewer than 'data_size' bytes remain,
+ * EINVAL for an unknown operation, or the copyin()/copyout() error.
+ */
+int
+vm_snapshot_buf(volatile void *data, size_t data_size,
+    struct vm_snapshot_meta *meta)
+{
+	struct vm_snapshot_buffer *buffer;
+	int error;
+	int op;
+	void *nv_data;
+
+	nv_data = __DEVOLATILE(void *, data);
+	buffer = &meta->buffer;
+	op = meta->op;
+
+	if (buffer->buf_rem < data_size) {
+		printf("%s: buffer too small\r\n", __func__);
+		return (E2BIG);
+	}
+
+	if (op == VM_SNAPSHOT_SAVE)
+		error = copyout(nv_data, buffer->buf, data_size);
+	else if (op == VM_SNAPSHOT_RESTORE)
+		error = copyin(buffer->buf, nv_data, data_size);
+	else
+		error = EINVAL;
+
+	/* Do not consume buffer space for a copy that did not happen. */
+	if (error != 0)
+		return (error);
+
+	buffer->buf += data_size;
+	buffer->buf_rem -= data_size;
+
+	return (0);
+}
+
+/*
+ * Return buf_size - buf_rem for the snapshot buffer in 'meta', or 0 (after
+ * logging) when buf_rem exceeds buf_size.
+ */
+size_t
+vm_get_snapshot_size(struct vm_snapshot_meta *meta)
+{
+	size_t length;
+	struct vm_snapshot_buffer *buffer;
+
+	buffer = &meta->buffer;
+
+	if (buffer->buf_size < buffer->buf_rem) {
+		printf("%s: Invalid buffer: size = %zu, rem = %zu\r\n",
+		    __func__, buffer->buf_size, buffer->buf_rem);
+		length = 0;
+	} else {
+		length = buffer->buf_size - buffer->buf_rem;
+	}
+
+	return (length);
+}
+
+/*
+ * Save: copy 'data_size' bytes of 'data' out to the snapshot buffer.
+ * Restore: compare 'data' with the snapshot buffer instead of overwriting
+ * it and return the memcmp() result, so non-zero means a mismatch.
+ *
+ * Returns E2BIG if the buffer is too small, EINVAL for an unknown
+ * operation, or the copyout() error on a failed save; the buffer cursor is
+ * not advanced in those cases.
+ */
+int
+vm_snapshot_buf_cmp(volatile void *data, size_t data_size,
+    struct vm_snapshot_meta *meta)
+{
+	struct vm_snapshot_buffer *buffer;
+	int op;
+	int ret;
+	void *nv_data;
+
+	nv_data = __DEVOLATILE(void *, data);
+	buffer = &meta->buffer;
+	op = meta->op;
+
+	if (buffer->buf_rem < data_size) {
+		printf("%s: buffer too small\r\n", __func__);
+		ret = E2BIG;
+		goto done;
+	}
+
+	if (op == VM_SNAPSHOT_SAVE) {
+		ret = copyout(nv_data, buffer->buf, data_size);
+		if (ret != 0)
+			goto done;
+	} else if (op == VM_SNAPSHOT_RESTORE) {
+		/*
+		 * NOTE(review): buffer->buf is what copyout() writes to above,
+		 * which suggests a user-space address; reading it directly
+		 * with memcmp() looks unsafe -- confirm, or copyin() first.
+		 */
+		ret = memcmp(nv_data, buffer->buf, data_size);
+	} else {
+		ret = EINVAL;
+		goto done;
+	}
+
+	buffer->buf += data_size;
+	buffer->buf_rem -= data_size;
+
+done:
+	return (ret);
+}
Index: sys/conf/config.mk
===================================================================
--- sys/conf/config.mk
+++ sys/conf/config.mk
@@ -8,6 +8,10 @@ # the code here when they all produce identical results # (or should) .if !defined(KERNBUILDDIR) +.if ${MK_BHYVE_SNAPSHOT} != "no" +opt_bhyve_snapshot.h: + @echo "#define BHYVE_SNAPSHOT 1" > ${.TARGET} +.endif opt_bpf.h: echo "#define DEV_BPF 1" > ${.TARGET} .if ${MK_INET_SUPPORT} != "no"
@@ -38,6 +42,9 @@ KERN_OPTS=MROUTING IEEE80211_DEBUG \ IEEE80211_SUPPORT_MESH DEV_BPF \ ${KERN_OPTS.${MACHINE}} ${KERN_OPTS_EXTRA} +.if ${MK_BHYVE_SNAPSHOT} != "no" +KERN_OPTS+= BHYVE_SNAPSHOT +.endif .if ${MK_INET_SUPPORT} != "no" KERN_OPTS+= INET TCP_OFFLOAD .endif
Index: sys/conf/kern.opts.mk
===================================================================
--- sys/conf/kern.opts.mk
+++ sys/conf/kern.opts.mk
@@ -49,6 +49,7 @@ ZFS __DEFAULT_NO_OPTIONS = \ + BHYVE_SNAPSHOT \ EXTRA_TCP_STACKS \ KERNEL_RETPOLINE \ OFED \
Index: sys/conf/kmod.mk
===================================================================
--- sys/conf/kmod.mk
+++ sys/conf/kmod.mk
@@ -73,12 +73,7 @@ KMODISLOADED?= /sbin/kldstat -q -n OBJCOPY?= objcopy -.include -# Grab all the options for a kernel build.
For backwards compat, we need to -# do this after bsd.own.mk. -.include "kern.opts.mk" -.include -.include "config.mk" +.include "kmod.opts.mk" # Search for kernel source tree in standard places. .if empty(KERNBUILDDIR) Index: sys/conf/kmod.opts.mk =================================================================== --- /dev/null +++ sys/conf/kmod.opts.mk @@ -0,0 +1,16 @@ +# $FreeBSD$ +# +# Handle options (KERN_OPTS) for kernel module options. This can be included earlier in a kmod Makefile +# to allow KERN_OPTS to control SRCS, etc. + +.if !target(____) +____: + +.include +# Grab all the options for a kernel build. For backwards compat, we need to +# do this after bsd.own.mk. +.include "kern.opts.mk" +.include +.include "config.mk" + +.endif # !target(____) Index: sys/conf/options.amd64 =================================================================== --- sys/conf/options.amd64 +++ sys/conf/options.amd64 @@ -3,6 +3,7 @@ AUTO_EOI_1 opt_auto_eoi.h AUTO_EOI_2 opt_auto_eoi.h +BHYVE_SNAPSHOT COUNT_XINVLTLB_HITS opt_smp.h COUNT_IPIS opt_smp.h MAXMEM Index: sys/modules/vmm/Makefile =================================================================== --- sys/modules/vmm/Makefile +++ sys/modules/vmm/Makefile @@ -1,8 +1,11 @@ # $FreeBSD$ +.include + KMOD= vmm -SRCS= opt_acpi.h opt_ddb.h device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h +SRCS= opt_acpi.h opt_bhyve_snapshot.h opt_ddb.h +SRCS+= device_if.h bus_if.h pci_if.h pcib_if.h acpi_if.h vnode_if.h DPSRCS+= vmx_assym.h svm_assym.h DPSRCS+= vmx_genassym.c svm_genassym.c offset.inc @@ -55,6 +58,10 @@ amdvi_hw.c \ svm_msr.c +.if ${KERN_OPTS:MBHYVE_SNAPSHOT} != "" +SRCS+= vmm_snapshot.c +.endif + CLEANFILES= vmx_assym.h vmx_genassym.o svm_assym.h svm_genassym.o OBJS_DEPEND_GUESS.vmx_support.o+= vmx_assym.h Index: tools/build/options/WITH_BHYVE_SNAPSHOT =================================================================== --- /dev/null +++ tools/build/options/WITH_BHYVE_SNAPSHOT @@ -0,0 +1,7 @@ +.\" $FreeBSD$ +Set to include 
support for save and restore (snapshots) in +.Xr bhyve 8 +and +.Xr bhyvectl 8 . +.Pp +This option only affects amd64/amd64. Index: usr.sbin/bhyve/Makefile =================================================================== --- usr.sbin/bhyve/Makefile +++ usr.sbin/bhyve/Makefile @@ -71,10 +71,17 @@ spinup_ap.c \ iov.c +.if ${MK_BHYVE_SNAPSHOT} != "no" +SRCS+= snapshot.c +.endif + .PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm SRCS+= vmm_instruction_emul.c LIBADD= vmmapi md pthread z util sbuf cam +.if ${MK_BHYVE_SNAPSHOT} != "no" +LIBADD+= ucl xo +.endif .if ${MK_INET_SUPPORT} != "no" CFLAGS+=-DINET @@ -91,6 +98,14 @@ CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/e1000 CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/mii CFLAGS+= -I${BHYVE_SYSDIR}/sys/dev/usb/controller +.if ${MK_BHYVE_SNAPSHOT} != "no" +CFLAGS+= -I${SRCTOP}/contrib/libucl/include + +# Temporary disable capsicum, until we integrate checkpoint code with it. +CFLAGS+= -DWITHOUT_CAPSICUM + +CFLAGS+= -DBHYVE_SNAPSHOT +.endif .ifdef GDB_LOG CFLAGS+=-DGDB_LOG Index: usr.sbin/bhyve/Makefile.depend =================================================================== --- usr.sbin/bhyve/Makefile.depend +++ usr.sbin/bhyve/Makefile.depend @@ -12,8 +12,10 @@ lib/libcompiler_rt \ lib/libmd \ lib/libthr \ + lib/libucl \ lib/libutil \ lib/libvmmapi \ + lib/libxo \ lib/libz \ secure/lib/libcrypto \ Index: usr.sbin/bhyve/atkbdc.h =================================================================== --- usr.sbin/bhyve/atkbdc.h +++ usr.sbin/bhyve/atkbdc.h @@ -30,9 +30,14 @@ #define _ATKBDC_H_ struct atkbdc_softc; +struct vm_snapshot_meta; struct vmctx; void atkbdc_init(struct vmctx *ctx); void atkbdc_event(struct atkbdc_softc *sc, int iskbd); +#ifdef BHYVE_SNAPSHOT +int atkbdc_snapshot(struct vm_snapshot_meta *meta); +#endif + #endif /* _ATKBDC_H_ */ Index: usr.sbin/bhyve/atkbdc.c =================================================================== --- usr.sbin/bhyve/atkbdc.c +++ usr.sbin/bhyve/atkbdc.c @@ -33,6 +33,7 @@ #include #include +#include #include 
@@ -137,6 +138,10 @@ struct aux_dev aux; }; +#ifdef BHYVE_SNAPSHOT +static struct atkbdc_softc *atkbdc_sc = NULL; +#endif + static void atkbdc_assert_kbd_intr(struct atkbdc_softc *sc) { @@ -548,7 +553,48 @@ sc->ps2kbd_sc = ps2kbd_init(sc); sc->ps2mouse_sc = ps2mouse_init(sc); + +#ifdef BHYVE_SNAPSHOT + assert(atkbdc_sc == NULL); + atkbdc_sc = sc; +#endif +} + +#ifdef BHYVE_SNAPSHOT +int +atkbdc_snapshot(struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->outport, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->ram, + sizeof(atkbdc_sc->ram), meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->curcmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->ctrlbyte, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->kbd.buffer, + sizeof(atkbdc_sc->kbd.buffer), meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.brd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bwr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bcnt, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq, meta, ret, done); + + ret = ps2kbd_snapshot(atkbdc_sc->ps2kbd_sc, meta); + if (ret != 0) + goto done; + + ret = ps2mouse_snapshot(atkbdc_sc->ps2mouse_sc, meta); + +done: + return (ret); } +#endif static void atkbdc_dsdt(void) Index: usr.sbin/bhyve/bhyve.8 =================================================================== --- usr.sbin/bhyve/bhyve.8 +++ usr.sbin/bhyve/bhyve.8 @@ -47,6 +47,7 @@ .Op Fl l Ar help|lpcdev Ns Op , Ns Ar conf .Op Fl m Ar memsize Ns Op Ar K|k|M|m|G|g|T|t .Op Fl p Ar vcpu:hostcpu +.Op Fl r Ar file .Op Fl s Ar help|slot,emulation Ns Op , Ns Ar conf .Op Fl G Ar port .Op Fl U Ar uuid @@ -174,6 
+175,21 @@ .Em hostcpu . .It Fl P Force the guest virtual CPU to exit when a PAUSE instruction is detected. +.It Fl r Ar file +Resume a guest from a snapshot. +The guest memory contents are restored from +.Ar file , +and the guest device and vCPU state are restored from the file +.Dq Ar file Ns .kern . +.Pp +Note that the current snapshot file format requires that the configuration of +devices in the new VM match the VM from which the snapshot was taken by specifying the +same +.Op Fl s +and +.Op Fl l +options. +The count of vCPUs and memory configuration are read from the snapshot. .It Fl s Op Ar help|slot,emulation Ns Op , Ns Ar conf Configure a virtual PCI slot and function. .Pp Index: usr.sbin/bhyve/bhyverun.h =================================================================== --- usr.sbin/bhyve/bhyverun.h +++ usr.sbin/bhyve/bhyverun.h @@ -38,9 +38,12 @@ extern int guest_ncpus; extern uint16_t cores, sockets, threads; extern char *guest_uuid_str; -extern char *vmname; +extern const char *vmname; void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); +#ifdef BHYVE_SNAPSHOT +uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr); +#endif void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu); void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip); Index: usr.sbin/bhyve/bhyverun.c =================================================================== --- usr.sbin/bhyve/bhyverun.c +++ usr.sbin/bhyve/bhyverun.c @@ -36,7 +36,14 @@ #include #endif #include +#ifdef BHYVE_SNAPSHOT +#include +#include +#endif #include +#ifdef BHYVE_SNAPSHOT +#include +#endif #include @@ -51,6 +58,9 @@ #include #include #include +#ifdef BHYVE_SNAPSHOT +#include +#endif #include #include #include @@ -59,6 +69,12 @@ #include #include #include +#ifdef BHYVE_SNAPSHOT +#include +#include + +#include +#endif #include #ifndef WITHOUT_CAPSICUM @@ -81,6 +97,9 @@ #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" +#ifdef BHYVE_SNAPSHOT 
+#include "snapshot.h" +#endif #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" @@ -160,7 +179,7 @@ typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); -char *vmname; +const char *vmname; int guest_ncpus; uint16_t cores, maxcpus, sockets, threads; @@ -223,6 +242,9 @@ " -H: vmexit from the guest on hlt\n" " -l: LPC device configuration\n" " -m: memory size in MB\n" +#ifdef BHYVE_SNAPSHOT + " -r: path to checkpoint file\n" +#endif " -p: pin 'vcpu' to 'hostcpu'\n" " -P: vmexit from the guest on pause\n" " -s: PCI slot config\n" @@ -382,6 +404,14 @@ return (vm_map_gpa(ctx, gaddr, len)); } +#ifdef BHYVE_SNAPSHOT +uintptr_t +paddr_host2guest(struct vmctx *ctx, void *addr) +{ + return (vm_rev_map_gpa(ctx, addr)); +} +#endif + int fbsdrun_vmexit_on_pause(void) { @@ -416,6 +446,9 @@ snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_add(vcpu); +#endif gdb_cpu_add(vcpu); vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); @@ -690,7 +723,13 @@ stats.vmexit_mtrap++; +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_suspend(*pvcpu); +#endif gdb_cpu_mtrap(*pvcpu); +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_resume(*pvcpu); +#endif return (VMEXIT_CONTINUE); } @@ -770,7 +809,13 @@ vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_suspend(*pvcpu); +#endif gdb_cpu_suspend(*pvcpu); +#ifdef BHYVE_SNAPSHOT + checkpoint_cpu_resume(*pvcpu); +#endif return (VMEXIT_CONTINUE); } @@ -972,6 +1017,22 @@ return (ctx); } +void +spinup_vcpu(struct vmctx *ctx, int vcpu) +{ + int error; + uint64_t rip; + + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); + assert(error == 0); + + fbsdrun_set_capabilities(ctx, vcpu); + error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); + assert(error == 0); + + fbsdrun_addcpu(ctx, BSP, vcpu, rip); +} + int 
main(int argc, char *argv[]) { @@ -983,6 +1044,13 @@ uint64_t rip; size_t memsize; char *optstr; +#ifdef BHYVE_SNAPSHOT + char *restore_file; + struct restore_state rstate; + int vcpu; + + restore_file = NULL; +#endif bvmcons = 0; progname = basename(argv[0]); @@ -997,7 +1065,11 @@ rtc_localtime = 1; memflags = 0; +#ifdef BHYVE_SNAPSHOT + optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:r:"; +#else optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:"; +#endif while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': @@ -1043,6 +1115,11 @@ "configuration '%s'", optarg); } break; +#ifdef BHYVE_SNAPSHOT + case 'r': + restore_file = optarg; + break; +#endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); @@ -1104,12 +1181,50 @@ argc -= optind; argv += optind; +#ifdef BHYVE_SNAPSHOT + if (argc > 1 || (argc == 0 && restore_file == NULL)) + usage(1); + + if (restore_file != NULL) { + error = load_restore_file(restore_file, &rstate); + if (error) { + fprintf(stderr, "Failed to read checkpoint info from " + "file: '%s'.\n", restore_file); + exit(1); + } + } + + if (argc == 1) { + vmname = argv[0]; + } else { + vmname = lookup_vmname(&rstate); + if (vmname == NULL) { + fprintf(stderr, "Cannot find VM name in restore file. 
" + "Please specify one.\n"); + exit(1); + } + } +#else if (argc != 1) usage(1); vmname = argv[0]; +#endif ctx = do_open(vmname); +#ifdef BHYVE_SNAPSHOT + if (restore_file != NULL) { + guest_ncpus = lookup_guest_ncpus(&rstate); + memflags = lookup_memflags(&rstate); + memsize = lookup_memsize(&rstate); + } + + if (guest_ncpus < 1) { + fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); + exit(1); + } +#endif + max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", @@ -1168,6 +1283,40 @@ assert(error == 0); } +#ifdef BHYVE_SNAPSHOT + if (restore_file != NULL) { + fprintf(stdout, "Pausing pci devs...\r\n"); + if (vm_pause_user_devs(ctx) != 0) { + fprintf(stderr, "Failed to pause PCI device state.\n"); + exit(1); + } + + fprintf(stdout, "Restoring vm mem...\r\n"); + if (restore_vm_mem(ctx, &rstate) != 0) { + fprintf(stderr, "Failed to restore VM memory.\n"); + exit(1); + } + + fprintf(stdout, "Restoring pci devs...\r\n"); + if (vm_restore_user_devs(ctx, &rstate) != 0) { + fprintf(stderr, "Failed to restore PCI device state.\n"); + exit(1); + } + + fprintf(stdout, "Restoring kernel structs...\r\n"); + if (vm_restore_kern_structs(ctx, &rstate) != 0) { + fprintf(stderr, "Failed to restore kernel structs.\n"); + exit(1); + } + + fprintf(stdout, "Resuming pci devs...\r\n"); + if (vm_resume_user_devs(ctx) != 0) { + fprintf(stderr, "Failed to resume PCI device state.\n"); + exit(1); + } + } +#endif + error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); @@ -1208,11 +1357,41 @@ errx(EX_OSERR, "cap_enter() failed"); #endif +#ifdef BHYVE_SNAPSHOT + if (restore_file != NULL) + destroy_restore_state(&rstate); + + /* + * checkpointing thread for communication with bhyvectl + */ + if (init_checkpoint_thread(ctx) < 0) + printf("Failed to start checkpoint thread!\r\n"); + + if (restore_file != NULL) + vm_restore_time(ctx); +#endif + /* * Add CPU 0 */ fbsdrun_addcpu(ctx, BSP, 
BSP, rip); +#ifdef BHYVE_SNAPSHOT + /* + * If we restore a VM, start all vCPUs now (including APs), otherwise, + * let the guest OS to spin them up later via vmexits. + */ + if (restore_file != NULL) { + for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { + if (vcpu == BSP) + continue; + + fprintf(stdout, "spinning up vcpu no %d...\r\n", vcpu); + spinup_vcpu(ctx, vcpu); + } + } +#endif + /* * Head off to the main event dispatch loop */ Index: usr.sbin/bhyve/block_if.h =================================================================== --- usr.sbin/bhyve/block_if.h +++ usr.sbin/bhyve/block_if.h @@ -41,6 +41,9 @@ #include #include +struct vm_snapshot_meta; + + /* * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in * a single request. BLOCKIF_RING_MAX is the maxmimum number of @@ -74,5 +77,13 @@ int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_close(struct blockif_ctxt *bc); +#ifdef BHYVE_SNAPSHOT +void blockif_pause(struct blockif_ctxt *bc); +void blockif_resume(struct blockif_ctxt *bc); +int blockif_snapshot_req(struct blockif_req *br, + struct vm_snapshot_meta *meta); +int blockif_snapshot(struct blockif_ctxt *bc, + struct vm_snapshot_meta *meta); +#endif #endif /* _BLOCK_IF_H_ */ Index: usr.sbin/bhyve/block_if.c =================================================================== --- usr.sbin/bhyve/block_if.c +++ usr.sbin/bhyve/block_if.c @@ -57,6 +57,7 @@ #include #include +#include #include "bhyverun.h" #include "mevent.h" @@ -103,9 +104,13 @@ int bc_psectsz; int bc_psectoff; int bc_closing; + int bc_paused; + int bc_work_count; pthread_t bc_btid[BLOCKIF_NUMTHR]; pthread_mutex_t bc_mtx; pthread_cond_t bc_cond; + pthread_cond_t bc_paused_cond; + pthread_cond_t bc_work_done_cond; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; @@ -208,6 +213,18 @@ TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); } +static int 
+blockif_flush_bc(struct blockif_ctxt *bc) +{ + if (bc->bc_ischr) { + if (ioctl(bc->bc_fd, DIOCGFLUSH)) + return (errno); + } else if (fsync(bc->bc_fd)) + return (errno); + + return (0); +} + static void blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { @@ -298,11 +315,7 @@ } break; case BOP_FLUSH: - if (bc->bc_ischr) { - if (ioctl(bc->bc_fd, DIOCGFLUSH)) - err = errno; - } else if (fsync(bc->bc_fd)) - err = errno; + err = blockif_flush_bc(bc); break; case BOP_DELETE: if (!bc->bc_candelete) @@ -346,15 +359,30 @@ pthread_mutex_lock(&bc->bc_mtx); for (;;) { - while (blockif_dequeue(bc, t, &be)) { + bc->bc_work_count++; + + /* We cannot process work if the interface is paused */ + while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) { pthread_mutex_unlock(&bc->bc_mtx); blockif_proc(bc, be, buf); pthread_mutex_lock(&bc->bc_mtx); blockif_complete(bc, be); } + + bc->bc_work_count--; + + /* If none of the workers are busy, notify the main thread */ + if (bc->bc_work_count == 0) + pthread_cond_broadcast(&bc->bc_work_done_cond); + /* Check ctxt status here to see if exit requested */ if (bc->bc_closing) break; + + /* Make all worker threads wait here if the device is paused */ + while (bc->bc_paused) + pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx); + pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); } pthread_mutex_unlock(&bc->bc_mtx); @@ -558,6 +586,10 @@ bc->bc_psectoff = psectoff; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); + bc->bc_paused = 0; + bc->bc_work_count = 0; + pthread_cond_init(&bc->bc_paused_cond, NULL); + pthread_cond_init(&bc->bc_work_done_cond, NULL); TAILQ_INIT(&bc->bc_freeq); TAILQ_INIT(&bc->bc_pendq); TAILQ_INIT(&bc->bc_busyq); @@ -650,6 +682,8 @@ assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); + /* XXX: not waiting while paused */ + /* * Check pending requests. 
*/ @@ -848,3 +882,100 @@ assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_candelete); } + +#ifdef BHYVE_SNAPSHOT +void +blockif_pause(struct blockif_ctxt *bc) +{ + assert(bc != NULL); + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_paused = 1; + + /* The interface is paused. Wait for workers to finish their work */ + while (bc->bc_work_count) + pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx); + pthread_mutex_unlock(&bc->bc_mtx); + + if (blockif_flush_bc(bc)) + fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n", + __func__); +} + +void +blockif_resume(struct blockif_ctxt *bc) +{ + assert(bc != NULL); + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_paused = 0; + /* resume the threads waiting for paused */ + pthread_cond_broadcast(&bc->bc_paused_cond); + /* kick the threads after restore */ + pthread_cond_broadcast(&bc->bc_cond); + pthread_mutex_unlock(&bc->bc_mtx); +} + +int +blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta) +{ + int i; + struct iovec *iov; + int ret; + + SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done); + + /* + * XXX: The callback and parameter must be filled by the virtualized + * device that uses the interface, during its init; we're not touching + * them here. + */ + + /* Snapshot the iovecs. */ + for (i = 0; i < br->br_iovcnt; i++) { + iov = &br->br_iov[i]; + + SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done); + + /* We assume the iov is a guest-mapped address. 
*/ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len, + false, meta, ret, done); + } + +done: + return (ret); +} + +int +blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta) +{ + int ret; + + if (bc->bc_paused == 0) { + fprintf(stderr, "%s: Snapshot failed: " + "interface not paused.\r\n", __func__); + return (ENXIO); + } + + pthread_mutex_lock(&bc->bc_mtx); + + SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done); + +done: + pthread_mutex_unlock(&bc->bc_mtx); + return (ret); +} +#endif Index: usr.sbin/bhyve/mevent.c =================================================================== --- usr.sbin/bhyve/mevent.c +++ usr.sbin/bhyve/mevent.c @@ -68,7 +68,7 @@ #define MEV_DEL_PENDING 4 #define MEV_ADD_DISABLED 5 -extern char *vmname; +extern const char *vmname; static pthread_t mevent_tid; static int mevent_timid = 43; Index: usr.sbin/bhyve/pci_ahci.c =================================================================== --- usr.sbin/bhyve/pci_ahci.c +++ usr.sbin/bhyve/pci_ahci.c @@ -41,6 +41,8 @@ #include #include +#include + #include #include #include @@ -131,6 +133,7 @@ uint32_t done; int slot; int more; + int readop; }; struct ahci_port { @@ -724,6 +727,7 @@ aior->slot = slot; aior->len = len; aior->done = done; + aior->readop = readop; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); @@ -1420,6 +1424,7 @@ aior->slot = slot; aior->len = len; aior->done = done; + aior->readop = 1; 
breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl);
@@ -2446,6 +2451,311 @@ return (pci_ahci_init(ctx, pi, opts, 1)); }
+#ifdef BHYVE_SNAPSHOT
+/*
+ * Write the free and busy request queues of 'port' to the snapshot as two
+ * lists of indices into port->ioreq, each terminated by -1.  Every busy
+ * entry is followed by its blockif request state.
+ */
+static int
+pci_ahci_snapshot_save_queues(struct ahci_port *port,
+    struct vm_snapshot_meta *meta)
+{
+	int ret;
+	int idx;
+	struct ahci_ioreq *ioreq;
+
+	STAILQ_FOREACH(ioreq, &port->iofhd, io_flist) {
+		idx = ioreq - port->ioreq;
+		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+	}
+
+	idx = -1;
+	SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+
+	TAILQ_FOREACH(ioreq, &port->iobhd, io_blist) {
+		idx = ioreq - port->ioreq;
+		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+
+		/*
+		 * Snapshot only the busy requests; other requests are
+		 * not valid.
+		 */
+		ret = blockif_snapshot_req(&ioreq->io_req, meta);
+		if (ret != 0) {
+			fprintf(stderr, "%s: failed to snapshot req\r\n",
+			    __func__);
+			goto done;
+		}
+	}
+
+	idx = -1;
+	SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+
+done:
+	return (ret);
+}
+
+/*
+ * Rebuild the free and busy request queues of 'port' from the index lists
+ * written by pci_ahci_snapshot_save_queues() and resubmit every busy
+ * request to the block interface.
+ *
+ * Each list ends at an index of -1.  Any other index comes straight from
+ * the snapshot, so it is rejected with EINVAL unless it lies within
+ * [0, port->ioqsz) before being used to index port->ioreq.
+ */
+static int
+pci_ahci_snapshot_restore_queues(struct ahci_port *port,
+    struct vm_snapshot_meta *meta)
+{
+	int ret;
+	int idx;
+	struct ahci_ioreq *ioreq;
+
+	/* Empty the free queue before restoring. */
+	while (!STAILQ_EMPTY(&port->iofhd))
+		STAILQ_REMOVE_HEAD(&port->iofhd, io_flist);
+
+	/* Restore the free queue. */
+	while (1) {
+		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+		if (idx == -1)
+			break;
+
+		if (idx < 0 || idx >= port->ioqsz) {
+			fprintf(stderr, "%s: bad free-queue index %d\r\n",
+			    __func__, idx);
+			ret = EINVAL;
+			goto done;
+		}
+
+		STAILQ_INSERT_TAIL(&port->iofhd, &port->ioreq[idx], io_flist);
+	}
+
+	/* Restore the busy queue. */
+	while (1) {
+		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
+		if (idx == -1)
+			break;
+
+		if (idx < 0 || idx >= port->ioqsz) {
+			fprintf(stderr, "%s: bad busy-queue index %d\r\n",
+			    __func__, idx);
+			ret = EINVAL;
+			goto done;
+		}
+
+		ioreq = &port->ioreq[idx];
+		TAILQ_INSERT_TAIL(&port->iobhd, ioreq, io_blist);
+
+		/*
+		 * Restore only the busy requests; other requests are
+		 * not valid.
+		 */
+		ret = blockif_snapshot_req(&ioreq->io_req, meta);
+		if (ret != 0) {
+			fprintf(stderr, "%s: failed to restore request\r\n",
+			    __func__);
+			goto done;
+		}
+
+		/* Re-enqueue the requests in the block interface. */
+		if (ioreq->readop)
+			ret = blockif_read(port->bctx, &ioreq->io_req);
+		else
+			ret = blockif_write(port->bctx, &ioreq->io_req);
+
+		if (ret != 0) {
+			fprintf(stderr,
+			    "%s: failed to re-enqueue request\r\n",
+			    __func__);
+			goto done;
+		}
+	}
+
+done:
+	return (ret);
+}
+
+static int
+pci_ahci_snapshot(struct vm_snapshot_meta *meta)
+{
+	int i, j, ret;
+	void *bctx;
+	struct pci_devinst *pi;
+	struct pci_ahci_softc *sc;
+	struct ahci_port *port;
+	struct ahci_cmd_hdr *hdr;
+	struct ahci_ioreq *ioreq;
+
+	pi = meta->dev_data;
+	sc = pi->pi_arg;
+
+	/* TODO: add mtx lock/unlock */
+
+	SNAPSHOT_VAR_OR_LEAVE(sc->ports, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->cap, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->ghc, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->is, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->pi, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->vs, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->ccc_ctl, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->ccc_pts, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->em_loc, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->em_ctl, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->cap2, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->bohc, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->lintr, meta, ret, done);
+
+	for (i = 0; i < MAX_PORTS; i++) {
+		port = &sc->port[i];
+
+		if (meta->op == VM_SNAPSHOT_SAVE)
+			bctx = port->bctx;
+
+		SNAPSHOT_VAR_OR_LEAVE(bctx, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(port->port, meta, ret, done);
+
+		/* Mostly for restore; save is ensured by the lines above.
*/ + if (((bctx == NULL) && (port->bctx != NULL)) || + ((bctx != NULL) && (port->bctx == NULL))) { + fprintf(stderr, "%s: ports not matching\r\n", __func__); + ret = EINVAL; + goto done; + } + + if (port->bctx == NULL) + continue; + + if (port->port != i) { + fprintf(stderr, "%s: ports not matching: " + "actual: %d expected: %d\r\n", + __func__, port->port, i); + ret = EINVAL; + goto done; + } + + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->cmd_lst, + AHCI_CL_SIZE * AHCI_MAX_SLOTS, false, meta, ret, done); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->rfis, 256, false, meta, + ret, done); + + SNAPSHOT_VAR_OR_LEAVE(port->ident, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->atapi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->reset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->waitforclear, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->mult_sectors, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->xfermode, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->err_cfis, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sense_key, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->asc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ccs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->pending, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(port->clb, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->clbu, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->fb, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->fbu, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->is, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ie, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->cmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->unused0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->tfd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sig, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ssts, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->serr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sact, meta, ret, done); + 
SNAPSHOT_VAR_OR_LEAVE(port->ci, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sntf, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done); + + for (j = 0; j < port->ioqsz; j++) { + ioreq = &port->ioreq[j]; + + /* blockif_req snapshot done only for busy requests. */ + hdr = (struct ahci_cmd_hdr *)(port->cmd_lst + + ioreq->slot * AHCI_CL_SIZE); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ioreq->cfis, + 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry), + false, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(ioreq->len, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->done, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->slot, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->more, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->readop, meta, ret, done); + } + + /* Perform save / restore specific operations. */ + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = pci_ahci_snapshot_save_queues(port, meta); + if (ret != 0) + goto done; + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + ret = pci_ahci_snapshot_restore_queues(port, meta); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + + ret = blockif_snapshot(port->bctx, meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to restore blockif\r\n", + __func__); + goto done; + } + } + +done: + return (ret); +} + +static int +pci_ahci_pause(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct pci_ahci_softc *sc; + struct blockif_ctxt *bctxt; + int i; + + sc = pi->pi_arg; + + for (i = 0; i < MAX_PORTS; i++) { + bctxt = sc->port[i].bctx; + if (bctxt == NULL) + continue; + + blockif_pause(bctxt); + } + + return (0); +} + +static int +pci_ahci_resume(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct pci_ahci_softc *sc; + struct blockif_ctxt *bctxt; + int i; + + sc = pi->pi_arg; + + for (i = 0; i < MAX_PORTS; i++) { + bctxt = sc->port[i].bctx; + if (bctxt == NULL) + continue; + + blockif_resume(bctxt); + } + + return (0); +} +#endif + /* 
* Use separate emulation names to distinguish drive and atapi devices */ @@ -2453,7 +2735,12 @@ .pe_emu = "ahci", .pe_init = pci_ahci_hd_init, .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read + .pe_barread = pci_ahci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_ahci_snapshot, + .pe_pause = pci_ahci_pause, + .pe_resume = pci_ahci_resume, +#endif }; PCI_EMUL_SET(pci_de_ahci); @@ -2461,7 +2748,12 @@ .pe_emu = "ahci-hd", .pe_init = pci_ahci_hd_init, .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read + .pe_barread = pci_ahci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_ahci_snapshot, + .pe_pause = pci_ahci_pause, + .pe_resume = pci_ahci_resume, +#endif }; PCI_EMUL_SET(pci_de_ahci_hd); @@ -2469,6 +2761,11 @@ .pe_emu = "ahci-cd", .pe_init = pci_ahci_atapi_init, .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read + .pe_barread = pci_ahci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_ahci_snapshot, + .pe_pause = pci_ahci_pause, + .pe_resume = pci_ahci_resume, +#endif }; PCI_EMUL_SET(pci_de_ahci_cd); Index: usr.sbin/bhyve/pci_e82545.c =================================================================== --- usr.sbin/bhyve/pci_e82545.c +++ usr.sbin/bhyve/pci_e82545.c @@ -46,6 +46,8 @@ #ifndef WITHOUT_CAPSICUM #include #endif +#include + #include #include #include @@ -2357,11 +2359,168 @@ return (0); } +#ifdef BHYVE_SNAPSHOT +static int +e82545_snapshot(struct vm_snapshot_meta *meta) +{ + int i; + int ret; + struct e82545_softc *sc; + struct pci_devinst *pi; + uint64_t bitmap_value; + + pi = meta->dev_data; + sc = pi->pi_arg; + + /* esc_mevp and esc_mevpitr should be reinitiated at init. 
*/ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_mac, meta, ret, done); + + /* General */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_CTRL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_VET, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCTTV, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_LEDCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_PBA, meta, ret, done); + + /* Interrupt control */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_irq_asserted, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICR, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_ITR, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICS, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMS, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMC, meta, ret, done); + + /* + * Transmit + * + * The fields in the unions are in superposition to access certain + * bytes in the larger uint variables. 
+ * e.g., ip_config = [ipcss|ipcso|ipcse0|ipcse1] + */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.lower_setup.ip_config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.upper_setup.tcp_config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.cmd_and_length, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.tcp_seg_setup.data, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXCW, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIPG, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_AIT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_tdba, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDLEN, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDHr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIDV, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXDCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TADV, meta, ret, done); + + /* Has dependency on esc_TDLEN; reoreder of fields from struct. 
*/ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_txdesc, sc->esc_TDLEN, + true, meta, ret, done); + + /* L2 frame acceptance */ + for (i = 0; i < nitems(sc->esc_uni); i++) { + SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_valid, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_addrsel, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_eth, meta, ret, done); + } + + SNAPSHOT_BUF_OR_LEAVE(sc->esc_fmcast, sizeof(sc->esc_fmcast), + meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->esc_fvlan, sizeof(sc->esc_fvlan), + meta, ret, done); + + /* Receive */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_loopback, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rdba, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDLEN, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDTR, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXDCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RADV, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RSRPD, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXCSUM, meta, ret, done); + + /* Has dependency on esc_RDLEN; reoreder of fields from struct. 
*/ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_rxdesc, sc->esc_RDLEN, + true, meta, ret, done); + + /* IO Port register access */ + SNAPSHOT_VAR_OR_LEAVE(sc->io_addr, meta, ret, done); + + /* Shadow copy of MDIC */ + SNAPSHOT_VAR_OR_LEAVE(sc->mdi_control, meta, ret, done); + + /* Shadow copy of EECD */ + SNAPSHOT_VAR_OR_LEAVE(sc->eeprom_control, meta, ret, done); + + /* Latest NVM in/out */ + SNAPSHOT_VAR_OR_LEAVE(sc->nvm_data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->nvm_opaddr, meta, ret, done); + + /* Stats */ + SNAPSHOT_VAR_OR_LEAVE(sc->missed_pkt_count, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->pkt_rx_by_size, sizeof(sc->pkt_rx_by_size), + meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->pkt_tx_by_size, sizeof(sc->pkt_tx_by_size), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->oversize_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->tso_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_rx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_tx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->missed_octets, meta, ret, done); + + /* + * The NVM state fields are staged through bitmap_value rather than + * snapshotted in place. + * NOTE(review): this first slot previously saved nvm_bits twice; + * presumably nvm_mode was intended - confirm against + * struct e82545_softc. + */ + if (meta->op == VM_SNAPSHOT_SAVE) + bitmap_value = sc->nvm_mode; + SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + sc->nvm_mode = bitmap_value; + + if (meta->op == VM_SNAPSHOT_SAVE) + bitmap_value = sc->nvm_bits; + SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + sc->nvm_bits = bitmap_value; + + /* EEPROM data */ + SNAPSHOT_BUF_OR_LEAVE(sc->eeprom_data, sizeof(sc->eeprom_data), + meta, ret, done); + +done: + return (ret); 
+} +#endif + struct pci_devemu pci_de_e82545 = { .pe_emu = "e1000", .pe_init = e82545_init, .pe_barwrite = e82545_write, - .pe_barread = e82545_read + .pe_barread = e82545_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = e82545_snapshot, +#endif }; PCI_EMUL_SET(pci_de_e82545); Index: usr.sbin/bhyve/pci_emul.h =================================================================== --- usr.sbin/bhyve/pci_emul.h +++ usr.sbin/bhyve/pci_emul.h @@ -45,6 +45,7 @@ struct vmctx; struct pci_devinst; struct memory_region; +struct vm_snapshot_meta; struct pci_devemu { char *pe_emu; /* Name of device emulation */ @@ -71,6 +72,11 @@ uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); + + /* Save/restore device state */ + int (*pe_snapshot)(struct vm_snapshot_meta *meta); + int (*pe_pause)(struct vmctx *ctx, struct pci_devinst *pi); + int (*pe_resume)(struct vmctx *ctx, struct pci_devinst *pi); }; #define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); @@ -246,6 +252,11 @@ void pci_write_dsdt(void); uint64_t pci_ecfg_base(void); int pci_bus_configured(int bus); +#ifdef BHYVE_SNAPSHOT +int pci_snapshot(struct vm_snapshot_meta *meta); +int pci_pause(struct vmctx *ctx, const char *dev_name); +int pci_resume(struct vmctx *ctx, const char *dev_name); +#endif static __inline void pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) Index: usr.sbin/bhyve/pci_emul.c =================================================================== --- usr.sbin/bhyve/pci_emul.c +++ usr.sbin/bhyve/pci_emul.c @@ -45,6 +45,7 @@ #include #include +#include #include #include "acpi.h" @@ -1962,6 +1963,191 @@ INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); +#ifdef BHYVE_SNAPSHOT +/* + * Saves/restores PCI device emulated state. Returns 0 on success. 
+ */ +static int +pci_snapshot_pci_dev(struct vm_snapshot_meta *meta) +{ + struct pci_devinst *pi; + int i; + int ret; + + pi = meta->dev_data; + + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_page_offset, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata), + meta, ret, done); + + for (i = 0; i < nitems(pi->pi_bar); i++) { + SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done); + } + + /* Restore MSI-X table. 
*/ + for (i = 0; i < pi->pi_msix.table_count; i++) { + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control, + meta, ret, done); + } + +done: + return (ret); +} + +static int +pci_find_slotted_dev(const char *dev_name, struct pci_devemu **pde, + struct pci_devinst **pdi) +{ + struct businfo *bi; + struct slotinfo *si; + struct funcinfo *fi; + int bus, slot, func; + + assert(dev_name != NULL); + assert(pde != NULL); + assert(pdi != NULL); + + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_name == NULL) + continue; + if (strcmp(dev_name, fi->fi_name)) + continue; + + *pde = pci_emul_finddev(fi->fi_name); + assert(*pde != NULL); + + *pdi = fi->fi_devi; + return (0); + } + } + } + + return (EINVAL); +} + +int +pci_snapshot(struct vm_snapshot_meta *meta) +{ + struct pci_devemu *pde; + struct pci_devinst *pdi; + int ret; + + assert(meta->dev_name != NULL); + + ret = pci_find_slotted_dev(meta->dev_name, &pde, &pdi); + if (ret != 0) { + fprintf(stderr, "%s: no such name: %s\r\n", + __func__, meta->dev_name); + memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); + return (0); + } + + meta->dev_data = pdi; + + if (pde->pe_snapshot == NULL) { + fprintf(stderr, "%s: not implemented yet for: %s\r\n", + __func__, meta->dev_name); + return (-1); + } + + ret = pci_snapshot_pci_dev(meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to snapshot pci dev\r\n", + __func__); + return (-1); + } + + ret = (*pde->pe_snapshot)(meta); + + return (ret); +} + +int +pci_pause(struct vmctx *ctx, const char *dev_name) +{ + struct pci_devemu *pde; + struct pci_devinst *pdi; + int ret; + + assert(dev_name != NULL); + + ret = 
pci_find_slotted_dev(dev_name, &pde, &pdi); + if (ret != 0) { + /* + * It is possible to call this function without + * checking that the device is inserted first. + */ + fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); + return (0); + } + + if (pde->pe_pause == NULL) { + /* The pause/resume functionality is optional. */ + fprintf(stderr, "%s: not implemented for: %s\n", + __func__, dev_name); + return (0); + } + + return (*pde->pe_pause)(ctx, pdi); +} + +int +pci_resume(struct vmctx *ctx, const char *dev_name) +{ + struct pci_devemu *pde; + struct pci_devinst *pdi; + int ret; + + assert(dev_name != NULL); + + ret = pci_find_slotted_dev(dev_name, &pde, &pdi); + if (ret != 0) { + /* + * It is possible to call this function without + * checking that the device is inserted first. + */ + fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); + return (0); + } + + if (pde->pe_resume == NULL) { + /* The pause/resume functionality is optional. */ + fprintf(stderr, "%s: not implemented for: %s\n", + __func__, dev_name); + return (0); + } + + return (*pde->pe_resume)(ctx, pdi); +} +#endif + #define PCI_EMUL_TEST #ifdef PCI_EMUL_TEST /* @@ -1970,7 +2156,7 @@ #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { - uint8_t ioregs[DIOSZ]; + uint8_t ioregs[DIOSZ]; uint8_t memregs[2][DMEMSZ]; }; @@ -2062,7 +2248,7 @@ } else { printf("diow: memw unknown size %d\n", size); } - + /* * magic interrupt ?? 
*/ @@ -2087,7 +2273,7 @@ offset, size); return (0); } - + value = 0; if (size == 1) { value = sc->ioregs[offset]; @@ -2106,7 +2292,7 @@ offset, size); return (0); } - + i = baridx - 1; /* 'memregs' index */ if (size == 1) { @@ -2131,11 +2317,23 @@ return (value); } +#ifdef BHYVE_SNAPSHOT +int +pci_emul_snapshot(struct vm_snapshot_meta *meta) +{ + + return (0); +} +#endif + struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, .pe_barwrite = pci_emul_diow, - .pe_barread = pci_emul_dior + .pe_barread = pci_emul_dior, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_emul_snapshot, +#endif }; PCI_EMUL_SET(pci_dummy); Index: usr.sbin/bhyve/pci_fbuf.c =================================================================== --- usr.sbin/bhyve/pci_fbuf.c +++ usr.sbin/bhyve/pci_fbuf.c @@ -35,6 +35,7 @@ #include #include +#include #include #include @@ -439,10 +440,26 @@ return (error); } +#ifdef BHYVE_SNAPSHOT +static int +pci_fbuf_snapshot(struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_BUF_OR_LEAVE(fbuf_sc->fb_base, FB_SIZE, meta, ret, err); + +err: + return (ret); +} +#endif + struct pci_devemu pci_fbuf = { .pe_emu = "fbuf", .pe_init = pci_fbuf_init, .pe_barwrite = pci_fbuf_write, - .pe_barread = pci_fbuf_read + .pe_barread = pci_fbuf_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_fbuf_snapshot, +#endif }; PCI_EMUL_SET(pci_fbuf); Index: usr.sbin/bhyve/pci_lpc.c =================================================================== --- usr.sbin/bhyve/pci_lpc.c +++ usr.sbin/bhyve/pci_lpc.c @@ -34,6 +34,7 @@ #include #include +#include #include #include @@ -451,12 +452,35 @@ pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5)); } +#ifdef BHYVE_SNAPSHOT +static int +pci_lpc_snapshot(struct vm_snapshot_meta *meta) +{ + int unit, ret; + struct uart_softc *sc; + + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = lpc_uart_softc[unit].uart_softc; + + ret = uart_snapshot(sc, meta); + if (ret != 0) + goto done; + } + +done: + return (ret); +} 
+#endif + struct pci_devemu pci_de_lpc = { .pe_emu = "lpc", .pe_init = pci_lpc_init, .pe_write_dsdt = pci_lpc_write_dsdt, .pe_cfgwrite = pci_lpc_cfgwrite, .pe_barwrite = pci_lpc_write, - .pe_barread = pci_lpc_read + .pe_barread = pci_lpc_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_lpc_snapshot, +#endif }; PCI_EMUL_SET(pci_de_lpc); Index: usr.sbin/bhyve/pci_virtio_block.c =================================================================== --- usr.sbin/bhyve/pci_virtio_block.c +++ usr.sbin/bhyve/pci_virtio_block.c @@ -39,6 +39,8 @@ #include #include +#include + #include #include #include @@ -150,6 +152,11 @@ static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); +#ifdef BHYVE_SNAPSHOT +static void pci_vtblk_pause(void *); +static void pci_vtblk_resume(void *); +static int pci_vtblk_snapshot(void *, struct vm_snapshot_meta *); +#endif static struct virtio_consts vtblk_vi_consts = { "vtblk", /* our name */ @@ -161,6 +168,11 @@ pci_vtblk_cfgwrite, /* write PCI config */ NULL, /* apply negotiated features */ VTBLK_S_HOSTCAPS, /* our capabilities */ +#ifdef BHYVE_SNAPSHOT + pci_vtblk_pause, /* pause blockif threads */ + pci_vtblk_resume, /* resume blockif threads */ + pci_vtblk_snapshot, /* save / restore device state */ +#endif }; static void @@ -172,6 +184,40 @@ vi_reset_dev(&sc->vbsc_vs); } +#ifdef BHYVE_SNAPSHOT +static void +pci_vtblk_pause(void *vsc) +{ + struct pci_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device pause requested !\n")); + blockif_pause(sc->bc); +} + +static void +pci_vtblk_resume(void *vsc) +{ + struct pci_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device resume requested !\n")); + blockif_resume(sc->bc); +} + +static int +pci_vtblk_snapshot(void *vsc, struct vm_snapshot_meta *meta) +{ + int ret; + struct pci_vtblk_softc *sc = vsc; + + SNAPSHOT_VAR_OR_LEAVE(sc->vbsc_cfg, meta, ret, done); + 
SNAPSHOT_BUF_OR_LEAVE(sc->vbsc_ident, sizeof(sc->vbsc_ident), + meta, ret, done); + +done: + return (ret); +} +#endif + static void pci_vtblk_done(struct blockif_req *br, int err) { @@ -419,6 +465,15 @@ .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, .pe_barwrite = vi_pci_write, - .pe_barread = vi_pci_read + .pe_barread = vi_pci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = vi_pci_snapshot, + /* + * pci_pause()/pci_resume() skip devices without these hooks; + * wire them up the same way virtio-net does. + */ + .pe_pause = vi_pci_pause, + .pe_resume = vi_pci_resume, +#endif }; PCI_EMUL_SET(pci_de_vblk); Index: usr.sbin/bhyve/pci_virtio_net.c =================================================================== --- usr.sbin/bhyve/pci_virtio_net.c +++ usr.sbin/bhyve/pci_virtio_net.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include /* IFNAMSIZ */ @@ -124,6 +125,11 @@ static int pci_vtnet_cfgread(void *, int, int, uint32_t *); static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); static void pci_vtnet_neg_features(void *, uint64_t); +#ifdef BHYVE_SNAPSHOT +static void pci_vtnet_pause(void *); +static void pci_vtnet_resume(void *); +static int pci_vtnet_snapshot(void *, struct vm_snapshot_meta *); +#endif static struct virtio_consts vtnet_vi_consts = { "vtnet", /* our name */ @@ -135,6 +141,11 @@ pci_vtnet_cfgwrite, /* write PCI config */ pci_vtnet_neg_features, /* apply negotiated features */ VTNET_S_HOSTCAPS, /* our capabilities */ +#ifdef BHYVE_SNAPSHOT + pci_vtnet_pause, /* pause rx/tx threads */ + pci_vtnet_resume, /* resume rx/tx threads */ + pci_vtnet_snapshot, /* save / restore device state */ +#endif }; static void @@ -172,6 +183,68 @@ pthread_mutex_unlock(&sc->rx_mtx); } +#ifdef BHYVE_SNAPSHOT +static void +pci_vtnet_pause(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device pause requested !\n")); + + /* Acquire the RX lock to block RX processing. */ + pthread_mutex_lock(&sc->rx_mtx); + + /* Wait for the transmit thread to finish its processing. 
*/ + pthread_mutex_lock(&sc->tx_mtx); + while (sc->tx_in_progress) { + pthread_mutex_unlock(&sc->tx_mtx); + usleep(10000); + pthread_mutex_lock(&sc->tx_mtx); + } +} + +static void +pci_vtnet_resume(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device resume requested !\n")); + + pthread_mutex_unlock(&sc->tx_mtx); + /* The RX lock should have been acquired in vtnet_pause. */ + pthread_mutex_unlock(&sc->rx_mtx); +} + +static int +pci_vtnet_snapshot(void *vsc, struct vm_snapshot_meta *meta) +{ + int ret; + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device snapshot requested !\n")); + + /* + * Queues and consts should have been saved by the more generic + * vi_pci_snapshot function. We need to save only our features and + * config. + */ + + SNAPSHOT_VAR_OR_LEAVE(sc->vsc_features, meta, ret, done); + + /* Force reapply negociated features at restore time */ + if (meta->op == VM_SNAPSHOT_RESTORE) + pci_vtnet_neg_features(sc, sc->vsc_features); + + SNAPSHOT_VAR_OR_LEAVE(sc->vsc_config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rx_vhdrlen, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rx_merge, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->vsc_rx_ready, meta, ret, done); + +done: + return (ret); +} +#endif + static void pci_vtnet_rx(struct pci_vtnet_softc *sc) { @@ -529,6 +602,11 @@ .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, .pe_barwrite = vi_pci_write, - .pe_barread = vi_pci_read + .pe_barread = vi_pci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = vi_pci_snapshot, + .pe_pause = vi_pci_pause, + .pe_resume = vi_pci_resume, +#endif }; PCI_EMUL_SET(pci_de_vnet); Index: usr.sbin/bhyve/pci_xhci.c =================================================================== --- usr.sbin/bhyve/pci_xhci.c +++ usr.sbin/bhyve/pci_xhci.c @@ -48,6 +48,8 @@ #include #include +#include + #include #include #include @@ -150,6 +152,8 @@ #define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & ((m) << (s))))) +#define SNAP_DEV_NAME_LEN 128 + 
struct pci_xhci_trb_ring { uint64_t ringaddr; /* current dequeue guest address */ uint32_t ccs; /* consumer cycle state */ @@ -285,9 +289,10 @@ #define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH) +#define XHCI_GADDR_SIZE(a) (XHCI_PADDR_SZ - \ + (((uint64_t) (a)) & (XHCI_PADDR_SZ - 1))) #define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \ - (a), \ - XHCI_PADDR_SZ - ((a) & (XHCI_PADDR_SZ-1))) + (a), XHCI_GADDR_SIZE(a)) static int xhci_in_use; @@ -2853,12 +2858,278 @@ return (error); } +#ifdef BHYVE_SNAPSHOT +/* + * Record, for every slot, the index of the device instance it points to + * (0 when the slot is empty). + */ +static void +pci_xhci_map_devs_slots(struct pci_xhci_softc *sc, int maps[]) +{ + int i, j; + struct pci_xhci_dev_emu *dev, *slot; + + /* + * 'maps' is indexed 1..XHCI_MAX_SLOTS, so clear + * XHCI_MAX_SLOTS + 1 entries. + */ + memset(maps, 0, sizeof(maps[0]) * (XHCI_MAX_SLOTS + 1)); + + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + slot = XHCI_SLOTDEV_PTR(sc, i); + /* An empty slot maps to no device (0). */ + if (slot == NULL) + continue; + + for (j = 1; j <= XHCI_MAX_DEVS; j++) { + dev = XHCI_DEVINST_PTR(sc, j); + + if (slot == dev) + maps[i] = j; + } + } +} + +static int +pci_xhci_snapshot_ep(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, + int idx, struct vm_snapshot_meta *meta) +{ + int k; + int ret; + struct usb_data_xfer *xfer; + struct usb_data_xfer_block *xfer_block; + + /* some sanity checks */ + if (meta->op == VM_SNAPSHOT_SAVE) + xfer = dev->eps[idx].ep_xfer; + + SNAPSHOT_VAR_OR_LEAVE(xfer, meta, ret, done); + if (xfer == NULL) { + ret = 0; + goto done; + } + + if (meta->op == VM_SNAPSHOT_RESTORE) { + pci_xhci_init_ep(dev, idx); + xfer = dev->eps[idx].ep_xfer; + } + + /* save / restore proper */ + for (k = 0; k < USB_MAX_XFER_BLOCKS; k++) { + xfer_block = &xfer->data[k]; + + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(xfer_block->buf, + XHCI_GADDR_SIZE(xfer_block->buf), true, meta, ret, + done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->blen, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->bdone, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->processed, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->hci_data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->ccs, meta, ret, done); + 
SNAPSHOT_VAR_OR_LEAVE(xfer_block->streamid, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->trbnext, meta, ret, done); + } + + SNAPSHOT_VAR_OR_LEAVE(xfer->ureq, meta, ret, done); + if (xfer->ureq) { + /* + * xfer->ureq is not allocated at restore time; allocate a + * fresh buffer and fail the restore if that is not possible. + */ + if (meta->op == VM_SNAPSHOT_RESTORE) { + xfer->ureq = malloc(sizeof(struct usb_device_request)); + if (xfer->ureq == NULL) { + ret = ENOMEM; + goto done; + } + } + + SNAPSHOT_BUF_OR_LEAVE(xfer->ureq, + sizeof(struct usb_device_request), + meta, ret, done); + } + + SNAPSHOT_VAR_OR_LEAVE(xfer->ndata, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer->head, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer->tail, meta, ret, done); + +done: + return (ret); +} + +static int +pci_xhci_snapshot(struct vm_snapshot_meta *meta) +{ + int i, j; + int ret; + int restore_idx; + struct pci_devinst *pi; + struct pci_xhci_softc *sc; + struct pci_xhci_portregs *port; + struct pci_xhci_dev_emu *dev; + char dname[SNAP_DEV_NAME_LEN]; + int maps[XHCI_MAX_SLOTS + 1]; + + pi = meta->dev_data; + sc = pi->pi_arg; + + SNAPSHOT_VAR_OR_LEAVE(sc->caplength, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hccparams1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->dboff, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsoff, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hccparams2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->regsend, meta, ret, done); + + /* opregs */ + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbcmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbsts, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.pgsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dnctrl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.crcr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dcbaap, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.config, meta, ret, done); + + /* opregs.cr_p */ + 
SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.cr_p, + XHCI_GADDR_SIZE(sc->opregs.cr_p), false, meta, ret, done); + + /* opregs.dcbaa_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.dcbaa_p, + XHCI_GADDR_SIZE(sc->opregs.dcbaa_p), false, meta, ret, done); + + /* rtsregs */ + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.mfindex, meta, ret, done); + + /* rtsregs.intrreg */ + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.iman, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.imod, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.rsvd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstba, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erdp, meta, ret, done); + + /* rtsregs.erstba_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erstba_p, + XHCI_GADDR_SIZE(sc->rtsregs.erstba_p), false, meta, ret, done); + + /* rtsregs.erst_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erst_p, + XHCI_GADDR_SIZE(sc->rtsregs.erst_p), false, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_deq_seg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_idx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_seg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_events_cnt, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.event_pcs, meta, ret, done); + + /* sanity checking */ + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + dev = XHCI_DEVINST_PTR(sc, i); + if (dev == NULL) + continue; + + if (meta->op == VM_SNAPSHOT_SAVE) + restore_idx = i; + SNAPSHOT_VAR_OR_LEAVE(restore_idx, meta, ret, done); + + /* check if the restored device (when restoring) is sane */ + if (restore_idx != i) { + fprintf(stderr, "%s: idx not matching: actual: %d, " + "expected: %d\r\n", __func__, restore_idx, i); + ret = EINVAL; + goto done; + } + + if (meta->op == VM_SNAPSHOT_SAVE) { + memset(dname, 0, sizeof(dname)); + strncpy(dname, dev->dev_ue->ue_emu, 
sizeof(dname) - 1); + } + + SNAPSHOT_BUF_OR_LEAVE(dname, sizeof(dname), meta, ret, done); + + if (meta->op == VM_SNAPSHOT_RESTORE) { + dname[sizeof(dname) - 1] = '\0'; + if (strcmp(dev->dev_ue->ue_emu, dname)) { + fprintf(stderr, "%s: device names mismatch: " + "actual: %s, expected: %s\r\n", + __func__, dname, dev->dev_ue->ue_emu); + + ret = EINVAL; + goto done; + } + } + } + + /* portregs */ + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + port = XHCI_PORTREG_PTR(sc, i); + dev = XHCI_DEVINST_PTR(sc, i); + + if (dev == NULL) + continue; + + SNAPSHOT_VAR_OR_LEAVE(port->portsc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->portpmsc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->portli, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->porthlpmc, meta, ret, done); + } + + /* slots */ + if (meta->op == VM_SNAPSHOT_SAVE) + pci_xhci_map_devs_slots(sc, maps); + + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + SNAPSHOT_VAR_OR_LEAVE(maps[i], meta, ret, done); + + if (meta->op == VM_SNAPSHOT_SAVE) { + dev = XHCI_SLOTDEV_PTR(sc, i); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + /* The index comes from the snapshot; reject bad values. */ + if (maps[i] < 0 || maps[i] > XHCI_MAX_DEVS) { + fprintf(stderr, "%s: invalid device index %d " + "for slot %d\r\n", __func__, maps[i], i); + ret = EINVAL; + goto done; + } + + if (maps[i] != 0) + dev = XHCI_DEVINST_PTR(sc, maps[i]); + else + dev = NULL; + + XHCI_SLOTDEV_PTR(sc, i) = dev; + } else { + /* error */ + ret = EINVAL; + goto done; + } + + if (dev == NULL) + continue; + + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(dev->dev_ctx, + XHCI_GADDR_SIZE(dev->dev_ctx), false, meta, ret, done); + + for (j = 1; j < XHCI_MAX_ENDPOINTS; j++) { + ret = pci_xhci_snapshot_ep(sc, dev, j, meta); + if (ret != 0) + goto done; + } + + SNAPSHOT_VAR_OR_LEAVE(dev->dev_slotstate, meta, ret, done); + + /* + * devices[i]->dev_sc + * NOTE(review): assumes ue_snapshot returns an int status like + * the other snapshot callbacks - confirm in usb_emul.h. + */ + ret = dev->dev_ue->ue_snapshot(dev->dev_sc, meta); + if (ret != 0) + goto done; + + /* devices[i]->hci */ + SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_address, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_port, meta, ret, done); + } + + SNAPSHOT_VAR_OR_LEAVE(sc->ndevices, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->usb2_port_start, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->usb3_port_start, meta, ret, 
done); + +done: + return (ret); +} +#endif struct pci_devemu pci_de_xhci = { .pe_emu = "xhci", .pe_init = pci_xhci_init, .pe_barwrite = pci_xhci_write, - .pe_barread = pci_xhci_read + .pe_barread = pci_xhci_read, +#ifdef BHYVE_SNAPSHOT + .pe_snapshot = pci_xhci_snapshot, +#endif }; PCI_EMUL_SET(pci_de_xhci); Index: usr.sbin/bhyve/ps2kbd.h =================================================================== --- usr.sbin/bhyve/ps2kbd.h +++ usr.sbin/bhyve/ps2kbd.h @@ -32,10 +32,15 @@ #define _PS2KBD_H_ struct atkbdc_softc; +struct vm_snapshot_meta; struct ps2kbd_softc *ps2kbd_init(struct atkbdc_softc *sc); int ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val); void ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val); +#ifdef BHYVE_SNAPSHOT +int ps2kbd_snapshot(struct ps2kbd_softc *sc, struct vm_snapshot_meta *meta); +#endif + #endif /* _PS2KBD_H_ */ Index: usr.sbin/bhyve/ps2kbd.c =================================================================== --- usr.sbin/bhyve/ps2kbd.c +++ usr.sbin/bhyve/ps2kbd.c @@ -32,10 +32,13 @@ #include +#include + #include #include #include #include +#include #include #include #include @@ -381,3 +384,17 @@ return (sc); } +#ifdef BHYVE_SNAPSHOT +int +ps2kbd_snapshot(struct ps2kbd_softc *sc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(sc->enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->curcmd, meta, ret, done); + +done: + return (ret); +} +#endif + Index: usr.sbin/bhyve/ps2mouse.h =================================================================== --- usr.sbin/bhyve/ps2mouse.h +++ usr.sbin/bhyve/ps2mouse.h @@ -32,6 +32,7 @@ #define _PS2MOUSE_H_ struct atkbdc_softc; +struct vm_snapshot_meta; struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc); @@ -40,4 +41,8 @@ void ps2mouse_toggle(struct ps2mouse_softc *sc, int enable); int ps2mouse_fifocnt(struct ps2mouse_softc *sc); +#ifdef BHYVE_SNAPSHOT +int ps2mouse_snapshot(struct ps2mouse_softc *sc, struct vm_snapshot_meta *meta); +#endif + #endif /* 
_PS2MOUSE_H_ */ Index: usr.sbin/bhyve/ps2mouse.c =================================================================== --- usr.sbin/bhyve/ps2mouse.c +++ usr.sbin/bhyve/ps2mouse.c @@ -32,10 +32,13 @@ #include +#include + #include #include #include #include +#include #include #include #include @@ -415,4 +418,23 @@ return (sc); } - +#ifdef BHYVE_SNAPSHOT +int +ps2mouse_snapshot(struct ps2mouse_softc *sc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(sc->status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->resolution, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->sampling_rate, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ctrlenable, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->curcmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cur_x, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cur_y, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->delta_x, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->delta_y, meta, ret, done); + +done: + return (ret); +} +#endif Index: usr.sbin/bhyve/snapshot.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/snapshot.h @@ -0,0 +1,71 @@ +#ifndef _BHYVE_SNAPSHOT_ +#define _BHYVE_SNAPSHOT_ + +#include +#include +#include + +struct vmctx; + +struct restore_state { + int kdata_fd; + int vmmem_fd; + + void *kdata_map; + size_t kdata_len; + + size_t vmmem_len; + + struct ucl_parser *meta_parser; + ucl_object_t *meta_root_obj; +}; + +struct checkpoint_thread_info { + struct vmctx *ctx; + int socket_fd; + struct sockaddr_un *addr; +} checkpoint_info; + +typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *); +typedef int (*vm_pause_dev_cb) (struct vmctx *, const char *); +typedef int (*vm_resume_dev_cb) (struct vmctx *, const char *); + +struct vm_snapshot_dev_info { + const char *dev_name; /* device name */ + vm_snapshot_dev_cb snapshot_cb; /* callback for device snapshot */ + vm_pause_dev_cb pause_cb; /* callback for device pause */ + vm_resume_dev_cb resume_cb; /* 
callback for device resume */ +}; + +struct vm_snapshot_kern_info { + const char *struct_name; /* kernel structure name*/ + enum snapshot_req req; /* request type */ +}; + + +void destroy_restore_state(struct restore_state *rstate); + +const char *lookup_vmname(struct restore_state *rstate); +int lookup_memflags(struct restore_state *rstate); +size_t lookup_memsize(struct restore_state *rstate); +int lookup_guest_ncpus(struct restore_state *rstate); + +void checkpoint_cpu_add(int vcpu); +void checkpoint_cpu_resume(int vcpu); +void checkpoint_cpu_suspend(int vcpu); + +int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate); +int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate); + +int vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate); +int vm_pause_user_devs(struct vmctx *ctx); +int vm_resume_user_devs(struct vmctx *ctx); + +int get_checkpoint_msg(int conn_fd, struct vmctx *ctx); +void *checkpoint_thread(void *param); +int init_checkpoint_thread(struct vmctx *ctx); + + +int load_restore_file(const char *filename, struct restore_state *rstate); + +#endif Index: usr.sbin/bhyve/snapshot.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/snapshot.c @@ -0,0 +1,1498 @@ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include + +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include + +#include "bhyverun.h" +#include "acpi.h" +#include "atkbdc.h" +#include "inout.h" +#include "dbgport.h" +#include "fwctl.h" +#include "ioapic.h" +#include "mem.h" +#include "mevent.h" +#include "mptbl.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include 
"smbiostbl.h" +#include "snapshot.h" +#include "xmsr.h" +#include "spinup_ap.h" +#include "rtc.h" + +#include +#include + +extern int guest_ncpus; + +#define MB (1024UL * 1024) +#define GB (1024UL * MB) + +#define BHYVE_RUN_DIR "/var/run/bhyve" +#define CHECKPOINT_RUN_DIR BHYVE_RUN_DIR "/checkpoint" +#define MAX_VMNAME 100 + +#define MAX_MSG_SIZE 1024 + +#define SNAPSHOT_BUFFER_SIZE (20 * MB) + +#define JSON_STRUCT_ARR_KEY "structs" +#define JSON_DEV_ARR_KEY "devices" +#define JSON_BASIC_METADATA_KEY "basic metadata" +#define JSON_SNAPSHOT_REQ_KEY "snapshot_req" +#define JSON_SIZE_KEY "size" +#define JSON_FILE_OFFSET_KEY "file_offset" + +#define JSON_NCPUS_KEY "ncpus" +#define JSON_VMNAME_KEY "vmname" +#define JSON_MEMSIZE_KEY "memsize" +#define JSON_MEMFLAGS_KEY "memflags" + +const struct vm_snapshot_dev_info snapshot_devs[] = { + { "atkbdc", atkbdc_snapshot, NULL, NULL }, + { "virtio-net", pci_snapshot, pci_pause, pci_resume }, + { "virtio-blk", pci_snapshot, pci_pause, pci_resume }, + { "lpc", pci_snapshot, NULL, NULL }, + { "fbuf", pci_snapshot, NULL, NULL }, + { "xhci", pci_snapshot, NULL, NULL }, + { "e1000", pci_snapshot, NULL, NULL }, + { "ahci", pci_snapshot, pci_pause, pci_resume }, + { "ahci-hd", pci_snapshot, pci_pause, pci_resume }, + { "ahci-cd", pci_snapshot, NULL, NULL }, +}; + +const struct vm_snapshot_kern_info snapshot_kern_structs[] = { + { "vhpet", STRUCT_VHPET }, + { "vm", STRUCT_VM }, + { "vmx", STRUCT_VMX }, + { "vioapic", STRUCT_VIOAPIC }, + { "vlapic", STRUCT_VLAPIC }, + { "vmcx", STRUCT_VMCX }, + { "vatpit", STRUCT_VATPIT }, + { "vatpic", STRUCT_VATPIC }, + { "vpmtmr", STRUCT_VPMTMR }, + { "vrtc", STRUCT_VRTC }, +}; + +static cpuset_t vcpus_active, vcpus_suspended; +static pthread_mutex_t vcpu_lock; +static pthread_cond_t vcpus_idle, vcpus_can_run; +static bool checkpoint_active; + +/* + * TODO: Harden this function and all of its callers since 'base_str' is a user + * provided string. 
+ */ +static char * +strcat_extension(const char *base_str, const char *ext) +{ + char *res; + size_t base_len, ext_len; + + base_len = strnlen(base_str, MAX_VMNAME); + ext_len = strnlen(ext, MAX_VMNAME); + + if (base_len + ext_len > MAX_VMNAME) { + fprintf(stderr, "Filename exceeds maximum length.\n"); + return (NULL); + } + + res = malloc(base_len + ext_len + 1); + if (res == NULL) { + perror("Failed to allocate memory."); + return (NULL); + } + + memcpy(res, base_str, base_len); + memcpy(res + base_len, ext, ext_len); + res[base_len + ext_len] = 0; + + return (res); +} + +void +destroy_restore_state(struct restore_state *rstate) +{ + if (rstate == NULL) { + fprintf(stderr, "Attempting to destroy NULL restore struct.\n"); + return; + } + + if (rstate->kdata_map != MAP_FAILED) + munmap(rstate->kdata_map, rstate->kdata_len); + + if (rstate->kdata_fd > 0) + close(rstate->kdata_fd); + if (rstate->vmmem_fd > 0) + close(rstate->vmmem_fd); + + if (rstate->meta_root_obj != NULL) + ucl_object_unref(rstate->meta_root_obj); + if (rstate->meta_parser != NULL) + ucl_parser_free(rstate->meta_parser); +} + +static int +load_vmmem_file(const char *filename, struct restore_state *rstate) +{ + struct stat sb; + int err; + + rstate->vmmem_fd = open(filename, O_RDONLY); + if (rstate->vmmem_fd < 0) { + perror("Failed to open restore file"); + return (-1); + } + + err = fstat(rstate->vmmem_fd, &sb); + if (err < 0) { + perror("Failed to stat restore file"); + goto err_load_vmmem; + } + + if (sb.st_size == 0) { + fprintf(stderr, "Restore file is empty.\n"); + goto err_load_vmmem; + } + + rstate->vmmem_len = sb.st_size; + + return (0); + +err_load_vmmem: + if (rstate->vmmem_fd > 0) + close(rstate->vmmem_fd); + return (-1); +} + +static int +load_kdata_file(const char *filename, struct restore_state *rstate) +{ + struct stat sb; + int err; + + rstate->kdata_fd = open(filename, O_RDONLY); + if (rstate->kdata_fd < 0) { + perror("Failed to open kernel data file"); + return (-1); + } + + err 
= fstat(rstate->kdata_fd, &sb);
+	if (err < 0) {
+		perror("Failed to stat kernel data file");
+		goto err_load_kdata;
+	}
+
+	if (sb.st_size == 0) {
+		fprintf(stderr, "Kernel data file is empty.\n");
+		goto err_load_kdata;
+	}
+
+	rstate->kdata_len = sb.st_size;
+	rstate->kdata_map = mmap(NULL, rstate->kdata_len, PROT_READ,
+	    MAP_SHARED, rstate->kdata_fd, 0);
+	if (rstate->kdata_map == MAP_FAILED) {
+		perror("Failed to map restore file");
+		goto err_load_kdata;
+	}
+
+	return (0);
+
+err_load_kdata:
+	if (rstate->kdata_fd > 0) {
+		close(rstate->kdata_fd);
+		/*
+		 * Forget the descriptor: destroy_restore_state() closes any
+		 * kdata_fd that is still > 0, which would be a double close.
+		 */
+		rstate->kdata_fd = -1;
+	}
+	return (-1);
+}
+
+/*
+ * Parse the snapshot metadata file 'filename' with libucl.
+ *
+ * On success the parser and the root object are stored in 'rstate'
+ * (meta_parser / meta_root_obj) and 0 is returned.  On any failure the
+ * parser is released, 'rstate' is not modified and -1 is returned.
+ */
+static int
+load_metadata_file(const char *filename, struct restore_state *rstate)
+{
+	const ucl_object_t *obj;
+	struct ucl_parser *parser;
+
+	parser = ucl_parser_new(UCL_PARSER_DEFAULT);
+	if (parser == NULL) {
+		fprintf(stderr, "Failed to initialize UCL parser.\n");
+		/* Nothing has been acquired yet, so fail directly. */
+		return (-1);
+	}
+
+	if (!ucl_parser_add_file(parser, filename)) {
+		fprintf(stderr, "Failed to parse metadata file: '%s'\n",
+		    filename);
+		goto err_load_metadata;
+	}
+
+	obj = ucl_parser_get_object(parser);
+	if (obj == NULL) {
+		fprintf(stderr, "Failed to parse object.\n");
+		goto err_load_metadata;
+	}
+
+	rstate->meta_parser = parser;
+	rstate->meta_root_obj = (ucl_object_t *)obj;
+
+	return (0);
+
+err_load_metadata:
+	ucl_parser_free(parser);
+	return (-1);
+}
+
+int
+load_restore_file(const char *filename, struct restore_state *rstate)
+{
+	int err = 0;
+	char *kdata_filename = NULL, *meta_filename = NULL;
+
+	assert(filename != NULL);
+	assert(rstate != NULL);
+
+	memset(rstate, 0, sizeof(*rstate));
+	rstate->kdata_map = MAP_FAILED;
+
+	err = load_vmmem_file(filename, rstate);
+	if (err != 0) {
+		fprintf(stderr, "Failed to load guest RAM file.\n");
+		goto err_restore;
+	}
+
+	kdata_filename = strcat_extension(filename, ".kern");
+	if (kdata_filename == NULL) {
+		fprintf(stderr, "Failed to construct kernel data filename.\n");
+		goto
err_restore; + } + + err = load_kdata_file(kdata_filename, rstate); + if (err != 0) { + fprintf(stderr, "Failed to load guest kernel data file.\n"); + goto err_restore; + } + + meta_filename = strcat_extension(filename, ".meta"); + if (meta_filename == NULL) { + fprintf(stderr, "Failed to construct kernel metadata filename.\n"); + goto err_restore; + } + + err = load_metadata_file(meta_filename, rstate); + if (err != 0) { + fprintf(stderr, "Failed to load guest metadata file.\n"); + goto err_restore; + } + + return (0); + +err_restore: + destroy_restore_state(rstate); + if (kdata_filename != NULL) + free(kdata_filename); + if (meta_filename != NULL) + free(meta_filename); + return (-1); +} + +#define JSON_GET_INT_OR_RETURN(key, obj, result_ptr, ret) \ +do { \ + const ucl_object_t *obj__; \ + obj__ = ucl_object_lookup(obj, key); \ + if (obj__ == NULL) { \ + fprintf(stderr, "Missing key: '%s'", key); \ + return (ret); \ + } \ + if (!ucl_object_toint_safe(obj__, result_ptr)) { \ + fprintf(stderr, "Cannot convert '%s' value to int.", key); \ + return (ret); \ + } \ +} while(0) + +#define JSON_GET_STRING_OR_RETURN(key, obj, result_ptr, ret) \ +do { \ + const ucl_object_t *obj__; \ + obj__ = ucl_object_lookup(obj, key); \ + if (obj__ == NULL) { \ + fprintf(stderr, "Missing key: '%s'", key); \ + return (ret); \ + } \ + if (!ucl_object_tostring_safe(obj__, result_ptr)) { \ + fprintf(stderr, "Cannot convert '%s' value to string.", key); \ + return (ret); \ + } \ +} while(0) + +static void * +lookup_struct(enum snapshot_req struct_id, struct restore_state *rstate, + size_t *struct_size) +{ + const ucl_object_t *structs = NULL, *obj = NULL; + ucl_object_iter_t it = NULL; + int64_t snapshot_req, size, file_offset; + + structs = ucl_object_lookup(rstate->meta_root_obj, JSON_STRUCT_ARR_KEY); + if (structs == NULL) { + fprintf(stderr, "Failed to find '%s' object.\n", + JSON_STRUCT_ARR_KEY); + return (NULL); + } + + if (ucl_object_type((ucl_object_t *)structs) != UCL_ARRAY) { + 
fprintf(stderr, "Object '%s' is not an array.\n", + JSON_STRUCT_ARR_KEY); + return (NULL); + } + + while ((obj = ucl_object_iterate(structs, &it, true)) != NULL) { + snapshot_req = -1; + JSON_GET_INT_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, + &snapshot_req, NULL); + assert(snapshot_req >= 0); + if ((enum snapshot_req) snapshot_req == struct_id) { + JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, + &size, NULL); + assert(size >= 0); + + JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, + &file_offset, NULL); + assert(file_offset >= 0); + assert(file_offset + size <= rstate->kdata_len); + + *struct_size = (size_t)size; + return (rstate->kdata_map + file_offset); + } + } + + return (NULL); +} + +static void * +lookup_check_dev(const char *dev_name, struct restore_state *rstate, + const ucl_object_t *obj, size_t *data_size) +{ + const char *snapshot_req; + int64_t size, file_offset; + + snapshot_req = NULL; + JSON_GET_STRING_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, + &snapshot_req, NULL); + assert(snapshot_req != NULL); + if (!strcmp(snapshot_req, dev_name)) { + JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, + &size, NULL); + assert(size >= 0); + + JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, + &file_offset, NULL); + assert(file_offset >= 0); + assert(file_offset + size <= rstate->kdata_len); + + *data_size = (size_t)size; + return (rstate->kdata_map + file_offset); + } + + return (NULL); +} + +static void* +lookup_dev(const char *dev_name, struct restore_state *rstate, + size_t *data_size) +{ + const ucl_object_t *devs = NULL, *obj = NULL; + ucl_object_iter_t it = NULL; + void *ret; + + devs = ucl_object_lookup(rstate->meta_root_obj, JSON_DEV_ARR_KEY); + if (devs == NULL) { + fprintf(stderr, "Failed to find '%s' object.\n", + JSON_DEV_ARR_KEY); + return (NULL); + } + + if (ucl_object_type((ucl_object_t *)devs) != UCL_ARRAY) { + fprintf(stderr, "Object '%s' is not an array.\n", + JSON_DEV_ARR_KEY); + return (NULL); + } + + while ((obj = ucl_object_iterate(devs, &it, true)) != 
NULL) { + ret = lookup_check_dev(dev_name, rstate, obj, data_size); + if (ret != NULL) + return (ret); + } + + return (NULL); +} + +static const ucl_object_t * +lookup_basic_metadata_object(struct restore_state *rstate) +{ + const ucl_object_t *basic_meta_obj = NULL; + + basic_meta_obj = ucl_object_lookup(rstate->meta_root_obj, + JSON_BASIC_METADATA_KEY); + if (basic_meta_obj == NULL) { + fprintf(stderr, "Failed to find '%s' object.\n", + JSON_BASIC_METADATA_KEY); + return (NULL); + } + + if (ucl_object_type((ucl_object_t *)basic_meta_obj) != UCL_OBJECT) { + fprintf(stderr, "Object '%s' is not a JSON object.\n", + JSON_BASIC_METADATA_KEY); + return (NULL); + } + + return (basic_meta_obj); +} + +const char * +lookup_vmname(struct restore_state *rstate) +{ + const char *vmname; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (NULL); + + JSON_GET_STRING_OR_RETURN(JSON_VMNAME_KEY, obj, &vmname, NULL); + return (vmname); +} + +int +lookup_memflags(struct restore_state *rstate) +{ + int64_t memflags; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (0); + + JSON_GET_INT_OR_RETURN(JSON_MEMFLAGS_KEY, obj, &memflags, 0); + + return ((int)memflags); +} + +size_t +lookup_memsize(struct restore_state *rstate) +{ + int64_t memsize; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (0); + + JSON_GET_INT_OR_RETURN(JSON_MEMSIZE_KEY, obj, &memsize, 0); + if (memsize < 0) + memsize = 0; + + return ((size_t)memsize); +} + + +int +lookup_guest_ncpus(struct restore_state *rstate) +{ + int64_t ncpus; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (0); + + JSON_GET_INT_OR_RETURN(JSON_NCPUS_KEY, obj, &ncpus, 0); + return ((int)ncpus); +} + +int +restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate) +{ + return vm_restore_mem(ctx, rstate->vmmem_fd, 
rstate->vmmem_len); +} + +static int +vm_restore_kern_struct(struct vmctx *ctx, struct restore_state *rstate, + const struct vm_snapshot_kern_info *info) +{ + void *struct_ptr; + size_t struct_size; + int ret; + struct vm_snapshot_meta *meta; + + struct_ptr = lookup_struct(info->req, rstate, &struct_size); + if (struct_ptr == NULL) { + fprintf(stderr, "%s: Failed to lookup struct %s\r\n", + __func__, info->struct_name); + ret = -1; + goto done; + } + + if (struct_size == 0) { + fprintf(stderr, "%s: Kernel struct size was 0 for: %s\r\n", + __func__, info->struct_name); + ret = -1; + goto done; + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + .dev_name = info->struct_name, + .dev_req = info->req, + + .buffer.buf_start = struct_ptr, + .buffer.buf_size = struct_size, + + .buffer.buf = struct_ptr, + .buffer.buf_rem = struct_size, + + .op = VM_SNAPSHOT_RESTORE, + }; + + ret = vm_snapshot_req(meta); + if (ret != 0) { + fprintf(stderr, "%s: Failed to restore struct: %s\r\n", + __func__, info->struct_name); + goto done; + } + +done: + return (ret); +} + +int +vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate) +{ + int ret; + int i; + + for (i = 0; i < nitems(snapshot_kern_structs); i++) { + ret = vm_restore_kern_struct(ctx, rstate, + &snapshot_kern_structs[i]); + if (ret != 0) + return (ret); + } + + return (0); +} + +int +vm_restore_user_dev(struct vmctx *ctx, struct restore_state *rstate, + const struct vm_snapshot_dev_info *info) +{ + void *dev_ptr; + size_t dev_size; + int ret; + struct vm_snapshot_meta *meta; + + dev_ptr = lookup_dev(info->dev_name, rstate, &dev_size); + if (dev_ptr == NULL) { + fprintf(stderr, "Failed to lookup dev: %s\r\n", info->dev_name); + fprintf(stderr, "Continuing the restore/migration process\r\n"); + return (0); + } + + if (dev_size == 0) { + fprintf(stderr, "%s: Device size is 0. 
" + "Assuming %s is not used\r\n", + __func__, info->dev_name); + return (0); + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + .dev_name = info->dev_name, + + .buffer.buf_start = dev_ptr, + .buffer.buf_size = dev_size, + + .buffer.buf = dev_ptr, + .buffer.buf_rem = dev_size, + + .op = VM_SNAPSHOT_RESTORE, + }; + + ret = (*info->snapshot_cb)(meta); + if (ret != 0) { + fprintf(stderr, "Failed to restore dev: %s\r\n", + info->dev_name); + return (-1); + } + + return (0); +} + + +int +vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate) +{ + int ret; + int i; + + for (i = 0; i < nitems(snapshot_devs); i++) { + ret = vm_restore_user_dev(ctx, rstate, &snapshot_devs[i]); + if (ret != 0) + return (ret); + } + + return 0; +} + +int +vm_pause_user_devs(struct vmctx *ctx) +{ + const struct vm_snapshot_dev_info *info; + int ret; + int i; + + for (i = 0; i < nitems(snapshot_devs); i++) { + info = &snapshot_devs[i]; + if (info->pause_cb == NULL) + continue; + + ret = info->pause_cb(ctx, info->dev_name); + if (ret != 0) + return (ret); + } + + return (0); +} + +int +vm_resume_user_devs(struct vmctx *ctx) +{ + const struct vm_snapshot_dev_info *info; + int ret; + int i; + + for (i = 0; i < nitems(snapshot_devs); i++) { + info = &snapshot_devs[i]; + if (info->resume_cb == NULL) + continue; + + ret = info->resume_cb(ctx, info->dev_name); + if (ret != 0) + return (ret); + } + + return (0); +} + +static int +vm_snapshot_kern_struct(int data_fd, xo_handle_t *xop, const char *array_key, + struct vm_snapshot_meta *meta, off_t *offset) +{ + int ret; + size_t data_size; + ssize_t write_cnt; + + ret = vm_snapshot_req(meta); + if (ret != 0) { + fprintf(stderr, "%s: Failed to snapshot struct %s\r\n", + __func__, meta->dev_name); + ret = -1; + goto done; + } + + data_size = vm_get_snapshot_size(meta); + + write_cnt = write(data_fd, meta->buffer.buf_start, data_size); + if (write_cnt != data_size) { + perror("Failed to write all snapshotted data."); + ret = -1; + 
goto done;
+	}
+
+	/*
+	 * Write metadata.  The instance is closed under the same key it was
+	 * opened with.
+	 */
+	xo_open_instance_h(xop, array_key);
+	xo_emit_h(xop, "{:debug_name/%s}\n", meta->dev_name);
+	xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%d}\n",
+	    meta->dev_req);
+	/* Cast so that the arguments match the "%lu" conversions. */
+	xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n",
+	    (unsigned long)data_size);
+	xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n",
+	    (unsigned long)*offset);
+	xo_close_instance_h(xop, array_key);
+
+	*offset += data_size;
+
+done:
+	return (ret);
+}
+
+/*
+ * Snapshot every entry of snapshot_kern_structs[] into 'data_fd' and emit
+ * one metadata instance per entry inside the JSON_STRUCT_ARR_KEY list.
+ *
+ * Returns 0 on success, ENOMEM if the staging buffer cannot be allocated
+ * and -1 if snapshotting any structure fails.
+ */
+static int
+vm_snapshot_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop)
+{
+	int ret, i, error;
+	off_t offset;	/* must be off_t: its address is passed as off_t * */
+	size_t buf_size;
+	char *buffer;
+	struct vm_snapshot_meta *meta;
+
+	error = 0;
+	offset = 0;
+	buf_size = SNAPSHOT_BUFFER_SIZE;
+
+	buffer = malloc(buf_size);
+	if (buffer == NULL) {
+		error = ENOMEM;
+		perror("Failed to allocate memory for snapshot buffer");
+		goto err_vm_snapshot_kern_data;
+	}
+
+	meta = &(struct vm_snapshot_meta) {
+		.ctx = ctx,
+
+		.buffer.buf_start = buffer,
+		.buffer.buf_size = buf_size,
+
+		.op = VM_SNAPSHOT_SAVE,
+	};
+
+	xo_open_list_h(xop, JSON_STRUCT_ARR_KEY);
+	for (i = 0; i < nitems(snapshot_kern_structs); i++) {
+		meta->dev_name = snapshot_kern_structs[i].struct_name;
+		meta->dev_req = snapshot_kern_structs[i].req;
+
+		/* Start every structure from a clean, rewound buffer. */
+		memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
+		meta->buffer.buf = meta->buffer.buf_start;
+		meta->buffer.buf_rem = meta->buffer.buf_size;
+
+		/* Instances belong to the list opened above. */
+		ret = vm_snapshot_kern_struct(data_fd, xop,
+		    JSON_STRUCT_ARR_KEY, meta, &offset);
+		if (ret != 0) {
+			error = -1;
+			goto err_vm_snapshot_kern_data;
+		}
+	}
+	xo_close_list_h(xop, JSON_STRUCT_ARR_KEY);
+
+err_vm_snapshot_kern_data:
+	free(buffer);
+	return (error);
+}
+
+/*
+ * Emit the JSON_BASIC_METADATA_KEY container: number of vCPUs, VM name,
+ * total guest memory size (lowmem + highmem) and memory flags.
+ *
+ * Returns 0 on success or the error reported by vm_get_name().
+ */
+static int
+vm_snapshot_basic_metadata(struct vmctx *ctx, xo_handle_t *xop)
+{
+	int error;
+	size_t memsize;
+	int memflags;
+	char vmname_buf[MAX_VMNAME];
+
+	memset(vmname_buf, 0, MAX_VMNAME);
+	error = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1);
+	if (error != 0) {
+		perror("Failed to get VM name");
+		goto err;
+	}
+
+	memsize = vm_get_lowmem_size(ctx) + vm_get_highmem_size(ctx);
+	memflags = vm_get_memflags(ctx);
+
+	xo_open_container_h(xop, JSON_BASIC_METADATA_KEY);
+	/* guest_ncpus is an int, so it must be formatted with "%d". */
+	xo_emit_h(xop, "{:" JSON_NCPUS_KEY "/%d}\n", guest_ncpus);
+	xo_emit_h(xop, "{:" JSON_VMNAME_KEY "/%s}\n", vmname_buf);
+	xo_emit_h(xop, "{:" JSON_MEMSIZE_KEY "/%lu}\n",
+	    (unsigned long)memsize);
+	xo_emit_h(xop, "{:" JSON_MEMFLAGS_KEY "/%d}\n", memflags);
+	xo_close_container_h(xop, JSON_BASIC_METADATA_KEY);
+
+err:
+	return (error);
+}
+
+/*
+ * Append the bytes accumulated in 'meta' to 'data_fd' and emit a metadata
+ * instance (device name, size, file offset) under 'array_key'.  '*offset'
+ * is advanced by the number of bytes written.
+ *
+ * Returns 0 on success and -1 if the data could not be written in full.
+ */
+static int
+vm_snapshot_dev_write_data(int data_fd, xo_handle_t *xop, const char *array_key,
+    struct vm_snapshot_meta *meta, off_t *offset)
+{
+	ssize_t write_cnt;
+	size_t data_size;
+
+	data_size = vm_get_snapshot_size(meta);
+
+	write_cnt = write(data_fd, meta->buffer.buf_start, data_size);
+	if (write_cnt < 0 || (size_t)write_cnt != data_size) {
+		perror("Failed to write all snapshotted data.");
+		return (-1);
+	}
+
+	/* Write metadata. */
+	xo_open_instance_h(xop, array_key);
+	xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name);
+	xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n",
+	    (unsigned long)data_size);
+	xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n",
+	    (unsigned long)*offset);
+	xo_close_instance_h(xop, array_key);
+
+	*offset += data_size;
+
+	return (0);
+}
+
+/*
+ * Run the snapshot callback of one device and, if it succeeds, write the
+ * collected data and its metadata.  Returns 0 on success, otherwise the
+ * non-zero value of whichever step failed.
+ */
+static int
+vm_snapshot_user_dev(const struct vm_snapshot_dev_info *info,
+    int data_fd, xo_handle_t *xop,
+    struct vm_snapshot_meta *meta, off_t *offset)
+{
+	int ret;
+
+	ret = (*info->snapshot_cb)(meta);
+	if (ret != 0) {
+		fprintf(stderr, "Failed to snapshot %s; ret=%d\r\n",
+		    meta->dev_name, ret);
+		return (ret);
+	}
+
+	return (vm_snapshot_dev_write_data(data_fd, xop, JSON_DEV_ARR_KEY,
+	    meta, offset));
+}
+
+/*
+ * Snapshot every entry of snapshot_devs[] into 'data_fd', continuing at the
+ * file's current offset, and emit the JSON_DEV_ARR_KEY metadata list.
+ *
+ * Returns 0 on success, -1 if the current offset cannot be determined,
+ * ENOMEM if the staging buffer cannot be allocated, or the non-zero value
+ * returned for the first device that fails.
+ */
+static int
+vm_snapshot_user_devs(struct vmctx *ctx, int data_fd, xo_handle_t *xop)
+{
+	int ret, i;
+	off_t offset;
+	void *buffer;
+	size_t buf_size;
+	struct vm_snapshot_meta *meta;
+
+	ret = 0;
+	buf_size = SNAPSHOT_BUFFER_SIZE;
+
+	offset = lseek(data_fd, 0, SEEK_CUR);
+	if (offset < 0) {
+		perror("Failed to get data file current offset.");
+		return (-1);
+	}
+
+	buffer = malloc(buf_size);
+	if (buffer == NULL) {
+		perror("Failed to allocate memory for snapshot buffer");
+		/* Allocation failure is ENOMEM, not ENOSPC. */
+		ret = ENOMEM;
+		goto snapshot_err;
+	}
+
+	meta = &(struct vm_snapshot_meta) {
+		.ctx = ctx,
+
+		.buffer.buf_start = buffer,
+		.buffer.buf_size = buf_size,
+
+		.op = VM_SNAPSHOT_SAVE,
+	};
+
+	xo_open_list_h(xop, JSON_DEV_ARR_KEY);
+
+	/* Save the state of every device listed in snapshot_devs[]. */
+	for (i = 0; i < nitems(snapshot_devs); i++) {
+		meta->dev_name = snapshot_devs[i].dev_name;
+
+		/* Start every device from a clean, rewound buffer. */
+		memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
+		meta->buffer.buf = meta->buffer.buf_start;
+		meta->buffer.buf_rem = meta->buffer.buf_size;
+
+		ret = vm_snapshot_user_dev(&snapshot_devs[i], data_fd, xop,
+		    meta, &offset);
+		if (ret != 0)
+			goto snapshot_err;
+	}
+
+	xo_close_list_h(xop, JSON_DEV_ARR_KEY);
+
+snapshot_err:
+	free(buffer);
+	return (ret);
+}
+
+/*
+ * Write 'len' bytes starting at 'src' to 'fd' at absolute file offset
+ * 'dst_offset', looping over short writes and retrying writes interrupted
+ * by a signal.  Returns 0 on success and -1 on failure.
+ */
+static int
+vm_mem_write_to_file(int fd, const void *src, size_t dst_offset, size_t len)
+{
+	const char *cursor;
+	ssize_t cnt_write;
+	size_t to_write;
+
+	if (lseek(fd, dst_offset, SEEK_SET) < 0 ) {
+		perror("Failed to changed file offset");
+		return (-1);
+	}
+
+	/* Byte pointer: arithmetic on 'const void *' is not standard C. */
+	cursor = src;
+	to_write = len;
+
+	while (to_write > 0) {
+		cnt_write = write(fd, cursor, to_write);
+		if (cnt_write < 0) {
+			if (errno == EINTR)
+				continue;
+			perror("Failed to write in file");
+			return (-1);
+		}
+		cursor += cnt_write;
+		to_write -= cnt_write;
+	}
+
+	return (0);
+}
+
+/*
+ * Record 'vcpu' as active.  If a checkpoint is in progress the vCPU is
+ * also marked suspended and the caller blocks on vcpus_can_run until
+ * checkpoint_active is cleared.
+ */
+void
+checkpoint_cpu_add(int vcpu)
+{
+
+	pthread_mutex_lock(&vcpu_lock);
+	CPU_SET(vcpu, &vcpus_active);
+
+	if (checkpoint_active) {
+		CPU_SET(vcpu, &vcpus_suspended);
+		while (checkpoint_active)
+			pthread_cond_wait(&vcpus_can_run, &vcpu_lock);
+		CPU_CLR(vcpu, &vcpus_suspended);
+	}
+	pthread_mutex_unlock(&vcpu_lock);
+}
+
+/*
+ * When a vCPU is suspended for any reason, it calls
+ * checkpoint_cpu_suspend().  This records that the vCPU is idle.
+ * Before returning from suspension, checkpoint_cpu_resume() is
+ * called.  In suspend we note that the vCPU is idle.  In resume we
+ * pause the vCPU thread until the checkpoint is complete.  The reason
+ * for the two-step process is that vCPUs might already be stopped in
+ * the debug server when a checkpoint is requested.  This approach
+ * allows us to account for and handle those vCPUs.
+ */
+void
+checkpoint_cpu_suspend(int vcpu)
+{
+
+	pthread_mutex_lock(&vcpu_lock);
+	CPU_SET(vcpu, &vcpus_suspended);
+	/* Wake the checkpointing thread once every active vCPU is idle. */
+	if (checkpoint_active && CPU_CMP(&vcpus_active, &vcpus_suspended) == 0)
+		pthread_cond_signal(&vcpus_idle);
+	pthread_mutex_unlock(&vcpu_lock);
+}
+
+/*
+ * Block until no checkpoint is in progress, then mark 'vcpu' as no longer
+ * suspended.
+ */
+void
+checkpoint_cpu_resume(int vcpu)
+{
+
+	pthread_mutex_lock(&vcpu_lock);
+	while (checkpoint_active)
+		pthread_cond_wait(&vcpus_can_run, &vcpu_lock);
+	CPU_CLR(vcpu, &vcpus_suspended);
+	pthread_mutex_unlock(&vcpu_lock);
+}
+
+/*
+ * Mark a checkpoint as active, ask for all vCPUs to be suspended and wait
+ * until the set of suspended vCPUs equals the set of active ones.
+ */
+static void
+vm_vcpu_pause(struct vmctx *ctx)
+{
+
+	pthread_mutex_lock(&vcpu_lock);
+	checkpoint_active = true;
+	vm_suspend_cpu(ctx, -1);
+	while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0)
+		pthread_cond_wait(&vcpus_idle, &vcpu_lock);
+	pthread_mutex_unlock(&vcpu_lock);
+}
+
+/*
+ * Clear the checkpoint-active flag, resume all vCPUs and wake every thread
+ * waiting on vcpus_can_run.
+ */
+static void
+vm_vcpu_resume(struct vmctx *ctx)
+{
+
+	pthread_mutex_lock(&vcpu_lock);
+	checkpoint_active = false;
+	pthread_mutex_unlock(&vcpu_lock);
+	vm_resume_cpu(ctx, -1);
+	pthread_cond_broadcast(&vcpus_can_run);
+}
+
+/*
+ * Write a checkpoint of the VM to 'checkpoint_file' (guest memory),
+ * 'checkpoint_file'.kern (kernel structures and device state) and
+ * 'checkpoint_file'.meta (JSON metadata).
+ *
+ * vCPUs and user devices are paused while state is collected and are
+ * resumed afterwards only if they were actually paused here.  When
+ * 'stop_vm' is true and the checkpoint succeeds, the VM is destroyed and
+ * the process exits instead of returning.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int
+vm_checkpoint(struct vmctx *ctx, char *checkpoint_file, bool stop_vm)
+{
+	/* -1 means "not open"; 0 is a valid descriptor. */
+	int fd_checkpoint = -1, kdata_fd = -1;
+	int ret = 0;
+	int error = 0;
+	bool paused = false;
+	size_t guest_lowmem, guest_highmem, guest_memsize;
+	char *guest_baseaddr;
+	char *guest_lowmem_addr, *guest_highmem_addr = NULL;
+	xo_handle_t *xop = NULL;
+	char *meta_filename = NULL;
+	char *kdata_filename = NULL;
+	FILE *meta_file = NULL;
+
+	kdata_filename = strcat_extension(checkpoint_file, ".kern");
+	if (kdata_filename == NULL) {
+		fprintf(stderr, "Failed to construct kernel data filename.\n");
+		return (-1);
+	}
+
+	kdata_fd = open(kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700);
+	if (kdata_fd < 0) {
+		perror("Failed to open kernel data snapshot file.");
+		error = -1;
+		goto done;
+	}
+
+	fd_checkpoint = open(checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700);
+	if (fd_checkpoint < 0) {
+		perror("Failed to create checkpoint file");
+		error = -1;
+		goto done;
+	}
+
+	ret = vm_get_guestmem_from_ctx(ctx, &guest_baseaddr, &guest_lowmem,
+	    &guest_highmem);
+	if (ret < 0) {
+		fprintf(stderr, "Failed to get guest mem information (base, low, high)\n");
+		error = -1;
+		goto done;
+	}
+	/* Only use the sizes once the query is known to have succeeded. */
+	guest_memsize = guest_lowmem + guest_highmem;
+
+	/* make space for VMs address space */
+	ret = ftruncate(fd_checkpoint, guest_memsize);
+	if (ret < 0) {
+		perror("Failed to truncate checkpoint file");
+		error = -1;
+		goto done;
+	}
+
+	meta_filename = strcat_extension(checkpoint_file, ".meta");
+	if (meta_filename == NULL) {
+		fprintf(stderr, "Failed to construct vm metadata filename.\n");
+		error = -1;
+		goto done;
+	}
+
+	meta_file = fopen(meta_filename, "w");
+	if (meta_file == NULL) {
+		perror("Failed to open vm metadata snapshot file.");
+		error = -1;
+		goto done;
+	}
+
+	xop = xo_create_to_file(meta_file, XO_STYLE_JSON, XOF_PRETTY);
+	if (xop == NULL) {
+		perror("Failed to get libxo handle on metadata file.");
+		error = -1;
+		goto done;
+	}
+
+	ret = vm_snapshot_basic_metadata(ctx, xop);
+	if (ret != 0) {
+		fprintf(stderr, "Failed to snapshot vm basic metadata.\n");
+		error = -1;
+		goto done;
+	}
+
+	guest_lowmem_addr = guest_baseaddr;
+	if (guest_highmem > 0)
+		guest_highmem_addr = guest_baseaddr + 4*GB;
+
+	vm_vcpu_pause(ctx);
+	/* From here on the cleanup path has to undo the pause. */
+	paused = true;
+
+	ret = vm_pause_user_devs(ctx);
+	if (ret != 0) {
+		fprintf(stderr, "Could not pause devices\r\n");
+		error = ret;
+		goto done;
+	}
+
+	ret = vm_snapshot_kern_structs(ctx, kdata_fd, xop);
+	if (ret != 0) {
+		fprintf(stderr, "Failed to snapshot vm kernel data.\n");
+		error = -1;
+		goto done;
+	}
+
+	ret = vm_snapshot_user_devs(ctx, kdata_fd, xop);
+	if (ret != 0) {
+		fprintf(stderr, "Failed to snapshot device state.\n");
+		error = -1;
+		goto done;
+	}
+
+	if (vm_mem_write_to_file(fd_checkpoint, guest_lowmem_addr,
+	    0, guest_lowmem) != 0) {
+		perror("Could not write lowmem");
+		error = -1;
+		goto done;
+	}
+
+	if (guest_highmem > 0) {
+		/* Highmem is stored in the file right after lowmem. */
+		if (vm_mem_write_to_file(fd_checkpoint, guest_highmem_addr,
+		    guest_lowmem, guest_highmem) != 0) {
+			perror("Could not write highmem");
+			error = -1;
+			goto done;
+		}
+	}
+
+	xo_finish_h(xop);
+
+	if (stop_vm) {
+		vm_destroy(ctx);
+		exit(0);
+	}
+
+done:
+	/* Do not resume devices or vCPUs that this call never paused. */
+	if (paused) {
+		ret = vm_resume_user_devs(ctx);
+		if (ret != 0)
+			fprintf(stderr, "Could not resume devices\r\n");
+		vm_vcpu_resume(ctx);
+	}
+	if (fd_checkpoint >= 0)
+		close(fd_checkpoint);
+	free(meta_filename);
+	free(kdata_filename);
+	if (xop != NULL)
+		xo_destroy(xop);
+	if (meta_file != NULL)
+		fclose(meta_file);
+	if (kdata_fd >= 0)
+		close(kdata_fd);
+	return (error);
+}
+
+/*
+ * Read one 'struct checkpoint_op' request from 'conn_fd' and run the
+ * requested checkpoint or suspend.  'conn_fd' is always closed before
+ * returning.
+ *
+ * Returns 0 on success and non-zero if the request could not be read, was
+ * too short to contain an opcode, named an unknown operation, or if the
+ * checkpoint itself failed.
+ */
+int
+get_checkpoint_msg(int conn_fd, struct vmctx *ctx)
+{
+	struct checkpoint_op op_msg;
+	unsigned char *cursor;
+	size_t received, expected;
+	ssize_t recv_len;
+	int err = 0;
+
+	/* Zero the request so bytes the peer never sent are well defined. */
+	memset(&op_msg, 0, sizeof(op_msg));
+	cursor = (unsigned char *)&op_msg;
+	expected = sizeof(op_msg);	/* expected length */
+	received = 0;
+
+	while (received < expected) {
+		recv_len = recv(conn_fd, cursor + received,
+		    expected - received, 0);
+		if (recv_len < 0) {
+			if (errno == EINTR)
+				continue;
+			perror("Error while receiving data from bhyvectl");
+			err = -1;
+			goto done;
+		}
+		if (recv_len == 0)
+			break;		/* peer closed the connection */
+		received += recv_len;
+	}
+
+	if (received < sizeof(op_msg.op)) {
+		fprintf(stderr, "Truncated checkpoint request.\n");
+		err = -1;
+		goto done;
+	}
+
+	/* The file name is supplied by the peer; force NUL termination. */
+	op_msg.snapshot_filename[sizeof(op_msg.snapshot_filename) - 1] = '\0';
+
+	switch (op_msg.op) {
+	case START_CHECKPOINT:
+		err = vm_checkpoint(ctx, op_msg.snapshot_filename, false);
+		break;
+	case START_SUSPEND:
+		err = vm_checkpoint(ctx, op_msg.snapshot_filename, true);
+		break;
+	default:
+		fprintf(stderr, "Unrecognized checkpoint operation.\n");
+		err = -1;
+	}
+
+done:
+	close(conn_fd);
+	return (err);
+}
+
+/*
+ * Listen for commands from bhyvectl
+ */
+void *
+checkpoint_thread(void *param)
+{
+	struct checkpoint_thread_info *thread_info;
+	socklen_t addr_len;
+	int conn_fd, ret;
+
+	thread_info = (struct
checkpoint_thread_info *)param;
+
+	/*
+	 * The peer address is received into storage owned by this thread;
+	 * addr_len must describe that storage, not the size of a pointer.
+	 */
+	struct sockaddr_un peer_addr;
+
+	for (;;) {
+		addr_len = sizeof(peer_addr);
+		conn_fd = accept(thread_info->socket_fd,
+		    (struct sockaddr *)&peer_addr, &addr_len);
+		if (conn_fd < 0) {
+			/* Transient conditions must not stop the listener. */
+			if (errno == EINTR || errno == ECONNABORTED)
+				continue;
+			perror("Failed to accept connection");
+			break;
+		}
+
+		ret = get_checkpoint_msg(conn_fd, thread_info->ctx);
+		if (ret != 0) {
+			fprintf(stderr, "Failed to read message on checkpoint "
+			    "socket. Retrying.\n");
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Create directory tree to store runtime specific information:
+ * i.e. UNIX sockets for IPC with bhyvectl.
+ *
+ * Directories that already exist are not an error.  Returns 0 on success
+ * or the negative result of the failing mkdir().
+ */
+static int
+make_checkpoint_dir(void)
+{
+	int err;
+
+	err = mkdir(BHYVE_RUN_DIR, 0755);
+	if (err < 0 && errno != EEXIST)
+		return (err);
+
+	err = mkdir(CHECKPOINT_RUN_DIR, 0755);
+	if (err < 0 && errno != EEXIST)
+		return (err);
+
+	return (0);
+}
+
+/*
+ * Create the listening socket for IPC with bhyvectl and start the thread
+ * that serves it.
+ *
+ * The socket is bound to CHECKPOINT_RUN_DIR/<vm name>.  Failure to
+ * initialise the mutex or condition variables terminates the process via
+ * errc(); every other failure releases the socket and returns non-zero.
+ * Returns 0 on success.
+ */
+int
+init_checkpoint_thread(struct vmctx *ctx)
+{
+	/*
+	 * Static storage: checkpoint_info.addr keeps pointing at this
+	 * address after the function has returned.
+	 */
+	static struct sockaddr_un addr;
+	int socket_fd;
+	pthread_t checkpoint_pthread;
+	char vmname_buf[MAX_VMNAME];
+	int ret, err = 0;
+
+	err = pthread_mutex_init(&vcpu_lock, NULL);
+	if (err != 0)
+		errc(1, err, "checkpoint mutex init");
+	err = pthread_cond_init(&vcpus_idle, NULL);
+	if (err == 0)
+		err = pthread_cond_init(&vcpus_can_run, NULL);
+	if (err != 0)
+		errc(1, err, "checkpoint cv init");
+
+	/* Cleared up front so the failure path can tell if a path was set. */
+	memset(&addr, 0, sizeof(addr));
+
+	socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (socket_fd < 0) {
+		perror("Socket creation failed (IPC with bhyvectl");
+		err = -1;
+		goto fail;
+	}
+
+	err = make_checkpoint_dir();
+	if (err < 0) {
+		perror("Failed to create checkpoint runtime directory");
+		goto fail;
+	}
+
+	addr.sun_family = AF_UNIX;
+
+	/* At most MAX_VMNAME - 1 bytes are requested; keep the last one 0. */
+	memset(vmname_buf, 0, sizeof(vmname_buf));
+	err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1);
+	if (err != 0) {
+		perror("Failed to get VM name");
+		goto fail;
+	}
+
+	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s",
+	    CHECKPOINT_RUN_DIR, vmname_buf);
+	/* Remove a stale socket left behind by a previous run. */
+	unlink(addr.sun_path);
+
+	if (bind(socket_fd, (struct sockaddr *)&addr,
+	    sizeof(struct sockaddr_un)) != 0) {
+		perror("Failed to bind socket (IPC with bhyvectl)");
+		err = -1;
+		goto fail;
+	}
+
+	if (listen(socket_fd, 10) < 0) {
+		perror("Failed to listen on socket (IPC with bhyvectl)");
+		err = -1;
+		goto fail;
+	}
+
+	memset(&checkpoint_info, 0, sizeof(struct checkpoint_thread_info));
+	checkpoint_info.ctx = ctx;
+	checkpoint_info.socket_fd = socket_fd;
+	checkpoint_info.addr = &addr;
+
+	/* pthread_create() returns an error number, never a negative value. */
+	ret = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread,
+	    &checkpoint_info);
+	if (ret != 0) {
+		fprintf(stderr, "Failed to create checkpoint thread: %s\n",
+		    strerror(ret));
+		err = -1;
+		goto fail;
+	}
+	/* The handle is only valid once the thread has been created. */
+	pthread_set_name_np(checkpoint_pthread, "checkpoint thread");
+
+	return (0);
+fail:
+	if (socket_fd >= 0)
+		close(socket_fd);
+	if (addr.sun_path[0] != '\0')
+		unlink(addr.sun_path);
+
+	return (err);
+}
+
+/*
+ * Report on stderr that the snapshot operation 'op' failed for the buffer
+ * named 'bufname'.
+ */
+void
+vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op)
+{
+	const char *op_name;
+
+	if (op == VM_SNAPSHOT_SAVE)
+		op_name = "save";
+	else if (op == VM_SNAPSHOT_RESTORE)
+		op_name = "restore";
+	else
+		op_name = "unknown";
+
+	fprintf(stderr, "%s: snapshot-%s failed for %s\r\n",
+	    __func__, op_name, bufname);
+}
+
+/*
+ * Copy 'data_size' bytes between 'data' and the snapshot buffer of 'meta':
+ * into the buffer when saving, out of it when restoring.  The buffer
+ * cursor and remaining size are advanced on success.
+ *
+ * Returns 0 on success, E2BIG if fewer than 'data_size' bytes remain in
+ * the buffer and EINVAL for an unknown operation.
+ */
+int
+vm_snapshot_buf(volatile void *data, size_t data_size,
+    struct vm_snapshot_meta *meta)
+{
+	struct vm_snapshot_buffer *buffer;
+	int op;
+
+	buffer = &meta->buffer;
+	op = meta->op;
+
+	if (buffer->buf_rem < data_size) {
+		fprintf(stderr, "%s: buffer too small\r\n", __func__);
+		return (E2BIG);
+	}
+
+	if (op == VM_SNAPSHOT_SAVE)
+		memcpy(buffer->buf, (uint8_t *) data, data_size);
+	else if (op == VM_SNAPSHOT_RESTORE)
+		memcpy((uint8_t *) data, buffer->buf, data_size);
+	else
+		return (EINVAL);
+
+	buffer->buf += data_size;
+	buffer->buf_rem -= data_size;
+
+	return (0);
+}
+
+size_t
+vm_get_snapshot_size(struct vm_snapshot_meta *meta)
+{
+	size_t length;
+	struct vm_snapshot_buffer *buffer;
+
+	buffer = &meta->buffer;
+
+	if (buffer->buf_size < buffer->buf_rem) {
+		fprintf(stderr, "%s: Invalid buffer: size = %zu, rem =
%zu\r\n", + __func__, buffer->buf_size, buffer->buf_rem); + length = 0; + } else { + length = buffer->buf_size - buffer->buf_rem; + } + + return (length); +} + +int +vm_snapshot_guest2host_addr(void **addrp, size_t len, bool restore_null, + struct vm_snapshot_meta *meta) +{ + int ret; + vm_paddr_t gaddr; + + if (meta->op == VM_SNAPSHOT_SAVE) { + gaddr = paddr_host2guest(meta->ctx, *addrp); + if (gaddr == (vm_paddr_t) -1) { + if (!restore_null || + (restore_null && (*addrp != NULL))) { + ret = EFAULT; + goto done; + } + } + + SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); + if (gaddr == (vm_paddr_t) -1) { + if (!restore_null) { + ret = EFAULT; + goto done; + } + } + + *addrp = paddr_guest2host(meta->ctx, gaddr, len); + } else { + ret = EINVAL; + } + +done: + return (ret); +} + +int +vm_snapshot_buf_cmp(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op; + int ret; + + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + fprintf(stderr, "%s: buffer too small\r\n", __func__); + ret = E2BIG; + goto done; + } + + if (op == VM_SNAPSHOT_SAVE) { + ret = 0; + memcpy(buffer->buf, (uint8_t *) data, data_size); + } else if (op == VM_SNAPSHOT_RESTORE) { + ret = memcmp((uint8_t *) data, buffer->buf, data_size); + } else { + ret = EINVAL; + goto done; + } + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + +done: + return (ret); +} Index: usr.sbin/bhyve/uart_emul.h =================================================================== --- usr.sbin/bhyve/uart_emul.h +++ usr.sbin/bhyve/uart_emul.h @@ -31,10 +31,10 @@ #ifndef _UART_EMUL_H_ #define _UART_EMUL_H_ - #define UART_IO_BAR_SIZE 8 struct uart_softc; +struct vm_snapshot_meta; typedef void (*uart_intr_func_t)(void *arg); struct uart_softc *uart_init(uart_intr_func_t intr_assert, @@ -44,4 +44,7 @@ uint8_t uart_read(struct 
uart_softc *sc, int offset); void uart_write(struct uart_softc *sc, int offset, uint8_t value); int uart_set_backend(struct uart_softc *sc, const char *opt); +#ifdef BHYVE_SNAPSHOT +int uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta); +#endif #endif Index: usr.sbin/bhyve/uart_emul.c =================================================================== --- usr.sbin/bhyve/uart_emul.c +++ usr.sbin/bhyve/uart_emul.c @@ -39,6 +39,8 @@ #include #endif +#include + #include #include #include @@ -717,3 +719,35 @@ return (retval); }
+/*
+ * Snapshot handler for the emulated UART.
+ *
+ * Passes the register fields of 'sc' (data, ier, lcr, mcr, lsr, msr, fcr,
+ * scr, dll, dlh) and the receive FIFO (indices, count, size and the whole
+ * buffer) through the snapshot macros in a fixed order.  The first macro
+ * that fails jumps to 'done' and its error is returned in 'ret'.
+ *
+ * The SNAPSHOT_*_OR_LEAVE macros are defined elsewhere; presumably they
+ * save or restore depending on meta->op and leave 0 in 'ret' on success -
+ * TODO confirm against their definition.
+ */
+#ifdef BHYVE_SNAPSHOT
+int
+uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta)
+{
+	int ret;
+
+	SNAPSHOT_VAR_OR_LEAVE(sc->data, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->ier, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->lcr, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->mcr, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->lsr, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->msr, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->fcr, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->scr, meta, ret, done);
+
+	SNAPSHOT_VAR_OR_LEAVE(sc->dll, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->dlh, meta, ret, done);
+
+	SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.rindex, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.windex, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.num, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.size, meta, ret, done);
+	SNAPSHOT_BUF_OR_LEAVE(sc->rxfifo.buf, sizeof(sc->rxfifo.buf),
+			      meta, ret, done);
+	/*
+	 * Only reached when every field above succeeded.  The flag is set
+	 * regardless of meta->op, i.e. on save as well as on restore.
+	 * NOTE(review): presumably meant to re-arm the THRE interrupt after
+	 * a restore - confirm that setting it on save is intended.
+	 */
+	sc->thre_int_pending = 1;
+
+done:
+	return (ret);
+}
+#endif
Index: usr.sbin/bhyve/usb_emul.h =================================================================== --- usr.sbin/bhyve/usb_emul.h +++ usr.sbin/bhyve/usb_emul.h @@ -41,10 +41,10 @@ #define USB_XFER_IN 1 - struct usb_hci; struct usb_device_request; struct usb_data_xfer; +struct vm_snapshot_meta; /* Device emulation handlers */ struct usb_devemu { @@ -62,6 +62,7 @@ int (*ue_reset)(void *sc); int (*ue_remove)(void *sc); int
(*ue_stop)(void *sc); + int (*ue_snapshot)(void *scarg, struct vm_snapshot_meta *meta); }; #define USB_EMUL_SET(x) DATA_SET(usb_emu_set, x); @@ -148,7 +149,6 @@ pthread_mutex_unlock(&((x)->mtx)); \ } while (0) - struct usb_devemu *usb_emu_finddev(char *name); struct usb_data_xfer_block *usb_data_xfer_append(struct usb_data_xfer *xfer, Index: usr.sbin/bhyve/usb_mouse.c =================================================================== --- usr.sbin/bhyve/usb_mouse.c +++ usr.sbin/bhyve/usb_mouse.c @@ -31,6 +31,8 @@ #include +#include + #include #include #include @@ -787,6 +789,29 @@ return (0); }
+#ifdef BHYVE_SNAPSHOT
+static int
+umouse_snapshot(void *scarg, struct vm_snapshot_meta *meta)
+{
+	int ret;
+	struct umouse_softc *sc;
+	/*
+	 * Snapshot handler for the USB tablet, installed as .ue_snapshot of
+	 * ue_mouse.  'scarg' is the opaque softc pointer of the callback
+	 * interface and is used as a struct umouse_softc.
+	 *
+	 * The current report, the newdata flag, the HID idle / protocol /
+	 * feature values, the polling flag and the timestamp of the previous
+	 * event pass through SNAPSHOT_VAR_OR_LEAVE in a fixed order; the
+	 * first failure jumps to 'done' and is returned in 'ret'.  The macro
+	 * is defined elsewhere; presumably it saves or restores depending on
+	 * meta->op and leaves 0 in 'ret' on success - TODO confirm.
+	 */
+	sc = scarg;
+
+	SNAPSHOT_VAR_OR_LEAVE(sc->um_report, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->newdata, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->hid.idle, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->hid.protocol, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->hid.feature, meta, ret, done);
+
+	SNAPSHOT_VAR_OR_LEAVE(sc->polling, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->prev_evt.tv_sec, meta, ret, done);
+	SNAPSHOT_VAR_OR_LEAVE(sc->prev_evt.tv_usec, meta, ret, done);
+
+done:
+	return (ret);
+}
+#endif
struct usb_devemu ue_mouse = { .ue_emu = "tablet", @@ -797,6 +822,9 @@ .ue_data = umouse_data_handler, .ue_reset = umouse_reset, .ue_remove = umouse_remove, - .ue_stop = umouse_stop + .ue_stop = umouse_stop, +#ifdef BHYVE_SNAPSHOT + .ue_snapshot = umouse_snapshot, +#endif }; USB_EMUL_SET(ue_mouse); Index: usr.sbin/bhyve/virtio.h =================================================================== --- usr.sbin/bhyve/virtio.h +++ usr.sbin/bhyve/virtio.h @@ -287,6 +287,7 @@ struct vmctx; struct pci_devinst; struct vqueue_info; +struct vm_snapshot_meta; /* * A virtual device, with some number (possibly 0) of virtual @@ -361,6 +362,10 @@ void (*vc_apply_features)(void *, uint64_t); /* called to apply negotiated features */ uint64_t
vc_hv_caps; /* hypervisor-provided capabilities */ + void (*vc_pause)(void *); /* called to pause device activity */ + void (*vc_resume)(void *); /* called to resume device activity */ + int (*vc_snapshot)(void *, struct vm_snapshot_meta *); + /* called to save / restore device state */ }; /* @@ -487,4 +492,9 @@ int baridx, uint64_t offset, int size); void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); +#ifdef BHYVE_SNAPSHOT +int vi_pci_snapshot(struct vm_snapshot_meta *meta); +int vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi); +int vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi); +#endif #endif /* _VIRTIO_H_ */ Index: usr.sbin/bhyve/virtio.c =================================================================== --- usr.sbin/bhyve/virtio.c +++ usr.sbin/bhyve/virtio.c @@ -34,6 +34,7 @@ #include #include +#include #include #include @@ -794,3 +795,149 @@ if (vs->vs_mtx) pthread_mutex_unlock(vs->vs_mtx); } + +#ifdef BHYVE_SNAPSHOT +int +vi_pci_pause(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct virtio_softc *vs; + struct virtio_consts *vc; + + vs = pi->pi_arg; + vc = vs->vs_vc; + + vc = vs->vs_vc; + assert(vc->vc_pause != NULL); + (*vc->vc_pause)(DEV_SOFTC(vs)); + + return (0); +} + +int +vi_pci_resume(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct virtio_softc *vs; + struct virtio_consts *vc; + + vs = pi->pi_arg; + vc = vs->vs_vc; + + vc = vs->vs_vc; + assert(vc->vc_resume != NULL); + (*vc->vc_resume)(DEV_SOFTC(vs)); + + return (0); +} + +static int +vi_pci_snapshot_softc(struct virtio_softc *vs, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(vs->vs_flags, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_negotiated_caps, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_curq, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_isr, meta, ret, done); + 
SNAPSHOT_VAR_OR_LEAVE(vs->vs_msix_cfg_idx, meta, ret, done);
+
+done:
+	return (ret);
+}
+
+/*
+ * Pass the device-independent constants of 'vc' (number of queues, config
+ * space size, hypervisor capabilities) through SNAPSHOT_VAR_CMP_OR_LEAVE.
+ * The macro is defined elsewhere; presumably it stores the value on save
+ * and compares it with the stored one on restore - TODO confirm.
+ */
+static int
+vi_pci_snapshot_consts(struct virtio_consts *vc, struct vm_snapshot_meta *meta)
+{
+	int ret;
+
+	SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_nvq, meta, ret, done);
+	SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_cfgsize, meta, ret, done);
+	SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_hv_caps, meta, ret, done);
+
+done:
+	return (ret);
+}
+
+/*
+ * Save or restore the state of each of the vc_nvq virtqueues of 'vs':
+ * the size fields, the run-time indices and flags, the guest addresses of
+ * the descriptor, avail and used rings, and the ring memory itself.
+ *
+ * Returns 0 on success (including a device with no queues) or the error
+ * left in 'ret' by the first snapshot macro that fails.
+ */
+static int
+vi_pci_snapshot_queues(struct virtio_softc *vs, struct vm_snapshot_meta *meta)
+{
+	int i;
+	int ret;
+	struct virtio_consts *vc;
+	struct vqueue_info *vq;
+	uint64_t addr_size;
+
+	vc = vs->vs_vc;
+
+	/* The loop body never runs for vc_nvq == 0; do not return garbage. */
+	ret = 0;
+
+	/* Save virtio queue info */
+	for (i = 0; i < vc->vc_nvq; i++) {
+		vq = &vs->vs_queues[i];
+
+		SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_qsize, meta, ret, done);
+		SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_num, meta, ret, done);
+
+		SNAPSHOT_VAR_OR_LEAVE(vq->vq_flags, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vq->vq_last_avail, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vq->vq_save_used, meta, ret, done);
+		SNAPSHOT_VAR_OR_LEAVE(vq->vq_msix_idx, meta, ret, done);
+
+		SNAPSHOT_VAR_OR_LEAVE(vq->vq_pfn, meta, ret, done);
+
+		/* Ring pointers travel as guest addresses, not host ones. */
+		addr_size = vq->vq_qsize * sizeof(struct virtio_desc);
+		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_desc, addr_size,
+			false, meta, ret, done);
+
+		addr_size = (2 + vq->vq_qsize + 1) * sizeof(uint16_t);
+		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_avail, addr_size,
+			false, meta, ret, done);
+
+		/*
+		 * NOTE(review): virtio used-ring elements are 8 bytes each;
+		 * confirm that 2 * vq_qsize uint16_t entries is the intended
+		 * length for the used ring.
+		 */
+		addr_size = (2 + 2 * vq->vq_qsize + 1) * sizeof(uint16_t);
+		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_used, addr_size,
+			false, meta, ret, done);
+
+		SNAPSHOT_BUF_OR_LEAVE(vq->vq_desc, vring_size(vq->vq_qsize),
+			meta, ret, done);
+	}
+
+done:
+	return (ret);
+}
+
+int
+vi_pci_snapshot(struct vm_snapshot_meta *meta)
+{
+	int ret;
+	struct pci_devinst *pi;
+	struct virtio_softc *vs;
+	struct virtio_consts *vc;
+
+	pi = meta->dev_data;
+	vs = pi->pi_arg;
+	vc = vs->vs_vc;
+
+	/* Save virtio softc */
+	ret = vi_pci_snapshot_softc(vs, meta);
+	if (ret != 0)
+		goto
done; + + /* Save virtio consts */ + ret = vi_pci_snapshot_consts(vc, meta); + if (ret != 0) + goto done; + + /* Save virtio queue info */ + ret = vi_pci_snapshot_queues(vs, meta); + if (ret != 0) + goto done; + + /* Save device softc, if needed */ + if (vc->vc_snapshot != NULL) { + ret = (*vc->vc_snapshot)(DEV_SOFTC(vs), meta); + if (ret != 0) + goto done; + } + +done: + return (ret); +} +#endif Index: usr.sbin/bhyvectl/Makefile =================================================================== --- usr.sbin/bhyvectl/Makefile +++ usr.sbin/bhyvectl/Makefile @@ -2,6 +2,8 @@ # $FreeBSD$ # +.include <src.opts.mk> + PROG= bhyvectl SRCS= bhyvectl.c PACKAGE= bhyve @@ -14,4 +16,8 @@ CFLAGS+= -I${SRCTOP}/sys/amd64/vmm +.if ${MK_BHYVE_SNAPSHOT} != "no" +CFLAGS+= -DBHYVE_SNAPSHOT +.endif + .include <bsd.prog.mk> Index: usr.sbin/bhyvectl/bhyvectl.8 =================================================================== --- usr.sbin/bhyvectl/bhyvectl.8 +++ usr.sbin/bhyvectl/bhyvectl.8 @@ -39,6 +39,8 @@ .Op Fl -inject-nmi .Op Fl -force-reset .Op Fl -force-poweroff +.Op Fl -checkpoint= Ns Ar <filename> +.Op Fl -suspend= Ns Ar <filename> .Sh DESCRIPTION The .Nm @@ -72,6 +74,17 @@ Force the VM to reset. .It Fl -force-poweroff Force the VM to power off. +.It Fl -checkpoint= Ns Ar <filename> +Save a snapshot of a virtual machine. +The guest memory contents are saved in the file given in +.Ar <filename> . +The guest device and vCPU state are saved in the file +.Ar <filename>.kern . +.It Fl -suspend= Ns Ar <filename> +Save a snapshot of a virtual machine similar to +.Fl -checkpoint . +The virtual machine will terminate after the snapshot has been +saved. .El .Sh EXIT STATUS .Ex -std @@ -79,6 +92,10 @@ Destroy the VM called fbsd10: .Pp .Dl "bhyvectl --vm=fbsd10 --destroy" +.Sh COMPATIBILITY +The snapshot file format is not yet stable and is subject to future changes. +Backwards compatibility support for the current snapshot file format is not +guaranteed when future changes are made.
.Sh SEE ALSO .Xr bhyve 8 , .Xr bhyveload 8 Index: usr.sbin/bhyvectl/bhyvectl.c =================================================================== --- usr.sbin/bhyvectl/bhyvectl.c +++ usr.sbin/bhyvectl/bhyvectl.c @@ -57,6 +57,9 @@ #include #include +#include +#include + #include "amd/vmcb.h" #include "intel/vmcs.h" @@ -67,6 +70,9 @@ #define NO_ARG no_argument #define OPT_ARG optional_argument +#define CHECKPOINT_RUN_DIR "/var/run/bhyve/checkpoint" +#define MAX_VMNAME 100 + static const char *progname; static void @@ -78,6 +84,10 @@ " [--cpu=]\n" " [--create]\n" " [--destroy]\n" +#ifdef BHYVE_SNAPSHOT + " [--checkpoint=]\n" + " [--suspend=]\n" +#endif " [--get-all]\n" " [--get-stats]\n" " [--set-desc-ds]\n" @@ -287,6 +297,10 @@ static int unassign_pptdev, bus, slot, func; static int run; static int get_cpu_topology; +#ifdef BHYVE_SNAPSHOT +static int vm_checkpoint_opt; +static int vm_suspend_opt; +#endif /* * VMCB specific. @@ -591,6 +605,10 @@ SET_RTC_TIME, SET_RTC_NVRAM, RTC_NVRAM_OFFSET, +#ifdef BHYVE_SNAPSHOT + SET_CHECKPOINT_FILE, + SET_SUSPEND_FILE, +#endif }; static void @@ -1459,6 +1477,10 @@ { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, { "get-intinfo", NO_ARG, &get_intinfo, 1 }, { "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 }, +#ifdef BHYVE_SNAPSHOT + { "checkpoint", REQ_ARG, 0, SET_CHECKPOINT_FILE}, + { "suspend", REQ_ARG, 0, SET_SUSPEND_FILE}, +#endif }; const struct option intel_opts[] = { @@ -1676,6 +1698,82 @@ } } +#ifdef BHYVE_SNAPSHOT +static int +send_checkpoint_op_req(struct vmctx *ctx, struct checkpoint_op *op) +{ + struct sockaddr_un addr; + int socket_fd, len, len_sent, total_sent; + int err = 0; + char vmname_buf[MAX_VMNAME]; + + socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (socket_fd < 0) { + perror("Error creating bhyvectl socket"); + err = -1; + goto done; + } + + memset(&addr, 0, sizeof(struct sockaddr_un)); + addr.sun_family = AF_UNIX; + + err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1); + if (err != 0) { + 
perror("Failed to get VM name"); + goto done; + } + + snprintf(addr.sun_path, PATH_MAX, "%s/%s", CHECKPOINT_RUN_DIR, vmname_buf); + + if (connect(socket_fd, (struct sockaddr *)&addr, + sizeof(struct sockaddr_un)) != 0) { + perror("Connect to VM socket failed"); + err = -1; + goto done; + } + + len = sizeof(*op); + total_sent = 0; + while ((len_sent = send(socket_fd, (char *)op + total_sent, len - total_sent, 0)) > 0) { + total_sent += len_sent; + } + + if (len_sent < 0) { + perror("Failed to send checkpoint operation request"); + err = -1; + } + +done: + if (socket_fd > 0) + close(socket_fd); + return (err); +} + +static int +send_start_checkpoint(struct vmctx *ctx, const char *checkpoint_file) +{ + struct checkpoint_op op; + + op.op = START_CHECKPOINT; + strncpy(op.snapshot_filename, checkpoint_file, MAX_SNAPSHOT_VMNAME); + op.snapshot_filename[MAX_SNAPSHOT_VMNAME - 1] = 0; + + return (send_checkpoint_op_req(ctx, &op)); +} + +static int +send_start_suspend(struct vmctx *ctx, const char *suspend_file) +{ + struct checkpoint_op op; + + op.op = START_SUSPEND; + strncpy(op.snapshot_filename, suspend_file, MAX_SNAPSHOT_VMNAME); + op.snapshot_filename[MAX_SNAPSHOT_VMNAME - 1] = 0; + + return (send_checkpoint_op_req(ctx, &op)); +} +#endif + int main(int argc, char *argv[]) { @@ -1692,6 +1790,9 @@ uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; struct tm tm; struct option *opts; +#ifdef BHYVE_SNAPSHOT + char *checkpoint_file, *suspend_file; +#endif cpu_intel = cpu_vendor_intel(); opts = setup_options(cpu_intel); @@ -1858,6 +1959,16 @@ case ASSERT_LAPIC_LVT: assert_lapic_lvt = atoi(optarg); break; +#ifdef BHYVE_SNAPSHOT + case SET_CHECKPOINT_FILE: + vm_checkpoint_opt = 1; + checkpoint_file = optarg; + break; + case SET_SUSPEND_FILE: + vm_suspend_opt = 1; + suspend_file = optarg; + break; +#endif default: usage(cpu_intel); } @@ -2343,6 +2454,14 @@ if (!error && destroy) vm_destroy(ctx); +#ifdef BHYVE_SNAPSHOT + if (!error && vm_checkpoint_opt) + error = 
send_start_checkpoint(ctx, checkpoint_file); + + if (!error && vm_suspend_opt) + error = send_start_suspend(ctx, suspend_file); +#endif + free (opts); exit(error); }