Index: lib/libvmmapi/vmmapi.h =================================================================== --- lib/libvmmapi/vmmapi.h +++ lib/libvmmapi/vmmapi.h @@ -33,6 +33,7 @@ #include #include +#include /* * API version for out-of-tree consumers like grub-bhyve for making compile @@ -42,6 +43,7 @@ struct iovec; struct vmctx; +struct vm_snapshot_meta; enum x2apic_state; /* @@ -88,6 +90,10 @@ */ int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); + +int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, + size_t *lowmem_size, size_t *highmem_size); + /* * Create a device memory segment identified by 'segid'. * @@ -110,6 +116,8 @@ int vm_parse_memsize(const char *optarg, size_t *memsize); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); +/* inverse operation to vm_map_gpa - extract guest address from host pointer */ +vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr); int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, int *fault); @@ -120,6 +128,7 @@ void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); void vm_set_memflags(struct vmctx *ctx, int flags); int vm_get_memflags(struct vmctx *ctx); +int vm_get_name(struct vmctx *ctx, char *buffer, size_t max_len); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, @@ -226,6 +235,8 @@ uint16_t threads, uint16_t maxcpus); int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus); +void vm_vcpu_pause(struct vmctx *ctx); +void vm_vcpu_resume(struct vmctx *ctx); /* * FreeBSD specific APIs @@ -237,4 +248,27 @@ uint32_t eip, uint32_t gdtbase, uint32_t esp); void vm_setup_freebsd_gdt(uint64_t *gdtr); + +/* + * Bhyve save-restore + */ + +#define MAX_SNAPSHOT_VMNAME 100 + +enum checkpoint_opcodes { + START_CHECKPOINT = 0, + START_SUSPEND = 1, +}; + +struct checkpoint_op { + unsigned int op; + char snapshot_filename[MAX_SNAPSHOT_VMNAME]; +}; + +int vm_snapshot_req(struct vm_snapshot_meta *meta); + +int vm_restore_time(struct vmctx *ctx); + +int vm_restore_mem(struct vmctx *ctx, int vmmem_fd, size_t size); + #endif /* _VMMAPI_H_ */ Index: lib/libvmmapi/vmmapi.c =================================================================== --- lib/libvmmapi/vmmapi.c +++ lib/libvmmapi/vmmapi.c @@ -47,12 +47,15 @@ #include #include #include +#include #include #include +#include #include #include +#include #include "vmmapi.h" @@ -69,6 +72,14 @@ #define PROT_RW (PROT_READ | PROT_WRITE) #define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) +struct vcpu_lock { + pthread_mutex_t vl_mtx; + int vl_running_cnt; + pthread_cond_t vl_run_complete_cond; + int vl_paused; + pthread_cond_t vl_paused_cond; +}; + struct vmctx { int fd; uint32_t lowmem_limit; @@ -77,6 +88,8 @@ size_t highmem; char *baseaddr; char *name; + + struct vcpu_lock lock; }; #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) @@ -111,6 +124,7 @@ vm_open(const char *name) { struct vmctx *vm; + struct vcpu_lock *lock; vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); assert(vm != NULL); @@ -124,6 +138,14 @@ if ((vm->fd = vm_device_open(vm->name)) < 0) goto err; + lock = &vm->lock; + + 
pthread_mutex_init(&lock->vl_mtx, NULL); + lock->vl_running_cnt = 0; + pthread_cond_init(&lock->vl_run_complete_cond, NULL); + lock->vl_paused = 0; + pthread_cond_init(&lock->vl_paused_cond, NULL); + return (vm); err: vm_destroy(vm); @@ -233,6 +255,16 @@ return (error); } +int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr, + size_t *lowmem_size, size_t *highmem_size) +{ + *guest_baseaddr = ctx->baseaddr; + *lowmem_size = ctx->lowmem; + *highmem_size = ctx->highmem; + + return 0; +} + int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) @@ -444,6 +476,35 @@ return (NULL); } +vm_paddr_t +vm_rev_map_gpa(struct vmctx *ctx, void *addr) +{ + vm_paddr_t offaddr; + + offaddr = (char *) addr - ctx->baseaddr; + + if (ctx->lowmem > 0) + if (offaddr >= 0 && offaddr <= ctx->lowmem) + return (offaddr); + + if (ctx->highmem > 0) + if (offaddr >= 4*GB && offaddr < 4*GB + ctx->highmem) + return (offaddr); + + return ((vm_paddr_t) -1); +} + +/* TODO: maximum size for vmname */ +int +vm_get_name(struct vmctx *ctx, char *buf, size_t max_len) +{ + if (max_len < strlen(ctx->name)) + return (EINVAL); + + strlcpy(buf, ctx->name, max_len); + return (0); +} + size_t vm_get_lowmem_size(struct vmctx *ctx) { @@ -625,12 +686,29 @@ { int error; struct vm_run vmrun; + struct vcpu_lock *lock; + + lock = &ctx->lock; + + pthread_mutex_lock(&lock->vl_mtx); + while (lock->vl_paused) + pthread_cond_wait(&lock->vl_paused_cond, &lock->vl_mtx); + + lock->vl_running_cnt++; + pthread_mutex_unlock(&lock->vl_mtx); bzero(&vmrun, sizeof(vmrun)); vmrun.cpuid = vcpu; error = ioctl(ctx->fd, VM_RUN, &vmrun); bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); + + pthread_mutex_lock(&lock->vl_mtx); + lock->vl_running_cnt--; + if (lock->vl_running_cnt == 0) + pthread_cond_broadcast(&lock->vl_run_complete_cond); + pthread_mutex_unlock(&lock->vl_mtx); + return (error); } @@ -1504,6 +1582,125 @@ return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); } +void +vm_vcpu_pause(struct vmctx *ctx) +{ + struct vcpu_lock *lock; + struct vm_activate_cpu ac; + + lock = &ctx->lock; + + pthread_mutex_lock(&lock->vl_mtx); + lock->vl_paused = 1; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + ac.vcpuid = -1; + ac.no_debug = 1; + ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); + while (lock->vl_running_cnt != 0) + pthread_cond_wait(&lock->vl_run_complete_cond, &lock->vl_mtx); + pthread_mutex_unlock(&lock->vl_mtx); +} + +void +vm_vcpu_resume(struct vmctx *ctx) +{ + struct vcpu_lock *lock; + + lock = &ctx->lock; + + pthread_mutex_lock(&lock->vl_mtx); + lock->vl_paused = 0; + pthread_cond_broadcast(&lock->vl_paused_cond); + pthread_mutex_unlock(&lock->vl_mtx); +} + +int +vm_snapshot_req(struct vm_snapshot_meta *meta) +{ + int error; + + error = ioctl(meta->ctx->fd, VM_SNAPSHOT_REQ, meta); + if (error != 0) { + fprintf(stderr, "%s: snapshot failed for %s\r\n", + __func__, meta->dev_name); + goto done; + } + +done: + return (error); +} + +static int +vm_mem_read_from_file(int fd, void *dest, size_t file_offset, size_t len) +{ + ssize_t cnt_read = 0; + size_t read_total = 0; + size_t to_read = len; + + if ( lseek(fd, file_offset , SEEK_SET) < 0) { + fprintf(stderr, + "%s: Could not change file offset errno = %d\r\n", + __func__, errno); + return (-1); + } + + while (read_total < len) { + cnt_read = read(fd, dest + read_total, to_read); + /* TODO - fix for when read returns 0 */ + if (cnt_read <= 0) { + fprintf(stderr,"%s: read error: %d\r\n", + __func__, errno); + return (-1); + 
} + read_total += cnt_read; + to_read -= cnt_read; + } + + return (0); +} + +int +vm_restore_mem(struct vmctx *ctx, int vmmem_fd, size_t size) +{ + if (ctx->lowmem + ctx->highmem != size) { + fprintf(stderr, "%s: mem size mismatch: %ld vs %ld\n", + __func__, ctx->lowmem + ctx->highmem, size); + return (-1); + } + + if (vm_mem_read_from_file(vmmem_fd, ctx->baseaddr, + 0, ctx->lowmem) != 0) { + fprintf(stderr, + "%s: Could not read lowmem from file\r\n", __func__); + return (-1); + } + + if (ctx->highmem > 0) { + if (vm_mem_read_from_file(vmmem_fd, ctx->baseaddr + 4*GB, + ctx->lowmem, ctx->highmem) != 0) { + + fprintf(stderr, + "%s: Could not read highmem from file\r\n", + __func__); + return (-1); + } + } + + return (0); +} + +int +vm_restore_time(struct vmctx *ctx) +{ + int error, dummy; + + dummy = 0; + error = ioctl(ctx->fd, VM_RESTORE_TIME, &dummy); + + return (error); +} + int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) Index: sys/amd64/include/vmm.h =================================================================== --- sys/amd64/include/vmm.h +++ sys/amd64/include/vmm.h @@ -34,6 +34,8 @@ #include #include +struct vm_snapshot_meta; + #ifdef _KERNEL SDT_PROVIDER_DECLARE(vmm); #endif @@ -130,6 +132,7 @@ struct vm_object; struct vm_guest_paging; struct pmap; +enum snapshot_req; struct vm_eventinfo { void *rptr; /* rendezvous cookie */ @@ -158,6 +161,10 @@ typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); +typedef int (*vmi_snapshot_t)(void *vmi, struct vm_snapshot_meta *meta); +typedef int (*vmi_snapshot_vmcx_t)(void *vmi, struct vm_snapshot_meta *meta, + int vcpu); +typedef int (*vmi_restore_tsc_t)(void *vmi, int vcpuid, uint64_t now); struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ @@ -177,6 +184,11 @@ vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; + + /* checkpoint operations */ + vmi_snapshot_t vmsnapshot; + vmi_snapshot_vmcx_t vmcx_snapshot; + vmi_restore_tsc_t vm_restore_tsc; }; extern struct vmm_ops vmm_ops_intel; @@ -241,7 +253,7 @@ int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vm *vm, int vcpu); -int vm_suspend_cpu(struct vm *vm, int vcpu); +int vm_suspend_cpu(struct vm *vm, int vcpu, int no_debug); int vm_resume_cpu(struct vm *vm, int vcpu); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); @@ -249,6 +261,9 @@ void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); +int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta); +int vm_restore_time(struct vm *vm); + #ifdef _SYS__CPUSET_H_ /* @@ -386,6 +401,15 @@ int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2); +/* + * Function used to keep track of the guest's TSC offset. The + * offset is used by the virutalization extensions to provide a consistent + * value for the Time Stamp Counter to the guest. + * + * Return value is 0 on success and non-zero on failure. 
+ */ +int vm_set_tsc_offset(struct vm *vm, int vcpu_id, uint64_t offset); + enum vm_reg_name vm_segment_name(int seg_encoding); struct vm_copyinfo { Index: sys/amd64/include/vmm_dev.h =================================================================== --- sys/amd64/include/vmm_dev.h +++ sys/amd64/include/vmm_dev.h @@ -31,6 +31,8 @@ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ +struct vm_snapshot_meta; + #ifdef _KERNEL void vmmdev_init(void); int vmmdev_cleanup(void); @@ -200,6 +202,7 @@ struct vm_activate_cpu { int vcpuid; + int no_debug; }; struct vm_cpuset { @@ -312,6 +315,11 @@ IOCNUM_RTC_WRITE = 101, IOCNUM_RTC_SETTIME = 102, IOCNUM_RTC_GETTIME = 103, + + /* checkpoint */ + IOCNUM_SNAPSHOT_REQ = 113, + + IOCNUM_RESTORE_TIME = 115 }; #define VM_RUN \ @@ -422,4 +430,8 @@ _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time) #define VM_RESTART_INSTRUCTION \ _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) +#define VM_SNAPSHOT_REQ \ + _IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta) +#define VM_RESTORE_TIME \ + _IOWR('v', IOCNUM_RESTORE_TIME, int) #endif Index: sys/amd64/include/vmm_snapshot.h =================================================================== --- /dev/null +++ sys/amd64/include/vmm_snapshot.h @@ -0,0 +1,116 @@ +#ifndef _VMM_SNAPSHOT_ +#define _VMM_SNAPSHOT_ + +#include +#include + +struct vmctx; + +enum snapshot_req { + STRUCT_VMX, + STRUCT_VIOAPIC, + STRUCT_VM, + STRUCT_VLAPIC, + VM_MEM, + STRUCT_VHPET, + STRUCT_VMCX, + STRUCT_VATPIC, + STRUCT_VATPIT, + STRUCT_VPMTMR, + STRUCT_VRTC, +}; + +struct vm_snapshot_buffer { + /* + * R/O for device-specific functions; + * written by generic snapshot functions. + */ + uint8_t *const buf_start; + const size_t buf_size; + + /* + * R/W for device-specific functions used to keep track of buffer + * current position and remaining size. + */ + uint8_t *buf; + size_t buf_rem; + + /* + * Length of the snapshot is either determined as (buf_size - buf_rem) + * or (buf - buf_start) -- the second variation returns a signed value + * so it may not be appropriate. + * + * Use vm_get_snapshot_size(meta). + */ +}; + +enum vm_snapshot_op { + VM_SNAPSHOT_SAVE, + VM_SNAPSHOT_RESTORE, +}; + +struct vm_snapshot_meta { + struct vmctx *ctx; + void *dev_data; + const char *dev_name; /* identify userspace devices */ + enum snapshot_req dev_req; /* identify kernel structs */ + + struct vm_snapshot_buffer buffer; + + enum vm_snapshot_op op; +}; + + +void vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op); +int vm_snapshot_buf(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta); +size_t vm_get_snapshot_size(struct vm_snapshot_meta *meta); +int vm_snapshot_guest2host_addr(void **addrp, size_t len, int restore_null, + struct vm_snapshot_meta *meta); +int vm_snapshot_buf_cmp(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta); + +#define SNAPSHOT_BUF_OR_LEAVE(DATA, LEN, META, RES, LABEL) \ +do { \ + (RES) = vm_snapshot_buf((DATA), (LEN), (META)); \ + if ((RES) != 0) { \ + vm_snapshot_buf_err(#DATA, (META)->op); \ + goto LABEL; \ + } \ +} while (0) + +#define SNAPSHOT_VAR_OR_LEAVE(DATA, META, RES, LABEL) \ + SNAPSHOT_BUF_OR_LEAVE(&(DATA), sizeof(DATA), (META), (RES), LABEL) + +/* + * Address variables are pointers to guest memory. + * + * When RNULL != 0, do not enforce invalid address checks; instead, make the + * pointer NULL at restore time. 
+ */ +#define SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ADDR, LEN, RNULL, META, RES, LABEL) \ +do { \ + (RES) = vm_snapshot_guest2host_addr((void **)&(ADDR), (LEN), (RNULL), \ + (META)); \ + if ((RES) != 0) { \ + if ((RES) == EFAULT) \ + fprintf(stderr, "%s: invalid address: %s\r\n", \ + __func__, #ADDR); \ + goto LABEL; \ + } \ +} while (0) + +/* compare the value in the meta buffer with the data */ +#define SNAPSHOT_BUF_CMP_OR_LEAVE(DATA, LEN, META, RES, LABEL) \ +do { \ + (RES) = vm_snapshot_buf_cmp((DATA), (LEN), (META)); \ + if ((RES) != 0) { \ + vm_snapshot_buf_err(#DATA, (META)->op); \ + goto LABEL; \ + } \ +} while (0) + +#define SNAPSHOT_VAR_CMP_OR_LEAVE(DATA, META, RES, LABEL) \ + SNAPSHOT_BUF_CMP_OR_LEAVE(&(DATA), sizeof(DATA), (META), (RES), LABEL) + +#endif Index: sys/amd64/vmm/amd/svm.h =================================================================== --- sys/amd64/vmm/amd/svm.h +++ sys/amd64/vmm/amd/svm.h @@ -32,6 +32,7 @@ #define _SVM_H_ struct pcpu; +struct svm_softc; /* * Guest register state that is saved outside the VMCB. @@ -66,5 +67,6 @@ }; void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct pcpu *pcpu); +int svm_set_tsc_offset(struct svm_softc *sc, int vcpu, uint64_t offset); #endif /* _SVM_H_ */ Index: sys/amd64/vmm/amd/svm.c =================================================================== --- sys/amd64/vmm/amd/svm.c +++ sys/amd64/vmm/amd/svm.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "vmm_lapic.h" #include "vmm_stat.h" @@ -278,6 +279,23 @@ svm_enable(NULL); } +int +svm_set_tsc_offset(struct svm_softc *sc, int vcpu, uint64_t offset) +{ + int error; + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + ctrl->tsc_offset = offset; + + svm_set_dirty(sc, vcpu, VMCB_CACHE_I); + VCPU_CTR1(sc->vm, vcpu, "tsc offset changed to %#lx", offset); + + error = vm_set_tsc_offset(sc->vm, vcpu, offset); + + return (error); +} + /* Pentium compatible MSRs */ #define MSR_PENTIUM_START 0 #define MSR_PENTIUM_END 0x1FFF @@ -2198,6 +2216,34 @@ return (EINVAL); } +static int +svm_snapshot_reg(void *arg, int vcpu, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = svm_getreg(arg, vcpu, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = svm_setreg(arg, vcpu, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + static int svm_setcap(void *arg, int vcpu, int type, int val) { @@ -2280,6 +2326,304 @@ free(vlapic, M_SVM_VLAPIC); } +static int +svm_snapshot_vmi(void *arg, struct vm_snapshot_meta *meta) +{ + /* struct svm_softc is AMD's representation for SVM softc */ + struct svm_softc *sc; + struct svm_vcpu *vcpu; + struct vmcb *vmcb; + uint64_t val; + int i; + int ret; + + sc = arg; + + KASSERT(sc != NULL, ("%s: arg was NULL", __func__)); + + SNAPSHOT_VAR_OR_LEAVE(sc->nptp, meta, ret, done); + + for (i = 0; i < VM_MAXCPU; i++) { + vcpu = &sc->vcpu[i]; + vmcb = &vcpu->vmcb; + + /* VMCB fields for virtual cpu i */ + SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.v_tpr, meta, ret, done); + val = vmcb->ctrl.v_tpr; + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + vmcb->ctrl.v_tpr = val; + + SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.asid, meta, ret, done); + val = vmcb->ctrl.np_enable; + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + vmcb->ctrl.np_enable = val; + + val = vmcb->ctrl.intr_shadow; + 
SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + vmcb->ctrl.intr_shadow = val; + SNAPSHOT_VAR_OR_LEAVE(vmcb->ctrl.tlb_ctrl, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad1, + sizeof(vmcb->state.pad1), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cpl, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad2, + sizeof(vmcb->state.pad2), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.efer, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad3, + sizeof(vmcb->state.pad3), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr4, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dr7, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dr6, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rflags, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rip, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad4, + sizeof(vmcb->state.pad4), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rsp, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad5, + sizeof(vmcb->state.pad5), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.rax, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.star, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.lstar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cstar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sfmask, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.kernelgsbase, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_cs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_esp, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.sysenter_eip, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.cr2, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad6, + sizeof(vmcb->state.pad6), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.g_pat, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.dbgctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.br_from, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.br_to, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.int_from, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmcb->state.int_to, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vmcb->state.pad7, + sizeof(vmcb->state.pad7), + meta, ret, done); + + /* Snapshot swctx for virtual cpu i */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbp, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rcx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rsi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r8, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r9, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r10, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r11, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r12, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r13, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r14, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r15, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr3, 
meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr6, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_dr7, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.host_debugctl, meta, ret, + done); + + /* Restore other svm_vcpu struct fields */ + + /* Restore NEXTRIP field */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); + + /* Restore lastcpu field */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->lastcpu, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->dirty, meta, ret, done); + + /* Restore EPTGEN field - EPT is Extended Page Tabel */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->eptgen, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.gen, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.num, meta, ret, done); + + /* Set all caches dirty */ + if (meta->op == VM_SNAPSHOT_RESTORE) { + svm_set_dirty(sc, i, VMCB_CACHE_ASID); + svm_set_dirty(sc, i, VMCB_CACHE_IOPM); + svm_set_dirty(sc, i, VMCB_CACHE_I); + svm_set_dirty(sc, i, VMCB_CACHE_TPR); + svm_set_dirty(sc, i, VMCB_CACHE_CR2); + svm_set_dirty(sc, i, VMCB_CACHE_CR); + svm_set_dirty(sc, i, VMCB_CACHE_DT); + svm_set_dirty(sc, i, VMCB_CACHE_SEG); + svm_set_dirty(sc, i, VMCB_CACHE_NP); + } + } + + if (meta->op == VM_SNAPSHOT_RESTORE) + flush_by_asid(); + +done: + return (ret); +} + +static int +svm_snapshot_vmcx(void *arg, struct vm_snapshot_meta *meta, int vcpu) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + int err, running, hostcpu; + + sc = (struct svm_softc *)arg; + err = 0; + + KASSERT(arg != NULL, ("%s: arg was NULL", __func__)); + vmcb = svm_get_vmcb(sc, vcpu); + + running = vcpu_is_running(sc->vm, vcpu, &hostcpu); + if (running && hostcpu !=curcpu) { + printf("%s: %s%d is running", __func__, vm_name(sc->vm), vcpu); + return (EINVAL); + } + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR0, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR2, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR3, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CR4, meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_DR7, meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RAX, meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RSP, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RIP, meta); + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_RFLAGS, meta); + + /* Guest segments */ + /* ES */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_ES, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_ES, meta); + + /* CS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_CS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_CS, meta); + + /* SS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_SS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_SS, meta); + + /* DS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_DS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_DS, meta); + + /* FS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_FS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_FS, meta); + + /* GS */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_GS, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_GS, meta); + + /* TR */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_TR, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_TR, meta); + + /* LDTR */ + err += 
svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_LDTR, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_LDTR, meta); + + /* EFER */ + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_EFER, meta); + + /* IDTR and GDTR */ + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_IDTR, meta); + err += vmcb_snapshot_desc(sc, vcpu, VM_REG_GUEST_GDTR, meta); + + /* Specific AMD registers */ + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_SYSENTER_CS, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_SYSENTER_ESP, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_SYSENTER_EIP, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_NPT_BASE, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_CR_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_DR_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXC_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_INST1_INTERCEPT, 4), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_INST2_INTERCEPT, 4), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_TLB_CTRL, 4), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINFO1, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINFO2, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXITINTINFO, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_VIRQ, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_GUEST_PAT, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_BAR, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_PAGE, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_LT, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_AVIC_PT, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_IO_PERM, 8), meta); + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_MSR_PERM, 8), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_ASID, 4), meta); + + err += vmcb_snapshot_any(sc, vcpu, + VMCB_ACCESS(VMCB_OFF_EXIT_REASON, 8), meta); + + err += svm_snapshot_reg(sc, vcpu, VM_REG_GUEST_INTR_SHADOW, meta); + + return (err); +} + +static int +svm_restore_tsc(void *arg, int vcpu, uint64_t offset) +{ + int err; + + err = svm_set_tsc_offset(arg, vcpu, offset); + + return (err); +} + struct vmm_ops vmm_ops_amd = { svm_init, svm_cleanup, @@ -2296,5 +2640,8 @@ svm_npt_alloc, svm_npt_free, svm_vlapic_init, - svm_vlapic_cleanup + svm_vlapic_cleanup, + svm_snapshot_vmi, + svm_snapshot_vmcx, + svm_restore_tsc, }; Index: sys/amd64/vmm/amd/svm_msr.c =================================================================== --- sys/amd64/vmm/amd/svm_msr.c +++ sys/amd64/vmm/amd/svm_msr.c @@ -162,6 +162,8 @@ * Ignore writes to microcode update register. 
*/ break; + case MSR_TSC: + error = svm_set_tsc_offset(sc, vcpu, val - rdtsc()); case MSR_EXTFEATURES: break; default: Index: sys/amd64/vmm/amd/vmcb.h =================================================================== --- sys/amd64/vmm/amd/vmcb.h +++ sys/amd64/vmm/amd/vmcb.h @@ -209,6 +209,9 @@ #define VMCB_ACCESS_OFFSET(v) ((v) & 0xFFF) #ifdef _KERNEL + +struct vm_snapshot_meta; + /* VMCB save state area segment format */ struct vmcb_segment { uint16_t selector; @@ -331,6 +334,12 @@ int vmcb_setdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); int vmcb_getdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg); +int vmcb_getany(struct svm_softc *sc, int vcpu, int ident, uint64_t *val); +int vmcb_setany(struct svm_softc *sc, int vcpu, int ident, uint64_t val); +int vmcb_snapshot_desc(void *arg, int vcpu, int reg, + struct vm_snapshot_meta *meta); +int vmcb_snapshot_any(struct svm_softc *sc, int vcpu, int ident, + struct vm_snapshot_meta *meta); #endif /* _KERNEL */ #endif /* _VMCB_H_ */ Index: sys/amd64/vmm/amd/vmcb.c =================================================================== --- sys/amd64/vmm/amd/vmcb.c +++ sys/amd64/vmm/amd/vmcb.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "vmm_ktr.h" @@ -452,3 +453,104 @@ return (0); } + +int +vmcb_getany(struct svm_softc *sc, int vcpu, int ident, uint64_t *val) +{ + int error = 0; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto err; + } + + if (ident >= VM_REG_LAST) { + error = EINVAL; + goto err; + } + + error = vm_get_register(sc->vm, vcpu, ident, val); + +err: + return (error); +} + +int +vmcb_setany(struct svm_softc *sc, int vcpu, int ident, uint64_t val) +{ + int error = 0; + + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + error = EINVAL; + goto err; + } + + if (ident >= VM_REG_LAST) { + error = EINVAL; + goto err; + } + + error = vm_set_register(sc->vm, vcpu, ident, val); + +err: + return (error); +} + +int +vmcb_snapshot_desc(void *arg, int vcpu, int reg, struct vm_snapshot_meta *meta) +{ + int ret; + struct seg_desc desc; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcb_getdesc(arg, vcpu, reg, &desc); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + + ret = vmcb_setdesc(arg, vcpu, reg, &desc); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + +int +vmcb_snapshot_any(struct svm_softc *sc, int vcpu, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcb_getany(sc, vcpu, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = vmcb_setany(sc, vcpu, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} Index: sys/amd64/vmm/intel/vmcs.h =================================================================== --- sys/amd64/vmm/intel/vmcs.h +++ sys/amd64/vmm/intel/vmcs.h @@ -32,6 +32,9 @@ #define _VMCS_H_ #ifdef _KERNEL + +struct vm_snapshot_meta; + struct 
vmcs { uint32_t identifier; uint32_t abort_code; @@ -55,6 +58,14 @@ struct seg_desc *desc); int vmcs_setdesc(struct vmcs *vmcs, int running, int ident, struct seg_desc *desc); +int vmcs_getany(struct vmcs *vmcs, int running, int ident, uint64_t *val); +int vmcs_setany(struct vmcs *vmcs, int running, int ident, uint64_t val); +int vmcs_snapshot_reg(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta); +int vmcs_snapshot_desc(struct vmcs *vmcs, int running, int seg, + struct vm_snapshot_meta *meta); +int vmcs_snapshot_any(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta); /* * Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h Index: sys/amd64/vmm/intel/vmcs.c =================================================================== --- sys/amd64/vmm/intel/vmcs.c +++ sys/amd64/vmm/intel/vmcs.c @@ -43,6 +43,7 @@ #include #include +#include #include "vmm_host.h" #include "vmx_cpufunc.h" #include "vmcs.h" @@ -428,6 +429,126 @@ return (error); } +int +vmcs_getany(struct vmcs *vmcs, int running, int ident, uint64_t *val) +{ + int error; + + if (!running) + VMPTRLD(vmcs); + + error = vmread(ident, val); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_setany(struct vmcs *vmcs, int running, int ident, uint64_t val) +{ + int error; + + if (!running) + VMPTRLD(vmcs); + + error = vmwrite(ident, val); + + if (!running) + VMCLEAR(vmcs); + + return (error); +} + +int +vmcs_snapshot_reg(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcs_getreg(vmcs, running, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = vmcs_setreg(vmcs, running, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + +int +vmcs_snapshot_desc(struct vmcs *vmcs, int running, int seg, + struct vm_snapshot_meta *meta) +{ + int ret; + struct seg_desc desc; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcs_getdesc(vmcs, running, seg, &desc); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done); + + ret = vmcs_setdesc(vmcs, running, seg, &desc); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + +int +vmcs_snapshot_any(struct vmcs *vmcs, int running, int ident, + struct vm_snapshot_meta *meta) +{ + int ret; + uint64_t val; + + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = vmcs_getany(vmcs, running, ident, &val); + if (ret != 0) + goto done; + + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done); + + ret = vmcs_setany(vmcs, running, ident, val); + if (ret != 0) + goto done; + } else { + ret = EINVAL; + goto done; + } + +done: + return (ret); +} + #ifdef DDB extern int vmxon_enabled[]; Index: sys/amd64/vmm/intel/vmx.c =================================================================== --- sys/amd64/vmm/intel/vmx.c +++ sys/amd64/vmm/intel/vmx.c @@ -56,6 +56,8 @@ 
#include #include #include +#include + #include "vmm_lapic.h" #include "vmm_host.h" #include "vmm_ioport.h" @@ -288,6 +290,7 @@ static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); static void vmx_inject_pir(struct vlapic *vlapic); +static int vmx_restore_tsc(void *arg, int vcpu, uint64_t now); #ifdef KTR static const char * @@ -1279,7 +1282,12 @@ } error = vmwrite(VMCS_TSC_OFFSET, offset); + if (error != 0) + goto done; + error = vm_set_tsc_offset(vmx->vm, vcpu, offset); + +done: return (error); } @@ -3785,6 +3793,151 @@ free(vlapic, M_VLAPIC); } +static int +vmx_snapshot_vmi(void *arg, struct vm_snapshot_meta *meta) +{ + struct vmx *vmx; + struct vmxctx *vmxctx; + int i; + int ret; + + vmx = arg; + + KASSERT(vmx != NULL, ("%s: arg was NULL", __func__)); + + for (i = 0; i < VM_MAXCPU; i++) { + SNAPSHOT_BUF_OR_LEAVE(vmx->guest_msrs[i], + sizeof(vmx->guest_msrs[i]), meta, ret, done); + + vmxctx = &vmx->ctx[i]; + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rsi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rdx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rcx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r8, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r9, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rax, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_rbp, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r10, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r11, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r12, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r13, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r14, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_r15, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_cr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vmxctx->guest_dr6, meta, ret, done); + } + +done: + return (ret); +} + +static int +vmx_snapshot_vmcx(void *arg, struct vm_snapshot_meta *meta, int vcpu) +{ + struct vmcs *vmcs; + struct vmx *vmx; + int err, run, hostcpu; + + vmx = (struct vmx *)arg; + err = 0; + + KASSERT(arg != NULL, ("%s: arg was NULL", __func__)); + vmcs = &vmx->vmcs[vcpu]; + + run = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (run && hostcpu != curcpu) { + printf("%s: %s%d is running", __func__, vm_name(vmx->vm), vcpu); + return (EINVAL); + } + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR0, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR3, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CR4, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DR7, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RSP, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RIP, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_RFLAGS, meta); + + /* Guest segments */ + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_ES, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_ES, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_CS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_CS, meta); + + err += vmcs_snapshot_reg(vmcs, run, 
VM_REG_GUEST_SS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_SS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_DS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_DS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_FS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_FS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_GS, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GS, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_TR, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_TR, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_LDTR, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_LDTR, meta); + + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_EFER, meta); + + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_IDTR, meta); + err += vmcs_snapshot_desc(vmcs, run, VM_REG_GUEST_GDTR, meta); + + /* Guest page tables */ + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE0, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE1, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE2, meta); + err += vmcs_snapshot_reg(vmcs, run, VM_REG_GUEST_PDPTE3, meta); + + /* Other guest state */ + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_CS, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_ESP, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_IA32_SYSENTER_EIP, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_INTERRUPTIBILITY, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_GUEST_ACTIVITY, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_ENTRY_CTLS, meta); + err += vmcs_snapshot_any(vmcs, run, VMCS_EXIT_CTLS, meta); + + return (err); +} + +static int +vmx_restore_tsc(void *arg, int vcpu, uint64_t offset) +{ + struct vmcs *vmcs; + struct vmx *vmx = (struct vmx *)arg; + int error, running, hostcpu; + + KASSERT(arg != NULL, ("%s: arg was NULL", __func__)); + vmcs = &vmx->vmcs[vcpu]; + + running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); + if (running && hostcpu != curcpu) { + printf("%s: %s%d is running", __func__, vm_name(vmx->vm), vcpu); + return (EINVAL); + } + + if (!running) + VMPTRLD(vmcs); + + error = vmx_set_tsc_offset(vmx, vcpu, offset); + + if (!running) + VMCLEAR(vmcs); + return (error); +} + struct vmm_ops vmm_ops_intel = { vmx_init, vmx_cleanup, @@ -3802,4 +3955,7 @@ ept_vmspace_free, vmx_vlapic_init, vmx_vlapic_cleanup, + vmx_snapshot_vmi, + vmx_snapshot_vmcx, + vmx_restore_tsc, }; Index: sys/amd64/vmm/io/vatpic.h =================================================================== --- sys/amd64/vmm/io/vatpic.h +++ sys/amd64/vmm/io/vatpic.h @@ -36,6 +36,8 @@ #define IO_ELCR1 0x4d0 #define IO_ELCR2 0x4d1 +struct vm_snapshot_meta; + struct vatpic *vatpic_init(struct vm *vm); void vatpic_cleanup(struct vatpic *vatpic); @@ -54,4 +56,6 @@ void vatpic_pending_intr(struct vm *vm, int *vecptr); void vatpic_intr_accepted(struct vm *vm, int vector); +int vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta); + #endif /* _VATPIC_H_ */ Index: sys/amd64/vmm/io/vatpic.c =================================================================== --- sys/amd64/vmm/io/vatpic.c +++ sys/amd64/vmm/io/vatpic.c @@ -42,6 +42,7 @@ #include #include +#include #include "vmm_ktr.h" #include "vmm_lapic.h" @@ -808,3 +809,41 @@ { free(vatpic, M_VATPIC); } + +int +vatpic_snapshot(struct vatpic *vatpic, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + struct atpic *atpic; + + for (i = 0; i < 
nitems(vatpic->atpic); i++) { + atpic = &vatpic->atpic[i]; + + SNAPSHOT_VAR_OR_LEAVE(atpic->ready, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->icw_num, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->rd_cmd_reg, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(atpic->aeoi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->poll, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->rotate, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->sfn, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->irq_base, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->request, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->service, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->mask, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->smm, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(atpic->acnt, sizeof(atpic->acnt), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->lowprio, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atpic->intr_raised, meta, ret, done); + + } + + SNAPSHOT_BUF_OR_LEAVE(vatpic->elc, sizeof(vatpic->elc), + meta, ret, done); + +done: + return (ret); +} Index: sys/amd64/vmm/io/vatpit.h =================================================================== --- sys/amd64/vmm/io/vatpit.h +++ sys/amd64/vmm/io/vatpit.h @@ -36,6 +36,8 @@ #define NMISC_PORT 0x61 +struct vm_snapshot_meta; + struct vatpit *vatpit_init(struct vm *vm); void vatpit_cleanup(struct vatpit *vatpit); @@ -43,5 +45,6 @@ uint32_t *eax); int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); +int vatpit_snapshot(struct vatpit *vatpit, struct vm_snapshot_meta *meta); #endif /* _VATPIT_H_ */ Index: sys/amd64/vmm/io/vatpit.c =================================================================== --- sys/amd64/vmm/io/vatpit.c +++ sys/amd64/vmm/io/vatpit.c @@ -38,6 +38,7 @@ #include #include +#include #include "vmm_ktr.h" #include "vatpic.h" @@ -455,3 +456,36 @@ free(vatpit, M_VATPIT); } + +int +vatpit_snapshot(struct vatpit *vatpit, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + struct channel *channel; + + SNAPSHOT_VAR_OR_LEAVE(vatpit->freq_sbt, meta, ret, done); + + /* properly restore timers; they will NOT work currently */ + printf("%s: snapshot restore does not reset timers!\r\n", __func__); + + for (i = 0; i < nitems(vatpit->channel); i++) { + channel = &vatpit->channel[i]; + + SNAPSHOT_VAR_OR_LEAVE(channel->mode, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->initial, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->now_sbt, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(channel->cr, sizeof(channel->cr), + meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(channel->ol, sizeof(channel->ol), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->slatched, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->crbyte, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->frbyte, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(channel->callout_sbt, meta, ret, done); + } + +done: + return (ret); +} Index: sys/amd64/vmm/io/vhpet.h =================================================================== --- sys/amd64/vmm/io/vhpet.h +++ sys/amd64/vmm/io/vhpet.h @@ -35,6 +35,8 @@ #define VHPET_BASE 0xfed00000 #define VHPET_SIZE 1024 +struct vm_snapshot_meta; + struct vhpet *vhpet_init(struct vm *vm); void vhpet_cleanup(struct vhpet *vhpet); int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, @@ -42,5 +44,7 @@ int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val, int size, void *arg); int vhpet_getcap(struct vm_hpet_cap *cap); +int 
vhpet_snapshot(struct vhpet *vhpet, struct vm_snapshot_meta *meta); +int vhpet_restore_time(struct vhpet *vhpet); #endif /* _VHPET_H_ */ Index: sys/amd64/vmm/io/vhpet.c =================================================================== --- sys/amd64/vmm/io/vhpet.c +++ sys/amd64/vmm/io/vhpet.c @@ -43,6 +43,7 @@ #include #include +#include #include "vmm_lapic.h" #include "vatpic.h" @@ -761,3 +762,47 @@ cap->capabilities = vhpet_capabilities(); return (0); } + +int +vhpet_snapshot(struct vhpet *vhpet, struct vm_snapshot_meta *meta) +{ + int i, ret; + uint32_t countbase; + + SNAPSHOT_VAR_OR_LEAVE(vhpet->freq_sbt, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->isr, meta, ret, done); + + /* at restore time the countbase should have the value it had when the + * snapshot was created; since the value is not directly kept in + * vhpet->countbase, but rather computed relative to the current system + * uptime using countbase_sbt, save the value retured by vhpet_counter + */ + if (meta->op == VM_SNAPSHOT_SAVE) + countbase = vhpet_counter(vhpet, NULL); + SNAPSHOT_VAR_OR_LEAVE(countbase, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + vhpet->countbase = countbase; + + for (i = 0; i < nitems(vhpet->timer); i++) { + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].cap_config, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].msireg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].compval, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].comprate, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vhpet->timer[i].callout_sbt, + meta, ret, done); + } + +done: + return (ret); +} + +int +vhpet_restore_time(struct vhpet *vhpet) +{ + if (vhpet_counter_enabled(vhpet)) + vhpet_start_counting(vhpet); + + return (0); +} Index: sys/amd64/vmm/io/vioapic.h =================================================================== --- sys/amd64/vmm/io/vioapic.h +++ sys/amd64/vmm/io/vioapic.h @@ -32,6 +32,8 @@ #ifndef _VIOAPIC_H_ #define _VIOAPIC_H_ +struct vm_snapshot_meta; + #define VIOAPIC_BASE 0xFEC00000 #define VIOAPIC_SIZE 4096 @@ -49,4 +51,6 @@ int vioapic_pincount(struct vm *vm); void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector); +int vioapic_snapshot(struct vioapic *vioapic, + struct vm_snapshot_meta *meta); #endif Index: sys/amd64/vmm/io/vioapic.c =================================================================== --- sys/amd64/vmm/io/vioapic.c +++ sys/amd64/vmm/io/vioapic.c @@ -42,6 +42,7 @@ #include #include +#include #include "vmm_ktr.h" #include "vmm_lapic.h" @@ -499,3 +500,20 @@ return (REDIR_ENTRIES); } + +int +vioapic_snapshot(struct vioapic *vioapic, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + + SNAPSHOT_VAR_OR_LEAVE(vioapic->ioregsel, meta, ret, done); + + for (i = 0; i < nitems(vioapic->rtbl); i++) { + SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].reg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vioapic->rtbl[i].acnt, meta, ret, done); + } + +done: + return (ret); +} Index: sys/amd64/vmm/io/vlapic.h =================================================================== --- sys/amd64/vmm/io/vlapic.h +++ sys/amd64/vmm/io/vlapic.h @@ -32,6 +32,7 @@ #define _VLAPIC_H_ struct vm; +struct vm_snapshot_meta; enum x2apic_state; int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, @@ -108,4 +109,8 @@ void vlapic_dcr_write_handler(struct vlapic *vlapic); void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val); + +int 
vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta); +int vlapic_lapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta); + #endif /* _VLAPIC_H_ */ Index: sys/amd64/vmm/io/vlapic.c =================================================================== --- sys/amd64/vmm/io/vlapic.c +++ sys/amd64/vmm/io/vlapic.c @@ -46,6 +46,7 @@ #include #include +#include #include "vmm_lapic.h" #include "vmm_ktr.h" @@ -1654,3 +1655,104 @@ VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); vlapic_set_tmr(vlapic, vector, true); } + +static void +vlapic_reset_callout(struct vlapic *vlapic, uint32_t ccr) +{ + /* The implementation is similar to the one in the + * `vlapic_icrtmr_write_handler` function + */ + sbintime_t sbt; + struct bintime bt; + + VLAPIC_TIMER_LOCK(vlapic); + + bt = vlapic->timer_freq_bt; + bintime_mul(&bt, ccr); + + if (ccr != 0) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, &bt); + + sbt = bttosbt(bt); + callout_reset_sbt(&vlapic->callout, sbt, 0, + vlapic_callout_handler, vlapic, 0); + } else { + /* even if the CCR was 0, periodic timers should be reset */ + if (vlapic_periodic_timer(vlapic)) { + binuptime(&vlapic->timer_fire_bt); + bintime_add(&vlapic->timer_fire_bt, + &vlapic->timer_period_bt); + sbt = bttosbt(vlapic->timer_period_bt); + + callout_stop(&vlapic->callout); + callout_reset_sbt(&vlapic->callout, sbt, 0, + vlapic_callout_handler, vlapic, 0); + } + } + + VLAPIC_TIMER_UNLOCK(vlapic); +} + +int +vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int i, ret; + struct vlapic *vlapic; + struct LAPIC *lapic; + uint32_t ccr; + + KASSERT(vm != NULL, ("%s: arg was NULL", __func__)); + + ret = 0; + + for (i = 0; i < VM_MAXCPU; i++) { + vlapic = vm_lapic(vm, i); + + /* snapshot the page first; timer period depends on icr_timer */ + lapic = vlapic->apic_page; + SNAPSHOT_BUF_OR_LEAVE(lapic, PAGE_SIZE, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vlapic->esr_pending, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.sec, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vlapic->timer_freq_bt.frac, + meta, ret, done); + + /* + * Timer period is equal to 'icr_timer' ticks at a frequency of + * 'timer_freq_bt'. 
+ */ + if (meta->op == VM_SNAPSHOT_RESTORE) { + vlapic->timer_period_bt = vlapic->timer_freq_bt; + bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); + } + + SNAPSHOT_BUF_OR_LEAVE(vlapic->isrvec_stk, + sizeof(vlapic->isrvec_stk), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vlapic->isrvec_stk_top, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vlapic->boot_state, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(vlapic->lvt_last, + sizeof(vlapic->lvt_last), + meta, ret, done); + + if (meta->op == VM_SNAPSHOT_SAVE) + ccr = vlapic_get_ccr(vlapic); + + SNAPSHOT_VAR_OR_LEAVE(ccr, meta, ret, done); + + if (meta->op == VM_SNAPSHOT_RESTORE) { + /* Reset the value of the 'timer_fire_bt' and the vlapic + * callout based on the value of the current count + * register saved when the VM snapshot was created + */ + vlapic_reset_callout(vlapic, ccr); + } + } + +done: + return (ret); +} Index: sys/amd64/vmm/io/vpmtmr.h =================================================================== --- sys/amd64/vmm/io/vpmtmr.h +++ sys/amd64/vmm/io/vpmtmr.h @@ -34,6 +34,7 @@ #define IO_PMTMR 0x408 struct vpmtmr; +struct vm_snapshot_meta; struct vpmtmr *vpmtmr_init(struct vm *vm); void vpmtmr_cleanup(struct vpmtmr *pmtmr); @@ -41,4 +42,6 @@ int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val); +int vpmtmr_snapshot(struct vpmtmr *vpmtmr, struct vm_snapshot_meta *meta); + #endif Index: sys/amd64/vmm/io/vpmtmr.c =================================================================== --- sys/amd64/vmm/io/vpmtmr.c +++ sys/amd64/vmm/io/vpmtmr.c @@ -36,6 +36,7 @@ #include #include +#include #include "vpmtmr.h" @@ -103,3 +104,14 @@ return (0); } + +int +vpmtmr_snapshot(struct vpmtmr *vpmtmr, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(vpmtmr->baseval, meta, ret, done); + +done: + return (ret); +} Index: sys/amd64/vmm/io/vrtc.h =================================================================== --- sys/amd64/vmm/io/vrtc.h +++ sys/amd64/vmm/io/vrtc.h @@ -34,6 +34,7 @@ #include struct vrtc; +struct vm_snapshot_meta; struct vrtc *vrtc_init(struct vm *vm); void vrtc_cleanup(struct vrtc *vrtc); @@ -49,4 +50,6 @@ int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *val); +int vrtc_snapshot(struct vrtc *vrtc, struct vm_snapshot_meta *meta); + #endif Index: sys/amd64/vmm/io/vrtc.c =================================================================== --- sys/amd64/vmm/io/vrtc.c +++ sys/amd64/vmm/io/vrtc.c @@ -40,6 +40,7 @@ #include #include +#include #include @@ -1019,3 +1020,43 @@ callout_drain(&vrtc->callout); free(vrtc, M_VRTC); } + +int +vrtc_snapshot(struct vrtc *vrtc, struct vm_snapshot_meta *meta) +{ + int ret; + + VRTC_LOCK(vrtc); + + SNAPSHOT_VAR_OR_LEAVE(vrtc->addr, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + vrtc->base_uptime = sbinuptime(); + SNAPSHOT_VAR_OR_LEAVE(vrtc->base_rtctime, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.min, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_min, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.hour, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.alarm_hour, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_week, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.day_of_month, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.month, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.year, meta, 
ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_a, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_b, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_c, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.reg_d, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram, sizeof(vrtc->rtcdev.nvram), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vrtc->rtcdev.century, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(vrtc->rtcdev.nvram2, sizeof(vrtc->rtcdev.nvram2), + meta, ret, done); + + vrtc_callout_reset(vrtc, vrtc_freq(vrtc)); + + VRTC_UNLOCK(vrtc); + +done: + return (ret); +} Index: sys/amd64/vmm/vmm.c =================================================================== --- sys/amd64/vmm/vmm.c +++ sys/amd64/vmm/vmm.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include @@ -53,6 +53,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include @@ -64,6 +69,7 @@ #include #include #include +#include #include "vmm_ioport.h" #include "vmm_ktr.h" @@ -111,6 +117,7 @@ void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ uint64_t nextrip; /* (x) next instruction to execute */ + uint64_t tsc_offset; /* (o) TSC offsetting */ }; #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) @@ -204,6 +211,12 @@ (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL) #define VLAPIC_CLEANUP(vmi, vlapic) \ (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL) +#define VM_SNAPSHOT_VMI(vmi, meta) \ + (ops != NULL ? (*ops->vmsnapshot)(vmi, meta) : ENXIO) +#define VM_SNAPSHOT_VMCX(vmi, meta, vcpuid) \ + (ops != NULL ? (*ops->vmcx_snapshot)(vmi, meta, vcpuid) : ENXIO) +#define VM_RESTORE_TSC(vmi, vcpuid, offset) \ + (ops != NULL ? 
(*ops->vm_restore_tsc)(vmi, vcpuid, offset) : ENXIO) #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() @@ -289,6 +302,7 @@ vcpu->hostcpu = NOCPU; vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); + vcpu->tsc_offset = 0; } vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); @@ -2329,7 +2343,7 @@ } int -vm_suspend_cpu(struct vm *vm, int vcpuid) +vm_suspend_cpu(struct vm *vm, int vcpuid, int no_debug) { int i; @@ -2337,7 +2351,8 @@ return (EINVAL); if (vcpuid == -1) { - vm->debug_cpus = vm->active_cpus; + if (no_debug == 0) + vm->debug_cpus = vm->active_cpus; for (i = 0; i < VM_MAXCPU; i++) { if (CPU_ISSET(i, &vm->active_cpus)) vcpu_notify_event(vm, i, false); @@ -2346,7 +2361,8 @@ if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); - CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); + if (no_debug == 0) + CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); vcpu_notify_event(vm, vcpuid, false); } return (0); @@ -2709,3 +2725,175 @@ VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); + +static int +vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + struct vcpu *vcpu; + + for (i = 0; i < VM_MAXCPU; i++) { + vcpu = &vm->vcpu[i]; + + SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done); + /* XXX we're cheating here, since the value of tsc_offset as + * saved here is actually the value of the guest's TSC value. + * + * It will be turned turned back into an actual offset when the + * TSC restore function is called + */ + SNAPSHOT_VAR_OR_LEAVE(vcpu->tsc_offset, meta, ret, done); + } + +done: + return (ret); +} + +static int +vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int ret; + int i; + uint64_t now; + + ret = 0; + now = rdtsc(); + + if (meta->op == VM_SNAPSHOT_SAVE) { + /* XXX make tsc_offset take the value TSC proper as seen by the + * guest + */ + for (i = 0; i < VM_MAXCPU; i++) + vm->vcpu[i].tsc_offset += now; + } + + ret = vm_snapshot_vcpus(vm, meta); + if (ret != 0) { + printf("%s: failed to copy vm data to user buffer", __func__); + goto done; + } + + if (meta->op == VM_SNAPSHOT_SAVE) { + /* XXX turn tsc_offset back into an offset; actual value is only + * required for restore; using it otherwise would be wrong + */ + for (i = 0; i < VM_MAXCPU; i++) + vm->vcpu[i].tsc_offset -= now; + } + +done: + return (ret); +} + +static int +vm_snapshot_vmcx(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int i, error; + + error = 0; + + for (i = 0; i < VM_MAXCPU; i++) { + error = VM_SNAPSHOT_VMCX(vm->cookie, meta, i); + if (error != 0) { + printf("%s: failed to snapshot vmcs/vmcb data for " + "vCPU: %d; error: %d\n", __func__, i, error); + goto done; + } + } + +done: + return (error); +} + +/* + * Save kernel-side structures to user-space for snapshotting. 
+ */ +int +vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta) +{ + int ret = 0; + + switch (meta->dev_req) { + case STRUCT_VMX: + ret = VM_SNAPSHOT_VMI(vm->cookie, meta); + break; + case STRUCT_VMCX: + ret = vm_snapshot_vmcx(vm, meta); + break; + case STRUCT_VM: + ret = vm_snapshot_vm(vm, meta); + break; + case STRUCT_VIOAPIC: + ret = vioapic_snapshot(vm_ioapic(vm), meta); + break; + case STRUCT_VLAPIC: + ret = vlapic_snapshot(vm, meta); + break; + case STRUCT_VHPET: + ret = vhpet_snapshot(vm_hpet(vm), meta); + break; + case STRUCT_VATPIC: + ret = vatpic_snapshot(vm_atpic(vm), meta); + break; + case STRUCT_VATPIT: + ret = vatpit_snapshot(vm_atpit(vm), meta); + break; + case STRUCT_VPMTMR: + ret = vpmtmr_snapshot(vm_pmtmr(vm), meta); + break; + case STRUCT_VRTC: + ret = vrtc_snapshot(vm_rtc(vm), meta); + break; + default: + printf("%s: failed to find the requested type %#x\n", + __func__, meta->dev_req); + ret = (EINVAL); + } + return (ret); +} + +int +vm_set_tsc_offset(struct vm *vm, int vcpuid, uint64_t offset) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + vcpu->tsc_offset = offset; + + return (0); +} + +int +vm_restore_time(struct vm *vm) +{ + int error, i; + uint64_t now; + struct vcpu *vcpu; + + now = rdtsc(); + + error = vhpet_restore_time(vm_hpet(vm)); + if (error) + return (error); + + for (i = 0; i < nitems(vm->vcpu); i++) { + vcpu = &vm->vcpu[i]; + + error = VM_RESTORE_TSC(vm->cookie, i, vcpu->tsc_offset - now); + if (error) + return (error); + } + + return (0); +} Index: sys/amd64/vmm/vmm_dev.c =================================================================== --- sys/amd64/vmm/vmm_dev.c +++ sys/amd64/vmm/vmm_dev.c @@ -53,8 +53,9 @@ #include #include -#include #include +#include +#include #include "vmm_lapic.h" #include "vmm_stat.h" @@ -369,6 +370,7 @@ struct vm_cpu_topology *topology; uint64_t *regvals; int *regnums; + struct vm_snapshot_meta *snapshot_meta; error = vmm_priv_check(curthread->td_ucred); if (error) @@ -724,7 +726,7 @@ break; case VM_SUSPEND_CPU: vac = (struct vm_activate_cpu *)data; - error = vm_suspend_cpu(sc->vm, vac->vcpuid); + error = vm_suspend_cpu(sc->vm, vac->vcpuid, vac->no_debug); break; case VM_RESUME_CPU: vac = (struct vm_activate_cpu *)data; @@ -772,6 +774,13 @@ &topology->threads, &topology->maxcpus); error = 0; break; + case VM_SNAPSHOT_REQ: + snapshot_meta = (struct vm_snapshot_meta *)data; + error = vm_snapshot_req(sc->vm, snapshot_meta); + break; + case VM_RESTORE_TIME: + error = vm_restore_time(sc->vm); + break; default: error = ENOTTY; break; Index: sys/amd64/vmm/vmm_snapshot.c =================================================================== --- /dev/null +++ sys/amd64/vmm/vmm_snapshot.c @@ -0,0 +1,111 @@ +#include +#include + +#include + +void +vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op) +{ + const char *opstr; + + if (op == VM_SNAPSHOT_SAVE) + opstr = "save"; + else if (op == VM_SNAPSHOT_RESTORE) + opstr = "restore"; + else + opstr = "unknown"; + + printf("%s: snapshot-%s failed for %s\r\n", __func__, opstr, bufname); +} + +int +vm_snapshot_buf(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op; + void *nv_data; + + nv_data = __DEVOLATILE(void *, data); + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + printf("%s: buffer too small\r\n", __func__); + return (E2BIG); + } + + if (op == VM_SNAPSHOT_SAVE) + copyout(nv_data, 
buffer->buf, data_size); + else if (op == VM_SNAPSHOT_RESTORE) + copyin(buffer->buf, nv_data, data_size); + else + return (EINVAL); + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + + return (0); +} + +size_t +vm_get_snapshot_size(struct vm_snapshot_meta *meta) +{ + size_t length; + struct vm_snapshot_buffer *buffer; + + buffer = &meta->buffer; + + if (buffer->buf_size < buffer->buf_rem) { + printf("%s: Invalid buffer: size = %zu, rem = %zu\r\n", + __func__, buffer->buf_size, buffer->buf_rem); + length = 0; + } else { + length = buffer->buf_size - buffer->buf_rem; + } + + return (length); +} + +int +vm_snapshot_guest2host_addr(void **addrp, size_t len, int restore_null, + struct vm_snapshot_meta *meta) +{ + /* The kernel devices/structures should not map guest memory */ + return (0); +} + +int +vm_snapshot_buf_cmp(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op; + int ret; + void *_data = *(void **)(void *)&data; + + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + printf("%s: buffer too small\r\n", __func__); + ret = E2BIG; + goto done; + } + + if (op == VM_SNAPSHOT_SAVE) { + ret = 0; + copyout(_data, buffer->buf, data_size); + } else if (op == VM_SNAPSHOT_RESTORE) { + ret = memcmp(_data, buffer->buf, data_size); + } else { + ret = EINVAL; + goto done; + } + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + +done: + return (ret); +} Index: sys/modules/vmm/Makefile =================================================================== --- sys/modules/vmm/Makefile +++ sys/modules/vmm/Makefile @@ -18,6 +18,7 @@ vmm_dev.c \ vmm_host.c \ vmm_instruction_emul.c \ + vmm_snapshot.c \ vmm_ioport.c \ vmm_lapic.c \ vmm_mem.c \ Index: usr.sbin/bhyve/Makefile =================================================================== --- usr.sbin/bhyve/Makefile +++ usr.sbin/bhyve/Makefile @@ -57,6 +57,7 @@ rfb.c \ rtc.c \ smbiostbl.c \ + snapshot.c \ sockstream.c \ task_switch.c \ uart_emul.c \ @@ -71,7 +72,7 @@ .PATH: ${BHYVE_SYSDIR}/sys/amd64/vmm SRCS+= vmm_instruction_emul.c -LIBADD= vmmapi md pthread z util sbuf cam +LIBADD= vmmapi md pthread z util sbuf cam ucl xo .if ${MK_INET_SUPPORT} != "no" CFLAGS+=-DINET @@ -92,6 +93,10 @@ .ifdef GDB_LOG CFLAGS+=-DGDB_LOG .endif +CFLAGS+= -I${SRCTOP}/contrib/libucl/include + +# Temporary disable capsicum, until we integrate checkpoint code with it. 
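
The SNAPSHOT_VAR_OR_LEAVE() and SNAPSHOT_BUF_OR_LEAVE() macros used throughout the patch are defined in a new snapshot header that is not part of these hunks. Judging from how they are combined with vm_snapshot_buf() and vm_snapshot_buf_err() above, they presumably expand to something like the sketch below; the exact definitions and error reporting are assumptions, not the patch's code.

	/*
	 * Hypothetical shape of the snapshot helper macros: serialize one
	 * variable (or a caller-sized buffer) through the sequential snapshot
	 * buffer and jump to the caller-supplied label on failure.
	 */
	#define	SNAPSHOT_VAR_OR_LEAVE(DATA, META, RES, LABEL)		\
	do {								\
		(RES) = vm_snapshot_buf(&(DATA), sizeof(DATA), (META));	\
		if ((RES) != 0) {					\
			vm_snapshot_buf_err(#DATA, (META)->op);		\
			goto LABEL;					\
		}							\
	} while (0)

	#define	SNAPSHOT_BUF_OR_LEAVE(DATA, LEN, META, RES, LABEL)	\
	do {								\
		(RES) = vm_snapshot_buf((DATA), (LEN), (META));		\
		if ((RES) != 0) {					\
			vm_snapshot_buf_err(#DATA, (META)->op);		\
			goto LABEL;					\
		}							\
	} while (0)

On VM_SNAPSHOT_SAVE each call appends the data to meta->buffer; on VM_SNAPSHOT_RESTORE it reads the data back from the same position, which is why every consumer has to snapshot its fields in a fixed order.
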
+CFLAGS+= -DWITHOUT_CAPSICUM WARNS?= 2 Index: usr.sbin/bhyve/Makefile.depend =================================================================== --- usr.sbin/bhyve/Makefile.depend +++ usr.sbin/bhyve/Makefile.depend @@ -12,8 +12,10 @@ lib/libcompiler_rt \ lib/libmd \ lib/libthr \ + lib/libucl \ lib/libutil \ lib/libvmmapi \ + lib/libxo \ lib/libz \ secure/lib/libcrypto \ Index: usr.sbin/bhyve/atkbdc.h =================================================================== --- usr.sbin/bhyve/atkbdc.h +++ usr.sbin/bhyve/atkbdc.h @@ -30,9 +30,12 @@ #define _ATKBDC_H_ struct atkbdc_softc; +struct vm_snapshot_meta; struct vmctx; void atkbdc_init(struct vmctx *ctx); void atkbdc_event(struct atkbdc_softc *sc, int iskbd); +int atkbdc_snapshot(struct vm_snapshot_meta *meta); + #endif /* _ATKBDC_H_ */ Index: usr.sbin/bhyve/atkbdc.c =================================================================== --- usr.sbin/bhyve/atkbdc.c +++ usr.sbin/bhyve/atkbdc.c @@ -33,6 +33,7 @@ #include #include +#include #include @@ -137,6 +138,8 @@ struct aux_dev aux; }; +static struct atkbdc_softc *atkbdc_sc = NULL; + static void atkbdc_assert_kbd_intr(struct atkbdc_softc *sc) { @@ -548,6 +551,43 @@ sc->ps2kbd_sc = ps2kbd_init(sc); sc->ps2mouse_sc = ps2mouse_init(sc); + + assert(atkbdc_sc == NULL); + atkbdc_sc = sc; +} + +int +atkbdc_snapshot(struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->outport, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->ram, + sizeof(atkbdc_sc->ram), meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->curcmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->ctrlbyte, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.irq, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(atkbdc_sc->kbd.buffer, + sizeof(atkbdc_sc->kbd.buffer), meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.brd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bwr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->kbd.bcnt, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(atkbdc_sc->aux.irq, meta, ret, done); + + ret = ps2kbd_snapshot(atkbdc_sc->ps2kbd_sc, meta); + if (ret != 0) + goto done; + + ret = ps2mouse_snapshot(atkbdc_sc->ps2mouse_sc, meta); + +done: + return (ret); } static void Index: usr.sbin/bhyve/bhyverun.h =================================================================== --- usr.sbin/bhyve/bhyverun.h +++ usr.sbin/bhyve/bhyverun.h @@ -28,6 +28,8 @@ * $FreeBSD$ */ + #include + #ifndef _FBSDRUN_H_ #define _FBSDRUN_H_ @@ -37,9 +39,10 @@ struct vmctx; extern int guest_ncpus; extern char *guest_uuid_str; -extern char *vmname; +extern const char *vmname; void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len); +uintptr_t paddr_host2guest(struct vmctx *ctx, void *addr); void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu); void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip); Index: usr.sbin/bhyve/bhyverun.c =================================================================== --- usr.sbin/bhyve/bhyverun.c +++ usr.sbin/bhyve/bhyverun.c @@ -36,6 +36,7 @@ #include #endif #include +#include #include #include @@ -81,10 +82,20 @@ #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" +#include "snapshot.h" #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" 
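
To make the TSC bookkeeping in the vmm.c hunk above easier to follow: the guest observes guest_tsc = host_tsc + tsc_offset, so vm_snapshot_vm() temporarily adds rdtsc() before serializing (turning the stored value into the guest's absolute TSC, which is host-independent) and vm_restore_time() subtracts the new host's rdtsc() to turn it back into a valid offset. A standalone sketch of that arithmetic (illustrative only, not part of the patch; the function names here are made up):

	#include <stdint.h>

	/* Saving on host A: record the guest-visible TSC, not the raw offset. */
	static uint64_t
	tsc_to_snapshot(uint64_t tsc_offset, uint64_t host_tsc_at_save)
	{
		return (tsc_offset + host_tsc_at_save);	/* == guest TSC */
	}

	/*
	 * Restoring on host B: recompute an offset so the guest's TSC
	 * continues from the value it had when the snapshot was taken.
	 */
	static uint64_t
	tsc_from_snapshot(uint64_t saved_guest_tsc, uint64_t host_tsc_at_restore)
	{
		return (saved_guest_tsc - host_tsc_at_restore);
	}

This is also why the save path immediately subtracts rdtsc() again after serializing: while the VM keeps running, tsc_offset must hold an offset, not an absolute TSC value.
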
+#include +#include +#include +#include + +#include +#include +#include + #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) @@ -157,10 +168,12 @@ [EXIT_REASON_XRSTORS] = "XRSTORS" }; +#define MAX_SOCK_NAME 200 + typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); -char *vmname; +const char *vmname; int guest_ncpus; uint16_t cores, maxcpus, sockets, threads; @@ -223,6 +236,7 @@ " -H: vmexit from the guest on hlt\n" " -l: LPC device configuration\n" " -m: memory size in MB\n" + " -r: path to checkpoint file\n" " -p: pin 'vcpu' to 'hostcpu'\n" " -P: vmexit from the guest on pause\n" " -s: PCI slot config\n" @@ -382,6 +396,12 @@ return (vm_map_gpa(ctx, gaddr, len)); } +uintptr_t +paddr_host2guest(struct vmctx *ctx, void *addr) +{ + return (vm_rev_map_gpa(ctx, addr)); +} + int fbsdrun_vmexit_on_pause(void) { @@ -972,17 +992,35 @@ return (ctx); } +void +spinup_vcpu(struct vmctx *ctx, int vcpu) +{ + int error; + uint64_t rip; + + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip); + assert(error == 0); + + fbsdrun_set_capabilities(ctx, vcpu); + error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); + assert(error == 0); + + fbsdrun_addcpu(ctx, BSP, vcpu, rip); +} + int main(int argc, char *argv[]) { int c, error, dbg_port, gdb_port, err, bvmcons; - int max_vcpus, mptgen, memflags; + int max_vcpus, mptgen, memflags, vcpu; int rtc_localtime; bool gdb_stop; struct vmctx *ctx; - uint64_t rip; size_t memsize; - char *optstr; + char *optstr, *restore_file; + struct restore_state rstate; + + restore_file = NULL; bvmcons = 0; progname = basename(argv[0]); @@ -997,7 +1035,7 @@ rtc_localtime = 1; memflags = 0; - optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:"; + optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:U:r:"; while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': @@ -1043,6 +1081,9 @@ "configuration '%s'", optarg); } break; + case 'r': + restore_file = optarg; + break; case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { pci_print_supported_devices(); @@ -1104,12 +1145,41 @@ argc -= optind; argv += optind; - if (argc != 1) + if (argc > 1 || (argc == 0 && restore_file == NULL)) usage(1); + if (restore_file != NULL) { + error = load_restore_file(restore_file, &rstate); + if (error) { + fprintf(stderr, "Failed to read checkpoint info from " + "file: '%s'.\n", restore_file); + exit(1); + } + } + + if (argc == 1) { vmname = argv[0]; + } else { + vmname = lookup_vmname(&rstate); + if (vmname == NULL) { + fprintf(stderr, "Cannot find VM name in restore file. 
" + "Please specify one.\n"); + exit(1); + } + } ctx = do_open(vmname); + if (restore_file != NULL) { + guest_ncpus = lookup_guest_ncpus(&rstate); + memflags = lookup_memflags(&rstate); + memsize = lookup_memsize(&rstate); + } + + if (guest_ncpus < 1) { + fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); + exit(1); + } + max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", @@ -1117,8 +1187,6 @@ exit(4); } - fbsdrun_set_capabilities(ctx, BSP); - vm_set_memflags(ctx, memflags); err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (err) { @@ -1168,8 +1236,37 @@ assert(error == 0); } - error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); - assert(error == 0); + if (restore_file != NULL) { + fprintf(stdout, "Pausing pci devs...\r\n"); + if (vm_pause_user_devs(ctx) != 0) { + fprintf(stderr, "Failed to pause PCI device state.\n"); + exit(1); + } + + fprintf(stdout, "Restoring vm mem...\r\n"); + if (restore_vm_mem(ctx, &rstate) != 0) { + fprintf(stderr, "Failed to restore VM memory.\n"); + exit(1); + } + + fprintf(stdout, "Restoring pci devs...\r\n"); + if (vm_restore_user_devs(ctx, &rstate) != 0) { + fprintf(stderr, "Failed to restore PCI device state.\n"); + exit(1); + } + + fprintf(stdout, "Restoring kernel structs...\r\n"); + if (vm_restore_kern_structs(ctx, &rstate) != 0) { + fprintf(stderr, "Failed to restore kernel structs.\n"); + exit(1); + } + + fprintf(stdout, "Resuming pci devs...\r\n"); + if (vm_resume_user_devs(ctx) != 0) { + fprintf(stderr, "Failed to resume PCI device state.\n"); + exit(1); + } + } /* * build the guest tables, MP etc. @@ -1208,10 +1305,34 @@ errx(EX_OSERR, "cap_enter() failed"); #endif + if (restore_file != NULL) + destroy_restore_state(&rstate); + /* - * Add CPU 0 + * checkpointing thread for communication with bhyvectl + */ + if (init_checkpoint_thread(ctx) < 0) + printf("Failed to start checkpoint thread!\r\n"); + + /* + * Change the proc title to include the VM name. */ - fbsdrun_addcpu(ctx, BSP, BSP, rip); + setproctitle("%s", vmname); + + if (restore_file != NULL) { + vm_restore_time(ctx); + } + + /* Add CPU 0 + * If we restore a VM, start all vCPUs now (including APs), otherwise, + * let the guest OS to spin them up later via vmexits. 
+ */ + + for (vcpu = 0; vcpu < guest_ncpus; vcpu++) + if (vcpu == BSP || restore_file) { + fprintf(stdout, "spinning up vcpu no %d...\r\n", vcpu); + spinup_vcpu(ctx, vcpu); + } /* * Head off to the main event dispatch loop Index: usr.sbin/bhyve/block_if.h =================================================================== --- usr.sbin/bhyve/block_if.h +++ usr.sbin/bhyve/block_if.h @@ -41,6 +41,9 @@ #include #include +struct vm_snapshot_meta; + + #define BLOCKIF_IOV_MAX 33 /* not practical to be IOV_MAX */ struct blockif_req { @@ -68,5 +71,11 @@ int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_close(struct blockif_ctxt *bc); +void blockif_pause(struct blockif_ctxt *bc); +void blockif_resume(struct blockif_ctxt *bc); +int blockif_snapshot_req(struct blockif_req *br, + struct vm_snapshot_meta *meta); +int blockif_snapshot(struct blockif_ctxt *bc, + struct vm_snapshot_meta *meta); #endif /* _BLOCK_IF_H_ */ Index: usr.sbin/bhyve/block_if.c =================================================================== --- usr.sbin/bhyve/block_if.c +++ usr.sbin/bhyve/block_if.c @@ -57,6 +57,7 @@ #include #include +#include #include "bhyverun.h" #include "mevent.h" @@ -67,6 +68,9 @@ #define BLOCKIF_NUMTHR 8 #define BLOCKIF_MAXREQ (64 + BLOCKIF_NUMTHR) +#define NO_THREAD_IDX (-2) +#define REQ_IDX_SEPARATOR (-1) + enum blockop { BOP_READ, BOP_WRITE, @@ -103,9 +107,13 @@ int bc_psectsz; int bc_psectoff; int bc_closing; + int bc_paused; + int bc_work_count; pthread_t bc_btid[BLOCKIF_NUMTHR]; pthread_mutex_t bc_mtx; pthread_cond_t bc_cond; + pthread_cond_t bc_paused_cond; + pthread_cond_t bc_work_done_cond; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; @@ -208,6 +216,18 @@ TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); } +static int +blockif_flush_bc(struct blockif_ctxt *bc) +{ + if (bc->bc_ischr) { + if (ioctl(bc->bc_fd, DIOCGFLUSH)) + return (errno); + } else if (fsync(bc->bc_fd)) + return (errno); + + return (0); +} + static void blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { @@ -298,11 +318,7 @@ } break; case BOP_FLUSH: - if (bc->bc_ischr) { - if (ioctl(bc->bc_fd, DIOCGFLUSH)) - err = errno; - } else if (fsync(bc->bc_fd)) - err = errno; + err = blockif_flush_bc(bc); break; case BOP_DELETE: if (!bc->bc_candelete) @@ -346,15 +362,34 @@ pthread_mutex_lock(&bc->bc_mtx); for (;;) { - while (blockif_dequeue(bc, t, &be)) { + bc->bc_work_count++; + + /* We cannot process work if the interface is paused */ + while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) { pthread_mutex_unlock(&bc->bc_mtx); blockif_proc(bc, be, buf); pthread_mutex_lock(&bc->bc_mtx); blockif_complete(bc, be); } - /* Check ctxt status here to see if exit requested */ + + bc->bc_work_count--; + + /* If none of the workers is busy, notify the main thread */ + if (bc->bc_work_count == 0) + pthread_cond_broadcast(&bc->bc_work_done_cond); + + /* + * Check ctxt status here to see if exit requested + * + * No sense to wait while paused if closing anyway + */ if (bc->bc_closing) break; + + /* Make all worker threads wait here if the device is paused */ + while (bc->bc_paused) + pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx); + pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); } pthread_mutex_unlock(&bc->bc_mtx); @@ -558,6 +593,10 @@ bc->bc_psectoff = psectoff; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); + bc->bc_paused = 0; + bc->bc_work_count 
= 0; + pthread_cond_init(&bc->bc_paused_cond, NULL); + pthread_cond_init(&bc->bc_work_done_cond, NULL); TAILQ_INIT(&bc->bc_freeq); TAILQ_INIT(&bc->bc_pendq); TAILQ_INIT(&bc->bc_busyq); @@ -589,6 +628,7 @@ err = 0; pthread_mutex_lock(&bc->bc_mtx); + /* should make thread wait if interface paused ? */ if (!TAILQ_EMPTY(&bc->bc_freeq)) { /* * Enqueue and inform the block i/o thread @@ -650,6 +690,8 @@ assert(bc->bc_magic == BLOCKIF_SIG); pthread_mutex_lock(&bc->bc_mtx); + /* XXX: not waiting while paused */ + /* * Check pending requests. */ @@ -848,3 +890,96 @@ assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_candelete); } + +void +blockif_pause(struct blockif_ctxt *bc) +{ + assert(bc != NULL); + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_paused = 1; + /* The interface is paused. Wait for workers to finish their work */ + while (bc->bc_work_count) + pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx); + pthread_mutex_unlock(&bc->bc_mtx); + + if (blockif_flush_bc(bc)) + fprintf(stderr, "%s: [WARN] failed to flush backing file.\r\n", + __func__); +} + +void +blockif_resume(struct blockif_ctxt *bc) +{ + assert(bc != NULL); + assert(bc->bc_magic == BLOCKIF_SIG); + + pthread_mutex_lock(&bc->bc_mtx); + bc->bc_paused = 0; + /* resume the threads waiting for paused */ + pthread_cond_broadcast(&bc->bc_paused_cond); + /* kick the threads after restore */ + pthread_cond_broadcast(&bc->bc_cond); + pthread_mutex_unlock(&bc->bc_mtx); +} + +int +blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta) +{ + int i; + struct iovec *iov; + int ret; + + SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done); + + /* XXX: The callback and parameter must be filled by the virtualized + * device that uses the interface, during its init; we're not touching + * them here + */ + + /* Snapshot the iovecs */ + for (i = 0; i < br->br_iovcnt; i++) { + iov = &br->br_iov[i]; + + SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done); + /* we assume the iov is a guest-mapped address */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len, + false, meta, ret, done); + } + +done: + return (ret); +} + +int +blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta) +{ + int ret; + + if (bc->bc_paused == 0) { + fprintf(stderr, "%s: Snapshot failed: " + "interface not paused.\r\n", __func__); + return (ENXIO); + } + + pthread_mutex_lock(&bc->bc_mtx); + + SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done); + +done: + pthread_mutex_unlock(&bc->bc_mtx); + return (ret); +} + Index: usr.sbin/bhyve/pci_ahci.c =================================================================== --- usr.sbin/bhyve/pci_ahci.c +++ usr.sbin/bhyve/pci_ahci.c @@ -41,6 +41,8 @@ #include #include +#include + #include #include #include @@ -131,6 +133,7 @@ uint32_t done; int slot; int more; + int readop; }; struct ahci_port { @@ -724,6 +727,7 @@ aior->slot = 
slot; aior->len = len; aior->done = done; + aior->readop = readop; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); @@ -1420,6 +1424,7 @@ aior->slot = slot; aior->len = len; aior->done = done; + aior->readop = 1; breq = &aior->io_req; breq->br_offset = lba + done; ahci_build_iov(p, aior, prdt, hdr->prdtl); @@ -2446,6 +2451,285 @@ return (pci_ahci_init(ctx, pi, opts, 1)); } +static int +pci_ahci_snapshot_save_queues(struct ahci_port *port, + struct vm_snapshot_meta *meta) +{ + int ret; + int idx; + struct ahci_ioreq *ioreq; + + STAILQ_FOREACH(ioreq, &port->iofhd, io_flist) { + idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq); + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + } + + idx = -1; + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + + TAILQ_FOREACH(ioreq, &port->iobhd, io_blist) { + idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq); + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + + /* snapshot only the busy requests + * other requests are not valid + */ + ret = blockif_snapshot_req(&ioreq->io_req, meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to snapshot req\r\n", + __func__); + goto done; + } + } + + idx = -1; + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + +done: + return (ret); +} + +static int +pci_ahci_snapshot_restore_queues(struct ahci_port *port, + struct vm_snapshot_meta *meta) +{ + int ret; + int idx; + struct ahci_ioreq *ioreq; + + /* empty the free queue before restoring */ + while (!STAILQ_EMPTY(&port->iofhd)) + STAILQ_REMOVE_HEAD(&port->iofhd, io_flist); + + /* restore the free queue */ + while (1) { + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + if (idx == -1) + break; + + STAILQ_INSERT_TAIL(&port->iofhd, &port->ioreq[idx], io_flist); + } + + /* restore the busy queue */ + while (1) { + SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done); + if (idx == -1) + break; + + ioreq = &port->ioreq[idx]; + TAILQ_INSERT_TAIL(&port->iobhd, ioreq, io_blist); + + /* restore only the busy requests + * other requests are not valid + */ + ret = blockif_snapshot_req(&ioreq->io_req, meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to restore request\r\n", + __func__); + + goto done; + } + + /* re-enqueue the requests in the block interface */ + if (ioreq->readop) + ret = blockif_read(port->bctx, &ioreq->io_req); + else + ret = blockif_write(port->bctx, &ioreq->io_req); + + if (ret != 0) { + fprintf(stderr, + "%s: failed to re-enqueue request\r\n", + __func__); + + goto done; + } + } + +done: + return (ret); +} + +static int +pci_ahci_snapshot(struct vm_snapshot_meta *meta) +{ + int i, j, ret; + void *bctx; + struct pci_devinst *pi; + struct pci_ahci_softc *sc; + struct ahci_port *port; + struct ahci_cmd_hdr *hdr; + struct ahci_ioreq *ioreq; + + pi = meta->dev_data; + sc = pi->pi_arg; + + /* TODO: add mtx lock/unlock */ + + SNAPSHOT_VAR_OR_LEAVE(sc->ports, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cap, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ghc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->is, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->pi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->vs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ccc_ctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ccc_pts, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->em_loc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->em_ctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cap2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->bohc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->lintr, meta, ret, done); + + for (i = 0; i < 
MAX_PORTS; i++) { + port = &sc->port[i]; + + if (meta->op == VM_SNAPSHOT_SAVE) + bctx = port->bctx; + + SNAPSHOT_VAR_OR_LEAVE(bctx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->port, meta, ret, done); + + /* mostly for restore; save is ensured by the lines above */ + if (((bctx == NULL) && (port->bctx != NULL)) || + ((bctx != NULL) && (port->bctx == NULL))) { + fprintf(stderr, "%s: ports not matching\r\n", __func__); + + ret = EINVAL; + goto done; + } + + if (port->bctx == NULL) + continue; + + if (port->port != i) { + fprintf(stderr, "%s: ports not matching: " + "actual: %d expected: %d\r\n", + __func__, port->port, i); + + ret = EINVAL; + goto done; + } + + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->cmd_lst, + AHCI_CL_SIZE * AHCI_MAX_SLOTS, false, meta, ret, done); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(port->rfis, 256, false, meta, + ret, done); + + SNAPSHOT_VAR_OR_LEAVE(port->ident, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->atapi, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->reset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->waitforclear, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->mult_sectors, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->xfermode, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->err_cfis, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sense_key, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->asc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ccs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->pending, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(port->clb, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->clbu, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->fb, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->fbu, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->is, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ie, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->cmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->unused0, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->tfd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sig, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ssts, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sctl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->serr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sact, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ci, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->sntf, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done); + + for (j = 0; j < port->ioqsz; j++) { + ioreq = &port->ioreq[j]; + + /* blockif_req snapshot done only for busy requests */ + hdr = (struct ahci_cmd_hdr *)(port->cmd_lst + + ioreq->slot * AHCI_CL_SIZE); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ioreq->cfis, + 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry), + false, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(ioreq->len, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->done, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->slot, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->more, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(ioreq->readop, meta, ret, done); + } + + /* Perform save / restore specific operations */ + if (meta->op == VM_SNAPSHOT_SAVE) { + ret = pci_ahci_snapshot_save_queues(port, meta); + if (ret != 0) + goto done; + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + ret = pci_ahci_snapshot_restore_queues(port, meta); + if (ret != 0) + goto done; + } else { + /* error */ + ret = EINVAL; + goto done; + } + + ret = blockif_snapshot(port->bctx, meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to restore blockif\r\n", + 
__func__); + + goto done; + } + } + +done: + return (ret); +} + +static int +pci_ahci_pause(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct pci_ahci_softc *sc; + struct blockif_ctxt *bctxt; + int i; + + sc = pi->pi_arg; + + for (i = 0; i < MAX_PORTS; i++) { + bctxt = sc->port[i].bctx; + if (bctxt == NULL) + continue; + + blockif_pause(bctxt); + } + + return (0); +} + +static int +pci_ahci_resume(struct vmctx *ctx, struct pci_devinst *pi) +{ + struct pci_ahci_softc *sc; + struct blockif_ctxt *bctxt; + int i; + + sc = pi->pi_arg; + + for (i = 0; i < MAX_PORTS; i++) { + bctxt = sc->port[i].bctx; + if (bctxt == NULL) + continue; + + blockif_resume(bctxt); + } + + return (0); +} + /* * Use separate emulation names to distinguish drive and atapi devices */ @@ -2453,7 +2737,10 @@ .pe_emu = "ahci", .pe_init = pci_ahci_hd_init, .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read + .pe_barread = pci_ahci_read, + .pe_snapshot = pci_ahci_snapshot, + .pe_pause = pci_ahci_pause, + .pe_resume = pci_ahci_resume, }; PCI_EMUL_SET(pci_de_ahci); @@ -2461,7 +2748,10 @@ .pe_emu = "ahci-hd", .pe_init = pci_ahci_hd_init, .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read + .pe_barread = pci_ahci_read, + .pe_snapshot = pci_ahci_snapshot, + .pe_pause = pci_ahci_pause, + .pe_resume = pci_ahci_resume, }; PCI_EMUL_SET(pci_de_ahci_hd); @@ -2469,6 +2759,9 @@ .pe_emu = "ahci-cd", .pe_init = pci_ahci_atapi_init, .pe_barwrite = pci_ahci_write, - .pe_barread = pci_ahci_read + .pe_barread = pci_ahci_read, + .pe_snapshot = pci_ahci_snapshot, + .pe_pause = pci_ahci_pause, + .pe_resume = pci_ahci_resume, }; PCI_EMUL_SET(pci_de_ahci_cd); Index: usr.sbin/bhyve/pci_e82545.c =================================================================== --- usr.sbin/bhyve/pci_e82545.c +++ usr.sbin/bhyve/pci_e82545.c @@ -46,6 +46,8 @@ #ifndef WITHOUT_CAPSICUM #include #endif +#include + #include #include #include @@ -2381,11 +2383,158 @@ return (0); } +static int +e82545_snapshot(struct vm_snapshot_meta *meta) +{ + int i; + int ret; + struct e82545_softc *sc; + struct pci_devinst *pi; + uint64_t bitmap_value; + + pi = meta->dev_data; + sc = pi->pi_arg; + + /* esc_mevp and esc_mevpitr should be reinitiated at init */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_mac, meta, ret, done); + + /* General */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_CTRL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCAH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_VET, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCTTV, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_LEDCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_PBA, meta, ret, done); + + /* Interrupt control */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_irq_asserted, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICR, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_ITR, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_ICS, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMS, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_IMC, meta, ret, done); + + /* Transmit */ + /* The fields in the unions are in superposition to access certain + * bytes in the larger uint variables + * e.g., ip_config = [ipcss|ipcso|ipcse0|ipcse1] + */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.lower_setup.ip_config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.upper_setup.tcp_config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.cmd_and_length, meta, ret, done); + 
SNAPSHOT_VAR_OR_LEAVE(sc->esc_txctx.tcp_seg_setup.data, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_tx_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXCW, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIPG, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_AIT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_tdba, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDBAH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDLEN, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDHr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TDT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TIDV, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TXDCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_TADV, meta, ret, done); + + /* Has dependency on esc_TDLEN; reoreder of fields from struct */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_txdesc, sc->esc_TDLEN, + true, meta, ret, done); + + /* L2 frame acceptance */ + for (i = 0; i < nitems(sc->esc_uni); i++) { + SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_valid, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_addrsel, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_uni[i].eu_eth, meta, ret, done); + } + + SNAPSHOT_BUF_OR_LEAVE(sc->esc_fmcast, sizeof(sc->esc_fmcast), + meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->esc_fvlan, sizeof(sc->esc_fvlan), + meta, ret, done); + + /* Receive */ + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_active, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rx_loopback, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_FCRTH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_rdba, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDBAH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDLEN, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDH, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDT, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RDTR, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXDCTL, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RADV, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RSRPD, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->esc_RXCSUM, meta, ret, done); + + /* Has dependency on esc_RDLEN; reoreder of fields from struct */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->esc_rxdesc, sc->esc_TDLEN, + true, meta, ret, done); + + /* IO Port register access */ + SNAPSHOT_VAR_OR_LEAVE(sc->io_addr, meta, ret, done); + /* Shadow copy of MDIC */ + SNAPSHOT_VAR_OR_LEAVE(sc->mdi_control, meta, ret, done); + /* Shadow copy of EECD */ + SNAPSHOT_VAR_OR_LEAVE(sc->eeprom_control, meta, ret, done); + /* Latest NVM in/out */ + SNAPSHOT_VAR_OR_LEAVE(sc->nvm_data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->nvm_opaddr, meta, ret, done); + /* stats */ + SNAPSHOT_VAR_OR_LEAVE(sc->missed_pkt_count, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->pkt_rx_by_size, sizeof(sc->pkt_rx_by_size), + meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->pkt_tx_by_size, sizeof(sc->pkt_tx_by_size), + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_rx_count, meta, ret, done); 
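
The descriptor-ring fields snapshotted above (esc_txdesc, esc_rxdesc) are host pointers into guest RAM, so SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE() is presumably built on the new paddr_host2guest()/paddr_guest2host() helpers: only the guest-physical address goes into the snapshot, and the host pointer is re-derived at restore time. A hedged sketch of that core follows; the helper name, the userspace vm_snapshot_buf() counterpart, and the omission of the macro's allow-NULL flag are all assumptions, not the patch's code.

	/*
	 * Illustrative core of a guest-pointer snapshot: save the GPA on the
	 * way out, recreate the host mapping on the way back in.
	 */
	static int
	snapshot_guest_ptr(struct vmctx *ctx, void **addrp, size_t len,
	    struct vm_snapshot_meta *meta)
	{
		uint64_t gpa;
		int ret;

		if (meta->op == VM_SNAPSHOT_SAVE)
			gpa = paddr_host2guest(ctx, *addrp);	/* host -> GPA */

		ret = vm_snapshot_buf(&gpa, sizeof(gpa), meta);
		if (ret != 0)
			return (ret);

		if (meta->op == VM_SNAPSHOT_RESTORE)
			*addrp = paddr_guest2host(ctx, gpa, len); /* GPA -> host */

		return (0);
	}
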
+ SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_pkt_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->bcast_pkt_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->mcast_pkt_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->oversize_rx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->tso_tx_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_rx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->good_octets_tx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->missed_octets, meta, ret, done); + + if (meta->op == VM_SNAPSHOT_SAVE) + bitmap_value = sc->nvm_bits; + SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + sc->nvm_bits = bitmap_value; + + if (meta->op == VM_SNAPSHOT_SAVE) + bitmap_value = sc->nvm_bits; + SNAPSHOT_VAR_OR_LEAVE(bitmap_value, meta, ret, done); + if (meta->op == VM_SNAPSHOT_RESTORE) + sc->nvm_bits = bitmap_value; + + /* EEPROM data */ + SNAPSHOT_BUF_OR_LEAVE(sc->eeprom_data, sizeof(sc->eeprom_data), + meta, ret, done); + +done: + return (ret); +} + struct pci_devemu pci_de_e82545 = { .pe_emu = "e1000", .pe_init = e82545_init, .pe_barwrite = e82545_write, - .pe_barread = e82545_read + .pe_barread = e82545_read, + .pe_snapshot = e82545_snapshot, }; PCI_EMUL_SET(pci_de_e82545); Index: usr.sbin/bhyve/pci_emul.h =================================================================== --- usr.sbin/bhyve/pci_emul.h +++ usr.sbin/bhyve/pci_emul.h @@ -45,6 +45,7 @@ struct vmctx; struct pci_devinst; struct memory_region; +struct vm_snapshot_meta; struct pci_devemu { char *pe_emu; /* Name of device emulation */ @@ -71,6 +72,11 @@ uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size); + + /* Save/restore device state */ + int (*pe_snapshot)(struct vm_snapshot_meta *meta); + int (*pe_pause)(struct vmctx *ctx, struct pci_devinst *pi); + int (*pe_resume)(struct vmctx *ctx, struct pci_devinst *pi); }; #define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x); @@ -245,6 +251,9 @@ void pci_write_dsdt(void); uint64_t pci_ecfg_base(void); int pci_bus_configured(int bus); +int pci_snapshot(struct vm_snapshot_meta *meta); +int pci_pause(struct vmctx *ctx, const char *dev_name); +int pci_resume(struct vmctx *ctx, const char *dev_name); static __inline void pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val) Index: usr.sbin/bhyve/pci_emul.c =================================================================== --- usr.sbin/bhyve/pci_emul.c +++ usr.sbin/bhyve/pci_emul.c @@ -45,6 +45,7 @@ #include #include +#include #include #include "acpi.h" @@ -1936,6 +1937,187 @@ INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata); INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); +/* + * Saves/restores PCI device emulated state. Returns 0 on success. 
+ */ +static int +pci_snapshot_pci_dev(struct vm_snapshot_meta *meta) +{ + struct pci_devinst *pi; + int i; + int ret; + + pi = meta->dev_data; + + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.addr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.msg_data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msi.maxmsgnum, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_bar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_bar, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_offset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table_count, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_offset, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_size, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.function_mask, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.pba_page_offset, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(pi->pi_cfgdata, sizeof(pi->pi_cfgdata), + meta, ret, done); + + for (i = 0; i < nitems(pi->pi_bar); i++) { + SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].type, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].size, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_bar[i].addr, meta, ret, done); + } + + /* Restore MSI-X table. */ + for (i = 0; i < pi->pi_msix.table_count; i++) { + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].addr, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].msg_data, + meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(pi->pi_msix.table[i].vector_control, + meta, ret, done); + } + +done: + return (ret); +} + +static int +pci_find_slotted_dev(const char *dev_name, struct pci_devemu **pde, + struct pci_devinst **pdi) +{ + struct businfo *bi; + struct slotinfo *si; + struct funcinfo *fi; + int bus, slot, func; + + assert(dev_name != NULL); + assert(pde != NULL); + assert(pdi != NULL); + + for (bus = 0; bus < MAXBUSES; bus++) { + if ((bi = pci_businfo[bus]) == NULL) + continue; + + for (slot = 0; slot < MAXSLOTS; slot++) { + si = &bi->slotinfo[slot]; + for (func = 0; func < MAXFUNCS; func++) { + fi = &si->si_funcs[func]; + if (fi->fi_name == NULL) + continue; + if (strcmp(dev_name, fi->fi_name)) + continue; + + *pde = pci_emul_finddev(fi->fi_name); + assert(*pde != NULL); + + *pdi = fi->fi_devi; + return (0); + } + } + } + + return (EINVAL); +} + +int +pci_snapshot(struct vm_snapshot_meta *meta) +{ + struct pci_devemu *pde; + struct pci_devinst *pdi; + int ret; + + assert(meta->dev_name != NULL); + + ret = pci_find_slotted_dev(meta->dev_name, &pde, &pdi); + if (ret != 0) { + fprintf(stderr, "%s: no such name: %s\r\n", + __func__, meta->dev_name); + memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); + return (0); + } + + meta->dev_data = pdi; + + if (pde->pe_snapshot == NULL) { + fprintf(stderr, "%s: not implemented yet for: %s\r\n", + __func__, meta->dev_name); + return (-1); + } + + ret = pci_snapshot_pci_dev(meta); + if (ret != 0) { + fprintf(stderr, "%s: failed to snapshot pci dev\r\n", + __func__); + return (-1); + } + + ret = (*pde->pe_snapshot)(meta); + + return (ret); +} + +int +pci_pause(struct vmctx *ctx, const char *dev_name) +{ + struct pci_devemu *pde; + struct pci_devinst *pdi; + int ret; + + assert(dev_name != NULL); + + ret = pci_find_slotted_dev(dev_name, &pde, &pdi); + if (ret != 0) { + /* it is possible to call this function without checking that + * the device is inserted first + */ + fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); 
+ return (0); + } + + if (pde->pe_pause == NULL) { + /* The pause/resume functionality is optional */ + fprintf(stderr, "%s: not implemented for: %s\n", + __func__, dev_name); + return (0); + } + + return (*pde->pe_pause)(ctx, pdi); +} + +int +pci_resume(struct vmctx *ctx, const char *dev_name) +{ + struct pci_devemu *pde; + struct pci_devinst *pdi; + int ret; + + assert(dev_name != NULL); + + ret = pci_find_slotted_dev(dev_name, &pde, &pdi); + if (ret != 0) { + /* it is possible to call this function without checking that + * the device is inserted first + */ + fprintf(stderr, "%s: no such name: %s\n", __func__, dev_name); + return (0); + } + + if (pde->pe_resume == NULL) { + /* The pause/resume functionality is optional */ + fprintf(stderr, "%s: not implemented for: %s\n", + __func__, dev_name); + return (0); + } + + return (*pde->pe_resume)(ctx, pdi); +} + #define PCI_EMUL_TEST #ifdef PCI_EMUL_TEST /* @@ -2105,11 +2287,18 @@ return (value); } +int +pci_emul_snapshot(struct vm_snapshot_meta *meta) +{ + return (0); +} + struct pci_devemu pci_dummy = { .pe_emu = "dummy", .pe_init = pci_emul_dinit, .pe_barwrite = pci_emul_diow, - .pe_barread = pci_emul_dior + .pe_barread = pci_emul_dior, + .pe_snapshot = pci_emul_snapshot, }; PCI_EMUL_SET(pci_dummy); Index: usr.sbin/bhyve/pci_fbuf.c =================================================================== --- usr.sbin/bhyve/pci_fbuf.c +++ usr.sbin/bhyve/pci_fbuf.c @@ -35,6 +35,7 @@ #include #include +#include #include #include @@ -440,10 +441,22 @@ return (error); } +static int +pci_fbuf_snapshot(struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_BUF_OR_LEAVE(fbuf_sc->fb_base, FB_SIZE, meta, ret, err); + +err: + return (ret); +} + struct pci_devemu pci_fbuf = { .pe_emu = "fbuf", .pe_init = pci_fbuf_init, .pe_barwrite = pci_fbuf_write, - .pe_barread = pci_fbuf_read + .pe_barread = pci_fbuf_read, + .pe_snapshot = pci_fbuf_snapshot, }; PCI_EMUL_SET(pci_fbuf); Index: usr.sbin/bhyve/pci_lpc.c =================================================================== --- usr.sbin/bhyve/pci_lpc.c +++ usr.sbin/bhyve/pci_lpc.c @@ -34,6 +34,7 @@ #include #include +#include #include #include @@ -451,12 +452,31 @@ pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5)); } +static int +pci_lpc_snapshot(struct vm_snapshot_meta *meta) +{ + int unit, ret; + struct uart_softc *sc; + + for (unit = 0; unit < LPC_UART_NUM; unit++) { + sc = lpc_uart_softc[unit].uart_softc; + + ret = uart_snapshot(sc, meta); + if (ret != 0) + goto done; + } + +done: + return (ret); +} + struct pci_devemu pci_de_lpc = { .pe_emu = "lpc", .pe_init = pci_lpc_init, .pe_write_dsdt = pci_lpc_write_dsdt, .pe_cfgwrite = pci_lpc_cfgwrite, .pe_barwrite = pci_lpc_write, - .pe_barread = pci_lpc_read + .pe_barread = pci_lpc_read, + .pe_snapshot = pci_lpc_snapshot, }; PCI_EMUL_SET(pci_de_lpc); Index: usr.sbin/bhyve/pci_virtio_block.c =================================================================== --- usr.sbin/bhyve/pci_virtio_block.c +++ usr.sbin/bhyve/pci_virtio_block.c @@ -38,6 +38,8 @@ #include #include +#include + #include #include #include @@ -144,6 +146,9 @@ }; static void pci_vtblk_reset(void *); +static void pci_vtblk_pause(void *); +static void pci_vtblk_resume(void *); +static int pci_vtblk_snapshot(void *, struct vm_snapshot_meta *); static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); @@ -158,6 +163,9 @@ pci_vtblk_cfgwrite, /* write PCI config */ NULL, /* 
apply negotiated features */ VTBLK_S_HOSTCAPS, /* our capabilities */ + pci_vtblk_pause, /* pause blockif threads */ + pci_vtblk_resume, /* resume blockif threads */ + pci_vtblk_snapshot, /* save / restore device state */ }; static void @@ -169,6 +177,38 @@ vi_reset_dev(&sc->vbsc_vs); } +static void +pci_vtblk_pause(void *vsc) +{ + struct pci_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device pause requested !\n")); + blockif_pause(sc->bc); +} + +static void +pci_vtblk_resume(void *vsc) +{ + struct pci_vtblk_softc *sc = vsc; + + DPRINTF(("vtblk: device resume requested !\n")); + blockif_resume(sc->bc); +} + +static int +pci_vtblk_snapshot(void *vsc, struct vm_snapshot_meta *meta) +{ + int ret; + struct pci_vtblk_softc *sc = vsc; + + SNAPSHOT_VAR_OR_LEAVE(sc->vbsc_cfg, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->vbsc_ident, sizeof(sc->vbsc_ident), + meta, ret, done); + +done: + return (ret); +} + static void pci_vtblk_done(struct blockif_req *br, int err) { @@ -408,6 +448,7 @@ .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, .pe_barwrite = vi_pci_write, - .pe_barread = vi_pci_read + .pe_barread = vi_pci_read, + .pe_snapshot = vi_pci_snapshot, }; PCI_EMUL_SET(pci_de_vblk); Index: usr.sbin/bhyve/pci_virtio_net.c =================================================================== --- usr.sbin/bhyve/pci_virtio_net.c +++ usr.sbin/bhyve/pci_virtio_net.c @@ -49,6 +49,8 @@ #ifndef WITHOUT_CAPSICUM #include #endif +#include + #include #include #include @@ -172,6 +174,9 @@ }; static void pci_vtnet_reset(void *); +static void pci_vtnet_pause(void *); +static void pci_vtnet_resume(void *); +static int pci_vtnet_snapshot(void *, struct vm_snapshot_meta *); /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ static int pci_vtnet_cfgread(void *, int, int, uint32_t *); static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); @@ -187,6 +192,9 @@ pci_vtnet_cfgwrite, /* write PCI config */ pci_vtnet_neg_features, /* apply negotiated features */ VTNET_S_HOSTCAPS, /* our capabilities */ + pci_vtnet_pause, /* pause rx/tx threads */ + pci_vtnet_resume, /* resume rx/tx threads */ + pci_vtnet_snapshot, /* save / restore device state */ }; /* @@ -247,6 +255,65 @@ sc->resetting = 0; } +static void +pci_vtnet_pause(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device pause requested !\n")); + + pthread_mutex_lock(&sc->tx_mtx); + pthread_mutex_lock(&sc->rx_mtx); + sc->resetting = 1; + pthread_mutex_unlock(&sc->rx_mtx); + pthread_mutex_unlock(&sc->tx_mtx); + + /* + * Wait for the transmit and receive threads to finish their + * processing. + */ + pci_vtnet_txwait(sc); + pci_vtnet_rxwait(sc); +} + +static void +pci_vtnet_resume(void *vsc) +{ + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device resume requested !\n")); + + pthread_mutex_lock(&sc->tx_mtx); + pthread_mutex_lock(&sc->rx_mtx); + sc->resetting = 0; + pthread_mutex_unlock(&sc->rx_mtx); + pthread_mutex_unlock(&sc->tx_mtx); +} + +static int +pci_vtnet_snapshot(void *vsc, struct vm_snapshot_meta *meta) +{ + int ret; + struct pci_vtnet_softc *sc = vsc; + + DPRINTF(("vtnet: device snapshot requested !\n")); + + /* + * Queues and consts should have been saved by the more generic + * vi_pci_snapshot function. We need to save only our features and + * config. 
+ */ + + SNAPSHOT_VAR_OR_LEAVE(sc->vsc_features, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->vsc_config, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rx_vhdrlen, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rx_merge, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->vsc_rx_ready, meta, ret, done); + +done: + return (ret); +} + /* * Called to send a buffer chain out to the tap device */ @@ -990,6 +1057,7 @@ .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, .pe_barwrite = vi_pci_write, - .pe_barread = vi_pci_read + .pe_barread = vi_pci_read, + .pe_snapshot = vi_pci_snapshot, }; PCI_EMUL_SET(pci_de_vnet); Index: usr.sbin/bhyve/pci_xhci.c =================================================================== --- usr.sbin/bhyve/pci_xhci.c +++ usr.sbin/bhyve/pci_xhci.c @@ -48,6 +48,8 @@ #include #include +#include + #include #include #include @@ -150,6 +152,8 @@ #define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \ (((b) & ((m) << (s))))) +#define SNAP_DEV_NAME_LEN 128 + struct pci_xhci_trb_ring { uint64_t ringaddr; /* current dequeue guest address */ uint32_t ccs; /* consumer cycle state */ @@ -285,9 +289,10 @@ #define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH) +#define XHCI_GADDR_SIZE(a) (XHCI_PADDR_SZ - \ + (((uint64_t) (a)) & (XHCI_PADDR_SZ - 1))) #define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \ - (a), \ - XHCI_PADDR_SZ - ((a) & (XHCI_PADDR_SZ-1))) + (a), XHCI_GADDR_SIZE(a)) static int xhci_in_use; @@ -2827,12 +2832,261 @@ return (error); } +static void +pci_xhci_map_devs_slots(struct pci_xhci_softc *sc, int maps[]) +{ + int i, j; + struct pci_xhci_dev_emu *dev, *slot; + + memset(maps, 0, sizeof(maps[0]) * XHCI_MAX_SLOTS); + + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + for (j = 1; j <= XHCI_MAX_DEVS; j++) { + slot = XHCI_SLOTDEV_PTR(sc, i); + dev = XHCI_DEVINST_PTR(sc, j); + + if (slot == dev) + maps[i] = j; + } + } +} +static int +pci_xhci_snapshot_ep(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, + int idx, struct vm_snapshot_meta *meta) +{ + int k; + int ret; + struct usb_data_xfer *xfer; + struct usb_data_xfer_block *xfer_block; + + /* some sanity checks */ + if (meta->op == VM_SNAPSHOT_SAVE) + xfer = dev->eps[idx].ep_xfer; + + SNAPSHOT_VAR_OR_LEAVE(xfer, meta, ret, done); + if (xfer == NULL) { + ret = 0; + goto done; + } + + if (meta->op == VM_SNAPSHOT_RESTORE) { + pci_xhci_init_ep(dev, idx); + xfer = dev->eps[idx].ep_xfer; + } + + /* save / restore proper */ + for (k = 0; k < USB_MAX_XFER_BLOCKS; k++) { + xfer_block = &xfer->data[k]; + + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(xfer_block->buf, + XHCI_GADDR_SIZE(xfer_block->buf), true, meta, ret, + done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->blen, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->bdone, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->processed, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->hci_data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->ccs, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->streamid, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer_block->trbnext, meta, ret, done); + } + + SNAPSHOT_VAR_OR_LEAVE(xfer->ureq, meta, ret, done); + if (xfer->ureq) { + /* xfer->ureq is not allocated at restore time */ + if (meta->op == VM_SNAPSHOT_RESTORE) + xfer->ureq = malloc(sizeof(struct usb_device_request)); + + SNAPSHOT_BUF_OR_LEAVE(xfer->ureq, + sizeof(struct usb_device_request), + meta, ret, done); + } + + SNAPSHOT_VAR_OR_LEAVE(xfer->ndata, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer->head, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(xfer->tail, meta, 
ret, done); + +done: + return (ret); +} + +static int +pci_xhci_snapshot(struct vm_snapshot_meta *meta) +{ + int i, j; + int ret; + int restore_idx; + struct pci_devinst *pi; + struct pci_xhci_softc *sc; + struct pci_xhci_portregs *port; + struct pci_xhci_dev_emu *dev; + char dname[SNAP_DEV_NAME_LEN]; + int maps[XHCI_MAX_SLOTS + 1]; + + pi = meta->dev_data; + sc = pi->pi_arg; + + SNAPSHOT_VAR_OR_LEAVE(sc->caplength, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hcsparams3, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hccparams1, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->dboff, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsoff, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hccparams2, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->regsend, meta, ret, done); + + /* opregs */ + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbcmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.usbsts, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.pgsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dnctrl, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.crcr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.dcbaap, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->opregs.config, meta, ret, done); + + /* opregs.cr_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.cr_p, + XHCI_GADDR_SIZE(sc->opregs.cr_p), false, meta, ret, done); + + /* opregs.dcbaa_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->opregs.dcbaa_p, + XHCI_GADDR_SIZE(sc->opregs.dcbaa_p), false, meta, ret, done); + + /* rtsregs */ + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.mfindex, meta, ret, done); + + /* rtsregs.intrreg */ + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.iman, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.imod, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstsz, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.rsvd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erstba, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.intrreg.erdp, meta, ret, done); + + /* rtsregs.erstba_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erstba_p, + XHCI_GADDR_SIZE(sc->rtsregs.erstba_p), false, meta, ret, done); + + /* rtsregs.erst_p */ + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(sc->rtsregs.erst_p, + XHCI_GADDR_SIZE(sc->rtsregs.erst_p), false, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_deq_seg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_idx, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_enq_seg, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.er_events_cnt, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rtsregs.event_pcs, meta, ret, done); + + /* sanity checking */ + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + dev = XHCI_DEVINST_PTR(sc, i); + if (dev == NULL) + continue; + + if (meta->op == VM_SNAPSHOT_SAVE) + restore_idx = i; + SNAPSHOT_VAR_OR_LEAVE(restore_idx, meta, ret, done); + + /* check if the restored device (when restoring) is sane */ + if (restore_idx != i) { + fprintf(stderr, "%s: idx not matching: actual: %d, " + "expected: %d\r\n", __func__, restore_idx, i); + ret = EINVAL; + goto done; + } + + if (meta->op == VM_SNAPSHOT_SAVE) { + memset(dname, 0, sizeof(dname)); + strncpy(dname, dev->dev_ue->ue_emu, sizeof(dname) - 1); + } + + SNAPSHOT_BUF_OR_LEAVE(dname, sizeof(dname), meta, ret, done); + + if (meta->op == VM_SNAPSHOT_RESTORE) { + dname[sizeof(dname) - 1] = '\0'; + if (strcmp(dev->dev_ue->ue_emu, dname)) { 
+ fprintf(stderr, "%s: device names mismatch: " + "actual: %s, expected: %s\r\n", + __func__, dname, dev->dev_ue->ue_emu); + + ret = EINVAL; + goto done; + } + } + } + + /* portregs */ + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + port = XHCI_PORTREG_PTR(sc, i); + dev = XHCI_DEVINST_PTR(sc, i); + + if (dev == NULL) + continue; + + SNAPSHOT_VAR_OR_LEAVE(port->portsc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->portpmsc, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->portli, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(port->porthlpmc, meta, ret, done); + } + + /* slots */ + if (meta->op == VM_SNAPSHOT_SAVE) + pci_xhci_map_devs_slots(sc, maps); + + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + SNAPSHOT_VAR_OR_LEAVE(maps[i], meta, ret, done); + + if (meta->op == VM_SNAPSHOT_SAVE) { + dev = XHCI_SLOTDEV_PTR(sc, i); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + if (maps[i] != 0) + dev = XHCI_DEVINST_PTR(sc, maps[i]); + else + dev = NULL; + + XHCI_SLOTDEV_PTR(sc, i) = dev; + } else { + /* error */ + ret = EINVAL; + goto done; + } + + if (dev == NULL) + continue; + + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(dev->dev_ctx, + XHCI_GADDR_SIZE(dev->dev_ctx), false, meta, ret, done); + + for (j = 1; j < XHCI_MAX_ENDPOINTS; j++) { + ret = pci_xhci_snapshot_ep(sc, dev, j, meta); + if (ret != 0) + goto done; + } + + SNAPSHOT_VAR_OR_LEAVE(dev->dev_slotstate, meta, ret, done); + + /* devices[i]->dev_sc */ + dev->dev_ue->ue_snapshot(dev->dev_sc, meta); + + /* devices[i]->hci */ + SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_address, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(dev->hci.hci_port, meta, ret, done); + } + + SNAPSHOT_VAR_OR_LEAVE(sc->ndevices, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->usb2_port_start, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->usb3_port_start, meta, ret, done); + +done: + return (ret); +} struct pci_devemu pci_de_xhci = { .pe_emu = "xhci", .pe_init = pci_xhci_init, .pe_barwrite = pci_xhci_write, - .pe_barread = pci_xhci_read + .pe_barread = pci_xhci_read, + .pe_snapshot = pci_xhci_snapshot, }; PCI_EMUL_SET(pci_de_xhci); Index: usr.sbin/bhyve/ps2kbd.h =================================================================== --- usr.sbin/bhyve/ps2kbd.h +++ usr.sbin/bhyve/ps2kbd.h @@ -32,10 +32,13 @@ #define _PS2KBD_H_ struct atkbdc_softc; +struct vm_snapshot_meta; struct ps2kbd_softc *ps2kbd_init(struct atkbdc_softc *sc); int ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val); void ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val); +int ps2kbd_snapshot(struct ps2kbd_softc *sc, struct vm_snapshot_meta *meta); + #endif /* _PS2KBD_H_ */ Index: usr.sbin/bhyve/ps2kbd.c =================================================================== --- usr.sbin/bhyve/ps2kbd.c +++ usr.sbin/bhyve/ps2kbd.c @@ -32,10 +32,13 @@ #include +#include + #include #include #include #include +#include #include #include #include @@ -381,3 +384,14 @@ return (sc); } +int +ps2kbd_snapshot(struct ps2kbd_softc *sc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(sc->enabled, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->curcmd, meta, ret, done); + +done: + return (ret); +} Index: usr.sbin/bhyve/ps2mouse.h =================================================================== --- usr.sbin/bhyve/ps2mouse.h +++ usr.sbin/bhyve/ps2mouse.h @@ -32,6 +32,7 @@ #define _PS2MOUSE_H_ struct atkbdc_softc; +struct vm_snapshot_meta; struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc); @@ -40,4 +41,6 @@ void ps2mouse_toggle(struct ps2mouse_softc *sc, int enable); int ps2mouse_fifocnt(struct ps2mouse_softc *sc); +int 
ps2mouse_snapshot(struct ps2mouse_softc *sc, struct vm_snapshot_meta *meta); + #endif /* _PS2MOUSE_H_ */ Index: usr.sbin/bhyve/ps2mouse.c =================================================================== --- usr.sbin/bhyve/ps2mouse.c +++ usr.sbin/bhyve/ps2mouse.c @@ -32,10 +32,13 @@ #include +#include + #include #include #include #include +#include #include #include #include @@ -415,4 +418,21 @@ return (sc); } - +int +ps2mouse_snapshot(struct ps2mouse_softc *sc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(sc->status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->resolution, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->sampling_rate, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ctrlenable, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->curcmd, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cur_x, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->cur_y, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->delta_x, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->delta_y, meta, ret, done); + +done: + return (ret); +} Index: usr.sbin/bhyve/snapshot.h =================================================================== --- /dev/null +++ usr.sbin/bhyve/snapshot.h @@ -0,0 +1,68 @@ +#ifndef _BHYVE_SNAPSHOT_ +#define _BHYVE_SNAPSHOT_ + +#include +#include +#include + +struct vmctx; + +struct restore_state { + int kdata_fd; + int vmmem_fd; + + void *kdata_map; + size_t kdata_len; + + size_t vmmem_len; + + struct ucl_parser *meta_parser; + ucl_object_t *meta_root_obj; +}; + +struct checkpoint_thread_info { + struct vmctx *ctx; + int socket_fd; + struct sockaddr_un *addr; +} checkpoint_info; + +typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *); +typedef int (*vm_pause_dev_cb) (struct vmctx *, const char *); +typedef int (*vm_resume_dev_cb) (struct vmctx *, const char *); + +struct vm_snapshot_dev_info { + const char *dev_name; /* device name */ + vm_snapshot_dev_cb snapshot_cb; /* callback for device snapshot */ + vm_pause_dev_cb pause_cb; /* callback for device pause */ + vm_resume_dev_cb resume_cb; /* callback for device resume */ +}; + +struct vm_snapshot_kern_info { + const char *struct_name; /* kernel structure name*/ + enum snapshot_req req; /* request type */ +}; + + +void destroy_restore_state(struct restore_state *rstate); + +const char * lookup_vmname(struct restore_state *rstate); +int lookup_memflags(struct restore_state *rstate); +size_t lookup_memsize(struct restore_state *rstate); +int lookup_guest_ncpus(struct restore_state *rstate); + + +int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate); +int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate); + +int vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate); +int vm_pause_user_devs(struct vmctx *ctx); +int vm_resume_user_devs(struct vmctx *ctx); + +int get_checkpoint_msg(int conn_fd, struct vmctx *ctx); +void *checkpoint_thread(void *param); +int init_checkpoint_thread(struct vmctx *ctx); + + +int load_restore_file(const char *filename, struct restore_state *rstate); + +#endif Index: usr.sbin/bhyve/snapshot.c =================================================================== --- /dev/null +++ usr.sbin/bhyve/snapshot.c @@ -0,0 +1,1411 @@ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include + +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include + +#include "bhyverun.h" +#include "acpi.h" +#include "atkbdc.h" +#include "inout.h" +#include "dbgport.h" +#include "fwctl.h" +#include "ioapic.h" +#include "mem.h" +#include "mevent.h" +#include "mptbl.h" +#include "pci_emul.h" +#include "pci_irq.h" +#include "pci_lpc.h" +#include "smbiostbl.h" +#include "snapshot.h" +#include "xmsr.h" +#include "spinup_ap.h" +#include "rtc.h" + +#include +#include + +extern int guest_ncpus; + +#define MB (1024UL * 1024) +#define GB (1024UL * MB) + +#define BHYVE_RUN_DIR "/var/run/bhyve" +#define CHECKPOINT_RUN_DIR BHYVE_RUN_DIR "/checkpoint" +#define MAX_VMNAME 100 + +#define MAX_MSG_SIZE 1024 + +#define SNAPSHOT_BUFFER_SIZE (20 * MB) + +#define JSON_STRUCT_ARR_KEY "structs" +#define JSON_DEV_ARR_KEY "devices" +#define JSON_BASIC_METADATA_KEY "basic metadata" +#define JSON_SNAPSHOT_REQ_KEY "snapshot_req" +#define JSON_SIZE_KEY "size" +#define JSON_FILE_OFFSET_KEY "file_offset" + +#define JSON_NCPUS_KEY "ncpus" +#define JSON_VMNAME_KEY "vmname" +#define JSON_MEMSIZE_KEY "memsize" +#define JSON_MEMFLAGS_KEY "memflags" + +const struct vm_snapshot_dev_info snapshot_devs[] = { + { "atkbdc", atkbdc_snapshot, NULL, NULL }, + { "virtio-net", pci_snapshot, NULL, NULL }, + { "virtio-blk", pci_snapshot, NULL, NULL }, + { "lpc", pci_snapshot, NULL, NULL }, + { "fbuf", pci_snapshot, NULL, NULL }, + { "xhci", pci_snapshot, NULL, NULL }, + { "e1000", pci_snapshot, NULL, NULL }, + { "ahci", pci_snapshot, pci_pause, pci_resume }, + { "ahci-hd", pci_snapshot, pci_pause, pci_resume }, + { "ahci-cd", pci_snapshot, NULL, NULL }, +}; + +const struct vm_snapshot_kern_info snapshot_kern_structs[] = { + { "vhpet", STRUCT_VHPET }, + { "vm", STRUCT_VM }, + { "vmx", STRUCT_VMX }, + { "vioapic", STRUCT_VIOAPIC }, + { "vlapic", STRUCT_VLAPIC }, + { "vmcx", STRUCT_VMCX }, + { "vatpit", STRUCT_VATPIT }, + { "vatpic", STRUCT_VATPIC }, + { "vpmtmr", STRUCT_VPMTMR }, + { "vrtc", STRUCT_VRTC }, +}; + +/* + * TODO: Harden this function and all of its callers since 'base_str' is a user + * provided string. 
+ */ +static char * +strcat_extension(const char *base_str, const char *ext) +{ + char *res; + size_t base_len, ext_len; + + base_len = strnlen(base_str, MAX_VMNAME); + ext_len = strnlen(ext, MAX_VMNAME); + + if (base_len + ext_len > MAX_VMNAME) { + fprintf(stderr, "Filename exceeds maximum length.\n"); + return (NULL); + } + + res = malloc(base_len + ext_len + 1); + if (res == NULL) { + perror("Failed to allocate memory."); + return (NULL); + } + + memcpy(res, base_str, base_len); + memcpy(res + base_len, ext, ext_len); + res[base_len + ext_len] = 0; + + return (res); +} + +void +destroy_restore_state(struct restore_state *rstate) +{ + if (rstate == NULL) { + fprintf(stderr, "Attempting to destroy NULL restore struct.\n"); + return; + } + + if (rstate->kdata_map != MAP_FAILED) + munmap(rstate->kdata_map, rstate->kdata_len); + + if (rstate->kdata_fd > 0) + close(rstate->kdata_fd); + if (rstate->vmmem_fd > 0) + close(rstate->vmmem_fd); + + if (rstate->meta_root_obj != NULL) + ucl_object_unref(rstate->meta_root_obj); + if (rstate->meta_parser != NULL) + ucl_parser_free(rstate->meta_parser); +} + +static int +load_vmmem_file(const char *filename, struct restore_state *rstate) +{ + struct stat sb; + int err; + + rstate->vmmem_fd = open(filename, O_RDONLY); + if (rstate->vmmem_fd < 0) { + perror("Failed to open restore file"); + return (-1); + } + + err = fstat(rstate->vmmem_fd, &sb); + if (err < 0) { + perror("Failed to stat restore file"); + goto err_load_vmmem; + } + + if (sb.st_size == 0) { + fprintf(stderr, "Restore file is empty.\n"); + goto err_load_vmmem; + } + + rstate->vmmem_len = sb.st_size; + + return (0); + +err_load_vmmem: + if (rstate->vmmem_fd > 0) + close(rstate->vmmem_fd); + return (-1); +} + +static int +load_kdata_file(const char *filename, struct restore_state *rstate) +{ + struct stat sb; + int err; + + rstate->kdata_fd = open(filename, O_RDONLY); + if (rstate->kdata_fd < 0) { + perror("Failed to open kernel data file"); + return (-1); + } + + err = fstat(rstate->kdata_fd, &sb); + if (err < 0) { + perror("Failed to stat kernel data file"); + goto err_load_kdata; + } + + if (sb.st_size == 0) { + fprintf(stderr, "Kernel data file is empty.\n"); + goto err_load_kdata; + } + + rstate->kdata_len = sb.st_size; + rstate->kdata_map = mmap(NULL, rstate->kdata_len, PROT_READ, + MAP_SHARED, rstate->kdata_fd, 0); + if (rstate->kdata_map == MAP_FAILED) { + perror("Failed to map restore file"); + goto err_load_kdata; + } + + return (0); + +err_load_kdata: + if (rstate->kdata_fd > 0) + close(rstate->kdata_fd); + return (-1); +} + +static int +load_metadata_file(const char *filename, struct restore_state *rstate) +{ + const ucl_object_t *obj; + struct ucl_parser *parser; + int err; + + parser = ucl_parser_new(UCL_PARSER_DEFAULT); + if (parser == NULL) { + fprintf(stderr, "Failed to initialize UCL parser.\n"); + goto err_load_metadata; + } + + err = ucl_parser_add_file(parser, filename); + if (err == 0) { + fprintf(stderr, "Failed to parse metadata file: '%s'\n", + filename); + err = -1; + goto err_load_metadata; + } + + obj = ucl_parser_get_object(parser); + if (obj == NULL) { + fprintf(stderr, "Failed to parse object.\n"); + err = -1; + goto err_load_metadata; + } + + rstate->meta_parser = parser; + rstate->meta_root_obj = (ucl_object_t *)obj; + + return (0); + +err_load_metadata: + if (parser != NULL) + ucl_parser_free(parser); + return (err); +} + +int +load_restore_file(const char *filename, struct restore_state *rstate) +{ + int err = 0; + char *kdata_filename = NULL, *meta_filename 
= NULL; + + assert(filename != NULL); + assert(rstate != NULL); + + memset(rstate, 0, sizeof(*rstate)); + rstate->kdata_map = MAP_FAILED; + + err = load_vmmem_file(filename, rstate); + if (err != 0) { + fprintf(stderr, "Failed to load guest RAM file.\n"); + goto err_restore; + } + + kdata_filename = strcat_extension(filename, ".kern"); + if (kdata_filename == NULL) { + fprintf(stderr, "Failed to construct kernel data filename.\n"); + goto err_restore; + } + + err = load_kdata_file(kdata_filename, rstate); + if (err != 0) { + fprintf(stderr, "Failed to load guest kernel data file.\n"); + goto err_restore; + } + + meta_filename = strcat_extension(filename, ".meta"); + if (meta_filename == NULL) { + fprintf(stderr, "Failed to construct kernel metadata filename.\n"); + goto err_restore; + } + + err = load_metadata_file(meta_filename, rstate); + if (err != 0) { + fprintf(stderr, "Failed to load guest metadata file.\n"); + goto err_restore; + } + + return (0); + +err_restore: + destroy_restore_state(rstate); + if (kdata_filename != NULL) + free(kdata_filename); + if (meta_filename != NULL) + free(meta_filename); + return (-1); +} + +#define JSON_GET_INT_OR_RETURN(key, obj, result_ptr, ret) \ +do { \ + const ucl_object_t *obj__; \ + obj__ = ucl_object_lookup(obj, key); \ + if (obj__ == NULL) { \ + fprintf(stderr, "Missing key: '%s'", key); \ + return (ret); \ + } \ + if (!ucl_object_toint_safe(obj__, result_ptr)) { \ + fprintf(stderr, "Cannot convert '%s' value to int.", key); \ + return (ret); \ + } \ +} while(0) + +#define JSON_GET_STRING_OR_RETURN(key, obj, result_ptr, ret) \ +do { \ + const ucl_object_t *obj__; \ + obj__ = ucl_object_lookup(obj, key); \ + if (obj__ == NULL) { \ + fprintf(stderr, "Missing key: '%s'", key); \ + return (ret); \ + } \ + if (!ucl_object_tostring_safe(obj__, result_ptr)) { \ + fprintf(stderr, "Cannot convert '%s' value to string.", key); \ + return (ret); \ + } \ +} while(0) + +static void * +lookup_struct(enum snapshot_req struct_id, struct restore_state *rstate, + size_t *struct_size) +{ + const ucl_object_t *structs = NULL, *obj = NULL; + ucl_object_iter_t it = NULL; + int64_t snapshot_req, size, file_offset; + + structs = ucl_object_lookup(rstate->meta_root_obj, JSON_STRUCT_ARR_KEY); + if (structs == NULL) { + fprintf(stderr, "Failed to find '%s' object.\n", + JSON_STRUCT_ARR_KEY); + return (NULL); + } + + if (ucl_object_type((ucl_object_t *)structs) != UCL_ARRAY) { + fprintf(stderr, "Object '%s' is not an array.\n", + JSON_STRUCT_ARR_KEY); + return (NULL); + } + + while ((obj = ucl_object_iterate(structs, &it, true)) != NULL) { + snapshot_req = -1; + JSON_GET_INT_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, + &snapshot_req, NULL); + assert(snapshot_req >= 0); + if ((enum snapshot_req) snapshot_req == struct_id) { + JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, + &size, NULL); + assert(size >= 0); + + JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, + &file_offset, NULL); + assert(file_offset >= 0); + assert(file_offset + size <= rstate->kdata_len); + + *struct_size = (size_t)size; + return (rstate->kdata_map + file_offset); + } + } + + return (NULL); +} + +static void * +lookup_check_dev(const char *dev_name, struct restore_state *rstate, + const ucl_object_t *obj, size_t *data_size) +{ + const char *snapshot_req; + int64_t size, file_offset; + + snapshot_req = NULL; + JSON_GET_STRING_OR_RETURN(JSON_SNAPSHOT_REQ_KEY, obj, + &snapshot_req, NULL); + assert(snapshot_req != NULL); + if (!strcmp(snapshot_req, dev_name)) { + JSON_GET_INT_OR_RETURN(JSON_SIZE_KEY, obj, + 
&size, NULL); + assert(size >= 0); + + JSON_GET_INT_OR_RETURN(JSON_FILE_OFFSET_KEY, obj, + &file_offset, NULL); + assert(file_offset >= 0); + assert(file_offset + size <= rstate->kdata_len); + + *data_size = (size_t)size; + return (rstate->kdata_map + file_offset); + } + + return (NULL); +} + +static void* +lookup_dev(const char *dev_name, struct restore_state *rstate, + size_t *data_size) +{ + const ucl_object_t *devs = NULL, *obj = NULL; + ucl_object_iter_t it = NULL; + void *ret; + + devs = ucl_object_lookup(rstate->meta_root_obj, JSON_DEV_ARR_KEY); + if (devs == NULL) { + fprintf(stderr, "Failed to find '%s' object.\n", + JSON_DEV_ARR_KEY); + return (NULL); + } + + if (ucl_object_type((ucl_object_t *)devs) != UCL_ARRAY) { + fprintf(stderr, "Object '%s' is not an array.\n", + JSON_DEV_ARR_KEY); + return (NULL); + } + + while ((obj = ucl_object_iterate(devs, &it, true)) != NULL) { + ret = lookup_check_dev(dev_name, rstate, obj, data_size); + if (ret != NULL) + return (ret); + } + + return (NULL); +} + +static const ucl_object_t * +lookup_basic_metadata_object(struct restore_state *rstate) +{ + const ucl_object_t *basic_meta_obj = NULL; + + basic_meta_obj = ucl_object_lookup(rstate->meta_root_obj, + JSON_BASIC_METADATA_KEY); + if (basic_meta_obj == NULL) { + fprintf(stderr, "Failed to find '%s' object.\n", + JSON_BASIC_METADATA_KEY); + return (NULL); + } + + if (ucl_object_type((ucl_object_t *)basic_meta_obj) != UCL_OBJECT) { + fprintf(stderr, "Object '%s' is not a JSON object.\n", + JSON_BASIC_METADATA_KEY); + return (NULL); + } + + return (basic_meta_obj); +} + +const char * +lookup_vmname(struct restore_state *rstate) +{ + const char *vmname; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (NULL); + + JSON_GET_STRING_OR_RETURN(JSON_VMNAME_KEY, obj, &vmname, NULL); + return (vmname); +} + +int +lookup_memflags(struct restore_state *rstate) +{ + int64_t memflags; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (0); + + JSON_GET_INT_OR_RETURN(JSON_MEMFLAGS_KEY, obj, &memflags, 0); + + return ((int)memflags); +} + +size_t +lookup_memsize(struct restore_state *rstate) +{ + int64_t memsize; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (0); + + JSON_GET_INT_OR_RETURN(JSON_MEMSIZE_KEY, obj, &memsize, 0); + if (memsize < 0) + memsize = 0; + + return ((size_t)memsize); +} + + +int +lookup_guest_ncpus(struct restore_state *rstate) +{ + int64_t ncpus; + const ucl_object_t *obj; + + obj = lookup_basic_metadata_object(rstate); + if (obj == NULL) + return (0); + + JSON_GET_INT_OR_RETURN(JSON_NCPUS_KEY, obj, &ncpus, 0); + return ((int)ncpus); +} + +int +restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate) +{ + return vm_restore_mem(ctx, rstate->vmmem_fd, rstate->vmmem_len); +} + +static int +vm_restore_kern_struct(struct vmctx *ctx, struct restore_state *rstate, + const struct vm_snapshot_kern_info *info) +{ + void *struct_ptr; + size_t struct_size; + int ret; + struct vm_snapshot_meta *meta; + + struct_ptr = lookup_struct(info->req, rstate, &struct_size); + if (struct_ptr == NULL) { + fprintf(stderr, "%s: Failed to lookup struct %s\r\n", + __func__, info->struct_name); + ret = -1; + goto done; + } + + if (struct_size == 0) { + fprintf(stderr, "%s: Kernel struct size was 0 for: %s\r\n", + __func__, info->struct_name); + ret = -1; + goto done; + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + .dev_name = 
info->struct_name, + .dev_req = info->req, + + .buffer.buf_start = struct_ptr, + .buffer.buf_size = struct_size, + + .buffer.buf = struct_ptr, + .buffer.buf_rem = struct_size, + + .op = VM_SNAPSHOT_RESTORE, + }; + + ret = vm_snapshot_req(meta); + if (ret != 0) { + fprintf(stderr, "%s: Failed to restore struct: %s\r\n", + __func__, info->struct_name); + goto done; + } + +done: + return (ret); +} + +int +vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate) +{ + int ret; + int i; + + for (i = 0; i < nitems(snapshot_kern_structs); i++) { + ret = vm_restore_kern_struct(ctx, rstate, + &snapshot_kern_structs[i]); + if (ret != 0) + return (ret); + } + + return (0); +} + +int +vm_restore_user_dev(struct vmctx *ctx, struct restore_state *rstate, + const struct vm_snapshot_dev_info *info) +{ + void *dev_ptr; + size_t dev_size; + int ret; + struct vm_snapshot_meta *meta; + + dev_ptr = lookup_dev(info->dev_name, rstate, &dev_size); + if (dev_ptr == NULL) { + fprintf(stderr, "Failed to lookup dev: %s\r\n", info->dev_name); + fprintf(stderr, "Continuing the restore/migration process\r\n"); + return (0); + } + + if (dev_size == 0) { + fprintf(stderr, "%s: Device size is 0. " + "Assuming %s is not used\r\n", + __func__, info->dev_name); + return (0); + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + .dev_name = info->dev_name, + + .buffer.buf_start = dev_ptr, + .buffer.buf_size = dev_size, + + .buffer.buf = dev_ptr, + .buffer.buf_rem = dev_size, + + .op = VM_SNAPSHOT_RESTORE, + }; + + ret = (*info->snapshot_cb)(meta); + if (ret != 0) { + fprintf(stderr, "Failed to restore dev: %s\r\n", + info->dev_name); + return (-1); + } + + return (0); +} + + +int +vm_restore_user_devs(struct vmctx *ctx, struct restore_state *rstate) +{ + int ret; + int i; + + for (i = 0; i < nitems(snapshot_devs); i++) { + ret = vm_restore_user_dev(ctx, rstate, &snapshot_devs[i]); + if (ret != 0) + return (ret); + } + + return 0; +} + +int +vm_pause_user_devs(struct vmctx *ctx) +{ + const struct vm_snapshot_dev_info *info; + int ret; + int i; + + for (i = 0; i < nitems(snapshot_devs); i++) { + info = &snapshot_devs[i]; + if (info->pause_cb == NULL) + continue; + + ret = info->pause_cb(ctx, info->dev_name); + if (ret != 0) + return (ret); + } + + return (0); +} + +int +vm_resume_user_devs(struct vmctx *ctx) +{ + const struct vm_snapshot_dev_info *info; + int ret; + int i; + + for (i = 0; i < nitems(snapshot_devs); i++) { + info = &snapshot_devs[i]; + if (info->resume_cb == NULL) + continue; + + ret = info->resume_cb(ctx, info->dev_name); + if (ret != 0) + return (ret); + } + + return (0); +} + +static int +vm_snapshot_kern_struct(int data_fd, xo_handle_t *xop, const char *array_key, + struct vm_snapshot_meta *meta, off_t *offset) +{ + int ret; + size_t data_size; + ssize_t write_cnt; + + ret = vm_snapshot_req(meta); + if (ret != 0) { + fprintf(stderr, "%s: Failed to snapshot struct %s\r\n", + __func__, meta->dev_name); + ret = -1; + goto done; + } + + data_size = vm_get_snapshot_size(meta); + + write_cnt = write(data_fd, meta->buffer.buf_start, data_size); + if (write_cnt != data_size) { + perror("Failed to write all snapshotted data."); + ret = -1; + goto done; + } + + /* Write metadata. 
*/ + xo_open_instance_h(xop, array_key); + xo_emit_h(xop, "{:debug_name/%s}\n", meta->dev_name); + xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%d}\n", + meta->dev_req); + xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); + xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); + xo_close_instance_h(xop, JSON_STRUCT_ARR_KEY); + + *offset += data_size; + +done: + return (ret); +} + +static int +vm_snapshot_kern_structs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) +{ + int ret, i, error; + size_t offset, buf_size; + char *buffer; + struct vm_snapshot_meta *meta; + + error = 0; + offset = 0; + buf_size = SNAPSHOT_BUFFER_SIZE; + + buffer = malloc(SNAPSHOT_BUFFER_SIZE * sizeof(char)); + if (buffer == NULL) { + error = ENOMEM; + perror("Failed to allocate memory for snapshot buffer"); + goto err_vm_snapshot_kern_data; + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + + .buffer.buf_start = buffer, + .buffer.buf_size = buf_size, + + .op = VM_SNAPSHOT_SAVE, + }; + + xo_open_list_h(xop, JSON_STRUCT_ARR_KEY); + for (i = 0; i < nitems(snapshot_kern_structs); i++) { + meta->dev_name = snapshot_kern_structs[i].struct_name; + meta->dev_req = snapshot_kern_structs[i].req; + + memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); + meta->buffer.buf = meta->buffer.buf_start; + meta->buffer.buf_rem = meta->buffer.buf_size; + + ret = vm_snapshot_kern_struct(data_fd, xop, JSON_DEV_ARR_KEY, + meta, &offset); + if (ret != 0) { + error = -1; + goto err_vm_snapshot_kern_data; + } + } + xo_close_list_h(xop, JSON_STRUCT_ARR_KEY); + +err_vm_snapshot_kern_data: + if (buffer != NULL) + free(buffer); + return (error); +} + +static int +vm_snapshot_basic_metadata(struct vmctx *ctx, xo_handle_t *xop) +{ + int error; + size_t memsize; + int memflags; + char vmname_buf[MAX_VMNAME]; + + memset(vmname_buf, 0, MAX_VMNAME); + error = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1); + if (error != 0) { + perror("Failed to get VM name"); + goto err; + } + + memsize = vm_get_lowmem_size(ctx) + vm_get_highmem_size(ctx); + memflags = vm_get_memflags(ctx); + + xo_open_container_h(xop, JSON_BASIC_METADATA_KEY); + xo_emit_h(xop, "{:" JSON_NCPUS_KEY "/%ld}\n", guest_ncpus); + xo_emit_h(xop, "{:" JSON_VMNAME_KEY "/%s}\n", vmname_buf); + xo_emit_h(xop, "{:" JSON_MEMSIZE_KEY "/%lu}\n", memsize); + xo_emit_h(xop, "{:" JSON_MEMFLAGS_KEY "/%d}\n", memflags); + xo_close_container_h(xop, JSON_BASIC_METADATA_KEY); + +err: + return (error); +} + +static int +vm_snapshot_dev_write_data(int data_fd, xo_handle_t *xop, const char *array_key, + struct vm_snapshot_meta *meta, off_t *offset) +{ + int ret; + size_t data_size; + + data_size = vm_get_snapshot_size(meta); + + ret = write(data_fd, meta->buffer.buf_start, data_size); + if (ret != data_size) { + perror("Failed to write all snapshotted data."); + return (-1); + } + + /* Write metadata. 
*/ + xo_open_instance_h(xop, array_key); + xo_emit_h(xop, "{:" JSON_SNAPSHOT_REQ_KEY "/%s}\n", meta->dev_name); + xo_emit_h(xop, "{:" JSON_SIZE_KEY "/%lu}\n", data_size); + xo_emit_h(xop, "{:" JSON_FILE_OFFSET_KEY "/%lu}\n", *offset); + xo_close_instance_h(xop, array_key); + + *offset += data_size; + + return (0); +} + +static int +vm_snapshot_user_dev(const struct vm_snapshot_dev_info *info, + int data_fd, xo_handle_t *xop, + struct vm_snapshot_meta *meta, off_t *offset) +{ + int ret; + + ret = (*info->snapshot_cb)(meta); + if (ret != 0) { + fprintf(stderr, "Failed to snapshot %s; ret=%d\r\n", + meta->dev_name, ret); + return (ret); + } + + ret = vm_snapshot_dev_write_data(data_fd, xop, JSON_DEV_ARR_KEY, meta, + offset); + if (ret != 0) + return (ret); + + return (0); +} + +static int +vm_snapshot_user_devs(struct vmctx *ctx, int data_fd, xo_handle_t *xop) +{ + int ret, i; + off_t offset; + void *buffer; + size_t buf_size; + struct vm_snapshot_meta *meta; + + buf_size = SNAPSHOT_BUFFER_SIZE; + + offset = lseek(data_fd, 0, SEEK_CUR); + if (offset < 0) { + perror("Failed to get data file current offset."); + return (-1); + } + + buffer = malloc(buf_size); + if (buffer == NULL) { + perror("Failed to allocate memory for snapshot buffer"); + ret = ENOSPC; + goto snapshot_err; + } + + meta = &(struct vm_snapshot_meta) { + .ctx = ctx, + + .buffer.buf_start = buffer, + .buffer.buf_size = buf_size, + + .op = VM_SNAPSHOT_SAVE, + }; + + xo_open_list_h(xop, JSON_DEV_ARR_KEY); + + /* Restore other devices that support this feature */ + for (i = 0; i < nitems(snapshot_devs); i++) { + meta->dev_name = snapshot_devs[i].dev_name; + + memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); + meta->buffer.buf = meta->buffer.buf_start; + meta->buffer.buf_rem = meta->buffer.buf_size; + + ret = vm_snapshot_user_dev(&snapshot_devs[i], data_fd, xop, + meta, &offset); + if (ret != 0) + goto snapshot_err; + } + + xo_close_list_h(xop, JSON_DEV_ARR_KEY); + +snapshot_err: + if (buffer != NULL) + free(buffer); + return (ret); +} + +static int +vm_mem_write_to_file(int fd, const void *src, size_t dst_offset, size_t len) +{ + size_t write_total; + ssize_t cnt_write; + size_t to_write; + + write_total = 0; + to_write = len; + + if (lseek(fd, dst_offset, SEEK_SET) < 0 ) { + perror("Failed to changed file offset"); + return (-1); + } + + while (write_total < len) { + cnt_write = write(fd, src + write_total, to_write); + if (cnt_write < 0) { + perror("Failed to write in file"); + return (-1); + } + to_write -= cnt_write; + write_total += cnt_write; + } + + return (0); +} + +static int +vm_checkpoint(struct vmctx *ctx, char *checkpoint_file, bool stop_vm) +{ + int fd_checkpoint = 0, kdata_fd = 0; + int ret = 0; + int error = 0; + size_t guest_lowmem, guest_highmem, guest_memsize; + char *guest_baseaddr; + char *guest_lowmem_addr, *guest_highmem_addr; + xo_handle_t *xop = NULL; + char *meta_filename = NULL; + char *kdata_filename = NULL; + FILE *meta_file = NULL; + + kdata_filename = strcat_extension(checkpoint_file, ".kern"); + if (kdata_filename == NULL) { + fprintf(stderr, "Failed to construct kernel data filename.\n"); + return (-1); + } + + kdata_fd = open(kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); + if (kdata_fd < 0) { + perror("Failed to open kernel data snapshot file."); + error = -1; + goto done; + } + + fd_checkpoint = open(checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700); + + if (fd_checkpoint < 0) { + perror("Failed to create checkpoint file"); + error = -1; + goto done; + } + + ret = 
vm_get_guestmem_from_ctx(ctx, &guest_baseaddr, &guest_lowmem, &guest_highmem); + if (ret < 0) { + fprintf(stderr, "Failed to get guest mem information (base, low, high)\n"); + error = -1; + goto done; + } + guest_memsize = guest_lowmem + guest_highmem; + + /* make space for VMs address space */ + ret = ftruncate(fd_checkpoint, guest_memsize); + if (ret < 0) { + perror("Failed to truncate checkpoint file"); + error = -1; + goto done; + } + + meta_filename = strcat_extension(checkpoint_file, ".meta"); + if (meta_filename == NULL) { + fprintf(stderr, "Failed to construct vm metadata filename.\n"); + error = -1; + goto done; + } + + meta_file = fopen(meta_filename, "w"); + if (meta_file == NULL) { + perror("Failed to open vm metadata snapshot file."); + error = -1; + goto done; + } + + xop = xo_create_to_file(meta_file, XO_STYLE_JSON, XOF_PRETTY); + if (xop == NULL) { + perror("Failed to get libxo handle on metadata file."); + error = -1; + goto done; + } + + ret = vm_snapshot_basic_metadata(ctx, xop); + if (ret != 0) { + fprintf(stderr, "Failed to snapshot vm basic metadata.\n"); + error = -1; + goto done; + } + + guest_lowmem_addr = guest_baseaddr; + if (guest_highmem > 0) + guest_highmem_addr = guest_baseaddr + 4*GB; + + ret = vm_pause_user_devs(ctx); + if (ret != 0) { + fprintf(stderr, "Could not pause devices\r\n"); + error = ret; + goto done; + } + + vm_vcpu_pause(ctx); + + ret = vm_snapshot_kern_structs(ctx, kdata_fd, xop); + if (ret != 0) { + fprintf(stderr, "Failed to snapshot vm kernel data.\n"); + error = -1; + goto done_unlock; + } + + ret = vm_snapshot_user_devs(ctx, kdata_fd, xop); + if (ret != 0) { + fprintf(stderr, "Failed to snapshot device state.\n"); + error = -1; + goto done_unlock; + } + + if (vm_mem_write_to_file(fd_checkpoint, guest_lowmem_addr, + 0, guest_lowmem) != 0) { + perror("Could not write lowmem"); + error = -1; + goto done_unlock; + } + + if (guest_highmem > 0) { + if (vm_mem_write_to_file(fd_checkpoint, guest_highmem_addr, + guest_lowmem, guest_highmem) != 0) { + perror("Could not write highmem"); + error = -1; + goto done_unlock; + } + } + + xo_finish_h(xop); + + if (stop_vm) { + vm_destroy(ctx); + exit(0); + } + +done_unlock: + vm_vcpu_resume(ctx); +done: + ret = vm_resume_user_devs(ctx); + if (ret != 0) + fprintf(stderr, "Could not resume devices\r\n"); + if (fd_checkpoint > 0) + close(fd_checkpoint); + if (meta_filename != NULL) + free(meta_filename); + if (kdata_filename != NULL) + free(kdata_filename); + if (xop != NULL) + xo_destroy(xop); + if (meta_file != NULL) + fclose(meta_file); + if (kdata_fd > 0) + close(kdata_fd); + return (error); +} + +int get_checkpoint_msg(int conn_fd, struct vmctx *ctx) +{ + unsigned char buf[MAX_MSG_SIZE]; + struct checkpoint_op *checkpoint_op; + int len, recv_len, total_recv = 0; + int err = 0; + + len = sizeof(struct checkpoint_op); /* expected length */ + while ((recv_len = recv(conn_fd, buf + total_recv, len - total_recv, 0)) > 0) { + total_recv += recv_len; + } + if (recv_len < 0) { + perror("Error while receiving data from bhyvectl"); + err = -1; + goto done; + } + + checkpoint_op = (struct checkpoint_op *)buf; + switch (checkpoint_op->op) { + case START_CHECKPOINT: + err = vm_checkpoint(ctx, checkpoint_op->snapshot_filename, false); + break; + case START_SUSPEND: + err = vm_checkpoint(ctx, checkpoint_op->snapshot_filename, true); + break; + default: + fprintf(stderr, "Unrecognized checkpoint operation.\n"); + err = -1; + } + +done: + close(conn_fd); + return (err); +} + +/* + * Listen for commands from bhyvectl + */ +void * checkpoint_thread(void *param) +{ + struct
checkpoint_thread_info *thread_info; + socklen_t addr_len; + int conn_fd, ret; + + thread_info = (struct checkpoint_thread_info *)param; + + addr_len = sizeof(struct sockaddr_un); + while ((conn_fd = accept(thread_info->socket_fd, + (struct sockaddr *) thread_info->addr, + &addr_len)) > -1) { + ret = get_checkpoint_msg(conn_fd, thread_info->ctx); + if (ret != 0) { + fprintf(stderr, "Failed to read message on checkpoint " + "socket. Retrying.\n"); + } + + addr_len = sizeof(struct sockaddr_un); + } + if (conn_fd < 0) { + perror("Failed to accept connection"); + } + + return (NULL); +} + +/* + * Create directory tree to store runtime specific information: + * i.e. UNIX sockets for IPC with bhyvectl. + */ +static int +make_checkpoint_dir(void) +{ + int err; + + err = mkdir(BHYVE_RUN_DIR, 0755); + if (err < 0 && errno != EEXIST) + return (err); + + err = mkdir(CHECKPOINT_RUN_DIR, 0755); + if (err < 0 && errno != EEXIST) + return (err); + + return (0); +} + +/* + * Create the listening socket for IPC with bhyvectl + */ +int +init_checkpoint_thread(struct vmctx *ctx) +{ + /* referenced by the checkpoint thread after this function returns */ + static struct sockaddr_un addr; + int socket_fd; + pthread_t checkpoint_pthread; + char vmname_buf[MAX_VMNAME]; + int ret, err = 0; + + socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (socket_fd < 0) { + perror("Socket creation failed (IPC with bhyvectl)"); + err = -1; + goto fail; + } + + err = make_checkpoint_dir(); + if (err < 0) { + perror("Failed to create checkpoint runtime directory"); + goto fail; + } + + memset(&addr, 0, sizeof(struct sockaddr_un)); + addr.sun_family = AF_UNIX; + + err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1); + if (err != 0) { + perror("Failed to get VM name"); + goto fail; + } + + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s", + CHECKPOINT_RUN_DIR, vmname_buf); + unlink(addr.sun_path); + + if (bind(socket_fd, (struct sockaddr *)&addr, + sizeof(struct sockaddr_un)) != 0) { + perror("Failed to bind socket (IPC with bhyvectl)"); + err = -1; + goto fail; + } + + if (listen(socket_fd, 10) < 0) { + perror("Failed to listen on socket (IPC with bhyvectl)"); + err = -1; + goto fail; + } + + memset(&checkpoint_info, 0, sizeof(struct checkpoint_thread_info)); + checkpoint_info.ctx = ctx; + checkpoint_info.socket_fd = socket_fd; + checkpoint_info.addr = &addr; + + /* Start the thread that listens for checkpoint connections. */ + ret = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread, + &checkpoint_info); + if (ret != 0) { + err = ret; + goto fail; + } + pthread_set_name_np(checkpoint_pthread, "checkpoint thread"); + + return (0); +fail: + if (socket_fd > 0) + close(socket_fd); + unlink(addr.sun_path); + + return (err); +} + +void +vm_snapshot_buf_err(const char *bufname, const enum vm_snapshot_op op) +{ + const char *__op; + + if (op == VM_SNAPSHOT_SAVE) + __op = "save"; + else if (op == VM_SNAPSHOT_RESTORE) + __op = "restore"; + else + __op = "unknown"; + + fprintf(stderr, "%s: snapshot-%s failed for %s\r\n", + __func__, __op, bufname); +} + +int +vm_snapshot_buf(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op; + + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + fprintf(stderr, "%s: buffer too small\r\n", __func__); + return (E2BIG); + } + + if (op == VM_SNAPSHOT_SAVE) + memcpy(buffer->buf, (uint8_t *) data, data_size); + else if (op == VM_SNAPSHOT_RESTORE) + memcpy((uint8_t *) data, buffer->buf, data_size); + else + return (EINVAL); + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + + return (0); +} +
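For context, vm_snapshot_buf() above is the primitive behind the SNAPSHOT_VAR_OR_LEAVE() and SNAPSHOT_BUF_OR_LEAVE() macros used by the device models in this change (ps2kbd, ps2mouse, uart, umouse, virtio, xhci): on save the field is copied into the meta buffer, on restore it is copied back out, and the cursor/remaining-size pair advances either way. A minimal sketch of a device snapshot callback written against that pattern follows; the foo_softc layout and field names are hypothetical, only the macro usage mirrors the callbacks elsewhere in this patch.

	/* Hypothetical device state; real softc layouts are device specific. */
	struct foo_softc {
		uint32_t	cfg_reg;	/* guest-visible config register */
		uint8_t		fifo[16];	/* internal FIFO contents */
	};

	static int
	foo_snapshot(void *scarg, struct vm_snapshot_meta *meta)
	{
		struct foo_softc *sc;
		int ret;

		sc = scarg;

		/* Scalars round-trip through vm_snapshot_buf() via the macro. */
		SNAPSHOT_VAR_OR_LEAVE(sc->cfg_reg, meta, ret, done);

		/* Fixed-size arrays are copied wholesale. */
		SNAPSHOT_BUF_OR_LEAVE(sc->fifo, sizeof(sc->fifo), meta, ret, done);

	done:
		return (ret);
	}

The same callback handles both directions; meta->op selects save versus restore, exactly as in ps2kbd_snapshot() and umouse_snapshot() above.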
+size_t +vm_get_snapshot_size(struct vm_snapshot_meta *meta) +{ + size_t length; + struct vm_snapshot_buffer *buffer; + + buffer = &meta->buffer; + + if (buffer->buf_size < buffer->buf_rem) { + fprintf(stderr, "%s: Invalid buffer: size = %zu, rem = %zu\r\n", + __func__, buffer->buf_size, buffer->buf_rem); + length = 0; + } else { + length = buffer->buf_size - buffer->buf_rem; + } + + return (length); +} + +int +vm_snapshot_guest2host_addr(void **addrp, size_t len, int restore_null, + struct vm_snapshot_meta *meta) +{ + int ret; + vm_paddr_t gaddr; + + if (meta->op == VM_SNAPSHOT_SAVE) { + gaddr = paddr_host2guest(meta->ctx, *addrp); + if (gaddr == (vm_paddr_t) -1) { + if ((restore_null == false) || + ((restore_null == true) && (*addrp != NULL))) { + ret = EFAULT; + goto done; + } + } + + SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); + } else if (meta->op == VM_SNAPSHOT_RESTORE) { + SNAPSHOT_VAR_OR_LEAVE(gaddr, meta, ret, done); + if (gaddr == (vm_paddr_t) -1) { + if (restore_null == false) { + ret = EFAULT; + goto done; + } + } + + *addrp = paddr_guest2host(meta->ctx, gaddr, len); + } else { + ret = EINVAL; + } + +done: + return (ret); +} + +int +vm_snapshot_buf_cmp(volatile void *data, size_t data_size, + struct vm_snapshot_meta *meta) +{ + struct vm_snapshot_buffer *buffer; + int op; + int ret; + + buffer = &meta->buffer; + op = meta->op; + + if (buffer->buf_rem < data_size) { + fprintf(stderr, "%s: buffer too small\r\n", __func__); + ret = E2BIG; + goto done; + } + + if (op == VM_SNAPSHOT_SAVE) { + ret = 0; + memcpy(buffer->buf, (uint8_t *) data, data_size); + } else if (op == VM_SNAPSHOT_RESTORE) { + ret = memcmp((uint8_t *) data, buffer->buf, data_size); + } else { + ret = EINVAL; + goto done; + } + + buffer->buf += data_size; + buffer->buf_rem -= data_size; + +done: + return (ret); +} Index: usr.sbin/bhyve/uart_emul.h =================================================================== --- usr.sbin/bhyve/uart_emul.h +++ usr.sbin/bhyve/uart_emul.h @@ -31,10 +31,10 @@ #ifndef _UART_EMUL_H_ #define _UART_EMUL_H_ - #define UART_IO_BAR_SIZE 8 struct uart_softc; +struct vm_snapshot_meta; typedef void (*uart_intr_func_t)(void *arg); struct uart_softc *uart_init(uart_intr_func_t intr_assert, @@ -44,4 +44,5 @@ uint8_t uart_read(struct uart_softc *sc, int offset); void uart_write(struct uart_softc *sc, int offset, uint8_t value); int uart_set_backend(struct uart_softc *sc, const char *opt); +int uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta); #endif Index: usr.sbin/bhyve/uart_emul.c =================================================================== --- usr.sbin/bhyve/uart_emul.c +++ usr.sbin/bhyve/uart_emul.c @@ -39,6 +39,8 @@ #include #endif +#include + #include #include #include @@ -699,3 +701,33 @@ return (retval); } + +int +uart_snapshot(struct uart_softc *sc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(sc->data, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->ier, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->lcr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->mcr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->lsr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->msr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->fcr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->scr, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->dll, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->dlh, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.rindex, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.windex, meta, ret, done); + 
SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.num, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->rxfifo.size, meta, ret, done); + SNAPSHOT_BUF_OR_LEAVE(sc->rxfifo.buf, sizeof(sc->rxfifo.buf), + meta, ret, done); + + sc->thre_int_pending = 1; + +done: + return (ret); +} Index: usr.sbin/bhyve/usb_emul.h =================================================================== --- usr.sbin/bhyve/usb_emul.h +++ usr.sbin/bhyve/usb_emul.h @@ -41,10 +41,10 @@ #define USB_XFER_IN 1 - struct usb_hci; struct usb_device_request; struct usb_data_xfer; +struct vm_snapshot_meta; /* Device emulation handlers */ struct usb_devemu { @@ -62,6 +62,7 @@ int (*ue_reset)(void *sc); int (*ue_remove)(void *sc); int (*ue_stop)(void *sc); + int (*ue_snapshot)(void *scarg, struct vm_snapshot_meta *meta); }; #define USB_EMUL_SET(x) DATA_SET(usb_emu_set, x); @@ -148,7 +149,6 @@ pthread_mutex_unlock(&((x)->mtx)); \ } while (0) - struct usb_devemu *usb_emu_finddev(char *name); struct usb_data_xfer_block *usb_data_xfer_append(struct usb_data_xfer *xfer, Index: usr.sbin/bhyve/usb_mouse.c =================================================================== --- usr.sbin/bhyve/usb_mouse.c +++ usr.sbin/bhyve/usb_mouse.c @@ -31,6 +31,8 @@ #include +#include + #include #include #include @@ -787,6 +789,27 @@ return (0); } +static int +umouse_snapshot(void *scarg, struct vm_snapshot_meta *meta) +{ + int ret; + struct umouse_softc *sc; + + sc = scarg; + + SNAPSHOT_VAR_OR_LEAVE(sc->um_report, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->newdata, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hid.idle, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hid.protocol, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->hid.feature, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(sc->polling, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->prev_evt.tv_sec, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(sc->prev_evt.tv_usec, meta, ret, done); + +done: + return (ret); +} struct usb_devemu ue_mouse = { .ue_emu = "tablet", @@ -797,6 +820,7 @@ .ue_data = umouse_data_handler, .ue_reset = umouse_reset, .ue_remove = umouse_remove, - .ue_stop = umouse_stop + .ue_stop = umouse_stop, + .ue_snapshot = umouse_snapshot, }; USB_EMUL_SET(ue_mouse); Index: usr.sbin/bhyve/virtio.h =================================================================== --- usr.sbin/bhyve/virtio.h +++ usr.sbin/bhyve/virtio.h @@ -285,6 +285,7 @@ struct vmctx; struct pci_devinst; struct vqueue_info; +struct vm_snapshot_meta; /* * A virtual device, with some number (possibly 0) of virtual @@ -359,6 +360,10 @@ void (*vc_apply_features)(void *, uint64_t); /* called to apply negotiated features */ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ + void (*vc_pause)(void *); /* called to pause device activity */ + void (*vc_resume)(void *); /* called to resume device activity */ + int (*vc_snapshot)(void *, struct vm_snapshot_meta *); + /* called to save / restore device state */ }; /* @@ -465,4 +470,7 @@ int baridx, uint64_t offset, int size); void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, uint64_t offset, int size, uint64_t value); +int vi_pci_snapshot(struct vm_snapshot_meta *meta); +int vi_pci_pause(struct pci_devinst *pi); +int vi_pci_resume(struct pci_devinst *pi); #endif /* _VIRTIO_H_ */ Index: usr.sbin/bhyve/virtio.c =================================================================== --- usr.sbin/bhyve/virtio.c +++ usr.sbin/bhyve/virtio.c @@ -32,6 +32,8 @@ #include #include +#include + #include #include #include @@ -777,3 +779,147 @@ if (vs->vs_mtx) 
pthread_mutex_unlock(vs->vs_mtx); } + +int +vi_pci_pause(struct pci_devinst *pi) +{ + struct virtio_softc *vs; + struct virtio_consts *vc; + + vs = pi->pi_arg; + vc = vs->vs_vc; + + vc = vs->vs_vc; + assert(vc->vc_pause != NULL); + (*vc->vc_pause)(DEV_SOFTC(vs)); + + return (0); +} + +int +vi_pci_resume(struct pci_devinst *pi) +{ + struct virtio_softc *vs; + struct virtio_consts *vc; + + vs = pi->pi_arg; + vc = vs->vs_vc; + + vc = vs->vs_vc; + assert(vc->vc_resume != NULL); + (*vc->vc_resume)(DEV_SOFTC(vs)); + + return (0); +} + +static int +vi_pci_snapshot_softc(struct virtio_softc *vs, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_OR_LEAVE(vs->vs_flags, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_negotiated_caps, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_curq, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_status, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_isr, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vs->vs_msix_cfg_idx, meta, ret, done); + +done: + return (ret); +} + +static int +vi_pci_snapshot_consts(struct virtio_consts *vc, struct vm_snapshot_meta *meta) +{ + int ret; + + SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_nvq, meta, ret, done); + SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_cfgsize, meta, ret, done); + SNAPSHOT_VAR_CMP_OR_LEAVE(vc->vc_hv_caps, meta, ret, done); + +done: + return (ret); +} + +static int +vi_pci_snapshot_queues(struct virtio_softc *vs, struct vm_snapshot_meta *meta) +{ + int i; + int ret; + struct virtio_consts *vc; + struct vqueue_info *vq; + uint64_t addr_size; + + vc = vs->vs_vc; + + /* Save virtio queue info */ + for (i = 0; i < vc->vc_nvq; i++) { + vq = &vs->vs_queues[i]; + + SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_qsize, meta, ret, done); + SNAPSHOT_VAR_CMP_OR_LEAVE(vq->vq_num, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vq->vq_flags, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vq->vq_last_avail, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vq->vq_save_used, meta, ret, done); + SNAPSHOT_VAR_OR_LEAVE(vq->vq_msix_idx, meta, ret, done); + + SNAPSHOT_VAR_OR_LEAVE(vq->vq_pfn, meta, ret, done); + + addr_size = vq->vq_qsize * sizeof(struct virtio_desc); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_desc, addr_size, + false, meta, ret, done); + + addr_size = (2 + vq->vq_qsize + 1) * sizeof(uint16_t); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_avail, addr_size, + false, meta, ret, done); + + addr_size = (2 + 2 * vq->vq_qsize + 1) * sizeof(uint16_t); + SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(vq->vq_used, addr_size, + false, meta, ret, done); + + SNAPSHOT_BUF_OR_LEAVE(vq->vq_desc, vring_size(vq->vq_qsize), + meta, ret, done); + } + +done: + return (ret); +} + +int +vi_pci_snapshot(struct vm_snapshot_meta *meta) +{ + int ret; + struct pci_devinst *pi; + struct virtio_softc *vs; + struct virtio_consts *vc; + + pi = meta->dev_data; + vs = pi->pi_arg; + vc = vs->vs_vc; + + /* Save virtio softc */ + ret = vi_pci_snapshot_softc(vs, meta); + if (ret != 0) + goto done; + + /* Save virtio consts */ + ret = vi_pci_snapshot_consts(vc, meta); + if (ret != 0) + goto done; + + /* Save virtio queue info */ + ret = vi_pci_snapshot_queues(vs, meta); + if (ret != 0) + goto done; + + /* Save device softc, if needed */ + if (vc->vc_snapshot != NULL) { + ret = (*vc->vc_snapshot)(DEV_SOFTC(vs), meta); + if (ret != 0) + goto done; + } + +done: + return (ret); +} Index: usr.sbin/bhyvectl/bhyvectl.c =================================================================== --- usr.sbin/bhyvectl/bhyvectl.c +++ usr.sbin/bhyvectl/bhyvectl.c @@ -57,6 +57,9 @@ #include #include +#include +#include + 
#include "amd/vmcb.h" #include "intel/vmcs.h" @@ -67,6 +70,9 @@ #define NO_ARG no_argument #define OPT_ARG optional_argument +#define CHECKPOINT_RUN_DIR "/var/run/bhyve/checkpoint" +#define MAX_VMNAME 100 + static const char *progname; static void @@ -78,6 +84,8 @@ " [--cpu=]\n" " [--create]\n" " [--destroy]\n" + " [--checkpoint=]\n" + " [--suspend=]\n" " [--get-all]\n" " [--get-stats]\n" " [--set-desc-ds]\n" @@ -287,6 +295,8 @@ static int unassign_pptdev, bus, slot, func; static int run; static int get_cpu_topology; +static int vm_checkpoint_opt; +static int vm_suspend_opt; /* * VMCB specific. @@ -591,6 +601,8 @@ SET_RTC_TIME, SET_RTC_NVRAM, RTC_NVRAM_OFFSET, + SET_CHECKPOINT_FILE, + SET_SUSPEND_FILE, }; static void @@ -1459,6 +1471,8 @@ { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, { "get-intinfo", NO_ARG, &get_intinfo, 1 }, { "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 }, + { "checkpoint", REQ_ARG, 0, SET_CHECKPOINT_FILE}, + { "suspend", REQ_ARG, 0, SET_SUSPEND_FILE}, }; const struct option intel_opts[] = { @@ -1676,6 +1690,80 @@ } } +static int +send_checkpoint_op_req(struct vmctx *ctx, struct checkpoint_op *op) +{ + struct sockaddr_un addr; + int socket_fd, len, len_sent, total_sent; + int err = 0; + char vmname_buf[MAX_VMNAME]; + + socket_fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (socket_fd < 0) { + perror("Error creating bhyvectl socket"); + err = -1; + goto done; + } + + memset(&addr, 0, sizeof(struct sockaddr_un)); + addr.sun_family = AF_UNIX; + + err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1); + if (err != 0) { + perror("Failed to get VM name"); + goto done; + } + + snprintf(addr.sun_path, PATH_MAX, "%s/%s", CHECKPOINT_RUN_DIR, vmname_buf); + + if (connect(socket_fd, (struct sockaddr *)&addr, + sizeof(struct sockaddr_un)) != 0) { + perror("Connect to VM socket failed"); + err = -1; + goto done; + } + + len = sizeof(*op); + total_sent = 0; + while ((len_sent = send(socket_fd, (char *)op + total_sent, len - total_sent, 0)) > 0) { + total_sent += len_sent; + } + + if (len_sent < 0) { + perror("Failed to send checkpoint operation request"); + err = -1; + } + +done: + if (socket_fd > 0) + close(socket_fd); + return (err); +} + +static int +send_start_checkpoint(struct vmctx *ctx, const char *checkpoint_file) +{ + struct checkpoint_op op; + + op.op = START_CHECKPOINT; + strncpy(op.snapshot_filename, checkpoint_file, MAX_SNAPSHOT_VMNAME); + op.snapshot_filename[MAX_SNAPSHOT_VMNAME - 1] = 0; + + return send_checkpoint_op_req(ctx, &op); +} + +static int +send_start_suspend(struct vmctx *ctx, const char *suspend_file) +{ + struct checkpoint_op op; + + op.op = START_SUSPEND; + strncpy(op.snapshot_filename, suspend_file, MAX_SNAPSHOT_VMNAME); + op.snapshot_filename[MAX_SNAPSHOT_VMNAME - 1] = 0; + + return send_checkpoint_op_req(ctx, &op); +} + int main(int argc, char *argv[]) { @@ -1692,6 +1780,7 @@ uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; struct tm tm; struct option *opts; + char *checkpoint_file, *suspend_file; cpu_intel = cpu_vendor_intel(); opts = setup_options(cpu_intel); @@ -1858,6 +1947,14 @@ case ASSERT_LAPIC_LVT: assert_lapic_lvt = atoi(optarg); break; + case SET_CHECKPOINT_FILE: + vm_checkpoint_opt = 1; + checkpoint_file = optarg; + break; + case SET_SUSPEND_FILE: + vm_suspend_opt = 1; + suspend_file = optarg; + break; default: usage(cpu_intel); } @@ -2343,6 +2440,12 @@ if (!error && destroy) vm_destroy(ctx); + if (!error && vm_checkpoint_opt) + error = send_start_checkpoint(ctx, checkpoint_file); + + if (!error && vm_suspend_opt) + error = 
send_start_suspend(ctx, suspend_file); + free (opts); exit(error); }
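For reference, the checkpoint and suspend requests added to bhyvectl above travel as a single struct checkpoint_op over the per-VM UNIX socket that init_checkpoint_thread() binds under /var/run/bhyve/checkpoint/<vmname>. The sketch below shows a standalone client speaking the same protocol; it assumes the struct checkpoint_op and START_CHECKPOINT definitions used by bhyvectl above (exported through vmmapi.h), and the helper itself is hypothetical, mirroring send_checkpoint_op_req().

	#include <sys/socket.h>
	#include <sys/un.h>

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	#include <vmmapi.h>

	/*
	 * Ask the bhyve process running 'vmname' to checkpoint itself into
	 * 'file'. Error handling is trimmed for brevity.
	 */
	static int
	request_checkpoint(const char *vmname, const char *file)
	{
		struct sockaddr_un addr;
		struct checkpoint_op op;
		int fd;

		fd = socket(PF_UNIX, SOCK_STREAM, 0);
		if (fd < 0)
			return (-1);

		memset(&addr, 0, sizeof(addr));
		addr.sun_family = AF_UNIX;
		snprintf(addr.sun_path, sizeof(addr.sun_path),
		    "/var/run/bhyve/checkpoint/%s", vmname);

		if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) != 0) {
			close(fd);
			return (-1);
		}

		memset(&op, 0, sizeof(op));
		op.op = START_CHECKPOINT;
		strlcpy(op.snapshot_filename, file, sizeof(op.snapshot_filename));

		if (send(fd, &op, sizeof(op), 0) != (ssize_t)sizeof(op)) {
			close(fd);
			return (-1);
		}

		close(fd);
		return (0);
	}

On the receiving side, get_checkpoint_msg() reads exactly one such structure and dispatches to vm_checkpoint(), which writes three files side by side: the guest memory image at the requested path plus the .kern and .meta companions that load_restore_file() expects when the VM is later restored.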