diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -262,4 +262,16 @@
 int vm_snapshot_req(struct vm_snapshot_meta *meta);
 int vm_restore_time(struct vmctx *ctx);
+int vm_get_pages_num(struct vmctx *ctx, size_t *lowmem_pages,
+    size_t *highmem_pages);
+int vm_set_vmm_migration_segments(struct vmctx *ctx,
+    struct vmm_migration_segment *lowmem,
+    struct vmm_migration_segment *highmem);
+int vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num);
+
+int vm_copy_vmm_pages(struct vmctx *ctx,
+    struct vmm_migration_pages_req *pages_req);
+int vm_init_vmm_migration_pages_req(struct vmctx *ctx,
+    struct vmm_migration_pages_req *req);
+
 #endif /* _VMMAPI_H_ */
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -1630,6 +1630,116 @@
 	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
 }
 
+int
+vm_get_pages_num(struct vmctx *ctx, size_t *lowmem_pages, size_t *highmem_pages)
+{
+	/* ctx cannot be NULL */
+	if (ctx == NULL)
+		return (-1);
+
+	if (lowmem_pages != NULL)
+		*lowmem_pages = ctx->lowmem / PAGE_SIZE;
+
+	if (highmem_pages != NULL)
+		*highmem_pages = ctx->highmem / PAGE_SIZE;
+
+	return (0);
+}
+
+int
+vm_set_vmm_migration_segments(struct vmctx *ctx,
+    struct vmm_migration_segment *lowmem,
+    struct vmm_migration_segment *highmem)
+{
+
+	if (lowmem != NULL) {
+		lowmem->start = 0;
+		lowmem->end = ctx->lowmem;
+	}
+
+	if (highmem != NULL) {
+		if (ctx->highmem != 0) {
+			highmem->start = 4 * GB;
+			highmem->end = 4 * GB + ctx->highmem;
+		}
+	}
+
+	return (0);
+}
+
+int
+vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num_pages)
+{
+	int error;
+	struct vm_get_dirty_page_list list;
+
+	bzero(&list, sizeof(struct vm_get_dirty_page_list));
+	list.page_list = (uint8_t *)page_list;
+	list.num_pages = num_pages;
+
+	error = vm_set_vmm_migration_segments(ctx, &(list.lowmem),
+	    &(list.highmem));
+	if (error != 0)
+		return (error);
+
+	error = ioctl(ctx->fd, VM_GET_DIRTY_PAGE_LIST, &list);
+
+	return (error);
+}
+
+int
+vm_copy_vmm_pages(struct vmctx *ctx, struct vmm_migration_pages_req *pages_req)
+{
+	int error;
+	size_t index;
+
+	if (pages_req == NULL)
+		return (-1);
+
+	if (pages_req->pages_required > VMM_PAGE_CHUNK)
+		return (E2BIG);
+
+	for (index = 0; index < pages_req->pages_required; index++) {
+		if (pages_req->pages[index].page == NULL)
+			return (-1);
+
+		if (pages_req->req_type == VMM_GET_PAGES)
+			memset(pages_req->pages[index].page, 0, PAGE_SIZE);
+	}
+
+	error = ioctl(ctx->fd, VM_COPY_VMM_PAGES, pages_req);
+
+	return (error);
+}
+
+int
+vm_init_vmm_migration_pages_req(struct vmctx *ctx,
+    struct vmm_migration_pages_req *req)
+{
+	size_t index;
+	struct vmm_migration_page *page;
+
+	vm_set_vmm_migration_segments(ctx, &(req->lowmem_segment),
+	    &(req->highmem_segment));
+
+	for (index = 0; index < VMM_PAGE_CHUNK; index++) {
+		page = &req->pages[index];
+		page->page = malloc(PAGE_SIZE);
+		if (page->page == NULL)
+			goto deallocate_error;
+		memset(page->page, 0, PAGE_SIZE);
+	}
+
+	return (0);
+
+deallocate_error:
+	/* Free only the pages allocated so far; later entries are unset. */
+	while (index-- > 0) {
+		page = &req->pages[index];
+		free(page->page);
+		page->page = NULL;
+	}
+
+	return (-1);
+}
+
 int
 vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores,
     uint16_t threads, uint16_t maxcpus)
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -35,6 +35,8 @@
 #include
 struct
vm_snapshot_meta; +struct vm_get_dirty_page_list; +struct vmm_migration_pages_req; #ifdef _KERNEL SDT_PROVIDER_DECLARE(vmm); @@ -287,6 +289,8 @@ void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta); int vm_restore_time(struct vm *vm); +int vm_get_dirty_page_list(struct vm *vm, struct vm_get_dirty_page_list *list); +int vm_copy_vmm_pages(struct vm *vm, struct vmm_migration_pages_req *pages_req); #ifdef _SYS__CPUSET_H_ /* diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h --- a/sys/amd64/include/vmm_dev.h +++ b/sys/amd64/include/vmm_dev.h @@ -31,6 +31,8 @@ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ +#include "vmm_migration.h" + struct vm_snapshot_meta; #ifdef _KERNEL @@ -256,6 +258,13 @@ }; _Static_assert(sizeof(struct vm_readwrite_kernemu_device) == 24, "ABI"); +struct vm_get_dirty_page_list { + uint8_t *page_list; + size_t num_pages; + struct vmm_migration_segment lowmem; + struct vmm_migration_segment highmem; +}; + enum { /* general routines */ IOCNUM_ABIVERS = 0, @@ -344,7 +353,9 @@ /* checkpoint */ IOCNUM_SNAPSHOT_REQ = 113, - IOCNUM_RESTORE_TIME = 115 + IOCNUM_RESTORE_TIME = 115, + IOCNUM_VM_GET_DIRTY_PAGE_LIST = 117, + IOCNUM_VM_COPY_VMM_PAGES = 118, }; #define VM_RUN \ @@ -475,4 +486,8 @@ _IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta) #define VM_RESTORE_TIME \ _IOWR('v', IOCNUM_RESTORE_TIME, int) +#define VM_GET_DIRTY_PAGE_LIST \ + _IOWR('v', IOCNUM_VM_GET_DIRTY_PAGE_LIST, struct vm_get_dirty_page_list) +#define VM_COPY_VMM_PAGES \ + _IOWR('v', IOCNUM_VM_COPY_VMM_PAGES, struct vmm_migration_pages_req) #endif diff --git a/sys/amd64/include/vmm_migration.h b/sys/amd64/include/vmm_migration.h new file mode 100644 --- /dev/null +++ b/sys/amd64/include/vmm_migration.h @@ -0,0 +1,66 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+
+#ifndef _VMM_MIGRATION_H_
+#define _VMM_MIGRATION_H_
+
+#define VMM_PAGE_CHUNK 10
+
+enum migration_req_type {
+	VMM_GET_PAGES = 0,
+	VMM_SET_PAGES = 1,
+};
+
+struct vmm_migration_page {
+	vm_pindex_t pindex;
+	uint8_t *page;
+};
+
+/*
+ * A bhyve guest has two memory segments:
+ * - lowmem segment: mapped from 0GB to 3GB (which is lowmem_limit)
+ * - highmem segment: mapped starting from 4GB
+ * The object that represents a segment is identified by start and end values.
+ */
+struct vmm_migration_segment {
+	vm_offset_t start;
+	vm_offset_t end;
+};
+
+struct vmm_migration_pages_req {
+	size_t pages_required;
+	enum migration_req_type req_type;
+	struct vmm_migration_segment lowmem_segment;
+	struct vmm_migration_segment highmem_segment;
+	struct vmm_migration_page pages[VMM_PAGE_CHUNK];
+};
+
+#endif /* _VMM_MIGRATION_H_ */
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -65,6 +65,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -73,6 +74,7 @@
 #include
 #include
 #include
+#include
 #include "vmm_ioport.h"
 #include "vmm_ktr.h"
@@ -146,6 +148,9 @@
 };
 #define VM_MAX_MEMMAPS 8
+#define MB (1024UL * 1024)
+#define GB (1024UL * MB)
+
 /*
  * Initialization:
  * (o) initialized the first time the VM is created
@@ -2949,4 +2954,191 @@
 	return (0);
 }
-#endif
+
+static inline void
+vm_search_dirty_pages_in_object(vm_object_t object, size_t start, size_t end,
+    size_t offset, uint8_t *page_list)
+{
+	vm_pindex_t pindex;
+	vm_page_t m;
+	uint8_t result;
+
+	for (pindex = start / PAGE_SIZE; pindex < end / PAGE_SIZE; pindex++) {
+		VM_OBJECT_WLOCK(object);
+		m = vm_page_lookup(object, pindex);
+		VM_OBJECT_WUNLOCK(object);
+		if (m != NULL) {
+			result = vm_page_test_vmm_dirty(m);
+			copyout(&result, page_list + pindex - offset, sizeof(result));
+		}
+	}
+}
+
+int
+vm_get_dirty_page_list(struct vm *vm, struct vm_get_dirty_page_list *list)
+{
+	int error = 0;
+	struct vmspace *vm_vmspace;
+	struct vm_map *vmmap;
+	struct vm_map_entry *entry;
+	struct vm_object *object;
+	uint8_t *page_list;
+	size_t offset;
+
+	page_list = list->page_list;
+
+	if (page_list == NULL)
+		return (-1);
+
+	vm_vmspace = vm->vmspace;
+
+	if (vm_vmspace == NULL) {
+		printf("%s: vm_vmspace is null\r\n", __func__);
+		error = -1;
+		return (error);
+	}
+
+	vmmap = &vm_vmspace->vm_map;
+
+	vm_map_lock(vmmap);
+	if (vmmap->busy)
+		vm_map_wait_busy(vmmap);
+
+	for (entry = vmmap->header.right; entry != &vmmap->header; entry = entry->right) {
+		object = entry->object.vm_object;
+
+		/* The entry that matches the lowmem segment. */
+		if (entry->start == list->lowmem.start &&
+		    entry->end == list->lowmem.end) {
+			if (object == NULL)
+				continue;
+			vm_search_dirty_pages_in_object(object,
+			    list->lowmem.start,
+			    list->lowmem.end,
+			    0,
+			    page_list);
+		}
+
+		/* The entry that matches the highmem segment. */
+		if (entry->start == list->highmem.start &&
+		    entry->end == list->highmem.end) {
+			if (object == NULL)
+				continue;
+			offset = (list->highmem.start - list->lowmem.end) / PAGE_SIZE;
+			vm_search_dirty_pages_in_object(object,
+			    list->highmem.start,
+			    list->highmem.end,
+			    offset,
+			    page_list);
+		}
+	}
+
+	vm_map_unlock(vmmap);
+
+	return (error);
+}
+
+static inline void
+vm_copy_object_pages(vm_object_t lowmem_object, vm_object_t highmem_object,
+    struct vmm_migration_pages_req *page_req)
+{
+	vm_pindex_t pindex;
+	vm_object_t object;
+	struct vmm_migration_page migration_page;
+	size_t page_idx, limit_page;
+	void *dst;
+	size_t pindex_offset;
+	enum migration_req_type req_type;
+
+	req_type = page_req->req_type;
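+	/*
+	 * Guest page indexes below limit_page (the first 3GB of pages) fall
+	 * in the lowmem object; the rest belong to the highmem object, which
+	 * is mapped starting at 4GB.  Whenever a highmem segment exists,
+	 * lowmem is exactly 3GB (lowmem_limit), so translating a page index
+	 * into a highmem object pindex only requires adding the 1GB hole.
+	 */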
+
+	if (lowmem_object == NULL) {
+		printf("%s: lowmem_object is NULL\r\n", __func__);
+		return;
+	}
+	limit_page = 3UL * GB / PAGE_SIZE;
+	for (page_idx = 0; page_idx < page_req->pages_required; page_idx++) {
+		migration_page = page_req->pages[page_idx];
+		pindex = migration_page.pindex;
+		dst = (void *) migration_page.page;
+		if (pindex >= limit_page) {
+			if (highmem_object == NULL) {
+				printf("%s: highmem_object is NULL\r\n", __func__);
+				return;
+			}
+			object = highmem_object;
+			pindex_offset = 1UL * GB / PAGE_SIZE;
+		} else {
+			object = lowmem_object;
+			pindex_offset = 0;
+		}
+
+		if (req_type == VMM_GET_PAGES) {
+			VM_OBJECT_WLOCK(object);
+			vm_object_get_page(object, pindex + pindex_offset, dst);
+			VM_OBJECT_WUNLOCK(object);
+		} else if (req_type == VMM_SET_PAGES) {
+			VM_OBJECT_WLOCK(object);
+			vm_object_set_page(object, pindex + pindex_offset, dst);
+			VM_OBJECT_WUNLOCK(object);
+		} else
+			return;
+	}
+}
+
+int
+vm_copy_vmm_pages(struct vm *vm, struct vmm_migration_pages_req *pages_req)
+{
+	int error = 0;
+	struct vmspace *vm_vmspace;
+	struct vm_map *vmmap;
+	struct vm_map_entry *entry;
+	struct vm_object *lowmem_object, *highmem_object, *object;
+	struct vmm_migration_segment lowmem_segment, highmem_segment;
+
+	lowmem_segment = pages_req->lowmem_segment;
+	highmem_segment = pages_req->highmem_segment;
+	vm_vmspace = vm->vmspace;
+
+	if (vm_vmspace == NULL) {
+		printf("%s: vm_vmspace is null\r\n", __func__);
+		error = -1;
+		return (error);
+	}
+
+	vmmap = &vm_vmspace->vm_map;
+
+	vm_map_lock(vmmap);
+	if (vmmap->busy)
+		vm_map_wait_busy(vmmap);
+
+	lowmem_object = NULL;
+	highmem_object = NULL;
+	for (entry = vmmap->header.right; entry != &vmmap->header; entry = entry->right) {
+		object = entry->object.vm_object;
+
+		if (entry->start == lowmem_segment.start &&
+		    entry->end == lowmem_segment.end) {
+			lowmem_object = object;
+		}
+
+		if (entry->start == highmem_segment.start &&
+		    entry->end == highmem_segment.end) {
+			highmem_object = object;
+		}
+	}
+
+	if (lowmem_object == NULL) {
+		/* Drop the map lock before bailing out. */
+		vm_map_unlock(vmmap);
+		return (-1);
+	}
+
+	vm_copy_object_pages(lowmem_object, highmem_object, pages_req);
+
+	vm_map_unlock(vmmap);
+
+	return (error);
+}
+#endif /* BHYVE_SNAPSHOT */
+
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -59,6 +59,7 @@
 #include
 #include
 #include
+#include
 #include "vmm_lapic.h"
 #include "vmm_stat.h"
@@ -388,6 +389,8 @@
 	int *regnums;
 #ifdef BHYVE_SNAPSHOT
 	struct vm_snapshot_meta *snapshot_meta;
+	struct vm_get_dirty_page_list *page_list;
+	struct vmm_migration_pages_req *pages_req;
 #endif
 	error = vmm_priv_check(curthread->td_ucred);
@@ -871,6 +874,14 @@
 	case VM_RESTORE_TIME:
 		error = vm_restore_time(sc->vm);
 		break;
+	case VM_GET_DIRTY_PAGE_LIST:
+		page_list = (struct vm_get_dirty_page_list *)data;
+		error = vm_get_dirty_page_list(sc->vm, page_list);
+		break;
+	case VM_COPY_VMM_PAGES:
+		pages_req = (struct vmm_migration_pages_req *)data;
+		error = vm_copy_vmm_pages(sc->vm, pages_req);
+		break;
 #endif
 	default:
 		error = ENOTTY;
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -390,6 +390,8 @@
 void vm_object_unwire(vm_object_t object, vm_ooffset_t offset,
     vm_size_t length, uint8_t queue);
 struct vnode *vm_object_vnode(vm_object_t object);
+int vm_object_get_page(vm_object_t object, vm_pindex_t pindex, void *dst);
+int vm_object_set_page(vm_object_t object, vm_pindex_t pindex, void *src);
 #endif /* _KERNEL */
 #endif /* _VM_OBJECT_ */
diff --git a/sys/vm/vm_object.c
b/sys/vm/vm_object.c --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -2592,6 +2592,52 @@ CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); +int +vm_object_get_page(vm_object_t object, vm_pindex_t pindex, void *dst) +{ + vm_page_t page; + vm_offset_t page_src; + + VM_OBJECT_ASSERT_WLOCKED(object); + + page = vm_page_lookup(object, pindex); + if (page == NULL) { + // cannot find page + return (-1); + } + + vm_page_xbusy(page); + page->oflags &= ~VPO_VMM_DIRTY; + + pmap_clear_modify(page); + + page_src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(page)); + copyout((void *)page_src, dst, PAGE_SIZE); + vm_page_xunbusy(page); + + return (0); +} + +int +vm_object_set_page(vm_object_t object, vm_pindex_t pindex, void *src) +{ + vm_page_t page; + vm_offset_t page_src; + + VM_OBJECT_ASSERT_WLOCKED(object); + + page = vm_page_lookup(object, pindex); + if (page == NULL) { + // cannot find page + return (-1); + } + + page_src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(page)); + copyin(src, (void *)page_src, PAGE_SIZE); + + return (0); +} + #include "opt_ddb.h" #ifdef DDB #include diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -295,6 +295,7 @@ #define VPO_SWAPSLEEP 0x02 /* waiting for swap to finish */ #define VPO_UNMANAGED 0x04 /* no PV management for page */ #define VPO_SWAPINPROG 0x08 /* swap I/O in progress on page */ +#define VPO_VMM_DIRTY 0x80 /* dirty bit used for bhyve migration */ /* * Busy page implementation details. @@ -693,6 +694,7 @@ void vm_page_valid(vm_page_t m); int vm_page_is_valid(vm_page_t, int, int); void vm_page_test_dirty(vm_page_t); +uint8_t vm_page_test_vmm_dirty(vm_page_t m); vm_page_bits_t vm_page_bits(int base, int size); void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid); void vm_page_free_pages_toq(struct spglist *free, bool update_wire_count); @@ -890,6 +892,7 @@ vm_page_dirty_KBI(m); #else m->dirty = VM_PAGE_BITS_ALL; + m->oflags |= VPO_VMM_DIRTY; #endif } diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -1483,6 +1483,28 @@ /* Refer to this operation by its public name. 
*/ KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; + m->oflags |= VPO_VMM_DIRTY; +} + +uint8_t +vm_page_test_vmm_dirty(vm_page_t m) +{ + uint64_t value; + + vm_page_xbusy(m); + vm_page_test_dirty(m); + vm_page_xunbusy(m); + +// VM_OBJECT_ASSERT_WLOCKED(m->object); + + value = m->oflags & VPO_VMM_DIRTY; + if (value == 0 && pmap_is_modified(m)) + value = 1; + + if (value == 0) + return (0); + else + return (1); } /* diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -34,6 +34,7 @@ kernemu_dev.c \ mem.c \ mevent.c \ + migration.c \ mptbl.c \ net_backends.c \ net_utils.c \ diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -96,6 +96,9 @@ #include "kernemu_dev.h" #include "mem.h" #include "mevent.h" +#ifdef BHYVE_SNAPSHOT +#include "migration.h" +#endif #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" @@ -243,6 +246,7 @@ " -p: pin 'vcpu' to 'hostcpu'\n" #ifdef BHYVE_SNAPSHOT " -r: path to checkpoint file\n" + " -R: the source vm host and port for migration\n" #endif " -S: guest memory cannot be swapped\n" " -s: PCI slot config\n" @@ -1214,10 +1218,12 @@ char *optstr; #ifdef BHYVE_SNAPSHOT char *restore_file; + char *receive_migration; struct restore_state rstate; int vcpu; restore_file = NULL; + receive_migration = NULL; #endif init_config(); @@ -1225,7 +1231,7 @@ progname = basename(argv[0]); #ifdef BHYVE_SNAPSHOT - optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:r:"; + optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:r:R:"; #else optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:"; #endif @@ -1278,6 +1284,9 @@ case 'r': restore_file = optarg; break; + case 'R': + receive_migration = optarg; + break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { @@ -1494,7 +1503,25 @@ exit(1); } } -#endif + + if (receive_migration != NULL) { + if (vm_pause_user_devs(ctx) != 0) { + fprintf(stderr, "Failed to pause PCI device state.\n"); + exit(1); + } + + fprintf(stdout, "Starting the migration process...\r\n"); + if (receive_vm_migration(ctx, receive_migration) != 0) { + fprintf(stderr, "Failed to migrate the vm.\r\n"); + exit(1); + } + + if (vm_resume_user_devs(ctx) != 0) { + fprintf(stderr, "Failed to resume PCI device state.\n"); + exit(1); + } + } +#endif /* BHYVE_SNAPSHOT */ error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); @@ -1549,7 +1576,7 @@ if (init_checkpoint_thread(ctx) < 0) printf("Failed to start checkpoint thread!\r\n"); - if (restore_file != NULL) + if ((restore_file != NULL) || (receive_migration != NULL)) vm_restore_time(ctx); #endif @@ -1563,7 +1590,7 @@ * If we restore a VM, start all vCPUs now (including APs), otherwise, * let the guest OS to spin them up later via vmexits. */ - if (restore_file != NULL) { + if ((restore_file != NULL) || (receive_migration != NULL)) { for (vcpu = 0; vcpu < guest_ncpus; vcpu++) { if (vcpu == BSP) continue; diff --git a/usr.sbin/bhyve/migration.h b/usr.sbin/bhyve/migration.h new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/migration.h @@ -0,0 +1,87 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017-2020 Elena Mihailescu + * Copyright (c) 2017-2020 Darius Mihai + * Copyright (c) 2017-2020 Mihai Carabas + * All rights reserved. + * The migration feature was developed under sponsorships + * from Matthew Grooms. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _BHYVE_MIGRATION_ +#define _BHYVE_MIGRATION_ + +#include +#include +#include "snapshot.h" + +struct vmctx; + +int receive_vm_migration(struct vmctx *ctx, char *migration_data); + +/* Warm Migration */ +#define MAX_DEV_NAME_LEN 64 + +#define MAX_IP_LEN 64 +#define MAX_SPEC_LEN 256 + +#define MIGRATION_SPECS_OK 0 +#define MIGRATION_SPECS_NOT_OK 1 + +#define NO_KERN_STRUCT -1 + +enum migration_transfer_req { + MIGRATION_SEND_REQ = 0, + MIGRATION_RECV_REQ = 1 +}; + +enum message_types { + MESSAGE_TYPE_SPECS = 1, + MESSAGE_TYPE_METADATA = 2, + MESSAGE_TYPE_RAM = 3, + MESSAGE_TYPE_KERN = 4, + MESSAGE_TYPE_DEV = 5, + MESSAGE_TYPE_UNKNOWN = 8, +}; + +struct __attribute__((packed)) migration_message_type { + size_t len; + unsigned int type; /* enum message_type */ + unsigned int req_type; /* enum snapshot_req */ + char name[MAX_DEV_NAME_LEN]; +}; + +struct __attribute__((packed)) migration_system_specs { + char hw_machine[MAX_SPEC_LEN]; + char hw_model[MAX_SPEC_LEN]; + size_t hw_pagesize; +}; + +int vm_send_migrate_req(struct vmctx *ctx, struct migrate_req req, bool live); +int vm_recv_migrate_req(struct vmctx *ctx, struct migrate_req req); + +#endif diff --git a/usr.sbin/bhyve/migration.c b/usr.sbin/bhyve/migration.c new file mode 100644 --- /dev/null +++ b/usr.sbin/bhyve/migration.c @@ -0,0 +1,1444 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017-2020 Elena Mihailescu + * Copyright (c) 2017-2020 Darius Mihai + * Copyright (c) 2017-2020 Mihai Carabas + * All rights reserved. + * The migration feature was developed under sponsorships + * from Matthew Grooms. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#include +#endif + +#include +#include +#include + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "migration.h" +#include "pci_emul.h" +#include "snapshot.h" + +#define MB (1024UL * 1024) +#define GB (1024UL * MB) + +#define ALLOCA_VM_SNAPSHOT_META(CTX, DEV_NAME, DEV_REQ, BUFFER, BUFFER_SIZE, OP) \ +({ \ + &(struct vm_snapshot_meta) { \ + .ctx = CTX, \ + .dev_name = DEV_NAME, \ + .dev_req = DEV_REQ, \ + \ + .buffer.buf_start = BUFFER, \ + .buffer.buf_size = BUFFER_SIZE, \ + .op = OP, \ + }; \ + \ +}) + +#ifdef BHYVE_DEBUG +#define DPRINTF(FMT, ...) \ +({ \ + fprintf(stderr, "%s: " FMT "\r\n", __func__, ##__VA_ARGS__); \ + }) +#else +#define DPRINTF(FMT, ...) +#endif + +#define EPRINTF(FMT, ...) \ +({ \ + fprintf(stderr, "%s: " FMT "\r\n", __func__, ##__VA_ARGS__); \ + }) + +int +receive_vm_migration(struct vmctx *ctx, char *migration_data) +{ + struct migrate_req req; + char *hostname, *pos; + int rc; + + memset(req.host, 0, MAX_HOSTNAME_LEN); + hostname = strdup(migration_data); + + if ((pos = strchr(hostname, ',')) != NULL) { + *pos = '\0'; + strlcpy(req.host, hostname, MAX_HOSTNAME_LEN); + pos = pos + 1; + + rc = sscanf(pos, "%d", &(req.port)); + + if (rc == 0) { + EPRINTF("Could not parse the port"); + free(hostname); + return -1; + } + } else { + strlcpy(req.host, hostname, MAX_HOSTNAME_LEN); + + /* If only one variable could be read, it should be the host */ + req.port = DEFAULT_MIGRATION_PORT; + } + + rc = vm_recv_migrate_req(ctx, req); + + free(hostname); + return (rc); +} + +static int +get_system_specs_for_migration(struct migration_system_specs *specs) +{ + int mib[2]; + size_t len_machine, len_model, len_pagesize; + char interm[MAX_SPEC_LEN]; + int rc; + int num; + + mib[0] = CTL_HW; + mib[1] = HW_MACHINE; + memset(interm, 0, MAX_SPEC_LEN); + len_machine = sizeof(interm); + + rc = sysctl(mib, 2, interm, &len_machine, NULL, 0); + if (rc != 0) { + perror("Could not retrieve HW_MACHINE specs"); + return (rc); + } + strlcpy(specs->hw_machine, interm, MAX_SPEC_LEN); + + memset(interm, 0, MAX_SPEC_LEN); + mib[0] = CTL_HW; + mib[1] = HW_MODEL; + len_model = sizeof(interm); + rc = sysctl(mib, 2, interm, &len_model, NULL, 0); + if (rc != 0) { + perror("Could not retrieve HW_MODEL specs"); + return (rc); + } + strlcpy(specs->hw_model, interm, MAX_SPEC_LEN); + + mib[0] = CTL_HW; + mib[1] = HW_PAGESIZE; + len_pagesize = sizeof(num); + rc = sysctl(mib, 2, &num, &len_pagesize, NULL, 0); + if (rc != 0) { + perror("Could not retrieve HW_PAGESIZE specs"); + return (rc); + } + specs->hw_pagesize = num; + + 
return (0);
+}
+
+static int
+migration_transfer_data(int socket, void *msg, size_t len, enum migration_transfer_req req)
+{
+	uint64_t to_transfer, total_transferred;
+	int64_t transferred;
+
+	to_transfer = len;
+	total_transferred = 0;
+
+	while (to_transfer > 0) {
+		switch (req) {
+		case MIGRATION_SEND_REQ:
+			transferred = send(socket, (uint8_t *)msg + total_transferred,
+			    to_transfer, 0);
+			break;
+		case MIGRATION_RECV_REQ:
+			transferred = recv(socket, (uint8_t *)msg + total_transferred,
+			    to_transfer, 0);
+			break;
+		default:
+			DPRINTF("Unknown transfer option");
+			return (-1);
+		}
+
+		if (transferred == 0)
+			break;
+		if (transferred < 0) {
+			perror("Error while transferring data");
+			return (transferred);
+		}
+
+		to_transfer -= transferred;
+		total_transferred += transferred;
+	}
+
+	/* A zero-length send/recv means the peer closed the connection. */
+	if (to_transfer > 0)
+		return (-1);
+
+	return (0);
+}
+
+static int
+migration_check_specs(int socket, enum migration_transfer_req req)
+{
+	struct migration_system_specs local_specs;
+	struct migration_system_specs remote_specs;
+	struct migration_system_specs transfer_specs;
+	struct migration_message_type msg;
+	enum migration_transfer_req rev_req;
+	size_t response;
+	int rc;
+
+	if ((req != MIGRATION_SEND_REQ) && (req != MIGRATION_RECV_REQ)) {
+		DPRINTF("Unknown option for migration req");
+		return (-1);
+	}
+
+	if (req == MIGRATION_SEND_REQ)
+		rev_req = MIGRATION_RECV_REQ;
+	else
+		rev_req = MIGRATION_SEND_REQ;
+
+	rc = get_system_specs_for_migration(&local_specs);
+	if (rc != 0) {
+		EPRINTF("Could not retrieve local specs");
+		return (rc);
+	}
+
+	if (req == MIGRATION_SEND_REQ) {
+		/* Send message type to server: specs & len */
+		msg.type = MESSAGE_TYPE_SPECS;
+		msg.len = sizeof(local_specs);
+	}
+
+	rc = migration_transfer_data(socket, &msg, sizeof(msg), req);
+	if (rc < 0) {
+		DPRINTF("Could not send message type");
+		return (-1);
+	}
+
+	if ((req == MIGRATION_RECV_REQ) && (msg.type != MESSAGE_TYPE_SPECS)) {
+		DPRINTF("Wrong message type received from remote");
+		return (-1);
+	}
+
+	/*
+	 * For the send req, we send the local specs and for the receive req
+	 * we receive the remote specs.
+	 */
+	if (req == MIGRATION_SEND_REQ)
+		transfer_specs = local_specs;
+
+	rc = migration_transfer_data(socket, &transfer_specs, sizeof(transfer_specs), req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer system specs");
+		return (-1);
+	}
+
+	if (req == MIGRATION_RECV_REQ) {
+		remote_specs = transfer_specs;
+
+		/* Check specs */
+		response = MIGRATION_SPECS_OK;
+		if ((strncmp(local_specs.hw_model, remote_specs.hw_model, MAX_SPEC_LEN) != 0)
+		    || (strncmp(local_specs.hw_machine, remote_specs.hw_machine, MAX_SPEC_LEN) != 0)
+		    || (local_specs.hw_pagesize != remote_specs.hw_pagesize)) {
+			EPRINTF("System specification mismatch");
+			DPRINTF("Local specs vs Remote Specs: \r\n"
+			    "\tmachine: %s vs %s\r\n"
+			    "\tmodel: %s vs %s\r\n"
+			    "\tpagesize: %zu vs %zu\r\n",
+			    local_specs.hw_machine,
+			    remote_specs.hw_machine,
+			    local_specs.hw_model,
+			    remote_specs.hw_model,
+			    local_specs.hw_pagesize,
+			    remote_specs.hw_pagesize);
+			response = MIGRATION_SPECS_NOT_OK;
+		}
+	}
+
+	/*
+	 * The source will receive the result of the checkup (i.e.
+	 * whether the migration is possible or the source and destination
+	 * are incompatible for migration) and the destination will send the
+	 * result of the checkup.
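+	 * The response is either MIGRATION_SPECS_OK or MIGRATION_SPECS_NOT_OK.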
+ */ + rc = migration_transfer_data(socket, &response, sizeof(response), rev_req); + if (rc < 0) { + DPRINTF("Could not transfer response from server"); + return (-1); + } + + if (response == MIGRATION_SPECS_NOT_OK) + return (-1); + + fprintf(stdout, "%s: System specification accepted\r\n", __func__); + + return (0); + +} + +static int +get_migration_host_and_type(const char *hostname, unsigned char *ipv4_addr, + unsigned char *ipv6_addr, int *type) +{ + struct addrinfo hints, *res; + void *addr; + int rc; + + memset(&hints, 0, sizeof(hints)); + + hints.ai_family = AF_UNSPEC; + + rc = getaddrinfo(hostname, NULL, &hints, &res); + + if (rc != 0) { + DPRINTF("Could not get address info"); + return (-1); + } + + *type = res->ai_family; + switch(res->ai_family) { + case AF_INET: + addr = &((struct sockaddr_in *) res->ai_addr)->sin_addr; + inet_ntop(res->ai_family, addr, ipv4_addr, MAX_IP_LEN); + break; + case AF_INET6: + addr = &((struct sockaddr_in6 *) res->ai_addr)->sin6_addr; + inet_ntop(res->ai_family, addr, ipv6_addr, MAX_IP_LEN); + break; + default: + DPRINTF("Unknown address family."); + return (-1); + } + + return (0); +} + +static int +migrate_check_memsize(size_t local_lowmem_size, size_t local_highmem_size, + size_t remote_lowmem_size, size_t remote_highmem_size) +{ + int ret; + + ret = MIGRATION_SPECS_OK; + + if (local_lowmem_size != remote_lowmem_size){ + ret = MIGRATION_SPECS_NOT_OK; + DPRINTF("Local and remote lowmem size mismatch"); + } + + if (local_highmem_size != remote_highmem_size){ + ret = MIGRATION_SPECS_NOT_OK; + DPRINTF("Local and remote highmem size mismatch"); + } + + return (ret); +} + +static int +migrate_recv_memory(struct vmctx *ctx, int socket) +{ + size_t local_lowmem_size, local_highmem_size; + size_t remote_lowmem_size, remote_highmem_size; + char *baseaddr; + int memsize_ok; + int rc; + + local_lowmem_size = local_highmem_size = 0; + remote_lowmem_size = remote_highmem_size = 0; + rc = 0; + + rc = vm_get_guestmem_from_ctx(ctx, + &baseaddr, &local_lowmem_size, + &local_highmem_size); + if (rc != 0) { + DPRINTF("Could not get guest lowmem size and highmem size"); + return (rc); + } + + rc = migration_transfer_data(socket, &remote_lowmem_size, sizeof(remote_lowmem_size), MIGRATION_RECV_REQ); + if (rc < 0) { + DPRINTF("Could not recv lowmem size"); + return (rc); + } + + rc = migration_transfer_data(socket, &remote_highmem_size, sizeof(remote_highmem_size), MIGRATION_RECV_REQ); + if (rc < 0) { + DPRINTF("Could not recv highmem size"); + return (rc); + } + + memsize_ok = migrate_check_memsize(local_lowmem_size, local_highmem_size, + remote_lowmem_size, remote_highmem_size); + + rc = migration_transfer_data(socket, + &memsize_ok, sizeof(memsize_ok), MIGRATION_SEND_REQ); + if (rc < 0) { + DPRINTF("Could not send migration_ok to remote"); + return (rc); + } + + if (memsize_ok != MIGRATION_SPECS_OK) { + DPRINTF("Memory size mismatch with remote host"); + return (-1); + } + + rc = migration_transfer_data(socket, baseaddr, local_lowmem_size, MIGRATION_RECV_REQ); + if (rc < 0) { + DPRINTF("Could not recv chunk lowmem."); + return (-1); + } + + if (local_highmem_size > 0){ + rc = migration_transfer_data(socket, baseaddr + 4 * GB, local_highmem_size, MIGRATION_RECV_REQ); + if (rc < 0) { + DPRINTF("Could not recv highmem"); + return (-1); + } + } + + return (0); +} + +static int +migrate_send_memory(struct vmctx *ctx, int socket) +{ + size_t lowmem_size, highmem_size; + char *mmap_vm_lowmem, *mmap_vm_highmem; + char *baseaddr; + int memsize_ok; + int rc; + + rc = 0; + 
mmap_vm_lowmem = MAP_FAILED;
+	mmap_vm_highmem = MAP_FAILED;
+
+	rc = vm_get_guestmem_from_ctx(ctx, &baseaddr,
+	    &lowmem_size, &highmem_size);
+	if (rc != 0) {
+		DPRINTF("Could not get guest lowmem size and highmem size");
+		return (rc);
+	}
+
+	/* Send the size of the lowmem segment */
+	rc = migration_transfer_data(socket, &lowmem_size, sizeof(lowmem_size), MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send lowmem size");
+		return (rc);
+	}
+
+	/* Send the size of the highmem segment */
+	rc = migration_transfer_data(socket, &highmem_size, sizeof(highmem_size), MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send highmem size");
+		return (rc);
+	}
+
+	/* Wait for answer - params ok (if memory size matches) */
+	rc = migration_transfer_data(socket, &memsize_ok, sizeof(memsize_ok), MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not receive response from remote");
+		return (rc);
+	}
+
+	if (memsize_ok != MIGRATION_SPECS_OK) {
+		DPRINTF("Memory size mismatch with remote host");
+		return (-1);
+	}
+
+	mmap_vm_lowmem = baseaddr;
+	mmap_vm_highmem = baseaddr + 4 * GB;
+
+	/* Send the lowmem segment */
+	rc = migration_transfer_data(socket, mmap_vm_lowmem, lowmem_size, MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send lowmem");
+		return (-1);
+	}
+
+	/* Send the highmem segment */
+	if (highmem_size > 0) {
+		rc = migration_transfer_data(socket, mmap_vm_highmem, highmem_size, MIGRATION_SEND_REQ);
+		if (rc < 0) {
+			DPRINTF("Could not send highmem");
+			return (-1);
+		}
+	}
+
+	return (0);
+}
+
+/**
+ * The source host saves the state of the kernel structure that will be
+ * migrated and sends the destination host a message that contains
+ * the type of data to be sent (MESSAGE_TYPE_KERN), the size of the
+ * structure to be received and the index that identifies the kernel
+ * structure on the destination host. Then, the source host transfers the
+ * state of the kernel structure over the network and the destination host
+ * restores it.
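+ * The same helper handles both directions, selected through the req
+ * argument (MIGRATION_SEND_REQ or MIGRATION_RECV_REQ).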
+ */ +static inline int +migrate_kern_struct(struct vmctx *ctx, int socket, char *buffer, + enum snapshot_req struct_req, enum migration_transfer_req req) +{ + int rc; + struct migration_message_type msg; + struct vm_snapshot_meta *meta; + + if ((req != MIGRATION_SEND_REQ) && (req != MIGRATION_RECV_REQ)) { + DPRINTF("Unknown request"); + return (-1); + } + + memset(&msg, 0, sizeof(msg)); + if (req == MIGRATION_SEND_REQ) { + msg.type = MESSAGE_TYPE_KERN; + + meta = ALLOCA_VM_SNAPSHOT_META(ctx, NULL, struct_req, buffer, SNAPSHOT_BUFFER_SIZE, VM_SNAPSHOT_SAVE); + memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); + meta->buffer.buf = meta->buffer.buf_start; + meta->buffer.buf_rem = meta->buffer.buf_size; + + rc = vm_snapshot_req(meta); + if (rc < 0) { + DPRINTF("Could not get struct with req %d", struct_req); + return (-1); + } + + msg.len = vm_get_snapshot_size(meta); + msg.req_type = struct_req; + + } + + rc = migration_transfer_data(socket, &msg, sizeof(msg), req); + if (rc < 0) { + DPRINTF("Could not transfer message type for kern struct %d", struct_req); + return (-1); + } + + if ((req == MIGRATION_RECV_REQ) && (msg.type != MESSAGE_TYPE_KERN)) { + DPRINTF("Receive wrong message type."); + return (-1); + } + + rc = migration_transfer_data(socket, buffer, msg.len, req); + if (rc < 0) { + DPRINTF("Could not transfer struct with req %d", struct_req); + return (-1); + } + + if (req == MIGRATION_RECV_REQ) { + meta = ALLOCA_VM_SNAPSHOT_META(ctx, NULL, msg.req_type, buffer, + msg.len, VM_SNAPSHOT_RESTORE); + meta->buffer.buf = meta->buffer.buf_start; + meta->buffer.buf_rem = meta->buffer.buf_size; + + rc = vm_snapshot_req(meta); + if (rc != 0) { + DPRINTF("Failed to restore struct %d", msg.req_type); + return (-1); + } + } + + return (0); +} + +static int +migrate_kern_data(struct vmctx *ctx, int socket, enum migration_transfer_req req) +{ + int i, rc, error; + int ndevs; + char *buffer; + const struct vm_snapshot_kern_info *snapshot_kern_structs; + + error = 0; + snapshot_kern_structs = get_snapshot_kern_structs(&ndevs); + + buffer = malloc(SNAPSHOT_BUFFER_SIZE); + if (buffer == NULL) { + EPRINTF("Could not allocate memory."); + return (-1); + } + + for (i = 0; i < ndevs; i++) { + if (req == MIGRATION_RECV_REQ) { + rc = migrate_kern_struct(ctx, socket, buffer, NO_KERN_STRUCT, MIGRATION_RECV_REQ); + if (rc < 0) { + DPRINTF("Could not restore struct %s", snapshot_kern_structs[i].struct_name); + error = -1; + break; + } + } else if (req == MIGRATION_SEND_REQ) { + rc = migrate_kern_struct(ctx, socket, buffer, + snapshot_kern_structs[i].req, MIGRATION_SEND_REQ); + if (rc < 0) { + DPRINTF("Could not send %s", snapshot_kern_structs[i].struct_name); + error = -1; + break; + } + } else { + DPRINTF("Unknown transfer request"); + error = -1; + break; + } + } + + free(buffer); + + return (error); +} + +static inline const struct vm_snapshot_dev_info * +find_entry_for_dev(const char *name) +{ + int i; + int ndevs; + const struct vm_snapshot_dev_info *snapshot_devs; + + snapshot_devs = get_snapshot_devs(&ndevs); + + for (i = 0; i < ndevs; i++) { + if (strncmp(name, snapshot_devs[i].dev_name, MAX_DEV_NAME_LEN) == 0) { + return (&snapshot_devs[i]); + } + } + + return NULL; +} + +static inline int +migrate_transfer_dev(struct vmctx *ctx, int socket, const char *dev, + char *buffer, size_t len, enum migration_transfer_req req) +{ + int rc; + size_t data_size; + struct migration_message_type msg; + struct vm_snapshot_meta *meta; + const struct vm_snapshot_dev_info *dev_info; + + if ((req != 
MIGRATION_SEND_REQ) && (req != MIGRATION_RECV_REQ)) { + DPRINTF("Unknown transfer request option"); + return (-1); + } + + memset(&msg, 0, sizeof(msg)); + memset(buffer, 0, len); + if (req == MIGRATION_SEND_REQ) { + dev_info = find_entry_for_dev(dev); + if (dev_info == NULL) { + EPRINTF("Could not find the device %s " + "or migration not implemented yet for it.", dev); + return (0); + } + + meta = ALLOCA_VM_SNAPSHOT_META(ctx, dev, 0, buffer, len, VM_SNAPSHOT_SAVE); + + memset(meta->buffer.buf_start, 0, meta->buffer.buf_size); + meta->buffer.buf = meta->buffer.buf_start; + meta->buffer.buf_rem = meta->buffer.buf_size; + + rc = (*dev_info->snapshot_cb)(meta); + if (rc < 0) { + DPRINTF("Could not get info about %s dev", dev); + return (-1); + } + + data_size = vm_get_snapshot_size(meta); + + msg.type = MESSAGE_TYPE_DEV; + msg.len = data_size; + strlcpy(msg.name, dev, MAX_DEV_NAME_LEN); + } + + rc = migration_transfer_data(socket, &msg, sizeof(msg), req); + if (rc < 0) { + DPRINTF("Could not transfer msg for %s dev", dev); + return (-1); + } + + if (req == MIGRATION_RECV_REQ) { + if (msg.type != MESSAGE_TYPE_DEV) { + DPRINTF("Wrong message type for device."); + return (-1); + } + + data_size = msg.len; + } + + if (data_size == 0) + return (0); // this type of device is not used + + + rc = migration_transfer_data(socket, buffer, data_size, req); + if (rc < 0) { + DPRINTF("Could not transfer %s dev", dev); + return (-1); + } + + if (req == MIGRATION_RECV_REQ) { + dev_info = find_entry_for_dev(msg.name); + if (dev_info == NULL) { + EPRINTF("Could not find the device %s " + "or migration not implemented yet for it.", msg.name); + return (0); + } + meta = ALLOCA_VM_SNAPSHOT_META(ctx, msg.name, 0, buffer, data_size, VM_SNAPSHOT_RESTORE); + meta->buffer.buf = meta->buffer.buf_start; + meta->buffer.buf_rem = meta->buffer.buf_size; + + rc = (*dev_info->snapshot_cb)(meta); + if (rc != 0) { + EPRINTF("Could not restore %s dev", msg.name); + return (-1); + } + } + + return (0); +} + +static int +migrate_devs(struct vmctx *ctx, int socket, enum migration_transfer_req req) +{ + int i, num_items; + int rc, error; + char *buffer; + const struct vm_snapshot_dev_info *snapshot_devs; + + error = 0; + buffer = malloc(SNAPSHOT_BUFFER_SIZE); + if (buffer == NULL) { + EPRINTF("Could not allocate memory"); + error = -1; + goto end; + } + + if (req == MIGRATION_SEND_REQ) { + /* + * Send to the destination the number of devices that will + * be migrated. 
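+		 * Each device is then sent in turn via migrate_transfer_dev().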
+ */ + snapshot_devs = get_snapshot_devs(&num_items); + + rc = migration_transfer_data(socket, &num_items, sizeof(num_items), req); + if (rc < 0) { + DPRINTF("Could not send num_items to destination"); + return (-1); + } + + for (i = 0; i < num_items; i++) { + rc = migrate_transfer_dev(ctx, socket, snapshot_devs[i].dev_name, + buffer, SNAPSHOT_BUFFER_SIZE, req); + + if (rc < 0) { + DPRINTF("Could not send %s", snapshot_devs[i].dev_name); + error = -1; + goto end; + } + } + } else if (req == MIGRATION_RECV_REQ) { + /* receive the number of devices that will be migrated */ + rc = migration_transfer_data(socket, &num_items, sizeof(num_items), MIGRATION_RECV_REQ); + if (rc < 0) { + DPRINTF("Could not recv num_items from source"); + return (-1); + } + + for (i = 0; i < num_items; i++) { + rc = migrate_transfer_dev(ctx, socket, NULL, buffer, SNAPSHOT_BUFFER_SIZE, req); + if (rc < 0) { + DPRINTF("Could not recv device"); + error = -1; + goto end; + } + } + } + +end: + if (buffer != NULL) + free(buffer); + + return (error); +} + + +#define MIGRATION_ROUNDS 4 + +static size_t +num_dirty_pages(char *page_list, size_t size) +{ + size_t num = 0; + size_t i; + + for (i = 0; i < size; i++) + if (page_list[i] == 1) + num++; + + return (num); +} + +static int +migration_fill_vmm_migration_pages_req(struct vmctx *ctx, + struct vmm_migration_pages_req *req, + char *page_list, + size_t size, + size_t *current_position) +{ + size_t i, count; + + count = 0; + for (i = *current_position; i < size; i++) { + if (count == VMM_PAGE_CHUNK) + break; + + if (page_list[i] == 1) { + req->pages[count].pindex = i; + count ++; + } + } + + *current_position = i; + req->pages_required = count; + req->req_type = VMM_GET_PAGES; + + return vm_copy_vmm_pages(ctx, req); +} + +static int +migrate_pages(struct vmctx *ctx, int socket, struct vmm_migration_pages_req *req, + char *page_list, size_t page_list_size, int already_locked, + enum migration_transfer_req migration_req) +{ + size_t dirty_pages; + size_t current_pos, i, count; + int rc; + + if ((migration_req != MIGRATION_SEND_REQ) && (migration_req != MIGRATION_RECV_REQ)) { + EPRINTF("wrong migration transfer req"); + return (-1); + } + + /* + * Transfer the state of the pages (dirty/not dirty) from the source + * host to the destination host. The pages that are dirty will be + * transferred in the next steps. 
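+	 * A value of 1 in page_list marks the corresponding guest page as
+	 * dirty.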
+	 */
+	rc = migration_transfer_data(socket, page_list, page_list_size, migration_req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer page_list remote");
+		return (-1);
+	}
+
+	dirty_pages = num_dirty_pages(page_list, page_list_size);
+
+	current_pos = 0;
+	while (1) {
+		if (current_pos >= page_list_size)
+			break;
+
+		for (i = 0; i < VMM_PAGE_CHUNK; i++)
+			req->pages[i].pindex = -1;
+
+		req->pages_required = 0;
+
+		/* Only the source host pauses the vcpus */
+		if (migration_req == MIGRATION_SEND_REQ) {
+			if (!already_locked)
+				vm_vcpu_pause(ctx);
+
+			rc = migration_fill_vmm_migration_pages_req(ctx, req, page_list,
+			    page_list_size,
+			    &current_pos);
+
+			if (!already_locked)
+				vm_vcpu_resume(ctx);
+
+			if (rc < 0) {
+				DPRINTF("Could not get pages");
+				return (-1);
+			}
+		} else {
+			count = 0;
+			for (i = current_pos; i < page_list_size; i++) {
+				if (count == VMM_PAGE_CHUNK)
+					break;
+
+				if (page_list[i] == 1) {
+					req->pages[count].pindex = i;
+					count++;
+				}
+			}
+
+			current_pos = i;
+			req->pages_required = count;
+		}
+
+		for (i = 0; i < req->pages_required; i++) {
+			rc = migration_transfer_data(socket, req->pages[i].page, PAGE_SIZE, migration_req);
+			if (rc < 0) {
+				DPRINTF("Could not transfer page %zu", req->pages[i].pindex);
+				return (-1);
+			}
+		}
+
+		if (migration_req == MIGRATION_RECV_REQ) {
+			req->req_type = VMM_SET_PAGES;
+
+			rc = vm_copy_vmm_pages(ctx, req);
+			if (rc < 0) {
+				EPRINTF("Could not copy pages into guest memory");
+				return (-1);
+			}
+		}
+	}
+
+	return (0);
+}
+
+static int
+search_dirty_pages(struct vmctx *ctx, char *page_list)
+{
+	size_t lowmem_pages, highmem_pages, pages;
+	int error;
+
+	if (page_list == NULL)
+		return (-1);
+
+	error = vm_get_pages_num(ctx, &lowmem_pages, &highmem_pages);
+	if (error != 0) {
+		DPRINTF("Error while trying to get page number");
+		return (-1);
+	}
+
+	pages = lowmem_pages + highmem_pages;
+	vm_get_dirty_page_list(ctx, page_list, pages);
+
+	return (0);
+}
+
+static inline void
+fill_page_list(char *page_list, size_t list_len, char c)
+{
+	size_t index;
+
+	if (page_list == NULL)
+		return;
+
+	for (index = 0; index < list_len; index++)
+		page_list[index] = c;
+}
+
+static int
+live_migrate_send(struct vmctx *ctx, int socket)
+{
+	int error, i, rc;
+	uint8_t rounds;
+	size_t memory_size, lowmem_size, highmem_size;
+	size_t migration_completed;
+	size_t lowmem_pages, highmem_pages, pages;
+	char *baseaddr, *page_list_indexes;
+	struct vmm_migration_pages_req memory_req;
+
+	error = 0;
+	memory_size = lowmem_size = highmem_size = 0;
+	page_list_indexes = NULL;
+	rounds = MIGRATION_ROUNDS;
+
+	/* Send the number of memory rounds to destination */
+	error = migration_transfer_data(socket, &rounds, sizeof(rounds), MIGRATION_SEND_REQ);
+	if (error != 0) {
+		DPRINTF("Could not send the number of rounds remote");
+		goto done;
+	}
+
+	/* Compute memory_size and pages */
+	vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem_size, &highmem_size);
+
+	memory_size = lowmem_size + highmem_size;
+	vm_get_pages_num(ctx, &lowmem_pages, &highmem_pages);
+	pages = lowmem_pages + highmem_pages;
+
+	/* alloc page_list_indexes */
+	page_list_indexes = malloc(pages * sizeof(char));
+	if (page_list_indexes == NULL) {
+		perror("Page list indexes could not be allocated");
+		error = -1;
+		goto done;
+	}
+
+	error = vm_init_vmm_migration_pages_req(ctx, &memory_req);
+	if (error < 0) {
+		DPRINTF("Could not initialize struct vmm_migration_pages_req");
+		goto done;
+	}
+
+	for (i = 0; i <= MIGRATION_ROUNDS; i++) {
+		if (i == MIGRATION_ROUNDS) { // Last Round
+			rc =
vm_pause_user_devs(ctx); + if (rc != 0) { + DPRINTF("Could not pause devices"); + error = rc; + goto done; + } + + vm_vcpu_pause(ctx); + } + + if (i == 0) { // First Round + fill_page_list(page_list_indexes, pages, 1); + } else { + DPRINTF("ROUND: %d", i); + fill_page_list(page_list_indexes, pages, 0); + + if (i != MIGRATION_ROUNDS) { + vm_vcpu_pause(ctx); + } + + /* Search the dirty pages and populate page_list_index */ + error = search_dirty_pages(ctx, page_list_indexes); + + if (error != 0) { + DPRINTF("Couldn't search for the dirty pages"); + goto unlock_vm_and_exit; + } + + if (i != MIGRATION_ROUNDS) { + vm_vcpu_resume(ctx); + } + } + + error = migrate_pages(ctx, socket, &memory_req, page_list_indexes, + pages, i == MIGRATION_ROUNDS ? 1 : 0, MIGRATION_SEND_REQ); + if (error != 0) { + DPRINTF("Couldn't send dirty pages to dest"); + goto done; + } + } + + // Send kern data + error = migrate_kern_data(ctx, socket, MIGRATION_SEND_REQ); + if (error != 0) { + DPRINTF("Could not send kern data to destination"); + goto unlock_vm_and_exit; + } + + // Send PCI data + error = migrate_devs(ctx, socket, MIGRATION_SEND_REQ); + if (error != 0) { + DPRINTF("Could not send pci devs to destination"); + goto unlock_vm_and_exit; + } + + // Wait for migration completed + error = migration_transfer_data(socket, &migration_completed, + sizeof(migration_completed), MIGRATION_RECV_REQ); + if ((error < 0) || (migration_completed != MIGRATION_SPECS_OK)) { + DPRINTF("Could not recv migration completed remote or received error"); + goto unlock_vm_and_exit; + } + + // Poweroff the vm + vm_vcpu_resume(ctx); + + vm_destroy(ctx); + exit(0); + +unlock_vm_and_exit: + vm_vcpu_resume(ctx); +done: + rc = vm_resume_user_devs(ctx); + if (rc != 0) + EPRINTF("Could not resume devices"); + if (page_list_indexes != NULL) + free(page_list_indexes); + return (error); +} + +static int +live_migrate_recv(struct vmctx *ctx, int socket) +{ + int error, index; + uint8_t rounds; + size_t memory_size, lowmem_size, highmem_size; + size_t lowmem_pages, highmem_pages, pages; + char *baseaddr, *page_list_indexes; + struct vmm_migration_pages_req memory_req; + + memory_size = lowmem_size = highmem_size = 0; + page_list_indexes = NULL; + + error = migration_transfer_data(socket, &rounds, sizeof(rounds), MIGRATION_RECV_REQ); + if (error != 0) { + DPRINTF("Could not recv the number of rounds from remote"); + goto done; + } + + /* Compute memory_size and pages*/ + vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem_size, &highmem_size); + + memory_size = lowmem_size + highmem_size; + vm_get_pages_num(ctx, &lowmem_pages, &highmem_pages); + pages = lowmem_pages + highmem_pages; + + /* alloc page_list_indexes */ + page_list_indexes = malloc(pages * sizeof(char)); + if (page_list_indexes == NULL) { + perror("Page list indexes could not be allocated"); + error = -1; + goto done; + } + + error = vm_init_vmm_migration_pages_req(ctx, &memory_req); + if (error < 0) { + DPRINTF("Could not initialize struct vmm_migration_pages_req"); + return (error); + } + + /* The following iteration contains the preliminary round in which the + * entire memory is migrated to the destination. Then, for + * MIGRATION_ROUNDS - 1 rounds, only the dirtied pages will be migrated. + * In the final round, the rest of the pages are migrated. + * Since the vcpus are not started, we don't need to lock them, so we + * can do the memory migration pretty straight-forward. 
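+	 * For this reason, migrate_pages() is called with already_locked set.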
+	 */
+	for (index = 0; index <= rounds; index++) {
+		fill_page_list(page_list_indexes, pages, 0);
+
+		error = migrate_pages(ctx, socket, &memory_req, page_list_indexes, pages, true, MIGRATION_RECV_REQ);
+		if (error != 0) {
+			DPRINTF("Couldn't recv dirty pages from source");
+			goto done;
+		}
+	}
+
+	error = 0;
+done:
+	if (page_list_indexes != NULL) {
+		free(page_list_indexes);
+	}
+	return (error);
+}
+
+static inline int
+migrate_connections(struct migrate_req req, int *socket_fd,
+    int *connection_socket_fd,
+    enum migration_transfer_req type)
+{
+	unsigned char ipv4_addr[MAX_IP_LEN];
+	unsigned char ipv6_addr[MAX_IP_LEN];
+	int addr_type;
+	int error;
+	int s, con_socket;
+	struct sockaddr_in sa, client_sa;
+	socklen_t client_len;
+	int rc;
+
+	rc = get_migration_host_and_type(req.host, ipv4_addr,
+	    ipv6_addr, &addr_type);
+
+	if (rc != 0) {
+		EPRINTF("Invalid address.");
+		DPRINTF("IP address used for migration: %s;\r\n"
+		    "Port used for migration: %d",
+		    req.host, req.port);
+		return (rc);
+	}
+
+	if (addr_type == AF_INET6) {
+		EPRINTF("IPv6 is not supported yet for migration. "
+		    "Please try again using an IPv4 address.");
+
+		DPRINTF("IP address used for migration: %s;\r\nPort used for migration: %d",
+		    ipv6_addr, req.port);
+		return (-1);
+	}
+
+	s = socket(AF_INET, SOCK_STREAM, 0);
+
+	if (s < 0) {
+		perror("Could not create socket");
+		return (-1);
+	}
+
+	bzero(&sa, sizeof(sa));
+
+	switch (type) {
+	case MIGRATION_SEND_REQ:
+		fprintf(stdout, "%s: Starting connection to %s on port %d...\r\n",
+		    __func__, ipv4_addr, req.port);
+
+		sa.sin_family = AF_INET;
+		sa.sin_port = htons(req.port);
+
+		rc = inet_pton(AF_INET, ipv4_addr, &sa.sin_addr);
+		if (rc <= 0) {
+			DPRINTF("Could not retrieve the IPv4 address");
+			error = -1;
+			goto done_close_s;
+		}
+
+		rc = connect(s, (struct sockaddr *)&sa, sizeof(sa));
+
+		if (rc < 0) {
+			perror("Could not connect to the remote host");
+			error = rc;
+			goto done_close_s;
+		}
+		*socket_fd = s;
+		break;
+	case MIGRATION_RECV_REQ:
+		fprintf(stdout, "%s: Waiting for connections from %s on port %d...\r\n",
+		    __func__, ipv4_addr, req.port);
+
+		sa.sin_family = AF_INET;
+		sa.sin_port = htons(req.port);
+		sa.sin_addr.s_addr = htonl(INADDR_ANY);
+
+		rc = bind(s, (struct sockaddr *)&sa, sizeof(sa));
+
+		if (rc < 0) {
+			perror("Could not bind");
+			error = rc;
+			goto done_close_s;
+		}
+
+		listen(s, 1);
+
+		client_len = sizeof(client_sa);
+		con_socket = accept(s, (struct sockaddr *)&client_sa, &client_len);
+		if (con_socket < 0) {
+			EPRINTF("Could not accept connection");
+			error = -1;
+			goto done_close_s;
+		}
+		*socket_fd = s;
+		*connection_socket_fd = con_socket;
+		break;
+	default:
+		EPRINTF("unknown operation request");
+		error = -1;
+		goto done_close_s;
+	}
+
+	error = 0;
+	goto done;
+
+done_close_s:
+	close(s);
+done:
+	return (error);
+}
+
+int
+vm_send_migrate_req(struct vmctx *ctx, struct migrate_req req, bool live)
+{
+	int s;
+	int rc, error, migration_type;
+	size_t migration_completed;
+
+	rc = migrate_connections(req, &s, NULL, MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		EPRINTF("Could not create connection");
+		return (-1);
+	}
+
+	rc = migration_check_specs(s, MIGRATION_SEND_REQ);
+
+	if (rc < 0) {
+		EPRINTF("Error while checking system requirements");
+		error = rc;
+		goto done;
+	}
+
+	migration_type = live;
+	rc = migration_transfer_data(s, &migration_type,
+	    sizeof(migration_type), MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send migration type");
+		error = -1;
+		goto done;
+	}
+
+	if (live) {
+		rc = live_migrate_send(ctx, s);
+		if (rc != 0) {
+			EPRINTF("Could not live migrate the guest's memory");
+			error
= rc; + } else { + error = 0; + } + goto done; + } // else continue the warm migration procedure + + vm_vcpu_pause(ctx); + + rc = vm_pause_user_devs(ctx); + if (rc != 0) { + EPRINTF("Could not pause devices"); + error = rc; + goto unlock_vm_and_exit; + } + + rc = migrate_send_memory(ctx, s); + if (rc != 0) { + EPRINTF("Could not send memory to destination"); + error = rc; + goto unlock_vm_and_exit; + } + + rc = migrate_kern_data(ctx, s, MIGRATION_SEND_REQ); + if (rc != 0) { + EPRINTF("Could not send kern data to destination"); + error = rc; + goto unlock_vm_and_exit; + } + + rc = migrate_devs(ctx, s, MIGRATION_SEND_REQ); + if (rc < 0) { + EPRINTF("Could not send pci devs to destination"); + error = rc; + goto unlock_vm_and_exit; + } + + rc = migration_transfer_data(s, &migration_completed, + sizeof(migration_completed), MIGRATION_RECV_REQ); + if ((rc < 0) || (migration_completed != MIGRATION_SPECS_OK)) { + EPRINTF("Could not recv migration completed remote or received error"); + error = -1; + goto unlock_vm_and_exit; + } + + vm_destroy(ctx); + exit(0); + +unlock_vm_and_exit: + vm_vcpu_resume(ctx); + + rc = vm_resume_user_devs(ctx); + if (rc != 0) + EPRINTF("Could not resume devices"); +done: + close(s); + return (error); +} + +int +vm_recv_migrate_req(struct vmctx *ctx, struct migrate_req req) +{ + int s, con_socket; + int rc; + int migration_type; + size_t migration_completed; + + rc = migrate_connections(req, &s, &con_socket, MIGRATION_RECV_REQ); + if (rc != 0) { + EPRINTF("Could not create connections"); + return (-1); + } + + rc = migration_check_specs(con_socket, MIGRATION_RECV_REQ); + if (rc < 0) { + EPRINTF("Error while checking specs"); + close(con_socket); + close(s); + return (rc); + } + + rc = migration_transfer_data(con_socket, &migration_type, + sizeof(migration_type), MIGRATION_RECV_REQ); + if (rc < 0) { + EPRINTF("Could not recv migration type"); + return (-1); + } + + /* For recv, the only difference between warm and live migration is the + * way in which the memory is migrated. + */ + if (migration_type) { + rc = live_migrate_recv(ctx, con_socket); + if (rc != 0) { + EPRINTF("Could not live migrate the guest's memory"); + close(con_socket); + close(s); + return (rc); + } + } else { + /* if not live migration, then migrate memory normally. 
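+		 * (lowmem and highmem are received in full, in a single pass)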
*/ + rc = migrate_recv_memory(ctx, con_socket); + if (rc < 0) { + EPRINTF("Could not recv lowmem and highmem"); + close(con_socket); + close(s); + return (-1); + } + } + + rc = migrate_kern_data(ctx, con_socket, MIGRATION_RECV_REQ); + if (rc < 0) { + EPRINTF("Could not recv kern data"); + close(con_socket); + close(s); + return (-1); + } + + rc = migrate_devs(ctx, con_socket, MIGRATION_RECV_REQ); + if (rc < 0) { + EPRINTF("Could not recv pci devs"); + close(con_socket); + close(s); + return (-1); + } + + fprintf(stdout, "%s: Migration completed\r\n", __func__); + migration_completed = MIGRATION_SPECS_OK; + rc = migration_transfer_data(con_socket, &migration_completed, + sizeof(migration_completed), MIGRATION_SEND_REQ); + if (rc < 0) { + EPRINTF("Could not send migration completed remote"); + close(con_socket); + close(s); + return (-1); + } + + close(con_socket); + close(s); + return (0); +} + diff --git a/usr.sbin/bhyve/snapshot.h b/usr.sbin/bhyve/snapshot.h --- a/usr.sbin/bhyve/snapshot.h +++ b/usr.sbin/bhyve/snapshot.h @@ -44,9 +44,13 @@ #define BHYVE_RUN_DIR "/var/run/bhyve/" #define MAX_SNAPSHOT_FILENAME PATH_MAX +#define MAX_HOSTNAME_LEN 255 +#define DEFAULT_MIGRATION_PORT 24983 struct vmctx; +#define SNAPSHOT_BUFFER_SIZE (20 * MB) + struct restore_state { int kdata_fd; int vmmem_fd; @@ -60,15 +64,23 @@ ucl_object_t *meta_root_obj; }; +struct __attribute__((packed)) migrate_req { + char host[MAX_HOSTNAME_LEN]; + unsigned int port; +}; + /* Filename that will be used for save/restore */ struct checkpoint_op { char snapshot_filename[MAX_SNAPSHOT_FILENAME]; + struct migrate_req migrate_req; }; /* Messages that a bhyve process understands. */ enum ipc_opcode { START_CHECKPOINT, START_SUSPEND, + START_MIGRATE, + START_MIGRATE_LIVE, }; /* @@ -88,8 +100,10 @@ struct checkpoint_thread_info { struct vmctx *ctx; int socket_fd; + struct sockaddr_un *addr; }; +const char **get_pci_devs(int *); typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *); typedef int (*vm_pause_dev_cb) (struct vmctx *, const char *); typedef int (*vm_resume_dev_cb) (struct vmctx *, const char *); @@ -106,6 +120,9 @@ enum snapshot_req req; /* request type */ }; +const struct vm_snapshot_dev_info *get_snapshot_devs(int *ndevs); +const struct vm_snapshot_kern_info *get_snapshot_kern_structs(int *ndevs); + void destroy_restore_state(struct restore_state *rstate); const char *lookup_vmname(struct restore_state *rstate); @@ -116,6 +133,8 @@ void checkpoint_cpu_add(int vcpu); void checkpoint_cpu_resume(int vcpu); void checkpoint_cpu_suspend(int vcpu); +void vm_vcpu_pause(struct vmctx *ctx); +void vm_vcpu_resume(struct vmctx *ctx); int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate); int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate); diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c --- a/usr.sbin/bhyve/snapshot.c +++ b/usr.sbin/bhyve/snapshot.c @@ -31,6 +31,8 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
  */
 
 #include
@@ -85,6 +87,7 @@
 #include "ioapic.h"
 #include "mem.h"
 #include "mevent.h"
+#include "migration.h"
 #include "mptbl.h"
 #include "pci_emul.h"
 #include "pci_irq.h"
@@ -166,6 +169,24 @@
 	{ "vrtc", STRUCT_VRTC },
 };
 
+const struct vm_snapshot_dev_info *
+get_snapshot_devs(int *ndevs)
+{
+	if (ndevs != NULL)
+		*ndevs = nitems(snapshot_devs);
+
+	return (snapshot_devs);
+}
+
+const struct vm_snapshot_kern_info *
+get_snapshot_kern_structs(int *ndevs)
+{
+	if (ndevs != NULL)
+		*ndevs = nitems(snapshot_kern_structs);
+
+	return (snapshot_kern_structs);
+}
+
 static cpuset_t vcpus_active, vcpus_suspended;
 static pthread_mutex_t vcpu_lock;
 static pthread_cond_t vcpus_idle, vcpus_can_run;
@@ -1301,7 +1322,7 @@
 	pthread_mutex_unlock(&vcpu_lock);
 }
 
-static void
+void
 vm_vcpu_pause(struct vmctx *ctx)
 {
 
@@ -1313,7 +1334,7 @@
 	pthread_mutex_unlock(&vcpu_lock);
 }
 
-static void
+void
 vm_vcpu_resume(struct vmctx *ctx)
 {
 
@@ -1444,6 +1465,8 @@ handle_message(struct ipc_message *imsg, struct vmctx *ctx)
 {
 	int err;
+	struct migrate_req req;
+	int memflags;
 
 	switch (imsg->code) {
 	case START_CHECKPOINT:
@@ -1451,6 +1474,51 @@
 		break;
 	case START_SUSPEND:
 		err = vm_checkpoint(ctx, imsg->data.op.snapshot_filename, true);
+		break;
+	case START_MIGRATE:
+		fprintf(stdout, "Starting the warm migration procedure\r\n");
+		memset(&req, 0, sizeof(struct migrate_req));
+		req.port = imsg->data.op.migrate_req.port;
+		memcpy(req.host, imsg->data.op.migrate_req.host, MAX_HOSTNAME_LEN);
+		req.host[MAX_HOSTNAME_LEN - 1] = 0;
+		fprintf(stderr, "%s: IP address used for migration: %s;\r\n"
+		    "Port used for migration: %u\r\n",
+		    __func__,
+		    req.host,
+		    req.port);
+
+		err = vm_send_migrate_req(ctx, req, false);
+		break;
+	case START_MIGRATE_LIVE:
+		fprintf(stdout, "Starting the live migration procedure\r\n");
+
+		/*
+		 * Currently, live migration is implemented only for guests
+		 * started with wired memory (the -S option). If the
+		 * VM_MEM_F_WIRED memflag is not set, the live migration
+		 * procedure cannot be performed.
+		 */
+		memflags = vm_get_memflags(ctx);
+		if (!(memflags & VM_MEM_F_WIRED)) {
+			fprintf(stderr, "%s: Migration not supported for "
+			    "non-wired guests\r\n", __func__);
+			err = -1;
+			break;
+		}
+
+		memset(&req, 0, sizeof(struct migrate_req));
+		req.port = imsg->data.op.migrate_req.port;
+		memcpy(req.host, imsg->data.op.migrate_req.host, MAX_HOSTNAME_LEN);
+		req.host[MAX_HOSTNAME_LEN - 1] = 0;
+		fprintf(stderr, "%s: IP address used for migration: %s;\r\n"
+		    "Port used for migration: %u\r\n",
+		    __func__,
+		    req.host,
+		    req.port);
+
+		err = vm_send_migrate_req(ctx, req, true);
+		break;
 	default:
 		EPRINTLN("Unrecognized checkpoint operation\n");
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
--- a/usr.sbin/bhyvectl/bhyvectl.c
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -90,6 +90,8 @@
 #ifdef BHYVE_SNAPSHOT
 	    " [--checkpoint=<filename>]\n"
 	    " [--suspend=<filename>]\n"
+	    " [--migrate=<host>,<port>]\n"
+	    " [--migrate-live=<host>]\n"
 #endif
 	    " [--get-all]\n"
 	    " [--get-stats]\n"
@@ -303,6 +305,8 @@
 #ifdef BHYVE_SNAPSHOT
 static int vm_checkpoint_opt;
 static int vm_suspend_opt;
+static int vm_migrate;
+static int vm_migrate_live;
 #endif
 
 /*
@@ -594,6 +598,8 @@
 #ifdef BHYVE_SNAPSHOT
 	SET_CHECKPOINT_FILE,
 	SET_SUSPEND_FILE,
+	MIGRATE_VM,
+	MIGRATE_VM_LIVE,
 #endif
 };
 
@@ -1466,6 +1472,8 @@
 #ifdef BHYVE_SNAPSHOT
 	{ "checkpoint", REQ_ARG, 0, SET_CHECKPOINT_FILE},
 	{ "suspend", REQ_ARG, 0, SET_SUSPEND_FILE},
+	{ "migrate", REQ_ARG, 0, MIGRATE_VM},
+	{ "migrate-live", REQ_ARG, 0, MIGRATE_VM_LIVE},
 #endif
 };
 
@@ -1736,6 +1744,49 @@
 	return (send_message(ctx, (void *)&imsg, length));
 }
 
+static int
+send_start_migrate(struct vmctx *ctx, const char *migrate_vm, bool live)
+{
+	struct ipc_message imsg;
+	char *hostname, *pos;
+	size_t length;
+	int rc;
+
+	if (live)
+		imsg.code = START_MIGRATE_LIVE;
+	else
+		imsg.code = START_MIGRATE;
+
+	memset(imsg.data.op.migrate_req.host, 0, MAX_HOSTNAME_LEN);
+
+	hostname = strdup(migrate_vm);
+	if (hostname == NULL) {
+		fprintf(stderr, "Could not allocate memory for hostname\r\n");
+		return (-1);
+	}
+
+	if ((pos = strchr(hostname, ',')) != NULL) {
+		*pos = '\0';
+		strlcpy(imsg.data.op.migrate_req.host, hostname, MAX_HOSTNAME_LEN);
+		pos = pos + 1;
+
+		/* The port field is unsigned, so parse it with %u. */
+		rc = sscanf(pos, "%u", &(imsg.data.op.migrate_req.port));
+		if (rc != 1) {
+			fprintf(stderr, "Could not parse the port\r\n");
+			free(hostname);
+			return (-1);
+		}
+	} else {
+		strlcpy(imsg.data.op.migrate_req.host, hostname, MAX_HOSTNAME_LEN);
+
+		/* No port was specified, so fall back to the default one. */
+		imsg.data.op.migrate_req.port = DEFAULT_MIGRATION_PORT;
+	}
+
+	free(hostname);
+
+	length = offsetof(struct ipc_message, data) + sizeof(imsg.data.op);
+
+	return (send_message(ctx, (void *)&imsg, length));
+}
 #endif
 
 int
@@ -1755,7 +1806,7 @@
 	struct tm tm;
 	struct option *opts;
 #ifdef BHYVE_SNAPSHOT
-	char *checkpoint_file, *suspend_file;
+	char *checkpoint_file, *suspend_file, *migrate_host;
 #endif
 
 	cpu_intel = cpu_vendor_intel();
@@ -1924,6 +1975,14 @@
 			vm_suspend_opt = 1;
 			suspend_file = optarg;
 			break;
+		case MIGRATE_VM:
+			vm_migrate = 1;
+			migrate_host = optarg;
+			break;
+		case MIGRATE_VM_LIVE:
+			vm_migrate_live = 1;
+			migrate_host = optarg;
+			break;
 #endif
 		default:
 			usage(cpu_intel);
@@ -2402,6 +2461,12 @@
 
 	if (!error && vm_suspend_opt)
 		error = snapshot_request(ctx, suspend_file, START_SUSPEND);
+
+	if (!error && vm_migrate)
+		error = send_start_migrate(ctx, migrate_host, false);
+
+	if (!error && vm_migrate_live)
+		error = send_start_migrate(ctx, migrate_host, true);
 #endif
 
 	free (opts);
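
Usage sketch (illustrative, not part of the patch): assuming a running guest named "vm1" and a destination host at 10.0.0.2, the bhyvectl options added above would be exercised as follows; the destination-side bhyve invocation that ends up calling vm_recv_migrate_req() is outside these hunks.

	# warm migration, explicit host and port
	bhyvectl --vm=vm1 --migrate=10.0.0.2,24983

	# warm migration, falling back to DEFAULT_MIGRATION_PORT (24983)
	bhyvectl --vm=vm1 --migrate=10.0.0.2

	# live migration; the guest must have been started with wired
	# memory (bhyve -S), otherwise handle_message() rejects the request
	bhyvectl --vm=vm1 --migrate-live=10.0.0.2

Both options are parsed by send_start_migrate(), which packs the host and port into a struct migrate_req and delivers it over the bhyve IPC socket as a START_MIGRATE or START_MIGRATE_LIVE message.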