Page MenuHomeFreeBSD

D30954.diff
No OneTemporary

D30954.diff

diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -262,4 +262,16 @@
int vm_snapshot_req(struct vm_snapshot_meta *meta);
int vm_restore_time(struct vmctx *ctx);
+int vm_get_pages_num(struct vmctx *ctx, size_t *lowmem_pages,
+ size_t *highmem_pages);
+int vm_set_vmm_migration_segments(struct vmctx *ctx,
+ struct vmm_migration_segment *lowmem,
+ struct vmm_migration_segment *highmem);
+int vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num);
+
+int vm_copy_vmm_pages(struct vmctx *ctx,
+ struct vmm_migration_pages_req *pages_req);
+int vm_init_vmm_migration_pages_req(struct vmctx *ctx,
+ struct vmm_migration_pages_req *req);
+
#endif /* _VMMAPI_H_ */
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -1630,6 +1630,116 @@
return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
+/*
+ * Report the sizes of the guest's lowmem and highmem regions in pages.
+ *
+ * Either output pointer may be NULL, in which case that count is skipped.
+ * Returns 0 on success and -1 when no context was supplied.
+ */
+int
+vm_get_pages_num(struct vmctx *ctx, size_t *lowmem_pages, size_t *highmem_pages)
+{
+	if (ctx != NULL) {
+		if (lowmem_pages != NULL)
+			*lowmem_pages = ctx->lowmem / PAGE_SIZE;
+		if (highmem_pages != NULL)
+			*highmem_pages = ctx->highmem / PAGE_SIZE;
+		return (0);
+	}
+
+	/* A VM context is mandatory. */
+	return (-1);
+}
+
+/*
+ * Describe the guest memory layout as migration segments.
+ *
+ * lowmem is reported as [0, ctx->lowmem).  highmem is reported as
+ * [4GB, 4GB + ctx->highmem) when the guest has high memory; otherwise it is
+ * cleared to the empty segment [0, 0) so that callers which did not zero
+ * their request never pass indeterminate values on to the kernel.
+ * Either output pointer may be NULL, in which case it is skipped.
+ *
+ * Returns 0 on success and -1 when ctx is NULL.
+ */
+int
+vm_set_vmm_migration_segments(struct vmctx *ctx,
+    struct vmm_migration_segment *lowmem,
+    struct vmm_migration_segment *highmem)
+{
+
+	if (ctx == NULL)
+		return (-1);
+
+	if (lowmem != NULL) {
+		lowmem->start = 0;
+		lowmem->end = ctx->lowmem;
+	}
+
+	if (highmem != NULL) {
+		if (ctx->highmem != 0) {
+			highmem->start = 4 * GB;
+			highmem->end = 4 * GB + ctx->highmem;
+		} else {
+			/* No high memory: hand back a well-defined empty range. */
+			highmem->start = 0;
+			highmem->end = 0;
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Ask the kernel which guest pages are dirty.
+ *
+ * page_list is a caller-owned buffer of num_pages bytes; it is handed to the
+ * VM_GET_DIRTY_PAGE_LIST ioctl together with the guest's lowmem/highmem
+ * segment descriptions.
+ *
+ * Returns the ioctl result, or -1 when an argument is NULL or the segment
+ * description could not be built.
+ */
+int
+vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num_pages)
+{
+	struct vm_get_dirty_page_list list;
+	int error;
+
+	if (ctx == NULL || page_list == NULL)
+		return (-1);
+
+	bzero(&list, sizeof(list));
+	list.page_list = (uint8_t *)page_list;
+	list.num_pages = num_pages;
+
+	/* Do not issue the ioctl with a half-built request. */
+	error = vm_set_vmm_migration_segments(ctx, &list.lowmem,
+	    &list.highmem);
+	if (error != 0)
+		return (error);
+
+	return (ioctl(ctx->fd, VM_GET_DIRTY_PAGE_LIST, &list));
+}
+
+/*
+ * Transfer up to VMM_PAGE_CHUNK guest pages through the VM_COPY_VMM_PAGES
+ * ioctl.
+ *
+ * Every page slot that will be used must already point at a buffer.  For
+ * VMM_GET_PAGES requests each buffer is zeroed before the kernel is asked to
+ * fill it.
+ *
+ * Returns the ioctl result, -1 when ctx, pages_req or a required page
+ * buffer is NULL, and E2BIG when more than VMM_PAGE_CHUNK pages are
+ * requested.
+ */
+int
+vm_copy_vmm_pages(struct vmctx *ctx, struct vmm_migration_pages_req *pages_req)
+{
+	size_t index;
+
+	if (ctx == NULL || pages_req == NULL)
+		return (-1);
+
+	if (pages_req->pages_required > VMM_PAGE_CHUNK)
+		return (E2BIG);
+
+	for (index = 0; index < pages_req->pages_required; index++) {
+		if (pages_req->pages[index].page == NULL)
+			return (-1);
+
+		if (pages_req->req_type == VMM_GET_PAGES)
+			memset(pages_req->pages[index].page, 0, PAGE_SIZE);
+	}
+
+	return (ioctl(ctx->fd, VM_COPY_VMM_PAGES, pages_req));
+}
+
+/*
+ * Prepare a page-transfer request: fill in the guest's memory segments and
+ * allocate one zeroed PAGE_SIZE buffer for each of the VMM_PAGE_CHUNK slots.
+ *
+ * On success the caller owns the buffers in req->pages[].page and must free
+ * them.  On failure nothing stays allocated and every slot is left NULL.
+ *
+ * Returns 0 on success and -1 on a NULL argument or allocation failure.
+ */
+int
+vm_init_vmm_migration_pages_req(struct vmctx *ctx,
+    struct vmm_migration_pages_req *req)
+{
+	size_t index;
+
+	if (ctx == NULL || req == NULL)
+		return (-1);
+
+	/*
+	 * The request may live in uninitialized memory; clear every slot
+	 * so that the error path never frees an indeterminate pointer.
+	 */
+	for (index = 0; index < VMM_PAGE_CHUNK; index++)
+		req->pages[index].page = NULL;
+
+	if (vm_set_vmm_migration_segments(ctx, &req->lowmem_segment,
+	    &req->highmem_segment) != 0)
+		return (-1);
+
+	for (index = 0; index < VMM_PAGE_CHUNK; index++) {
+		req->pages[index].page = calloc(1, PAGE_SIZE);
+		if (req->pages[index].page == NULL)
+			goto deallocate_error;
+	}
+
+	return (0);
+
+deallocate_error:
+	/* Only the slots below the failing index hold allocations. */
+	while (index > 0) {
+		index--;
+		free(req->pages[index].page);
+		req->pages[index].page = NULL;
+	}
+
+	return (-1);
+}
+
int
vm_set_topology(struct vmctx *ctx,
uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -35,6 +35,8 @@
#include <x86/segments.h>
struct vm_snapshot_meta;
+struct vm_get_dirty_page_list;
+struct vmm_migration_pages_req;
#ifdef _KERNEL
SDT_PROVIDER_DECLARE(vmm);
@@ -287,6 +289,8 @@
void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip);
int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta);
int vm_restore_time(struct vm *vm);
+int vm_get_dirty_page_list(struct vm *vm, struct vm_get_dirty_page_list *list);
+int vm_copy_vmm_pages(struct vm *vm, struct vmm_migration_pages_req *pages_req);
#ifdef _SYS__CPUSET_H_
/*
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -31,6 +31,8 @@
#ifndef _VMM_DEV_H_
#define _VMM_DEV_H_
+#include "vmm_migration.h"
+
struct vm_snapshot_meta;
#ifdef _KERNEL
@@ -256,6 +258,13 @@
};
_Static_assert(sizeof(struct vm_readwrite_kernemu_device) == 24, "ABI");
+/*
+ * Argument of the VM_GET_DIRTY_PAGE_LIST ioctl.
+ *
+ * NOTE(review): unlike the neighbouring ioctl structures there is no
+ * _Static_assert pinning the size of this one - confirm whether one is
+ * wanted.
+ */
+struct vm_get_dirty_page_list {
+ uint8_t *page_list; /* userspace buffer, one byte per guest page */
+ size_t num_pages; /* number of entries in page_list */
+ struct vmm_migration_segment lowmem; /* guest lowmem range */
+ struct vmm_migration_segment highmem; /* guest highmem range */
+};
+
enum {
/* general routines */
IOCNUM_ABIVERS = 0,
@@ -344,7 +353,9 @@
/* checkpoint */
IOCNUM_SNAPSHOT_REQ = 113,
- IOCNUM_RESTORE_TIME = 115
+ IOCNUM_RESTORE_TIME = 115,
+ IOCNUM_VM_GET_DIRTY_PAGE_LIST = 117,
+ IOCNUM_VM_COPY_VMM_PAGES = 118,
};
#define VM_RUN \
@@ -475,4 +486,8 @@
_IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta)
#define VM_RESTORE_TIME \
_IOWR('v', IOCNUM_RESTORE_TIME, int)
+#define VM_GET_DIRTY_PAGE_LIST \
+ _IOWR('v', IOCNUM_VM_GET_DIRTY_PAGE_LIST, struct vm_get_dirty_page_list)
+#define VM_COPY_VMM_PAGES \
+ _IOWR('v', IOCNUM_VM_COPY_VMM_PAGES, struct vmm_migration_pages_req)
#endif
diff --git a/sys/amd64/include/vmm_migration.h b/sys/amd64/include/vmm_migration.h
new file mode 100644
--- /dev/null
+++ b/sys/amd64/include/vmm_migration.h
@@ -0,0 +1,66 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+
+#ifndef _VMM_MIGRATION_H_
+#define _VMM_MIGRATION_H_
+
+#define VMM_PAGE_CHUNK 10
+
+/* Direction of a page transfer requested through vmm_migration_pages_req. */
+enum migration_req_type {
+ VMM_GET_PAGES = 0, /* copy guest pages out to the caller's buffers */
+ VMM_SET_PAGES = 1, /* copy the caller's buffers into guest pages */
+};
+
+/* One page of a transfer request: which page, and the buffer holding it. */
+struct vmm_migration_page {
+ vm_pindex_t pindex; /* index of the guest page to transfer */
+ uint8_t *page; /* userspace buffer of PAGE_SIZE bytes */
+};
+
+/*
+ * A bhyve guest has two memory segments:
+ * - lowmem segment: mapped from 0GB to 3GB (which is lowmem_limit)
+ * - highmem segment: mapped starting from 4GB
+ * The object that represents a segment is identified by its start and end
+ * values.
+ */
+struct vmm_migration_segment {
+ vm_offset_t start; /* first guest address of the segment */
+ vm_offset_t end; /* guest address just past the segment */
+};
+
+/*
+ * Argument of the VM_COPY_VMM_PAGES ioctl: a batch of at most
+ * VMM_PAGE_CHUNK pages to read from or write into the guest.
+ */
+struct vmm_migration_pages_req {
+ size_t pages_required; /* number of valid entries in pages[] */
+ enum migration_req_type req_type; /* VMM_GET_PAGES or VMM_SET_PAGES */
+ struct vmm_migration_segment lowmem_segment; /* guest lowmem range */
+ struct vmm_migration_segment highmem_segment; /* guest highmem range */
+ struct vmm_migration_page pages[VMM_PAGE_CHUNK]; /* pages to transfer */
+};
+
+#endif
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -65,6 +65,7 @@
#include <machine/pcb.h>
#include <machine/smp.h>
#include <machine/md_var.h>
+#include <machine/vmparam.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <x86/ifunc.h>
@@ -73,6 +74,7 @@
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_snapshot.h>
+#include <machine/vmm_migration.h>
#include "vmm_ioport.h"
#include "vmm_ktr.h"
@@ -146,6 +148,9 @@
};
#define VM_MAX_MEMMAPS 8
+#define MB (1024UL * 1024)
+#define GB (1024UL * MB)
+
/*
* Initialization:
* (o) initialized the first time the VM is created
@@ -2949,4 +2954,191 @@
return (0);
}
-#endif
+
+/*
+ * Walk the pages of object whose indices lie in
+ * [start / PAGE_SIZE, end / PAGE_SIZE) and, for every resident page, copy
+ * its dirty state (0 or 1) out to the userspace byte at
+ * page_list[pindex - offset].  Indices with no resident page are skipped
+ * and their byte in page_list is left untouched.
+ *
+ * NOTE(review): the return value of copyout() is ignored, so a bad
+ * userspace buffer goes unreported.
+ * NOTE(review): the object lock is dropped before the page is examined;
+ * confirm that the page cannot be freed in between.
+ * NOTE(review): nothing here bounds the write by the caller's num_pages.
+ */
+static inline void
+vm_search_dirty_pages_in_object(vm_object_t object, size_t start, size_t end,
+ size_t offset, uint8_t *page_list)
+{
+ vm_pindex_t pindex;
+ vm_page_t m;
+ uint8_t result;
+
+ for (pindex = start / PAGE_SIZE; pindex < end / PAGE_SIZE; pindex ++) {
+ /* The lock is only held across the lookup itself. */
+ VM_OBJECT_WLOCK(object);
+ m = vm_page_lookup(object, pindex);
+ VM_OBJECT_WUNLOCK(object);
+ if (m != NULL) {
+ result = vm_page_test_vmm_dirty(m);
+ /* page_list is a userspace pointer, hence copyout(). */
+ copyout(&result, page_list + pindex - offset, sizeof(result));
+ }
+ }
+
+}
+
+/*
+ * Fill the caller's page list with the dirty state of the guest's pages.
+ *
+ * The guest vmspace map is searched for the entries whose bounds equal the
+ * lowmem and highmem segments supplied in list; the pages of each matching
+ * entry's object are reported through list->page_list.  Highmem entries are
+ * stored directly after the lowmem ones, i.e. shifted down by the size of
+ * the hole between the two segments.
+ *
+ * Returns 0 on success and EINVAL when no page list was supplied or the VM
+ * has no vmspace.  A real errno value is returned rather than -1 because
+ * -1 is ERESTART in the kernel and would make the ioctl restart forever.
+ *
+ * NOTE(review): list->num_pages is not used to bound the writes; confirm
+ * whether the callers' buffer sizing should be enforced here.
+ */
+int
+vm_get_dirty_page_list(struct vm *vm, struct vm_get_dirty_page_list *list)
+{
+	struct vmspace *vm_vmspace;
+	struct vm_map *vmmap;
+	struct vm_map_entry *entry;
+	struct vm_object *object;
+	uint8_t *page_list;
+	size_t offset;
+
+	page_list = list->page_list;
+	if (page_list == NULL)
+		return (EINVAL);
+
+	vm_vmspace = vm->vmspace;
+	if (vm_vmspace == NULL) {
+		printf("%s: vm_vmspace is null\r\n", __func__);
+		return (EINVAL);
+	}
+
+	vmmap = &vm_vmspace->vm_map;
+
+	vm_map_lock(vmmap);
+	if (vmmap->busy)
+		vm_map_wait_busy(vmmap);
+
+	/*
+	 * Use the map iterator: following entry->right by hand does not
+	 * visit every entry of the map.
+	 */
+	VM_MAP_ENTRY_FOREACH(entry, vmmap) {
+		object = entry->object.vm_object;
+		if (object == NULL)
+			continue;
+
+		if (entry->start == list->lowmem.start &&
+		    entry->end == list->lowmem.end) {
+			/* lowmem pages map 1:1 onto the list */
+			vm_search_dirty_pages_in_object(object,
+			    list->lowmem.start, list->lowmem.end,
+			    0, page_list);
+		}
+
+		if (entry->start == list->highmem.start &&
+		    entry->end == list->highmem.end) {
+			/* highmem pages follow the lowmem ones in the list */
+			offset = (list->highmem.start - list->lowmem.end) /
+			    PAGE_SIZE;
+			vm_search_dirty_pages_in_object(object,
+			    list->highmem.start, list->highmem.end,
+			    offset, page_list);
+		}
+	}
+
+	vm_map_unlock(vmmap);
+
+	return (0);
+}
+
+/*
+ * Copy the pages named in page_req between the guest memory objects and
+ * the caller's buffers, in the direction given by page_req->req_type.
+ *
+ * Page indices below 3GB / PAGE_SIZE are looked up in lowmem_object; higher
+ * indices are looked up in highmem_object after adding 1GB / PAGE_SIZE.
+ * The function stops early when a needed object is missing, when the
+ * request type is unknown, or when the request names more pages than the
+ * pages[] array can hold.
+ *
+ * NOTE(review): the results of vm_object_get_page()/vm_object_set_page()
+ * are discarded because this helper returns void.
+ */
+static inline void
+vm_copy_object_pages(vm_object_t lowmem_object, vm_object_t highmem_object,
+    struct vmm_migration_pages_req *page_req)
+{
+	struct vmm_migration_page *migration_page;
+	vm_object_t object;
+	vm_pindex_t pindex;
+	size_t page_idx, limit_page, pindex_offset;
+	enum migration_req_type req_type;
+
+	if (lowmem_object == NULL) {
+		printf("%s: lowmem_object is NULL\r\n", __func__);
+		return;
+	}
+
+	/*
+	 * pages_required comes from userspace; never index past the
+	 * fixed-size pages[] array.
+	 */
+	if (page_req->pages_required > VMM_PAGE_CHUNK)
+		return;
+
+	req_type = page_req->req_type;
+	if (req_type != VMM_GET_PAGES && req_type != VMM_SET_PAGES)
+		return;
+
+	limit_page = 3UL * GB / PAGE_SIZE;
+	for (page_idx = 0; page_idx < page_req->pages_required; page_idx++) {
+		migration_page = &page_req->pages[page_idx];
+		pindex = migration_page->pindex;
+
+		if (pindex >= limit_page) {
+			if (highmem_object == NULL) {
+				printf("%s: highmem_object is NULL\r\n",
+				    __func__);
+				return;
+			}
+			object = highmem_object;
+			pindex_offset = 1UL * GB / PAGE_SIZE;
+		} else {
+			object = lowmem_object;
+			pindex_offset = 0;
+		}
+
+		VM_OBJECT_WLOCK(object);
+		if (req_type == VMM_GET_PAGES)
+			vm_object_get_page(object, pindex + pindex_offset,
+			    migration_page->page);
+		else
+			vm_object_set_page(object, pindex + pindex_offset,
+			    migration_page->page);
+		VM_OBJECT_WUNLOCK(object);
+	}
+}
+
+/*
+ * Handle a VM_COPY_VMM_PAGES request: locate the guest's lowmem and highmem
+ * objects in the VM's vmspace map and copy the requested pages to or from
+ * the caller's buffers.
+ *
+ * Returns 0 on success, E2BIG when more than VMM_PAGE_CHUNK pages are
+ * requested, and EINVAL when the VM has no vmspace or no map entry matches
+ * the lowmem segment.  Real errno values are returned rather than -1
+ * because -1 is ERESTART in the kernel.
+ */
+int
+vm_copy_vmm_pages(struct vm *vm, struct vmm_migration_pages_req *pages_req)
+{
+	struct vmspace *vm_vmspace;
+	struct vm_map *vmmap;
+	struct vm_map_entry *entry;
+	struct vm_object *lowmem_object, *highmem_object, *object;
+	struct vmm_migration_segment lowmem_segment, highmem_segment;
+	int error;
+
+	/* The page count is supplied by userspace; bound it here. */
+	if (pages_req->pages_required > VMM_PAGE_CHUNK)
+		return (E2BIG);
+
+	lowmem_segment = pages_req->lowmem_segment;
+	highmem_segment = pages_req->highmem_segment;
+
+	vm_vmspace = vm->vmspace;
+	if (vm_vmspace == NULL) {
+		printf("%s: vm_vmspace is null\r\n", __func__);
+		return (EINVAL);
+	}
+
+	vmmap = &vm_vmspace->vm_map;
+
+	vm_map_lock(vmmap);
+	if (vmmap->busy)
+		vm_map_wait_busy(vmmap);
+
+	lowmem_object = NULL;
+	highmem_object = NULL;
+	/*
+	 * Use the map iterator: following entry->right by hand does not
+	 * visit every entry of the map.
+	 */
+	VM_MAP_ENTRY_FOREACH(entry, vmmap) {
+		object = entry->object.vm_object;
+
+		if (entry->start == lowmem_segment.start &&
+		    entry->end == lowmem_segment.end)
+			lowmem_object = object;
+
+		if (entry->start == highmem_segment.start &&
+		    entry->end == highmem_segment.end)
+			highmem_object = object;
+	}
+
+	error = 0;
+	if (lowmem_object == NULL)
+		error = EINVAL;
+	else
+		vm_copy_object_pages(lowmem_object, highmem_object, pages_req);
+
+	/* Every path past vm_map_lock() must drop the lock again. */
+	vm_map_unlock(vmmap);
+
+	return (error);
+}
+#endif /* BHYVE_SNAPSHOT */
+
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -59,6 +59,7 @@
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_snapshot.h>
#include <x86/apicreg.h>
+#include <machine/vmm_migration.h>
#include "vmm_lapic.h"
#include "vmm_stat.h"
@@ -388,6 +389,8 @@
int *regnums;
#ifdef BHYVE_SNAPSHOT
struct vm_snapshot_meta *snapshot_meta;
+ struct vm_get_dirty_page_list *page_list;
+ struct vmm_migration_pages_req *pages_req;
#endif
error = vmm_priv_check(curthread->td_ucred);
@@ -871,6 +874,14 @@
case VM_RESTORE_TIME:
error = vm_restore_time(sc->vm);
break;
+ case VM_GET_DIRTY_PAGE_LIST:
+ page_list = (struct vm_get_dirty_page_list *)data;
+ error = vm_get_dirty_page_list(sc->vm, page_list);
+ break;
+ case VM_COPY_VMM_PAGES:
+ pages_req = (struct vmm_migration_pages_req *)data;
+ error = vm_copy_vmm_pages(sc->vm, pages_req);
+ break;
#endif
default:
error = ENOTTY;
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -390,6 +390,8 @@
void vm_object_unwire(vm_object_t object, vm_ooffset_t offset,
vm_size_t length, uint8_t queue);
struct vnode *vm_object_vnode(vm_object_t object);
+int vm_object_get_page(vm_object_t object, vm_pindex_t pindex, void *dst);
+int vm_object_set_page(vm_object_t object, vm_pindex_t pindex, void *src);
#endif /* _KERNEL */
#endif /* _VM_OBJECT_ */
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -2592,6 +2592,52 @@
CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject",
"List of VM objects");
+/*
+ * Copy the contents of the page at pindex in object out to the userspace
+ * buffer dst (PAGE_SIZE bytes) and clear the page's migration dirty state.
+ *
+ * The object must be write-locked by the caller.  The dirty state is
+ * cleared before the copy starts; if the copy fails, VPO_VMM_DIRTY is set
+ * again so that the page is not silently dropped from a later pass.
+ *
+ * Returns 0 on success, -1 when the object has no page at pindex, and the
+ * copyout() error otherwise.
+ *
+ * NOTE(review): copyout() runs with the object lock held; confirm that a
+ * fault on dst is acceptable in this context.
+ */
+int
+vm_object_get_page(vm_object_t object, vm_pindex_t pindex, void *dst)
+{
+	vm_page_t page;
+	vm_offset_t page_src;
+	int error;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	page = vm_page_lookup(object, pindex);
+	if (page == NULL) {
+		/* cannot find page */
+		return (-1);
+	}
+
+	vm_page_xbusy(page);
+	page->oflags &= ~VPO_VMM_DIRTY;
+
+	pmap_clear_modify(page);
+
+	/* Read the page through the direct map. */
+	page_src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(page));
+	error = copyout((void *)page_src, dst, PAGE_SIZE);
+	if (error != 0) {
+		/* Nothing was delivered: keep the page marked dirty. */
+		page->oflags |= VPO_VMM_DIRTY;
+	}
+	vm_page_xunbusy(page);
+
+	return (error);
+}
+
+/*
+ * Overwrite the page at pindex in object with PAGE_SIZE bytes read from
+ * the userspace buffer src.
+ *
+ * The object must be write-locked by the caller.
+ *
+ * Returns 0 on success, -1 when the object has no page at pindex, and the
+ * copyin() error otherwise.
+ *
+ * NOTE(review): copyin() runs with the object lock held and the page is
+ * neither busied nor marked dirty here; confirm that this is intended.
+ */
+int
+vm_object_set_page(vm_object_t object, vm_pindex_t pindex, void *src)
+{
+	vm_page_t page;
+	vm_offset_t page_dst;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	page = vm_page_lookup(object, pindex);
+	if (page == NULL) {
+		/* cannot find page */
+		return (-1);
+	}
+
+	/* Write the page through the direct map. */
+	page_dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(page));
+
+	return (copyin(src, (void *)page_dst, PAGE_SIZE));
+}
+
#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -295,6 +295,7 @@
#define VPO_SWAPSLEEP 0x02 /* waiting for swap to finish */
#define VPO_UNMANAGED 0x04 /* no PV management for page */
#define VPO_SWAPINPROG 0x08 /* swap I/O in progress on page */
+#define VPO_VMM_DIRTY 0x80 /* dirty bit used for bhyve migration */
/*
* Busy page implementation details.
@@ -693,6 +694,7 @@
void vm_page_valid(vm_page_t m);
int vm_page_is_valid(vm_page_t, int, int);
void vm_page_test_dirty(vm_page_t);
+uint8_t vm_page_test_vmm_dirty(vm_page_t m);
vm_page_bits_t vm_page_bits(int base, int size);
void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid);
void vm_page_free_pages_toq(struct spglist *free, bool update_wire_count);
@@ -890,6 +892,7 @@
vm_page_dirty_KBI(m);
#else
m->dirty = VM_PAGE_BITS_ALL;
+ m->oflags |= VPO_VMM_DIRTY;
#endif
}
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -1483,6 +1483,28 @@
/* Refer to this operation by its public name. */
KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!"));
m->dirty = VM_PAGE_BITS_ALL;
+ m->oflags |= VPO_VMM_DIRTY;
+}
+
+/*
+ * Tell whether page m counts as dirty for bhyve migration.
+ *
+ * Returns 1 when VPO_VMM_DIRTY is set in the page's oflags or the pmap
+ * reports the page as modified, and 0 otherwise.
+ */
+uint8_t
+vm_page_test_vmm_dirty(vm_page_t m)
+{
+	/* vm_page_test_dirty() is run with the page exclusive-busied. */
+	vm_page_xbusy(m);
+	vm_page_test_dirty(m);
+	vm_page_xunbusy(m);
+
+	if ((m->oflags & VPO_VMM_DIRTY) != 0)
+		return (1);
+
+	return (pmap_is_modified(m) ? 1 : 0);
}
/*
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
--- a/usr.sbin/bhyve/Makefile
+++ b/usr.sbin/bhyve/Makefile
@@ -34,6 +34,7 @@
kernemu_dev.c \
mem.c \
mevent.c \
+ migration.c \
mptbl.c \
net_backends.c \
net_utils.c \
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -96,6 +96,9 @@
#include "kernemu_dev.h"
#include "mem.h"
#include "mevent.h"
+#ifdef BHYVE_SNAPSHOT
+#include "migration.h"
+#endif
#include "mptbl.h"
#include "pci_emul.h"
#include "pci_irq.h"
@@ -243,6 +246,7 @@
" -p: pin 'vcpu' to 'hostcpu'\n"
#ifdef BHYVE_SNAPSHOT
" -r: path to checkpoint file\n"
+ " -R: <host,port> the source vm host and port for migration\n"
#endif
" -S: guest memory cannot be swapped\n"
" -s: <slot,driver,configinfo> PCI slot config\n"
@@ -1214,10 +1218,12 @@
char *optstr;
#ifdef BHYVE_SNAPSHOT
char *restore_file;
+ char *receive_migration;
struct restore_state rstate;
int vcpu;
restore_file = NULL;
+ receive_migration = NULL;
#endif
init_config();
@@ -1225,7 +1231,7 @@
progname = basename(argv[0]);
#ifdef BHYVE_SNAPSHOT
- optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:r:";
+ optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:r:R:";
#else
optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:";
#endif
@@ -1278,6 +1284,9 @@
case 'r':
restore_file = optarg;
break;
+ case 'R':
+ receive_migration = optarg;
+ break;
#endif
case 's':
if (strncmp(optarg, "help", strlen(optarg)) == 0) {
@@ -1494,7 +1503,25 @@
exit(1);
}
}
-#endif
+
+ if (receive_migration != NULL) {
+ if (vm_pause_user_devs(ctx) != 0) {
+ fprintf(stderr, "Failed to pause PCI device state.\n");
+ exit(1);
+ }
+
+ fprintf(stdout, "Starting the migration process...\r\n");
+ if (receive_vm_migration(ctx, receive_migration) != 0) {
+ fprintf(stderr, "Failed to migrate the vm.\r\n");
+ exit(1);
+ }
+
+ if (vm_resume_user_devs(ctx) != 0) {
+ fprintf(stderr, "Failed to resume PCI device state.\n");
+ exit(1);
+ }
+ }
+#endif /* BHYVE_SNAPSHOT */
error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
assert(error == 0);
@@ -1549,7 +1576,7 @@
if (init_checkpoint_thread(ctx) < 0)
printf("Failed to start checkpoint thread!\r\n");
- if (restore_file != NULL)
+ if ((restore_file != NULL) || (receive_migration != NULL))
vm_restore_time(ctx);
#endif
@@ -1563,7 +1590,7 @@
* If we restore a VM, start all vCPUs now (including APs), otherwise,
* let the guest OS to spin them up later via vmexits.
*/
- if (restore_file != NULL) {
+ if ((restore_file != NULL) || (receive_migration != NULL)) {
for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
if (vcpu == BSP)
continue;
diff --git a/usr.sbin/bhyve/migration.h b/usr.sbin/bhyve/migration.h
new file mode 100644
--- /dev/null
+++ b/usr.sbin/bhyve/migration.h
@@ -0,0 +1,87 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017-2020 Elena Mihailescu
+ * Copyright (c) 2017-2020 Darius Mihai
+ * Copyright (c) 2017-2020 Mihai Carabas
+ * All rights reserved.
+ * The migration feature was developed under sponsorships
+ * from Matthew Grooms.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _BHYVE_MIGRATION_
+#define _BHYVE_MIGRATION_
+
+#include <machine/vmm_dev.h>
+#include <vmmapi.h>
+#include "snapshot.h"
+
+struct vmctx;
+
+int receive_vm_migration(struct vmctx *ctx, char *migration_data);
+
+/* Warm Migration */
+#define MAX_DEV_NAME_LEN 64
+
+#define MAX_IP_LEN 64
+#define MAX_SPEC_LEN 256
+
+#define MIGRATION_SPECS_OK 0
+#define MIGRATION_SPECS_NOT_OK 1
+
+/* Parenthesized so the negative value expands safely inside expressions. */
+#define NO_KERN_STRUCT (-1)
+
+/* Direction of a socket transfer performed by the migration code. */
+enum migration_transfer_req {
+ MIGRATION_SEND_REQ = 0, /* write the buffer to the socket */
+ MIGRATION_RECV_REQ = 1 /* fill the buffer from the socket */
+};
+
+/*
+ * Values carried in migration_message_type.type to announce what kind of
+ * payload follows.
+ * NOTE(review): the values 6 and 7 are unassigned - confirm that the gap
+ * before MESSAGE_TYPE_UNKNOWN is intentional.
+ */
+enum message_types {
+ MESSAGE_TYPE_SPECS = 1,
+ MESSAGE_TYPE_METADATA = 2,
+ MESSAGE_TYPE_RAM = 3,
+ MESSAGE_TYPE_KERN = 4,
+ MESSAGE_TYPE_DEV = 5,
+ MESSAGE_TYPE_UNKNOWN = 8,
+};
+
+/*
+ * Header exchanged over the migration socket ahead of a payload.  The
+ * structure is packed because it is transferred as raw bytes.
+ */
+struct __attribute__((packed)) migration_message_type {
+ size_t len; /* size of the payload that follows */
+ unsigned int type; /* enum message_types */
+ unsigned int req_type; /* enum snapshot_req */
+ char name[MAX_DEV_NAME_LEN]; /* presumably a device name - TODO confirm */
+};
+
+/*
+ * Host description compared between source and destination before a
+ * migration is accepted.  The structure is packed because it is
+ * transferred as raw bytes.
+ */
+struct __attribute__((packed)) migration_system_specs {
+ char hw_machine[MAX_SPEC_LEN]; /* hw.machine sysctl string */
+ char hw_model[MAX_SPEC_LEN]; /* hw.model sysctl string */
+ size_t hw_pagesize; /* hw.pagesize sysctl value */
+};
+
+int vm_send_migrate_req(struct vmctx *ctx, struct migrate_req req, bool live);
+int vm_recv_migrate_req(struct vmctx *ctx, struct migrate_req req);
+
+#endif
diff --git a/usr.sbin/bhyve/migration.c b/usr.sbin/bhyve/migration.c
new file mode 100644
--- /dev/null
+++ b/usr.sbin/bhyve/migration.c
@@ -0,0 +1,1444 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017-2020 Elena Mihailescu
+ * Copyright (c) 2017-2020 Darius Mihai
+ * Copyright (c) 2017-2020 Mihai Carabas
+ * All rights reserved.
+ * The migration feature was developed under sponsorships
+ * from Matthew Grooms.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#include <capsicum_helpers.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <machine/vmm.h>
+#ifndef WITHOUT_CAPSICUM
+#include <machine/vmm_dev.h>
+#endif
+#include <machine/vmm_migration.h>
+#include <vmmapi.h>
+
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <netinet/in.h>
+
+#include <sys/mman.h>
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <unistd.h>
+
+#include "migration.h"
+#include "pci_emul.h"
+#include "snapshot.h"
+
+#define MB (1024UL * 1024)
+#define GB (1024UL * MB)
+
+/*
+ * Build a temporary struct vm_snapshot_meta and yield a pointer to it.
+ *
+ * This is a plain compound literal, so the object lives until the end of
+ * the block in which the macro is used.  Wrapping it in a statement
+ * expression would end its lifetime at the closing brace of that
+ * expression and leave the returned pointer dangling.
+ */
+#define ALLOCA_VM_SNAPSHOT_META(CTX, DEV_NAME, DEV_REQ, BUFFER, BUFFER_SIZE, OP) \
+	(&(struct vm_snapshot_meta) {						\
+		.ctx = (CTX),							\
+		.dev_name = (DEV_NAME),						\
+		.dev_req = (DEV_REQ),						\
+										\
+		.buffer.buf_start = (BUFFER),					\
+		.buffer.buf_size = (BUFFER_SIZE),				\
+		.op = (OP),							\
+	})
+
+/*
+ * DPRINTF: debug message on stderr, prefixed with the calling function's
+ * name and terminated with "\r\n".  It expands to nothing unless
+ * BHYVE_DEBUG is defined, so its arguments must not have side effects.
+ */
+#ifdef BHYVE_DEBUG
+#define DPRINTF(FMT, ...) \
+({ \
+ fprintf(stderr, "%s: " FMT "\r\n", __func__, ##__VA_ARGS__); \
+ })
+#else
+#define DPRINTF(FMT, ...)
+#endif
+
+/*
+ * EPRINTF: error message on stderr in the same format as DPRINTF, always
+ * compiled in.  FMT must be a string literal because it is pasted between
+ * two other literals.
+ */
+#define EPRINTF(FMT, ...) \
+({ \
+ fprintf(stderr, "%s: " FMT "\r\n", __func__, ##__VA_ARGS__); \
+ })
+
+/*
+ * Parse the argument of the -R option and start receiving a migrated VM.
+ *
+ * migration_data has the form "host" or "host,port"; when the port is
+ * omitted DEFAULT_MIGRATION_PORT is used.  The string itself is not
+ * modified.
+ *
+ * Returns the result of vm_recv_migrate_req(), or -1 when the argument
+ * cannot be duplicated or the port cannot be parsed.
+ */
+int
+receive_vm_migration(struct vmctx *ctx, char *migration_data)
+{
+	struct migrate_req req;
+	char *hostname, *pos;
+	int rc;
+
+	/* The request is passed on by value: leave no field indeterminate. */
+	memset(&req, 0, sizeof(req));
+
+	hostname = strdup(migration_data);
+	if (hostname == NULL) {
+		EPRINTF("Could not allocate memory for the hostname");
+		return (-1);
+	}
+
+	pos = strchr(hostname, ',');
+	if (pos != NULL) {
+		/* Split "host,port" in place. */
+		*pos = '\0';
+		pos++;
+
+		/*
+		 * sscanf() returns EOF, not 0, for an empty port string, so
+		 * require exactly one successful conversion.
+		 */
+		rc = sscanf(pos, "%d", &(req.port));
+		if (rc != 1) {
+			EPRINTF("Could not parse the port");
+			free(hostname);
+			return (-1);
+		}
+	} else {
+		/* If only one variable could be read, it should be the host */
+		req.port = DEFAULT_MIGRATION_PORT;
+	}
+
+	strlcpy(req.host, hostname, MAX_HOSTNAME_LEN);
+
+	rc = vm_recv_migrate_req(ctx, req);
+
+	free(hostname);
+	return (rc);
+}
+
+/*
+ * Collect the local host's hw.machine, hw.model and hw.pagesize sysctl
+ * values into specs.
+ *
+ * Returns 0 on success, or the failing sysctl() return value after
+ * reporting the problem with perror().
+ */
+static int
+get_system_specs_for_migration(struct migration_system_specs *specs)
+{
+	char text[MAX_SPEC_LEN];
+	int name[2];
+	size_t length;
+	int pagesize;
+	int error;
+
+	name[0] = CTL_HW;
+
+	/* hw.machine */
+	name[1] = HW_MACHINE;
+	memset(text, 0, MAX_SPEC_LEN);
+	length = sizeof(text);
+	error = sysctl(name, 2, text, &length, NULL, 0);
+	if (error != 0) {
+		perror("Could not retrieve HW_MACHINE specs");
+		return (error);
+	}
+	strlcpy(specs->hw_machine, text, MAX_SPEC_LEN);
+
+	/* hw.model */
+	name[1] = HW_MODEL;
+	memset(text, 0, MAX_SPEC_LEN);
+	length = sizeof(text);
+	error = sysctl(name, 2, text, &length, NULL, 0);
+	if (error != 0) {
+		perror("Could not retrieve HW_MODEL specs");
+		return (error);
+	}
+	strlcpy(specs->hw_model, text, MAX_SPEC_LEN);
+
+	/* hw.pagesize */
+	name[1] = HW_PAGESIZE;
+	length = sizeof(pagesize);
+	error = sysctl(name, 2, &pagesize, &length, NULL, 0);
+	if (error != 0) {
+		perror("Could not retrieve HW_PAGESIZE specs");
+		return (error);
+	}
+	specs->hw_pagesize = pagesize;
+
+	return (0);
+}
+
+/*
+ * Send or receive exactly len bytes of msg over socket, looping over short
+ * transfers.
+ *
+ * Returns 0 once all len bytes have been transferred, and -1 when the
+ * direction is unknown, when send()/recv() fails, or when the transfer
+ * stops before all data has moved (for instance because the peer closed
+ * the connection).
+ */
+static int
+migration_transfer_data(int socket, void *msg, size_t len, enum migration_transfer_req req)
+{
+	char *cursor;
+	size_t remaining;
+	ssize_t transferred;
+
+	/* Byte-wise cursor: arithmetic on void * is not standard C. */
+	cursor = msg;
+	remaining = len;
+
+	while (remaining > 0) {
+		switch (req) {
+		case MIGRATION_SEND_REQ:
+			transferred = send(socket, cursor, remaining, 0);
+			break;
+		case MIGRATION_RECV_REQ:
+			transferred = recv(socket, cursor, remaining, 0);
+			break;
+		default:
+			DPRINTF("Unknown transfer option");
+			return (-1);
+		}
+
+		if (transferred < 0) {
+			perror("Error while transfering data");
+			return (-1);
+		}
+		if (transferred == 0) {
+			/*
+			 * Nothing moved although data is still outstanding:
+			 * callers rely on a complete buffer, so this is an
+			 * error rather than success.
+			 */
+			DPRINTF("Connection closed before the transfer completed");
+			return (-1);
+		}
+
+		cursor += transferred;
+		remaining -= (size_t)transferred;
+	}
+
+	return (0);
+}
+
+/*
+ * Run the system-specification handshake over socket.
+ *
+ * With MIGRATION_SEND_REQ the local specs are announced and sent, and the
+ * peer's verdict is then received.  With MIGRATION_RECV_REQ the peer's
+ * specs are received and compared against the local machine, model and
+ * page size, and the verdict is sent back.
+ *
+ * Everything that is written to the socket is zero-initialized first so
+ * that no indeterminate stack bytes reach the peer, and received strings
+ * are forcibly NUL-terminated before they are used.
+ *
+ * Returns 0 when the specifications are accepted and -1 (or the error from
+ * collecting the local specs) otherwise.
+ */
+static int
+migration_check_specs(int socket, enum migration_transfer_req req)
+{
+	struct migration_system_specs local_specs;
+	struct migration_system_specs remote_specs;
+	struct migration_system_specs transfer_specs;
+	struct migration_message_type msg;
+	enum migration_transfer_req rev_req;
+	size_t response;
+	int rc;
+
+	if ((req != MIGRATION_SEND_REQ) && (req != MIGRATION_RECV_REQ)) {
+		DPRINTF("Unknown option for migration req");
+		return (-1);
+	}
+
+	if (req == MIGRATION_SEND_REQ)
+		rev_req = MIGRATION_RECV_REQ;
+	else
+		rev_req = MIGRATION_SEND_REQ;
+
+	memset(&local_specs, 0, sizeof(local_specs));
+	memset(&remote_specs, 0, sizeof(remote_specs));
+	memset(&transfer_specs, 0, sizeof(transfer_specs));
+	memset(&msg, 0, sizeof(msg));
+	response = MIGRATION_SPECS_OK;
+
+	rc = get_system_specs_for_migration(&local_specs);
+	if (rc != 0) {
+		EPRINTF("Could not retrieve local specs");
+		return (rc);
+	}
+
+	if (req == MIGRATION_SEND_REQ) {
+		/* Send message type to server: specs & len */
+		msg.type = MESSAGE_TYPE_SPECS;
+		msg.len = sizeof(local_specs);
+	}
+
+	rc = migration_transfer_data(socket, &msg, sizeof(msg), req);
+	if (rc < 0) {
+		DPRINTF("Could not send message type");
+		return (-1);
+	}
+
+	if ((req == MIGRATION_RECV_REQ) && (msg.type != MESSAGE_TYPE_SPECS)) {
+		DPRINTF(" Wrong message type received from remote");
+		return (-1);
+	}
+
+	/*
+	 * For the send req, we send the local specs and for the receive req
+	 * we receive the remote specs.
+	 */
+	if (req == MIGRATION_SEND_REQ)
+		transfer_specs = local_specs;
+
+	rc = migration_transfer_data(socket, &transfer_specs,
+	    sizeof(transfer_specs), req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer system specs");
+		return (-1);
+	}
+
+	if (req == MIGRATION_RECV_REQ) {
+		remote_specs = transfer_specs;
+
+		/* The peer's strings are untrusted: terminate them. */
+		remote_specs.hw_machine[MAX_SPEC_LEN - 1] = '\0';
+		remote_specs.hw_model[MAX_SPEC_LEN - 1] = '\0';
+
+		/* Check specs */
+		if ((strncmp(local_specs.hw_model, remote_specs.hw_model,
+		    MAX_SPEC_LEN) != 0) ||
+		    (strncmp(local_specs.hw_machine, remote_specs.hw_machine,
+		    MAX_SPEC_LEN) != 0) ||
+		    (local_specs.hw_pagesize != remote_specs.hw_pagesize)) {
+			EPRINTF("System specification mismatch");
+			DPRINTF("Local specs vs Remote Specs: \r\n"
+				"\tmachine: %s vs %s\r\n"
+				"\tmodel: %s vs %s\r\n"
+				"\tpagesize: %zu vs %zu\r\n",
+				local_specs.hw_machine,
+				remote_specs.hw_machine,
+				local_specs.hw_model,
+				remote_specs.hw_model,
+				local_specs.hw_pagesize,
+				remote_specs.hw_pagesize
+				);
+			response = MIGRATION_SPECS_NOT_OK;
+		}
+	}
+
+	/*
+	 * The source will receive the result of the checkup (i.e.
+	 * whether the migration is possible or the source and destination
+	 * are incompatible for migration) and the destination will send the
+	 * result of the checkup.
+	 */
+	rc = migration_transfer_data(socket, &response, sizeof(response),
+	    rev_req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer response from server");
+		return (-1);
+	}
+
+	if (response == MIGRATION_SPECS_NOT_OK)
+		return (-1);
+
+	fprintf(stdout, "%s: System specification accepted\r\n", __func__);
+
+	return (0);
+}
+
+/*
+ * Resolve hostname and store the textual form of its first address.
+ *
+ * *type receives the address family of the first result.  An AF_INET
+ * address is written to ipv4_addr and an AF_INET6 address to ipv6_addr;
+ * the buffer that is used must hold at least MAX_IP_LEN bytes.
+ *
+ * Returns 0 on success and -1 when the name cannot be resolved, the
+ * address family is not supported, or the address cannot be converted to
+ * text.  The getaddrinfo() result is released on every path.
+ */
+static int
+get_migration_host_and_type(const char *hostname, unsigned char *ipv4_addr,
+    unsigned char *ipv6_addr, int *type)
+{
+	struct addrinfo hints, *res;
+	const void *addr;
+	char *dst;
+	int rc;
+
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_family = AF_UNSPEC;
+
+	res = NULL;
+	rc = getaddrinfo(hostname, NULL, &hints, &res);
+	if (rc != 0) {
+		DPRINTF("Could not get address info");
+		return (-1);
+	}
+
+	*type = res->ai_family;
+	switch (res->ai_family) {
+	case AF_INET:
+		addr = &((struct sockaddr_in *)res->ai_addr)->sin_addr;
+		dst = (char *)ipv4_addr;
+		break;
+	case AF_INET6:
+		addr = &((struct sockaddr_in6 *)res->ai_addr)->sin6_addr;
+		dst = (char *)ipv6_addr;
+		break;
+	default:
+		DPRINTF("Unknown address family.");
+		freeaddrinfo(res);
+		return (-1);
+	}
+
+	rc = 0;
+	if (inet_ntop(res->ai_family, addr, dst, MAX_IP_LEN) == NULL) {
+		DPRINTF("Could not convert the address to text");
+		rc = -1;
+	}
+
+	freeaddrinfo(res);
+
+	return (rc);
+}
+
+/*
+ * Compare the local guest memory layout with the one announced by the peer.
+ *
+ * Returns MIGRATION_SPECS_OK when both the lowmem and the highmem sizes
+ * agree, and MIGRATION_SPECS_NOT_OK when either of them differs.
+ */
+static int
+migrate_check_memsize(size_t local_lowmem_size, size_t local_highmem_size,
+    size_t remote_lowmem_size, size_t remote_highmem_size)
+{
+	int verdict = MIGRATION_SPECS_OK;
+
+	if (remote_lowmem_size != local_lowmem_size) {
+		verdict = MIGRATION_SPECS_NOT_OK;
+		DPRINTF("Local and remote lowmem size mismatch");
+	}
+
+	if (remote_highmem_size != local_highmem_size) {
+		verdict = MIGRATION_SPECS_NOT_OK;
+		DPRINTF("Local and remote highmem size mismatch");
+	}
+
+	return (verdict);
+}
+
+/*
+ * Destination side of the warm-migration memory transfer.
+ *
+ * Receives the lowmem and highmem sizes of the source, compares them with
+ * the local guest's, sends the verdict back and, when the sizes agree,
+ * receives lowmem at the guest base address followed by highmem (if any)
+ * at base + 4GB.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int
+migrate_recv_memory(struct vmctx *ctx, int socket)
+{
+	size_t local_lowmem_size = 0, local_highmem_size = 0;
+	size_t peer_lowmem_size = 0, peer_highmem_size = 0;
+	char *guest_base;
+	int verdict;
+	int rc;
+
+	rc = vm_get_guestmem_from_ctx(ctx, &guest_base, &local_lowmem_size,
+	    &local_highmem_size);
+	if (rc != 0) {
+		DPRINTF("Could not get guest lowmem size and highmem size");
+		return (rc);
+	}
+
+	/* The source announces its layout first: lowmem, then highmem. */
+	rc = migration_transfer_data(socket, &peer_lowmem_size,
+	    sizeof(peer_lowmem_size), MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not recv lowmem size");
+		return (rc);
+	}
+
+	rc = migration_transfer_data(socket, &peer_highmem_size,
+	    sizeof(peer_highmem_size), MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not recv highmem size");
+		return (rc);
+	}
+
+	/* The verdict is sent even when negative so the source can stop. */
+	verdict = migrate_check_memsize(local_lowmem_size, local_highmem_size,
+	    peer_lowmem_size, peer_highmem_size);
+
+	rc = migration_transfer_data(socket, &verdict, sizeof(verdict),
+	    MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send migration_ok to remote");
+		return (rc);
+	}
+
+	if (verdict != MIGRATION_SPECS_OK) {
+		DPRINTF("Memory size mismatch with remote host");
+		return (-1);
+	}
+
+	rc = migration_transfer_data(socket, guest_base, local_lowmem_size,
+	    MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not recv chunk lowmem.");
+		return (-1);
+	}
+
+	if (local_highmem_size == 0)
+		return (0);
+
+	rc = migration_transfer_data(socket, guest_base + 4 * GB,
+	    local_highmem_size, MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not recv highmem");
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Source side of the warm-migration memory transfer.
+ *
+ * Sends the lowmem and highmem sizes, waits for the destination to confirm
+ * that they match its own, then sends lowmem from the guest base address
+ * followed by highmem (if any) from base + 4GB.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int
+migrate_send_memory(struct vmctx *ctx, int socket)
+{
+	size_t lowmem_size, highmem_size;
+	char *baseaddr;
+	int memsize_ok;
+	int rc;
+
+	rc = vm_get_guestmem_from_ctx(ctx, &baseaddr,
+	    &lowmem_size, &highmem_size);
+	if (rc != 0) {
+		DPRINTF("Could not get guest lowmem size and highmem size");
+		return (rc);
+	}
+
+	/* Send the size of the lowmem segment */
+	rc = migration_transfer_data(socket, &lowmem_size,
+	    sizeof(lowmem_size), MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send lowmem size");
+		return (rc);
+	}
+
+	/* Send the size of the highmem segment */
+	rc = migration_transfer_data(socket, &highmem_size,
+	    sizeof(highmem_size), MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send highmem size");
+		return (rc);
+	}
+
+	/* Wait for answer - params ok (if memory size matches) */
+	rc = migration_transfer_data(socket, &memsize_ok, sizeof(memsize_ok),
+	    MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not receive response from remote");
+		return (rc);
+	}
+
+	if (memsize_ok != MIGRATION_SPECS_OK) {
+		DPRINTF("Memory size mismatch with remote host");
+		return (-1);
+	}
+
+	/* Send the lowmem segment */
+	rc = migration_transfer_data(socket, baseaddr, lowmem_size,
+	    MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send lowmem");
+		return (-1);
+	}
+
+	/* Send the highmem segment, which is read from base + 4GB */
+	if (highmem_size > 0) {
+		rc = migration_transfer_data(socket, baseaddr + 4 * GB,
+		    highmem_size, MIGRATION_SEND_REQ);
+		if (rc < 0) {
+			DPRINTF("Could not send highmem");
+			return (-1);
+		}
+	}
+
+	return (0);
+}
+
+/**
+ * Transfer the state of one kernel structure between the two hosts.
+ *
+ * The source host saves the state of the kernel structure identified by
+ * struct_req and sends a message that contains the type of data to be sent
+ * (MESSAGE_TYPE_KERN), the size of the saved state and the request index
+ * that identifies the structure.  It then sends the state itself.  The
+ * destination host receives the message and the state and restores the
+ * structure named in the message; struct_req is ignored on that side.
+ *
+ * buffer is the scratch area used for the state and must be at least
+ * SNAPSHOT_BUFFER_SIZE bytes long.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static inline int
+migrate_kern_struct(struct vmctx *ctx, int socket, char *buffer,
+    enum snapshot_req struct_req, enum migration_transfer_req req)
+{
+	int rc;
+	struct migration_message_type msg;
+	struct vm_snapshot_meta *meta;
+
+	if ((req != MIGRATION_SEND_REQ) && (req != MIGRATION_RECV_REQ)) {
+		DPRINTF("Unknown request");
+		return (-1);
+	}
+
+	memset(&msg, 0, sizeof(msg));
+	if (req == MIGRATION_SEND_REQ) {
+		msg.type = MESSAGE_TYPE_KERN;
+
+		meta = ALLOCA_VM_SNAPSHOT_META(ctx, NULL, struct_req, buffer,
+		    SNAPSHOT_BUFFER_SIZE, VM_SNAPSHOT_SAVE);
+		memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
+		meta->buffer.buf = meta->buffer.buf_start;
+		meta->buffer.buf_rem = meta->buffer.buf_size;
+
+		rc = vm_snapshot_req(meta);
+		if (rc < 0) {
+			DPRINTF("Could not get struct with req %d", struct_req);
+			return (-1);
+		}
+
+		msg.len = vm_get_snapshot_size(meta);
+		msg.req_type = struct_req;
+	}
+
+	rc = migration_transfer_data(socket, &msg, sizeof(msg), req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer message type for kern struct %d",
+		    struct_req);
+		return (-1);
+	}
+
+	if ((req == MIGRATION_RECV_REQ) && (msg.type != MESSAGE_TYPE_KERN)) {
+		DPRINTF("Receive wrong message type.");
+		return (-1);
+	}
+
+	/*
+	 * On the receiving side msg.len comes from the peer; never let it
+	 * drive a transfer larger than the scratch buffer.
+	 */
+	if (msg.len > SNAPSHOT_BUFFER_SIZE) {
+		DPRINTF("Kern struct %d does not fit in the snapshot buffer",
+		    msg.req_type);
+		return (-1);
+	}
+
+	rc = migration_transfer_data(socket, buffer, msg.len, req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer struct with req %d", struct_req);
+		return (-1);
+	}
+
+	if (req == MIGRATION_RECV_REQ) {
+		meta = ALLOCA_VM_SNAPSHOT_META(ctx, NULL, msg.req_type, buffer,
+		    msg.len, VM_SNAPSHOT_RESTORE);
+		meta->buffer.buf = meta->buffer.buf_start;
+		meta->buffer.buf_rem = meta->buffer.buf_size;
+
+		rc = vm_snapshot_req(meta);
+		if (rc != 0) {
+			DPRINTF("Failed to restore struct %d", msg.req_type);
+			return (-1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Send or receive, depending on req, the state of every kernel structure
+ * listed by get_snapshot_kern_structs().  One SNAPSHOT_BUFFER_SIZE scratch
+ * buffer is shared by all the transfers and the loop stops at the first
+ * failure.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+migrate_kern_data(struct vmctx *ctx, int socket, enum migration_transfer_req req)
+{
+	const struct vm_snapshot_kern_info *kern_structs;
+	char *scratch;
+	int count, idx, status;
+
+	kern_structs = get_snapshot_kern_structs(&count);
+
+	scratch = malloc(SNAPSHOT_BUFFER_SIZE);
+	if (scratch == NULL) {
+		EPRINTF("Could not allocate memory.");
+		return (-1);
+	}
+
+	status = 0;
+	for (idx = 0; idx < count && status == 0; idx++) {
+		if (req == MIGRATION_SEND_REQ) {
+			if (migrate_kern_struct(ctx, socket, scratch,
+			    kern_structs[idx].req, MIGRATION_SEND_REQ) < 0) {
+				DPRINTF("Could not send %s",
+				    kern_structs[idx].struct_name);
+				status = -1;
+			}
+		} else if (req == MIGRATION_RECV_REQ) {
+			/* The structure to restore is named by the sender. */
+			if (migrate_kern_struct(ctx, socket, scratch,
+			    NO_KERN_STRUCT, MIGRATION_RECV_REQ) < 0) {
+				DPRINTF("Could not restore struct %s",
+				    kern_structs[idx].struct_name);
+				status = -1;
+			}
+		} else {
+			DPRINTF("Unknown transfer request");
+			status = -1;
+		}
+	}
+
+	free(scratch);
+
+	return (status);
+}
+
+/*
+ * Look up name in the table returned by get_snapshot_devs(), comparing at
+ * most MAX_DEV_NAME_LEN characters.  Returns the matching entry, or NULL
+ * when no entry has that name.
+ */
+static inline const struct vm_snapshot_dev_info *
+find_entry_for_dev(const char *name)
+{
+	const struct vm_snapshot_dev_info *table;
+	int count, idx;
+
+	table = get_snapshot_devs(&count);
+
+	idx = 0;
+	while (idx < count) {
+		if (strncmp(name, table[idx].dev_name, MAX_DEV_NAME_LEN) == 0)
+			return (&table[idx]);
+		idx++;
+	}
+
+	return NULL;
+}
+
+/*
+ * Transfer the state of one emulated device between the two hosts.
+ *
+ * On MIGRATION_SEND_REQ the device named dev is looked up in the snapshot
+ * device table, its state is saved into buffer through the device's
+ * snapshot callback, and a MESSAGE_TYPE_DEV header (name and length)
+ * followed by the state is sent.  On MIGRATION_RECV_REQ the header is
+ * received first, then the state, and the device named in the header is
+ * restored through its snapshot callback; dev is not used to select the
+ * device on that side and may be NULL.
+ *
+ * buffer is a scratch area of len bytes.  A length of zero in the header
+ * means that there is no state to transfer for the device.
+ *
+ * Returns 0 on success, or when the device has no entry in the snapshot
+ * device table, and -1 on failure.
+ */
+static inline int
+migrate_transfer_dev(struct vmctx *ctx, int socket, const char *dev,
+    char *buffer, size_t len, enum migration_transfer_req req)
+{
+	int rc;
+	size_t data_size;
+	struct migration_message_type msg;
+	struct vm_snapshot_meta *meta;
+	const struct vm_snapshot_dev_info *dev_info;
+	const char *label;
+
+	if ((req != MIGRATION_SEND_REQ) && (req != MIGRATION_RECV_REQ)) {
+		DPRINTF("Unknown transfer request option");
+		return (-1);
+	}
+
+	/* dev is NULL on the receiving side; never hand NULL to "%s". */
+	label = (dev != NULL) ? dev : "(unknown)";
+	data_size = 0;
+
+	memset(&msg, 0, sizeof(msg));
+	memset(buffer, 0, len);
+	if (req == MIGRATION_SEND_REQ) {
+		/*
+		 * NOTE(review): returning here sends no header, while the
+		 * receiver expects one per announced device.  The caller in
+		 * this file only passes names taken from the device table.
+		 */
+		dev_info = find_entry_for_dev(dev);
+		if (dev_info == NULL) {
+			EPRINTF("Could not find the device %s "
+			    "or migration not implemented yet for it.", dev);
+			return (0);
+		}
+
+		meta = ALLOCA_VM_SNAPSHOT_META(ctx, dev, 0, buffer, len,
+		    VM_SNAPSHOT_SAVE);
+
+		memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
+		meta->buffer.buf = meta->buffer.buf_start;
+		meta->buffer.buf_rem = meta->buffer.buf_size;
+
+		rc = (*dev_info->snapshot_cb)(meta);
+		if (rc < 0) {
+			DPRINTF("Could not get info about %s dev", dev);
+			return (-1);
+		}
+
+		data_size = vm_get_snapshot_size(meta);
+
+		msg.type = MESSAGE_TYPE_DEV;
+		msg.len = data_size;
+		strlcpy(msg.name, dev, MAX_DEV_NAME_LEN);
+	}
+
+	rc = migration_transfer_data(socket, &msg, sizeof(msg), req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer msg for %s dev", label);
+		return (-1);
+	}
+
+	if (req == MIGRATION_RECV_REQ) {
+		if (msg.type != MESSAGE_TYPE_DEV) {
+			DPRINTF("Wrong message type for device.");
+			return (-1);
+		}
+
+		/* The header comes from the peer: sanitize name and length. */
+		msg.name[MAX_DEV_NAME_LEN - 1] = '\0';
+		if (msg.len > len) {
+			EPRINTF("State of %s dev does not fit in the buffer",
+			    msg.name);
+			return (-1);
+		}
+
+		label = msg.name;
+		data_size = msg.len;
+	}
+
+	if (data_size == 0)
+		return (0);	/* this type of device is not used */
+
+	rc = migration_transfer_data(socket, buffer, data_size, req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer %s dev", label);
+		return (-1);
+	}
+
+	if (req == MIGRATION_RECV_REQ) {
+		dev_info = find_entry_for_dev(msg.name);
+		if (dev_info == NULL) {
+			EPRINTF("Could not find the device %s "
+			    "or migration not implemented yet for it.", msg.name);
+			return (0);
+		}
+		meta = ALLOCA_VM_SNAPSHOT_META(ctx, msg.name, 0, buffer,
+		    data_size, VM_SNAPSHOT_RESTORE);
+		meta->buffer.buf = meta->buffer.buf_start;
+		meta->buffer.buf_rem = meta->buffer.buf_size;
+
+		rc = (*dev_info->snapshot_cb)(meta);
+		if (rc != 0) {
+			EPRINTF("Could not restore %s dev", msg.name);
+			return (-1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Transfer the state of the emulated devices between the two hosts.
+ *
+ * The source sends the number of entries in the snapshot device table and
+ * then the state of each of them; the destination receives that count and
+ * then as many device states.  One SNAPSHOT_BUFFER_SIZE scratch buffer is
+ * shared by all the transfers and is released on every exit path.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+migrate_devs(struct vmctx *ctx, int socket, enum migration_transfer_req req)
+{
+	int i, num_items;
+	int rc, error;
+	char *buffer;
+	const struct vm_snapshot_dev_info *snapshot_devs;
+
+	error = 0;
+	buffer = malloc(SNAPSHOT_BUFFER_SIZE);
+	if (buffer == NULL) {
+		EPRINTF("Could not allocate memory");
+		error = -1;
+		goto end;
+	}
+
+	if (req == MIGRATION_SEND_REQ) {
+		/*
+		 * Send to the destination the number of devices that will
+		 * be migrated.
+		 */
+		snapshot_devs = get_snapshot_devs(&num_items);
+
+		rc = migration_transfer_data(socket, &num_items,
+		    sizeof(num_items), req);
+		if (rc < 0) {
+			DPRINTF("Could not send num_items to destination");
+			error = -1;
+			goto end;
+		}
+
+		for (i = 0; i < num_items; i++) {
+			rc = migrate_transfer_dev(ctx, socket,
+			    snapshot_devs[i].dev_name, buffer,
+			    SNAPSHOT_BUFFER_SIZE, req);
+			if (rc < 0) {
+				DPRINTF("Could not send %s",
+				    snapshot_devs[i].dev_name);
+				error = -1;
+				goto end;
+			}
+		}
+	} else if (req == MIGRATION_RECV_REQ) {
+		/* receive the number of devices that will be migrated */
+		rc = migration_transfer_data(socket, &num_items,
+		    sizeof(num_items), MIGRATION_RECV_REQ);
+		if (rc < 0) {
+			DPRINTF("Could not recv num_items from source");
+			error = -1;
+			goto end;
+		}
+
+		for (i = 0; i < num_items; i++) {
+			rc = migrate_transfer_dev(ctx, socket, NULL, buffer,
+			    SNAPSHOT_BUFFER_SIZE, req);
+			if (rc < 0) {
+				DPRINTF("Could not recv device");
+				error = -1;
+				goto end;
+			}
+		}
+	}
+
+end:
+	free(buffer);
+
+	return (error);
+}
+
+
+#define MIGRATION_ROUNDS 4
+
+/*
+ * Count the entries of page_list[0 .. size-1] that are equal to 1, the
+ * value that marks a page as dirty.
+ */
+static size_t
+num_dirty_pages(char *page_list, size_t size)
+{
+	size_t dirty, remaining;
+
+	dirty = 0;
+	remaining = size;
+	while (remaining-- > 0) {
+		if (page_list[remaining] == 1)
+			dirty++;
+	}
+
+	return (dirty);
+}
+
+/*
+ * Build a VMM_GET_PAGES request with up to VMM_PAGE_CHUNK dirty pages and
+ * submit it with vm_copy_vmm_pages().
+ *
+ * page_list is scanned from *current_position; the index of every entry
+ * equal to 1 is recorded in req->pages[].pindex.  On return
+ * *current_position is the index at which the next scan must start and
+ * req->pages_required is the number of pages recorded.
+ *
+ * Returns the value returned by vm_copy_vmm_pages().
+ */
+static int
+migration_fill_vmm_migration_pages_req(struct vmctx *ctx,
+    struct vmm_migration_pages_req *req, char *page_list, size_t size,
+    size_t *current_position)
+{
+	size_t filled, pos;
+
+	filled = 0;
+	pos = *current_position;
+	while (pos < size && filled != VMM_PAGE_CHUNK) {
+		if (page_list[pos] == 1) {
+			req->pages[filled].pindex = pos;
+			filled++;
+		}
+		pos++;
+	}
+
+	*current_position = pos;
+	req->pages_required = filled;
+	req->req_type = VMM_GET_PAGES;
+
+	return vm_copy_vmm_pages(ctx, req);
+}
+
+/*
+ * Transfer one round of guest pages between the two hosts.
+ *
+ * page_list holds one byte per guest page; a value of 1 selects the page
+ * for transfer.  The list itself is transferred first (sent by the source,
+ * received by the destination), then the selected pages follow in chunks
+ * of at most VMM_PAGE_CHUNK pages.  The source reads each chunk from the
+ * guest with a VMM_GET_PAGES request, pausing the vcpus around it unless
+ * already_locked is set; the destination writes each received chunk into
+ * the guest with a VMM_SET_PAGES request.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+migrate_pages(struct vmctx *ctx, int socket, struct vmm_migration_pages_req *req,
+    char *page_list, size_t page_list_size, int already_locked,
+    enum migration_transfer_req migration_req)
+{
+	size_t dirty_pages;
+	size_t cursor, filled, slot;
+	int rc;
+
+	if (!(migration_req == MIGRATION_SEND_REQ ||
+	    migration_req == MIGRATION_RECV_REQ)) {
+		EPRINTF("wrong migration transfer req");
+		return (-1);
+	}
+
+	/*
+	 * The dirty/not dirty state of every page travels first, so that
+	 * both hosts agree on which pages follow.
+	 */
+	rc = migration_transfer_data(socket, page_list, page_list_size,
+	    migration_req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer page_list remote");
+		return (-1);
+	}
+
+	dirty_pages = num_dirty_pages(page_list, page_list_size);
+
+	for (cursor = 0; cursor < page_list_size; ) {
+		/* Start every chunk from an empty request. */
+		for (slot = 0; slot < VMM_PAGE_CHUNK; slot++)
+			req->pages[slot].pindex = -1;
+		req->pages_required = 0;
+
+		if (migration_req == MIGRATION_SEND_REQ) {
+			/* Only the source host pauses the vcpus */
+			if (!already_locked)
+				vm_vcpu_pause(ctx);
+
+			rc = migration_fill_vmm_migration_pages_req(ctx, req,
+			    page_list, page_list_size, &cursor);
+
+			if (!already_locked)
+				vm_vcpu_resume(ctx);
+
+			if (rc < 0) {
+				DPRINTF("Could not get pages");
+				return (-1);
+			}
+		} else {
+			/* Select the next pages announced by the source. */
+			filled = 0;
+			for (slot = cursor; slot < page_list_size &&
+			    filled != VMM_PAGE_CHUNK; slot++) {
+				if (page_list[slot] != 1)
+					continue;
+				req->pages[filled].pindex = slot;
+				filled++;
+			}
+
+			cursor = slot;
+			req->pages_required = filled;
+		}
+
+		for (slot = 0; slot < req->pages_required; slot++) {
+			rc = migration_transfer_data(socket,
+			    req->pages[slot].page, PAGE_SIZE, migration_req);
+			if (rc < 0) {
+				DPRINTF("Cound not transfer page %zu",
+				    req->pages[slot].pindex);
+				return (-1);
+			}
+		}
+
+		if (migration_req == MIGRATION_RECV_REQ) {
+			req->req_type = VMM_SET_PAGES;
+
+			rc = vm_copy_vmm_pages(ctx, req);
+			if (rc < 0) {
+				EPRINTF("Could not copy pages into guest memory");
+				return (-1);
+			}
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Fill page_list with the dirty state of the guest pages.
+ *
+ * page_list must have room for one byte per guest page, lowmem pages and
+ * highmem pages together, as reported by vm_get_pages_num().
+ *
+ * Returns 0 on success; -1 when page_list is NULL, when the page count is
+ * not available or when the dirty page list cannot be retrieved.
+ */
+static int
+search_dirty_pages(struct vmctx *ctx, char *page_list)
+{
+	size_t lowmem_pages, highmem_pages, pages;
+	int error;
+
+	if (page_list == NULL)
+		return (-1);
+
+	error = vm_get_pages_num(ctx, &lowmem_pages, &highmem_pages);
+	if (error != 0) {
+		DPRINTF("Error while trying to get page number");
+		return (-1);
+	}
+
+	pages = lowmem_pages + highmem_pages;
+
+	/* A silent failure here would make the round skip dirty pages. */
+	error = vm_get_dirty_page_list(ctx, page_list, pages);
+	if (error != 0) {
+		DPRINTF("Could not get the dirty page list");
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Set the first list_len entries of page_list to c.  A NULL page_list is
+ * ignored.
+ */
+static inline void
+fill_page_list(char *page_list, size_t list_len, char c)
+{
+	if (page_list != NULL)
+		memset(page_list, c, list_len);
+}
+
+/*
+ * Source side of the live migration.
+ *
+ * The number of rounds is sent first.  Round 0 sends every guest page;
+ * each following round sends the pages reported dirty by
+ * search_dirty_pages(), the vcpus being paused only while the dirty list
+ * and the pages are collected.  Before the last round the devices and the
+ * vcpus are paused for good, and after it the kernel structures and the
+ * device state are sent.
+ *
+ * When the destination confirms the migration, the guest is destroyed and
+ * the process exits; this function does not return in that case.  On
+ * failure everything this function paused is resumed and a non-zero value
+ * is returned.
+ */
+static int
+live_migrate_send(struct vmctx *ctx, int socket)
+{
+	int error, i, rc;
+	uint8_t rounds;
+	bool devs_paused, vcpus_paused;
+	size_t lowmem_size, highmem_size;
+	size_t migration_completed;
+	size_t lowmem_pages, highmem_pages, pages;
+	char *baseaddr, *page_list_indexes;
+	struct vmm_migration_pages_req memory_req;
+
+	devs_paused = vcpus_paused = false;
+	lowmem_size = highmem_size = 0;
+	page_list_indexes = NULL;
+	rounds = MIGRATION_ROUNDS;
+
+	/* Send the number of memory rounds to destination */
+	error = migration_transfer_data(socket, &rounds, sizeof(rounds),
+	    MIGRATION_SEND_REQ);
+	if (error != 0) {
+		DPRINTF("Could not send the number of rounds remote");
+		goto done;
+	}
+
+	/* Compute the number of guest pages */
+	error = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem_size,
+	    &highmem_size);
+	if (error != 0) {
+		DPRINTF("Could not get guest lowmem size and highmem size");
+		goto done;
+	}
+
+	error = vm_get_pages_num(ctx, &lowmem_pages, &highmem_pages);
+	if (error != 0) {
+		DPRINTF("Error while trying to get page number");
+		goto done;
+	}
+	pages = lowmem_pages + highmem_pages;
+
+	/* alloc page_list_indexes */
+	page_list_indexes = malloc(pages * sizeof(char));
+	if (page_list_indexes == NULL) {
+		perror("Page list indexes could not be allocated");
+		error = -1;
+		goto done;
+	}
+
+	error = vm_init_vmm_migration_pages_req(ctx, &memory_req);
+	if (error < 0) {
+		DPRINTF("Could not initialize struct vmm_migration_pages_req");
+		goto done;
+	}
+
+	for (i = 0; i <= MIGRATION_ROUNDS; i++) {
+		if (i == MIGRATION_ROUNDS) {	/* Last Round */
+			/*
+			 * Marked before the call so that a partial pause
+			 * is still undone by the cleanup code.
+			 */
+			devs_paused = true;
+			rc = vm_pause_user_devs(ctx);
+			if (rc != 0) {
+				DPRINTF("Could not pause devices");
+				error = rc;
+				goto done;
+			}
+
+			vm_vcpu_pause(ctx);
+			vcpus_paused = true;
+		}
+
+		if (i == 0) {	/* First Round */
+			fill_page_list(page_list_indexes, pages, 1);
+		} else {
+			DPRINTF("ROUND: %d", i);
+			fill_page_list(page_list_indexes, pages, 0);
+
+			if (i != MIGRATION_ROUNDS) {
+				vm_vcpu_pause(ctx);
+				vcpus_paused = true;
+			}
+
+			/* Search the dirty pages and populate page_list_index */
+			error = search_dirty_pages(ctx, page_list_indexes);
+			if (error != 0) {
+				DPRINTF("Couldn't search for the dirty pages");
+				goto done;
+			}
+
+			if (i != MIGRATION_ROUNDS) {
+				vm_vcpu_resume(ctx);
+				vcpus_paused = false;
+			}
+		}
+
+		error = migrate_pages(ctx, socket, &memory_req,
+		    page_list_indexes, pages, i == MIGRATION_ROUNDS ? 1 : 0,
+		    MIGRATION_SEND_REQ);
+		if (error != 0) {
+			DPRINTF("Couldn't send dirty pages to dest");
+			goto done;
+		}
+	}
+
+	/* Send kern data */
+	error = migrate_kern_data(ctx, socket, MIGRATION_SEND_REQ);
+	if (error != 0) {
+		DPRINTF("Could not send kern data to destination");
+		goto done;
+	}
+
+	/* Send PCI data */
+	error = migrate_devs(ctx, socket, MIGRATION_SEND_REQ);
+	if (error != 0) {
+		DPRINTF("Could not send pci devs to destination");
+		goto done;
+	}
+
+	/* Wait for migration completed */
+	error = migration_transfer_data(socket, &migration_completed,
+	    sizeof(migration_completed), MIGRATION_RECV_REQ);
+	if ((error < 0) || (migration_completed != MIGRATION_SPECS_OK)) {
+		DPRINTF("Could not recv migration completed remote or received error");
+		/* A clean transfer of a negative answer is still a failure. */
+		error = -1;
+		goto done;
+	}
+
+	/* Poweroff the vm */
+	vm_vcpu_resume(ctx);
+
+	vm_destroy(ctx);
+	exit(0);
+
+done:
+	/* Undo only what was actually paused, vcpus first. */
+	if (vcpus_paused)
+		vm_vcpu_resume(ctx);
+	if (devs_paused) {
+		rc = vm_resume_user_devs(ctx);
+		if (rc != 0)
+			EPRINTF("Could not resume devices");
+	}
+	free(page_list_indexes);
+	return (error);
+}
+
+/*
+ * Destination side of the live-migration memory transfer.
+ *
+ * Receives the number of rounds announced by the source and then runs
+ * rounds + 1 page transfers: the preliminary round, in which the entire
+ * memory is received, followed by the rounds that carry only the pages
+ * dirtied in the meantime.  Since the vcpus are not started on this host
+ * there is nothing to pause, so migrate_pages() is called with
+ * already_locked set.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int
+live_migrate_recv(struct vmctx *ctx, int socket)
+{
+	int error, index;
+	uint8_t rounds;
+	size_t lowmem_size, highmem_size;
+	size_t lowmem_pages, highmem_pages, pages;
+	char *baseaddr, *page_list_indexes;
+	struct vmm_migration_pages_req memory_req;
+
+	lowmem_size = highmem_size = 0;
+	page_list_indexes = NULL;
+
+	error = migration_transfer_data(socket, &rounds, sizeof(rounds),
+	    MIGRATION_RECV_REQ);
+	if (error != 0) {
+		DPRINTF("Could not recv the number of rounds from remote");
+		goto done;
+	}
+
+	/* Compute the number of guest pages */
+	error = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem_size,
+	    &highmem_size);
+	if (error != 0) {
+		DPRINTF("Could not get guest lowmem size and highmem size");
+		goto done;
+	}
+
+	error = vm_get_pages_num(ctx, &lowmem_pages, &highmem_pages);
+	if (error != 0) {
+		DPRINTF("Error while trying to get page number");
+		goto done;
+	}
+	pages = lowmem_pages + highmem_pages;
+
+	/* alloc page_list_indexes */
+	page_list_indexes = malloc(pages * sizeof(char));
+	if (page_list_indexes == NULL) {
+		perror("Page list indexes could not be allocated");
+		error = -1;
+		goto done;
+	}
+
+	error = vm_init_vmm_migration_pages_req(ctx, &memory_req);
+	if (error < 0) {
+		DPRINTF("Could not initialize struct vmm_migration_pages_req");
+		goto done;
+	}
+
+	for (index = 0; index <= rounds; index++) {
+		/* The list is overwritten by the one sent by the source. */
+		fill_page_list(page_list_indexes, pages, 0);
+
+		error = migrate_pages(ctx, socket, &memory_req,
+		    page_list_indexes, pages, 1, MIGRATION_RECV_REQ);
+		if (error != 0) {
+			DPRINTF("Couldn't recv dirty pages from source");
+			goto done;
+		}
+	}
+
+	error = 0;
+done:
+	free(page_list_indexes);
+	return (error);
+}
+
+/*
+ * Set up the TCP connection used by a migration.
+ *
+ * req.host is resolved first; only IPv4 is supported.
+ *
+ * MIGRATION_SEND_REQ: connect to req.host on req.port and store the
+ * connected socket in *socket_fd.  connection_socket_fd is not used.
+ *
+ * MIGRATION_RECV_REQ: bind to req.port on any local address, wait for one
+ * connection and store the listening socket in *socket_fd and the accepted
+ * one in *connection_socket_fd.
+ *
+ * Returns 0 on success and a non-zero value on failure, in which case no
+ * socket is left open.
+ */
+static inline int
+migrate_connections(struct migrate_req req, int *socket_fd,
+    int *connection_socket_fd, enum migration_transfer_req type)
+{
+	unsigned char ipv4_addr[MAX_IP_LEN];
+	unsigned char ipv6_addr[MAX_IP_LEN];
+	int addr_type;
+	int error;
+	int s, con_socket;
+	struct sockaddr_in sa, client_sa;
+	socklen_t client_len;
+	int rc;
+
+	rc = get_migration_host_and_type(req.host, ipv4_addr,
+	    ipv6_addr, &addr_type);
+	if (rc != 0) {
+		EPRINTF("Invalid address.");
+		DPRINTF("IP address used for migration: %s;\r\n"
+		    "Port used for migration: %d",
+		    req.host, req.port);
+		return (rc);
+	}
+
+	if (addr_type == AF_INET6) {
+		EPRINTF("IPv6 is not supported yet for migration. "
+		    "Please try again using a IPv4 address.");
+
+		DPRINTF("IP address used for migration: %s;\r\nPort used for migration: %d",
+		    (const char *)ipv6_addr, req.port);
+		return (-1);
+	}
+
+	s = socket(AF_INET, SOCK_STREAM, 0);
+	if (s < 0) {
+		perror("Could not create socket");
+		return (-1);
+	}
+
+	bzero(&sa, sizeof(sa));
+
+	switch (type) {
+	case MIGRATION_SEND_REQ:
+		fprintf(stdout, "%s: Starting connection to %s on %d port...\r\n",
+		    __func__, (const char *)ipv4_addr, req.port);
+
+		sa.sin_family = AF_INET;
+		sa.sin_port = htons(req.port);
+
+		rc = inet_pton(AF_INET, (const char *)ipv4_addr, &sa.sin_addr);
+		if (rc <= 0) {
+			DPRINTF("Could not retrive the IPV4 address");
+			error = -1;
+			goto done_close_s;
+		}
+
+		rc = connect(s, (struct sockaddr *)&sa, sizeof(sa));
+		if (rc < 0) {
+			perror("Could not connect to the remote host");
+			error = rc;
+			goto done_close_s;
+		}
+		*socket_fd = s;
+		break;
+	case MIGRATION_RECV_REQ:
+		fprintf(stdout, "%s: Waiting for connections from %s on %d port...\r\n",
+		    __func__, (const char *)ipv4_addr, req.port);
+
+		sa.sin_family = AF_INET;
+		sa.sin_port = htons(req.port);
+		sa.sin_addr.s_addr = htonl(INADDR_ANY);
+
+		rc = bind(s, (struct sockaddr *)&sa, sizeof(sa));
+		if (rc < 0) {
+			perror("Could not bind");
+			error = rc;
+			goto done_close_s;
+		}
+
+		rc = listen(s, 1);
+		if (rc < 0) {
+			perror("Could not listen");
+			error = rc;
+			goto done_close_s;
+		}
+
+		/* accept() reads client_len as the size of client_sa. */
+		client_len = sizeof(client_sa);
+		con_socket = accept(s, (struct sockaddr *)&client_sa,
+		    &client_len);
+		if (con_socket < 0) {
+			EPRINTF("Could not accept connection");
+			error = -1;
+			goto done_close_s;
+		}
+		*socket_fd = s;
+		*connection_socket_fd = con_socket;
+		break;
+	default:
+		EPRINTF("unknown operation request");
+		error = -1;
+		goto done_close_s;
+	}
+
+	return (0);
+
+done_close_s:
+	close(s);
+	return (error);
+}
+
+/*
+ * Migrate the guest to the host described by req.
+ *
+ * After connecting and checking that the two systems are compatible, the
+ * migration type is sent to the destination.  A live migration is handed
+ * over to live_migrate_send().  A warm migration pauses the vcpus and the
+ * devices, sends the memory, the kernel structures and the device state,
+ * and waits for the destination to confirm.
+ *
+ * When the migration succeeds the guest is destroyed and the process
+ * exits, so this function only returns on failure: the guest is resumed,
+ * the socket is closed and a non-zero value is returned.
+ */
+int
+vm_send_migrate_req(struct vmctx *ctx, struct migrate_req req, bool live)
+{
+	int s;
+	int rc, error, migration_type;
+	size_t migration_completed;
+
+	rc = migrate_connections(req, &s, NULL, MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		EPRINTF("Could not create connection");
+		return (-1);
+	}
+
+	rc = migration_check_specs(s, MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		EPRINTF("Error while checking system requirements");
+		error = rc;
+		goto done;
+	}
+
+	migration_type = live;
+	rc = migration_transfer_data(s, &migration_type,
+	    sizeof(migration_type), MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send migration type");
+		error = -1;
+		goto done;
+	}
+
+	if (live) {
+		rc = live_migrate_send(ctx, s);
+		if (rc != 0) {
+			EPRINTF("Could not live migrate the guest's memory");
+			error = rc;
+		} else {
+			error = 0;
+		}
+		goto done;
+	} /* else continue the warm migration procedure */
+
+	vm_vcpu_pause(ctx);
+
+	rc = vm_pause_user_devs(ctx);
+	if (rc != 0) {
+		EPRINTF("Could not pause devices");
+		error = rc;
+		goto unlock_vm_and_exit;
+	}
+
+	rc = migrate_send_memory(ctx, s);
+	if (rc != 0) {
+		EPRINTF("Could not send memory to destination");
+		error = rc;
+		goto unlock_vm_and_exit;
+	}
+
+	rc = migrate_kern_data(ctx, s, MIGRATION_SEND_REQ);
+	if (rc != 0) {
+		EPRINTF("Could not send kern data to destination");
+		error = rc;
+		goto unlock_vm_and_exit;
+	}
+
+	rc = migrate_devs(ctx, s, MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		EPRINTF("Could not send pci devs to destination");
+		error = rc;
+		goto unlock_vm_and_exit;
+	}
+
+	rc = migration_transfer_data(s, &migration_completed,
+	    sizeof(migration_completed), MIGRATION_RECV_REQ);
+	if ((rc < 0) || (migration_completed != MIGRATION_SPECS_OK)) {
+		EPRINTF("Could not recv migration completed remote or received error");
+		error = -1;
+		goto unlock_vm_and_exit;
+	}
+
+	/* The guest now lives on the destination host. */
+	vm_destroy(ctx);
+	exit(0);
+
+unlock_vm_and_exit:
+	vm_vcpu_resume(ctx);
+
+	rc = vm_resume_user_devs(ctx);
+	if (rc != 0)
+		EPRINTF("Could not resume devices");
+done:
+	close(s);
+	return (error);
+}
+
+/*
+ * Receive a guest migrated from another host.
+ *
+ * Waits for the source to connect on req.port, checks that the two systems
+ * are compatible, receives the migration type and then the guest memory
+ * (live or warm procedure), the kernel structures and the device state.
+ * Finally the source is told that the migration completed.
+ *
+ * Both sockets are closed before returning.  Returns 0 on success and a
+ * non-zero value on failure.
+ */
+int
+vm_recv_migrate_req(struct vmctx *ctx, struct migrate_req req)
+{
+	int s, con_socket;
+	int rc, error;
+	int migration_type;
+	size_t migration_completed;
+
+	rc = migrate_connections(req, &s, &con_socket, MIGRATION_RECV_REQ);
+	if (rc != 0) {
+		EPRINTF("Could not create connections");
+		return (-1);
+	}
+
+	rc = migration_check_specs(con_socket, MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		EPRINTF("Error while checking specs");
+		error = rc;
+		goto done;
+	}
+
+	rc = migration_transfer_data(con_socket, &migration_type,
+	    sizeof(migration_type), MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		EPRINTF("Could not recv migration type");
+		error = -1;
+		goto done;
+	}
+
+	/*
+	 * For recv, the only difference between warm and live migration is
+	 * the way in which the memory is migrated.
+	 */
+	if (migration_type) {
+		rc = live_migrate_recv(ctx, con_socket);
+		if (rc != 0) {
+			EPRINTF("Could not live migrate the guest's memory");
+			error = rc;
+			goto done;
+		}
+	} else {
+		/* if not live migration, then migrate memory normally. */
+		rc = migrate_recv_memory(ctx, con_socket);
+		if (rc < 0) {
+			EPRINTF("Could not recv lowmem and highmem");
+			error = -1;
+			goto done;
+		}
+	}
+
+	rc = migrate_kern_data(ctx, con_socket, MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		EPRINTF("Could not recv kern data");
+		error = -1;
+		goto done;
+	}
+
+	rc = migrate_devs(ctx, con_socket, MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		EPRINTF("Could not recv pci devs");
+		error = -1;
+		goto done;
+	}
+
+	fprintf(stdout, "%s: Migration completed\r\n", __func__);
+	migration_completed = MIGRATION_SPECS_OK;
+	rc = migration_transfer_data(con_socket, &migration_completed,
+	    sizeof(migration_completed), MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		EPRINTF("Could not send migration completed remote");
+		error = -1;
+		goto done;
+	}
+
+	error = 0;
+done:
+	close(con_socket);
+	close(s);
+	return (error);
+}
+
diff --git a/usr.sbin/bhyve/snapshot.h b/usr.sbin/bhyve/snapshot.h
--- a/usr.sbin/bhyve/snapshot.h
+++ b/usr.sbin/bhyve/snapshot.h
@@ -44,9 +44,13 @@
#define BHYVE_RUN_DIR "/var/run/bhyve/"
#define MAX_SNAPSHOT_FILENAME PATH_MAX
+#define MAX_HOSTNAME_LEN 255
+#define DEFAULT_MIGRATION_PORT 24983
struct vmctx;
+#define SNAPSHOT_BUFFER_SIZE (20 * MB)
+
struct restore_state {
int kdata_fd;
int vmmem_fd;
@@ -60,15 +64,23 @@
ucl_object_t *meta_root_obj;
};
+struct __attribute__((packed)) migrate_req { /* destination of a migration; embedded in struct checkpoint_op */
+ char host[MAX_HOSTNAME_LEN]; /* destination host string; resolved with getaddrinfo() by the migration code */
+ unsigned int port; /* destination TCP port, in host byte order (converted with htons() when used) */
+};
+
/* Filename that will be used for save/restore */
struct checkpoint_op {
 char snapshot_filename[MAX_SNAPSHOT_FILENAME];
+ struct migrate_req migrate_req; /* destination host and port, read for START_MIGRATE and START_MIGRATE_LIVE */
};
/* Messages that a bhyve process understands. */
enum ipc_opcode {
 START_CHECKPOINT,
 START_SUSPEND,
+ START_MIGRATE, /* warm migration: handled with vm_send_migrate_req(ctx, req, false) */
+ START_MIGRATE_LIVE, /* live migration: handled with vm_send_migrate_req(ctx, req, true); refused unless VM_MEM_F_WIRED is set */
};
/*
@@ -88,8 +100,10 @@
struct checkpoint_thread_info {
struct vmctx *ctx;
int socket_fd;
+ struct sockaddr_un *addr;
};
+const char **get_pci_devs(int *);
typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *);
typedef int (*vm_pause_dev_cb) (struct vmctx *, const char *);
typedef int (*vm_resume_dev_cb) (struct vmctx *, const char *);
@@ -106,6 +120,9 @@
enum snapshot_req req; /* request type */
};
+const struct vm_snapshot_dev_info *get_snapshot_devs(int *ndevs);
+const struct vm_snapshot_kern_info *get_snapshot_kern_structs(int *ndevs);
+
void destroy_restore_state(struct restore_state *rstate);
const char *lookup_vmname(struct restore_state *rstate);
@@ -116,6 +133,8 @@
void checkpoint_cpu_add(int vcpu);
void checkpoint_cpu_resume(int vcpu);
void checkpoint_cpu_suspend(int vcpu);
+void vm_vcpu_pause(struct vmctx *ctx);
+void vm_vcpu_resume(struct vmctx *ctx);
int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate);
int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate);
diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c
--- a/usr.sbin/bhyve/snapshot.c
+++ b/usr.sbin/bhyve/snapshot.c
@@ -31,6 +31,8 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
+ *
+ * $FreeBSD$
*/
#include <sys/cdefs.h>
@@ -85,6 +87,7 @@
#include "ioapic.h"
#include "mem.h"
#include "mevent.h"
+#include "migration.h"
#include "mptbl.h"
#include "pci_emul.h"
#include "pci_irq.h"
@@ -166,6 +169,24 @@
{ "vrtc", STRUCT_VRTC },
};
+/*
+ * Return the table of devices that support snapshotting.  When ndevs is
+ * not NULL it receives the number of entries in the table.
+ */
+const struct vm_snapshot_dev_info *
+get_snapshot_devs(int *ndevs)
+{
+	if (ndevs == NULL)
+		return (snapshot_devs);
+
+	*ndevs = nitems(snapshot_devs);
+	return (snapshot_devs);
+}
+
+/*
+ * Return the table of kernel structures that are part of a snapshot.  When
+ * ndevs is not NULL it receives the number of entries in the table.
+ */
+const struct vm_snapshot_kern_info *
+get_snapshot_kern_structs(int *ndevs)
+{
+	if (ndevs == NULL)
+		return (snapshot_kern_structs);
+
+	*ndevs = nitems(snapshot_kern_structs);
+	return (snapshot_kern_structs);
+}
+
static cpuset_t vcpus_active, vcpus_suspended;
static pthread_mutex_t vcpu_lock;
static pthread_cond_t vcpus_idle, vcpus_can_run;
@@ -1301,7 +1322,7 @@
pthread_mutex_unlock(&vcpu_lock);
}
-static void
+void
vm_vcpu_pause(struct vmctx *ctx)
{
@@ -1313,7 +1334,7 @@
pthread_mutex_unlock(&vcpu_lock);
}
-static void
+void
vm_vcpu_resume(struct vmctx *ctx)
{
@@ -1444,6 +1465,8 @@
handle_message(struct ipc_message *imsg, struct vmctx *ctx)
{
int err;
+ struct migrate_req req;
+ int memflags;
switch (imsg->code) {
case START_CHECKPOINT:
@@ -1451,6 +1474,51 @@
break;
case START_SUSPEND:
err = vm_checkpoint(ctx, imsg->data.op.snapshot_filename, true);
+ break;
+ case START_MIGRATE:
+ fprintf(stdout, "Starting the warm migration procedure\r\n");
+ memset(&req, 0, sizeof(struct migrate_req));
+ req.port = imsg->data.op.migrate_req.port;
+ memcpy(req.host, imsg->data.op.migrate_req.host, MAX_HOSTNAME_LEN);
+ req.host[MAX_HOSTNAME_LEN - 1] = 0;
+ fprintf(stderr, "%s: IP address used for migration: %s;\r\n"
+ "Port used for migration: %d\r\n",
+ __func__,
+ req.host,
+ req.port);
+
+ err = vm_send_migrate_req(ctx, req, false);
+ break;
+ case START_MIGRATE_LIVE:
+ fprintf(stdout, "Starting the live migration procedure\r\n");
+
+ /* Currently, the live migration is implemented only
+ * for guests that are started using -S (wired
+ * memory option).
+ */
+
+ /* Check memflags. If the VM_MEM_F_WIRED bit is not
+ * set, then the live migration procedure cannot be
+ * done. */
+ memflags = vm_get_memflags(ctx);
+ if (!(memflags & VM_MEM_F_WIRED)) {
+ fprintf(stderr, "%s: Migration not supported for un-wired guests\r\n", __func__);
+ err = -1;
+ break;
+ }
+
+ memset(&req, 0, sizeof(struct migrate_req));
+ req.port = imsg->data.op.migrate_req.port;
+ memcpy(req.host, imsg->data.op.migrate_req.host, MAX_HOSTNAME_LEN);
+ req.host[MAX_HOSTNAME_LEN - 1] = 0;
+ fprintf(stderr, "%s: IP address used for migration: %s;\r\n"
+ "Port used for migration: %d\r\n",
+ __func__,
+ req.host,
+ req.port);
+
+ err = vm_send_migrate_req(ctx, req, true);
+
break;
default:
EPRINTLN("Unrecognized checkpoint operation\n");
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
--- a/usr.sbin/bhyvectl/bhyvectl.c
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -90,6 +90,8 @@
#ifdef BHYVE_SNAPSHOT
" [--checkpoint=<filename>]\n"
" [--suspend=<filename>]\n"
+ " [--migrate=<host>,<port>]\n"
+ " [--migrate-live=<host,port>]\n"
#endif
" [--get-all]\n"
" [--get-stats]\n"
@@ -303,6 +305,8 @@
#ifdef BHYVE_SNAPSHOT
static int vm_checkpoint_opt;
static int vm_suspend_opt;
+static int vm_migrate;
+static int vm_migrate_live;
#endif
/*
@@ -594,6 +598,8 @@
#ifdef BHYVE_SNAPSHOT
SET_CHECKPOINT_FILE,
SET_SUSPEND_FILE,
+ MIGRATE_VM,
+ MIGRATE_VM_LIVE,
#endif
};
@@ -1466,6 +1472,8 @@
#ifdef BHYVE_SNAPSHOT
{ "checkpoint", REQ_ARG, 0, SET_CHECKPOINT_FILE},
{ "suspend", REQ_ARG, 0, SET_SUSPEND_FILE},
+ { "migrate", REQ_ARG, 0, MIGRATE_VM},
+ { "migrate-live", REQ_ARG, 0, MIGRATE_VM_LIVE},
#endif
};
@@ -1736,6 +1744,49 @@
return (send_message(ctx, (void *)&imsg, length));
}
+
+/*
+ * Ask the bhyve process behind ctx to start migrating the guest.
+ *
+ * migrate_vm is the option argument: either "<host>,<port>" or just
+ * "<host>", in which case DEFAULT_MIGRATION_PORT is used.  When live is
+ * true the request code is START_MIGRATE_LIVE, otherwise START_MIGRATE.
+ *
+ * The argument is rejected (-1 is returned and a diagnostic is printed)
+ * when the host part is empty or does not fit in MAX_HOSTNAME_LEN bytes
+ * including the terminator, or when the port is not a plain decimal
+ * number in the range 1-65535.  Otherwise the result of send_message()
+ * is returned.
+ */
+static int
+send_start_migrate(struct vmctx *ctx, const char *migrate_vm, bool live)
+{
+	struct ipc_message imsg;
+	const char *sep;
+	char *end;
+	size_t host_len, length;
+	long port;
+
+	/* Zero the whole message so no stack garbage is sent over IPC. */
+	memset(&imsg, 0, sizeof(imsg));
+	imsg.code = live ? START_MIGRATE_LIVE : START_MIGRATE;
+
+	/* The host is everything up to the first ',' (or the whole string). */
+	sep = strchr(migrate_vm, ',');
+	if (sep != NULL)
+		host_len = (size_t)(sep - migrate_vm);
+	else
+		host_len = strlen(migrate_vm);
+
+	/* Refuse to silently truncate: a cut-off name is a different host. */
+	if (host_len == 0 || host_len >= MAX_HOSTNAME_LEN) {
+		fprintf(stderr, "Invalid migration host\r\n");
+		return (-1);
+	}
+
+	/* The buffer is already zeroed, so the copy stays NUL-terminated. */
+	memcpy(imsg.data.op.migrate_req.host, migrate_vm, host_len);
+
+	if (sep != NULL) {
+		/*
+		 * strtol() leaves end == start when no digits were consumed
+		 * (e.g. "host,"), and clamps on overflow, which the range
+		 * check below also rejects.
+		 */
+		port = strtol(sep + 1, &end, 10);
+		if (end == sep + 1 || *end != '\0' ||
+		    port < 1 || port > 65535) {
+			fprintf(stderr, "Could not parse the port\r\n");
+			return (-1);
+		}
+		imsg.data.op.migrate_req.port = (int)port;
+	} else {
+		/* Only the host was supplied; fall back to the default port. */
+		imsg.data.op.migrate_req.port = DEFAULT_MIGRATION_PORT;
+	}
+
+	length = offsetof(struct ipc_message, data) + sizeof(imsg.data.op);
+
+	return (send_message(ctx, (void *)&imsg, length));
+}
#endif
int
@@ -1755,7 +1806,7 @@
struct tm tm;
struct option *opts;
#ifdef BHYVE_SNAPSHOT
- char *checkpoint_file, *suspend_file;
+ char *checkpoint_file, *suspend_file, *migrate_host;
#endif
cpu_intel = cpu_vendor_intel();
@@ -1924,6 +1975,14 @@
vm_suspend_opt = 1;
suspend_file = optarg;
break;
+ case MIGRATE_VM:
+ vm_migrate = 1;
+ migrate_host = optarg;
+ break;
+ case MIGRATE_VM_LIVE:
+ vm_migrate_live = 1;
+ migrate_host = optarg;
+ break;
#endif
default:
usage(cpu_intel);
@@ -2402,6 +2461,12 @@
if (!error && vm_suspend_opt)
error = snapshot_request(ctx, suspend_file, START_SUSPEND);
+
+ if (!error && vm_migrate)
+ error = send_start_migrate(ctx, migrate_host, false);
+
+ if (!error && vm_migrate_live)
+ error = send_start_migrate(ctx, migrate_host, true);
#endif
free (opts);

File Metadata

Mime Type
text/plain
Expires
Mon, Jun 29, 10:43 AM (10 h, 9 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
34457398
Default Alt Text
D30954.diff (66 KB)

Event Timeline