Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F160835184
D30954.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
66 KB
Referenced Files
None
Subscribers
None
D30954.diff
View Options
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -262,4 +262,16 @@
int vm_snapshot_req(struct vm_snapshot_meta *meta);
int vm_restore_time(struct vmctx *ctx);
+/*
+ * Report the guest's low/high memory sizes in pages; either output pointer
+ * may be NULL.  Returns -1 when ctx is NULL, 0 otherwise.
+ */
+int vm_get_pages_num(struct vmctx *ctx, size_t *lowmem_pages,
+ size_t *highmem_pages);
+/*
+ * Fill in the guest-physical [start, end) ranges of the lowmem segment
+ * (starting at 0) and the highmem segment (starting at 4GB).
+ */
+int vm_set_vmm_migration_segments(struct vmctx *ctx,
+ struct vmm_migration_segment *lowmem,
+ struct vmm_migration_segment *highmem);
+/*
+ * Issue VM_GET_DIRTY_PAGE_LIST; page_list receives one byte per guest page.
+ * The definition names the last parameter num_pages.
+ */
+int vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num);
+
+/*
+ * Issue VM_COPY_VMM_PAGES for at most VMM_PAGE_CHUNK pages described by
+ * pages_req.
+ */
+int vm_copy_vmm_pages(struct vmctx *ctx,
+ struct vmm_migration_pages_req *pages_req);
+/*
+ * Set the segment bounds in req and allocate a PAGE_SIZE buffer for each of
+ * its VMM_PAGE_CHUNK page slots.
+ */
+int vm_init_vmm_migration_pages_req(struct vmctx *ctx,
+ struct vmm_migration_pages_req *req);
+
#endif /* _VMMAPI_H_ */
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -1630,6 +1630,116 @@
return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
+/*
+ * Convert the guest memory sizes stored in the context into page counts.
+ *
+ * Each output pointer is optional; a NULL pointer is simply not written.
+ * Returns 0 on success and -1 when no context was supplied.
+ */
+int
+vm_get_pages_num(struct vmctx *ctx, size_t *lowmem_pages, size_t *highmem_pages)
+{
+
+	if (ctx != NULL) {
+		if (lowmem_pages != NULL)
+			*lowmem_pages = ctx->lowmem / PAGE_SIZE;
+		if (highmem_pages != NULL)
+			*highmem_pages = ctx->highmem / PAGE_SIZE;
+		return (0);
+	}
+
+	/* A context is mandatory. */
+	return (-1);
+}
+
+/*
+ * Fill in the guest-physical ranges of the two guest memory segments.
+ *
+ * lowmem (optional) receives [0, ctx->lowmem).  highmem (optional) receives
+ * [4GB, 4GB + ctx->highmem) when the guest has high memory and the empty
+ * range [0, 0) otherwise, so the caller's structure never carries
+ * uninitialized bounds into the kernel.
+ *
+ * Returns 0 on success and -1 when ctx is NULL.
+ */
+int
+vm_set_vmm_migration_segments(struct vmctx *ctx,
+    struct vmm_migration_segment *lowmem,
+    struct vmm_migration_segment *highmem)
+{
+
+	/* Same contract as vm_get_pages_num(): a context is mandatory. */
+	if (ctx == NULL)
+		return (-1);
+
+	if (lowmem != NULL) {
+		lowmem->start = 0;
+		lowmem->end = ctx->lowmem;
+	}
+
+	if (highmem != NULL) {
+		if (ctx->highmem != 0) {
+			highmem->start = 4 * GB;
+			highmem->end = 4 * GB + ctx->highmem;
+		} else {
+			/* No high memory: publish an explicit empty range. */
+			highmem->start = 0;
+			highmem->end = 0;
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Ask the kernel which guest pages are dirty.
+ *
+ * page_list is a caller-owned buffer of num_pages bytes; the kernel stores
+ * one byte per guest page it finds resident.  The segment bounds are
+ * derived from ctx before the request is issued.
+ *
+ * Returns the result of the VM_GET_DIRTY_PAGE_LIST ioctl, or the non-zero
+ * error from vm_set_vmm_migration_segments() if the bounds could not be set.
+ */
+int
+vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num_pages)
+{
+	struct vm_get_dirty_page_list list;
+	int error;
+
+	bzero(&list, sizeof(list));
+	list.page_list = (uint8_t *)page_list;
+	list.num_pages = num_pages;
+
+	/* Do not issue the ioctl with unset segment bounds. */
+	error = vm_set_vmm_migration_segments(ctx, &(list.lowmem),
+	    &(list.highmem));
+	if (error != 0)
+		return (error);
+
+	return (ioctl(ctx->fd, VM_GET_DIRTY_PAGE_LIST, &list));
+}
+
+/*
+ * Transfer up to VMM_PAGE_CHUNK guest pages between the kernel and the
+ * buffers referenced by pages_req.
+ *
+ * Returns -1 when pages_req is NULL or one of the requested slots has no
+ * buffer, E2BIG when more than VMM_PAGE_CHUNK pages are requested, and
+ * otherwise the result of the VM_COPY_VMM_PAGES ioctl.
+ */
+int
+vm_copy_vmm_pages(struct vmctx *ctx, struct vmm_migration_pages_req *pages_req)
+{
+	struct vmm_migration_page *slot, *last;
+
+	if (pages_req == NULL)
+		return (-1);
+	if (pages_req->pages_required > VMM_PAGE_CHUNK)
+		return (E2BIG);
+
+	last = &pages_req->pages[pages_req->pages_required];
+	for (slot = pages_req->pages; slot != last; slot++) {
+		if (slot->page == NULL)
+			return (-1);
+		/* Buffers that will receive guest data start out zeroed. */
+		if (pages_req->req_type == VMM_GET_PAGES)
+			memset(slot->page, 0, PAGE_SIZE);
+	}
+
+	return (ioctl(ctx->fd, VM_COPY_VMM_PAGES, pages_req));
+}
+
+/*
+ * Prepare a page-copy request: set the segment bounds from ctx and give
+ * each of the VMM_PAGE_CHUNK page slots a zero-filled PAGE_SIZE buffer.
+ *
+ * The buffers are owned by the caller after a successful return.  On
+ * failure every buffer allocated by this call is released again and its
+ * slot is reset to NULL; slots that were never reached are left untouched.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int
+vm_init_vmm_migration_pages_req(struct vmctx *ctx,
+    struct vmm_migration_pages_req *req)
+{
+	size_t index;
+
+	if (vm_set_vmm_migration_segments(ctx, &(req->lowmem_segment),
+	    &(req->highmem_segment)) != 0)
+		return (-1);
+
+	for (index = 0; index < VMM_PAGE_CHUNK; index++) {
+		req->pages[index].page = calloc(PAGE_SIZE, sizeof(uint8_t));
+		if (req->pages[index].page == NULL)
+			goto deallocate_error;
+	}
+
+	return (0);
+
+deallocate_error:
+	/*
+	 * Only slots below 'index' hold buffers from this call.  Slots above
+	 * it may still contain whatever the caller's memory held, so they
+	 * must not be inspected or freed.
+	 */
+	while (index > 0) {
+		index--;
+		free(req->pages[index].page);
+		req->pages[index].page = NULL;
+	}
+
+	return (-1);
+}
+
int
vm_set_topology(struct vmctx *ctx,
uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -35,6 +35,8 @@
#include <x86/segments.h>
struct vm_snapshot_meta;
+struct vm_get_dirty_page_list;
+struct vmm_migration_pages_req;
#ifdef _KERNEL
SDT_PROVIDER_DECLARE(vmm);
@@ -287,6 +289,8 @@
void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip);
int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta);
int vm_restore_time(struct vm *vm);
+int vm_get_dirty_page_list(struct vm *vm, struct vm_get_dirty_page_list *list);
+int vm_copy_vmm_pages(struct vm *vm, struct vmm_migration_pages_req *pages_req);
#ifdef _SYS__CPUSET_H_
/*
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -31,6 +31,8 @@
#ifndef _VMM_DEV_H_
#define _VMM_DEV_H_
+#include "vmm_migration.h"
+
struct vm_snapshot_meta;
#ifdef _KERNEL
@@ -256,6 +258,13 @@
};
_Static_assert(sizeof(struct vm_readwrite_kernemu_device) == 24, "ABI");
+/*
+ * Argument of the VM_GET_DIRTY_PAGE_LIST ioctl.
+ */
+struct vm_get_dirty_page_list {
+ /* User buffer; the kernel copies out one byte per resident guest page. */
+ uint8_t *page_list;
+ /* Set by libvmmapi from its num_pages argument. */
+ size_t num_pages;
+ /* Segment bounds; compared against the guest map entries' start/end. */
+ struct vmm_migration_segment lowmem;
+ struct vmm_migration_segment highmem;
+};
+
enum {
/* general routines */
IOCNUM_ABIVERS = 0,
@@ -344,7 +353,9 @@
/* checkpoint */
IOCNUM_SNAPSHOT_REQ = 113,
- IOCNUM_RESTORE_TIME = 115
+ IOCNUM_RESTORE_TIME = 115,
+ IOCNUM_VM_GET_DIRTY_PAGE_LIST = 117,
+ IOCNUM_VM_COPY_VMM_PAGES = 118,
};
#define VM_RUN \
@@ -475,4 +486,8 @@
_IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta)
#define VM_RESTORE_TIME \
_IOWR('v', IOCNUM_RESTORE_TIME, int)
+#define VM_GET_DIRTY_PAGE_LIST \
+ _IOWR('v', IOCNUM_VM_GET_DIRTY_PAGE_LIST, struct vm_get_dirty_page_list)
+#define VM_COPY_VMM_PAGES \
+ _IOWR('v', IOCNUM_VM_COPY_VMM_PAGES, struct vmm_migration_pages_req)
#endif
diff --git a/sys/amd64/include/vmm_migration.h b/sys/amd64/include/vmm_migration.h
new file mode 100644
--- /dev/null
+++ b/sys/amd64/include/vmm_migration.h
@@ -0,0 +1,66 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+
+#ifndef _VMM_MIGRATION_H_
+#define _VMM_MIGRATION_H_
+
+/*
+ * Number of page slots in a struct vmm_migration_pages_req, i.e. the most
+ * pages a single request can describe.
+ */
+#define VMM_PAGE_CHUNK 10
+
+/*
+ * Direction of a page-copy request.
+ */
+enum migration_req_type {
+ /* Copy guest pages out into the request's buffers. */
+ VMM_GET_PAGES = 0,
+ /* Copy the request's buffers into guest pages. */
+ VMM_SET_PAGES = 1,
+};
+
+/*
+ * One page slot of a page-copy request.
+ */
+struct vmm_migration_page {
+ /* Page index; indices at or above 3GB / PAGE_SIZE select highmem. */
+ vm_pindex_t pindex;
+ /* Buffer that PAGE_SIZE bytes are copied to or from. */
+ uint8_t *page;
+};
+
+/*
+ * A bhyve guest has two memory segments:
+ * - lowmem segment: mapped from 0GB to 3GB (which is lowmem_limit)
+ * - highmem segment: mapped starting from 4GB
+ * The object that represents a segment is identified by start and end values.
+ */
+struct vmm_migration_segment {
+ /* First guest-physical address of the segment. */
+ vm_offset_t start;
+ /* Guest-physical address one past the last byte of the segment. */
+ vm_offset_t end;
+};
+
+/*
+ * Argument of the VM_COPY_VMM_PAGES ioctl.
+ */
+struct vmm_migration_pages_req {
+ /* Number of leading entries of pages[] that are in use. */
+ size_t pages_required;
+ /* Whether pages are read from or written to the guest. */
+ enum migration_req_type req_type;
+ /* Segment bounds; compared against the guest map entries' start/end. */
+ struct vmm_migration_segment lowmem_segment;
+ struct vmm_migration_segment highmem_segment;
+ struct vmm_migration_page pages[VMM_PAGE_CHUNK];
+};
+
+#endif
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -65,6 +65,7 @@
#include <machine/pcb.h>
#include <machine/smp.h>
#include <machine/md_var.h>
+#include <machine/vmparam.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <x86/ifunc.h>
@@ -73,6 +74,7 @@
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_snapshot.h>
+#include <machine/vmm_migration.h>
#include "vmm_ioport.h"
#include "vmm_ktr.h"
@@ -146,6 +148,9 @@
};
#define VM_MAX_MEMMAPS 8
+#define MB (1024UL * 1024)
+#define GB (1024UL * MB)
+
/*
* Initialization:
* (o) initialized the first time the VM is created
@@ -2949,4 +2954,191 @@
return (0);
}
-#endif
+
+/*
+ * Store the migration-dirty state of every resident page of 'object' whose
+ * index lies in [start / PAGE_SIZE, end / PAGE_SIZE) into the user buffer
+ * 'page_list'.  The byte for page 'pindex' lives at page_list[pindex - offset].
+ *
+ * 'num_pages' is the size of the user buffer in bytes.  An index that would
+ * fall outside of it (including an underflowing pindex - offset) is
+ * rejected instead of being written.
+ *
+ * Returns 0 on success, EINVAL for an out-of-range slot, or the error
+ * reported by copyout().
+ */
+static inline int
+vm_search_dirty_pages_in_object(vm_object_t object, size_t start, size_t end,
+    size_t offset, uint8_t *page_list, size_t num_pages)
+{
+	vm_pindex_t pindex;
+	vm_page_t m;
+	size_t slot;
+	uint8_t result;
+	int error;
+
+	for (pindex = start / PAGE_SIZE; pindex < end / PAGE_SIZE; pindex++) {
+		if (pindex < offset)
+			return (EINVAL);
+		slot = pindex - offset;
+		if (slot >= num_pages)
+			return (EINVAL);
+
+		VM_OBJECT_WLOCK(object);
+		m = vm_page_lookup(object, pindex);
+		VM_OBJECT_WUNLOCK(object);
+		if (m == NULL)
+			continue;
+
+		result = vm_page_test_vmm_dirty(m);
+		error = copyout(&result, page_list + slot, sizeof(result));
+		if (error != 0)
+			return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Handler for VM_GET_DIRTY_PAGE_LIST: walk the guest's vm_map, locate the
+ * entries whose bounds equal the lowmem and highmem segments in 'list' and
+ * report the dirty state of their resident pages to list->page_list.
+ *
+ * Lowmem pages are stored starting at slot 0.  Highmem slots are shifted
+ * down by the gap between the end of lowmem and the start of highmem.
+ *
+ * Returns 0 on success or an errno value.  Errors are reported as EINVAL
+ * rather than -1 because -1 is ERESTART inside the kernel and would make
+ * the ioctl restart.
+ */
+int
+vm_get_dirty_page_list(struct vm *vm, struct vm_get_dirty_page_list *list)
+{
+	struct vmspace *vm_vmspace;
+	struct vm_map *vmmap;
+	struct vm_map_entry *entry;
+	struct vm_object *object;
+	uint8_t *page_list;
+	size_t offset;
+	int error;
+
+	page_list = list->page_list;
+	if (page_list == NULL)
+		return (EINVAL);
+
+	vm_vmspace = vm->vmspace;
+	if (vm_vmspace == NULL) {
+		printf("%s: vm_vmspace is null\r\n", __func__);
+		return (EINVAL);
+	}
+
+	vmmap = &vm_vmspace->vm_map;
+	error = 0;
+
+	vm_map_lock(vmmap);
+	if (vmmap->busy)
+		vm_map_wait_busy(vmmap);
+
+	for (entry = vmmap->header.right; entry != &vmmap->header;
+	    entry = entry->right) {
+		object = entry->object.vm_object;
+		if (object == NULL)
+			continue;
+
+		/* Entry backing the lowmem segment. */
+		if (entry->start == list->lowmem.start &&
+		    entry->end == list->lowmem.end) {
+			error = vm_search_dirty_pages_in_object(object,
+			    list->lowmem.start, list->lowmem.end, 0,
+			    page_list, list->num_pages);
+			if (error != 0)
+				break;
+		}
+
+		/* Entry backing the highmem segment. */
+		if (entry->start == list->highmem.start &&
+		    entry->end == list->highmem.end) {
+			offset = (list->highmem.start - list->lowmem.end) /
+			    PAGE_SIZE;
+			error = vm_search_dirty_pages_in_object(object,
+			    list->highmem.start, list->highmem.end, offset,
+			    page_list, list->num_pages);
+			if (error != 0)
+				break;
+		}
+	}
+
+	vm_map_unlock(vmmap);
+
+	return (error);
+}
+
+/*
+ * Carry out a page-copy request against the guest's memory objects.
+ *
+ * Each of the first page_req->pages_required slots is served from
+ * lowmem_object when its index is below 3GB / PAGE_SIZE and from
+ * highmem_object otherwise; highmem indices are shifted up by
+ * 1GB / PAGE_SIZE before the lookup.  Processing stops silently at the
+ * first slot that needs a missing highmem object or when the request type
+ * is unknown.  Results of the individual page copies are not checked.
+ */
+static inline void
+vm_copy_object_pages(vm_object_t lowmem_object, vm_object_t highmem_object,
+    struct vmm_migration_pages_req *page_req)
+{
+	const size_t first_high_page = 3UL * GB / PAGE_SIZE;
+	const enum migration_req_type kind = page_req->req_type;
+	struct vmm_migration_page *slot;
+	vm_object_t target;
+	vm_pindex_t target_pindex;
+	size_t i;
+
+	if (lowmem_object == NULL) {
+		printf("%s: lowmem_object is NULL\r\n", __func__);
+		return;
+	}
+
+	for (i = 0; i < page_req->pages_required; i++) {
+		slot = &page_req->pages[i];
+
+		if (slot->pindex < first_high_page) {
+			target = lowmem_object;
+			target_pindex = slot->pindex;
+		} else if (highmem_object != NULL) {
+			target = highmem_object;
+			target_pindex = slot->pindex + 1UL * GB / PAGE_SIZE;
+		} else {
+			printf("%s: highmem_object is NULL\r\n", __func__);
+			return;
+		}
+
+		switch (kind) {
+		case VMM_GET_PAGES:
+			VM_OBJECT_WLOCK(target);
+			vm_object_get_page(target, target_pindex,
+			    (void *)slot->page);
+			VM_OBJECT_WUNLOCK(target);
+			break;
+		case VMM_SET_PAGES:
+			VM_OBJECT_WLOCK(target);
+			vm_object_set_page(target, target_pindex,
+			    (void *)slot->page);
+			VM_OBJECT_WUNLOCK(target);
+			break;
+		default:
+			return;
+		}
+	}
+}
+
+/*
+ * Handler for VM_COPY_VMM_PAGES: locate the guest memory objects whose map
+ * entries match the lowmem and highmem segments in 'pages_req' and copy the
+ * requested pages to or from them.
+ *
+ * Returns 0 on success, E2BIG when more than VMM_PAGE_CHUNK pages are
+ * requested, and EINVAL when the VM has no vmspace or no map entry matches
+ * the lowmem segment.  Errors are never reported as -1 because -1 is
+ * ERESTART inside the kernel and would make the ioctl restart.
+ */
+int
+vm_copy_vmm_pages(struct vm *vm, struct vmm_migration_pages_req *pages_req)
+{
+	int error = 0;
+	struct vmspace *vm_vmspace;
+	struct vm_map *vmmap;
+	struct vm_map_entry *entry;
+	struct vm_object *lowmem_object, *highmem_object, *object;
+	struct vmm_migration_segment lowmem_segment, highmem_segment;
+
+	/*
+	 * pages_required comes straight from userspace and is used to index
+	 * pages[]; refuse anything larger than the array.
+	 */
+	if (pages_req->pages_required > VMM_PAGE_CHUNK)
+		return (E2BIG);
+
+	lowmem_segment = pages_req->lowmem_segment;
+	highmem_segment = pages_req->highmem_segment;
+	vm_vmspace = vm->vmspace;
+
+	if (vm_vmspace == NULL) {
+		printf("%s: vm_vmspace is null\r\n", __func__);
+		return (EINVAL);
+	}
+
+	vmmap = &vm_vmspace->vm_map;
+
+	vm_map_lock(vmmap);
+	if (vmmap->busy)
+		vm_map_wait_busy(vmmap);
+
+	lowmem_object = NULL;
+	highmem_object = NULL;
+	for (entry = vmmap->header.right; entry != &vmmap->header;
+	    entry = entry->right) {
+		object = entry->object.vm_object;
+
+		if (entry->start == lowmem_segment.start &&
+		    entry->end == lowmem_segment.end) {
+			lowmem_object = object;
+		}
+
+		if (entry->start == highmem_segment.start &&
+		    entry->end == highmem_segment.end) {
+			highmem_object = object;
+		}
+	}
+
+	if (lowmem_object == NULL)
+		error = EINVAL;
+	else
+		vm_copy_object_pages(lowmem_object, highmem_object, pages_req);
+
+	/* Every path that took the map lock releases it here. */
+	vm_map_unlock(vmmap);
+
+	return (error);
+}
+#endif /* BHYVE_SNAPSHOT */
+
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -59,6 +59,7 @@
#include <machine/vmm_instruction_emul.h>
#include <machine/vmm_snapshot.h>
#include <x86/apicreg.h>
+#include <machine/vmm_migration.h>
#include "vmm_lapic.h"
#include "vmm_stat.h"
@@ -388,6 +389,8 @@
int *regnums;
#ifdef BHYVE_SNAPSHOT
struct vm_snapshot_meta *snapshot_meta;
+ struct vm_get_dirty_page_list *page_list;
+ struct vmm_migration_pages_req *pages_req;
#endif
error = vmm_priv_check(curthread->td_ucred);
@@ -871,6 +874,14 @@
case VM_RESTORE_TIME:
error = vm_restore_time(sc->vm);
break;
+ case VM_GET_DIRTY_PAGE_LIST:
+ page_list = (struct vm_get_dirty_page_list *)data;
+ error = vm_get_dirty_page_list(sc->vm, page_list);
+ break;
+ case VM_COPY_VMM_PAGES:
+ pages_req = (struct vmm_migration_pages_req *)data;
+ error = vm_copy_vmm_pages(sc->vm, pages_req);
+ break;
#endif
default:
error = ENOTTY;
diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h
--- a/sys/vm/vm_object.h
+++ b/sys/vm/vm_object.h
@@ -390,6 +390,8 @@
void vm_object_unwire(vm_object_t object, vm_ooffset_t offset,
vm_size_t length, uint8_t queue);
struct vnode *vm_object_vnode(vm_object_t object);
+int vm_object_get_page(vm_object_t object, vm_pindex_t pindex, void *dst);
+int vm_object_set_page(vm_object_t object, vm_pindex_t pindex, void *src);
#endif /* _KERNEL */
#endif /* _VM_OBJECT_ */
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -2592,6 +2592,52 @@
CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject",
"List of VM objects");
+/*
+ * Copy the page at 'pindex' of 'object' out to the user buffer 'dst' and
+ * mark it clean for migration purposes.
+ *
+ * The object must be write-locked by the caller.  While the page is
+ * exclusively busied, its VPO_VMM_DIRTY flag and pmap modified state are
+ * cleared and PAGE_SIZE bytes are copied through the direct map.  If the
+ * copy fails the page is flagged VPO_VMM_DIRTY again so that a later pass
+ * still transfers it.
+ *
+ * Returns 0 on success, -1 when the object has no page at 'pindex', or the
+ * error reported by copyout().
+ */
+int
+vm_object_get_page(vm_object_t object, vm_pindex_t pindex, void *dst)
+{
+	vm_page_t page;
+	vm_offset_t page_src;
+	int error;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	page = vm_page_lookup(object, pindex);
+	if (page == NULL) {
+		/* cannot find page */
+		return (-1);
+	}
+
+	vm_page_xbusy(page);
+	page->oflags &= ~VPO_VMM_DIRTY;
+
+	pmap_clear_modify(page);
+
+	page_src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(page));
+	error = copyout((void *)page_src, dst, PAGE_SIZE);
+	if (error != 0)
+		page->oflags |= VPO_VMM_DIRTY;
+	vm_page_xunbusy(page);
+
+	return (error);
+}
+
+/*
+ * Overwrite the page at 'pindex' of 'object' with PAGE_SIZE bytes read from
+ * the user buffer 'src'.
+ *
+ * The object must be write-locked by the caller.  The page is written
+ * through the direct map.
+ *
+ * Returns 0 on success, -1 when the object has no page at 'pindex', or the
+ * error reported by copyin().
+ */
+int
+vm_object_set_page(vm_object_t object, vm_pindex_t pindex, void *src)
+{
+	vm_page_t page;
+	vm_offset_t page_dst;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	page = vm_page_lookup(object, pindex);
+	if (page == NULL) {
+		/* cannot find page */
+		return (-1);
+	}
+
+	page_dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(page));
+
+	return (copyin(src, (void *)page_dst, PAGE_SIZE));
+}
+
#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -295,6 +295,7 @@
#define VPO_SWAPSLEEP 0x02 /* waiting for swap to finish */
#define VPO_UNMANAGED 0x04 /* no PV management for page */
#define VPO_SWAPINPROG 0x08 /* swap I/O in progress on page */
+#define VPO_VMM_DIRTY 0x80 /* dirty bit used for bhyve migration */
/*
* Busy page implementation details.
@@ -693,6 +694,7 @@
void vm_page_valid(vm_page_t m);
int vm_page_is_valid(vm_page_t, int, int);
void vm_page_test_dirty(vm_page_t);
+uint8_t vm_page_test_vmm_dirty(vm_page_t m);
vm_page_bits_t vm_page_bits(int base, int size);
void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid);
void vm_page_free_pages_toq(struct spglist *free, bool update_wire_count);
@@ -890,6 +892,7 @@
vm_page_dirty_KBI(m);
#else
m->dirty = VM_PAGE_BITS_ALL;
+ m->oflags |= VPO_VMM_DIRTY;
#endif
}
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -1483,6 +1483,28 @@
/* Refer to this operation by its public name. */
KASSERT(vm_page_all_valid(m), ("vm_page_dirty: page is invalid!"));
m->dirty = VM_PAGE_BITS_ALL;
+ m->oflags |= VPO_VMM_DIRTY;
+}
+
+/*
+ * Report whether page 'm' is dirty for migration purposes.
+ *
+ * The page's dirty state is first refreshed with vm_page_test_dirty() while
+ * the page is exclusively busied.  The page then counts as dirty when its
+ * VPO_VMM_DIRTY flag is set or when pmap_is_modified() reports it modified.
+ * The object lock is not asserted here.
+ *
+ * Returns 1 for a dirty page and 0 otherwise.
+ */
+uint8_t
+vm_page_test_vmm_dirty(vm_page_t m)
+{
+
+	vm_page_xbusy(m);
+	vm_page_test_dirty(m);
+	vm_page_xunbusy(m);
+
+	if ((m->oflags & VPO_VMM_DIRTY) != 0)
+		return (1);
+
+	/* The flag is clear; fall back to the pmap's view of the page. */
+	if (pmap_is_modified(m))
+		return (1);
+
+	return (0);
}
/*
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
--- a/usr.sbin/bhyve/Makefile
+++ b/usr.sbin/bhyve/Makefile
@@ -34,6 +34,7 @@
kernemu_dev.c \
mem.c \
mevent.c \
+ migration.c \
mptbl.c \
net_backends.c \
net_utils.c \
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -96,6 +96,9 @@
#include "kernemu_dev.h"
#include "mem.h"
#include "mevent.h"
+#ifdef BHYVE_SNAPSHOT
+#include "migration.h"
+#endif
#include "mptbl.h"
#include "pci_emul.h"
#include "pci_irq.h"
@@ -243,6 +246,7 @@
" -p: pin 'vcpu' to 'hostcpu'\n"
#ifdef BHYVE_SNAPSHOT
" -r: path to checkpoint file\n"
+ " -R: <host,port> the source vm host and port for migration\n"
#endif
" -S: guest memory cannot be swapped\n"
" -s: <slot,driver,configinfo> PCI slot config\n"
@@ -1214,10 +1218,12 @@
char *optstr;
#ifdef BHYVE_SNAPSHOT
char *restore_file;
+ char *receive_migration;
struct restore_state rstate;
int vcpu;
restore_file = NULL;
+ receive_migration = NULL;
#endif
init_config();
@@ -1225,7 +1231,7 @@
progname = basename(argv[0]);
#ifdef BHYVE_SNAPSHOT
- optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:r:";
+ optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:r:R:";
#else
optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:U:";
#endif
@@ -1278,6 +1284,9 @@
case 'r':
restore_file = optarg;
break;
+ case 'R':
+ receive_migration = optarg;
+ break;
#endif
case 's':
if (strncmp(optarg, "help", strlen(optarg)) == 0) {
@@ -1494,7 +1503,25 @@
exit(1);
}
}
-#endif
+
+ if (receive_migration != NULL) {
+ if (vm_pause_user_devs(ctx) != 0) {
+ fprintf(stderr, "Failed to pause PCI device state.\n");
+ exit(1);
+ }
+
+ fprintf(stdout, "Starting the migration process...\r\n");
+ if (receive_vm_migration(ctx, receive_migration) != 0) {
+ fprintf(stderr, "Failed to migrate the vm.\r\n");
+ exit(1);
+ }
+
+ if (vm_resume_user_devs(ctx) != 0) {
+ fprintf(stderr, "Failed to resume PCI device state.\n");
+ exit(1);
+ }
+ }
+#endif /* BHYVE_SNAPSHOT */
error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
assert(error == 0);
@@ -1549,7 +1576,7 @@
if (init_checkpoint_thread(ctx) < 0)
printf("Failed to start checkpoint thread!\r\n");
- if (restore_file != NULL)
+ if ((restore_file != NULL) || (receive_migration != NULL))
vm_restore_time(ctx);
#endif
@@ -1563,7 +1590,7 @@
* If we restore a VM, start all vCPUs now (including APs), otherwise,
* let the guest OS to spin them up later via vmexits.
*/
- if (restore_file != NULL) {
+ if ((restore_file != NULL) || (receive_migration != NULL)) {
for (vcpu = 0; vcpu < guest_ncpus; vcpu++) {
if (vcpu == BSP)
continue;
diff --git a/usr.sbin/bhyve/migration.h b/usr.sbin/bhyve/migration.h
new file mode 100644
--- /dev/null
+++ b/usr.sbin/bhyve/migration.h
@@ -0,0 +1,87 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017-2020 Elena Mihailescu
+ * Copyright (c) 2017-2020 Darius Mihai
+ * Copyright (c) 2017-2020 Mihai Carabas
+ * All rights reserved.
+ * The migration feature was developed under sponsorships
+ * from Matthew Grooms.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _BHYVE_MIGRATION_
+#define _BHYVE_MIGRATION_
+
+#include <machine/vmm_dev.h>
+#include <vmmapi.h>
+#include "snapshot.h"
+
+struct vmctx;
+
+int receive_vm_migration(struct vmctx *ctx, char *migration_data);
+
+/* Warm Migration */
+/* Size of the name field in struct migration_message_type. */
+#define MAX_DEV_NAME_LEN 64
+
+/* Buffer size passed to inet_ntop() for textual IPv4/IPv6 addresses. */
+#define MAX_IP_LEN 64
+/* Size of the hw_machine / hw_model strings exchanged between hosts. */
+#define MAX_SPEC_LEN 256
+
+/* Results of the compatibility checks exchanged between the two hosts. */
+#define MIGRATION_SPECS_OK 0
+#define MIGRATION_SPECS_NOT_OK 1
+
+/* NOTE(review): not referenced in the visible part of the patch - confirm use. */
+#define NO_KERN_STRUCT -1
+
+/*
+ * Direction of a socket transfer: SEND uses send(2), RECV uses recv(2).
+ */
+enum migration_transfer_req {
+ MIGRATION_SEND_REQ = 0,
+ MIGRATION_RECV_REQ = 1
+};
+
+/*
+ * Values for the type field of struct migration_message_type.
+ * MESSAGE_TYPE_SPECS announces a struct migration_system_specs payload.
+ * NOTE(review): the other names presumably match the remaining migration
+ * phases (metadata, guest RAM, kernel state, device state) - confirm.
+ */
+enum message_types {
+ MESSAGE_TYPE_SPECS = 1,
+ MESSAGE_TYPE_METADATA = 2,
+ MESSAGE_TYPE_RAM = 3,
+ MESSAGE_TYPE_KERN = 4,
+ MESSAGE_TYPE_DEV = 5,
+ MESSAGE_TYPE_UNKNOWN = 8,
+};
+
+/*
+ * Header announcing the next payload; it is sent over the socket as raw
+ * bytes.
+ * NOTE(review): size_t and unsigned int make the layout depend on the
+ * host ABI - confirm both peers are expected to share it.
+ */
+struct __attribute__((packed)) migration_message_type {
+ size_t len; /* size in bytes of the payload that follows */
+ unsigned int type; /* enum message_types */
+ unsigned int req_type; /* enum snapshot_req */
+ char name[MAX_DEV_NAME_LEN];
+};
+
+/*
+ * Host description compared between source and destination before a
+ * migration; it is sent over the socket as raw bytes.
+ */
+struct __attribute__((packed)) migration_system_specs {
+ char hw_machine[MAX_SPEC_LEN]; /* sysctl CTL_HW / HW_MACHINE */
+ char hw_model[MAX_SPEC_LEN]; /* sysctl CTL_HW / HW_MODEL */
+ size_t hw_pagesize; /* sysctl CTL_HW / HW_PAGESIZE */
+};
+
+/*
+ * Entry points for the two sides of a migration; req carries the peer's
+ * host and port.
+ * NOTE(review): the definitions and the meaning of 'live' are outside the
+ * visible part of the patch - confirm.
+ */
+int vm_send_migrate_req(struct vmctx *ctx, struct migrate_req req, bool live);
+int vm_recv_migrate_req(struct vmctx *ctx, struct migrate_req req);
+
+#endif
diff --git a/usr.sbin/bhyve/migration.c b/usr.sbin/bhyve/migration.c
new file mode 100644
--- /dev/null
+++ b/usr.sbin/bhyve/migration.c
@@ -0,0 +1,1444 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017-2020 Elena Mihailescu
+ * Copyright (c) 2017-2020 Darius Mihai
+ * Copyright (c) 2017-2020 Mihai Carabas
+ * All rights reserved.
+ * The migration feature was developed under sponsorships
+ * from Matthew Grooms.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#ifndef WITHOUT_CAPSICUM
+#include <sys/capsicum.h>
+#include <capsicum_helpers.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <machine/vmm.h>
+#ifndef WITHOUT_CAPSICUM
+#include <machine/vmm_dev.h>
+#endif
+#include <machine/vmm_migration.h>
+#include <vmmapi.h>
+
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <netinet/in.h>
+
+#include <sys/mman.h>
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+
+#include <unistd.h>
+
+#include "migration.h"
+#include "pci_emul.h"
+#include "snapshot.h"
+
+#define MB (1024UL * 1024)
+#define GB (1024UL * MB)
+
+/*
+ * Evaluates to a pointer to a struct vm_snapshot_meta compound literal
+ * initialized from the arguments.  Despite the name, alloca() is not called.
+ * NOTE(review): the literal is created inside the statement expression's
+ * block, so its lifetime may end with that block - confirm that users do
+ * not keep the pointer beyond the full expression.
+ */
+#define ALLOCA_VM_SNAPSHOT_META(CTX, DEV_NAME, DEV_REQ, BUFFER, BUFFER_SIZE, OP) \
+({ \
+ &(struct vm_snapshot_meta) { \
+ .ctx = CTX, \
+ .dev_name = DEV_NAME, \
+ .dev_req = DEV_REQ, \
+ \
+ .buffer.buf_start = BUFFER, \
+ .buffer.buf_size = BUFFER_SIZE, \
+ .op = OP, \
+ }; \
+ \
+})
+
+/*
+ * Debug message: printed to stderr, prefixed with the calling function's
+ * name and terminated with "\r\n".  Expands to nothing unless BHYVE_DEBUG
+ * is defined, so its arguments are not evaluated in that case.
+ */
+#ifdef BHYVE_DEBUG
+#define DPRINTF(FMT, ...) \
+({ \
+ fprintf(stderr, "%s: " FMT "\r\n", __func__, ##__VA_ARGS__); \
+ })
+#else
+#define DPRINTF(FMT, ...)
+#endif
+
+/*
+ * Error message: same format as DPRINTF but always compiled in.
+ */
+#define EPRINTF(FMT, ...) \
+({ \
+ fprintf(stderr, "%s: " FMT "\r\n", __func__, ##__VA_ARGS__); \
+ })
+
+/*
+ * Parse the argument of the -R option and start receiving a migrated VM.
+ *
+ * migration_data has the form "host" or "host,port".  Without a port,
+ * DEFAULT_MIGRATION_PORT is used.  The host is truncated to
+ * MAX_HOSTNAME_LEN - 1 characters.
+ *
+ * Returns -1 when the argument cannot be duplicated or the port cannot be
+ * parsed, otherwise the result of vm_recv_migrate_req().
+ */
+int
+receive_vm_migration(struct vmctx *ctx, char *migration_data)
+{
+	struct migrate_req req;
+	char *hostname, *pos;
+	int rc;
+
+	/* Zero the whole request, not just the host field. */
+	memset(&req, 0, sizeof(req));
+
+	hostname = strdup(migration_data);
+	if (hostname == NULL) {
+		EPRINTF("Could not allocate memory for the host name");
+		return (-1);
+	}
+
+	if ((pos = strchr(hostname, ',')) != NULL) {
+		*pos = '\0';
+		strlcpy(req.host, hostname, MAX_HOSTNAME_LEN);
+		pos = pos + 1;
+
+		/*
+		 * sscanf() returns EOF (not 0) for an empty port string, so
+		 * anything other than exactly one conversion is an error.
+		 */
+		rc = sscanf(pos, "%d", &(req.port));
+		if (rc != 1) {
+			EPRINTF("Could not parse the port");
+			free(hostname);
+			return (-1);
+		}
+	} else {
+		strlcpy(req.host, hostname, MAX_HOSTNAME_LEN);
+
+		/* If only one variable could be read, it should be the host */
+		req.port = DEFAULT_MIGRATION_PORT;
+	}
+
+	rc = vm_recv_migrate_req(ctx, req);
+
+	free(hostname);
+	return (rc);
+}
+
+/*
+ * Collect the host properties that must match on both ends of a migration:
+ * the hw.machine and hw.model strings and the hw.pagesize value, all read
+ * through sysctl(3).
+ *
+ * Returns 0 on success; on failure the sysctl() result is returned after a
+ * diagnostic has been printed with perror().
+ */
+static int
+get_system_specs_for_migration(struct migration_system_specs *specs)
+{
+	char scratch[MAX_SPEC_LEN];
+	int name[2];
+	size_t outlen;
+	int pagesize;
+	int rc;
+
+	name[0] = CTL_HW;
+
+	/* hw.machine */
+	name[1] = HW_MACHINE;
+	memset(scratch, 0, sizeof(scratch));
+	outlen = sizeof(scratch);
+	if ((rc = sysctl(name, 2, scratch, &outlen, NULL, 0)) != 0) {
+		perror("Could not retrieve HW_MACHINE specs");
+		return (rc);
+	}
+	strlcpy(specs->hw_machine, scratch, MAX_SPEC_LEN);
+
+	/* hw.model */
+	name[1] = HW_MODEL;
+	memset(scratch, 0, sizeof(scratch));
+	outlen = sizeof(scratch);
+	if ((rc = sysctl(name, 2, scratch, &outlen, NULL, 0)) != 0) {
+		perror("Could not retrieve HW_MODEL specs");
+		return (rc);
+	}
+	strlcpy(specs->hw_model, scratch, MAX_SPEC_LEN);
+
+	/* hw.pagesize */
+	name[1] = HW_PAGESIZE;
+	outlen = sizeof(pagesize);
+	if ((rc = sysctl(name, 2, &pagesize, &outlen, NULL, 0)) != 0) {
+		perror("Could not retrieve HW_PAGESIZE specs");
+		return (rc);
+	}
+	specs->hw_pagesize = pagesize;
+
+	return (0);
+}
+
+/*
+ * Send or receive exactly 'len' bytes of 'msg' over 'socket', looping until
+ * the whole buffer has been transferred.
+ *
+ * req selects the direction (MIGRATION_SEND_REQ or MIGRATION_RECV_REQ).
+ *
+ * Returns 0 once all bytes were transferred and a negative value on error:
+ * an unknown direction, a failing send()/recv(), or a transfer that ends
+ * early because the call moved no bytes (for recv() this means the peer
+ * closed the connection).  A short transfer is never reported as success,
+ * since the caller would otherwise use a partly filled buffer.
+ */
+static int
+migration_transfer_data(int socket, void *msg, size_t len, enum migration_transfer_req req)
+{
+	char *cursor;
+	size_t remaining;
+	ssize_t transferred;
+
+	/* Byte pointer: arithmetic on void * is not standard C. */
+	cursor = msg;
+	remaining = len;
+
+	while (remaining > 0) {
+		switch (req) {
+		case MIGRATION_SEND_REQ:
+			transferred = send(socket, cursor, remaining, 0);
+			break;
+		case MIGRATION_RECV_REQ:
+			transferred = recv(socket, cursor, remaining, 0);
+			break;
+		default:
+			DPRINTF("Unknown transfer option");
+			return (-1);
+		}
+
+		if (transferred < 0) {
+			perror("Error while transfering data");
+			return (-1);
+		}
+		if (transferred == 0) {
+			EPRINTF("Connection closed with %zu bytes left to transfer",
+			    remaining);
+			return (-1);
+		}
+
+		cursor += transferred;
+		remaining -= (size_t)transferred;
+	}
+
+	return (0);
+}
+
+/*
+ * Exchange and compare the host specifications of the two migration peers.
+ *
+ * With MIGRATION_SEND_REQ (source side) the local specs are announced and
+ * sent, then the verdict is received.  With MIGRATION_RECV_REQ (destination
+ * side) the remote specs are received and compared with the local machine,
+ * model and page size, then the verdict is sent back.
+ *
+ * Everything that is written to the socket is zero-initialized first so
+ * that no uninitialized stack bytes are transmitted, and the verdict
+ * defaults to "not ok" until it has actually been computed or received.
+ *
+ * Returns 0 when the hosts are compatible and -1 otherwise.
+ */
+static int
+migration_check_specs(int socket, enum migration_transfer_req req)
+{
+	struct migration_system_specs local_specs;
+	struct migration_system_specs remote_specs;
+	struct migration_system_specs transfer_specs;
+	struct migration_message_type msg;
+	enum migration_transfer_req rev_req;
+	size_t response;
+	int rc;
+
+	if ((req != MIGRATION_SEND_REQ) && (req != MIGRATION_RECV_REQ)) {
+		DPRINTF("Unknown option for migration req");
+		return (-1);
+	}
+
+	if (req == MIGRATION_SEND_REQ)
+		rev_req = MIGRATION_RECV_REQ;
+	else
+		rev_req = MIGRATION_SEND_REQ;
+
+	memset(&local_specs, 0, sizeof(local_specs));
+	memset(&transfer_specs, 0, sizeof(transfer_specs));
+	memset(&msg, 0, sizeof(msg));
+	response = MIGRATION_SPECS_NOT_OK;
+
+	rc = get_system_specs_for_migration(&local_specs);
+	if (rc != 0) {
+		EPRINTF("Could not retrieve local specs");
+		return (rc);
+	}
+
+	if (req == MIGRATION_SEND_REQ) {
+		/* Send message type to server: specs & len */
+		msg.type = MESSAGE_TYPE_SPECS;
+		msg.len = sizeof(local_specs);
+	}
+
+	rc = migration_transfer_data(socket, &msg, sizeof(msg), req);
+	if (rc < 0) {
+		DPRINTF("Could not send message type");
+		return (-1);
+	}
+
+	if ((req == MIGRATION_RECV_REQ) && (msg.type != MESSAGE_TYPE_SPECS)) {
+		DPRINTF(" Wrong message type received from remote");
+		return (-1);
+	}
+
+	/* For the send req, we send the local specs and for the receive req
+	 * we receive the remote specs.
+	 */
+	if (req == MIGRATION_SEND_REQ)
+		transfer_specs = local_specs;
+
+	rc = migration_transfer_data(socket, &transfer_specs,
+	    sizeof(transfer_specs), req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer system specs");
+		return (-1);
+	}
+
+	if (req == MIGRATION_RECV_REQ) {
+		remote_specs = transfer_specs;
+
+		/* The peer's strings are untrusted: force termination. */
+		remote_specs.hw_machine[MAX_SPEC_LEN - 1] = '\0';
+		remote_specs.hw_model[MAX_SPEC_LEN - 1] = '\0';
+
+		/* Check specs */
+		response = MIGRATION_SPECS_OK;
+		if ((strncmp(local_specs.hw_model, remote_specs.hw_model,
+		    MAX_SPEC_LEN) != 0) ||
+		    (strncmp(local_specs.hw_machine, remote_specs.hw_machine,
+		    MAX_SPEC_LEN) != 0) ||
+		    (local_specs.hw_pagesize != remote_specs.hw_pagesize)) {
+			EPRINTF("System specification mismatch");
+			DPRINTF("Local specs vs Remote Specs: \r\n"
+			    "\tmachine: %s vs %s\r\n"
+			    "\tmodel: %s vs %s\r\n"
+			    "\tpagesize: %zu vs %zu\r\n",
+			    local_specs.hw_machine,
+			    remote_specs.hw_machine,
+			    local_specs.hw_model,
+			    remote_specs.hw_model,
+			    local_specs.hw_pagesize,
+			    remote_specs.hw_pagesize
+			    );
+			response = MIGRATION_SPECS_NOT_OK;
+		}
+	}
+
+	/* The source will receive the result of the checkup (i.e.
+	 * whether the migration is possible or the source and destination
+	 * are incompatible for migration) and the destination will send the
+	 * result of the checkup.
+	 */
+	rc = migration_transfer_data(socket, &response, sizeof(response), rev_req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer response from server");
+		return (-1);
+	}
+
+	if (response == MIGRATION_SPECS_NOT_OK)
+		return (-1);
+
+	fprintf(stdout, "%s: System specification accepted\r\n", __func__);
+
+	return (0);
+
+}
+
+/*
+ * Resolve 'hostname' and return the textual form of its first address.
+ *
+ * *type receives the address family of that address.  For AF_INET the text
+ * is written to ipv4_addr, for AF_INET6 to ipv6_addr; each destination is
+ * treated as a buffer of MAX_IP_LEN bytes.
+ *
+ * Returns 0 on success and -1 when the name cannot be resolved, the address
+ * family is neither IPv4 nor IPv6, or the address cannot be formatted.
+ * The list returned by getaddrinfo() is released on every path.
+ */
+static int
+get_migration_host_and_type(const char *hostname, unsigned char *ipv4_addr,
+    unsigned char *ipv6_addr, int *type)
+{
+	struct addrinfo hints, *res;
+	void *addr;
+	char *dst;
+	int rc;
+
+	memset(&hints, 0, sizeof(hints));
+
+	hints.ai_family = AF_UNSPEC;
+
+	rc = getaddrinfo(hostname, NULL, &hints, &res);
+
+	if (rc != 0) {
+		DPRINTF("Could not get address info");
+		return (-1);
+	}
+
+	rc = 0;
+	*type = res->ai_family;
+	switch (res->ai_family) {
+	case AF_INET:
+		addr = &((struct sockaddr_in *)res->ai_addr)->sin_addr;
+		dst = (char *)ipv4_addr;
+		break;
+	case AF_INET6:
+		addr = &((struct sockaddr_in6 *)res->ai_addr)->sin6_addr;
+		dst = (char *)ipv6_addr;
+		break;
+	default:
+		DPRINTF("Unknown address family.");
+		addr = NULL;
+		dst = NULL;
+		rc = -1;
+		break;
+	}
+
+	if (rc == 0 &&
+	    inet_ntop(res->ai_family, addr, dst, MAX_IP_LEN) == NULL) {
+		DPRINTF("Could not convert the address to text");
+		rc = -1;
+	}
+
+	freeaddrinfo(res);
+
+	return (rc);
+}
+
+/*
+ * Compare the guest memory layout of the two migration peers.
+ *
+ * Returns MIGRATION_SPECS_OK when both the lowmem and the highmem sizes are
+ * equal and MIGRATION_SPECS_NOT_OK otherwise; each mismatch is reported
+ * through DPRINTF.
+ */
+static int
+migrate_check_memsize(size_t local_lowmem_size, size_t local_highmem_size,
+    size_t remote_lowmem_size, size_t remote_highmem_size)
+{
+	int verdict = MIGRATION_SPECS_OK;
+
+	if (remote_lowmem_size != local_lowmem_size) {
+		DPRINTF("Local and remote lowmem size mismatch");
+		verdict = MIGRATION_SPECS_NOT_OK;
+	}
+
+	if (remote_highmem_size != local_highmem_size) {
+		DPRINTF("Local and remote highmem size mismatch");
+		verdict = MIGRATION_SPECS_NOT_OK;
+	}
+
+	return (verdict);
+}
+
+/*
+ * Destination side of the non-live (warm) memory transfer.
+ *
+ * Receives the source's lowmem/highmem sizes, replies with the result of
+ * migrate_check_memsize(), and - when the sizes agree - receives lowmem at
+ * the guest base address and highmem at base + 4 GB.
+ *
+ * Returns 0 on success; a non-zero value (the failing call's return code or
+ * -1) otherwise.
+ */
+static int
+migrate_recv_memory(struct vmctx *ctx, int socket)
+{
+	size_t our_low = 0, our_high = 0;
+	size_t peer_low = 0, peer_high = 0;
+	char *guest_base;
+	int verdict;
+	int rc;
+
+	rc = vm_get_guestmem_from_ctx(ctx, &guest_base, &our_low, &our_high);
+	if (rc != 0) {
+		DPRINTF("Could not get guest lowmem size and highmem size");
+		return (rc);
+	}
+
+	rc = migration_transfer_data(socket, &peer_low, sizeof(peer_low),
+	    MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not recv lowmem size");
+		return (rc);
+	}
+
+	rc = migration_transfer_data(socket, &peer_high, sizeof(peer_high),
+	    MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not recv highmem size");
+		return (rc);
+	}
+
+	/* Tell the source whether its layout matches ours before any data. */
+	verdict = migrate_check_memsize(our_low, our_high, peer_low, peer_high);
+	rc = migration_transfer_data(socket, &verdict, sizeof(verdict),
+	    MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send migration_ok to remote");
+		return (rc);
+	}
+
+	if (verdict != MIGRATION_SPECS_OK) {
+		DPRINTF("Memory size mismatch with remote host");
+		return (-1);
+	}
+
+	rc = migration_transfer_data(socket, guest_base, our_low,
+	    MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not recv chunk lowmem.");
+		return (-1);
+	}
+
+	/* Guests without a highmem segment are done at this point. */
+	if (our_high == 0)
+		return (0);
+
+	rc = migration_transfer_data(socket, guest_base + 4 * GB, our_high,
+	    MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not recv highmem");
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Source side of the non-live (warm) memory transfer.
+ *
+ * Sends the lowmem and highmem sizes, waits for the destination's verdict on
+ * whether they match its own layout, then sends lowmem from the guest base
+ * address and, if present, highmem from base + 4 GB.
+ *
+ * Returns 0 on success; a non-zero value (the failing call's return code or
+ * -1) otherwise.
+ */
+static int
+migrate_send_memory(struct vmctx *ctx, int socket)
+{
+	size_t lowmem_size, highmem_size;
+	char *baseaddr;
+	int memsize_ok;
+	int rc;
+
+	rc = vm_get_guestmem_from_ctx(ctx, &baseaddr,
+	    &lowmem_size, &highmem_size);
+	if (rc != 0) {
+		DPRINTF("Could not get guest lowmem size and highmem size");
+		return (rc);
+	}
+
+	/* Send the size of the lowmem segment */
+	rc = migration_transfer_data(socket, &lowmem_size,
+	    sizeof(lowmem_size), MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send lowmem size");
+		return (rc);
+	}
+
+	/* Send the size of the highmem segment */
+	rc = migration_transfer_data(socket, &highmem_size,
+	    sizeof(highmem_size), MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send highmem size");
+		return (rc);
+	}
+
+	/* Wait for answer - params ok (if memory size matches) */
+	rc = migration_transfer_data(socket, &memsize_ok, sizeof(memsize_ok),
+	    MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not receive response from remote");
+		return (rc);
+	}
+
+	if (memsize_ok != MIGRATION_SPECS_OK) {
+		DPRINTF("Memory size mismatch with remote host");
+		return (-1);
+	}
+
+	/* Send the lowmem segment */
+	rc = migration_transfer_data(socket, baseaddr, lowmem_size,
+	    MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send lowmem");
+		return (-1);
+	}
+
+	/* Send the highmem segment, mapped 4 GB above the guest base */
+	if (highmem_size > 0) {
+		rc = migration_transfer_data(socket, baseaddr + 4 * GB,
+		    highmem_size, MIGRATION_SEND_REQ);
+		if (rc < 0) {
+			DPRINTF("Could not send highmem");
+			return (-1);
+		}
+	}
+
+	return (0);
+}
+
+/**
+ * Transfer the state of one kernel structure between the two hosts.
+ *
+ * Source (MIGRATION_SEND_REQ): the state of the structure identified by
+ * `struct_req` is saved into `buffer`, then a header message is sent that
+ * carries the data type (MESSAGE_TYPE_KERN), the size of the saved state and
+ * the request index that identifies the structure, followed by the state
+ * itself.
+ *
+ * Destination (MIGRATION_RECV_REQ): the header is received and validated, the
+ * state is received into `buffer`, and the structure named by the header is
+ * restored.  `struct_req` is only used for logging in this direction.
+ *
+ * `buffer` must be SNAPSHOT_BUFFER_SIZE bytes long.
+ * Returns 0 on success and -1 on any failure.
+ */
+static inline int
+migrate_kern_struct(struct vmctx *ctx, int socket, char *buffer,
+    enum snapshot_req struct_req, enum migration_transfer_req req)
+{
+	int rc;
+	struct migration_message_type msg;
+	struct vm_snapshot_meta *meta;
+
+	if ((req != MIGRATION_SEND_REQ) && (req != MIGRATION_RECV_REQ)) {
+		DPRINTF("Unknown request");
+		return (-1);
+	}
+
+	memset(&msg, 0, sizeof(msg));
+	if (req == MIGRATION_SEND_REQ) {
+		msg.type = MESSAGE_TYPE_KERN;
+
+		meta = ALLOCA_VM_SNAPSHOT_META(ctx, NULL, struct_req, buffer,
+		    SNAPSHOT_BUFFER_SIZE, VM_SNAPSHOT_SAVE);
+		memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
+		meta->buffer.buf = meta->buffer.buf_start;
+		meta->buffer.buf_rem = meta->buffer.buf_size;
+
+		rc = vm_snapshot_req(meta);
+		if (rc < 0) {
+			DPRINTF("Could not get struct with req %d", struct_req);
+			return (-1);
+		}
+
+		msg.len = vm_get_snapshot_size(meta);
+		msg.req_type = struct_req;
+	}
+
+	rc = migration_transfer_data(socket, &msg, sizeof(msg), req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer message type for kern struct %d",
+		    struct_req);
+		return (-1);
+	}
+
+	if (req == MIGRATION_RECV_REQ) {
+		if (msg.type != MESSAGE_TYPE_KERN) {
+			DPRINTF("Receive wrong message type.");
+			return (-1);
+		}
+
+		/*
+		 * The length comes from the peer; never let it drive a write
+		 * past the end of our fixed-size buffer.
+		 */
+		if (msg.len > SNAPSHOT_BUFFER_SIZE) {
+			DPRINTF("Kern struct size exceeds the snapshot buffer");
+			return (-1);
+		}
+	}
+
+	rc = migration_transfer_data(socket, buffer, msg.len, req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer struct with req %d", struct_req);
+		return (-1);
+	}
+
+	if (req == MIGRATION_RECV_REQ) {
+		meta = ALLOCA_VM_SNAPSHOT_META(ctx, NULL, msg.req_type, buffer,
+		    msg.len, VM_SNAPSHOT_RESTORE);
+		meta->buffer.buf = meta->buffer.buf_start;
+		meta->buffer.buf_rem = meta->buffer.buf_size;
+
+		rc = vm_snapshot_req(meta);
+		if (rc != 0) {
+			DPRINTF("Failed to restore struct %d", msg.req_type);
+			return (-1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Send or receive every kernel structure listed by
+ * get_snapshot_kern_structs(), one migrate_kern_struct() call per entry.
+ *
+ * A single SNAPSHOT_BUFFER_SIZE scratch buffer is shared by all transfers.
+ * Processing stops at the first failure.  Returns 0 on success, -1 otherwise.
+ */
+static int
+migrate_kern_data(struct vmctx *ctx, int socket, enum migration_transfer_req req)
+{
+	const struct vm_snapshot_kern_info *table;
+	char *scratch;
+	int count, idx;
+	int rc, error;
+
+	error = 0;
+	table = get_snapshot_kern_structs(&count);
+
+	scratch = malloc(SNAPSHOT_BUFFER_SIZE);
+	if (scratch == NULL) {
+		EPRINTF("Could not allocate memory.");
+		return (-1);
+	}
+
+	for (idx = 0; idx < count; idx++) {
+		switch (req) {
+		case MIGRATION_RECV_REQ:
+			/* The header sent by the source names the structure. */
+			rc = migrate_kern_struct(ctx, socket, scratch,
+			    NO_KERN_STRUCT, MIGRATION_RECV_REQ);
+			if (rc < 0) {
+				DPRINTF("Could not restore struct %s", table[idx].struct_name);
+				error = -1;
+			}
+			break;
+		case MIGRATION_SEND_REQ:
+			rc = migrate_kern_struct(ctx, socket, scratch,
+			    table[idx].req, MIGRATION_SEND_REQ);
+			if (rc < 0) {
+				DPRINTF("Could not send %s", table[idx].struct_name);
+				error = -1;
+			}
+			break;
+		default:
+			DPRINTF("Unknown transfer request");
+			error = -1;
+			break;
+		}
+
+		if (error != 0)
+			break;
+	}
+
+	free(scratch);
+
+	return (error);
+}
+
+/*
+ * Look up `name` in the table returned by get_snapshot_devs().
+ *
+ * Names are compared over at most MAX_DEV_NAME_LEN characters.  Returns the
+ * first matching entry, or NULL when no entry matches.
+ */
+static inline const struct vm_snapshot_dev_info *
+find_entry_for_dev(const char *name)
+{
+	const struct vm_snapshot_dev_info *table;
+	int count, idx;
+
+	table = get_snapshot_devs(&count);
+
+	idx = 0;
+	while (idx < count) {
+		if (strncmp(name, table[idx].dev_name, MAX_DEV_NAME_LEN) == 0)
+			return (&table[idx]);
+		idx++;
+	}
+
+	return (NULL);
+}
+
+/*
+ * Transfer the state of one emulated device between the two hosts.
+ *
+ * Source (MIGRATION_SEND_REQ): the device named `dev` is snapshotted into
+ * `buffer` through its snapshot callback, then a header (MESSAGE_TYPE_DEV,
+ * payload size, device name) and the payload are sent.
+ *
+ * Destination (MIGRATION_RECV_REQ): `dev` is ignored (callers pass NULL); the
+ * device name comes from the received header.  The payload is received into
+ * `buffer` and handed to the matching device's snapshot callback for restore.
+ *
+ * A zero-length payload means the device is not in use and is skipped.
+ * `len` is the capacity of `buffer`.
+ *
+ * Returns 0 on success or when the device is unknown locally, -1 on failure.
+ */
+static inline int
+migrate_transfer_dev(struct vmctx *ctx, int socket, const char *dev,
+    char *buffer, size_t len, enum migration_transfer_req req)
+{
+	int rc;
+	size_t data_size;
+	const char *label;
+	struct migration_message_type msg;
+	struct vm_snapshot_meta *meta;
+	const struct vm_snapshot_dev_info *dev_info;
+
+	if ((req != MIGRATION_SEND_REQ) && (req != MIGRATION_RECV_REQ)) {
+		DPRINTF("Unknown transfer request option");
+		return (-1);
+	}
+
+	/* `dev` is NULL on the receive side; never hand NULL to "%s". */
+	label = (dev != NULL) ? dev : "(unknown)";
+	data_size = 0;
+
+	memset(&msg, 0, sizeof(msg));
+	memset(buffer, 0, len);
+	if (req == MIGRATION_SEND_REQ) {
+		if (dev == NULL) {
+			DPRINTF("No device name given");
+			return (-1);
+		}
+
+		/*
+		 * NOTE(review): returning here sends nothing, while the peer
+		 * still expects one message per announced device.
+		 */
+		dev_info = find_entry_for_dev(dev);
+		if (dev_info == NULL) {
+			EPRINTF("Could not find the device %s "
+			    "or migration not implemented yet for it.", dev);
+			return (0);
+		}
+
+		meta = ALLOCA_VM_SNAPSHOT_META(ctx, dev, 0, buffer, len,
+		    VM_SNAPSHOT_SAVE);
+
+		memset(meta->buffer.buf_start, 0, meta->buffer.buf_size);
+		meta->buffer.buf = meta->buffer.buf_start;
+		meta->buffer.buf_rem = meta->buffer.buf_size;
+
+		rc = (*dev_info->snapshot_cb)(meta);
+		if (rc < 0) {
+			DPRINTF("Could not get info about %s dev", dev);
+			return (-1);
+		}
+
+		data_size = vm_get_snapshot_size(meta);
+
+		msg.type = MESSAGE_TYPE_DEV;
+		msg.len = data_size;
+		strlcpy(msg.name, dev, MAX_DEV_NAME_LEN);
+	}
+
+	rc = migration_transfer_data(socket, &msg, sizeof(msg), req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer msg for %s dev", label);
+		return (-1);
+	}
+
+	if (req == MIGRATION_RECV_REQ) {
+		if (msg.type != MESSAGE_TYPE_DEV) {
+			DPRINTF("Wrong message type for device.");
+			return (-1);
+		}
+
+		/* The header is peer-controlled: terminate and bound it. */
+		msg.name[MAX_DEV_NAME_LEN - 1] = '\0';
+		label = msg.name;
+
+		if (msg.len > len) {
+			DPRINTF("State of %s dev exceeds the snapshot buffer",
+			    label);
+			return (-1);
+		}
+
+		data_size = msg.len;
+	}
+
+	if (data_size == 0)
+		return (0); // this type of device is not used
+
+	rc = migration_transfer_data(socket, buffer, data_size, req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer %s dev", label);
+		return (-1);
+	}
+
+	if (req == MIGRATION_RECV_REQ) {
+		dev_info = find_entry_for_dev(msg.name);
+		if (dev_info == NULL) {
+			EPRINTF("Could not find the device %s "
+			    "or migration not implemented yet for it.", msg.name);
+			return (0);
+		}
+		meta = ALLOCA_VM_SNAPSHOT_META(ctx, msg.name, 0, buffer,
+		    data_size, VM_SNAPSHOT_RESTORE);
+		meta->buffer.buf = meta->buffer.buf_start;
+		meta->buffer.buf_rem = meta->buffer.buf_size;
+
+		rc = (*dev_info->snapshot_cb)(meta);
+		if (rc != 0) {
+			EPRINTF("Could not restore %s dev", msg.name);
+			return (-1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Send or receive the state of all emulated devices.
+ *
+ * The source first announces how many devices it will send (the size of the
+ * get_snapshot_devs() table) and then transfers each of them; the destination
+ * receives that count and then the same number of device messages.  One
+ * SNAPSHOT_BUFFER_SIZE scratch buffer is shared by all transfers and is
+ * released on every exit path.
+ *
+ * Returns 0 on success, -1 on failure.  An unrecognised `req` transfers
+ * nothing and returns 0.
+ */
+static int
+migrate_devs(struct vmctx *ctx, int socket, enum migration_transfer_req req)
+{
+	int i, num_items;
+	int rc, error;
+	char *buffer;
+	const struct vm_snapshot_dev_info *snapshot_devs;
+
+	error = 0;
+	buffer = malloc(SNAPSHOT_BUFFER_SIZE);
+	if (buffer == NULL) {
+		EPRINTF("Could not allocate memory");
+		return (-1);
+	}
+
+	if (req == MIGRATION_SEND_REQ) {
+		/*
+		 * Send to the destination the number of devices that will
+		 * be migrated.
+		 */
+		snapshot_devs = get_snapshot_devs(&num_items);
+
+		rc = migration_transfer_data(socket, &num_items,
+		    sizeof(num_items), req);
+		if (rc < 0) {
+			DPRINTF("Could not send num_items to destination");
+			error = -1;
+			goto end;
+		}
+
+		for (i = 0; i < num_items; i++) {
+			rc = migrate_transfer_dev(ctx, socket,
+			    snapshot_devs[i].dev_name, buffer,
+			    SNAPSHOT_BUFFER_SIZE, req);
+			if (rc < 0) {
+				DPRINTF("Could not send %s", snapshot_devs[i].dev_name);
+				error = -1;
+				goto end;
+			}
+		}
+	} else if (req == MIGRATION_RECV_REQ) {
+		/* receive the number of devices that will be migrated */
+		rc = migration_transfer_data(socket, &num_items,
+		    sizeof(num_items), MIGRATION_RECV_REQ);
+		if (rc < 0) {
+			DPRINTF("Could not recv num_items from source");
+			error = -1;
+			goto end;
+		}
+
+		for (i = 0; i < num_items; i++) {
+			rc = migrate_transfer_dev(ctx, socket, NULL, buffer,
+			    SNAPSHOT_BUFFER_SIZE, req);
+			if (rc < 0) {
+				DPRINTF("Could not recv device");
+				error = -1;
+				goto end;
+			}
+		}
+	}
+
+end:
+	free(buffer);
+
+	return (error);
+}
+
+
+#define MIGRATION_ROUNDS 4
+
+/*
+ * Count the entries of `page_list` (an array of `size` per-page flags) whose
+ * value is exactly 1, i.e. the pages marked dirty.
+ */
+static size_t
+num_dirty_pages(char *page_list, size_t size)
+{
+	size_t dirty, remaining;
+
+	dirty = 0;
+	remaining = size;
+
+	/* Walk the list back to front; the order does not affect the count. */
+	while (remaining > 0) {
+		remaining--;
+		if (page_list[remaining] == 1)
+			dirty++;
+	}
+
+	return (dirty);
+}
+
+/*
+ * Build the next chunk of a page request and fetch it from the kernel.
+ *
+ * Starting at `*current_position`, the indexes of up to VMM_PAGE_CHUNK pages
+ * flagged with 1 in `page_list` are stored in `req->pages[]`.  On return
+ * `*current_position` is where the next scan must resume,
+ * `req->pages_required` is the number of indexes stored and `req->req_type`
+ * is VMM_GET_PAGES.
+ *
+ * Returns the result of vm_copy_vmm_pages().
+ */
+static int
+migration_fill_vmm_migration_pages_req(struct vmctx *ctx,
+    struct vmm_migration_pages_req *req,
+    char *page_list,
+    size_t size,
+    size_t *current_position)
+{
+	size_t pos, filled;
+
+	pos = *current_position;
+	filled = 0;
+
+	while (pos < size && filled != VMM_PAGE_CHUNK) {
+		if (page_list[pos] == 1) {
+			req->pages[filled].pindex = pos;
+			filled++;
+		}
+		pos++;
+	}
+
+	*current_position = pos;
+	req->pages_required = filled;
+	req->req_type = VMM_GET_PAGES;
+
+	return (vm_copy_vmm_pages(ctx, req));
+}
+
+/*
+ * Transfer one round of guest pages in chunks of VMM_PAGE_CHUNK.
+ *
+ * `page_list` holds one flag per guest page (1 == dirty).  The list itself is
+ * transferred first so both hosts agree on which pages follow.  Then, chunk
+ * by chunk:
+ *   - the source copies the flagged pages out of the guest (pausing the vcpus
+ *     around the copy unless `already_locked` is non-zero) and sends them;
+ *   - the destination receives the pages and copies them into the guest with
+ *     a VMM_SET_PAGES request.
+ *
+ * `req` is a caller-provided request whose page buffers are reused for every
+ * chunk.  Returns 0 on success, -1 on failure.
+ */
+static int
+migrate_pages(struct vmctx *ctx, int socket, struct vmm_migration_pages_req *req,
+    char *page_list, size_t page_list_size, int already_locked,
+    enum migration_transfer_req migration_req)
+{
+	size_t dirty_pages;
+	size_t current_pos, i, count;
+	int rc;
+
+	if ((migration_req != MIGRATION_SEND_REQ) && (migration_req != MIGRATION_RECV_REQ)) {
+		EPRINTF("wrong migration transfer req");
+		return (-1);
+	}
+
+	/*
+	 * Transfer the state of the pages (dirty/not dirty) from the source
+	 * host to the destination host. The pages that are dirty will be
+	 * transferred in the next steps.
+	 */
+	rc = migration_transfer_data(socket, page_list, page_list_size, migration_req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer page_list remote");
+		return (-1);
+	}
+
+	/* Informational only for now; nothing below depends on the count. */
+	dirty_pages = num_dirty_pages(page_list, page_list_size);
+	(void)dirty_pages;
+
+	current_pos = 0;
+	while (current_pos < page_list_size) {
+		for (i = 0; i < VMM_PAGE_CHUNK; i++)
+			req->pages[i].pindex = -1;
+
+		req->pages_required = 0;
+
+		/* Only the source host pauses the vcpus */
+		if (migration_req == MIGRATION_SEND_REQ) {
+			if (!already_locked)
+				vm_vcpu_pause(ctx);
+
+			rc = migration_fill_vmm_migration_pages_req(ctx, req,
+			    page_list, page_list_size, &current_pos);
+
+			if (!already_locked)
+				vm_vcpu_resume(ctx);
+
+			if (rc < 0) {
+				DPRINTF("Could not get pages");
+				return (-1);
+			}
+		} else {
+			/* Mirror the source's chunking from the same list. */
+			count = 0;
+			for (i = current_pos; i < page_list_size; i++) {
+				if (count == VMM_PAGE_CHUNK)
+					break;
+
+				if (page_list[i] == 1) {
+					req->pages[count].pindex = i;
+					count++;
+				}
+			}
+
+			current_pos = i;
+			req->pages_required = count;
+		}
+
+		for (i = 0; i < req->pages_required; i++) {
+			rc = migration_transfer_data(socket, req->pages[i].page,
+			    PAGE_SIZE, migration_req);
+			if (rc < 0) {
+				DPRINTF("Cound not transfer page %zu", req->pages[i].pindex);
+				return (-1);
+			}
+		}
+
+		if (migration_req == MIGRATION_RECV_REQ) {
+			req->req_type = VMM_SET_PAGES;
+
+			rc = vm_copy_vmm_pages(ctx, req);
+			if (rc < 0) {
+				EPRINTF("Could not copy pages into guest memory");
+				return (-1);
+			}
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Fill `page_list` with the dirty-page flags of the whole guest.
+ *
+ * `page_list` must have room for one entry per guest page, i.e. the sum of
+ * the lowmem and highmem page counts reported by vm_get_pages_num().
+ *
+ * Returns 0 on success and -1 if `page_list` is NULL, the page counts cannot
+ * be obtained, or the dirty page list cannot be retrieved.
+ */
+static int
+search_dirty_pages(struct vmctx *ctx, char *page_list)
+{
+	size_t lowmem_pages, highmem_pages, pages;
+	int error;
+
+	if (page_list == NULL)
+		return (-1);
+
+	error = vm_get_pages_num(ctx, &lowmem_pages, &highmem_pages);
+	if (error != 0) {
+		DPRINTF("Error while trying to get page number");
+		return (-1);
+	}
+
+	pages = lowmem_pages + highmem_pages;
+
+	/* A stale list would silently drop dirty pages; report failures. */
+	error = vm_get_dirty_page_list(ctx, page_list, pages);
+	if (error != 0) {
+		DPRINTF("Could not get the dirty page list");
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Set the first `list_len` entries of `page_list` to `c`.
+ * A NULL `page_list` is ignored.
+ */
+static inline void
+fill_page_list(char *page_list, size_t list_len, char c)
+{
+	if (page_list != NULL)
+		memset(page_list, c, list_len);
+}
+
+/*
+ * Source side of live migration.
+ *
+ * After announcing MIGRATION_ROUNDS to the destination, guest memory is sent
+ * in MIGRATION_ROUNDS + 1 passes:
+ *   - pass 0 sends every page while the guest keeps running;
+ *   - the following passes send only the pages reported dirty, pausing the
+ *     vcpus just long enough to collect the dirty list;
+ *   - the last pass pauses the devices and the vcpus for good before
+ *     collecting and sending the remaining dirty pages.
+ * Kernel and device state follow, then the destination's completion token is
+ * awaited.
+ *
+ * On success the local VM is destroyed and the process exits; this function
+ * does not return.  On failure whatever was paused here is resumed, the page
+ * list is released and a non-zero error is returned.
+ */
+static int
+live_migrate_send(struct vmctx *ctx, int socket)
+{
+	int error, i, rc;
+	int last_round, devs_paused, vcpus_paused;
+	uint8_t rounds;
+	size_t lowmem_size, highmem_size;
+	size_t migration_completed;
+	size_t lowmem_pages, highmem_pages, pages;
+	char *baseaddr, *page_list_indexes;
+	struct vmm_migration_pages_req memory_req;
+
+	error = 0;
+	lowmem_size = highmem_size = 0;
+	page_list_indexes = NULL;
+	devs_paused = vcpus_paused = 0;
+	rounds = MIGRATION_ROUNDS;
+
+	/* Send the number of memory rounds to destination */
+	error = migration_transfer_data(socket, &rounds, sizeof(rounds),
+	    MIGRATION_SEND_REQ);
+	if (error != 0) {
+		DPRINTF("Could not send the number of rounds remote");
+		goto done;
+	}
+
+	error = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem_size,
+	    &highmem_size);
+	if (error != 0) {
+		DPRINTF("Could not get guest lowmem size and highmem size");
+		goto done;
+	}
+
+	/* Compute the number of guest pages */
+	error = vm_get_pages_num(ctx, &lowmem_pages, &highmem_pages);
+	if (error != 0) {
+		DPRINTF("Error while trying to get page number");
+		goto done;
+	}
+	pages = lowmem_pages + highmem_pages;
+
+	/* alloc page_list_indexes */
+	page_list_indexes = malloc(pages * sizeof(char));
+	if (page_list_indexes == NULL) {
+		perror("Page list indexes could not be allocated");
+		error = -1;
+		goto done;
+	}
+
+	error = vm_init_vmm_migration_pages_req(ctx, &memory_req);
+	if (error < 0) {
+		DPRINTF("Could not initialize struct vmm_migration_pages_req");
+		goto done;
+	}
+
+	for (i = 0; i <= MIGRATION_ROUNDS; i++) {
+		last_round = (i == MIGRATION_ROUNDS);
+
+		if (last_round) {
+			/*
+			 * Mark the devices as paused before the call so that a
+			 * partially completed pause is still undone on error.
+			 */
+			devs_paused = 1;
+			rc = vm_pause_user_devs(ctx);
+			if (rc != 0) {
+				DPRINTF("Could not pause devices");
+				error = rc;
+				goto done;
+			}
+
+			vm_vcpu_pause(ctx);
+			vcpus_paused = 1;
+		}
+
+		if (i == 0) { // First Round
+			fill_page_list(page_list_indexes, pages, 1);
+		} else {
+			DPRINTF("ROUND: %d", i);
+			fill_page_list(page_list_indexes, pages, 0);
+
+			if (!last_round) {
+				vm_vcpu_pause(ctx);
+				vcpus_paused = 1;
+			}
+
+			/* Search the dirty pages and populate page_list_index */
+			error = search_dirty_pages(ctx, page_list_indexes);
+			if (error != 0) {
+				DPRINTF("Couldn't search for the dirty pages");
+				goto done;
+			}
+
+			if (!last_round) {
+				vm_vcpu_resume(ctx);
+				vcpus_paused = 0;
+			}
+		}
+
+		error = migrate_pages(ctx, socket, &memory_req,
+		    page_list_indexes, pages, last_round ? 1 : 0,
+		    MIGRATION_SEND_REQ);
+		if (error != 0) {
+			DPRINTF("Couldn't send dirty pages to dest");
+			goto done;
+		}
+	}
+
+	// Send kern data
+	error = migrate_kern_data(ctx, socket, MIGRATION_SEND_REQ);
+	if (error != 0) {
+		DPRINTF("Could not send kern data to destination");
+		goto done;
+	}
+
+	// Send PCI data
+	error = migrate_devs(ctx, socket, MIGRATION_SEND_REQ);
+	if (error != 0) {
+		DPRINTF("Could not send pci devs to destination");
+		goto done;
+	}
+
+	// Wait for migration completed
+	error = migration_transfer_data(socket, &migration_completed,
+	    sizeof(migration_completed), MIGRATION_RECV_REQ);
+	if ((error < 0) || (migration_completed != MIGRATION_SPECS_OK)) {
+		DPRINTF("Could not recv migration completed remote or received error");
+		/* A bad token after a successful read is still a failure. */
+		if (error == 0)
+			error = -1;
+		goto done;
+	}
+
+	// Poweroff the vm
+	vm_vcpu_resume(ctx);
+
+	vm_destroy(ctx);
+	exit(0);
+
+done:
+	/* Undo only what this function paused, vcpus first. */
+	if (vcpus_paused)
+		vm_vcpu_resume(ctx);
+	if (devs_paused) {
+		rc = vm_resume_user_devs(ctx);
+		if (rc != 0)
+			EPRINTF("Could not resume devices");
+	}
+	free(page_list_indexes);
+	return (error);
+}
+
+/*
+ * Destination side of the live memory migration.
+ *
+ * Receives the number of rounds chosen by the source and then runs
+ * `rounds + 1` receive passes with migrate_pages().  Each pass first receives
+ * the page list describing which pages follow.
+ *
+ * Returns 0 on success and a non-zero value on failure.  The page list is
+ * released on every exit path.
+ */
+static int
+live_migrate_recv(struct vmctx *ctx, int socket)
+{
+	int error, index;
+	uint8_t rounds;
+	size_t lowmem_size, highmem_size;
+	size_t lowmem_pages, highmem_pages, pages;
+	char *baseaddr, *page_list_indexes;
+	struct vmm_migration_pages_req memory_req;
+
+	lowmem_size = highmem_size = 0;
+	page_list_indexes = NULL;
+
+	error = migration_transfer_data(socket, &rounds, sizeof(rounds),
+	    MIGRATION_RECV_REQ);
+	if (error != 0) {
+		DPRINTF("Could not recv the number of rounds from remote");
+		goto done;
+	}
+
+	error = vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem_size,
+	    &highmem_size);
+	if (error != 0) {
+		DPRINTF("Could not get guest lowmem size and highmem size");
+		goto done;
+	}
+
+	/* Compute the number of guest pages */
+	error = vm_get_pages_num(ctx, &lowmem_pages, &highmem_pages);
+	if (error != 0) {
+		DPRINTF("Error while trying to get page number");
+		goto done;
+	}
+	pages = lowmem_pages + highmem_pages;
+
+	/* alloc page_list_indexes */
+	page_list_indexes = malloc(pages * sizeof(char));
+	if (page_list_indexes == NULL) {
+		perror("Page list indexes could not be allocated");
+		error = -1;
+		goto done;
+	}
+
+	error = vm_init_vmm_migration_pages_req(ctx, &memory_req);
+	if (error < 0) {
+		DPRINTF("Could not initialize struct vmm_migration_pages_req");
+		goto done;
+	}
+
+	/*
+	 * The following iteration contains the preliminary round in which the
+	 * entire memory is migrated to the destination. Then, for
+	 * MIGRATION_ROUNDS - 1 rounds, only the dirtied pages will be migrated.
+	 * In the final round, the rest of the pages are migrated.
+	 * Since the vcpus are not started, we don't need to lock them, so we
+	 * can do the memory migration in a straightforward way.
+	 */
+	for (index = 0; index <= rounds; index++) {
+		fill_page_list(page_list_indexes, pages, 0);
+
+		error = migrate_pages(ctx, socket, &memory_req,
+		    page_list_indexes, pages, 1, MIGRATION_RECV_REQ);
+		if (error != 0) {
+			DPRINTF("Couldn't recv dirty pages from source");
+			goto done;
+		}
+	}
+
+	error = 0;
+done:
+	free(page_list_indexes);
+	return (error);
+}
+
+/*
+ * Set up the TCP connection used for a migration.
+ *
+ * `req.host` is resolved first; only IPv4 is supported.
+ *   - MIGRATION_SEND_REQ: connect to req.host:req.port and store the
+ *     connected socket in `*socket_fd`.
+ *   - MIGRATION_RECV_REQ: bind to INADDR_ANY:req.port, accept a single
+ *     connection and store the listening socket in `*socket_fd` and the
+ *     accepted one in `*connection_socket_fd`.
+ *
+ * Returns 0 on success and a non-zero value on failure, in which case no
+ * socket is left open and the output parameters are not written.
+ */
+static inline int
+migrate_connections(struct migrate_req req, int *socket_fd,
+    int *connection_socket_fd,
+    enum migration_transfer_req type)
+{
+	unsigned char ipv4_addr[MAX_IP_LEN];
+	unsigned char ipv6_addr[MAX_IP_LEN];
+	int addr_type;
+	int error;
+	int s, con_socket;
+	struct sockaddr_in sa, client_sa;
+	socklen_t client_len;
+	int rc;
+
+	rc = get_migration_host_and_type(req.host, ipv4_addr,
+	    ipv6_addr, &addr_type);
+
+	if (rc != 0) {
+		EPRINTF("Invalid address.");
+		DPRINTF("IP address used for migration: %s;\r\n"
+		    "Port used for migration: %u",
+		    req.host, req.port);
+		return (rc);
+	}
+
+	if (addr_type == AF_INET6) {
+		EPRINTF("IPv6 is not supported yet for migration. "
+		    "Please try again using a IPv4 address.");
+
+		DPRINTF("IP address used for migration: %s;\r\nPort used for migration: %u",
+		    ipv6_addr, req.port);
+		return (-1);
+	}
+
+	s = socket(AF_INET, SOCK_STREAM, 0);
+
+	if (s < 0) {
+		perror("Could not create socket");
+		return (-1);
+	}
+
+	bzero(&sa, sizeof(sa));
+
+	switch (type) {
+	case MIGRATION_SEND_REQ:
+		fprintf(stdout, "%s: Starting connection to %s on %u port...\r\n",
+		    __func__, ipv4_addr, req.port);
+
+		sa.sin_family = AF_INET;
+		sa.sin_port = htons(req.port);
+
+		rc = inet_pton(AF_INET, (const char *)ipv4_addr, &sa.sin_addr);
+		if (rc <= 0) {
+			DPRINTF("Could not retrive the IPV4 address");
+			error = -1;
+			goto done_close_s;
+		}
+
+		rc = connect(s, (struct sockaddr *)&sa, sizeof(sa));
+
+		if (rc < 0) {
+			perror("Could not connect to the remote host");
+			error = rc;
+			goto done_close_s;
+		}
+		*socket_fd = s;
+		break;
+	case MIGRATION_RECV_REQ:
+		fprintf(stdout, "%s: Waiting for connections from %s on %u port...\r\n",
+		    __func__, ipv4_addr, req.port);
+
+		sa.sin_family = AF_INET;
+		sa.sin_port = htons(req.port);
+		sa.sin_addr.s_addr = htonl(INADDR_ANY);
+
+		rc = bind(s, (struct sockaddr *)&sa, sizeof(sa));
+
+		if (rc < 0) {
+			perror("Could not bind");
+			error = rc;
+			goto done_close_s;
+		}
+
+		rc = listen(s, 1);
+		if (rc < 0) {
+			perror("Could not listen");
+			error = rc;
+			goto done_close_s;
+		}
+
+		/* accept() reads this as the capacity of client_sa. */
+		client_len = sizeof(client_sa);
+		con_socket = accept(s, (struct sockaddr *)&client_sa, &client_len);
+		if (con_socket < 0) {
+			EPRINTF("Could not accept connection");
+			error = -1;
+			goto done_close_s;
+		}
+		*socket_fd = s;
+		*connection_socket_fd = con_socket;
+		break;
+	default:
+		EPRINTF("unknown operation request");
+		error = -1;
+		goto done_close_s;
+	}
+
+	return (0);
+
+done_close_s:
+	close(s);
+	return (error);
+}
+
+/*
+ * Migrate the running guest to the host described by `req`.
+ *
+ * Connects to the destination, checks that both systems are compatible and
+ * announces the migration type.  With `live` set the work is delegated to
+ * live_migrate_send().  Otherwise (warm migration) the vcpus and devices are
+ * paused and memory, kernel state and device state are sent in that order,
+ * after which the destination's completion token is awaited.
+ *
+ * On success the local VM is destroyed and the process exits; this function
+ * does not return.  On failure the guest is resumed (warm path), the socket
+ * is closed and a non-zero value is returned.
+ */
+int
+vm_send_migrate_req(struct vmctx *ctx, struct migrate_req req, bool live)
+{
+	int s;
+	int rc, error, migration_type;
+	size_t migration_completed;
+
+	rc = migrate_connections(req, &s, NULL, MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		EPRINTF("Could not create connection");
+		return (-1);
+	}
+
+	rc = migration_check_specs(s, MIGRATION_SEND_REQ);
+
+	if (rc < 0) {
+		EPRINTF("Error while checking system requirements");
+		error = rc;
+		goto done;
+	}
+
+	migration_type = live;
+	rc = migration_transfer_data(s, &migration_type,
+	    sizeof(migration_type), MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		DPRINTF("Could not send migration type");
+		error = -1;
+		goto done;
+	}
+
+	if (live) {
+		rc = live_migrate_send(ctx, s);
+		if (rc != 0) {
+			EPRINTF("Could not live migrate the guest's memory");
+			error = rc;
+		} else {
+			error = 0;
+		}
+		goto done;
+	} // else continue the warm migration procedure
+
+	vm_vcpu_pause(ctx);
+
+	rc = vm_pause_user_devs(ctx);
+	if (rc != 0) {
+		EPRINTF("Could not pause devices");
+		error = rc;
+		goto unlock_vm_and_exit;
+	}
+
+	rc = migrate_send_memory(ctx, s);
+	if (rc != 0) {
+		EPRINTF("Could not send memory to destination");
+		error = rc;
+		goto unlock_vm_and_exit;
+	}
+
+	rc = migrate_kern_data(ctx, s, MIGRATION_SEND_REQ);
+	if (rc != 0) {
+		EPRINTF("Could not send kern data to destination");
+		error = rc;
+		goto unlock_vm_and_exit;
+	}
+
+	rc = migrate_devs(ctx, s, MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		EPRINTF("Could not send pci devs to destination");
+		error = rc;
+		goto unlock_vm_and_exit;
+	}
+
+	rc = migration_transfer_data(s, &migration_completed,
+	    sizeof(migration_completed), MIGRATION_RECV_REQ);
+	if ((rc < 0) || (migration_completed != MIGRATION_SPECS_OK)) {
+		EPRINTF("Could not recv migration completed remote or received error");
+		error = -1;
+		goto unlock_vm_and_exit;
+	}
+
+	/* The guest now lives on the destination; tear down the local copy. */
+	vm_destroy(ctx);
+	exit(0);
+
+unlock_vm_and_exit:
+	vm_vcpu_resume(ctx);
+
+	rc = vm_resume_user_devs(ctx);
+	if (rc != 0)
+		EPRINTF("Could not resume devices");
+done:
+	close(s);
+	return (error);
+}
+
+/*
+ * Receive a guest migrated from another host.
+ *
+ * Waits for the source to connect on `req.port`, checks that both systems
+ * are compatible, learns whether the migration is live or warm, and then
+ * receives guest memory, kernel state and device state in that order.
+ * Finally the source is told that the migration completed.
+ *
+ * Returns 0 on success and a non-zero value on failure.  Once the connection
+ * has been established, both sockets are closed on every exit path.
+ */
+int
+vm_recv_migrate_req(struct vmctx *ctx, struct migrate_req req)
+{
+	int s, con_socket;
+	int rc, error;
+	int migration_type;
+	size_t migration_completed;
+
+	rc = migrate_connections(req, &s, &con_socket, MIGRATION_RECV_REQ);
+	if (rc != 0) {
+		EPRINTF("Could not create connections");
+		return (-1);
+	}
+
+	rc = migration_check_specs(con_socket, MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		EPRINTF("Error while checking specs");
+		error = rc;
+		goto done;
+	}
+
+	rc = migration_transfer_data(con_socket, &migration_type,
+	    sizeof(migration_type), MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		EPRINTF("Could not recv migration type");
+		error = -1;
+		goto done;
+	}
+
+	/*
+	 * For recv, the only difference between warm and live migration is the
+	 * way in which the memory is migrated.
+	 */
+	if (migration_type) {
+		rc = live_migrate_recv(ctx, con_socket);
+		if (rc != 0) {
+			EPRINTF("Could not live migrate the guest's memory");
+			error = rc;
+			goto done;
+		}
+	} else {
+		/* if not live migration, then migrate memory normally. */
+		rc = migrate_recv_memory(ctx, con_socket);
+		if (rc < 0) {
+			EPRINTF("Could not recv lowmem and highmem");
+			error = -1;
+			goto done;
+		}
+	}
+
+	rc = migrate_kern_data(ctx, con_socket, MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		EPRINTF("Could not recv kern data");
+		error = -1;
+		goto done;
+	}
+
+	rc = migrate_devs(ctx, con_socket, MIGRATION_RECV_REQ);
+	if (rc < 0) {
+		EPRINTF("Could not recv pci devs");
+		error = -1;
+		goto done;
+	}
+
+	fprintf(stdout, "%s: Migration completed\r\n", __func__);
+	migration_completed = MIGRATION_SPECS_OK;
+	rc = migration_transfer_data(con_socket, &migration_completed,
+	    sizeof(migration_completed), MIGRATION_SEND_REQ);
+	if (rc < 0) {
+		EPRINTF("Could not send migration completed remote");
+		error = -1;
+		goto done;
+	}
+
+	error = 0;
+done:
+	close(con_socket);
+	close(s);
+	return (error);
+}
+
diff --git a/usr.sbin/bhyve/snapshot.h b/usr.sbin/bhyve/snapshot.h
--- a/usr.sbin/bhyve/snapshot.h
+++ b/usr.sbin/bhyve/snapshot.h
@@ -44,9 +44,13 @@
#define BHYVE_RUN_DIR "/var/run/bhyve/"
#define MAX_SNAPSHOT_FILENAME PATH_MAX
+#define MAX_HOSTNAME_LEN 255
+#define DEFAULT_MIGRATION_PORT 24983
struct vmctx;
+#define SNAPSHOT_BUFFER_SIZE (20 * MB)
+
struct restore_state {
int kdata_fd;
int vmmem_fd;
@@ -60,15 +64,23 @@
ucl_object_t *meta_root_obj;
};
+/*
+ * Destination of a migration request.  The struct is packed and embedded in
+ * struct checkpoint_op, so its layout is part of the message that bhyvectl
+ * sends to the bhyve process; do not reorder or resize its members.
+ */
+struct __attribute__((packed)) migrate_req {
+ /* Destination host name or address; consumers force NUL termination. */
+ char host[MAX_HOSTNAME_LEN];
+ /* Destination TCP port in host byte order (htons() is applied on use). */
+ unsigned int port;
+};
+
/* Filename that will be used for save/restore */
struct checkpoint_op {
char snapshot_filename[MAX_SNAPSHOT_FILENAME];
+ struct migrate_req migrate_req;
};
/* Messages that a bhyve process understands. */
enum ipc_opcode {
START_CHECKPOINT,
START_SUSPEND,
+ START_MIGRATE,
+ START_MIGRATE_LIVE,
};
/*
@@ -88,8 +100,10 @@
struct checkpoint_thread_info {
struct vmctx *ctx;
int socket_fd;
+ struct sockaddr_un *addr;
};
+const char **get_pci_devs(int *);
typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *);
typedef int (*vm_pause_dev_cb) (struct vmctx *, const char *);
typedef int (*vm_resume_dev_cb) (struct vmctx *, const char *);
@@ -106,6 +120,9 @@
enum snapshot_req req; /* request type */
};
+const struct vm_snapshot_dev_info *get_snapshot_devs(int *ndevs);
+const struct vm_snapshot_kern_info *get_snapshot_kern_structs(int *ndevs);
+
void destroy_restore_state(struct restore_state *rstate);
const char *lookup_vmname(struct restore_state *rstate);
@@ -116,6 +133,8 @@
void checkpoint_cpu_add(int vcpu);
void checkpoint_cpu_resume(int vcpu);
void checkpoint_cpu_suspend(int vcpu);
+void vm_vcpu_pause(struct vmctx *ctx);
+void vm_vcpu_resume(struct vmctx *ctx);
int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate);
int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate);
diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c
--- a/usr.sbin/bhyve/snapshot.c
+++ b/usr.sbin/bhyve/snapshot.c
@@ -31,6 +31,8 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
+ *
+ * $FreeBSD$
*/
#include <sys/cdefs.h>
@@ -85,6 +87,7 @@
#include "ioapic.h"
#include "mem.h"
#include "mevent.h"
+#include "migration.h"
#include "mptbl.h"
#include "pci_emul.h"
#include "pci_irq.h"
@@ -166,6 +169,24 @@
{ "vrtc", STRUCT_VRTC },
};
+/*
+ * Expose the file-local snapshot_devs table.  When `ndevs` is not NULL it
+ * receives the number of entries in the table.
+ */
+const struct vm_snapshot_dev_info *
+get_snapshot_devs(int *ndevs)
+{
+	if (ndevs == NULL)
+		return (snapshot_devs);
+
+	*ndevs = nitems(snapshot_devs);
+	return (snapshot_devs);
+}
+
+/*
+ * Expose the file-local snapshot_kern_structs table.  When `ndevs` is not
+ * NULL it receives the number of entries in the table.
+ */
+const struct vm_snapshot_kern_info *
+get_snapshot_kern_structs(int *ndevs)
+{
+	if (ndevs == NULL)
+		return (snapshot_kern_structs);
+
+	*ndevs = nitems(snapshot_kern_structs);
+	return (snapshot_kern_structs);
+}
+
static cpuset_t vcpus_active, vcpus_suspended;
static pthread_mutex_t vcpu_lock;
static pthread_cond_t vcpus_idle, vcpus_can_run;
@@ -1301,7 +1322,7 @@
pthread_mutex_unlock(&vcpu_lock);
}
-static void
+void
vm_vcpu_pause(struct vmctx *ctx)
{
@@ -1313,7 +1334,7 @@
pthread_mutex_unlock(&vcpu_lock);
}
-static void
+void
vm_vcpu_resume(struct vmctx *ctx)
{
@@ -1444,6 +1465,8 @@
handle_message(struct ipc_message *imsg, struct vmctx *ctx)
{
int err;
+ struct migrate_req req;
+ int memflags;
switch (imsg->code) {
case START_CHECKPOINT:
@@ -1451,6 +1474,51 @@
break;
case START_SUSPEND:
err = vm_checkpoint(ctx, imsg->data.op.snapshot_filename, true);
+ break;
+ case START_MIGRATE:
+ fprintf(stdout, "Starting the warm migration procedure\r\n");
+ memset(&req, 0, sizeof(struct migrate_req));
+ req.port = imsg->data.op.migrate_req.port;
+ memcpy(req.host, imsg->data.op.migrate_req.host, MAX_HOSTNAME_LEN);
+ req.host[MAX_HOSTNAME_LEN - 1] = 0;
+ fprintf(stderr, "%s: IP address used for migration: %s;\r\n"
+ "Port used for migration: %d\r\n",
+ __func__,
+ req.host,
+ req.port);
+
+ err = vm_send_migrate_req(ctx, req, false);
+ break;
+ case START_MIGRATE_LIVE:
+ fprintf(stdout, "Starting the live migration procedure\r\n");
+
+ /* Currently, the live migration is implemented only
+ * for guests that are started using -S (wired
+ * memory option).
+ */
+
+ /* Check memflags. If the VM_MEM_F_WIRED bit is not
+ * set, then the live migration procedure cannot be
+ * done. */
+ memflags = vm_get_memflags(ctx);
+ if (!(memflags & VM_MEM_F_WIRED)) {
+ fprintf(stderr, "%s: Migration not supported for un-wired guests\r\n", __func__);
+ err = -1;
+ break;
+ }
+
+ memset(&req, 0, sizeof(struct migrate_req));
+ req.port = imsg->data.op.migrate_req.port;
+ memcpy(req.host, imsg->data.op.migrate_req.host, MAX_HOSTNAME_LEN);
+ req.host[MAX_HOSTNAME_LEN - 1] = 0;
+ fprintf(stderr, "%s: IP address used for migration: %s;\r\n"
+ "Port used for migration: %d\r\n",
+ __func__,
+ req.host,
+ req.port);
+
+ err = vm_send_migrate_req(ctx, req, true);
+
break;
default:
EPRINTLN("Unrecognized checkpoint operation\n");
diff --git a/usr.sbin/bhyvectl/bhyvectl.c b/usr.sbin/bhyvectl/bhyvectl.c
--- a/usr.sbin/bhyvectl/bhyvectl.c
+++ b/usr.sbin/bhyvectl/bhyvectl.c
@@ -90,6 +90,8 @@
#ifdef BHYVE_SNAPSHOT
" [--checkpoint=<filename>]\n"
" [--suspend=<filename>]\n"
+ " [--migrate=<host>,<port>]\n"
+ " [--migrate-live=<host,port>]\n"
#endif
" [--get-all]\n"
" [--get-stats]\n"
@@ -303,6 +305,8 @@
#ifdef BHYVE_SNAPSHOT
static int vm_checkpoint_opt;
static int vm_suspend_opt;
+static int vm_migrate;
+static int vm_migrate_live;
#endif
/*
@@ -594,6 +598,8 @@
#ifdef BHYVE_SNAPSHOT
SET_CHECKPOINT_FILE,
SET_SUSPEND_FILE,
+ MIGRATE_VM,
+ MIGRATE_VM_LIVE,
#endif
};
@@ -1466,6 +1472,8 @@
#ifdef BHYVE_SNAPSHOT
{ "checkpoint", REQ_ARG, 0, SET_CHECKPOINT_FILE},
{ "suspend", REQ_ARG, 0, SET_SUSPEND_FILE},
+ { "migrate", REQ_ARG, 0, MIGRATE_VM},
+ { "migrate-live", REQ_ARG, 0, MIGRATE_VM_LIVE},
#endif
};
@@ -1736,6 +1744,49 @@
return (send_message(ctx, (void *)&imsg, length));
}
+
+/*
+ * Build a START_MIGRATE or START_MIGRATE_LIVE ipc_message and pass it to
+ * send_message().  migrate_vm is "<host>[,<port>]"; the port defaults to
+ * DEFAULT_MIGRATION_PORT.  Returns -1 if the destination is malformed.
+ */
+static int
+send_start_migrate(struct vmctx *ctx, const char *migrate_vm, bool live)
+{
+	struct ipc_message imsg;
+	const char *portstr;
+	char *end;
+	size_t hostlen, length;
+	long port;
+
+	/* Zero the whole message so no uninitialized stack bytes are sent. */
+	memset(&imsg, 0, sizeof(imsg));
+	imsg.code = live ? START_MIGRATE_LIVE : START_MIGRATE;
+
+	/* The host is everything before the first ',' (or the whole string). */
+	hostlen = strcspn(migrate_vm, ",");
+	if (hostlen == 0 || hostlen >= MAX_HOSTNAME_LEN) {
+		fprintf(stderr, "Invalid migration host\r\n");
+		return (-1);
+	}
+	memcpy(imsg.data.op.migrate_req.host, migrate_vm, hostlen);
+
+	if (migrate_vm[hostlen] == ',') {
+		/* Reject empty, non-numeric and out-of-range ports. */
+		portstr = migrate_vm + hostlen + 1;
+		port = strtol(portstr, &end, 10);
+		if (end == portstr || *end != '\0' ||
+		    port < 1 || port > 65535) {
+			fprintf(stderr, "Could not parse the port\r\n");
+			return (-1);
+		}
+		imsg.data.op.migrate_req.port = (int)port;
+	} else {
+		imsg.data.op.migrate_req.port = DEFAULT_MIGRATION_PORT;
+	}
+	length = offsetof(struct ipc_message, data) + sizeof(imsg.data.op);
+	return (send_message(ctx, (void *)&imsg, length));
+}
#endif
int
@@ -1755,7 +1806,7 @@
struct tm tm;
struct option *opts;
#ifdef BHYVE_SNAPSHOT
- char *checkpoint_file, *suspend_file;
+ char *checkpoint_file, *suspend_file, *migrate_host;
#endif
cpu_intel = cpu_vendor_intel();
@@ -1924,6 +1975,14 @@
vm_suspend_opt = 1;
suspend_file = optarg;
break;
+ case MIGRATE_VM:
+ vm_migrate = 1;
+ migrate_host = optarg;
+ break;
+ case MIGRATE_VM_LIVE:
+ vm_migrate_live = 1;
+ migrate_host = optarg;
+ break;
#endif
default:
usage(cpu_intel);
@@ -2402,6 +2461,12 @@
if (!error && vm_suspend_opt)
error = snapshot_request(ctx, suspend_file, START_SUSPEND);
+
+ if (!error && vm_migrate)
+ error = send_start_migrate(ctx, migrate_host, false);
+
+ if (!error && vm_migrate_live)
+ error = send_start_migrate(ctx, migrate_host, true);
#endif
free (opts);
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mon, Jun 29, 10:43 AM (10 h, 9 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
34457398
Default Alt Text
D30954.diff (66 KB)
Attached To
Mode
D30954: Live Migration feature for bhyve
Attached
Detach File
Event Timeline
Log In to Comment