diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -268,6 +268,12 @@
 int vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta);
 int vm_restore_time(struct vmctx *ctx);
 
+/*
+ * Live migration
+ */
+int vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num_pages,
+    bool is_all_dirty);
+
 /*
  * Deprecated interfaces, do not use them in new code.
  */
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -1734,6 +1734,28 @@
 	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
 }
 
+int
+vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num_pages, bool is_all_dirty)
+{
+	struct vm_get_dirty_page_list list;
+
+	bzero(&list, sizeof(struct vm_get_dirty_page_list));
+	list.page_list = (uint8_t *)page_list;
+	list.num_pages = num_pages;
+	list.is_all_dirty = is_all_dirty;
+
+	list.lowmem_start = 0;
+	list.lowmem_end = ctx->lowmem;
+	list.highmem_start = ctx->highmem != 0 ? 4 * GB : -1;
+	list.highmem_end = ctx->highmem != 0 ? 4 * GB + ctx->highmem : -1;
+
+	madvise(ctx->baseaddr, ctx->lowmem, MADV_WILLNEED);
+	if (ctx->highmem != 0)
+		madvise(ctx->baseaddr + 4 * GB, ctx->highmem, MADV_WILLNEED);
+
+	return (ioctl(ctx->fd, VM_GET_DIRTY_PAGE_LIST, &list));
+}
+
 int
 vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores,
     uint16_t threads, uint16_t maxcpus)
@@ -1788,7 +1810,7 @@
 	VM_SET_INTINFO, VM_GET_INTINFO, VM_RTC_WRITE, VM_RTC_READ,
 	VM_RTC_SETTIME, VM_RTC_GETTIME, VM_RESTART_INSTRUCTION,
 	VM_SET_TOPOLOGY, VM_GET_TOPOLOGY,
-	VM_SNAPSHOT_REQ, VM_RESTORE_TIME
+	VM_SNAPSHOT_REQ, VM_RESTORE_TIME, VM_GET_DIRTY_PAGE_LIST
 };
 
 int
diff --git a/usr.sbin/bhyve/migration.c b/usr.sbin/bhyve/migration.c
--- a/usr.sbin/bhyve/migration.c
+++ b/usr.sbin/bhyve/migration.c
@@ -30,6 +30,9 @@
 #include
 #include
 #include
+#ifdef BHYVE_DEBUG
+#include <time.h>
+#endif
 
 #include "migration.h"
 #include "pci_emul.h"
@@ -742,6 +745,287 @@
 	return (error);
 }
 
+#define MIGRATION_ROUNDS 4
+
+static int
+migrate_segment_pages(int socket, char *baseaddr, char *page_list,
+    size_t nr_pages, enum migration_transfer_req migration_req)
+{
+	size_t start_dirty_page;
+	int rc;
+
+	for (size_t i = 0; i < nr_pages; i++) {
+		if (page_list[i] == 0)
+			continue;
+
+		start_dirty_page = i;
+
+		for (; i < nr_pages; i++) {
+			if (page_list[i] == 0)
+				break;
+		}
+
+		/* Transfer the contiguous range of dirty pages. */
+		rc = migration_transfer_data(socket, baseaddr + start_dirty_page * PAGE_SIZE,
+		    (i - start_dirty_page) * PAGE_SIZE, migration_req);
+		if (rc != 0)
+			return (rc);
+	}
+
+	return (0);
+}
+
+static int
+migrate_pages(struct vmctx *ctx, int socket, char *page_list,
+    size_t page_list_size, enum migration_transfer_req migration_req)
+{
+	size_t lowmem_limit_page, lowmem, highmem;
+	int rc;
+	char *baseaddr;
+
+	if ((migration_req != MIGRATION_SEND_REQ) && (migration_req != MIGRATION_RECV_REQ)) {
+		EPRINTF("Wrong migration transfer request");
+		return (EINVAL);
+	}
+
+	/*
+	 * Transfer the state of the pages (dirty/not dirty) from the source
+	 * host to the destination host. The pages that are dirty will be
+	 * transferred in the next steps.
+	 */
+	rc = migration_transfer_data(socket, page_list, page_list_size, migration_req);
+	if (rc != 0) {
+		EPRINTF("Could not transfer page list");
+		return (rc);
+	}
+
+	vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem, &highmem);
+
+	/* Lowmem segment */
+	rc = migrate_segment_pages(socket, baseaddr,
+	    page_list, lowmem / PAGE_SIZE, migration_req);
+	if (rc != 0) {
+		EPRINTF("Could not migrate the lowmem segment pages");
+		return (rc);
+	}
+
+	/* Highmem segment */
+	if (highmem == 0)
+		return (0);
+
+	lowmem_limit_page = vm_get_lowmem_limit(ctx) / PAGE_SIZE;
+
+	rc = migrate_segment_pages(socket, baseaddr + 4 * GB,
+	    page_list + lowmem_limit_page, highmem / PAGE_SIZE, migration_req);
+	if (rc != 0) {
+		EPRINTF("Could not migrate the highmem segment pages");
+		return (rc);
+	}
+
+	return (0);
+}
+
+static int
+live_migrate_send(struct vmctx *ctx, int socket)
+{
+	int error, i, rc;
+	uint8_t rounds;
+	size_t migration_completed;
+	size_t pages;
+	char *page_list_indexes;
+
+#ifdef BHYVE_DEBUG
+	struct timespec start, now;
+	uint64_t time_diff_ms;
+#endif
+
+	error = 0;
+	page_list_indexes = MAP_FAILED;
+	rounds = MIGRATION_ROUNDS;
+
+	/* Send the number of memory rounds to the destination. */
+	error = migration_transfer_data(socket, &rounds, sizeof(rounds), MIGRATION_SEND_REQ);
+	if (error != 0) {
+		EPRINTF("Could not transfer the number of rounds");
+		goto done;
+	}
+
+	pages = (vm_get_lowmem_size(ctx) + vm_get_highmem_size(ctx)) / PAGE_SIZE;
+
+	/* Allocate the page list index array. */
+	page_list_indexes = mmap(NULL, pages * sizeof(char), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+	if (page_list_indexes == MAP_FAILED) {
+		perror("Page list indexes could not be allocated");
+		error = errno;
+		goto done;
+	}
+
+	if (mlock(page_list_indexes, pages * sizeof(char)) == -1) {
+		perror("Page list indexes could not be locked");
+		error = errno;
+		goto done;
+	}
+
+	for (i = 0; i <= MIGRATION_ROUNDS; i++) {
+		if (i == MIGRATION_ROUNDS) {
+			/* Last round */
+			vm_vcpu_pause(ctx);
+
+			rc = vm_pause_user_devs();
+			if (rc != 0) {
+				EPRINTF("Could not pause devices");
+				error = rc;
+				goto unlock_vm_and_exit;
+			}
+		}
+
+		DPRINTF("Live migration round %d - Start", i);
+#ifdef BHYVE_DEBUG
+		clock_gettime(CLOCK_MONOTONIC, &start);
+#endif
+		memset(page_list_indexes, 0, pages);
+
+		/* Search the dirty pages and populate page_list_indexes. */
+		rc = vm_get_dirty_page_list(ctx, page_list_indexes, pages, i == 0);
+
+		if (rc != 0) {
+			EPRINTF("Could not search for the dirty pages");
+			error = errno;
+			if (i == MIGRATION_ROUNDS)
+				goto unlock_vm_and_exit;
+			else
+				goto done;
+		}
+
+		DPRINTF("Live migration round %d: Finished searching the dirty pages", i);
+
+		error = migrate_pages(ctx, socket, page_list_indexes,
+		    pages, MIGRATION_SEND_REQ);
+		if (error != 0) {
+			EPRINTF("Could not send dirty pages to destination");
+			if (i == MIGRATION_ROUNDS)
+				goto unlock_vm_and_exit;
+			else
+				goto done;
+		}
+
+#ifdef BHYVE_DEBUG
+		clock_gettime(CLOCK_MONOTONIC, &now);
+		time_diff_ms = (now.tv_sec - start.tv_sec) * 1000 +
+		    (now.tv_nsec - start.tv_nsec) / 1000000;
+		DPRINTF("Live migration round %d - Done - %lu ms", i, time_diff_ms);
+#endif
+	}
+
+	/* Send kern data */
+	error = migrate_kern_data(ctx, socket, MIGRATION_SEND_REQ);
+	if (error != 0) {
+		EPRINTF("Could not send kern data to destination");
+		goto unlock_vm_and_exit;
+	}
+
+	/* Send PCI data */
+	error = migrate_devs(socket, MIGRATION_SEND_REQ);
+	if (error != 0) {
+		EPRINTF("Could not send pci devs to destination");
+		goto unlock_vm_and_exit;
+	}
+
+	/* Wait for the migration completed message from the destination. */
+	error = migration_transfer_data(socket, &migration_completed,
+	    sizeof(migration_completed), MIGRATION_RECV_REQ);
+	if ((error != 0) || (migration_completed != MIGRATION_SPECS_OK)) {
+		EPRINTF("Could not receive the migration completed message from remote, or received an error");
+		if (error == 0)
+			error = EINVAL;
+		goto unlock_vm_and_exit;
+	}
+
+#ifdef BHYVE_DEBUG
+	clock_gettime(CLOCK_MONOTONIC, &now);
+	time_diff_ms = (now.tv_sec - start.tv_sec) * 1000 + (now.tv_nsec - start.tv_nsec) / 1000000;
+	DPRINTF("Live migration downtime - %lu ms", time_diff_ms);
+#endif
+
+	vm_destroy(ctx);
+	exit(0);
+
+unlock_vm_and_exit:
+	rc = vm_resume_user_devs();
+	if (rc != 0)
+		EPRINTF("Could not resume devices");
+	vm_vcpu_resume(ctx);
+
+done:
+	if (page_list_indexes != MAP_FAILED) {
+		munmap(page_list_indexes, pages * sizeof(char));
+	}
+	return (error);
+}
+
+static int
+live_migrate_recv(struct vmctx *ctx, int socket)
+{
+	int error, index;
+	uint8_t rounds;
+	size_t lowmem_size, highmem_size, pages;
+	char *baseaddr, *page_list_indexes;
+
+	page_list_indexes = NULL;
+
+	error = migration_transfer_data(socket, &rounds, sizeof(rounds), MIGRATION_RECV_REQ);
+	if (error != 0) {
+		EPRINTF("Could not recv the number of rounds from remote");
+		goto done;
+	}
+
+	/* Compute the memory size and the number of pages. */
+	vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem_size, &highmem_size);
+
+	pages = (lowmem_size + highmem_size) / PAGE_SIZE;
+
+	madvise(baseaddr, lowmem_size, MADV_WILLNEED);
+	if (highmem_size != 0)
+		madvise(baseaddr + 4 * GB, highmem_size, MADV_WILLNEED);
+
+	/* Allocate the page list index array. */
+	page_list_indexes = malloc(pages * sizeof(char));
+	if (page_list_indexes == NULL) {
+		perror("Page list indexes could not be allocated");
+		error = ENOMEM;
+		goto done;
+	}
+
+	/* The following iteration contains the preliminary round in which the
+	 * entire guest memory is migrated to the destination. Then, for
+	 * MIGRATION_ROUNDS - 1 rounds, only the dirtied pages are migrated.
+	 * In the final round, the rest of the pages are migrated. Since the
+	 * vcpus are not started yet, they do not need to be locked, so the
+	 * memory migration can be done in a straightforward manner.
+	 */
+	DPRINTF("Live migration start");
+	for (index = 0; index <= rounds; index++) {
+		DPRINTF("Live migration round %d: Start", index);
+		error = migrate_pages(ctx, socket, page_list_indexes, pages, MIGRATION_RECV_REQ);
+		if (error != 0) {
+			EPRINTF("Could not recv dirty pages from source");
+			goto done;
+		}
+	}
+	DPRINTF("Live migration done");
+
+	madvise(baseaddr, lowmem_size, MADV_NORMAL);
+	if (highmem_size != 0)
+		madvise(baseaddr + 4 * GB, highmem_size, MADV_NORMAL);
+
+	error = 0;
done:
+	if (page_list_indexes != NULL) {
+		free(page_list_indexes);
+	}
+	return (error);
+}
+
 static inline int
 migrate_connections(struct migrate_req req, int *socket_fd,
     enum migration_transfer_req type)
@@ -874,8 +1158,7 @@
 	}
 
 	if (is_live) {
-		EPRINTF("Live migration not implemented");
-		rc = EOPNOTSUPP;
+		rc = live_migrate_send(ctx, s);
 		if (rc != 0)
 			EPRINTF("Could not live migrate the guest's memory");
 		error = rc;
@@ -964,8 +1247,7 @@
 	 * way in which the memory is migrated.
 	 */
 	if (is_live) {
-		EPRINTF("Live migration not implemented");
-		rc = EOPNOTSUPP;
+		rc = live_migrate_recv(ctx, s);
 		if (rc != 0) {
 			EPRINTF("Could not live migrate the guest's memory");
 			goto done;
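
As an illustration of how the new libvmmapi entry point is meant to be consumed (one byte per guest page in page_list, nonzero meaning the page was dirtied), a minimal standalone sketch follows. It is not part of the patch: it assumes the kernel side of the VM_GET_DIRTY_PAGE_LIST ioctl from the rest of this series is loaded, and the VM name "testvm" and the program itself are hypothetical. Link against libvmmapi (-lvmmapi).

/*
 * Sketch only: count the guest pages dirtied since the previous query,
 * using the vm_get_dirty_page_list() call added above.  Assumes the
 * matching vmm(4) ioctl from this series is available; "testvm" is a
 * placeholder VM name.
 */
#include <sys/param.h>
#include <sys/cpuset.h>

#include <machine/vmm.h>

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

int
main(void)
{
	struct vmctx *ctx;
	char *page_list;
	size_t pages, i, dirty;

	ctx = vm_open("testvm");
	if (ctx == NULL) {
		perror("vm_open");
		return (1);
	}

	pages = (vm_get_lowmem_size(ctx) + vm_get_highmem_size(ctx)) / PAGE_SIZE;
	page_list = calloc(pages, sizeof(char));
	if (page_list == NULL)
		return (1);

	/* false: report only pages dirtied since the last call (rounds 1..N). */
	if (vm_get_dirty_page_list(ctx, page_list, pages, false) != 0) {
		perror("vm_get_dirty_page_list");
		return (1);
	}

	dirty = 0;
	for (i = 0; i < pages; i++)
		if (page_list[i] != 0)
			dirty++;
	printf("%zu of %zu guest pages dirty\n", dirty, pages);

	free(page_list);
	return (0);
}

Passing true for is_all_dirty corresponds to the preliminary round in live_migrate_send(), where every guest page is treated as dirty and the full memory is transferred.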