diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -262,4 +262,16 @@
 int vm_snapshot_req(struct vm_snapshot_meta *meta);
 int vm_restore_time(struct vmctx *ctx);
+int vm_get_pages_num(struct vmctx *ctx, size_t *lowmem_pages,
+    size_t *highmem_pages);
+int vm_set_vmm_migration_segments(struct vmctx *ctx,
+    struct vmm_migration_segment *lowmem,
+    struct vmm_migration_segment *highmem);
+int vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num);
+
+int vm_copy_vmm_pages(struct vmctx *ctx,
+    struct vmm_migration_pages_req *pages_req);
+int vm_init_vmm_migration_pages_req(struct vmctx *ctx,
+    struct vmm_migration_pages_req *req);
+
 #endif /* _VMMAPI_H_ */
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -1655,6 +1655,116 @@
 	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
 }
 
+int
+vm_get_pages_num(struct vmctx *ctx, size_t *lowmem_pages, size_t *highmem_pages)
+{
+	/* ctx cannot be NULL */
+	if (ctx == NULL)
+		return (-1);
+
+	if (lowmem_pages != NULL)
+		*lowmem_pages = ctx->lowmem / PAGE_SIZE;
+
+	if (highmem_pages != NULL)
+		*highmem_pages = ctx->highmem / PAGE_SIZE;
+
+	return (0);
+}
+
+int
+vm_set_vmm_migration_segments(struct vmctx *ctx,
+    struct vmm_migration_segment *lowmem,
+    struct vmm_migration_segment *highmem)
+{
+
+	if (lowmem != NULL) {
+		lowmem->start = 0;
+		lowmem->end = ctx->lowmem;
+	}
+
+	if (highmem != NULL) {
+		if (ctx->highmem != 0) {
+			highmem->start = 4 * GB;
+			highmem->end = 4 * GB + ctx->highmem;
+		}
+	}
+
+	return (0);
+}
+
+int
+vm_get_dirty_page_list(struct vmctx *ctx, char *page_list, size_t num_pages)
+{
+	int error;
+	struct vm_get_dirty_page_list list;
+
+	bzero(&list, sizeof(struct vm_get_dirty_page_list));
+	list.page_list = (uint8_t *)page_list;
+	list.num_pages = num_pages;
+
+	error = vm_set_vmm_migration_segments(ctx, &(list.lowmem),
+	    &(list.highmem));
+	if (error != 0)
+		return (error);
+
+	error = ioctl(ctx->fd, VM_GET_DIRTY_PAGE_LIST, &list);
+
+	return (error);
+}
+
+int
+vm_copy_vmm_pages(struct vmctx *ctx, struct vmm_migration_pages_req *pages_req)
+{
+	int error;
+	size_t index;
+
+	if (pages_req == NULL)
+		return (-1);
+
+	if (pages_req->pages_required > VMM_PAGE_CHUNK)
+		return (E2BIG);
+
+	for (index = 0; index < pages_req->pages_required; index++) {
+		if (pages_req->pages[index].page == NULL)
+			return (-1);
+
+		if (pages_req->req_type == VMM_GET_PAGES)
+			memset(pages_req->pages[index].page, 0, PAGE_SIZE);
+	}
+
+	error = ioctl(ctx->fd, VM_COPY_VMM_PAGES, pages_req);
+
+	return (error);
+}
+
+int
+vm_init_vmm_migration_pages_req(struct vmctx *ctx,
+    struct vmm_migration_pages_req *req)
+{
+	size_t index;
+	struct vmm_migration_page *page;
+
+	vm_set_vmm_migration_segments(ctx, &(req->lowmem_segment),
+	    &(req->highmem_segment));
+
+	for (index = 0; index < VMM_PAGE_CHUNK; index++) {
+		page = &req->pages[index];
+		page->page = malloc(PAGE_SIZE * sizeof(uint8_t));
+		if (page->page == NULL)
+			goto deallocate_error;
+		memset(page->page, 0, PAGE_SIZE * sizeof(uint8_t));
+	}
+
+	return (0);
+
+deallocate_error:
+	/* Free only the pages that were successfully allocated. */
+	while (index-- > 0) {
+		page = &req->pages[index];
+		free(page->page);
+		page->page = NULL;
+	}
+
+	return (-1);
+}
+
 int
 vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores,
     uint16_t threads, uint16_t maxcpus)
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -36,6 +36,7 @@
 struct vm_snapshot_meta;
 struct vm_get_dirty_page_list;
+struct vmm_migration_pages_req;
 
 #ifdef _KERNEL
 
 SDT_PROVIDER_DECLARE(vmm);
@@ -289,6 +290,7 @@
 int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta);
 int vm_restore_time(struct vm *vm);
 int vm_get_dirty_page_list(struct vm *vm, struct vm_get_dirty_page_list *list);
+int vm_copy_vmm_pages(struct vm *vm, struct vmm_migration_pages_req *pages_req);
 
 #ifdef _SYS__CPUSET_H_
 /*
diff --git a/sys/amd64/include/vmm_dev.h b/sys/amd64/include/vmm_dev.h
--- a/sys/amd64/include/vmm_dev.h
+++ b/sys/amd64/include/vmm_dev.h
@@ -356,6 +356,7 @@
 	IOCNUM_RESTORE_TIME = 115,
 	IOCNUM_VM_GET_DIRTY_PAGE_LIST = 117,
+	IOCNUM_VM_COPY_VMM_PAGES = 118,
 };
 
 #define VM_RUN \
@@ -488,4 +489,6 @@
 	_IOWR('v', IOCNUM_RESTORE_TIME, int)
 #define VM_GET_DIRTY_PAGE_LIST \
 	_IOWR('v', IOCNUM_VM_GET_DIRTY_PAGE_LIST, struct vm_get_dirty_page_list)
+#define VM_COPY_VMM_PAGES \
+	_IOWR('v', IOCNUM_VM_COPY_VMM_PAGES, struct vmm_migration_pages_req)
 #endif
diff --git a/sys/amd64/include/vmm_migration.h b/sys/amd64/include/vmm_migration.h
--- a/sys/amd64/include/vmm_migration.h
+++ b/sys/amd64/include/vmm_migration.h
@@ -32,6 +32,18 @@
 
 #include
 
+#define VMM_PAGE_CHUNK 10
+
+enum migration_req_type {
+	VMM_GET_PAGES = 0,
+	VMM_SET_PAGES = 1,
+};
+
+struct vmm_migration_page {
+	vm_pindex_t pindex;
+	uint8_t *page;
+};
+
 /*
  * A bhyve guest has two memory segments:
  * - lowmem segment: mapped from 0GB to 3GB (which is lowmem_limit)
@@ -43,4 +55,12 @@
 	vm_offset_t end;
 };
 
+struct vmm_migration_pages_req {
+	size_t pages_required;
+	enum migration_req_type req_type;
+	struct vmm_migration_segment lowmem_segment;
+	struct vmm_migration_segment highmem_segment;
+	struct vmm_migration_page pages[VMM_PAGE_CHUNK];
+};
+
 #endif
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -3040,5 +3040,106 @@
 	return (error);
 }
 
+static inline void
+vm_copy_object_pages(vm_object_t lowmem_object, vm_object_t highmem_object,
+    struct vmm_migration_pages_req *page_req)
+{
+	vm_pindex_t pindex;
+	vm_object_t object;
+	struct vmm_migration_page migration_page;
+	size_t page_idx, limit_page;
+	void *dst;
+	size_t pindex_offset;
+	enum migration_req_type req_type;
+
+	req_type = page_req->req_type;
+
+	if (lowmem_object == NULL) {
+		printf("%s: lowmem_object is NULL\r\n", __func__);
+		return;
+	}
+
+	limit_page = 3UL * GB / PAGE_SIZE;
+	for (page_idx = 0; page_idx < page_req->pages_required; page_idx++) {
+		migration_page = page_req->pages[page_idx];
+		pindex = migration_page.pindex;
+		dst = (void *)migration_page.page;
+
+		if (pindex >= limit_page) {
+			if (highmem_object == NULL) {
+				printf("%s: highmem_object is NULL\r\n", __func__);
+				return;
+			}
+			object = highmem_object;
+			pindex_offset = 1UL * GB / PAGE_SIZE;
+		} else {
+			object = lowmem_object;
+			pindex_offset = 0;
+		}
+
+		if (req_type == VMM_GET_PAGES) {
+			VM_OBJECT_WLOCK(object);
+			vm_object_get_page(object, pindex + pindex_offset, dst);
+			VM_OBJECT_WUNLOCK(object);
+		} else if (req_type == VMM_SET_PAGES) {
+			VM_OBJECT_WLOCK(object);
+			vm_object_set_page(object, pindex + pindex_offset, dst);
+			VM_OBJECT_WUNLOCK(object);
+		} else {
+			return;
+		}
+	}
+}
+
+int
+vm_copy_vmm_pages(struct vm *vm, struct vmm_migration_pages_req *pages_req)
+{
+	int error = 0;
+	struct vmspace *vm_vmspace;
+	struct vm_map *vmmap;
+	struct vm_map_entry *entry;
+	struct vm_object *lowmem_object, *highmem_object, *object;
+	struct vmm_migration_segment lowmem_segment, highmem_segment;
+
+	lowmem_segment = pages_req->lowmem_segment;
+	highmem_segment = pages_req->highmem_segment;
+	vm_vmspace = vm->vmspace;
+
+	if (vm_vmspace == NULL) {
+		printf("%s: vm_vmspace is null\r\n", __func__);
+		error = -1;
+		return (error);
+	}
+
+	vmmap = &vm_vmspace->vm_map;
+
+	vm_map_lock(vmmap);
+	if (vmmap->busy)
+		vm_map_wait_busy(vmmap);
+
+	lowmem_object = NULL;
+	highmem_object = NULL;
+	for (entry = vmmap->header.right; entry != &vmmap->header; entry = entry->right) {
+		object = entry->object.vm_object;
+
+		if (entry->start == lowmem_segment.start &&
+		    entry->end == lowmem_segment.end) {
+			lowmem_object = object;
+		}
+
+		if (entry->start == highmem_segment.start &&
+		    entry->end == highmem_segment.end) {
+			highmem_object = object;
+		}
+	}
+
+	if (lowmem_object == NULL) {
+		vm_map_unlock(vmmap);
+		return (-1);
+	}
+
+	vm_copy_object_pages(lowmem_object, highmem_object, pages_req);
+
+	vm_map_unlock(vmmap);
+
+	return (error);
+}
+
 #endif /* BHYVE_SNAPSHOT */
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -412,6 +412,7 @@
 #ifdef BHYVE_SNAPSHOT
 	struct vm_snapshot_meta *snapshot_meta;
 	struct vm_get_dirty_page_list *page_list;
+	struct vmm_migration_pages_req *pages_req;
 #endif
 
 	error = vmm_priv_check(curthread->td_ucred);
@@ -909,6 +910,10 @@
 		page_list = (struct vm_get_dirty_page_list *)data;
 		error = vm_get_dirty_page_list(sc->vm, page_list);
 		break;
+	case VM_COPY_VMM_PAGES:
+		pages_req = (struct vmm_migration_pages_req *)data;
+		error = vm_copy_vmm_pages(sc->vm, pages_req);
+		break;
 #endif
 	default:
 		error = ENOTTY;
diff --git a/usr.sbin/bhyve/migration.c b/usr.sbin/bhyve/migration.c
--- a/usr.sbin/bhyve/migration.c
+++ b/usr.sbin/bhyve/migration.c
@@ -47,6 +47,7 @@
 #ifndef WITHOUT_CAPSICUM
 #include
 #endif
+#include
 
 #include
 #include
@@ -806,6 +807,360 @@
 	return (error);
 }
 
+#define MIGRATION_ROUNDS	4
+
+static size_t
+num_dirty_pages(char *page_list, size_t size)
+{
+	size_t num = 0;
+	size_t i;
+
+	for (i = 0; i < size; i++)
+		if (page_list[i] == 1)
+			num++;
+
+	return (num);
+}
+
+static int
+migration_fill_vmm_migration_pages_req(struct vmctx *ctx,
+    struct vmm_migration_pages_req *req, char *page_list, size_t size,
+    size_t *current_position)
+{
+	size_t i, count;
+
+	count = 0;
+	for (i = *current_position; i < size; i++) {
+		if (count == VMM_PAGE_CHUNK)
+			break;
+
+		if (page_list[i] == 1) {
+			req->pages[count].pindex = i;
+			count++;
+		}
+	}
+
+	*current_position = i;
+	req->pages_required = count;
+	req->req_type = VMM_GET_PAGES;
+
+	return (vm_copy_vmm_pages(ctx, req));
+}
+
+static int
+migrate_pages(struct vmctx *ctx, int socket, struct vmm_migration_pages_req *req,
+    char *page_list, size_t page_list_size, int already_locked,
+    enum migration_transfer_req migration_req)
+{
+	size_t dirty_pages;
+	size_t current_pos, i, count;
+	int rc;
+
+	if ((migration_req != MIGRATION_SEND_REQ) &&
+	    (migration_req != MIGRATION_RECV_REQ)) {
+		EPRINTF("wrong migration transfer req");
+		return (-1);
+	}
+
+	/*
+	 * Transfer the state of the pages (dirty/not dirty) from the source
+	 * host to the destination host. The pages that are dirty will be
+	 * transferred in the next steps.
+	 */
+	rc = migration_transfer_data(socket, page_list, page_list_size, migration_req);
+	if (rc < 0) {
+		DPRINTF("Could not transfer page_list to remote");
+		return (-1);
+	}
+
+	dirty_pages = num_dirty_pages(page_list, page_list_size);
+
+	current_pos = 0;
+	while (1) {
+		if (current_pos >= page_list_size)
+			break;
+
+		for (i = 0; i < VMM_PAGE_CHUNK; i++)
+			req->pages[i].pindex = -1;
+
+		req->pages_required = 0;
+
+		/* Only the source host pauses the vcpus. */
+		if (migration_req == MIGRATION_SEND_REQ) {
+			if (!already_locked)
+				vm_vcpu_pause(ctx);
+
+			rc = migration_fill_vmm_migration_pages_req(ctx, req,
+			    page_list, page_list_size, &current_pos);
+
+			if (!already_locked)
+				vm_vcpu_resume(ctx);
+
+			if (rc < 0) {
+				DPRINTF("Could not get pages");
+				return (-1);
+			}
+		} else {
+			count = 0;
+			for (i = current_pos; i < page_list_size; i++) {
+				if (count == VMM_PAGE_CHUNK)
+					break;
+
+				if (page_list[i] == 1) {
+					req->pages[count].pindex = i;
+					count++;
+				}
+			}
+
+			current_pos = i;
+			req->pages_required = count;
+		}
+
+		for (i = 0; i < req->pages_required; i++) {
+			rc = migration_transfer_data(socket, req->pages[i].page,
+			    PAGE_SIZE, migration_req);
+			if (rc < 0) {
+				DPRINTF("Could not transfer page %zu", req->pages[i].pindex);
+				return (-1);
+			}
+		}
+
+		if (migration_req == MIGRATION_RECV_REQ) {
+			req->req_type = VMM_SET_PAGES;
+
+			rc = vm_copy_vmm_pages(ctx, req);
+			if (rc < 0) {
+				EPRINTF("Could not copy pages into guest memory");
+				return (-1);
+			}
+		}
+	}
+
+	return (0);
+}
+
+static int
+search_dirty_pages(struct vmctx *ctx, char *page_list)
+{
+	size_t lowmem_pages, highmem_pages, pages;
+	int error;
+
+	if (page_list == NULL)
+		return (-1);
+
+	error = vm_get_pages_num(ctx, &lowmem_pages, &highmem_pages);
+	if (error != 0) {
+		DPRINTF("Error while trying to get the number of guest pages");
+		return (-1);
+	}
+
+	pages = lowmem_pages + highmem_pages;
+	vm_get_dirty_page_list(ctx, page_list, pages);
+
+	return (0);
+}
+
+static inline void
+fill_page_list(char *page_list, size_t list_len, char c)
+{
+	size_t index;
+
+	if (page_list == NULL)
+		return;
+
+	for (index = 0; index < list_len; index++)
+		page_list[index] = c;
+}
+
+static int
+live_migrate_send(struct vmctx *ctx, int socket)
+{
+	int error, i, rc;
+	uint8_t rounds;
+	size_t memory_size, lowmem_size, highmem_size;
+	size_t migration_completed;
+	size_t lowmem_pages, highmem_pages, pages;
+	char *baseaddr, *page_list_indexes;
+	struct vmm_migration_pages_req memory_req;
+
+	error = 0;
+	memory_size = lowmem_size = highmem_size = 0;
+	page_list_indexes = NULL;
+	rounds = MIGRATION_ROUNDS;
+
+	/* Send the number of memory rounds to the destination. */
+	error = migration_transfer_data(socket, &rounds, sizeof(rounds),
+	    MIGRATION_SEND_REQ);
+	if (error != 0) {
+		DPRINTF("Could not send the number of rounds to remote");
+		goto done;
+	}
+
+	/* Compute memory_size and pages. */
+	vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem_size, &highmem_size);
+
+	memory_size = lowmem_size + highmem_size;
+	vm_get_pages_num(ctx, &lowmem_pages, &highmem_pages);
+	pages = lowmem_pages + highmem_pages;
+
+	/* Allocate page_list_indexes. */
+	page_list_indexes = malloc(pages * sizeof(char));
+	if (page_list_indexes == NULL) {
+		perror("Page list indexes could not be allocated");
+		error = -1;
+		goto done;
+	}
+
+	error = vm_init_vmm_migration_pages_req(ctx, &memory_req);
+	if (error < 0) {
+		DPRINTF("Could not initialize struct vmm_migration_pages_req");
+		goto done;
+	}
+
+	for (i = 0; i <= MIGRATION_ROUNDS; i++) {
+		if (i == MIGRATION_ROUNDS) {
+			/* Last round: pause the devices and the vcpus. */
+			rc = vm_pause_user_devs(ctx);
+			if (rc != 0) {
+				DPRINTF("Could not pause devices");
+				error = rc;
+				goto done;
+			}
+
+			vm_vcpu_pause(ctx);
+		}
+
+		if (i == 0) {
+			/* First round: consider all pages dirty. */
+			fill_page_list(page_list_indexes, pages, 1);
+		} else {
+			DPRINTF("ROUND: %d", i);
+			fill_page_list(page_list_indexes, pages, 0);
+
+			if (i != MIGRATION_ROUNDS) {
+				vm_vcpu_pause(ctx);
+			}
+
+			/* Search the dirty pages and populate page_list_indexes. */
+			error = search_dirty_pages(ctx, page_list_indexes);
+			if (error != 0) {
+				DPRINTF("Could not search for the dirty pages");
+				goto unlock_vm_and_exit;
+			}
+
+			if (i != MIGRATION_ROUNDS) {
+				vm_vcpu_resume(ctx);
+			}
+		}
+
+		error = migrate_pages(ctx, socket, &memory_req, page_list_indexes,
+		    pages, i == MIGRATION_ROUNDS ? 1 : 0, MIGRATION_SEND_REQ);
+		if (error != 0) {
+			DPRINTF("Could not send dirty pages to destination");
+			goto done;
+		}
+	}
+
+	/* Send kern data. */
+	error = migrate_kern_data(ctx, socket, MIGRATION_SEND_REQ);
+	if (error != 0) {
+		DPRINTF("Could not send kern data to destination");
+		goto unlock_vm_and_exit;
+	}
+
+	/* Send PCI data. */
+	error = migrate_devs(ctx, socket, MIGRATION_SEND_REQ);
+	if (error != 0) {
+		DPRINTF("Could not send pci devs to destination");
+		goto unlock_vm_and_exit;
+	}
+
+	/* Wait for migration completed. */
+	error = migration_transfer_data(socket, &migration_completed,
+	    sizeof(migration_completed), MIGRATION_RECV_REQ);
+	if ((error < 0) || (migration_completed != MIGRATION_SPECS_OK)) {
+		DPRINTF("Could not recv migration completed from remote or received error");
+		goto unlock_vm_and_exit;
+	}
+
+	vm_destroy(ctx);
+	exit(0);
+
+unlock_vm_and_exit:
+	vm_vcpu_resume(ctx);
+done:
+	rc = vm_resume_user_devs(ctx);
+	if (rc != 0)
+		EPRINTF("Could not resume devices");
+	if (page_list_indexes != NULL)
+		free(page_list_indexes);
+	return (error);
+}
+
+static int
+live_migrate_recv(struct vmctx *ctx, int socket)
+{
+	int error, index;
+	uint8_t rounds;
+	size_t memory_size, lowmem_size, highmem_size;
+	size_t lowmem_pages, highmem_pages, pages;
+	char *baseaddr, *page_list_indexes;
+	struct vmm_migration_pages_req memory_req;
+
+	memory_size = lowmem_size = highmem_size = 0;
+	page_list_indexes = NULL;
+
+	error = migration_transfer_data(socket, &rounds, sizeof(rounds),
+	    MIGRATION_RECV_REQ);
+	if (error != 0) {
+		DPRINTF("Could not recv the number of rounds from remote");
+		goto done;
+	}
+
+	/* Compute memory_size and pages. */
+	vm_get_guestmem_from_ctx(ctx, &baseaddr, &lowmem_size, &highmem_size);
+
+	memory_size = lowmem_size + highmem_size;
+	vm_get_pages_num(ctx, &lowmem_pages, &highmem_pages);
+	pages = lowmem_pages + highmem_pages;
+
+	/* Allocate page_list_indexes. */
+	page_list_indexes = malloc(pages * sizeof(char));
+	if (page_list_indexes == NULL) {
+		perror("Page list indexes could not be allocated");
+		error = -1;
+		goto done;
+	}
+
+	error = vm_init_vmm_migration_pages_req(ctx, &memory_req);
+	if (error < 0) {
+		DPRINTF("Could not initialize struct vmm_migration_pages_req");
+		goto done;
+	}
+
+	/*
+	 * The following iteration contains the preliminary round in which the
+	 * entire memory is migrated to the destination. Then, for
+	 * MIGRATION_ROUNDS - 1 rounds, only the dirtied pages will be migrated.
+	 * In the final round, the rest of the pages are migrated.
+	 * Since the vcpus are not started, we don't need to lock them, so we
+	 * can do the memory migration straightforwardly.
+	 */
+	for (index = 0; index <= rounds; index++) {
+		fill_page_list(page_list_indexes, pages, 0);
+
+		error = migrate_pages(ctx, socket, &memory_req, page_list_indexes,
+		    pages, true, MIGRATION_RECV_REQ);
+		if (error != 0) {
+			DPRINTF("Could not recv dirty pages from source");
+			goto done;
+		}
+	}
+
+	error = 0;
+done:
+	if (page_list_indexes != NULL) {
+		free(page_list_indexes);
+	}
+	return (error);
+}
+
 static inline int
 migrate_connections(struct migrate_req req, int *socket_fd,
     int *connection_socket_fd,
@@ -944,12 +1299,10 @@
 	}
 
 	if (live) {
-		EPRINTF("Live migration not yet implemented");
-		rc = -1;
-		if (rc != 0) {
+		rc = live_migrate_send(ctx, s);
+		if (rc != 0)
 			EPRINTF("Could not live migrate the guest's memory");
-			error = rc;
-		}
+		error = rc;
 		goto done;
 	} // else continue the warm migration procedure
@@ -1038,8 +1391,7 @@
 	 * way in which the memory is migrated.
 	 */
 	if (is_live_migration) {
-		EPRINTF("Live migration not yet implemented");
-		rc = -1;
+		rc = live_migrate_recv(ctx, con_socket);
 		if (rc != 0) {
 			EPRINTF("Could not live migrate the guest's memory");
 			close(con_socket);