diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
--- a/usr.sbin/bhyve/Makefile
+++ b/usr.sbin/bhyve/Makefile
@@ -94,6 +94,13 @@
 
 LIBADD= vmmapi md nv pthread z util sbuf cam 9p
 
+.if ${MK_CASPER} != "no"
+LIBADD+= casper
+LIBADD+= cap_net
+LIBADD+= cap_sysctl
+CFLAGS+=-DWITH_CASPER
+.endif
+
 .if ${MK_BHYVE_SNAPSHOT} != "no"
 LIBADD+= ucl xo
 .endif
diff --git a/usr.sbin/bhyve/migration.h b/usr.sbin/bhyve/migration.h
--- a/usr.sbin/bhyve/migration.h
+++ b/usr.sbin/bhyve/migration.h
@@ -19,9 +19,48 @@
 
 struct vmctx;
 
+/* Warm Migration */
+#define MAX_DEV_NAME_LEN 64
+
+#define MAX_SPEC_LEN 256
+
+#define MIGRATION_SPECS_OK 0
+#define MIGRATION_SPECS_NOT_OK 1
+
+enum migration_transfer_req {
+	MIGRATION_SEND_REQ = 0,
+	MIGRATION_RECV_REQ = 1
+};
+
+enum message_type {
+	MESSAGE_TYPE_SPECS = 1,
+	MESSAGE_TYPE_METADATA = 2,
+	MESSAGE_TYPE_RAM = 3,
+	MESSAGE_TYPE_KERN = 4,
+	MESSAGE_TYPE_DEV = 5,
+	MESSAGE_TYPE_UNKNOWN = 8,
+};
+
 struct migrate_req {
 	char host[MAXHOSTNAMELEN];
 	unsigned int port;
 };
 
+struct migration_message_type {
+	size_t len;
+	unsigned int type; /* enum message_type */
+	unsigned int req_type; /* enum snapshot_req */
+	char name[MAX_DEV_NAME_LEN];
+} __packed;
+
+struct migration_system_specs {
+	char hw_machine[MAX_SPEC_LEN];
+	char hw_model[MAX_SPEC_LEN];
+	size_t hw_pagesize;
+} __packed;
+
 int receive_vm_migration(struct vmctx *ctx, char *migration_data);
+int vm_send_migrate_req(struct vmctx *ctx, struct migrate_req req, bool live);
+#ifdef WITH_CASPER
+int migration_cap_setup(void);
+#endif
diff --git a/usr.sbin/bhyve/migration.c b/usr.sbin/bhyve/migration.c
--- a/usr.sbin/bhyve/migration.c
+++ b/usr.sbin/bhyve/migration.c
@@ -50,6 +50,11 @@
 	fprintf(stderr, "%s: " FMT "\n", __func__, ##__VA_ARGS__); \
 })
 
+static cap_channel_t *capnet;
+static cap_channel_t *capsysctl;
+
+static int vm_recv_migrate_req(struct vmctx *ctx, struct migrate_req req);
+
 int
 receive_vm_migration(struct vmctx *ctx, char *migration_data)
 {
@@ -89,10 +94,583 @@
 
 	strlcpy(req.host, hostname, MAXHOSTNAMELEN);
 
-	// rc = vm_recv_migrate_req(ctx, req);
-	rc = EOPNOTSUPP;
-	EPRINTF("Migration not implemented yet");
+	rc = vm_recv_migrate_req(ctx, req);
 
 	free(hostname);
 	return (rc);
 }
+
+/*
+ * Fill 'specs' with the local hw.machine, hw.model and hw.pagesize sysctl
+ * values.  The Casper sysctl channel is used when it has been opened,
+ * otherwise sysctlbyname(3) is called directly.
+ *
+ * Returns 0 on success or the non-zero sysctl result on failure.
+ */
+static int
+get_system_specs_for_migration(struct migration_system_specs *specs)
+{
+	size_t len;
+	int pagesize;
+	int rc;
+
+	/*
+	 * The structure is sent verbatim over the migration socket, so clear
+	 * it first: bytes the sysctls do not fill must not carry stack
+	 * garbage to the peer.
+	 */
+	memset(specs, 0, sizeof(*specs));
+
+	/* For sending we use casper function, for recv we can't */
+	len = sizeof(specs->hw_machine);
+	if (capsysctl != NULL)
+		rc = cap_sysctlbyname(capsysctl, "hw.machine", specs->hw_machine, &len, NULL, 0);
+	else
+		rc = sysctlbyname("hw.machine", specs->hw_machine, &len, NULL, 0);
+	if (rc != 0) {
+		perror("Could not retrieve HW_MACHINE specs");
+		return (rc);
+	}
+
+	len = sizeof(specs->hw_model);
+	if (capsysctl != NULL)
+		rc = cap_sysctlbyname(capsysctl, "hw.model", specs->hw_model, &len, NULL, 0);
+	else
+		rc = sysctlbyname("hw.model", specs->hw_model, &len, NULL, 0);
+	if (rc != 0) {
+		perror("Could not retrieve HW_MODEL specs");
+		return (rc);
+	}
+
+	/* hw.pagesize is an int; read it as such and widen afterwards. */
+	len = sizeof(pagesize);
+	if (capsysctl != NULL)
+		rc = cap_sysctlbyname(capsysctl, "hw.pagesize", &pagesize, &len, NULL, 0);
+	else
+		rc = sysctlbyname("hw.pagesize", &pagesize, &len, NULL, 0);
+	if (rc != 0) {
+		perror("Could not retrieve HW_PAGESIZE specs");
+		return (rc);
+	}
+	specs->hw_pagesize = (size_t)pagesize;
+
+	return (0);
+}
+
+/*
+ * Send or receive exactly 'len' bytes of 'msg' on 'socket', looping over
+ * partial transfers.  'req' selects the direction.
+ *
+ * Returns 0 once everything was transferred, EINVAL for an unknown
+ * direction, the errno of a failed send/recv, or ECONNRESET when the
+ * transfer stopped (zero-length result) before 'len' bytes were moved.
+ */
+static int
+migration_transfer_data(int socket, void *msg, size_t len, enum migration_transfer_req req)
+{
+	size_t to_transfer, total_transferred;
+	ssize_t transferred;
+	int error;
+
+	to_transfer = len;
+	total_transferred = 0;
+
+	while (to_transfer > 0) {
+		switch (req) {
+		case MIGRATION_SEND_REQ:
+			transferred = send(socket, (char *) msg + total_transferred,
+			    to_transfer, 0);
+			break;
+		case MIGRATION_RECV_REQ:
+			transferred = recv(socket, (char *) msg + total_transferred,
+			    to_transfer, 0);
+			break;
+		default:
+			DPRINTF("Unknown transfer option");
+			return (EINVAL);
+			break;
+		}
+
+		if (transferred == 0)
+			break;
+		if (transferred < 0) {
+			/* Interrupted by a signal: retry the call. */
+			if (errno == EINTR)
+				continue;
+			/* perror() may change errno; save it first. */
+			error = errno;
+			perror("Error while transfering data");
+			return (error);
+		}
+
+		to_transfer -= transferred;
+		total_transferred += transferred;
+	}
+
+	return (to_transfer == 0 ? 0 : ECONNRESET);
+}
+
+/*
+ * Run the specification handshake on 'socket'.  With MIGRATION_SEND_REQ the
+ * local specs are sent and the verdict is received; with MIGRATION_RECV_REQ
+ * the remote specs are received, compared with the local ones and the
+ * verdict is sent back.
+ *
+ * Returns 0 when the specs were accepted, EINVAL on a mismatch, a wrong
+ * message type or an unknown 'req', or the error of the failing step.
+ */
+static int
+migration_check_specs(int socket, enum migration_transfer_req req)
+{
+	struct migration_system_specs local_specs;
+	struct migration_system_specs remote_specs;
+	struct migration_system_specs transfer_specs;
+	struct migration_message_type msg;
+	enum migration_transfer_req rev_req;
+	size_t response;
+	int rc;
+
+	if ((req != MIGRATION_SEND_REQ) && (req != MIGRATION_RECV_REQ)) {
+		EPRINTF("Unknown option for migration req");
+		return (EINVAL);
+	}
+
+	if (req == MIGRATION_SEND_REQ)
+		rev_req = MIGRATION_RECV_REQ;
+	else
+		rev_req = MIGRATION_SEND_REQ;
+
+	rc = get_system_specs_for_migration(&local_specs);
+	if (rc != 0) {
+		EPRINTF("Could not retrieve local specs");
+		return (rc);
+	}
+
+	/* Do not leak uninitialized stack bytes to the peer. */
+	memset(&msg, 0, sizeof(msg));
+	if (req == MIGRATION_SEND_REQ)
+		msg.type = MESSAGE_TYPE_SPECS;
+
+	rc = migration_transfer_data(socket, &msg, sizeof(msg), req);
+	if (rc != 0) {
+		EPRINTF("Could not send message type");
+		return (rc);
+	}
+
+	if ((req == MIGRATION_RECV_REQ) && (msg.type != MESSAGE_TYPE_SPECS)) {
+		EPRINTF("Wrong message type received from remote");
+		return (EINVAL);
+	}
+
+	/* For the send req, we send the local specs and for the receive req
+	 * we receive the remote specs.
+	 */
+	if (req == MIGRATION_SEND_REQ)
+		transfer_specs = local_specs;
+
+	rc = migration_transfer_data(socket, &transfer_specs, sizeof(transfer_specs), req);
+	if (rc != 0) {
+		EPRINTF("Could not transfer system specs");
+		return (rc);
+	}
+
+	if (req == MIGRATION_RECV_REQ) {
+		remote_specs = transfer_specs;
+		/* The strings come from the network; force termination. */
+		remote_specs.hw_machine[MAX_SPEC_LEN - 1] = '\0';
+		remote_specs.hw_model[MAX_SPEC_LEN - 1] = '\0';
+
+		/* Check specs */
+		response = MIGRATION_SPECS_OK;
+		if ((strncmp(local_specs.hw_model, remote_specs.hw_model, MAX_SPEC_LEN) != 0)
+		    || (strncmp(local_specs.hw_machine, remote_specs.hw_machine, MAX_SPEC_LEN) != 0)
+		    || (local_specs.hw_pagesize != remote_specs.hw_pagesize)
+		    ) {
+			EPRINTF("System specification mismatch");
+			DPRINTF("Local specs vs Remote Specs: \n"
+			    "\tmachine: %s vs %s\n"
+			    "\tmodel: %s vs %s\n"
+			    "\tpagesize: %zu vs %zu\n",
+			    local_specs.hw_machine,
+			    remote_specs.hw_machine,
+			    local_specs.hw_model,
+			    remote_specs.hw_model,
+			    local_specs.hw_pagesize,
+			    remote_specs.hw_pagesize
+			    );
+			response = MIGRATION_SPECS_NOT_OK;
+		}
+	}
+
+	/* The source will receive the result of the checkup (i.e.
+	 * whether the migration is possible or the source and destination
+	 * are incompatible for migration) and the destination will send the
+	 * result of the checkup.
+	 */
+	rc = migration_transfer_data(socket, &response, sizeof(response), rev_req);
+	if (rc != 0) {
+		EPRINTF("Could not transfer response from server");
+		return (rc);
+	}
+
+	if (response == MIGRATION_SPECS_NOT_OK)
+		return (EINVAL);
+
+	printf("%s: System specification accepted\n", __func__);
+
+	return (0);
+
+}
+
+/*
+ * Resolve 'hostname' to an IPv4 address and store it in 'addr', going
+ * through the Casper network channel when it has been opened.
+ *
+ * Returns 0 on success, the getaddrinfo error code if resolution fails, or
+ * EINVAL when the first result is not AF_INET.
+ */
+static int
+get_migration_address(const char *hostname, struct in_addr *addr)
+{
+	struct addrinfo hints, *res;
+	int rc, error = 0;
+
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_family = AF_INET;
+
+	if (capnet != NULL)
+		rc = cap_getaddrinfo(capnet, hostname, NULL, &hints, &res);
+	else
+		rc = getaddrinfo(hostname, NULL, &hints, &res);
+
+	if (rc != 0) {
+		EPRINTF("Could not get address info: %s", gai_strerror(rc));
+		return (rc);
+	}
+
+	switch(res->ai_family) {
+	case AF_INET:
+		addr->s_addr = ((struct sockaddr_in *) res->ai_addr)->sin_addr.s_addr;
+		break;
+	default:
+		EPRINTF("Unknown address family.");
+		error = EINVAL;
+	}
+
+	freeaddrinfo(res);
+	return (error);
+}
+
+/*
+ * Establish the TCP connection used for a migration and store its
+ * descriptor in '*socket_fd'.  MIGRATION_SEND_REQ connects to req.host on
+ * req.port.  MIGRATION_RECV_REQ listens on req.port and accepts connections
+ * until one arrives from the address req.host resolves to.
+ *
+ * Returns 0 on success, otherwise the resolver error or an errno value;
+ * '*socket_fd' is only written on success.
+ */
+static inline int
+migrate_connections(struct migrate_req req, int *socket_fd,
+    enum migration_transfer_req type)
+{
+	int error;
+	int s, con_socket;
+	struct sockaddr_in sa, client_sa;
+	struct in_addr req_addr;
+	socklen_t client_len;
+	int rc;
+
+	rc = get_migration_address(req.host, &req_addr);
+
+	if (rc != 0) {
+		EPRINTF("Invalid address.");
+		DPRINTF("IP address used for migration: %s;\n"
+		    "Port used for migration: %d",
+		    req.host, req.port);
+		return (rc);
+	}
+
+	s = socket(AF_INET, SOCK_STREAM, 0);
+
+	if (s < 0) {
+		error = errno;
+		perror("Could not create socket");
+		return (error);
+	}
+
+	bzero(&sa, sizeof(sa));
+	sa.sin_family = AF_INET;
+	sa.sin_port = htons(req.port);
+
+	switch (type) {
+	case MIGRATION_SEND_REQ:
+		printf("%s: Starting connection to %s on %d port...\n",
+		    __func__, inet_ntoa(req_addr), req.port);
+
+		sa.sin_addr = req_addr;
+		rc = cap_connect(capnet, s, (struct sockaddr *)&sa, sizeof(sa));
+
+		if (rc != 0) {
+			error = errno;
+			perror("Could not connect to the remote host");
+			goto done_close_s;
+		}
+
+		*socket_fd = s;
+		break;
+	case MIGRATION_RECV_REQ:
+		printf("%s: Waiting for connections from %s on %d port...\n",
+		    __func__, inet_ntoa(req_addr), req.port);
+
+		sa.sin_addr.s_addr = htonl(INADDR_ANY);
+
+		rc = bind(s, (struct sockaddr *)&sa, sizeof(sa));
+
+		if (rc != 0) {
+			error = errno;
+			perror("Could not bind");
+			goto done_close_s;
+		}
+
+		rc = listen(s, 1);
+		if (rc != 0) {
+			error = errno;
+			perror("Could not listen");
+			goto done_close_s;
+		}
+
+		while (1) {
+			client_len = sizeof(client_sa);
+			con_socket = accept(s, (struct sockaddr *)&client_sa, &client_len);
+			if (con_socket < 0) {
+				error = errno;
+				EPRINTF("Could not accept connection");
+				goto done_close_s;
+			}
+
+			if (client_sa.sin_addr.s_addr == req_addr.s_addr) {
+				printf("%s: Accepted connection from %s\n", __func__, inet_ntoa(req_addr));
+				break;
+			} else {
+				DPRINTF("Invalid connection from IP: %s", inet_ntoa(client_sa.sin_addr));
+			}
+			close(con_socket);
+		}
+		*socket_fd = con_socket;
+		close(s);
+		break;
+	default:
+		DPRINTF("unknown operation request");
+		error = EINVAL;
+		goto done_close_s;
+	}
+
+	error = 0;
+	goto done;
+
+done_close_s:
+	close(s);
+done:
+	return (error);
+}
+
+/*
+ * Source side of a migration: connect to the destination in 'req', run the
+ * specification handshake, announce whether the migration is live, pause the
+ * vCPUs and user devices and wait for the destination's completion message.
+ * The state transfer itself is not implemented yet, so every path currently
+ * resumes whatever was paused and returns a non-zero error.
+ */
+int
+vm_send_migrate_req(struct vmctx *ctx, struct migrate_req req, bool is_live)
+{
+	int s;
+	int rc, error;
+	size_t migration_completed;
+
+#if !defined(WITHOUT_CAPSICUM) && !defined(WITH_CASPER)
+	EPRINTF("Migration is not possible with Capsicum enabled and without Casper support");
+	return (EOPNOTSUPP);
+#endif
+
+	rc = migrate_connections(req, &s, MIGRATION_SEND_REQ);
+	if (rc != 0) {
+		EPRINTF("Could not create connection");
+		return (rc);
+	}
+
+	rc = migration_check_specs(s, MIGRATION_SEND_REQ);
+	if (rc != 0) {
+		EPRINTF("Error while checking system requirements");
+		error = rc;
+		goto done;
+	}
+
+	rc = migration_transfer_data(s, &is_live, sizeof(is_live), MIGRATION_SEND_REQ);
+	if (rc != 0) {
+		EPRINTF("Could not send migration type");
+		error = rc;
+		goto done;
+	}
+
+	vm_vcpu_pause(ctx);
+
+	rc = vm_pause_user_devs();
+	if (rc != 0) {
+		EPRINTF("Could not pause devices");
+		error = rc;
+		goto unlock_vm_and_exit;
+	}
+
+	rc = migration_transfer_data(s, &migration_completed,
+	    sizeof(migration_completed), MIGRATION_RECV_REQ);
+	if ((rc != 0) || (migration_completed != MIGRATION_SPECS_OK)) {
+		EPRINTF("Could not recv 'migration completed' from remote or received error");
+		error = rc != 0 ? rc : EINVAL;
+		goto unlock_vm_and_exit;
+	}
+
+	EPRINTF("Rest of migration not yet implemented");
+	error = EOPNOTSUPP;
+	goto unlock_vm_and_exit;
+
+	/* NOTE(review): unreachable until the transfer is implemented. */
+	vm_destroy(ctx);
+	exit(0);
+
+unlock_vm_and_exit:
+	rc = vm_resume_user_devs();
+	if (rc != 0)
+		EPRINTF("Could not resume devices");
+	vm_vcpu_resume(ctx);
+
+done:
+	close(s);
+	return (error);
+}
+
+/*
+ * Destination side of a migration: wait for the source named in 'req', run
+ * the specification handshake, receive the live-migration flag and report
+ * completion.  The state transfer itself is not implemented yet.
+ *
+ * Returns the error of the first failing step, or EOPNOTSUPP once the
+ * handshake went through.
+ */
+static int
+vm_recv_migrate_req(struct vmctx __unused *ctx, struct migrate_req req)
+{
+	int s;
+	int rc = 0;
+	bool is_live;
+	size_t migration_completed;
+
+	rc = migrate_connections(req, &s, MIGRATION_RECV_REQ);
+	if (rc != 0) {
+		EPRINTF("Could not create connections");
+		return (rc);
+	}
+
+	rc = migration_check_specs(s, MIGRATION_RECV_REQ);
+	if (rc != 0) {
+		EPRINTF("Error while checking specs");
+		goto done;
+	}
+
+	rc = migration_transfer_data(s, &is_live, sizeof(is_live), MIGRATION_RECV_REQ);
+	if (rc != 0) {
+		EPRINTF("Could not recv migration type");
+		goto done;
+	}
+
+	// fprintf(stdout, "%s: Migration completed\n", __func__);
+
+	migration_completed = MIGRATION_SPECS_OK;
+	rc = migration_transfer_data(s, &migration_completed,
+	    sizeof(migration_completed), MIGRATION_SEND_REQ);
+	if (rc != 0) {
+		EPRINTF("Could not send 'migration completed' to remote");
+		goto done;
+	}
+
+	EPRINTF("Rest of migration not currently implemented");
+	rc = EOPNOTSUPP;
+
+done:
+	close(s);
+	return (rc);
+}
+
+#ifdef WITH_CASPER
+/*
+ * Open the Casper "system.net" and "system.sysctl" services used by the
+ * migration code and limit them: AF_INET name resolution plus connect for
+ * the network service, read access to hw.machine, hw.model and hw.pagesize
+ * for the sysctl service.
+ *
+ * Returns 0 on success, otherwise errno as found after the failure was
+ * reported with warn().
+ */
+int
+migration_cap_setup(void)
+{
+	cap_channel_t *capcas;
+	cap_net_limit_t *limit_net;
+	cap_sysctl_limit_t *limit_sysctl;
+	int familylimit;
+
+	capcas = cap_init();
+	if (capcas == NULL) {
+		warn("Unable to create casper process");
+		return (errno);
+	}
+
+	caph_cache_catpages();
+
+	capnet = cap_service_open(capcas, "system.net");
+	if (capnet == NULL) {
+		warn("Unable to open system.net service");
+		return (errno);
+	}
+
+	capsysctl = cap_service_open(capcas, "system.sysctl");
+	if (capsysctl == NULL) {
+		warn("Unable to open system.sysctl service");
+		return (errno);
+	}
+
+	cap_close(capcas);
+
+	limit_net = cap_net_limit_init(capnet, CAPNET_NAME2ADDR | CAPNET_CONNECT);
+	if (limit_net == NULL) {
+		warn("Unable to create cap_net limits.");
+		return (errno);
+	}
+
+	familylimit = AF_INET;
+	cap_net_limit_name2addr_family(limit_net, &familylimit, 1);
+
+	if (cap_net_limit(limit_net) < 0) {
+		warn("Unable to apply cap_net limits.");
+		return (errno);
+	}
+
+	limit_sysctl = cap_sysctl_limit_init(capsysctl);
+	if (limit_sysctl == NULL) {
+		warn("Unable to create cap_sysctl limits.");
+		return (errno);
+	}
+
+	cap_sysctl_limit_name(limit_sysctl, "hw.machine", CAP_SYSCTL_READ);
+	cap_sysctl_limit_name(limit_sysctl, "hw.model", CAP_SYSCTL_READ);
+	cap_sysctl_limit_name(limit_sysctl, "hw.pagesize", CAP_SYSCTL_READ);
+
+	if (cap_sysctl_limit(limit_sysctl) < 0) {
+		warn("Unable to apply cap_sysctl limits.");
+		return (errno);
+	}
+
+	return (0);
+}
+#endif
diff --git a/usr.sbin/bhyve/snapshot.h b/usr.sbin/bhyve/snapshot.h
--- a/usr.sbin/bhyve/snapshot.h
+++ b/usr.sbin/bhyve/snapshot.h
@@ -91,6 +91,8 @@
 void checkpoint_cpu_add(int vcpu);
 void checkpoint_cpu_resume(int vcpu);
 void checkpoint_cpu_suspend(int vcpu);
+void vm_vcpu_pause(struct vmctx *ctx);
+void vm_vcpu_resume(struct vmctx *ctx);
 
 int restore_vm_mem(struct vmctx *ctx, struct restore_state *rstate);
 int vm_restore_kern_structs(struct vmctx *ctx, struct restore_state *rstate);
diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c
--- a/usr.sbin/bhyve/snapshot.c
+++ b/usr.sbin/bhyve/snapshot.c
@@ -1285,7 +1285,7 @@
 	pthread_mutex_unlock(&vcpu_lock);
 }
 
-static void
+void
 vm_vcpu_pause(struct vmctx *ctx)
 {
 
@@ -1297,7 +1297,7 @@
 	pthread_mutex_unlock(&vcpu_lock);
 }
 
-static void
+void
 vm_vcpu_resume(struct vmctx *ctx)
 {
 
@@ -1493,7 +1493,7 @@
 IPC_COMMAND(ipc_cmd_set, checkpoint, vm_do_checkpoint);
 
 static int
-vm_do_migrate(struct vmctx __unused *ctx, const nvlist_t *nvl)
+vm_do_migrate(struct vmctx *ctx, const nvlist_t *nvl)
 {
 	size_t len;
 	struct migrate_req req;
@@ -1520,9 +1520,7 @@
 	    req.host,
 	    req.port);
 
-	// return (vm_send_migrate_req(ctx, req, nvlist_get_bool(nvl, "live")));
-	EPRINTLN("Migration operation not implemented yet\n");
-	return (EOPNOTSUPP);
+	return (vm_send_migrate_req(ctx, req, nvlist_get_bool(nvl, "live")));
 }
 
 IPC_COMMAND(ipc_cmd_set, migrate, vm_do_migrate);
@@ -1592,7 +1590,15 @@
 
 	if (caph_rights_limit(socket_fd, &rights) == -1)
 		errx(EX_OSERR, "Unable to apply rights for sandbox");
+
+#ifdef WITH_CASPER
+	err = migration_cap_setup();
+	if (err != 0) {
+		errx(EX_OSERR, "Unable to setup capabilities for migration");
+	}
 #endif
+#endif
+
 	checkpoint_info = calloc(1, sizeof(*checkpoint_info));
 	checkpoint_info->ctx = ctx;
 	checkpoint_info->socket_fd = socket_fd;