diff --git a/Makefile.inc1 b/Makefile.inc1 --- a/Makefile.inc1 +++ b/Makefile.inc1 @@ -2753,7 +2753,7 @@ -DNO_CPU_CFLAGS \ -DNO_PIC \ SSP_CFLAGS= \ - MK_CASPER=no \ + MK_CASPER=yes \ MK_CLANG_EXTRAS=no \ MK_CLANG_FORMAT=no \ MK_CLANG_FULL=no \ diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c --- a/lib/libvmmapi/vmmapi.c +++ b/lib/libvmmapi/vmmapi.c @@ -1694,7 +1694,7 @@ VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU, VM_SET_INTINFO, VM_GET_INTINFO, VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME, - VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY }; + VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY, VM_SNAPSHOT_REQ }; if (len == NULL) { cmds = malloc(sizeof(vm_ioctl_cmds)); diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -90,8 +90,9 @@ LIBADD+= casper LIBADD+= cap_pwd LIBADD+= cap_grp +LIBADD+= cap_sysctl # Temporary disable capsicum, until we integrate checkpoint code with it. -#CFLAGS+=-DWITH_CASPER +CFLAGS+=-DWITH_CASPER .endif .if ${MK_BHYVE_SNAPSHOT} != "no" @@ -120,9 +121,6 @@ .if ${MK_BHYVE_SNAPSHOT} != "no" CFLAGS+= -I${SRCTOP}/contrib/libucl/include -# Temporary disable capsicum, until we integrate checkpoint code with it. -CFLAGS+= -DWITHOUT_CAPSICUM - CFLAGS+= -DBHYVE_SNAPSHOT .endif diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -34,6 +34,8 @@ #include #ifndef WITHOUT_CAPSICUM #include +#include +#include #endif #include #ifdef BHYVE_SNAPSHOT @@ -1202,6 +1204,44 @@ set_config_bool("x86.strictmsr", true); } +#ifndef WITHOUT_CAPSICUM +char *get_ckp_path(char *str) +{ + char *strcopy; + char *aux1, *aux2, *aux3, *aux4, *aux5; + char *path = NULL; + const char delim[2] = ","; + + strcopy = strdup(str); + assert(strcopy != NULL); + + + aux1 = strtok(strcopy, delim); + aux2 = strtok(NULL, delim); + aux3 = strtok(NULL, delim); + if (aux3 != NULL) { + if (strcmp(aux3, "virtio-blk") || + strcmp(aux3, "ahci-hd") || + strcmp(aux3, "ahci")) { + + aux4 = realpath(aux3, NULL); + if (aux4 != NULL) { + aux5 = strrchr(aux4, '/'); + if (aux5 != NULL) { + *aux5 = '\0'; + path = strdup(aux4); + } + free(aux4); + } + } + } + + free(strcopy); + + return path; +} +#endif + int main(int argc, char *argv[]) { @@ -1219,6 +1259,8 @@ restore_file = NULL; #endif + cap_channel_t *capcas = NULL; + char *ckp_path = NULL; init_config(); set_defaults(); @@ -1285,8 +1327,13 @@ exit(0); } else if (pci_parse_slot(optarg) != 0) exit(4); - else + else { +#ifndef WITHOUT_CAPSISCUM + if (ckp_path == NULL) + ckp_path = get_ckp_path(optarg); +#endif break; + } case 'S': set_config_bool("memory.wired", true); break; @@ -1527,13 +1574,10 @@ setproctitle("%s", vmname); #ifndef WITHOUT_CAPSICUM - caph_cache_catpages(); - - if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) - errx(EX_OSERR, "Unable to apply rights for sandbox"); - - if (caph_enter() == -1) - errx(EX_OSERR, "cap_enter() failed"); + /* Open capability to Casper. */ + capcas = cap_init(); + if (capcas == NULL) + errx(EX_OSERR, "cap_init() failed"); #endif #ifdef BHYVE_SNAPSHOT @@ -1543,13 +1587,24 @@ /* * checkpointing thread for communication with bhyvectl */ - if (init_checkpoint_thread(ctx) < 0) + if (init_checkpoint_thread(ctx, ckp_path, capcas) < 0) printf("Failed to start checkpoint thread!\r\n"); if (restore_file != NULL) vm_restore_time(ctx); #endif +#ifndef WITHOUT_CAPSICUM + free(ckp_path); + caph_cache_catpages(); + + if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + + if (caph_enter() == -1) + errx(EX_OSERR, "cap_enter() failed"); +#endif + /* * Add CPU 0 */ diff --git a/usr.sbin/bhyve/snapshot.h b/usr.sbin/bhyve/snapshot.h --- a/usr.sbin/bhyve/snapshot.h +++ b/usr.sbin/bhyve/snapshot.h @@ -42,6 +42,11 @@ #include #include +#ifndef WITHOUT_CAPSICUM +#include +#include +#endif + #define BHYVE_RUN_DIR "/var/run/bhyve/" #define MAX_SNAPSHOT_FILENAME PATH_MAX @@ -88,6 +93,7 @@ struct checkpoint_thread_info { struct vmctx *ctx; int socket_fd; + cap_channel_t *channel; }; typedef int (*vm_snapshot_dev_cb)(struct vm_snapshot_meta *); @@ -126,7 +132,7 @@ int get_checkpoint_msg(int conn_fd, struct vmctx *ctx); void *checkpoint_thread(void *param); -int init_checkpoint_thread(struct vmctx *ctx); +int init_checkpoint_thread(struct vmctx *ctx, char *ckp_path, cap_channel_t *chn); int load_restore_file(const char *filename, struct restore_state *rstate); diff --git a/usr.sbin/bhyve/snapshot.c b/usr.sbin/bhyve/snapshot.c --- a/usr.sbin/bhyve/snapshot.c +++ b/usr.sbin/bhyve/snapshot.c @@ -171,6 +171,7 @@ static pthread_cond_t vcpus_idle, vcpus_can_run; static bool checkpoint_active; +static int cdir_fd = AT_FDCWD; /* * TODO: Harden this function and all of its callers since 'base_str' is a user * provided string. @@ -224,6 +225,38 @@ ucl_parser_free(rstate->meta_parser); } +#ifndef WITHOUT_CAPSICUM +static void +limit_vmmem_socket(int s) +{ + cap_rights_t rights; + + cap_rights_init(&rights, CAP_FSTAT, CAP_MMAP_R, CAP_IOCTL, CAP_READ); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +} + +static void +limit_kernel_socket(int s) +{ + cap_rights_t rights; + + cap_rights_init(&rights, CAP_FSTAT, CAP_MMAP_R, CAP_READ); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +} + +static void +limit_metadata_socket(int s) +{ + cap_rights_t rights; + + cap_rights_init(&rights, CAP_FSTAT, CAP_MMAP_R, CAP_READ); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +} +#endif + static int load_vmmem_file(const char *filename, struct restore_state *rstate) { @@ -236,6 +269,10 @@ return (-1); } +#ifndef WITHOUT_CAPSICUM + limit_vmmem_socket(rstate->vmmem_fd); +#endif + err = fstat(rstate->vmmem_fd, &sb); if (err < 0) { perror("Failed to stat restore file"); @@ -269,6 +306,10 @@ return (-1); } +#ifndef WITHOUT_CAPSICUM + limit_kernel_socket(rstate->kdata_fd); +#endif + err = fstat(rstate->kdata_fd, &sb); if (err < 0) { perror("Failed to stat kernel data file"); @@ -301,6 +342,7 @@ { const ucl_object_t *obj; struct ucl_parser *parser; + int md_fd = -1; int err; parser = ucl_parser_new(UCL_PARSER_DEFAULT); @@ -309,7 +351,13 @@ goto err_load_metadata; } - err = ucl_parser_add_file(parser, filename); + md_fd = open(filename, O_RDONLY); + +#ifndef WITHOUT_CAPSICUM + limit_metadata_socket(md_fd); +#endif + + err = ucl_parser_add_fd(parser, md_fd); if (err == 0) { fprintf(stderr, "Failed to parse metadata file: '%s'\n", filename); @@ -330,6 +378,8 @@ return (0); err_load_metadata: + if (md_fd > 0) + close(md_fd); if (parser != NULL) ucl_parser_free(parser); return (err); @@ -1304,10 +1354,16 @@ static void vm_vcpu_pause(struct vmctx *ctx) { + int err; pthread_mutex_lock(&vcpu_lock); checkpoint_active = true; - vm_suspend_cpu(ctx, -1); + err = vm_suspend_cpu(ctx, -1); + if (err != 0) { + fprintf(stderr, "%s: Could not suspend vcpus\r\n", __func__); + pthread_mutex_unlock(&vcpu_lock); + return; + } while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0) pthread_cond_wait(&vcpus_idle, &vcpu_lock); pthread_mutex_unlock(&vcpu_lock); @@ -1324,10 +1380,42 @@ pthread_cond_broadcast(&vcpus_can_run); } +#ifndef WITHOUT_CAPSICUM +#define DESTROY(vm, ch, err, LABEL) \ +do { \ + cap_channel_t *capsysctl = NULL; \ + char *name = "hw.vmm.destroy"; \ + void *limit; \ + \ + /* Create capability to the system.sysctl service with Casper. */ \ + capsysctl = cap_service_open(ch, "system.sysctl"); \ + if (capsysctl == NULL) \ + fprintf(stderr, "%s: Unable to open system.sysctl service", __func__); \ + \ + cap_close(ch); \ + \ + /* Create limit for one MIB with write access only. */ \ + limit = cap_sysctl_limit_init(capsysctl); \ + (void)cap_sysctl_limit_name(limit, name, CAP_SYSCTL_WRITE); \ + \ + /* Limit system.sysctl. */ \ + if (cap_sysctl_limit(limit) < 0) \ + fprintf(stderr, "%s: Unable to set limits", __func__); \ + \ + err = cap_sysctlbyname(capsysctl, name, NULL, NULL, (vm), strlen((vm))); \ + \ + cap_close(capsysctl); \ + if (err != 0) { \ + fprintf(stderr, "%s: err is %d\r\n", __func__, errno); \ + goto LABEL; \ + } \ +} while(0) +#endif + static int -vm_checkpoint(struct vmctx *ctx, char *checkpoint_file, bool stop_vm) +vm_checkpoint(struct vmctx *ctx, char *checkpoint_file, cap_channel_t *chn, bool stop_vm) { - int fd_checkpoint = 0, kdata_fd = 0; + int fd_checkpoint = 0, kdata_fd = 0, meta_fd = 0; int ret = 0; int error = 0; size_t memsz; @@ -1335,6 +1423,8 @@ char *meta_filename = NULL; char *kdata_filename = NULL; FILE *meta_file = NULL; + char vmname[MAX_VMNAME]; + kdata_filename = strcat_extension(checkpoint_file, ".kern"); if (kdata_filename == NULL) { @@ -1342,15 +1432,14 @@ return (-1); } - kdata_fd = open(kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); + kdata_fd = openat(cdir_fd, kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); if (kdata_fd < 0) { perror("Failed to open kernel data snapshot file."); error = -1; goto done; } - fd_checkpoint = open(checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700); - + fd_checkpoint = openat(cdir_fd, checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700); if (fd_checkpoint < 0) { perror("Failed to create checkpoint file"); error = -1; @@ -1363,7 +1452,12 @@ goto done; } - meta_file = fopen(meta_filename, "w"); + meta_fd = openat(cdir_fd, meta_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); + if (meta_fd < 0) { + perror("Failed to open vm metadata snapshot file descriptor."); + goto done; + } + meta_file = fdopen(meta_fd, "w"); if (meta_file == NULL) { perror("Failed to open vm metadata snapshot file."); goto done; @@ -1382,6 +1476,10 @@ fprintf(stderr, "Could not pause devices\r\n"); error = ret; goto done; +#ifndef WITHOUT_CAPSICUM + if (cdir_fd > 0) + close(cdir_fd); +#endif } memsz = vm_snapshot_mem(ctx, fd_checkpoint, 0, true); @@ -1398,7 +1496,6 @@ goto done; } - ret = vm_snapshot_kern_structs(ctx, kdata_fd, xop); if (ret != 0) { fprintf(stderr, "Failed to snapshot vm kernel data.\n"); @@ -1415,8 +1512,19 @@ xo_finish_h(xop); + if (stop_vm) { - vm_destroy(ctx); + if (chn != NULL) { + error = vm_get_name(ctx, vmname, MAX_VMNAME - 1); + if (error != 0) { + fprintf(stderr, "%s: Failed to get VM name", __func__); + goto done; + } + DESTROY(vmname, chn, error, done); + free(ctx); + } else + vm_destroy(ctx); + exit(0); } @@ -1437,20 +1545,24 @@ fclose(meta_file); if (kdata_fd > 0) close(kdata_fd); +#ifndef WITHOUT_CAPSICUM + if (cdir_fd > 0) + close(cdir_fd); +#endif return (error); } int -handle_message(struct ipc_message *imsg, struct vmctx *ctx) +handle_message(struct ipc_message *imsg, struct vmctx *ctx, cap_channel_t *chn) { int err; switch (imsg->code) { case START_CHECKPOINT: - err = vm_checkpoint(ctx, imsg->data.op.snapshot_filename, false); + err = vm_checkpoint(ctx, imsg->data.op.snapshot_filename, chn, false); break; case START_SUSPEND: - err = vm_checkpoint(ctx, imsg->data.op.snapshot_filename, true); + err = vm_checkpoint(ctx, imsg->data.op.snapshot_filename, chn, true); break; default: EPRINTLN("Unrecognized checkpoint operation\n"); @@ -1484,7 +1596,7 @@ * least determine the type of message. */ if (n >= sizeof(imsg.code)) - handle_message(&imsg, thread_info->ctx); + handle_message(&imsg, thread_info->ctx, thread_info->channel); else EPRINTLN("Failed to receive message: %s\n", n == -1 ? strerror(errno) : "unknown error"); @@ -1493,11 +1605,34 @@ return (NULL); } +#ifndef WITHOUT_CAPSICUM +static void +limit_control_socket(int s) +{ + cap_rights_t rights; + + cap_rights_init(&rights, CAP_BIND, CAP_READ); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +} + +static void +limit_file_operations() +{ + cap_rights_t rights; + + cap_rights_init(&rights, CAP_LOOKUP, CAP_FTRUNCATE, CAP_PWRITE, CAP_PREAD, CAP_FCNTL, CAP_CREATE); + if (caph_rights_limit(cdir_fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +} + +#endif + /* * Create the listening socket for IPC with bhyvectl */ int -init_checkpoint_thread(struct vmctx *ctx) +init_checkpoint_thread(struct vmctx *ctx, char *ckp_path, cap_channel_t *chn) { struct checkpoint_thread_info *checkpoint_info = NULL; struct sockaddr_un addr; @@ -1524,6 +1659,17 @@ goto fail; } + if (ckp_path != NULL) { + cdir_fd = open(ckp_path, O_RDONLY | O_DIRECTORY); + if (cdir_fd < 0) { + perror("Failed to open working directory."); + err = -1; + goto fail; + } + limit_control_socket(socket_fd); + limit_file_operations(); + } + addr.sun_family = AF_UNIX; err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1); @@ -1547,6 +1693,7 @@ checkpoint_info = calloc(1, sizeof(*checkpoint_info)); checkpoint_info->ctx = ctx; checkpoint_info->socket_fd = socket_fd; + checkpoint_info->channel = chn; ret = pthread_create(&checkpoint_pthread, NULL, checkpoint_thread, checkpoint_info);