Index: lib/libvmmapi/vmmapi.c =================================================================== --- lib/libvmmapi/vmmapi.c +++ lib/libvmmapi/vmmapi.c @@ -1719,7 +1719,7 @@ VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU, VM_SET_INTINFO, VM_GET_INTINFO, VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME, - VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY }; + VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY, VM_SNAPSHOT_REQ }; if (len == NULL) { cmds = malloc(sizeof(vm_ioctl_cmds)); Index: usr.sbin/bhyve/Makefile =================================================================== --- usr.sbin/bhyve/Makefile +++ usr.sbin/bhyve/Makefile @@ -91,8 +91,8 @@ LIBADD+= casper LIBADD+= cap_pwd LIBADD+= cap_grp -# Temporary disable capsicum, until we integrate checkpoint code with it. -#CFLAGS+=-DWITH_CASPER +LIBADD+= cap_sysctl +CFLAGS+=-DWITH_CASPER .endif .if ${MK_BHYVE_SNAPSHOT} != "no" @@ -121,9 +121,6 @@ .if ${MK_BHYVE_SNAPSHOT} != "no" CFLAGS+= -I${SRCTOP}/contrib/libucl/include -# Temporary disable capsicum, until we integrate checkpoint code with it. 
-CFLAGS+= -DWITHOUT_CAPSICUM - CFLAGS+= -DBHYVE_SNAPSHOT .endif Index: usr.sbin/bhyve/bhyverun.c =================================================================== --- usr.sbin/bhyve/bhyverun.c +++ usr.sbin/bhyve/bhyverun.c @@ -1244,13 +1244,14 @@ restore_file = NULL; #endif + char *ckp_path = NULL; init_config(); set_defaults(); progname = basename(argv[0]); #ifdef BHYVE_SNAPSHOT - optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:K:U:r:"; + optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:K:U:r:t:"; #else optstr = "aehuwxACDHIPSWYk:o:p:G:c:s:m:l:K:U:"; #endif @@ -1302,6 +1303,9 @@ case 'r': restore_file = optarg; break; + case 't': + ckp_path = optarg; + break; #endif case 's': if (strncmp(optarg, "help", strlen(optarg)) == 0) { @@ -1547,22 +1551,12 @@ */ setproctitle("%s", vmname); -#ifndef WITHOUT_CAPSICUM - caph_cache_catpages(); - - if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) - errx(EX_OSERR, "Unable to apply rights for sandbox"); - - if (caph_enter() == -1) - errx(EX_OSERR, "cap_enter() failed"); -#endif - #ifdef BHYVE_SNAPSHOT if (restore_file != NULL) destroy_restore_state(&rstate); /* initialize mutex/cond variables */ - init_snapshot(); + init_snapshot(ckp_path); /* * checkpointing thread for communication with bhyvectl @@ -1574,6 +1568,16 @@ vm_restore_time(ctx); #endif +#ifndef WITHOUT_CAPSICUM + caph_cache_catpages(); + + if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + + if (caph_enter() == -1) + errx(EX_OSERR, "cap_enter() failed"); +#endif + /* * Add CPU 0 */ Index: usr.sbin/bhyve/snapshot.h =================================================================== --- usr.sbin/bhyve/snapshot.h +++ usr.sbin/bhyve/snapshot.h @@ -42,6 +42,11 @@ #include #include +#ifndef WITHOUT_CAPSICUM +#include +#include +#endif + #define BHYVE_RUN_DIR "/var/run/bhyve/" #define MAX_SNAPSHOT_FILENAME PATH_MAX @@ -101,8 +106,8 @@ int get_checkpoint_msg(int conn_fd, struct vmctx *ctx); void 
*checkpoint_thread(void *param); +void init_snapshot(char *ckp_path); int init_checkpoint_thread(struct vmctx *ctx); -void init_snapshot(void); int load_restore_file(const char *filename, struct restore_state *rstate); Index: usr.sbin/bhyve/snapshot.c =================================================================== --- usr.sbin/bhyve/snapshot.c +++ usr.sbin/bhyve/snapshot.c @@ -37,9 +37,7 @@ __FBSDID("$FreeBSD$"); #include -#ifndef WITHOUT_CAPSICUM #include -#endif #include #include #include @@ -49,9 +47,7 @@ #include #include -#ifndef WITHOUT_CAPSICUM #include -#endif #include #include #include @@ -70,9 +66,7 @@ #include #include -#ifndef WITHOUT_CAPSICUM #include -#endif #include #include @@ -95,6 +89,8 @@ #include "spinup_ap.h" #include "rtc.h" +#include +#include #include #include @@ -171,6 +167,9 @@ static pthread_cond_t vcpus_idle, vcpus_can_run; static bool checkpoint_active; +static int cdir_fd = AT_FDCWD; +static cap_channel_t *capsysctl = NULL; + /* * TODO: Harden this function and all of its callers since 'base_str' is a user * provided string. 
@@ -210,18 +209,58 @@ return; } - if (rstate->kdata_map != MAP_FAILED) + if (rstate->kdata_map != MAP_FAILED) { munmap(rstate->kdata_map, rstate->kdata_len); + EPRINTLN("%s: destroying kdata_map", __func__); + } - if (rstate->kdata_fd > 0) + if (rstate->kdata_fd > 0) { close(rstate->kdata_fd); - if (rstate->vmmem_fd > 0) + EPRINTLN("%s: destroying kdata_fd", __func__); + } + if (rstate->vmmem_fd > 0) { close(rstate->vmmem_fd); + EPRINTLN("%s: destroying vmmem_fd", __func__); + } - if (rstate->meta_root_obj != NULL) + if (rstate->meta_root_obj != NULL) { ucl_object_unref(rstate->meta_root_obj); - if (rstate->meta_parser != NULL) + EPRINTLN("%s: destroying meta_root_obj", __func__); + } + if (rstate->meta_parser != NULL) { ucl_parser_free(rstate->meta_parser); + EPRINTLN("%s: destroying meta_parser", __func__); + } +} + +static void +limit_vmmem_rights(int s) +{ + cap_rights_t rights; + + cap_rights_init(&rights, CAP_FSTAT, CAP_MMAP_R, CAP_IOCTL, CAP_READ); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +} + +static void +limit_kerneldata_rights(int s) +{ + cap_rights_t rights; + + cap_rights_init(&rights, CAP_FSTAT, CAP_MMAP_R, CAP_READ); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +} + +static void +limit_metadata_rights(int s) +{ + cap_rights_t rights; + + cap_rights_init(&rights, CAP_FSTAT, CAP_MMAP_R, CAP_READ); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); } static int @@ -236,6 +275,8 @@ return (-1); } + limit_vmmem_rights(rstate->vmmem_fd); + err = fstat(rstate->vmmem_fd, &sb); if (err < 0) { perror("Failed to stat restore file"); @@ -269,6 +310,8 @@ return (-1); } + limit_kerneldata_rights(rstate->kdata_fd); + err = fstat(rstate->kdata_fd, &sb); if (err < 0) { perror("Failed to stat kernel data file"); @@ -301,15 +344,24 @@ { const ucl_object_t *obj; struct ucl_parser *parser; + int md_fd = -1; int
err; + md_fd = open(filename, O_RDONLY); + if (md_fd < 0) { + perror("Failed to open metadata snapshot file."); + return (-1); + } + parser = ucl_parser_new(UCL_PARSER_DEFAULT); if (parser == NULL) { fprintf(stderr, "Failed to initialize UCL parser.\n"); goto err_load_metadata; } - err = ucl_parser_add_file(parser, filename); + limit_metadata_rights(md_fd); + + err = ucl_parser_add_fd(parser, md_fd); if (err == 0) { fprintf(stderr, "Failed to parse metadata file: '%s'\n", filename); @@ -330,6 +382,8 @@ return (0); err_load_metadata: + if (md_fd > 0) + close(md_fd); if (parser != NULL) ucl_parser_free(parser); return (err); @@ -1304,10 +1358,16 @@ static void vm_vcpu_pause(struct vmctx *ctx) { + int err; pthread_mutex_lock(&vcpu_lock); checkpoint_active = true; - vm_suspend_cpu(ctx, -1); + err = vm_suspend_cpu(ctx, -1); + if (err != 0) { + EPRINTLN("%s: Could not suspend vcpus", __func__); + pthread_mutex_unlock(&vcpu_lock); + return; + } while (CPU_CMP(&vcpus_active, &vcpus_suspended) != 0) pthread_cond_wait(&vcpus_idle, &vcpu_lock); pthread_mutex_unlock(&vcpu_lock); @@ -1324,10 +1384,42 @@ pthread_cond_broadcast(&vcpus_can_run); } +static int +local_vm_destroy(char *vm) +{ + char *name = "hw.vmm.destroy"; + void *limit; + int err = 0; + int ret = 0; + + /* Create limit for one MIB with write access only. */ + limit = cap_sysctl_limit_init(capsysctl); + (void)cap_sysctl_limit_name(limit, name, CAP_SYSCTL_WRITE); + + /* Limit system.sysctl.
*/ + if (cap_sysctl_limit(limit) < 0) { + EPRINTLN("%s: Unable to set limits", __func__); + ret = -1; + goto done; + } + + err = cap_sysctlbyname(capsysctl, name, NULL, NULL, vm, strlen(vm)); + if (err != 0) + ret = errno; /* save before cap_close() can clobber it */ + cap_close(capsysctl); + if (err != 0) { + EPRINTLN("%s: err is %d\r\n", __func__, ret); + goto done; + } + +done: + return (ret); +} + static int vm_checkpoint(struct vmctx *ctx, const char *checkpoint_file, bool stop_vm) { - int fd_checkpoint = 0, kdata_fd = 0; + int fd_checkpoint = 0, kdata_fd = 0, meta_fd = 0; int ret = 0; int error = 0; size_t memsz; @@ -1335,6 +1427,7 @@ char *meta_filename = NULL; char *kdata_filename = NULL; FILE *meta_file = NULL; + char vmname[MAX_VMNAME]; kdata_filename = strcat_extension(checkpoint_file, ".kern"); if (kdata_filename == NULL) { @@ -1342,15 +1435,14 @@ return (-1); } - kdata_fd = open(kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); + kdata_fd = openat(cdir_fd, kdata_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); if (kdata_fd < 0) { perror("Failed to open kernel data snapshot file."); error = -1; goto done; } - fd_checkpoint = open(checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700); - + fd_checkpoint = openat(cdir_fd, checkpoint_file, O_RDWR | O_CREAT | O_TRUNC, 0700); if (fd_checkpoint < 0) { perror("Failed to create checkpoint file"); error = -1; @@ -1363,7 +1455,13 @@ goto done; } - meta_file = fopen(meta_filename, "w"); + meta_fd = openat(cdir_fd, meta_filename, O_WRONLY | O_CREAT | O_TRUNC, 0700); + if (meta_fd < 0) { + perror("Failed to open vm metadata snapshot file descriptor."); + goto done; + } + + meta_file = fdopen(meta_fd, "w"); if (meta_file == NULL) { perror("Failed to open vm metadata snapshot file."); goto done; @@ -1398,7 +1496,6 @@ goto done; } - ret = vm_snapshot_kern_structs(ctx, kdata_fd, xop); if (ret != 0) { fprintf(stderr, "Failed to snapshot vm kernel data.\n"); @@ -1415,8 +1512,19 @@ xo_finish_h(xop); + if (stop_vm) { - vm_destroy(ctx); + if (capsysctl != NULL) { + error =
vm_get_name(ctx, vmname, MAX_VMNAME - 1); + if (error != 0) { + EPRINTLN("%s: Failed to get VM name.", __func__); + goto done; + } + local_vm_destroy(vmname); + free(ctx); + } else + vm_destroy(ctx); + exit(0); } @@ -1437,10 +1545,12 @@ fclose(meta_file); if (kdata_fd > 0) close(kdata_fd); + if (meta_file == NULL && meta_fd > 0) + close(meta_fd); /* fdopen() never took ownership */ return (error); } -static int +int handle_message(struct vmctx *ctx, nvlist_t *nvl) { int err; @@ -1453,13 +1563,13 @@ if (strcmp(cmd, "checkpoint") == 0) { if (!nvlist_exists_string(nvl, "filename") || !nvlist_exists_bool(nvl, "suspend")) - err = -1; + err = -1; else - err = vm_checkpoint(ctx, nvlist_get_string(nvl, "filename"), - nvlist_get_bool(nvl, "suspend")); + err = vm_checkpoint(ctx, nvlist_get_string(nvl, "filename"), + nvlist_get_bool(nvl, "suspend")); } else { - EPRINTLN("Unrecognized checkpoint operation\n"); - err = -1; + EPRINTLN("Unrecognized checkpoint operation\n"); + err = -1; } if (err != 0) @@ -1483,6 +1593,11 @@ for (;;) { nvl = nvlist_recv(thread_info->socket_fd, 0); + + /* + * slight sanity check: see if there's enough data to at + * least determine the type of message. + */ if (nvl != NULL) handle_message(thread_info->ctx, nvl); else @@ -1492,8 +1607,58 @@ return (NULL); } +static void +limit_control_socket(int s) +{ + cap_rights_t rights; + + cap_rights_init(&rights, CAP_BIND, CAP_READ, CAP_GETSOCKOPT); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +} + +static void +limit_file_operations(void) +{ + cap_rights_t rights; + + cap_rights_init(&rights, CAP_LOOKUP, CAP_FTRUNCATE, CAP_PWRITE, CAP_PREAD, CAP_FCNTL, CAP_CREATE); + if (caph_rights_limit(cdir_fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +} + +static void +init_capsicum_info(char *ckp_path) +{ + /* Open capability to Casper.
*/ + cap_channel_t *casper_channel = cap_init(); + if (casper_channel == NULL) + errx(EX_OSERR, "cap_init() failed"); + + /* Create capability to the system.sysctl service with Casper. */ + capsysctl = cap_service_open(casper_channel, "system.sysctl"); + if (capsysctl == NULL) + fprintf(stderr, "%s: Unable to open system.sysctl service\n", __func__); + + cap_close(casper_channel); + + /* + * If the path for the parent directory is not specified then + * the directory where the bhyve command is called will be used + */ + if (ckp_path == NULL) { + ckp_path = "."; + } + + cdir_fd = open(ckp_path, O_RDONLY | O_DIRECTORY); + if (cdir_fd < 0) + err(1, "open snapshot files directory"); + + limit_file_operations(); +} + void -init_snapshot(void) +init_snapshot(char *ckp_path) { int err; @@ -1506,6 +1671,7 @@ err = pthread_cond_init(&vcpus_can_run, NULL); if (err != 0) errc(1, err, "checkpoint cv init (vcpus_can_run)"); + init_capsicum_info(ckp_path); } /* @@ -1519,7 +1685,7 @@ int socket_fd; pthread_t checkpoint_pthread; char vmname_buf[MAX_VMNAME]; - int err; + int err = 0; memset(&addr, 0, sizeof(addr)); @@ -1530,6 +1696,8 @@ goto fail; } + limit_control_socket(socket_fd); + addr.sun_family = AF_UNIX; err = vm_get_name(ctx, vmname_buf, MAX_VMNAME - 1);