diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index c1001e6b81c2..aaca670080b8 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -1,848 +1,849 @@ /* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (c) 2020, Felix Dörre * All rights reserved. 
*/ #include #include #include #include #include #include #define PAM_SM_AUTH #define PAM_SM_PASSWORD #define PAM_SM_SESSION #include #if defined(__linux__) #include #define MAP_FLAGS MAP_PRIVATE | MAP_ANONYMOUS #elif defined(__FreeBSD__) #include static void pam_syslog(pam_handle_t *pamh, int loglevel, const char *fmt, ...) { (void) pamh; va_list args; va_start(args, fmt); vsyslog(loglevel, fmt, args); va_end(args); } #define MAP_FLAGS MAP_PRIVATE | MAP_ANON | MAP_NOCORE #endif #include #include #include #include #include #include #include #include static const char PASSWORD_VAR_NAME[] = "pam_zfs_key_authtok"; static libzfs_handle_t *g_zfs; static void destroy_pw(pam_handle_t *pamh, void *data, int errcode); typedef int (*mlock_func_t) (const void *, size_t); typedef struct { size_t len; char *value; } pw_password_t; /* * Try to mlock(2) or munlock(2) addr while handling EAGAIN by retrying ten * times and sleeping 10 milliseconds in between for a total of 0.1 * seconds. lock_func must point to either mlock(2) or munlock(2). */ static int try_lock(mlock_func_t lock_func, const void *addr, size_t len) { int err; int retries = 10; useconds_t sleep_dur = 10 * 1000; if ((err = (*lock_func)(addr, len)) != EAGAIN) { return (err); } for (int i = retries; i > 0; --i) { (void) usleep(sleep_dur); if ((err = (*lock_func)(addr, len)) != EAGAIN) { break; } } return (err); } static pw_password_t * alloc_pw_size(size_t len) { pw_password_t *pw = malloc(sizeof (pw_password_t)); if (!pw) { return (NULL); } pw->len = len; /* * We use mmap(2) rather than malloc(3) since later on we mlock(2) the * memory region. Since mlock(2) and munlock(2) operate on whole memory * pages we should allocate a whole page here as mmap(2) does. Further * this ensures that the addresses passed to mlock(2) an munlock(2) are * on a page boundary as suggested by FreeBSD and required by some * other implementations. 
Finally we avoid inadvertently munlocking * memory mlocked by an concurrently running instance of us. */ pw->value = mmap(NULL, pw->len, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); if (pw->value == MAP_FAILED) { free(pw); return (NULL); } if (try_lock(mlock, pw->value, pw->len) != 0) { (void) munmap(pw->value, pw->len); free(pw); return (NULL); } return (pw); } static pw_password_t * alloc_pw_string(const char *source) { size_t len = strlen(source) + 1; pw_password_t *pw = alloc_pw_size(len); if (!pw) { return (NULL); } memcpy(pw->value, source, pw->len); return (pw); } static void pw_free(pw_password_t *pw) { memset(pw->value, 0, pw->len); if (try_lock(munlock, pw->value, pw->len) == 0) { (void) munmap(pw->value, pw->len); } free(pw); } static pw_password_t * pw_fetch(pam_handle_t *pamh) { const char *token; if (pam_get_authtok(pamh, PAM_AUTHTOK, &token, NULL) != PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "couldn't get password from PAM stack"); return (NULL); } if (!token) { pam_syslog(pamh, LOG_ERR, "token from PAM stack is null"); return (NULL); } return (alloc_pw_string(token)); } static const pw_password_t * pw_fetch_lazy(pam_handle_t *pamh) { pw_password_t *pw = pw_fetch(pamh); if (pw == NULL) { return (NULL); } int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, pw, destroy_pw); if (ret != PAM_SUCCESS) { pw_free(pw); pam_syslog(pamh, LOG_ERR, "pam_set_data failed"); return (NULL); } return (pw); } static const pw_password_t * pw_get(pam_handle_t *pamh) { const pw_password_t *authtok = NULL; int ret = pam_get_data(pamh, PASSWORD_VAR_NAME, (const void**)(&authtok)); if (ret == PAM_SUCCESS) return (authtok); if (ret == PAM_NO_MODULE_DATA) return (pw_fetch_lazy(pamh)); pam_syslog(pamh, LOG_ERR, "password not available"); return (NULL); } static int pw_clear(pam_handle_t *pamh) { int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, NULL, NULL); if (ret != PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "clearing password failed"); return (-1); } return (0); } static void 
destroy_pw(pam_handle_t *pamh, void *data, int errcode) { (void) pamh, (void) errcode; if (data != NULL) { pw_free((pw_password_t *)data); } } static int pam_zfs_init(pam_handle_t *pamh) { int error = 0; if ((g_zfs = libzfs_init()) == NULL) { error = errno; pam_syslog(pamh, LOG_ERR, "Zfs initialization error: %s", libzfs_error_init(error)); } return (error); } static void pam_zfs_free(void) { libzfs_fini(g_zfs); } static pw_password_t * prepare_passphrase(pam_handle_t *pamh, zfs_handle_t *ds, const char *passphrase, nvlist_t *nvlist) { pw_password_t *key = alloc_pw_size(WRAPPING_KEY_LEN); if (!key) { return (NULL); } uint64_t salt; uint64_t iters; if (nvlist != NULL) { int fd = open("/dev/urandom", O_RDONLY); if (fd < 0) { pw_free(key); return (NULL); } int bytes_read = 0; char *buf = (char *)&salt; size_t bytes = sizeof (uint64_t); while (bytes_read < bytes) { ssize_t len = read(fd, buf + bytes_read, bytes - bytes_read); if (len < 0) { close(fd); pw_free(key); return (NULL); } bytes_read += len; } close(fd); if (nvlist_add_uint64(nvlist, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), salt)) { pam_syslog(pamh, LOG_ERR, "failed to add salt to nvlist"); pw_free(key); return (NULL); } iters = DEFAULT_PBKDF2_ITERATIONS; if (nvlist_add_uint64(nvlist, zfs_prop_to_name( ZFS_PROP_PBKDF2_ITERS), iters)) { pam_syslog(pamh, LOG_ERR, "failed to add iters to nvlist"); pw_free(key); return (NULL); } } else { salt = zfs_prop_get_int(ds, ZFS_PROP_PBKDF2_SALT); iters = zfs_prop_get_int(ds, ZFS_PROP_PBKDF2_ITERS); } salt = LE_64(salt); if (!PKCS5_PBKDF2_HMAC_SHA1((char *)passphrase, strlen(passphrase), (uint8_t *)&salt, sizeof (uint64_t), iters, WRAPPING_KEY_LEN, (uint8_t *)key->value)) { pam_syslog(pamh, LOG_ERR, "pbkdf failed"); pw_free(key); return (NULL); } return (key); } static int is_key_loaded(pam_handle_t *pamh, const char *ds_name) { zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); if (ds == NULL) { pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); 
return (-1); } int keystatus = zfs_prop_get_int(ds, ZFS_PROP_KEYSTATUS); zfs_close(ds); return (keystatus != ZFS_KEYSTATUS_UNAVAILABLE); } static int change_key(pam_handle_t *pamh, const char *ds_name, const char *passphrase) { zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); if (ds == NULL) { pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); return (-1); } nvlist_t *nvlist = fnvlist_alloc(); pw_password_t *key = prepare_passphrase(pamh, ds, passphrase, nvlist); if (key == NULL) { nvlist_free(nvlist); zfs_close(ds); return (-1); } if (nvlist_add_string(nvlist, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt")) { pam_syslog(pamh, LOG_ERR, "nvlist_add failed for keylocation"); pw_free(key); nvlist_free(nvlist); zfs_close(ds); return (-1); } if (nvlist_add_uint64(nvlist, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_PASSPHRASE)) { pam_syslog(pamh, LOG_ERR, "nvlist_add failed for keyformat"); pw_free(key); nvlist_free(nvlist); zfs_close(ds); return (-1); } int ret = lzc_change_key(ds_name, DCP_CMD_NEW_KEY, nvlist, (uint8_t *)key->value, WRAPPING_KEY_LEN); pw_free(key); if (ret) { pam_syslog(pamh, LOG_ERR, "change_key failed: %d", ret); nvlist_free(nvlist); zfs_close(ds); return (-1); } nvlist_free(nvlist); zfs_close(ds); return (0); } static int decrypt_mount(pam_handle_t *pamh, const char *ds_name, const char *passphrase) { zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); if (ds == NULL) { pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); return (-1); } pw_password_t *key = prepare_passphrase(pamh, ds, passphrase, NULL); if (key == NULL) { zfs_close(ds); return (-1); } int ret = lzc_load_key(ds_name, B_FALSE, (uint8_t *)key->value, WRAPPING_KEY_LEN); pw_free(key); if (ret) { pam_syslog(pamh, LOG_ERR, "load_key failed: %d", ret); zfs_close(ds); return (-1); } ret = zfs_mount(ds, NULL, 0); if (ret) { pam_syslog(pamh, LOG_ERR, "mount failed: %d", ret); zfs_close(ds); return (-1); } zfs_close(ds); return (0); } 
static int unmount_unload(pam_handle_t *pamh, const char *ds_name) { zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); if (ds == NULL) { pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); return (-1); } int ret = zfs_unmount(ds, NULL, 0); if (ret) { pam_syslog(pamh, LOG_ERR, "zfs_unmount failed with: %d", ret); zfs_close(ds); return (-1); } ret = lzc_unload_key(ds_name); if (ret) { pam_syslog(pamh, LOG_ERR, "unload_key failed with: %d", ret); zfs_close(ds); return (-1); } zfs_close(ds); return (0); } typedef struct { char *homes_prefix; char *runstatedir; char *homedir; char *dsname; uid_t uid; const char *username; int unmount_and_unload; } zfs_key_config_t; static int zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, int argc, const char **argv) { config->homes_prefix = strdup("rpool/home"); if (config->homes_prefix == NULL) { pam_syslog(pamh, LOG_ERR, "strdup failure"); return (-1); } config->runstatedir = strdup(RUNSTATEDIR "/pam_zfs_key"); if (config->runstatedir == NULL) { pam_syslog(pamh, LOG_ERR, "strdup failure"); free(config->homes_prefix); return (-1); } const char *name; if (pam_get_user(pamh, &name, NULL) != PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "couldn't get username from PAM stack"); free(config->runstatedir); free(config->homes_prefix); return (-1); } struct passwd *entry = getpwnam(name); if (!entry) { free(config->runstatedir); free(config->homes_prefix); return (-1); } config->uid = entry->pw_uid; config->username = name; config->unmount_and_unload = 1; config->dsname = NULL; config->homedir = NULL; for (int c = 0; c < argc; c++) { if (strncmp(argv[c], "homes=", 6) == 0) { free(config->homes_prefix); config->homes_prefix = strdup(argv[c] + 6); } else if (strncmp(argv[c], "runstatedir=", 12) == 0) { free(config->runstatedir); config->runstatedir = strdup(argv[c] + 12); } else if (strcmp(argv[c], "nounmount") == 0) { config->unmount_and_unload = 0; } else if (strcmp(argv[c], "prop_mountpoint") == 0) { - 
config->homedir = strdup(entry->pw_dir); + if (config->homedir == NULL) + config->homedir = strdup(entry->pw_dir); } } return (0); } static void zfs_key_config_free(zfs_key_config_t *config) { free(config->homes_prefix); free(config->runstatedir); free(config->homedir); free(config->dsname); } static int find_dsname_by_prop_value(zfs_handle_t *zhp, void *data) { zfs_type_t type = zfs_get_type(zhp); zfs_key_config_t *target = data; char mountpoint[ZFS_MAXPROPLEN]; /* Skip any datasets whose type does not match */ if ((type & ZFS_TYPE_FILESYSTEM) == 0) { zfs_close(zhp); return (0); } /* Skip any datasets whose mountpoint does not match */ (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE); if (strcmp(target->homedir, mountpoint) != 0) { zfs_close(zhp); return (0); } target->dsname = strdup(zfs_get_name(zhp)); zfs_close(zhp); return (1); } static char * zfs_key_config_get_dataset(zfs_key_config_t *config) { if (config->homedir != NULL && config->homes_prefix != NULL) { zfs_handle_t *zhp = zfs_open(g_zfs, config->homes_prefix, ZFS_TYPE_FILESYSTEM); if (zhp == NULL) { pam_syslog(NULL, LOG_ERR, "dataset %s not found", config->homes_prefix); return (NULL); } (void) zfs_iter_filesystems(zhp, find_dsname_by_prop_value, config); zfs_close(zhp); char *dsname = config->dsname; config->dsname = NULL; return (dsname); } if (config->homes_prefix == NULL) { return (NULL); } size_t len = ZFS_MAX_DATASET_NAME_LEN; size_t total_len = strlen(config->homes_prefix) + 1 + strlen(config->username); if (total_len > len) { return (NULL); } char *ret = malloc(len + 1); if (!ret) { return (NULL); } ret[0] = 0; strcat(ret, config->homes_prefix); strcat(ret, "/"); strcat(ret, config->username); return (ret); } static int zfs_key_config_modify_session_counter(pam_handle_t *pamh, zfs_key_config_t *config, int delta) { const char *runtime_path = config->runstatedir; if (mkdir(runtime_path, S_IRWXU) != 0 && errno != EEXIST) { pam_syslog(pamh, 
LOG_ERR, "Can't create runtime path: %d", errno); return (-1); } if (chown(runtime_path, 0, 0) != 0) { pam_syslog(pamh, LOG_ERR, "Can't chown runtime path: %d", errno); return (-1); } if (chmod(runtime_path, S_IRWXU) != 0) { pam_syslog(pamh, LOG_ERR, "Can't chmod runtime path: %d", errno); return (-1); } size_t runtime_path_len = strlen(runtime_path); size_t counter_path_len = runtime_path_len + 1 + 10; char *counter_path = malloc(counter_path_len + 1); if (!counter_path) { return (-1); } counter_path[0] = 0; strcat(counter_path, runtime_path); snprintf(counter_path + runtime_path_len, counter_path_len, "/%d", config->uid); const int fd = open(counter_path, O_RDWR | O_CLOEXEC | O_CREAT | O_NOFOLLOW, S_IRUSR | S_IWUSR); free(counter_path); if (fd < 0) { pam_syslog(pamh, LOG_ERR, "Can't open counter file: %d", errno); return (-1); } if (flock(fd, LOCK_EX) != 0) { pam_syslog(pamh, LOG_ERR, "Can't lock counter file: %d", errno); close(fd); return (-1); } char counter[20]; char *pos = counter; int remaining = sizeof (counter) - 1; int ret; counter[sizeof (counter) - 1] = 0; while (remaining > 0 && (ret = read(fd, pos, remaining)) > 0) { remaining -= ret; pos += ret; } *pos = 0; long int counter_value = strtol(counter, NULL, 10); counter_value += delta; if (counter_value < 0) { counter_value = 0; } lseek(fd, 0, SEEK_SET); if (ftruncate(fd, 0) != 0) { pam_syslog(pamh, LOG_ERR, "Can't truncate counter file: %d", errno); close(fd); return (-1); } snprintf(counter, sizeof (counter), "%ld", counter_value); remaining = strlen(counter); pos = counter; while (remaining > 0 && (ret = write(fd, pos, remaining)) > 0) { remaining -= ret; pos += ret; } close(fd); return (counter_value); } __attribute__((visibility("default"))) PAM_EXTERN int pam_sm_authenticate(pam_handle_t *pamh, int flags, int argc, const char **argv) { (void) flags, (void) argc, (void) argv; if (pw_fetch_lazy(pamh) == NULL) { return (PAM_AUTH_ERR); } return (PAM_SUCCESS); } __attribute__((visibility("default"))) 
PAM_EXTERN int pam_sm_setcred(pam_handle_t *pamh, int flags, int argc, const char **argv) { (void) pamh, (void) flags, (void) argc, (void) argv; return (PAM_SUCCESS); } __attribute__((visibility("default"))) PAM_EXTERN int pam_sm_chauthtok(pam_handle_t *pamh, int flags, int argc, const char **argv) { if (geteuid() != 0) { pam_syslog(pamh, LOG_ERR, "Cannot zfs_mount when not being root."); return (PAM_PERM_DENIED); } zfs_key_config_t config; if (zfs_key_config_load(pamh, &config, argc, argv) == -1) { return (PAM_SERVICE_ERR); } if (config.uid < 1000) { zfs_key_config_free(&config); return (PAM_SUCCESS); } { if (pam_zfs_init(pamh) != 0) { zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } char *dataset = zfs_key_config_get_dataset(&config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } int key_loaded = is_key_loaded(pamh, dataset); if (key_loaded == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } free(dataset); pam_zfs_free(); if (! 
key_loaded) { pam_syslog(pamh, LOG_ERR, "key not loaded, returning try_again"); zfs_key_config_free(&config); return (PAM_PERM_DENIED); } } if ((flags & PAM_UPDATE_AUTHTOK) != 0) { const pw_password_t *token = pw_get(pamh); if (token == NULL) { zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } if (pam_zfs_init(pamh) != 0) { zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } char *dataset = zfs_key_config_get_dataset(&config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } if (change_key(pamh, dataset, token->value) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } free(dataset); pam_zfs_free(); zfs_key_config_free(&config); if (pw_clear(pamh) == -1) { return (PAM_SERVICE_ERR); } } else { zfs_key_config_free(&config); } return (PAM_SUCCESS); } PAM_EXTERN int pam_sm_open_session(pam_handle_t *pamh, int flags, int argc, const char **argv) { (void) flags; if (geteuid() != 0) { pam_syslog(pamh, LOG_ERR, "Cannot zfs_mount when not being root."); return (PAM_SUCCESS); } zfs_key_config_t config; zfs_key_config_load(pamh, &config, argc, argv); if (config.uid < 1000) { zfs_key_config_free(&config); return (PAM_SUCCESS); } int counter = zfs_key_config_modify_session_counter(pamh, &config, 1); if (counter != 1) { zfs_key_config_free(&config); return (PAM_SUCCESS); } const pw_password_t *token = pw_get(pamh); if (token == NULL) { zfs_key_config_free(&config); return (PAM_SESSION_ERR); } if (pam_zfs_init(pamh) != 0) { zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } char *dataset = zfs_key_config_get_dataset(&config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } if (decrypt_mount(pamh, dataset, token->value) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } free(dataset); pam_zfs_free(); zfs_key_config_free(&config); if (pw_clear(pamh) == -1) { return (PAM_SERVICE_ERR); } 
return (PAM_SUCCESS); } __attribute__((visibility("default"))) PAM_EXTERN int pam_sm_close_session(pam_handle_t *pamh, int flags, int argc, const char **argv) { (void) flags; if (geteuid() != 0) { pam_syslog(pamh, LOG_ERR, "Cannot zfs_mount when not being root."); return (PAM_SUCCESS); } zfs_key_config_t config; zfs_key_config_load(pamh, &config, argc, argv); if (config.uid < 1000) { zfs_key_config_free(&config); return (PAM_SUCCESS); } int counter = zfs_key_config_modify_session_counter(pamh, &config, -1); if (counter != 0) { zfs_key_config_free(&config); return (PAM_SUCCESS); } if (config.unmount_and_unload) { if (pam_zfs_init(pamh) != 0) { zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } char *dataset = zfs_key_config_get_dataset(&config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SESSION_ERR); } if (unmount_unload(pamh, dataset) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SESSION_ERR); } free(dataset); pam_zfs_free(); } zfs_key_config_free(&config); return (PAM_SUCCESS); } diff --git a/module/zfs/zcp_get.c b/module/zfs/zcp_get.c index 0a0466d46969..8230a4193662 100644 --- a/module/zfs/zcp_get.c +++ b/module/zfs/zcp_get.c @@ -1,809 +1,810 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2016 by Delphix. All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif static int get_objset_type(dsl_dataset_t *ds, zfs_type_t *type) { int error; objset_t *os; error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); if (ds->ds_is_snapshot) { *type = ZFS_TYPE_SNAPSHOT; } else { switch (os->os_phys->os_type) { case DMU_OST_ZFS: *type = ZFS_TYPE_FILESYSTEM; break; case DMU_OST_ZVOL: *type = ZFS_TYPE_VOLUME; break; default: return (EINVAL); } } return (0); } /* * Returns the string name of ds's type in str (a buffer which should be * at least 12 bytes long). */ static int get_objset_type_name(dsl_dataset_t *ds, char *str) { zfs_type_t type = ZFS_TYPE_INVALID; int error = get_objset_type(ds, &type); if (error != 0) return (error); switch (type) { case ZFS_TYPE_SNAPSHOT: (void) strlcpy(str, "snapshot", ZAP_MAXVALUELEN); break; case ZFS_TYPE_FILESYSTEM: (void) strlcpy(str, "filesystem", ZAP_MAXVALUELEN); break; case ZFS_TYPE_VOLUME: (void) strlcpy(str, "volume", ZAP_MAXVALUELEN); break; default: return (EINVAL); } return (0); } /* * Determines the source of a property given its setpoint and * property type. It pushes the source to the lua stack. */ static void get_prop_src(lua_State *state, const char *setpoint, zfs_prop_t prop) { if (zfs_prop_readonly(prop) || (prop == ZFS_PROP_VERSION)) { lua_pushnil(state); } else { const char *src; if (strcmp("", setpoint) == 0) { src = "default"; } else { src = setpoint; } (void) lua_pushstring(state, src); } } /* * Given an error encountered while getting properties, either longjmp's for * a fatal error or pushes nothing to the stack for a non fatal one. 
*/ static int zcp_handle_error(lua_State *state, const char *dataset_name, const char *property_name, int error) { ASSERT3S(error, !=, 0); if (error == ENOENT) { return (0); } else if (error == EINVAL) { return (luaL_error(state, "property '%s' is not a valid property on dataset '%s'", property_name, dataset_name)); } else if (error == EIO) { return (luaL_error(state, "I/O error while retrieving property '%s' on dataset '%s'", property_name, dataset_name)); } else { return (luaL_error(state, "unexpected error %d while " "retrieving property '%s' on dataset '%s'", error, property_name, dataset_name)); } } /* * Look up a user defined property in the zap object. If it exists, push it * and the setpoint onto the stack, otherwise don't push anything. */ static int zcp_get_user_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, const char *property_name) { int error; char *buf; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * zcp_dataset_hold will either successfully return the requested * dataset or throw a lua error and longjmp out of the zfs.get_prop call * without returning. */ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); if (ds == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); error = dsl_prop_get_ds(ds, property_name, 1, ZAP_MAXVALUELEN, buf, setpoint); dsl_dataset_rele(ds, FTAG); if (error != 0) { kmem_free(buf, ZAP_MAXVALUELEN); return (zcp_handle_error(state, dataset_name, property_name, error)); } (void) lua_pushstring(state, buf); (void) lua_pushstring(state, setpoint); kmem_free(buf, ZAP_MAXVALUELEN); return (2); } /* * Check if the property we're looking for is stored in the ds_dir. If so, * return it in the 'val' argument. Return 0 on success and ENOENT and if * the property is not present. 
*/ static int get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val) { dsl_dir_t *dd = ds->ds_dir; mutex_enter(&dd->dd_lock); switch (zfs_prop) { case ZFS_PROP_USEDSNAP: *val = dsl_dir_get_usedsnap(dd); break; case ZFS_PROP_USEDCHILD: *val = dsl_dir_get_usedchild(dd); break; case ZFS_PROP_USEDDS: *val = dsl_dir_get_usedds(dd); break; case ZFS_PROP_USEDREFRESERV: *val = dsl_dir_get_usedrefreserv(dd); break; case ZFS_PROP_LOGICALUSED: *val = dsl_dir_get_logicalused(dd); break; default: mutex_exit(&dd->dd_lock); return (SET_ERROR(ENOENT)); } mutex_exit(&dd->dd_lock); return (0); } /* * Check if the property we're looking for is stored at the dsl_dataset or * dsl_dir level. If so, push the property value and source onto the lua stack * and return 0. If it is not present or a failure occurs in lookup, return a * non-zero error value. */ static int get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, zfs_prop_t zfs_prop) { int error = 0; objset_t *os; uint64_t numval = 0; char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); char setpoint[ZFS_MAX_DATASET_NAME_LEN] = "Internal error - setpoint not determined"; zfs_type_t ds_type = ZFS_TYPE_INVALID; zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); (void) get_objset_type(ds, &ds_type); switch (zfs_prop) { case ZFS_PROP_REFRATIO: numval = dsl_get_refratio(ds); break; case ZFS_PROP_USED: numval = dsl_get_used(ds); break; case ZFS_PROP_CLONES: { nvlist_t *clones = fnvlist_alloc(); error = get_clones_stat_impl(ds, clones); if (error == 0) { /* push list to lua stack */ VERIFY0(zcp_nvlist_to_lua(state, clones, NULL, 0ULL)); /* source */ (void) lua_pushnil(state); } nvlist_free(clones); kmem_free(strval, ZAP_MAXVALUELEN); return (error); } case ZFS_PROP_COMPRESSRATIO: numval = dsl_get_compressratio(ds); break; case ZFS_PROP_CREATION: numval = dsl_get_creation(ds); break; case ZFS_PROP_REFERENCED: numval = dsl_get_referenced(ds); break; case ZFS_PROP_AVAILABLE: numval = 
dsl_get_available(ds); break; case ZFS_PROP_LOGICALREFERENCED: numval = dsl_get_logicalreferenced(ds); break; case ZFS_PROP_CREATETXG: numval = dsl_get_creationtxg(ds); break; case ZFS_PROP_GUID: numval = dsl_get_guid(ds); break; case ZFS_PROP_UNIQUE: numval = dsl_get_unique(ds); break; case ZFS_PROP_OBJSETID: numval = dsl_get_objsetid(ds); break; case ZFS_PROP_ORIGIN: dsl_dir_get_origin(ds->ds_dir, strval); break; case ZFS_PROP_USERACCOUNTING: error = dmu_objset_from_ds(ds, &os); if (error == 0) numval = dmu_objset_userspace_present(os); break; case ZFS_PROP_WRITTEN: error = dsl_get_written(ds, &numval); break; case ZFS_PROP_TYPE: error = get_objset_type_name(ds, strval); break; case ZFS_PROP_PREV_SNAP: error = dsl_get_prev_snap(ds, strval); break; case ZFS_PROP_NAME: dsl_dataset_name(ds, strval); break; case ZFS_PROP_MOUNTPOINT: error = dsl_get_mountpoint(ds, dsname, strval, setpoint); break; case ZFS_PROP_VERSION: /* should be a snapshot or filesystem */ ASSERT(ds_type != ZFS_TYPE_VOLUME); error = dmu_objset_from_ds(ds, &os); /* look in the master node for the version */ if (error == 0) { error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, sizeof (numval), 1, &numval); } break; case ZFS_PROP_DEFER_DESTROY: numval = dsl_get_defer_destroy(ds); break; case ZFS_PROP_USERREFS: numval = dsl_get_userrefs(ds); break; case ZFS_PROP_FILESYSTEM_COUNT: error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval); (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN); break; case ZFS_PROP_SNAPSHOT_COUNT: error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval); (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN); break; case ZFS_PROP_NUMCLONES: numval = dsl_get_numclones(ds); break; case ZFS_PROP_INCONSISTENT: numval = dsl_get_inconsistent(ds); break; case ZFS_PROP_IVSET_GUID: if (dsl_dataset_is_zapified(ds)) { error = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, DS_FIELD_IVSET_GUID, sizeof (numval), 1, &numval); } else { error = ENOENT; } break; case 
ZFS_PROP_RECEIVE_RESUME_TOKEN: { char *token = get_receive_resume_token(ds); if (token != NULL) { (void) strlcpy(strval, token, ZAP_MAXVALUELEN); kmem_strfree(token); } else { error = ENOENT; } break; } case ZFS_PROP_VOLSIZE: ASSERT(ds_type == ZFS_TYPE_VOLUME || ds_type == ZFS_TYPE_SNAPSHOT); error = dmu_objset_from_ds(ds, &os); if (error == 0) { error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", sizeof (numval), 1, &numval); } if (error == 0) (void) strlcpy(setpoint, dsname, ZFS_MAX_DATASET_NAME_LEN); break; case ZFS_PROP_VOLBLOCKSIZE: { ASSERT(ds_type == ZFS_TYPE_VOLUME); dmu_object_info_t doi; error = dmu_objset_from_ds(ds, &os); if (error == 0) { error = dmu_object_info(os, ZVOL_OBJ, &doi); if (error == 0) numval = doi.doi_data_block_size; } break; } case ZFS_PROP_KEYSTATUS: case ZFS_PROP_KEYFORMAT: { /* provide defaults in case no crypto obj exists */ setpoint[0] = '\0'; if (zfs_prop == ZFS_PROP_KEYSTATUS) numval = ZFS_KEYSTATUS_NONE; else numval = ZFS_KEYFORMAT_NONE; nvlist_t *nvl, *propval; nvl = fnvlist_alloc(); dsl_dataset_crypt_stats(ds, nvl); if (nvlist_lookup_nvlist(nvl, zfs_prop_to_name(zfs_prop), &propval) == 0) { char *source; (void) nvlist_lookup_uint64(propval, ZPROP_VALUE, &numval); if (nvlist_lookup_string(propval, ZPROP_SOURCE, &source) == 0) strlcpy(setpoint, source, sizeof (setpoint)); } nvlist_free(nvl); break; } case ZFS_PROP_SNAPSHOTS_CHANGED: numval = dsl_dir_snap_cmtime(ds->ds_dir).tv_sec; break; default: /* Did not match these props, check in the dsl_dir */ error = get_dsl_dir_prop(ds, zfs_prop, &numval); } if (error != 0) { kmem_free(strval, ZAP_MAXVALUELEN); return (error); } switch (prop_type) { case PROP_TYPE_NUMBER: { (void) lua_pushnumber(state, numval); break; } case PROP_TYPE_STRING: { (void) lua_pushstring(state, strval); break; } case PROP_TYPE_INDEX: { const char *propval; error = zfs_prop_index_to_string(zfs_prop, numval, &propval); if (error != 0) { kmem_free(strval, ZAP_MAXVALUELEN); return (error); } (void) lua_pushstring(state, 
propval); break; } } kmem_free(strval, ZAP_MAXVALUELEN); /* Push the source to the stack */ get_prop_src(state, setpoint, zfs_prop); return (0); } /* * Look up a property and its source in the zap object. If the value is * present and successfully retrieved, push the value and source on the * lua stack and return 0. On failure, return a non-zero error value. */ static int get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) { int error = 0; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); uint64_t numval; const char *prop_name = zfs_prop_to_name(zfs_prop); zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); if (prop_type == PROP_TYPE_STRING) { /* Push value to lua stack */ error = dsl_prop_get_ds(ds, prop_name, 1, ZAP_MAXVALUELEN, strval, setpoint); if (error == 0) (void) lua_pushstring(state, strval); } else { error = dsl_prop_get_ds(ds, prop_name, sizeof (numval), 1, &numval, setpoint); #ifdef _KERNEL /* Fill in temporary value for prop, if applicable */ (void) zfs_get_temporary_prop(ds, zfs_prop, &numval, setpoint); #else + kmem_free(strval, ZAP_MAXVALUELEN); return (luaL_error(state, "temporary properties only supported in kernel mode", prop_name)); #endif /* Push value to lua stack */ if (prop_type == PROP_TYPE_INDEX) { const char *propval; error = zfs_prop_index_to_string(zfs_prop, numval, &propval); if (error == 0) (void) lua_pushstring(state, propval); } else { if (error == 0) (void) lua_pushnumber(state, numval); } } kmem_free(strval, ZAP_MAXVALUELEN); if (error == 0) get_prop_src(state, setpoint, zfs_prop); return (error); } /* * Determine whether property is valid for a given dataset */ boolean_t prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop) { zfs_type_t zfs_type = ZFS_TYPE_INVALID; /* properties not supported */ if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) || (zfs_prop == ZFS_PROP_MOUNTED)) return (B_FALSE); /* if we want the origin prop, ds must be a clone */ if ((zfs_prop == 
ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir))) return (B_FALSE); int error = get_objset_type(ds, &zfs_type); if (error != 0) return (B_FALSE); return (zfs_prop_valid_for_type(zfs_prop, zfs_type, B_FALSE)); } /* * Look up a given dataset property. On success return 2, the number of * values pushed to the lua stack (property value and source). On a fatal * error, longjmp. On a non fatal error push nothing. */ static int zcp_get_system_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, zfs_prop_t zfs_prop) { int error; /* * zcp_dataset_hold will either successfully return the requested * dataset or throw a lua error and longjmp out of the zfs.get_prop call * without returning. */ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); if (ds == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ /* Check that the property is valid for the given dataset */ const char *prop_name = zfs_prop_to_name(zfs_prop); if (!prop_valid_for_ds(ds, zfs_prop)) { dsl_dataset_rele(ds, FTAG); return (0); } /* Check if the property can be accessed directly */ error = get_special_prop(state, ds, dataset_name, zfs_prop); if (error == 0) { dsl_dataset_rele(ds, FTAG); /* The value and source have been pushed by get_special_prop */ return (2); } if (error != ENOENT) { dsl_dataset_rele(ds, FTAG); return (zcp_handle_error(state, dataset_name, prop_name, error)); } /* If we were unable to find it, look in the zap object */ error = get_zap_prop(state, ds, zfs_prop); dsl_dataset_rele(ds, FTAG); if (error != 0) { return (zcp_handle_error(state, dataset_name, prop_name, error)); } /* The value and source have been pushed by get_zap_prop */ return (2); } #ifdef _KERNEL static zfs_userquota_prop_t get_userquota_prop(const char *prop_name) { zfs_userquota_prop_t type; /* Figure out the property type ({user|group}{quota|used}) */ for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { if (strncmp(prop_name, zfs_userquota_prop_prefixes[type], 
/*
 * Split a zfs_userquota_prop name into its components.
 *
 * The property type is derived from the name's prefix and the text after
 * the '@' is parsed either as a numeric SID ("S-1-...-<rid>", yielding a
 * domain string and a rid) or as a plain numeric user/group id (yielding a
 * NULL domain and the id as the rid).
 *
 * Returns 0 on success and EINVAL when the prefix is not a userquota
 * property or the numeric part has trailing characters. On success a
 * non-NULL 'domain' must be free'd by the caller using kmem_strfree().
 */
static int
parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type,
    char **domain, uint64_t *rid)
{
	char *sid_domain = NULL;
	char *idstr, *endp;

	*type = get_userquota_prop(prop_name);
	if (*type >= ZFS_NUM_USERQUOTA_PROPS)
		return (EINVAL);

	*rid = 0;
	idstr = strchr(prop_name, '@') + 1;

	if (strncmp(idstr, "S-1-", 4) == 0) {
		/*
		 * Numeric SID (eg "S-1-234-567-89"): everything before the
		 * last '-' is the domain, the remainder is the rid.
		 */
		int domain_len = strrchr(idstr, '-') - idstr;

		sid_domain = kmem_alloc(domain_len + 1, KM_SLEEP);
		(void) strncpy(sid_domain, idstr, domain_len);
		sid_domain[domain_len] = '\0';
		idstr += domain_len + 1;
	}

	/* Whatever is left must be a complete decimal number. */
	(void) ddi_strtoll(idstr, &endp, 10, (longlong_t *)rid);
	if (*endp != '\0') {
		if (sid_domain != NULL)
			kmem_strfree(sid_domain);
		return (EINVAL);
	}

	*domain = sid_domain;
	return (0);
}
*/ static int zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, const char *prop_name) { zfsvfs_t *zfvp; zfsvfs_t *zfsvfs; int error; zfs_userquota_prop_t type; char *domain; uint64_t rid, value = 0; objset_t *os; dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); if (ds == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ error = parse_userquota_prop(prop_name, &type, &domain, &rid); if (error == 0) { error = dmu_objset_from_ds(ds, &os); if (error == 0) { zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); error = zfsvfs_create_impl(&zfvp, zfsvfs, os); if (error == 0) { error = zfs_userspace_one(zfvp, type, domain, rid, &value); zfsvfs_free(zfvp); } } if (domain != NULL) kmem_strfree(domain); } dsl_dataset_rele(ds, FTAG); if ((value == 0) && ((type == ZFS_PROP_USERQUOTA) || (type == ZFS_PROP_GROUPQUOTA))) error = SET_ERROR(ENOENT); if (error != 0) { return (zcp_handle_error(state, dataset_name, prop_name, error)); } (void) lua_pushnumber(state, value); (void) lua_pushstring(state, dataset_name); return (2); } #endif /* * Determines the name of the snapshot referenced in the written property * name. Returns snapshot name in snap_name, a buffer that must be at least * as large as ZFS_MAX_DATASET_NAME_LEN */ static void parse_written_prop(const char *dataset_name, const char *prop_name, char *snap_name) { ASSERT(zfs_prop_written(prop_name)); const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN; if (strchr(name, '@') == NULL) { (void) snprintf(snap_name, ZFS_MAX_DATASET_NAME_LEN, "%s@%s", dataset_name, name); } else { (void) strlcpy(snap_name, name, ZFS_MAX_DATASET_NAME_LEN); } } /* * Look up written@ property for given dataset. On success * push the value and the setpoint. If error is fatal, we will * longjmp, otherwise push nothing. 
*/ static int zcp_get_written_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name, const char *prop_name) { char snap_name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t used, comp, uncomp; dsl_dataset_t *old; int error = 0; parse_written_prop(dataset_name, prop_name, snap_name); dsl_dataset_t *new = zcp_dataset_hold(state, dp, dataset_name, FTAG); if (new == NULL) return (1); /* not reached; zcp_dataset_hold() longjmp'd */ error = dsl_dataset_hold(dp, snap_name, FTAG, &old); if (error != 0) { dsl_dataset_rele(new, FTAG); return (zcp_dataset_hold_error(state, dp, snap_name, error)); } error = dsl_dataset_space_written(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); if (error != 0) { return (zcp_handle_error(state, dataset_name, snap_name, error)); } (void) lua_pushnumber(state, used); (void) lua_pushstring(state, dataset_name); return (2); } static int zcp_get_prop(lua_State *state); static const zcp_lib_info_t zcp_get_prop_info = { .name = "get_prop", .func = zcp_get_prop, .pargs = { { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, { .za_name = "property", .za_lua_type = LUA_TSTRING }, {NULL, 0} }, .kwargs = { {NULL, 0} } }; static int zcp_get_prop(lua_State *state) { const char *dataset_name; const char *property_name; dsl_pool_t *dp = zcp_run_info(state)->zri_pool; const zcp_lib_info_t *libinfo = &zcp_get_prop_info; zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); dataset_name = lua_tostring(state, 1); property_name = lua_tostring(state, 2); /* User defined property */ if (zfs_prop_user(property_name)) { return (zcp_get_user_prop(state, dp, dataset_name, property_name)); } /* userspace property */ if (zfs_prop_userquota(property_name)) { #ifdef _KERNEL return (zcp_get_userquota_prop(state, dp, dataset_name, property_name)); #else return (luaL_error(state, "user quota properties only supported in kernel mode", property_name)); #endif } /* written@ property */ if 
(zfs_prop_written(property_name)) { return (zcp_get_written_prop(state, dp, dataset_name, property_name)); } zfs_prop_t zfs_prop = zfs_name_to_prop(property_name); /* Valid system property */ if (zfs_prop != ZPROP_INVAL) { return (zcp_get_system_prop(state, dp, dataset_name, zfs_prop)); } /* Invalid property name */ return (luaL_error(state, "'%s' is not a valid property", property_name)); } int zcp_load_get_lib(lua_State *state) { lua_pushcclosure(state, zcp_get_prop_info.func, 0); lua_setfield(state, -2, zcp_get_prop_info.name); return (1); } diff --git a/tests/zfs-tests/cmd/draid.c b/tests/zfs-tests/cmd/draid.c index 869ca902d083..39b58a709cec 100644 --- a/tests/zfs-tests/cmd/draid.c +++ b/tests/zfs-tests/cmd/draid.c @@ -1,1404 +1,1407 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. */ #include #include #include #include #include #include /* * The number of rows to generate for new permutation maps. */ #define MAP_ROWS_DEFAULT 256 /* * Key values for dRAID maps when stored as nvlists. 
*/ #define MAP_SEED "seed" #define MAP_CHECKSUM "checksum" #define MAP_WORST_RATIO "worst_ratio" #define MAP_AVG_RATIO "avg_ratio" #define MAP_CHILDREN "children" #define MAP_NPERMS "nperms" #define MAP_PERMS "perms" static void draid_usage(void) { (void) fprintf(stderr, "usage: draid command args ...\n" "Available commands are:\n" "\n" "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n" "\tdraid verify [-rv] FILE\n" "\tdraid dump [-v] [-m min] [-n max] FILE\n" "\tdraid table FILE\n" "\tdraid merge FILE SRC SRC...\n"); exit(1); } static int read_map(const char *filename, nvlist_t **allcfgs) { int block_size = 131072; int buf_size = 131072; int tmp_size, error; char *tmp_buf; struct stat64 stat; if (lstat64(filename, &stat) != 0) return (errno); if (stat.st_size == 0 || !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) { return (EINVAL); } gzFile fp = gzopen(filename, "rb"); if (fp == Z_NULL) return (errno); char *buf = malloc(buf_size); if (buf == NULL) { (void) gzclose(fp); return (ENOMEM); } ssize_t rc, bytes = 0; while (!gzeof(fp)) { rc = gzread(fp, buf + bytes, block_size); if ((rc < 0) || (rc == 0 && !gzeof(fp))) { free(buf); (void) gzclose(fp); (void) gzerror(fp, &error); return (error); } else { bytes += rc; if (bytes + block_size >= buf_size) { tmp_size = 2 * buf_size; tmp_buf = malloc(tmp_size); if (tmp_buf == NULL) { free(buf); (void) gzclose(fp); return (ENOMEM); } memcpy(tmp_buf, buf, bytes); free(buf); buf = tmp_buf; buf_size = tmp_size; } } } (void) gzclose(fp); error = nvlist_unpack(buf, bytes, allcfgs, 0); free(buf); return (error); } /* * Read a map from the specified filename. A file contains multiple maps * which are indexed by the number of children. The caller is responsible * for freeing the configuration returned. 
/*
 * Write all mappings to the map file.
 *
 * The nvlist is packed (XDR encoding), gzip-compressed into a temporary
 * file created next to 'filename', and then moved into place. Returns 0 on
 * success or an errno value; on any failure after the temporary file was
 * created it is removed again.
 */
static int
write_map(const char *filename, nvlist_t *allcfgs)
{
	size_t buflen = 0;
	size_t bytes = 0;
	char *buf = NULL;
	char *tmpname = NULL;
	gzFile fp;
	int error, fd, len, zrc;

	error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR);
	if (error)
		return (error);

	buf = malloc(buflen);
	if (buf == NULL)
		return (ENOMEM);

	error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
	if (error)
		goto out;

	/*
	 * Atomically update the file using a temporary file and the
	 * traditional unlink then rename steps. This code provides
	 * no locking, it only guarantees the packed nvlist on disk
	 * is updated atomically and is internally consistent.
	 */
	tmpname = calloc(1, MAXPATHLEN);
	if (tmpname == NULL) {
		error = ENOMEM;
		goto out;
	}

	/* A truncated template would lose the XXXXXX suffix mkstemp needs. */
	len = snprintf(tmpname, MAXPATHLEN, "%s.XXXXXX", filename);
	if (len < 0 || len >= MAXPATHLEN) {
		error = ENAMETOOLONG;
		goto out;
	}

	fd = mkstemp(tmpname);
	if (fd < 0) {
		error = errno;
		goto out;
	}
	(void) close(fd);

	/* gzopen() leaves errno alone when zlib itself runs out of memory. */
	errno = 0;
	fp = gzopen(tmpname, "w9b");
	if (fp == Z_NULL) {
		error = (errno != 0) ? errno : ENOMEM;
		goto out_unlink;
	}

	while (bytes < buflen) {
		size_t size = MIN(buflen - bytes, 131072);
		int rc = gzwrite(fp, buf + bytes, (unsigned)size);

		/* gzwrite() reports failure as 0; the short count is EIO. */
		if (rc <= 0)
			break;
		bytes += rc;
	}

	/* Closing flushes the stream, so its failure is a write failure. */
	zrc = gzclose(fp);
	if (bytes != buflen || zrc != Z_OK) {
		error = EIO;
		goto out_unlink;
	}

	/*
	 * Unlink the previous config file and replace it with the updated
	 * version. If we're able to unlink the file then directory is
	 * writable by us and the subsequent rename should never fail.
	 */
	if (unlink(filename) != 0 && errno != ENOENT) {
		error = errno;
		goto out_unlink;
	}

	if (rename(tmpname, filename) != 0) {
		error = errno;
		goto out_unlink;
	}

	error = 0;
	goto out;

out_unlink:
	(void) unlink(tmpname);
out:
	free(buf);
	free(tmpname);
	return (error);
}
*/ nvlist_free(allcfgs); return (EEXIST); } } nvlist_t *cfg = fnvlist_alloc(); fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed); fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum); fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children); fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms); fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms, map->dm_children * map->dm_nperms * sizeof (uint8_t)); fnvlist_add_uint64(cfg, MAP_WORST_RATIO, (uint64_t)(worst_ratio * 1000.0)); fnvlist_add_uint64(cfg, MAP_AVG_RATIO, (uint64_t)(avg_ratio * 1000.0)); error = nvlist_add_nvlist(allcfgs, key, cfg); if (error == 0) error = write_map(filename, allcfgs); nvlist_free(cfg); nvlist_free(allcfgs); return (error); } static void dump_map(draid_map_t *map, const char *key, double worst_ratio, double avg_ratio, int verbose) { if (verbose == 0) { return; } else if (verbose == 1) { printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f " "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed, worst_ratio, avg_ratio); return; } else { printf(" \"%s\":\n" " seed: 0x%016llx\n" " checksum: 0x%016llx\n" " worst_ratio: %2.03f\n" " avg_ratio: %2.03f\n" " children: %llu\n" " nperms: %llu\n", key, (u_longlong_t)map->dm_seed, (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio, (u_longlong_t)map->dm_children, (u_longlong_t)map->dm_nperms); if (verbose > 2) { printf(" perms = {\n"); for (int i = 0; i < map->dm_nperms; i++) { printf(" { "); for (int j = 0; j < map->dm_children; j++) { printf("%3d%s ", map->dm_perms[ i * map->dm_children + j], j < map->dm_children - 1 ? 
"," : ""); } printf(" },\n"); } printf(" }\n"); } else if (verbose == 2) { printf(" draid_perms = \n"); } } } static void dump_map_nv(const char *key, nvlist_t *cfg, int verbose) { draid_map_t map; uint_t c; uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); map.dm_perms = fnvlist_lookup_uint8_array(cfg, MAP_PERMS, &c); dump_map(&map, key, (double)worst_ratio / 1000.0, avg_ratio / 1000.0, verbose); } /* * Print a summary of the mapping. */ static int dump_map_key(const char *filename, const char *key, int verbose) { nvlist_t *cfg; int error; error = read_map_key(filename, key, &cfg); if (error != 0) return (error); dump_map_nv(key, cfg, verbose); return (0); } /* * Allocate a new permutation map for evaluation. */ static int alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed, draid_map_t **mapp) { draid_map_t *map; int error; map = malloc(sizeof (draid_map_t)); if (map == NULL) return (ENOMEM); map->dm_children = children; map->dm_nperms = nperms; map->dm_seed = seed; map->dm_checksum = 0; error = vdev_draid_generate_perms(map, &map->dm_perms); if (error) { free(map); return (error); } *mapp = map; return (0); } /* * Allocate the fixed permutation map for N children. 
/*
 * Evaluate how resilvering I/O will be distributed given a list of faulted
 * vdevs. As a simplification we assume one IO is sufficient to repair each
 * damaged device in a group.
 *
 * For every row of the permutation map, each group containing a faulted
 * device charges one IO to each healthy member of the group and one IO to a
 * healthy distributed spare per faulted member. The fewest and most IOs
 * charged to any non-faulted child are returned through min_child_ios and
 * max_child_ios, and their ratio (max / min) is the return value.
 */
static double
eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares,
    int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios)
{
	uint64_t children = map->dm_children;
	uint64_t ngroups = 1;
	uint64_t ndisks = children - nspares;

	/*
	 * Calculate the minimum number of groups required to fill a slice.
	 */
	while (ngroups * groupwidth % ndisks != 0)
		ngroups++;

	/* One counter per child; there is no error path so fail loudly. */
	int *ios = calloc(children, sizeof (*ios));
	VERIFY3P(ios, !=, NULL);

	/* Resilver all rows */
	for (uint64_t i = 0; i < map->dm_nperms; i++) {
		uint8_t *row = &map->dm_perms[i * children];

		/* Resilver all groups with faulted drives */
		for (uint64_t j = 0; j < ngroups; j++) {
			uint64_t spareidx = children - nspares;
			boolean_t repair_needed = B_FALSE;

			/* See if any devices in this group are faulted */
			uint64_t groupstart = (j * groupwidth) % ndisks;

			for (uint64_t k = 0; k < groupwidth; k++) {
				uint64_t groupidx = (groupstart + k) % ndisks;

				repair_needed = is_faulted(faulted_devs,
				    nfaulted, row[groupidx]);
				if (repair_needed)
					break;
			}

			if (repair_needed == B_FALSE)
				continue;

			/*
			 * This group is degraded. Calculate the number of
			 * reads the non-faulted drives require and the number
			 * of writes to the distributed hot spare for this row.
			 */
			for (uint64_t k = 0; k < groupwidth; k++) {
				uint64_t groupidx = (groupstart + k) % ndisks;

				if (!is_faulted(faulted_devs, nfaulted,
				    row[groupidx])) {
					ios[row[groupidx]]++;
				} else if (nspares > 0) {
					/*
					 * Skip faulted spares, but never scan
					 * past the end of the row.
					 */
					while (spareidx < children &&
					    is_faulted(faulted_devs, nfaulted,
					    row[spareidx])) {
						spareidx++;
					}

					VERIFY3U(spareidx, <, children);
					ios[row[spareidx]]++;
					spareidx++;
				}
			}
		}
	}

	*min_child_ios = INT_MAX;
	*max_child_ios = 0;

	/*
	 * Find the drives with fewest and most required I/O. These values
	 * are used to calculate the imbalance ratio. To avoid returning an
	 * infinite value for permutations which have children that perform
	 * no IO a floor of 1 IO per child is set. This ensures a meaningful
	 * ratio is returned for comparison, and it is not uncommon when
	 * there are a large number of children.
	 */
	for (uint64_t i = 0; i < children; i++) {

		if (is_faulted(faulted_devs, nfaulted, i)) {
			ASSERT0(ios[i]);
			continue;
		}

		if (ios[i] == 0)
			ios[i] = 1;

		if (ios[i] < *min_child_ios)
			*min_child_ios = ios[i];

		if (ios[i] > *max_child_ios)
			*max_child_ios = ios[i];
	}

	ASSERT3S(*min_child_ios, !=, INT_MAX);
	ASSERT3S(*max_child_ios, !=, 0);

	double ratio = (double)(*max_child_ios) / (double)(*min_child_ios);

	free(ios);

	return (ratio);
}
*/ for (int f1 = 0; f1 < children; f1++) { faulted_devs[0] = f1; double ratio; if (faults == 1) { ratio = eval_resilver(map, groupwidth, nspares, faulted_devs, faults, &min_ios, &max_ios); if (ratio > worst_ratio) { worst_ratio = ratio; worst_min_ios = min_ios; worst_max_ios = max_ios; } sum += ratio; n++; } else if (faults == 2) { for (int f2 = f1 + 1; f2 < children; f2++) { faulted_devs[1] = f2; ratio = eval_resilver(map, groupwidth, nspares, faulted_devs, faults, &min_ios, &max_ios); if (ratio > worst_ratio) { worst_ratio = ratio; worst_min_ios = min_ios; worst_max_ios = max_ios; } sum += ratio; n++; } } } } } *worst_ratiop = worst_ratio; *avg_ratiop = sum / n; /* * Log the min/max io values for particularly unbalanced maps. * Since the maps are generated entirely randomly these are possible * be exceedingly unlikely. We log it for possible investigation. */ if (worst_ratio > 100.0) { dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2); printf("worst_min_ios=%d worst_max_ios=%d\n", worst_min_ios, worst_max_ios); } } static int eval_maps(uint64_t children, int passes, uint64_t *map_seed, draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop) { draid_map_t *best_map = NULL; double best_worst_ratio = 1000.0; double best_avg_ratio = 1000.0; /* * Perform the requested number of passes evaluating randomly * generated permutation maps. Only the best version is kept. */ for (int i = 0; i < passes; i++) { double worst_ratio, avg_ratio; draid_map_t *map; int error; /* * Calculate the next seed and generate a new candidate map. */ error = alloc_new_map(children, MAP_ROWS_DEFAULT, vdev_draid_rand(map_seed), &map); - if (error) + if (error) { + if (best_map != NULL) + free_map(best_map); return (error); + } /* * Consider maps with a lower worst_ratio to be of higher * quality. Some maps may have a lower avg_ratio but they * are discarded since they might include some particularly * imbalanced permutations. 
The average is tracked to in * order to get a sense of the average permutation quality. */ eval_decluster(map, &worst_ratio, &avg_ratio); if (best_map == NULL || worst_ratio < best_worst_ratio) { if (best_map != NULL) free_map(best_map); best_map = map; best_worst_ratio = worst_ratio; best_avg_ratio = avg_ratio; } else { free_map(map); } } /* * After determining the best map generate a checksum over the full * permutation array. This checksum is verified when opening a dRAID * pool to ensure the generated in memory permutations are correct. */ zio_cksum_t cksum; fletcher_4_native_varsize(best_map->dm_perms, sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms, &cksum); best_map->dm_checksum = cksum.zc_word[0]; *best_mapp = best_map; *best_ratiop = best_worst_ratio; *avg_ratiop = best_avg_ratio; return (0); } static int draid_generate(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; uint64_t map_seed; int c, fd, error, verbose = 0, passes = 1, continuous = 0; int min_children = VDEV_DRAID_MIN_CHILDREN; int max_children = VDEV_DRAID_MAX_CHILDREN; int restarts = 0; while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) { switch (c) { case 'c': continuous++; break; case 'm': min_children = (int)strtol(optarg, NULL, 0); if (min_children < VDEV_DRAID_MIN_CHILDREN) { (void) fprintf(stderr, "A minimum of 2 " "children are required.\n"); return (1); } break; case 'n': max_children = (int)strtol(optarg, NULL, 0); if (max_children > VDEV_DRAID_MAX_CHILDREN) { (void) fprintf(stderr, "A maximum of %d " "children are allowed.\n", VDEV_DRAID_MAX_CHILDREN); return (1); } break; case 'p': passes = (int)strtol(optarg, NULL, 0); break; case 'v': /* * 0 - Only log when a better map is added to the file. * 1 - Log the current best map for each child count. * Minimal output on a single summary line. * 2 - Log the current best map for each child count. * More verbose includes most map fields. * 3 - Log the current best map for each child count. 
* Very verbose all fields including the full map. */ verbose++; break; case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); draid_usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); draid_usage(); break; } } if (argc > optind) strncpy(filename, argv[optind], MAXPATHLEN - 1); else { (void) fprintf(stderr, "A FILE must be specified.\n"); return (1); } restart: /* * Start with a fresh seed from /dev/urandom. */ fd = open("/dev/urandom", O_RDONLY); if (fd < 0) { printf("Unable to open /dev/urandom: %s\n:", strerror(errno)); return (1); } else { ssize_t bytes = sizeof (map_seed); ssize_t bytes_read = 0; while (bytes_read < bytes) { ssize_t rc = read(fd, ((char *)&map_seed) + bytes_read, bytes - bytes_read); if (rc < 0) { printf("Unable to read /dev/urandom: %s\n:", strerror(errno)); close(fd); return (1); } bytes_read += rc; } (void) close(fd); } if (restarts == 0) printf("Writing generated mappings to '%s':\n", filename); /* * Generate maps for all requested child counts. The best map for * each child count is written out to the specified file. If the file * already contains a better mapping this map will not be added. */ for (uint64_t children = min_children; children <= max_children; children++) { char key[8] = { 0 }; draid_map_t *map; double worst_ratio = 1000.0; double avg_ratio = 1000.0; error = eval_maps(children, passes, &map_seed, &map, &worst_ratio, &avg_ratio); if (error) { printf("Error eval_maps(): %s\n", strerror(error)); return (1); } if (worst_ratio < 1.0 || avg_ratio < 1.0) { printf("Error ratio < 1.0: worst_ratio = %2.03f " "avg_ratio = %2.03f\n", worst_ratio, avg_ratio); return (1); } snprintf(key, 7, "%llu", (u_longlong_t)children); error = write_map_key(filename, key, map, worst_ratio, avg_ratio); if (error == 0) { /* The new map was added to the file. */ dump_map(map, key, worst_ratio, avg_ratio, MAX(verbose, 1)); } else if (error == EEXIST) { /* The existing map was preferable and kept. 
*/ if (verbose > 0) dump_map_key(filename, key, verbose); } else { printf("Error write_map_key(): %s\n", strerror(error)); return (1); } free_map(map); } /* * When the continuous option is set restart at the minimum number of * children instead of exiting. This option is useful as a mechanism * to continuous try and refine the discovered permutations. */ if (continuous) { restarts++; printf("Restarting by request (-c): %d\n", restarts); goto restart; } return (0); } /* * Verify each map in the file by generating its in-memory permutation array * and comfirming its checksum is correct. */ static int draid_verify(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; int n = 0, c, error, verbose = 1; int check_ratios = 0; while ((c = getopt(argc, argv, ":rv")) != -1) { switch (c) { case 'r': check_ratios++; break; case 'v': verbose++; break; case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); draid_usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); draid_usage(); break; } } if (argc > optind) { char *abspath = malloc(MAXPATHLEN); if (abspath == NULL) return (ENOMEM); if (realpath(argv[optind], abspath) != NULL) strncpy(filename, abspath, MAXPATHLEN - 1); else strncpy(filename, argv[optind], MAXPATHLEN - 1); free(abspath); } else { (void) fprintf(stderr, "A FILE must be specified.\n"); return (1); } printf("Verifying permutation maps: '%s'\n", filename); /* * Lookup hardcoded permutation map for each valid number of children * and verify a generated map has the correct checksum. Then compare * the generated map values with the nvlist map values read from the * reference file to cross-check the permutation. 
*/ for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; children <= VDEV_DRAID_MAX_CHILDREN; children++) { draid_map_t *map; char key[8] = {0}; snprintf(key, 8, "%llu", (u_longlong_t)children); error = alloc_fixed_map(children, &map); if (error) { printf("Error alloc_fixed_map() failed: %s\n", error == ECKSUM ? "Invalid checksum" : strerror(error)); return (1); } uint64_t nv_seed, nv_checksum, nv_children, nv_nperms; uint8_t *nv_perms; nvlist_t *cfg; uint_t c; error = read_map_key(filename, key, &cfg); if (error != 0) { printf("Error read_map_key() failed: %s\n", strerror(error)); free_map(map); return (1); } nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c); /* * Compare draid_map_t and nvlist reference values. */ if (map->dm_seed != nv_seed) { printf("Error different seeds: 0x%016llx != " "0x%016llx\n", (u_longlong_t)map->dm_seed, (u_longlong_t)nv_seed); error = EINVAL; } if (map->dm_checksum != nv_checksum) { printf("Error different checksums: 0x%016llx " "!= 0x%016llx\n", (u_longlong_t)map->dm_checksum, (u_longlong_t)nv_checksum); error = EINVAL; } if (map->dm_children != nv_children) { printf("Error different children: %llu " "!= %llu\n", (u_longlong_t)map->dm_children, (u_longlong_t)nv_children); error = EINVAL; } if (map->dm_nperms != nv_nperms) { printf("Error different nperms: %llu " "!= %llu\n", (u_longlong_t)map->dm_nperms, (u_longlong_t)nv_nperms); error = EINVAL; } for (uint64_t i = 0; i < nv_children * nv_nperms; i++) { if (map->dm_perms[i] != nv_perms[i]) { printf("Error different perms[%llu]: " "%d != %d\n", (u_longlong_t)i, (int)map->dm_perms[i], (int)nv_perms[i]); error = EINVAL; break; } } /* * For good measure recalculate the worst and average * ratios and confirm they match the nvlist values. 
*/ if (check_ratios) { uint64_t nv_worst_ratio, nv_avg_ratio; double worst_ratio, avg_ratio; eval_decluster(map, &worst_ratio, &avg_ratio); nv_worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); nv_avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); if (worst_ratio < 1.0 || avg_ratio < 1.0) { printf("Error ratio out of range %2.03f, " "%2.03f\n", worst_ratio, avg_ratio); error = EINVAL; } if ((uint64_t)(worst_ratio * 1000.0) != nv_worst_ratio) { printf("Error different worst_ratio %2.03f " "!= %2.03f\n", (double)nv_worst_ratio / 1000.0, worst_ratio); error = EINVAL; } if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) { printf("Error different average_ratio %2.03f " "!= %2.03f\n", (double)nv_avg_ratio / 1000.0, avg_ratio); error = EINVAL; } } if (error) { free_map(map); nvlist_free(cfg); return (1); } if (verbose > 0) { printf("- %llu children: good\n", (u_longlong_t)children); } n++; free_map(map); nvlist_free(cfg); } if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) { printf("Error permutation maps missing: %d / %d checked\n", n, VDEV_DRAID_MAX_CHILDREN - 1); return (1); } printf("Successfully verified %d / %d permutation maps\n", n, VDEV_DRAID_MAX_CHILDREN - 1); return (0); } /* * Dump the contents of the specified mapping(s) for inspection. 
*/ static int draid_dump(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; int c, error, verbose = 1; int min_children = VDEV_DRAID_MIN_CHILDREN; int max_children = VDEV_DRAID_MAX_CHILDREN; while ((c = getopt(argc, argv, ":vm:n:")) != -1) { switch (c) { case 'm': min_children = (int)strtol(optarg, NULL, 0); if (min_children < 2) { (void) fprintf(stderr, "A minimum of 2 " "children are required.\n"); return (1); } break; case 'n': max_children = (int)strtol(optarg, NULL, 0); if (max_children > VDEV_DRAID_MAX_CHILDREN) { (void) fprintf(stderr, "A maximum of %d " "children are allowed.\n", VDEV_DRAID_MAX_CHILDREN); return (1); } break; case 'v': verbose++; break; case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); draid_usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); draid_usage(); break; } } if (argc > optind) strncpy(filename, argv[optind], MAXPATHLEN - 1); else { (void) fprintf(stderr, "A FILE must be specified.\n"); return (1); } /* * Dump maps for the requested child counts. */ for (uint64_t children = min_children; children <= max_children; children++) { char key[8] = { 0 }; snprintf(key, 7, "%llu", (u_longlong_t)children); error = dump_map_key(filename, key, verbose); if (error) { printf("Error dump_map_key(): %s\n", strerror(error)); return (1); } } return (0); } /* * Print all of the mappings as a C formatted draid_map_t array. This table * is found in the module/zcommon/zfs_draid.c file and is the definitive * source for all mapping used by dRAID. It cannot be updated without * changing the dRAID on disk format. 
*/ static int draid_table(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; int error; if (argc > optind) strncpy(filename, argv[optind], MAXPATHLEN - 1); else { (void) fprintf(stderr, "A FILE must be specified.\n"); return (1); } printf("static const draid_map_t " "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n"); for (uint64_t children = VDEV_DRAID_MIN_CHILDREN; children <= VDEV_DRAID_MAX_CHILDREN; children++) { uint64_t seed, checksum, nperms, avg_ratio; nvlist_t *cfg; char key[8] = {0}; snprintf(key, 8, "%llu", (u_longlong_t)children); error = read_map_key(filename, key, &cfg); if (error != 0) { printf("Error read_map_key() failed: %s\n", strerror(error)); return (1); } seed = fnvlist_lookup_uint64(cfg, MAP_SEED); checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t" "/* %2.03f */\n", (u_longlong_t)children, (u_longlong_t)nperms, (u_longlong_t)seed, (u_longlong_t)checksum, (double)avg_ratio / 1000.0); nvlist_free(cfg); } printf("};\n"); return (0); } static int draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp) { nvlist_t *srccfgs; nvpair_t *elem = NULL; int error, merged = 0; error = read_map(srcfilename, &srccfgs); if (error != 0) return (error); while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) { uint64_t nv_worst_ratio; uint64_t allcfg_worst_ratio; nvlist_t *cfg, *allcfg; char *key; switch (nvpair_type(elem)) { case DATA_TYPE_NVLIST: (void) nvpair_value_nvlist(elem, &cfg); key = nvpair_name(elem); nv_worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); error = nvlist_lookup_nvlist(allcfgs, key, &allcfg); if (error == 0) { allcfg_worst_ratio = fnvlist_lookup_uint64( allcfg, MAP_WORST_RATIO); if (nv_worst_ratio < allcfg_worst_ratio) { fnvlist_remove(allcfgs, key); error = nvlist_add_nvlist(allcfgs, key, cfg); 
merged++; } } else if (error == ENOENT) { error = nvlist_add_nvlist(allcfgs, key, cfg); merged++; } else { return (error); } break; default: continue; } } nvlist_free(srccfgs); *mergedp = merged; return (0); } /* * Merge the best map for each child count found in the listed files into * a new file. This allows 'draid generate' to be run in parallel and for * the results maps to be combined. */ static int draid_merge(int argc, char *argv[]) { char filename[MAXPATHLEN] = {0}; int c, error, total_merged = 0; nvlist_t *allcfgs; while ((c = getopt(argc, argv, ":")) != -1) { switch (c) { case ':': (void) fprintf(stderr, "missing argument for '%c' option\n", optopt); draid_usage(); break; case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); draid_usage(); break; } } if (argc < 4) { (void) fprintf(stderr, "A FILE and multiple SRCs must be specified.\n"); return (1); } strncpy(filename, argv[optind], MAXPATHLEN - 1); optind++; error = read_map(filename, &allcfgs); if (error == ENOENT) { allcfgs = fnvlist_alloc(); } else if (error != 0) { printf("Error read_map(): %s\n", strerror(error)); return (error); } while (optind < argc) { char srcfilename[MAXPATHLEN] = {0}; int merged = 0; strncpy(srcfilename, argv[optind], MAXPATHLEN - 1); error = draid_merge_impl(allcfgs, srcfilename, &merged); if (error) { printf("Error draid_merge_impl(): %s\n", strerror(error)); nvlist_free(allcfgs); return (1); } total_merged += merged; printf("Merged %d key(s) from '%s' into '%s'\n", merged, srcfilename, filename); optind++; } if (total_merged > 0) write_map(filename, allcfgs); printf("Merged a total of %d key(s) into '%s'\n", total_merged, filename); nvlist_free(allcfgs); return (0); } int main(int argc, char *argv[]) { if (argc < 2) draid_usage(); char *subcommand = argv[1]; if (strcmp(subcommand, "generate") == 0) { return (draid_generate(argc - 1, argv + 1)); } else if (strcmp(subcommand, "verify") == 0) { return (draid_verify(argc - 1, argv + 1)); } else if 
(strcmp(subcommand, "dump") == 0) { return (draid_dump(argc - 1, argv + 1)); } else if (strcmp(subcommand, "table") == 0) { return (draid_table(argc - 1, argv + 1)); } else if (strcmp(subcommand, "merge") == 0) { return (draid_merge(argc - 1, argv + 1)); } else { draid_usage(); } } diff --git a/tests/zfs-tests/cmd/mkfile.c b/tests/zfs-tests/cmd/mkfile.c index 7ce50e6a37c4..3b61deed6bf5 100644 --- a/tests/zfs-tests/cmd/mkfile.c +++ b/tests/zfs-tests/cmd/mkfile.c @@ -1,280 +1,284 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #include #include #include #include #include #include #include #include #include #include #include #define BLOCKSIZE 512 /* bytes */ #define KILOBYTE 1024 #define MEGABYTE (KILOBYTE * KILOBYTE) #define GIGABYTE (KILOBYTE * MEGABYTE) #define FILE_MODE (S_ISVTX + S_IRUSR + S_IWUSR) static __attribute__((noreturn)) void usage(void) { (void) fprintf(stderr, gettext( "Usage: mkfile [-nv] [g|k|b|m] [] ...\n")); exit(1); } int main(int argc, char **argv) { char *opts; off_t size; size_t len; size_t mult = 1; char *buf = NULL; size_t bufsz = 0; int errors = 0; int i; int verbose = 0; /* option variable */ int nobytes = 0; /* option variable */ int saverr; if (argc == 1) usage(); while (argv[1] && argv[1][0] == '-') { opts = &argv[1][0]; while (*(++opts)) { switch (*opts) { case 'v': verbose++; break; case 'n': nobytes++; break; default: usage(); } } argc--; argv++; } if (argc < 3) usage(); len = strlen(argv[1]); if (len && isalpha(argv[1][len-1])) { switch (argv[1][len-1]) { case 'k': case 'K': mult = KILOBYTE; break; case 'b': case 'B': mult = BLOCKSIZE; break; case 'm': case 'M': mult = MEGABYTE; break; case 'g': case 'G': mult = GIGABYTE; break; default: (void) fprintf(stderr, gettext("unknown size %s\n"), argv[1]); usage(); } for (i = 0; i <= (len-2); i++) { if (!isdigit(argv[1][i])) { (void) fprintf(stderr, gettext("unknown size %s\n"), argv[1]); usage(); } } argv[1][len-1] = '\0'; } size = ((off_t)atoll(argv[1]) * (off_t)mult); argv++; argc--; while (argc > 1) { int fd; if (verbose) (void) fprintf(stdout, gettext("%s %lld bytes\n"), argv[1], (offset_t)size); fd = open(argv[1], O_CREAT|O_TRUNC|O_RDWR, FILE_MODE); if (fd < 0) { saverr = errno; (void) fprintf(stderr, gettext("Could not open %s: %s\n"), argv[1], strerror(saverr)); errors++; argv++; argc--; continue; } else if (fchown(fd, getuid(), getgid()) < 0) { saverr = errno; (void) fprintf(stderr, gettext( "Could not set owner/group of %s: %s\n"), argv[1], strerror(saverr)); (void) close(fd); errors++; argv++; 
argc--; continue; } else if (lseek(fd, (off_t)size-1, SEEK_SET) < 0) { saverr = errno; (void) fprintf(stderr, gettext( "Could not seek to offset %ld in %s: %s\n"), (unsigned long)size-1, argv[1], strerror(saverr)); (void) close(fd); errors++; argv++; argc--; continue; } else if (write(fd, "", 1) != 1) { saverr = errno; (void) fprintf(stderr, gettext( "Could not set length of %s: %s\n"), argv[1], strerror(saverr)); (void) close(fd); errors++; argv++; argc--; continue; } if (!nobytes) { off_t written = 0; struct stat64 st; if (lseek(fd, (off_t)0, SEEK_SET) < 0) { saverr = errno; (void) fprintf(stderr, gettext( "Could not seek to beginning of %s: %s\n"), argv[1], strerror(saverr)); (void) close(fd); errors++; argv++; argc--; continue; } if (fstat64(fd, &st) < 0) { saverr = errno; (void) fprintf(stderr, gettext( "Could not fstat64 %s: %s\n"), argv[1], strerror(saverr)); (void) close(fd); errors++; argv++; argc--; continue; } if (bufsz != st.st_blksize) { if (buf) free(buf); bufsz = (size_t)st.st_blksize; buf = calloc(1, bufsz); if (buf == NULL) { (void) fprintf(stderr, gettext( "Could not allocate buffer of" " size %d\n"), (int)bufsz); (void) close(fd); bufsz = 0; errors++; argv++; argc--; continue; } } while (written < size) { ssize_t result; size_t bytes = (size_t)MIN(bufsz, size-written); if ((result = write(fd, buf, bytes)) != (ssize_t)bytes) { saverr = errno; if (result < 0) result = 0; written += result; (void) fprintf(stderr, gettext( "%s: initialized %lu of %lu bytes: %s\n"), argv[1], (unsigned long)written, (unsigned long)size, strerror(saverr)); errors++; break; } written += bytes; } /* * A write(2) call in the above loop failed so * close out this file and go on (error was * already incremented when the write(2) failed). 
*/ if (written < size) { (void) close(fd); argv++; argc--; continue; } } if (close(fd) < 0) { saverr = errno; (void) fprintf(stderr, gettext( "Error encountered when closing %s: %s\n"), argv[1], strerror(saverr)); errors++; argv++; argc--; continue; } /* * Only set the modes (including the sticky bit) if we * had no problems. It is not an error for the chmod(2) * to fail, but do issue a warning. */ if (chmod(argv[1], FILE_MODE) < 0) (void) fprintf(stderr, gettext( "warning: couldn't set mode to %#o\n"), FILE_MODE); argv++; argc--; } + + if (buf) + free(buf); + return (errors); }