diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META index 0953cc51922f..9ffe90458dbd 100644 --- a/sys/contrib/openzfs/META +++ b/sys/contrib/openzfs/META @@ -1,10 +1,10 @@ Meta: 1 Name: zfs Branch: 1.0 Version: 2.2.0 Release: rc4 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.4 +Linux-Maximum: 6.5 Linux-Minimum: 3.10 diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c index a8d084bb4bd3..2f040ff7582c 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c @@ -1,1307 +1,1308 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016, 2017, Intel Corporation. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. */ /* * ZFS syseventd module. * * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c * * The purpose of this module is to identify when devices are added to the * system, and appropriately online or replace the affected vdevs. * * When a device is added to the system: * * 1. Search for any vdevs whose devid matches that of the newly added * device. * * 2. If no vdevs are found, then search for any vdevs whose udev path * matches that of the new device. * * 3. If no vdevs match by either method, then ignore the event. * * 4. Attempt to online the device with a flag to indicate that it should * be unspared when resilvering completes. If this succeeds, then the * same device was inserted and we should continue normally. * * 5. If the pool does not have the 'autoreplace' property set, attempt to * online the device again without the unspare flag, which will * generate a FMA fault. * * 6. If the pool has the 'autoreplace' property set, and the matching vdev * is a whole disk, then label the new disk and attempt a 'zpool * replace'. * * The module responds to EC_DEV_ADD events. The special ESC_ZFS_VDEV_CHECK * event indicates that a device failed to open during pool load, but the * autoreplace property was set. In this case, we deferred the associated * FMA fault until our module had a chance to process the autoreplace logic. * If the device could not be replaced, then the second online attempt will * trigger the FMA fault that we skipped earlier. * * On Linux udev provides a disk insert for both the disk and the partition. 
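 *
 * (Illustrative only, not part of the original comment: on Linux a typical
 * EC_DEV_ADD.ESC_DISK event handed to this module carries an nvlist along
 * the lines of
 *     DEV_IDENTIFIER = "ata-SAMSUNG_HD204UI_S2HGJD2Z805891"
 *     DEV_PHYS_PATH  = "pci-0000:04:00.0-sas-0x4433221106000000-lun-0"
 *     DEV_IS_PART    = present only for the partition event
 * which mirrors the example values documented above zfs_deliver_add()
 * further down in this file.)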
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_agents.h" #include "../zed_log.h" #define DEV_BYID_PATH "/dev/disk/by-id/" #define DEV_BYPATH_PATH "/dev/disk/by-path/" #define DEV_BYVDEV_PATH "/dev/disk/by-vdev/" typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t); libzfs_handle_t *g_zfshdl; list_t g_pool_list; /* list of unavailable pools at initialization */ list_t g_device_list; /* list of disks with asynchronous label request */ tpool_t *g_tpool; boolean_t g_enumeration_done; pthread_t g_zfs_tid; /* zfs_enum_pools() thread */ typedef struct unavailpool { zpool_handle_t *uap_zhp; list_node_t uap_node; } unavailpool_t; typedef struct pendingdev { char pd_physpath[128]; list_node_t pd_node; } pendingdev_t; static int zfs_toplevel_state(zpool_handle_t *zhp) { nvlist_t *nvroot; vdev_stat_t *vs; unsigned int c; verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); return (vs->vs_state); } static int zfs_unavail_pool(zpool_handle_t *zhp, void *data) { zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)", zpool_get_name(zhp), (int)zfs_toplevel_state(zhp)); if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) { unavailpool_t *uap; uap = malloc(sizeof (unavailpool_t)); if (uap == NULL) { perror("malloc"); exit(EXIT_FAILURE); } uap->uap_zhp = zhp; list_insert_tail((list_t *)data, uap); } else { zpool_close(zhp); } return (0); } /* * Two stage replace on Linux * since we get disk notifications * we can wait for partitioned disk slice to show up! * * First stage tags the disk, initiates async partitioning, and returns * Second stage finds the tag and proceeds to ZFS labeling/replace * * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach * * 1. physical match with no fs, no partition * tag it top, partition disk * * 2. physical match again, see partition and tag * */ /* * The device associated with the given vdev (either by devid or physical path) * has been added to the system. If 'isdisk' is set, then we only attempt a * replacement if it's a whole disk. This also implies that we should label the * disk first. * * First, we attempt to online the device (making sure to undo any spare * operation when finished). If this succeeds, then we're done. If it fails, * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, * but that the label was not what we expected. If the 'autoreplace' property * is enabled, then we relabel the disk (if specified), and attempt a 'zpool * replace'. If the online is successful, but the new state is something else * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of * race, and we should avoid attempting to relabel the disk. 
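 *
 * (Illustrative only: the autoreplace behavior described above is gated on
 * the pool property of the same name, e.g.
 *     zpool set autoreplace=on tank
 * where "tank" is a placeholder pool name.)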
* * Also can arrive here from a ESC_ZFS_VDEV_CHECK event */ static void zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) { const char *path; vdev_state_t newstate; nvlist_t *nvroot, *newvd; pendingdev_t *device; uint64_t wholedisk = 0ULL; uint64_t offline = 0ULL, faulted = 0ULL; uint64_t guid = 0ULL; uint64_t is_spare = 0; const char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL; char rawpath[PATH_MAX], fullpath[PATH_MAX]; char devpath[PATH_MAX]; int ret; int online_flag = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE; boolean_t is_sd = B_FALSE; boolean_t is_mpath_wholedisk = B_FALSE; uint_t c; vdev_stat_t *vs; if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) return; /* Skip healthy disks */ verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (vs->vs_state == VDEV_STATE_HEALTHY) { zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.", __func__, path); return; } (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &enc_sysfs_path); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_IS_SPARE, &is_spare); /* * Special case: * * We've seen times where a disk won't have a ZPOOL_CONFIG_PHYS_PATH * entry in their config. For example, on this force-faulted disk: * * children[0]: * type: 'disk' * id: 0 * guid: 14309659774640089719 * path: '/dev/disk/by-vdev/L28' * whole_disk: 0 * DTL: 654 * create_txg: 4 * com.delphix:vdev_zap_leaf: 1161 * faulted: 1 * aux_state: 'external' * children[1]: * type: 'disk' * id: 1 * guid: 16002508084177980912 * path: '/dev/disk/by-vdev/L29' * devid: 'dm-uuid-mpath-35000c500a61d68a3' * phys_path: 'L29' * vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32' * whole_disk: 0 * DTL: 1028 * create_txg: 4 * com.delphix:vdev_zap_leaf: 131 * * If the disk's path is a /dev/disk/by-vdev/ path, then we can infer * the ZPOOL_CONFIG_PHYS_PATH from the by-vdev disk name. */ if (physpath == NULL && path != NULL) { /* If path begins with "/dev/disk/by-vdev/" ... */ if (strncmp(path, DEV_BYVDEV_PATH, strlen(DEV_BYVDEV_PATH)) == 0) { /* Set physpath to the char after "/dev/disk/by-vdev" */ physpath = &path[strlen(DEV_BYVDEV_PATH)]; } } /* * We don't want to autoreplace offlined disks. However, we do want to * replace force-faulted disks (`zpool offline -f`). Force-faulted * disks have both offline=1 and faulted=1 in the nvlist. */ if (offline && !faulted) { zed_log_msg(LOG_INFO, "%s: %s is offline, skip autoreplace", __func__, path); return; } is_mpath_wholedisk = is_mpath_whole_disk(path); zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'" " %s blank disk, %s mpath blank disk, %s labeled, enc sysfs '%s', " "(guid %llu)", zpool_get_name(zhp), path, physpath ? physpath : "NULL", wholedisk ? "is" : "not", is_mpath_wholedisk? "is" : "not", labeled ? 
"is" : "not", enc_sysfs_path, (long long unsigned int)guid); /* * The VDEV guid is preferred for identification (gets passed in path) */ if (guid != 0) { (void) snprintf(fullpath, sizeof (fullpath), "%llu", (long long unsigned int)guid); } else { /* * otherwise use path sans partition suffix for whole disks */ (void) strlcpy(fullpath, path, sizeof (fullpath)); if (wholedisk) { char *spath = zfs_strip_partition(fullpath); if (!spath) { zed_log_msg(LOG_INFO, "%s: Can't alloc", __func__); return; } (void) strlcpy(fullpath, spath, sizeof (fullpath)); free(spath); } } if (is_spare) online_flag |= ZFS_ONLINE_SPARE; /* * Attempt to online the device. */ if (zpool_vdev_online(zhp, fullpath, online_flag, &newstate) == 0 && (newstate == VDEV_STATE_HEALTHY || newstate == VDEV_STATE_DEGRADED)) { zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev '%s' ('%s') is " "%s", fullpath, physpath, (newstate == VDEV_STATE_HEALTHY) ? "HEALTHY" : "DEGRADED"); return; } /* * vdev_id alias rule for using scsi_debug devices (FMA automated * testing) */ if (physpath != NULL && strcmp("scsidebug", physpath) == 0) is_sd = B_TRUE; /* * If the pool doesn't have the autoreplace property set, then use * vdev online to trigger a FMA fault by posting an ereport. */ if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || !(wholedisk || is_mpath_wholedisk) || (physpath == NULL)) { (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or " "not a blank disk for '%s' ('%s')", fullpath, physpath); return; } /* * Convert physical path into its current device node. Rawpath * needs to be /dev/disk/by-vdev for a scsi_debug device since * /dev/disk/by-path will not be present. */ (void) snprintf(rawpath, sizeof (rawpath), "%s%s", is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath); if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) { zed_log_msg(LOG_INFO, " realpath: %s failed (%s)", rawpath, strerror(errno)); (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)", fullpath, libzfs_error_description(g_zfshdl)); return; } /* Only autoreplace bad disks */ if ((vs->vs_state != VDEV_STATE_DEGRADED) && (vs->vs_state != VDEV_STATE_FAULTED) && + (vs->vs_state != VDEV_STATE_REMOVED) && (vs->vs_state != VDEV_STATE_CANT_OPEN)) { zed_log_msg(LOG_INFO, " not autoreplacing since disk isn't in " "a bad state (currently %llu)", vs->vs_state); return; } nvlist_lookup_string(vdev, "new_devid", &new_devid); if (is_mpath_wholedisk) { /* Don't label device mapper or multipath disks. */ } else if (!labeled) { /* * we're auto-replacing a raw disk, so label it first */ char *leafname; /* * If this is a request to label a whole disk, then attempt to * write out the label. Before we can label the disk, we need * to map the physical string that was matched on to the under * lying device node. * * If any part of this process fails, then do a force online * to trigger a ZFS fault for the device (and any hot spare * replacement). */ leafname = strrchr(devpath, '/') + 1; /* * If this is a request to label a whole disk, then attempt to * write out the label. */ if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) { zed_log_msg(LOG_INFO, " zpool_label_disk: could not " "label '%s' (%s)", leafname, libzfs_error_description(g_zfshdl)); (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); return; } /* * The disk labeling is asynchronous on Linux. 
Just record * this label request and return as there will be another * disk add event for the partition after the labeling is * completed. */ device = malloc(sizeof (pendingdev_t)); if (device == NULL) { perror("malloc"); exit(EXIT_FAILURE); } (void) strlcpy(device->pd_physpath, physpath, sizeof (device->pd_physpath)); list_insert_tail(&g_device_list, device); zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)", leafname, (u_longlong_t)guid); return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */ } else /* labeled */ { boolean_t found = B_FALSE; /* * match up with request above to label the disk */ for (device = list_head(&g_device_list); device != NULL; device = list_next(&g_device_list, device)) { if (strcmp(physpath, device->pd_physpath) == 0) { list_remove(&g_device_list, device); free(device); found = B_TRUE; break; } zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s", physpath, device->pd_physpath); } if (!found) { /* unexpected partition slice encountered */ zed_log_msg(LOG_INFO, "labeled disk %s unexpected here", fullpath); (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); return; } zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)", physpath, (u_longlong_t)guid); (void) snprintf(devpath, sizeof (devpath), "%s%s", DEV_BYID_PATH, new_devid); } /* * Construct the root vdev to pass to zpool_vdev_attach(). While adding * the entire vdev structure is harmless, we construct a reduced set of * path/physpath/wholedisk to keep it simple. */ if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) { zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); return; } if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); nvlist_free(nvroot); return; } if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 || (physpath != NULL && nvlist_add_string(newvd, ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) || (enc_sysfs_path != NULL && nvlist_add_string(newvd, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) || nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&newvd, 1) != 0) { zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs"); nvlist_free(newvd); nvlist_free(nvroot); return; } nvlist_free(newvd); /* * Wait for udev to verify the links exist, then auto-replace * the leaf disk at same physical location. */ if (zpool_label_disk_wait(path, 3000) != 0) { zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement " "disk %s is missing", path); nvlist_free(nvroot); return; } /* * Prefer sequential resilvering when supported (mirrors and dRAID), * otherwise fallback to a traditional healing resilver. */ ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE); if (ret != 0) { ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE); } zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", fullpath, path, (ret == 0) ? "no errors" : libzfs_error_description(g_zfshdl)); nvlist_free(nvroot); } /* * Utility functions to find a vdev matching given criteria. 
*/ typedef struct dev_data { const char *dd_compare; const char *dd_prop; zfs_process_func_t dd_func; boolean_t dd_found; boolean_t dd_islabeled; uint64_t dd_pool_guid; uint64_t dd_vdev_guid; uint64_t dd_new_vdev_guid; const char *dd_new_devid; uint64_t dd_num_spares; } dev_data_t; static void zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) { dev_data_t *dp = data; const char *path = NULL; uint_t c, children; nvlist_t **child; uint64_t guid = 0; uint64_t isspare = 0; /* * First iterate over any children. */ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) zfs_iter_vdev(zhp, child[c], data); } /* * Iterate over any spares and cache devices */ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) zfs_iter_vdev(zhp, child[c], data); } if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) zfs_iter_vdev(zhp, child[c], data); } /* once a vdev was matched and processed there is nothing left to do */ if (dp->dd_found && dp->dd_num_spares == 0) return; (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &guid); /* * Match by GUID if available otherwise fallback to devid or physical */ if (dp->dd_vdev_guid != 0) { if (guid != dp->dd_vdev_guid) return; zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched on %llu", guid); dp->dd_found = B_TRUE; } else if (dp->dd_compare != NULL) { /* * NOTE: On Linux there is an event for partition, so unlike * illumos, substring matching is not required to accommodate * the partition suffix. An exact match will be present in * the dp->dd_compare value. * If the attached disk already contains a vdev GUID, it means * the disk is not clean. In such a scenario, the physical path * would be a match that makes the disk faulted when trying to * online it. So, we would only want to proceed if either GUID * matches with the last attached disk or the disk is in clean * state. */ if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || strcmp(dp->dd_compare, path) != 0) { return; } if (dp->dd_new_vdev_guid != 0 && dp->dd_new_vdev_guid != guid) { zed_log_msg(LOG_INFO, " %s: no match (GUID:%llu" " != vdev GUID:%llu)", __func__, dp->dd_new_vdev_guid, guid); return; } zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s", dp->dd_prop, path); dp->dd_found = B_TRUE; /* pass the new devid for use by replacing code */ if (dp->dd_new_devid != NULL) { (void) nvlist_add_string(nvl, "new_devid", dp->dd_new_devid); } } if (dp->dd_found == B_TRUE && nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_IS_SPARE, &isspare) == 0 && isspare) dp->dd_num_spares++; (dp->dd_func)(zhp, nvl, dp->dd_islabeled); } static void zfs_enable_ds(void *arg) { unavailpool_t *pool = (unavailpool_t *)arg; (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0); zpool_close(pool->uap_zhp); free(pool); } static int zfs_iter_pool(zpool_handle_t *zhp, void *data) { nvlist_t *config, *nvl; dev_data_t *dp = data; uint64_t pool_guid; unavailpool_t *pool; zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)", zpool_get_name(zhp), dp->dd_vdev_guid ? 
"GUID" : dp->dd_prop); /* * For each vdev in this pool, look for a match to apply dd_func */ if ((config = zpool_get_config(zhp, NULL)) != NULL) { if (dp->dd_pool_guid == 0 || (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) { (void) nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl); zfs_iter_vdev(zhp, nvl, data); } } else { zed_log_msg(LOG_INFO, "%s: no config\n", __func__); } /* * if this pool was originally unavailable, * then enable its datasets asynchronously */ if (g_enumeration_done) { for (pool = list_head(&g_pool_list); pool != NULL; pool = list_next(&g_pool_list, pool)) { if (strcmp(zpool_get_name(zhp), zpool_get_name(pool->uap_zhp))) continue; if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) { list_remove(&g_pool_list, pool); (void) tpool_dispatch(g_tpool, zfs_enable_ds, pool); break; } } } zpool_close(zhp); /* cease iteration after a match */ return (dp->dd_found && dp->dd_num_spares == 0); } /* * Given a physical device location, iterate over all * (pool, vdev) pairs which correspond to that location. */ static boolean_t devphys_iter(const char *physical, const char *devid, zfs_process_func_t func, boolean_t is_slice, uint64_t new_vdev_guid) { dev_data_t data = { 0 }; data.dd_compare = physical; data.dd_func = func; data.dd_prop = ZPOOL_CONFIG_PHYS_PATH; data.dd_found = B_FALSE; data.dd_islabeled = is_slice; data.dd_new_devid = devid; /* used by auto replace code */ data.dd_new_vdev_guid = new_vdev_guid; (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); return (data.dd_found); } /* * Given a device identifier, find any vdevs with a matching by-vdev * path. Normally we shouldn't need this as the comparison would be * made earlier in the devphys_iter(). For example, if we were replacing * /dev/disk/by-vdev/L28, normally devphys_iter() would match the * ZPOOL_CONFIG_PHYS_PATH of "L28" from the old disk config to "L28" * of the new disk config. However, we've seen cases where * ZPOOL_CONFIG_PHYS_PATH was not in the config for the old disk. Here's * an example of a real 2-disk mirror pool where one disk was force * faulted: * * com.delphix:vdev_zap_top: 129 * children[0]: * type: 'disk' * id: 0 * guid: 14309659774640089719 * path: '/dev/disk/by-vdev/L28' * whole_disk: 0 * DTL: 654 * create_txg: 4 * com.delphix:vdev_zap_leaf: 1161 * faulted: 1 * aux_state: 'external' * children[1]: * type: 'disk' * id: 1 * guid: 16002508084177980912 * path: '/dev/disk/by-vdev/L29' * devid: 'dm-uuid-mpath-35000c500a61d68a3' * phys_path: 'L29' * vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32' * whole_disk: 0 * DTL: 1028 * create_txg: 4 * com.delphix:vdev_zap_leaf: 131 * * So in the case above, the only thing we could compare is the path. * * We can do this because we assume by-vdev paths are authoritative as physical * paths. We could not assume this for normal paths like /dev/sda since the * physical location /dev/sda points to could change over time. 
*/ static boolean_t by_vdev_path_iter(const char *by_vdev_path, const char *devid, zfs_process_func_t func, boolean_t is_slice) { dev_data_t data = { 0 }; data.dd_compare = by_vdev_path; data.dd_func = func; data.dd_prop = ZPOOL_CONFIG_PATH; data.dd_found = B_FALSE; data.dd_islabeled = is_slice; data.dd_new_devid = devid; if (strncmp(by_vdev_path, DEV_BYVDEV_PATH, strlen(DEV_BYVDEV_PATH)) != 0) { /* by_vdev_path doesn't start with "/dev/disk/by-vdev/" */ return (B_FALSE); } (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); return (data.dd_found); } /* * Given a device identifier, find any vdevs with a matching devid. * On Linux we can match devid directly which is always a whole disk. */ static boolean_t devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice) { dev_data_t data = { 0 }; data.dd_compare = devid; data.dd_func = func; data.dd_prop = ZPOOL_CONFIG_DEVID; data.dd_found = B_FALSE; data.dd_islabeled = is_slice; data.dd_new_devid = devid; (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); return (data.dd_found); } /* * Given a device guid, find any vdevs with a matching guid. */ static boolean_t guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid, zfs_process_func_t func, boolean_t is_slice) { dev_data_t data = { 0 }; data.dd_func = func; data.dd_found = B_FALSE; data.dd_pool_guid = pool_guid; data.dd_vdev_guid = vdev_guid; data.dd_islabeled = is_slice; data.dd_new_devid = devid; (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); return (data.dd_found); } /* * Handle a EC_DEV_ADD.ESC_DISK event. * * illumos * Expects: DEV_PHYS_PATH string in schema * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID * * path: '/dev/dsk/c0t1d0s0' (persistent) * devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a' * phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a' * * linux * provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID * * path: '/dev/sdc1' (not persistent) * devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1' * phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0' */ static int zfs_deliver_add(nvlist_t *nvl) { const char *devpath = NULL, *devid = NULL; uint64_t pool_guid = 0, vdev_guid = 0; boolean_t is_slice; /* * Expecting a devid string and an optional physical location and guid */ if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) { zed_log_msg(LOG_INFO, "%s: no dev identifier\n", __func__); return (-1); } (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath); (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid); (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid); is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0); zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)", devid, devpath ? devpath : "NULL", is_slice); /* * Iterate over all vdevs looking for a match in the following order: * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk) * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location). * 3. ZPOOL_CONFIG_GUID (identifies unique vdev). * 4. ZPOOL_CONFIG_PATH for /dev/disk/by-vdev devices only (since * by-vdev paths represent physical paths). */ if (devid_iter(devid, zfs_process_add, is_slice)) return (0); if (devpath != NULL && devphys_iter(devpath, devid, zfs_process_add, is_slice, vdev_guid)) return (0); if (vdev_guid != 0) (void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add, is_slice); if (devpath != NULL) { /* Can we match a /dev/disk/by-vdev/ path? 
*/ char by_vdev_path[MAXPATHLEN]; snprintf(by_vdev_path, sizeof (by_vdev_path), "/dev/disk/by-vdev/%s", devpath); if (by_vdev_path_iter(by_vdev_path, devid, zfs_process_add, is_slice)) return (0); } return (0); } /* * Called when we receive a VDEV_CHECK event, which indicates a device could not * be opened during initial pool open, but the autoreplace property was set on * the pool. In this case, we treat it as if it were an add event. */ static int zfs_deliver_check(nvlist_t *nvl) { dev_data_t data = { 0 }; if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &data.dd_pool_guid) != 0 || nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &data.dd_vdev_guid) != 0 || data.dd_vdev_guid == 0) return (0); zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu", data.dd_pool_guid, data.dd_vdev_guid); data.dd_func = zfs_process_add; (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); return (0); } /* * Given a path to a vdev, lookup the vdev's physical size from its * config nvlist. * * Returns the vdev's physical size in bytes on success, 0 on error. */ static uint64_t vdev_size_from_config(zpool_handle_t *zhp, const char *vdev_path) { nvlist_t *nvl = NULL; boolean_t avail_spare, l2cache, log; vdev_stat_t *vs = NULL; uint_t c; nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log); if (!nvl) return (0); verify(nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (!vs) { zed_log_msg(LOG_INFO, "%s: no nvlist for '%s'", __func__, vdev_path); return (0); } return (vs->vs_pspace); } /* * Given a path to a vdev, lookup if the vdev is a "whole disk" in the * config nvlist. "whole disk" means that ZFS was passed a whole disk * at pool creation time, which it partitioned up and has full control over. * Thus a partition with wholedisk=1 set tells us that zfs created the * partition at creation time. A partition without whole disk set would have * been created by externally (like with fdisk) and passed to ZFS. * * Returns the whole disk value (either 0 or 1). */ static uint64_t vdev_whole_disk_from_config(zpool_handle_t *zhp, const char *vdev_path) { nvlist_t *nvl = NULL; boolean_t avail_spare, l2cache, log; uint64_t wholedisk = 0; nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log); if (!nvl) return (0); (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); return (wholedisk); } /* * If the device size grew more than 1% then return true. 
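 *
 * (Worked example, not part of the original comment: with oldsize = 990 and
 * newsize = 1000, newsize / (newsize - oldsize) == 100, so DEVICE_GREW() is
 * true; with oldsize = 995 and newsize = 1000 the ratio is 200 and the
 * ~0.5% growth is ignored. The divisions are integer divisions and the
 * threshold is measured against the new size.)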
*/ #define DEVICE_GREW(oldsize, newsize) \ ((newsize > oldsize) && \ ((newsize / (newsize - oldsize)) <= 100)) static int zfsdle_vdev_online(zpool_handle_t *zhp, void *data) { boolean_t avail_spare, l2cache; nvlist_t *udev_nvl = data; nvlist_t *tgt; int error; const char *tmp_devname; char devname[MAXPATHLEN] = ""; uint64_t guid; if (nvlist_lookup_uint64(udev_nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { sprintf(devname, "%llu", (u_longlong_t)guid); } else if (nvlist_lookup_string(udev_nvl, DEV_PHYS_PATH, &tmp_devname) == 0) { strlcpy(devname, tmp_devname, MAXPATHLEN); zfs_append_partition(devname, MAXPATHLEN); } else { zed_log_msg(LOG_INFO, "%s: no guid or physpath", __func__); } zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'", devname, zpool_get_name(zhp)); if ((tgt = zpool_find_vdev_by_physpath(zhp, devname, &avail_spare, &l2cache, NULL)) != NULL) { const char *path; char fullpath[MAXPATHLEN]; uint64_t wholedisk = 0; error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path); if (error) { zpool_close(zhp); return (0); } (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); if (wholedisk) { char *tmp; path = strrchr(path, '/'); if (path != NULL) { tmp = zfs_strip_partition(path + 1); if (tmp == NULL) { zpool_close(zhp); return (0); } } else { zpool_close(zhp); return (0); } (void) strlcpy(fullpath, tmp, sizeof (fullpath)); free(tmp); /* * We need to reopen the pool associated with this * device so that the kernel can update the size of * the expanded device. When expanding there is no * need to restart the scrub from the beginning. */ boolean_t scrub_restart = B_FALSE; (void) zpool_reopen_one(zhp, &scrub_restart); } else { (void) strlcpy(fullpath, path, sizeof (fullpath)); } if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { vdev_state_t newstate; if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { /* * If this disk size has not changed, then * there's no need to do an autoexpand. To * check we look at the disk's size in its * config, and compare it to the disk size * that udev is reporting. */ uint64_t udev_size = 0, conf_size = 0, wholedisk = 0, udev_parent_size = 0; /* * Get the size of our disk that udev is * reporting. */ if (nvlist_lookup_uint64(udev_nvl, DEV_SIZE, &udev_size) != 0) { udev_size = 0; } /* * Get the size of our disk's parent device * from udev (where sda1's parent is sda). */ if (nvlist_lookup_uint64(udev_nvl, DEV_PARENT_SIZE, &udev_parent_size) != 0) { udev_parent_size = 0; } conf_size = vdev_size_from_config(zhp, fullpath); wholedisk = vdev_whole_disk_from_config(zhp, fullpath); /* * Only attempt an autoexpand if the vdev size * changed. There are two different cases * to consider. * * 1. wholedisk=1 * If you do a 'zpool create' on a whole disk * (like /dev/sda), then zfs will create * partitions on the disk (like /dev/sda1). In * that case, wholedisk=1 will be set in the * partition's nvlist config. So zed will need * to see if your parent device (/dev/sda) * expanded in size, and if so, then attempt * the autoexpand. * * 2. wholedisk=0 * If you do a 'zpool create' on an existing * partition, or a device that doesn't allow * partitions, then wholedisk=0, and you will * simply need to check if the device itself * expanded in size. 
*/ if (DEVICE_GREW(conf_size, udev_size) || (wholedisk && DEVICE_GREW(conf_size, udev_parent_size))) { error = zpool_vdev_online(zhp, fullpath, 0, &newstate); zed_log_msg(LOG_INFO, "%s: autoexpanding '%s' from %llu" " to %llu bytes in pool '%s': %d", __func__, fullpath, conf_size, MAX(udev_size, udev_parent_size), zpool_get_name(zhp), error); } } } zpool_close(zhp); return (1); } zpool_close(zhp); return (0); } /* * This function handles the ESC_DEV_DLE device change event. Use the * provided vdev guid when looking up a disk or partition, when the guid * is not present assume the entire disk is owned by ZFS and append the * expected -part1 partition information then lookup by physical path. */ static int zfs_deliver_dle(nvlist_t *nvl) { const char *devname; char name[MAXPATHLEN]; uint64_t guid; if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { sprintf(name, "%llu", (u_longlong_t)guid); } else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) { strlcpy(name, devname, MAXPATHLEN); zfs_append_partition(name, MAXPATHLEN); } else { sprintf(name, "unknown"); zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath"); } if (zpool_iter(g_zfshdl, zfsdle_vdev_online, nvl) != 1) { zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not " "found", name); return (1); } return (0); } /* * syseventd daemon module event handler * * Handles syseventd daemon zfs device related events: * * EC_DEV_ADD.ESC_DISK * EC_DEV_STATUS.ESC_DEV_DLE * EC_ZFS.ESC_ZFS_VDEV_CHECK * * Note: assumes only one thread active at a time (not thread safe) */ static int zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl) { int ret; boolean_t is_check = B_FALSE, is_dle = B_FALSE; if (strcmp(class, EC_DEV_ADD) == 0) { /* * We're mainly interested in disk additions, but we also listen * for new loop devices, to allow for simplified testing. */ if (strcmp(subclass, ESC_DISK) != 0 && strcmp(subclass, ESC_LOFI) != 0) return (0); is_check = B_FALSE; } else if (strcmp(class, EC_ZFS) == 0 && strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) { /* * This event signifies that a device failed to open * during pool load, but the 'autoreplace' property was * set, so we should pretend it's just been added. */ is_check = B_TRUE; } else if (strcmp(class, EC_DEV_STATUS) == 0 && strcmp(subclass, ESC_DEV_DLE) == 0) { is_dle = B_TRUE; } else { return (0); } if (is_dle) ret = zfs_deliver_dle(nvl); else if (is_check) ret = zfs_deliver_check(nvl); else ret = zfs_deliver_add(nvl); return (ret); } static void * zfs_enum_pools(void *arg) { (void) arg; (void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list); /* * Linux - instead of using a thread pool, each list entry * will spawn a thread when an unavailable pool transitions * to available. zfs_slm_fini will wait for these threads. 
*/ g_enumeration_done = B_TRUE; return (NULL); } /* * called from zed daemon at startup * * sent messages from zevents or udev monitor * * For now, each agent has its own libzfs instance */ int zfs_slm_init(void) { if ((g_zfshdl = libzfs_init()) == NULL) return (-1); /* * collect a list of unavailable pools (asynchronously, * since this can take a while) */ list_create(&g_pool_list, sizeof (struct unavailpool), offsetof(struct unavailpool, uap_node)); if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) { list_destroy(&g_pool_list); libzfs_fini(g_zfshdl); return (-1); } pthread_setname_np(g_zfs_tid, "enum-pools"); list_create(&g_device_list, sizeof (struct pendingdev), offsetof(struct pendingdev, pd_node)); return (0); } void zfs_slm_fini(void) { unavailpool_t *pool; pendingdev_t *device; /* wait for zfs_enum_pools thread to complete */ (void) pthread_join(g_zfs_tid, NULL); /* destroy the thread pool */ if (g_tpool != NULL) { tpool_wait(g_tpool); tpool_destroy(g_tpool); } while ((pool = list_remove_head(&g_pool_list)) != NULL) { zpool_close(pool->uap_zhp); free(pool); } list_destroy(&g_pool_list); while ((device = list_remove_head(&g_device_list)) != NULL) free(device); list_destroy(&g_device_list); libzfs_fini(g_zfshdl); } void zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl) { zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass); (void) zfs_slm_deliver_event(class, subclass, nvl); } diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh index 46bfc1b866f1..40cb61f17307 100755 --- a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh +++ b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh @@ -1,242 +1,242 @@ #!/bin/sh # shellcheck disable=SC2154 # # Turn off/on vdevs' enclosure fault LEDs when their pool's state changes. # # Turn a vdev's fault LED on if it becomes FAULTED, DEGRADED or UNAVAIL. # Turn its LED off when it's back ONLINE again. # # This script run in two basic modes: # # 1. If $ZEVENT_VDEV_ENC_SYSFS_PATH and $ZEVENT_VDEV_STATE_STR are set, then # only set the LED for that particular vdev. This is the case for statechange # events and some vdev_* events. # # 2. If those vars are not set, then check the state of all vdevs in the pool # and set the LEDs accordingly. This is the case for pool_import events. # # Note that this script requires that your enclosure be supported by the # Linux SCSI Enclosure services (SES) driver. The script will do nothing # if you have no enclosure, or if your enclosure isn't supported. # # Exit codes: # 0: enclosure led successfully set # 1: enclosure leds not available # 2: enclosure leds administratively disabled # 3: The led sysfs path passed from ZFS does not exist # 4: $ZPOOL not set # 5: awk is not installed [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" . "${ZED_ZEDLET_DIR}/zed-functions.sh" if [ ! -d /sys/class/enclosure ] && [ ! 
-d /sys/bus/pci/slots ] ; then # No JBOD enclosure or NVMe slots exit 1 fi if [ "${ZED_USE_ENCLOSURE_LEDS}" != "1" ] ; then exit 2 fi zed_check_cmd "$ZPOOL" || exit 4 zed_check_cmd awk || exit 5 # Global used in set_led debug print vdev="" # check_and_set_led (file, val) # # Read an enclosure sysfs file, and write it if it's not already set to 'val' # # Arguments # file: sysfs file to set (like /sys/class/enclosure/0:0:1:0/SLOT 10/fault) # val: value to set it to # # Return # 0 on success, 3 on missing sysfs path # check_and_set_led() { file="$1" val="$2" if [ -z "$val" ]; then return 0 fi if [ ! -e "$file" ] ; then return 3 fi # If another process is accessing the LED when we attempt to update it, # the update will be lost so retry until the LED actually changes or we # timeout. for _ in 1 2 3 4 5; do # We want to check the current state first, since writing to the # 'fault' entry always causes a SES command, even if the # current state is already what you want. read -r current < "${file}" # On some enclosures if you write 1 to fault, and read it back, # it will return 2. Treat all non-zero values as 1 for # simplicity. if [ "$current" != "0" ] ; then current=1 fi if [ "$current" != "$val" ] ; then echo "$val" > "$file" zed_log_msg "vdev $vdev set '$file' LED to $val" else break fi done } # Fault LEDs for JBODs and NVMe drives are handled a little differently. # # On JBODs the fault LED is called 'fault' and on a path like this: # # /sys/class/enclosure/0:0:1:0/SLOT 10/fault # # On NVMe it's called 'attention' and on a path like this: # # /sys/bus/pci/slot/0/attention # # This function returns the full path to the fault LED file for a given # enclosure/slot directory. # path_to_led() { dir=$1 if [ -f "$dir/fault" ] ; then echo "$dir/fault" elif [ -f "$dir/attention" ] ; then echo "$dir/attention" fi } state_to_val() { state="$1" case "$state" in - FAULTED|DEGRADED|UNAVAIL) + FAULTED|DEGRADED|UNAVAIL|REMOVED) echo 1 ;; ONLINE) echo 0 ;; *) echo "invalid state: $state" ;; esac } # # Given a nvme name like 'nvme0n1', pass back its slot directory # like "/sys/bus/pci/slots/0" # nvme_dev_to_slot() { dev="$1" # Get the address "0000:01:00.0" read -r address < "/sys/class/block/$dev/device/address" find /sys/bus/pci/slots -regex '.*/[0-9]+/address$' | \ while read -r sys_addr; do read -r this_address < "$sys_addr" # The format of address is a little different between # /sys/class/block/$dev/device/address and # /sys/bus/pci/slots/ # # address= "0000:01:00.0" # this_address = "0000:01:00" # if echo "$address" | grep -Eq ^"$this_address" ; then echo "${sys_addr%/*}" break fi done } # process_pool (pool) # # Iterate through a pool and set the vdevs' enclosure slot LEDs to # those vdevs' state. # # Arguments # pool: Pool name. # # Return # 0 on success, 3 on missing sysfs path # process_pool() { pool="$1" # The output will be the vdevs only (from "grep '/dev/'"): # # U45 ONLINE 0 0 0 /dev/sdk 0 # U46 ONLINE 0 0 0 /dev/sdm 0 # U47 ONLINE 0 0 0 /dev/sdn 0 # U50 ONLINE 0 0 0 /dev/sdbn 0 # ZPOOL_SCRIPTS_AS_ROOT=1 $ZPOOL status -c upath,fault_led "$pool" | grep '/dev/' | ( rc=0 while read -r vdev state _ _ _ therest; do # Read out current LED value and path # Get dev name (like 'sda') dev=$(basename "$(echo "$therest" | awk '{print $(NF-1)}')") vdev_enc_sysfs_path=$(realpath "/sys/class/block/$dev/device/enclosure_device"*) if [ ! 
-d "$vdev_enc_sysfs_path" ] ; then # This is not a JBOD disk, but it could be a PCI NVMe drive vdev_enc_sysfs_path=$(nvme_dev_to_slot "$dev") fi current_val=$(echo "$therest" | awk '{print $NF}') if [ "$current_val" != "0" ] ; then current_val=1 fi if [ -z "$vdev_enc_sysfs_path" ] ; then # Skip anything with no sysfs LED entries continue fi led_path=$(path_to_led "$vdev_enc_sysfs_path") if [ ! -e "$led_path" ] ; then rc=3 zed_log_msg "vdev $vdev '$led_path' doesn't exist" continue fi val=$(state_to_val "$state") if [ "$current_val" = "$val" ] ; then # LED is already set correctly continue fi if ! check_and_set_led "$led_path" "$val"; then rc=3 fi done exit "$rc"; ) } if [ -n "$ZEVENT_VDEV_ENC_SYSFS_PATH" ] && [ -n "$ZEVENT_VDEV_STATE_STR" ] ; then # Got a statechange for an individual vdev val=$(state_to_val "$ZEVENT_VDEV_STATE_STR") vdev=$(basename "$ZEVENT_VDEV_PATH") ledpath=$(path_to_led "$ZEVENT_VDEV_ENC_SYSFS_PATH") check_and_set_led "$ledpath" "$val" else # Process the entire pool poolname=$(zed_guid_to_pool "$ZEVENT_POOL_GUID") process_pool "$poolname" fi diff --git a/sys/contrib/openzfs/config/kernel-blkdev.m4 b/sys/contrib/openzfs/config/kernel-blkdev.m4 index 887acee670ba..e04a2bd2c3b6 100644 --- a/sys/contrib/openzfs/config/kernel-blkdev.m4 +++ b/sys/contrib/openzfs/config/kernel-blkdev.m4 @@ -1,533 +1,611 @@ dnl # dnl # 2.6.38 API change, dnl # Added blkdev_get_by_path() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH], [ ZFS_LINUX_TEST_SRC([blkdev_get_by_path], [ #include #include ], [ struct block_device *bdev __attribute__ ((unused)) = NULL; const char *path = "path"; fmode_t mode = 0; void *holder = NULL; bdev = blkdev_get_by_path(path, mode, holder); ]) ]) +dnl # +dnl # 6.5.x API change, +dnl # blkdev_get_by_path() takes 4 args +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG], [ + ZFS_LINUX_TEST_SRC([blkdev_get_by_path_4arg], [ + #include + #include + ], [ + struct block_device *bdev __attribute__ ((unused)) = NULL; + const char *path = "path"; + fmode_t mode = 0; + void *holder = NULL; + struct blk_holder_ops h; + + bdev = blkdev_get_by_path(path, mode, holder, &h); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ - AC_MSG_CHECKING([whether blkdev_get_by_path() exists]) + AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 4 args]) + ZFS_LINUX_TEST_RESULT([blkdev_get_by_path_4arg], [ + AC_DEFINE(HAVE_BLKDEV_GET_BY_PATH_4ARG, 1, + [blkdev_get_by_path() exists and takes 4 args]) + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + ]) + ]) +]) + +dnl # +dnl # 6.5.x API change +dnl # blk_mode_t was added as a type to supercede some places where fmode_t +dnl # is used +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T], [ + ZFS_LINUX_TEST_SRC([blk_mode_t], [ + #include + #include + ], [ + blk_mode_t m __attribute((unused)) = (blk_mode_t)0; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T], [ + AC_MSG_CHECKING([whether blk_mode_t is defined]) + ZFS_LINUX_TEST_RESULT([blk_mode_t], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_MODE_T, 1, [blk_mode_t is defined]) + ], [ + AC_MSG_RESULT(no) ]) ]) dnl # dnl # 2.6.38 API change, dnl # Added blkdev_put() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT], [ ZFS_LINUX_TEST_SRC([blkdev_put], [ #include #include ], [ struct block_device *bdev = NULL; fmode_t mode = 0; 
blkdev_put(bdev, mode); ]) ]) +dnl # +dnl # 6.5.x API change. +dnl # blkdev_put() takes (void* holder) as arg 2 +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER], [ + ZFS_LINUX_TEST_SRC([blkdev_put_holder], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + void *holder = NULL; + + blkdev_put(bdev, holder); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_MSG_CHECKING([whether blkdev_put() exists]) ZFS_LINUX_TEST_RESULT([blkdev_put], [ AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_put()]) + AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2]) + ZFS_LINUX_TEST_RESULT([blkdev_put_holder], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_PUT_HOLDER, 1, + [blkdev_put() accepts void* as arg 2]) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_put()]) + ]) ]) ]) dnl # dnl # 4.1 API, exported blkdev_reread_part() symbol, back ported to the dnl # 3.10.0 CentOS 7.x enterprise kernels. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART], [ ZFS_LINUX_TEST_SRC([blkdev_reread_part], [ #include #include ], [ struct block_device *bdev = NULL; int error; error = blkdev_reread_part(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_REREAD_PART], [ AC_MSG_CHECKING([whether blkdev_reread_part() exists]) ZFS_LINUX_TEST_RESULT([blkdev_reread_part], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_REREAD_PART, 1, [blkdev_reread_part() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # check_disk_change() was removed in 5.10 dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE], [ ZFS_LINUX_TEST_SRC([check_disk_change], [ #include #include ], [ struct block_device *bdev = NULL; bool error; error = check_disk_change(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [ AC_MSG_CHECKING([whether check_disk_change() exists]) ZFS_LINUX_TEST_RESULT([check_disk_change], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_CHECK_DISK_CHANGE, 1, [check_disk_change() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 6.5.x API change dnl # disk_check_media_change() was added dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [ ZFS_LINUX_TEST_SRC([disk_check_media_change], [ #include #include ], [ struct block_device *bdev = NULL; bool error; error = disk_check_media_change(bdev->bd_disk); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [ AC_MSG_CHECKING([whether disk_check_media_change() exists]) ZFS_LINUX_TEST_RESULT([disk_check_media_change], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_DISK_CHECK_MEDIA_CHANGE, 1, [disk_check_media_change() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # bdev_kobj() is introduced from 5.12 dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ], [ ZFS_LINUX_TEST_SRC([bdev_kobj], [ #include #include #include ], [ struct block_device *bdev = NULL; struct kobject *disk_kobj; disk_kobj = bdev_kobj(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ], [ AC_MSG_CHECKING([whether bdev_kobj() exists]) ZFS_LINUX_TEST_RESULT([bdev_kobj], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_KOBJ, 1, [bdev_kobj() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # part_to_dev() was removed in 5.12 dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV], [ ZFS_LINUX_TEST_SRC([part_to_dev], [ #include #include ], [ struct hd_struct *p = NULL; struct device *pdev; pdev = part_to_dev(p); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV], [ AC_MSG_CHECKING([whether part_to_dev() exists]) ZFS_LINUX_TEST_RESULT([part_to_dev], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_PART_TO_DEV, 1, [part_to_dev() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 5.10 API, check_disk_change() is 
removed, in favor of dnl # bdev_check_media_change(), which doesn't force revalidation dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE], [ ZFS_LINUX_TEST_SRC([bdev_check_media_change], [ #include #include ], [ struct block_device *bdev = NULL; int error; error = bdev_check_media_change(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE], [ AC_MSG_CHECKING([whether bdev_check_media_change() exists]) ZFS_LINUX_TEST_RESULT([bdev_check_media_change], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_CHECK_MEDIA_CHANGE, 1, [bdev_check_media_change() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 2.6.22 API change dnl # Single argument invalidate_bdev() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV], [ ZFS_LINUX_TEST_SRC([invalidate_bdev], [ #include #include ],[ struct block_device *bdev = NULL; invalidate_bdev(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_INVALIDATE_BDEV], [ AC_MSG_CHECKING([whether invalidate_bdev() exists]) ZFS_LINUX_TEST_RESULT([invalidate_bdev], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([invalidate_bdev()]) ]) ]) dnl # dnl # 5.11 API, lookup_bdev() takes dev_t argument. dnl # 2.6.27 API, lookup_bdev() was first exported. dnl # 4.4.0-6.21 API, lookup_bdev() on Ubuntu takes mode argument. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV], [ ZFS_LINUX_TEST_SRC([lookup_bdev_devt], [ #include ], [ int error __attribute__ ((unused)); const char path[] = "/example/path"; dev_t dev; error = lookup_bdev(path, &dev); ]) ZFS_LINUX_TEST_SRC([lookup_bdev_1arg], [ #include #include ], [ struct block_device *bdev __attribute__ ((unused)); const char path[] = "/example/path"; bdev = lookup_bdev(path); ]) ZFS_LINUX_TEST_SRC([lookup_bdev_mode], [ #include ], [ struct block_device *bdev __attribute__ ((unused)); const char path[] = "/example/path"; bdev = lookup_bdev(path, FMODE_READ); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV], [ AC_MSG_CHECKING([whether lookup_bdev() wants dev_t arg]) ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_devt], [lookup_bdev], [fs/block_dev.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_DEVT_LOOKUP_BDEV, 1, [lookup_bdev() wants dev_t arg]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether lookup_bdev() wants 1 arg]) ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_1arg], [lookup_bdev], [fs/block_dev.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_1ARG_LOOKUP_BDEV, 1, [lookup_bdev() wants 1 arg]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether lookup_bdev() wants mode arg]) ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_mode], [lookup_bdev], [fs/block_dev.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_MODE_LOOKUP_BDEV, 1, [lookup_bdev() wants mode arg]) ], [ ZFS_LINUX_TEST_ERROR([lookup_bdev()]) ]) ]) ]) ]) dnl # dnl # 2.6.30 API change dnl # dnl # The bdev_physical_block_size() interface was added to provide a way dnl # to determine the smallest write which can be performed without a dnl # read-modify-write operation. dnl # dnl # Unfortunately, this interface isn't entirely reliable because dnl # drives are sometimes known to misreport this value. 
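dnl #
dnl # (Illustrative sketch, not part of the original comment: a consumer
dnl # would typically turn this value into an alignment shift, roughly
dnl #     int shift = highbit64(bdev_physical_block_size(bdev)) - 1;
dnl # where highbit64() is the ZFS helper for the highest set bit; treat
dnl # the exact expression as an assumption, not code from this change.)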
dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE], [ ZFS_LINUX_TEST_SRC([bdev_physical_block_size], [ #include ],[ struct block_device *bdev __attribute__ ((unused)) = NULL; bdev_physical_block_size(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE], [ AC_MSG_CHECKING([whether bdev_physical_block_size() is available]) ZFS_LINUX_TEST_RESULT([bdev_physical_block_size], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([bdev_physical_block_size()]) ]) ]) dnl # dnl # 2.6.30 API change dnl # Added bdev_logical_block_size(). dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE], [ ZFS_LINUX_TEST_SRC([bdev_logical_block_size], [ #include ],[ struct block_device *bdev __attribute__ ((unused)) = NULL; bdev_logical_block_size(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE], [ AC_MSG_CHECKING([whether bdev_logical_block_size() is available]) ZFS_LINUX_TEST_RESULT([bdev_logical_block_size], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([bdev_logical_block_size()]) ]) ]) dnl # dnl # 5.11 API change dnl # Added bdev_whole() helper. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE], [ ZFS_LINUX_TEST_SRC([bdev_whole], [ #include ],[ struct block_device *bdev = NULL; bdev = bdev_whole(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE], [ AC_MSG_CHECKING([whether bdev_whole() is available]) ZFS_LINUX_TEST_RESULT([bdev_whole], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_WHOLE, 1, [bdev_whole() is available]) ],[ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 5.20 API change, dnl # Removed bdevname(), snprintf(.., %pg) should be used. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME], [ ZFS_LINUX_TEST_SRC([bdevname], [ #include #include ], [ struct block_device *bdev __attribute__ ((unused)) = NULL; char path[BDEVNAME_SIZE]; (void) bdevname(bdev, path); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEVNAME], [ AC_MSG_CHECKING([whether bdevname() exists]) ZFS_LINUX_TEST_RESULT([bdevname], [ AC_DEFINE(HAVE_BDEVNAME, 1, [bdevname() is available]) AC_MSG_RESULT(yes) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 5.19 API: blkdev_issue_secure_erase() dnl # 3.10 API: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; int error __attribute__ ((unused)); error = blkdev_issue_secure_erase(bdev, sector, nr_sects, GFP_KERNEL); ]) ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; unsigned long flags = 0; int error __attribute__ ((unused)); error = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, flags); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [ AC_MSG_CHECKING([whether blkdev_issue_secure_erase() is available]) ZFS_LINUX_TEST_RESULT([blkdev_issue_secure_erase], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_ISSUE_SECURE_ERASE, 1, [blkdev_issue_secure_erase() is available]) ],[ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, [blkdev_issue_discard() is available]) ],[ ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) ]) ]) ]) dnl # dnl # 5.13 API change dnl # blkdev_get_by_path() no longer handles ERESTARTSYS dnl # dnl # Unfortunately we're forced to rely solely on the kernel version dnl # number in order to determine the 
expected behavior. This was an dnl # internal change to blkdev_get_by_dev(), see commit a8ed1a0607. dnl # AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [ AC_MSG_CHECKING([whether blkdev_get_by_path() handles ERESTARTSYS]) AS_VERSION_COMPARE([$LINUX_VERSION], [5.13.0], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_GET_ERESTARTSYS, 1, [blkdev_get_by_path() handles ERESTARTSYS]) ],[ AC_MSG_RESULT(no) ],[ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 6.5.x API change dnl # BLK_STS_NEXUS replaced with BLK_STS_RESV_CONFLICT dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT], [ ZFS_LINUX_TEST_SRC([blk_sts_resv_conflict], [ #include ],[ blk_status_t s __attribute__ ((unused)) = BLK_STS_RESV_CONFLICT; ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT], [ AC_MSG_CHECKING([whether BLK_STS_RESV_CONFLICT is defined]) ZFS_LINUX_TEST_RESULT([blk_sts_resv_conflict], [ AC_DEFINE(HAVE_BLK_STS_RESV_CONFLICT, 1, [BLK_STS_RESV_CONFLICT is defined]) AC_MSG_RESULT(yes) ], [ AC_MSG_RESULT(no) ]) ]) ]) AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH + ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG ZFS_AC_KERNEL_SRC_BLKDEV_PUT + ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT + ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_BLKDEV_PUT ZFS_AC_KERNEL_BLKDEV_REREAD_PART ZFS_AC_KERNEL_BLKDEV_INVALIDATE_BDEV ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE ZFS_AC_KERNEL_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE ZFS_AC_KERNEL_BLKDEV_BDEVNAME ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT + ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T ]) diff --git a/sys/contrib/openzfs/config/kernel-block-device-operations.m4 b/sys/contrib/openzfs/config/kernel-block-device-operations.m4 index 84e39dc8a2f6..d13c1337b1fb 100644 --- a/sys/contrib/openzfs/config/kernel-block-device-operations.m4 +++ b/sys/contrib/openzfs/config/kernel-block-device-operations.m4 @@ -1,102 +1,133 @@ dnl # dnl # 2.6.38 API change dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS], [ ZFS_LINUX_TEST_SRC([block_device_operations_check_events], [ #include unsigned int blk_check_events(struct gendisk *disk, unsigned int clearing) { (void) disk, (void) clearing; return (0); } static const struct block_device_operations bops __attribute__ ((unused)) = { .check_events = blk_check_events, }; ], [], []) ]) AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS], [ AC_MSG_CHECKING([whether bops->check_events() exists]) ZFS_LINUX_TEST_RESULT([block_device_operations_check_events], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([bops->check_events()]) ]) ]) dnl # dnl # 3.10.x 
API change dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ ZFS_LINUX_TEST_SRC([block_device_operations_release_void], [ #include void blk_release(struct gendisk *g, fmode_t mode) { (void) g, (void) mode; return; } static const struct block_device_operations bops __attribute__ ((unused)) = { .open = NULL, .release = blk_release, .ioctl = NULL, .compat_ioctl = NULL, }; ], [], []) ]) +dnl # +dnl # 5.9.x API change +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG], [ + ZFS_LINUX_TEST_SRC([block_device_operations_release_void_1arg], [ + #include + + void blk_release(struct gendisk *g) { + (void) g; + return; + } + + static const struct block_device_operations + bops __attribute__ ((unused)) = { + .open = NULL, + .release = blk_release, + .ioctl = NULL, + .compat_ioctl = NULL, + }; + ], [], []) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ - AC_MSG_CHECKING([whether bops->release() is void]) + AC_MSG_CHECKING([whether bops->release() is void and takes 2 args]) ZFS_LINUX_TEST_RESULT([block_device_operations_release_void], [ AC_MSG_RESULT(yes) ],[ - ZFS_LINUX_TEST_ERROR([bops->release()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether bops->release() is void and takes 1 arg]) + ZFS_LINUX_TEST_RESULT([block_device_operations_release_void_1arg], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG], [1], + [Define if release() in block_device_operations takes 1 arg]) + ],[ + ZFS_LINUX_TEST_ERROR([bops->release()]) + ]) ]) ]) dnl # dnl # 5.13 API change dnl # block_device_operations->revalidate_disk() was removed dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [ ZFS_LINUX_TEST_SRC([block_device_operations_revalidate_disk], [ #include int blk_revalidate_disk(struct gendisk *disk) { (void) disk; return(0); } static const struct block_device_operations bops __attribute__ ((unused)) = { .revalidate_disk = blk_revalidate_disk, }; ], [], []) ]) AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [ AC_MSG_CHECKING([whether bops->revalidate_disk() exists]) ZFS_LINUX_TEST_RESULT([block_device_operations_revalidate_disk], [ AC_DEFINE([HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [1], [Define if revalidate_disk() in block_device_operations]) AC_MSG_RESULT(yes) ],[ AC_MSG_RESULT(no) ]) ]) AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS], [ ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID + ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK ]) AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS], [ ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK ]) diff --git a/sys/contrib/openzfs/config/kernel-filemap-splice-read.m4 b/sys/contrib/openzfs/config/kernel-filemap-splice-read.m4 new file mode 100644 index 000000000000..4c83b31d738a --- /dev/null +++ b/sys/contrib/openzfs/config/kernel-filemap-splice-read.m4 @@ -0,0 +1,25 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ], [ + dnl # + dnl # Kernel 6.5 - generic_file_splice_read was removed in favor + dnl # of copy_splice_read for the .splice_read member of the + dnl # file_operations struct. 
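As an aside, the HAVE_COPY_SPLICE_READ define produced by this check is normally consumed by a small compat shim along the lines of the sketch below; this is illustrative only, and EXAMPLE_SPLICE_READ / zpl_example_fops are invented names rather than identifiers from the OpenZFS sources.

#include <linux/fs.h>

/*
 * Illustrative only: pick whichever splice_read helper the running
 * kernel provides, based on the configure result above.
 */
#if defined(HAVE_COPY_SPLICE_READ)
#define	EXAMPLE_SPLICE_READ	copy_splice_read		/* Linux 6.5+ */
#else
#define	EXAMPLE_SPLICE_READ	generic_file_splice_read	/* older kernels */
#endif

static const struct file_operations
zpl_example_fops __attribute__ ((unused)) = {
	.splice_read	= EXAMPLE_SPLICE_READ,
};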
+ dnl # + ZFS_LINUX_TEST_SRC([has_copy_splice_read], [ + #include + + struct file_operations fops __attribute__((unused)) = { + .splice_read = copy_splice_read, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_COPY_SPLICE_READ], [ + AC_MSG_CHECKING([whether copy_splice_read() exists]) + ZFS_LINUX_TEST_RESULT([has_copy_splice_read], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_COPY_SPLICE_READ, 1, + [copy_splice_read exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/sys/contrib/openzfs/config/kernel-register_sysctl_table.m4 b/sys/contrib/openzfs/config/kernel-register_sysctl_table.m4 new file mode 100644 index 000000000000..a5e934f56d29 --- /dev/null +++ b/sys/contrib/openzfs/config/kernel-register_sysctl_table.m4 @@ -0,0 +1,27 @@ +dnl # +dnl # Linux 6.5 removes register_sysctl_table +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE], [ + ZFS_LINUX_TEST_SRC([has_register_sysctl_table], [ + #include + + static struct ctl_table dummy_table[] = { + {} + }; + + ],[ + struct ctl_table_header *h + __attribute((unused)) = register_sysctl_table(dummy_table); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ + AC_MSG_CHECKING([whether register_sysctl_table exists]) + ZFS_LINUX_TEST_RESULT([has_register_sysctl_table], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_REGISTER_SYSCTL_TABLE, 1, + [register_sysctl_table exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/sys/contrib/openzfs/config/kernel-vfs-iov_iter.m4 b/sys/contrib/openzfs/config/kernel-vfs-iov_iter.m4 index e0617faab02c..ff560ff3eef0 100644 --- a/sys/contrib/openzfs/config/kernel-vfs-iov_iter.m4 +++ b/sys/contrib/openzfs/config/kernel-vfs-iov_iter.m4 @@ -1,204 +1,226 @@ dnl # dnl # Check for available iov_iter functionality. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ ZFS_LINUX_TEST_SRC([iov_iter_types], [ #include #include ],[ - int type __attribute__ ((unused)) = - ITER_IOVEC | ITER_KVEC | ITER_BVEC | ITER_PIPE; + int type __attribute__ ((unused)) = ITER_KVEC; ]) ZFS_LINUX_TEST_SRC([iov_iter_advance], [ #include #include ],[ struct iov_iter iter = { 0 }; size_t advance = 512; iov_iter_advance(&iter, advance); ]) ZFS_LINUX_TEST_SRC([iov_iter_revert], [ #include #include ],[ struct iov_iter iter = { 0 }; size_t revert = 512; iov_iter_revert(&iter, revert); ]) ZFS_LINUX_TEST_SRC([iov_iter_fault_in_readable], [ #include #include ],[ struct iov_iter iter = { 0 }; size_t size = 512; int error __attribute__ ((unused)); error = iov_iter_fault_in_readable(&iter, size); ]) ZFS_LINUX_TEST_SRC([fault_in_iov_iter_readable], [ #include #include ],[ struct iov_iter iter = { 0 }; size_t size = 512; int error __attribute__ ((unused)); error = fault_in_iov_iter_readable(&iter, size); ]) ZFS_LINUX_TEST_SRC([iov_iter_count], [ #include #include ],[ struct iov_iter iter = { 0 }; size_t bytes __attribute__ ((unused)); bytes = iov_iter_count(&iter); ]) ZFS_LINUX_TEST_SRC([copy_to_iter], [ #include #include ],[ struct iov_iter iter = { 0 }; char buf[512] = { 0 }; size_t size = 512; size_t bytes __attribute__ ((unused)); bytes = copy_to_iter((const void *)&buf, size, &iter); ]) ZFS_LINUX_TEST_SRC([copy_from_iter], [ #include #include ],[ struct iov_iter iter = { 0 }; char buf[512] = { 0 }; size_t size = 512; size_t bytes __attribute__ ((unused)); bytes = copy_from_iter((void *)&buf, size, &iter); ]) ZFS_LINUX_TEST_SRC([iov_iter_type], [ #include #include ],[ struct iov_iter iter = { 0 }; __attribute__((unused)) enum iter_type i = iov_iter_type(&iter); ]) + + ZFS_LINUX_TEST_SRC([iter_iov], [ + #include + #include + ],[ + struct 
iov_iter iter = { 0 }; + __attribute__((unused)) const struct iovec *iov = iter_iov(&iter); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ enable_vfs_iov_iter="yes" AC_MSG_CHECKING([whether iov_iter types are available]) ZFS_LINUX_TEST_RESULT([iov_iter_types], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_IOV_ITER_TYPES, 1, [iov_iter types are available]) ],[ AC_MSG_RESULT(no) enable_vfs_iov_iter="no" ]) AC_MSG_CHECKING([whether iov_iter_advance() is available]) ZFS_LINUX_TEST_RESULT([iov_iter_advance], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_IOV_ITER_ADVANCE, 1, [iov_iter_advance() is available]) ],[ AC_MSG_RESULT(no) enable_vfs_iov_iter="no" ]) AC_MSG_CHECKING([whether iov_iter_revert() is available]) ZFS_LINUX_TEST_RESULT([iov_iter_revert], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_IOV_ITER_REVERT, 1, [iov_iter_revert() is available]) ],[ AC_MSG_RESULT(no) enable_vfs_iov_iter="no" ]) AC_MSG_CHECKING([whether iov_iter_fault_in_readable() is available]) ZFS_LINUX_TEST_RESULT([iov_iter_fault_in_readable], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_IOV_ITER_FAULT_IN_READABLE, 1, [iov_iter_fault_in_readable() is available]) ],[ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether fault_in_iov_iter_readable() is available]) ZFS_LINUX_TEST_RESULT([fault_in_iov_iter_readable], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_FAULT_IN_IOV_ITER_READABLE, 1, [fault_in_iov_iter_readable() is available]) ],[ AC_MSG_RESULT(no) enable_vfs_iov_iter="no" ]) ]) AC_MSG_CHECKING([whether iov_iter_count() is available]) ZFS_LINUX_TEST_RESULT([iov_iter_count], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_IOV_ITER_COUNT, 1, [iov_iter_count() is available]) ],[ AC_MSG_RESULT(no) enable_vfs_iov_iter="no" ]) AC_MSG_CHECKING([whether copy_to_iter() is available]) ZFS_LINUX_TEST_RESULT([copy_to_iter], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_COPY_TO_ITER, 1, [copy_to_iter() is available]) ],[ AC_MSG_RESULT(no) enable_vfs_iov_iter="no" ]) AC_MSG_CHECKING([whether copy_from_iter() is available]) ZFS_LINUX_TEST_RESULT([copy_from_iter], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_COPY_FROM_ITER, 1, [copy_from_iter() is available]) ],[ AC_MSG_RESULT(no) enable_vfs_iov_iter="no" ]) dnl # dnl # This checks for iov_iter_type() in linux/uio.h. It is not dnl # required, however, and the module will compiled without it dnl # using direct access of the member attribute dnl # AC_MSG_CHECKING([whether iov_iter_type() is available]) ZFS_LINUX_TEST_RESULT([iov_iter_type], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_IOV_ITER_TYPE, 1, [iov_iter_type() is available]) ],[ AC_MSG_RESULT(no) ]) dnl # dnl # As of the 4.9 kernel support is provided for iovecs, kvecs, dnl # bvecs and pipes in the iov_iter structure. As long as the dnl # other support interfaces are all available the iov_iter can dnl # be correctly used in the uio structure. dnl # AS_IF([test "x$enable_vfs_iov_iter" = "xyes"], [ AC_DEFINE(HAVE_VFS_IOV_ITER, 1, [All required iov_iter interfaces are available]) ]) + + dnl # + dnl # Kernel 6.5 introduces the iter_iov() function that returns the + dnl # __iov member of an iov_iter*. The iov member was renamed to this + dnl # __iov member, and is intended to be accessed via the helper + dnl # function now. 
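A typical consumer of the HAVE_ITER_IOV define produced by this check is a small accessor wrapper like the sketch below; zfs_compat_iter_iov() is a hypothetical name used for illustration and is not claimed to match the identifier in the OpenZFS tree.

#include <linux/uio.h>

/*
 * Hypothetical compat accessor: on kernels that provide iter_iov()
 * the iovec array must be reached through the helper, while older
 * kernels still expose it directly as the iov member of struct
 * iov_iter.
 */
static inline const struct iovec *
zfs_compat_iter_iov(const struct iov_iter *iter)
{
#ifdef HAVE_ITER_IOV
	return (iter_iov(iter));
#else
	return (iter->iov);
#endif
}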
+ dnl # + AC_MSG_CHECKING([whether iter_iov() is available]) + ZFS_LINUX_TEST_RESULT([iter_iov], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ITER_IOV, 1, + [iter_iov() is available]) + ],[ + AC_MSG_RESULT(no) + ]) ]) diff --git a/sys/contrib/openzfs/config/kernel.m4 b/sys/contrib/openzfs/config/kernel.m4 index 1487fa2e7793..df194ec72207 100644 --- a/sys/contrib/openzfs/config/kernel.m4 +++ b/sys/contrib/openzfs/config/kernel.m4 @@ -1,1024 +1,1028 @@ dnl # dnl # Default ZFS kernel configuration dnl # AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ AM_COND_IF([BUILD_LINUX], [ dnl # Setup the kernel build environment. ZFS_AC_KERNEL ZFS_AC_QAT dnl # Sanity checks for module building and CONFIG_* defines ZFS_AC_KERNEL_CONFIG_DEFINED ZFS_AC_MODULE_SYMVERS dnl # Sequential ZFS_LINUX_TRY_COMPILE tests ZFS_AC_KERNEL_FPU_HEADER ZFS_AC_KERNEL_OBJTOOL_HEADER ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T ZFS_AC_KERNEL_MISC_MINOR ZFS_AC_KERNEL_DECLARE_EVENT_CLASS dnl # Parallel ZFS_LINUX_TEST_SRC / ZFS_LINUX_TEST_RESULT tests ZFS_AC_KERNEL_TEST_SRC ZFS_AC_KERNEL_TEST_RESULT AS_IF([test "$LINUX_OBJ" != "$LINUX"], [ KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ" ]) AC_SUBST(KERNEL_MAKE) ]) ]) dnl # dnl # Generate and compile all of the kernel API test cases to determine dnl # which interfaces are available. By invoking the kernel build system dnl # only once the compilation can be done in parallel significantly dnl # speeding up the process. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_OBJTOOL ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE ZFS_AC_KERNEL_SRC_PDE_DATA ZFS_AC_KERNEL_SRC_FALLOCATE ZFS_AC_KERNEL_SRC_FADVISE ZFS_AC_KERNEL_SRC_GENERIC_FADVISE ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_SRC_RWSEM ZFS_AC_KERNEL_SRC_SCHED ZFS_AC_KERNEL_SRC_USLEEP_RANGE ZFS_AC_KERNEL_SRC_KMEM_CACHE ZFS_AC_KERNEL_SRC_KVMALLOC ZFS_AC_KERNEL_SRC_VMALLOC_PAGE_KERNEL ZFS_AC_KERNEL_SRC_WAIT ZFS_AC_KERNEL_SRC_INODE_TIMES ZFS_AC_KERNEL_SRC_INODE_LOCK ZFS_AC_KERNEL_SRC_GROUP_INFO_GID ZFS_AC_KERNEL_SRC_RW ZFS_AC_KERNEL_SRC_TIMER_SETUP ZFS_AC_KERNEL_SRC_SUPER_USER_NS ZFS_AC_KERNEL_SRC_PROC_OPERATIONS ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_SRC_BIO ZFS_AC_KERNEL_SRC_BLKDEV ZFS_AC_KERNEL_SRC_BLK_QUEUE ZFS_AC_KERNEL_SRC_GENHD_FLAGS ZFS_AC_KERNEL_SRC_REVALIDATE_DISK ZFS_AC_KERNEL_SRC_GET_DISK_RO ZFS_AC_KERNEL_SRC_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_SRC_XATTR ZFS_AC_KERNEL_SRC_ACL ZFS_AC_KERNEL_SRC_INODE_SETATTR ZFS_AC_KERNEL_SRC_INODE_GETATTR ZFS_AC_KERNEL_SRC_INODE_SET_FLAGS ZFS_AC_KERNEL_SRC_INODE_SET_IVERSION ZFS_AC_KERNEL_SRC_SHOW_OPTIONS ZFS_AC_KERNEL_SRC_FILE_INODE ZFS_AC_KERNEL_SRC_FILE_DENTRY ZFS_AC_KERNEL_SRC_FSYNC ZFS_AC_KERNEL_SRC_AIO_FSYNC ZFS_AC_KERNEL_SRC_EVICT_INODE ZFS_AC_KERNEL_SRC_DIRTY_INODE ZFS_AC_KERNEL_SRC_SHRINKER ZFS_AC_KERNEL_SRC_MKDIR ZFS_AC_KERNEL_SRC_LOOKUP_FLAGS ZFS_AC_KERNEL_SRC_CREATE ZFS_AC_KERNEL_SRC_PERMISSION ZFS_AC_KERNEL_SRC_GET_LINK ZFS_AC_KERNEL_SRC_PUT_LINK ZFS_AC_KERNEL_SRC_TMPFILE ZFS_AC_KERNEL_SRC_AUTOMOUNT ZFS_AC_KERNEL_SRC_ENCODE_FH_WITH_INODE ZFS_AC_KERNEL_SRC_COMMIT_METADATA ZFS_AC_KERNEL_SRC_CLEAR_INODE ZFS_AC_KERNEL_SRC_SETATTR_PREPARE ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED ZFS_AC_KERNEL_SRC_DENTRY ZFS_AC_KERNEL_SRC_DENTRY_ALIAS_D_U ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SRC_SECURITY_INODE ZFS_AC_KERNEL_SRC_FST_MOUNT ZFS_AC_KERNEL_SRC_BDI ZFS_AC_KERNEL_SRC_SET_NLINK ZFS_AC_KERNEL_SRC_SGET ZFS_AC_KERNEL_SRC_LSEEK_EXECUTE 
ZFS_AC_KERNEL_SRC_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_SRC_VFS_READ_FOLIO ZFS_AC_KERNEL_SRC_VFS_GETATTR ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_ITERATE ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_READPAGES ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS ZFS_AC_KERNEL_SRC_VFS_IOV_ITER ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_FILE_OPERATIONS_EXTEND ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT ZFS_AC_KERNEL_SRC_FPU ZFS_AC_KERNEL_SRC_FMODE_T ZFS_AC_KERNEL_SRC_KUIDGID_T ZFS_AC_KERNEL_SRC_KUID_HELPERS ZFS_AC_KERNEL_SRC_RENAME ZFS_AC_KERNEL_SRC_CURRENT_TIME ZFS_AC_KERNEL_SRC_USERNS_CAPABILITIES ZFS_AC_KERNEL_SRC_IN_COMPAT_SYSCALL ZFS_AC_KERNEL_SRC_KTIME ZFS_AC_KERNEL_SRC_TOTALRAM_PAGES_FUNC ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES ZFS_AC_KERNEL_SRC_KSTRTOUL ZFS_AC_KERNEL_SRC_PERCPU ZFS_AC_KERNEL_SRC_CPU_HOTPLUG ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR ZFS_AC_KERNEL_SRC_MKNOD ZFS_AC_KERNEL_SRC_SYMLINK ZFS_AC_KERNEL_SRC_BIO_MAX_SEGS ZFS_AC_KERNEL_SRC_SIGNAL_STOP ZFS_AC_KERNEL_SRC_SIGINFO ZFS_AC_KERNEL_SRC_SYSFS ZFS_AC_KERNEL_SRC_SET_SPECIAL_STATE ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_SRC_PAGEMAP_FOLIO_WAIT_BIT ZFS_AC_KERNEL_SRC_ADD_DISK ZFS_AC_KERNEL_SRC_KTHREAD ZFS_AC_KERNEL_SRC_ZERO_PAGE ZFS_AC_KERNEL_SRC___COPY_FROM_USER_INATOMIC ZFS_AC_KERNEL_SRC_USER_NS_COMMON_INUM ZFS_AC_KERNEL_SRC_IDMAP_MNT_API ZFS_AC_KERNEL_SRC_IATTR_VFSID ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED + ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE ;; esac AC_MSG_CHECKING([for available kernel interfaces]) ZFS_LINUX_TEST_COMPILE_ALL([kabi]) AC_MSG_RESULT([done]) ]) dnl # dnl # Check results of kernel interface tests. 
dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_ACCESS_OK_TYPE ZFS_AC_KERNEL_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_OBJTOOL ZFS_AC_KERNEL_PDE_DATA ZFS_AC_KERNEL_FALLOCATE ZFS_AC_KERNEL_FADVISE ZFS_AC_KERNEL_GENERIC_FADVISE ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_RWSEM ZFS_AC_KERNEL_SCHED ZFS_AC_KERNEL_USLEEP_RANGE ZFS_AC_KERNEL_KMEM_CACHE ZFS_AC_KERNEL_KVMALLOC ZFS_AC_KERNEL_VMALLOC_PAGE_KERNEL ZFS_AC_KERNEL_WAIT ZFS_AC_KERNEL_INODE_TIMES ZFS_AC_KERNEL_INODE_LOCK ZFS_AC_KERNEL_GROUP_INFO_GID ZFS_AC_KERNEL_RW ZFS_AC_KERNEL_TIMER_SETUP ZFS_AC_KERNEL_SUPER_USER_NS ZFS_AC_KERNEL_PROC_OPERATIONS ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_BIO ZFS_AC_KERNEL_BLKDEV ZFS_AC_KERNEL_BLK_QUEUE ZFS_AC_KERNEL_GENHD_FLAGS ZFS_AC_KERNEL_REVALIDATE_DISK ZFS_AC_KERNEL_GET_DISK_RO ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_DISCARD_GRANULARITY ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_XATTR ZFS_AC_KERNEL_ACL ZFS_AC_KERNEL_INODE_SETATTR ZFS_AC_KERNEL_INODE_GETATTR ZFS_AC_KERNEL_INODE_SET_FLAGS ZFS_AC_KERNEL_INODE_SET_IVERSION ZFS_AC_KERNEL_SHOW_OPTIONS ZFS_AC_KERNEL_FILE_INODE ZFS_AC_KERNEL_FILE_DENTRY ZFS_AC_KERNEL_FSYNC ZFS_AC_KERNEL_AIO_FSYNC ZFS_AC_KERNEL_EVICT_INODE ZFS_AC_KERNEL_DIRTY_INODE ZFS_AC_KERNEL_SHRINKER ZFS_AC_KERNEL_MKDIR ZFS_AC_KERNEL_LOOKUP_FLAGS ZFS_AC_KERNEL_CREATE ZFS_AC_KERNEL_PERMISSION ZFS_AC_KERNEL_GET_LINK ZFS_AC_KERNEL_PUT_LINK ZFS_AC_KERNEL_TMPFILE ZFS_AC_KERNEL_AUTOMOUNT ZFS_AC_KERNEL_ENCODE_FH_WITH_INODE ZFS_AC_KERNEL_COMMIT_METADATA ZFS_AC_KERNEL_CLEAR_INODE ZFS_AC_KERNEL_SETATTR_PREPARE ZFS_AC_KERNEL_INSERT_INODE_LOCKED ZFS_AC_KERNEL_DENTRY ZFS_AC_KERNEL_DENTRY_ALIAS_D_U ZFS_AC_KERNEL_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SECURITY_INODE ZFS_AC_KERNEL_FST_MOUNT ZFS_AC_KERNEL_BDI ZFS_AC_KERNEL_SET_NLINK ZFS_AC_KERNEL_SGET ZFS_AC_KERNEL_LSEEK_EXECUTE ZFS_AC_KERNEL_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_VFS_READ_FOLIO ZFS_AC_KERNEL_VFS_GETATTR ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_ITERATE ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_READPAGES ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_VFS_RW_ITERATE ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE ZFS_AC_KERNEL_VFS_FILE_OPERATIONS_EXTEND ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_MAKE_REQUEST_FN ZFS_AC_KERNEL_GENERIC_IO_ACCT ZFS_AC_KERNEL_FPU ZFS_AC_KERNEL_FMODE_T ZFS_AC_KERNEL_KUIDGID_T ZFS_AC_KERNEL_KUID_HELPERS ZFS_AC_KERNEL_RENAME ZFS_AC_KERNEL_CURRENT_TIME ZFS_AC_KERNEL_USERNS_CAPABILITIES ZFS_AC_KERNEL_IN_COMPAT_SYSCALL ZFS_AC_KERNEL_KTIME ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC ZFS_AC_KERNEL_TOTALHIGH_PAGES ZFS_AC_KERNEL_KSTRTOUL ZFS_AC_KERNEL_PERCPU ZFS_AC_KERNEL_CPU_HOTPLUG ZFS_AC_KERNEL_GENERIC_FILLATTR ZFS_AC_KERNEL_MKNOD ZFS_AC_KERNEL_SYMLINK ZFS_AC_KERNEL_BIO_MAX_SEGS ZFS_AC_KERNEL_SIGNAL_STOP ZFS_AC_KERNEL_SIGINFO ZFS_AC_KERNEL_SYSFS ZFS_AC_KERNEL_SET_SPECIAL_STATE ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_PAGEMAP_FOLIO_WAIT_BIT ZFS_AC_KERNEL_ADD_DISK ZFS_AC_KERNEL_KTHREAD ZFS_AC_KERNEL_ZERO_PAGE ZFS_AC_KERNEL___COPY_FROM_USER_INATOMIC ZFS_AC_KERNEL_USER_NS_COMMON_INUM ZFS_AC_KERNEL_IDMAP_MNT_API ZFS_AC_KERNEL_IATTR_VFSID ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED + ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_COPY_SPLICE_READ case "$host_cpu" in powerpc*) 
ZFS_AC_KERNEL_CPU_HAS_FEATURE ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE ;; esac ]) dnl # dnl # Detect name used for Module.symvers file in kernel dnl # AC_DEFUN([ZFS_AC_MODULE_SYMVERS], [ modpost=$LINUX/scripts/Makefile.modpost AC_MSG_CHECKING([kernel file name for module symbols]) AS_IF([test "x$enable_linux_builtin" != xyes -a -f "$modpost"], [ AS_IF([grep -q Modules.symvers $modpost], [ LINUX_SYMBOLS=Modules.symvers ], [ LINUX_SYMBOLS=Module.symvers ]) AS_IF([test ! -f "$LINUX_OBJ/$LINUX_SYMBOLS"], [ AC_MSG_ERROR([ *** Please make sure the kernel devel package for your distribution *** is installed. If you are building with a custom kernel, make sure *** the kernel is configured, built, and the '--with-linux=PATH' *** configure option refers to the location of the kernel source. ]) ]) ], [ LINUX_SYMBOLS=NONE ]) AC_MSG_RESULT($LINUX_SYMBOLS) AC_SUBST(LINUX_SYMBOLS) ]) dnl # dnl # Detect the kernel to be built against dnl # dnl # Most modern Linux distributions have separate locations for bare dnl # source (source) and prebuilt (build) files. Additionally, there are dnl # `source` and `build` symlinks in `/lib/modules/$(KERNEL_VERSION)` dnl # pointing to them. The directory search order is now: dnl # dnl # - `configure` command line values if both `--with-linux` and dnl # `--with-linux-obj` were defined dnl # dnl # - If only `--with-linux` was defined, `--with-linux-obj` is assumed dnl # to have the same value as `--with-linux` dnl # dnl # - If neither `--with-linux` nor `--with-linux-obj` were defined dnl # autodetection is used: dnl # dnl # - `/lib/modules/$(uname -r)/{source,build}` respectively, if exist. dnl # dnl # - If only `/lib/modules/$(uname -r)/build` exists, it is assumed dnl # to be both source and build directory. dnl # dnl # - The first directory in `/lib/modules` with the highest version dnl # number according to `sort -V` which contains both `source` and dnl # `build` symlinks/directories. If module directory contains only dnl # `build` component, it is assumed to be both source and build dnl # directory. dnl # dnl # - Last resort: the first directory matching `/usr/src/kernels/*` dnl # and `/usr/src/linux-*` with the highest version number according dnl # to `sort -V` is assumed to be both source and build directory. 
dnl # AC_DEFUN([ZFS_AC_KERNEL], [ AC_ARG_WITH([linux], AS_HELP_STRING([--with-linux=PATH], [Path to kernel source]), [kernelsrc="$withval"]) AC_ARG_WITH(linux-obj, AS_HELP_STRING([--with-linux-obj=PATH], [Path to kernel build objects]), [kernelbuild="$withval"]) AC_MSG_CHECKING([kernel source and build directories]) AS_IF([test -n "$kernelsrc" && test -z "$kernelbuild"], [ kernelbuild="$kernelsrc" ], [test -z "$kernelsrc"], [ AS_IF([test -e "/lib/modules/$(uname -r)/source" && \ test -e "/lib/modules/$(uname -r)/build"], [ src="/lib/modules/$(uname -r)/source" build="/lib/modules/$(uname -r)/build" ], [test -e "/lib/modules/$(uname -r)/build"], [ build="/lib/modules/$(uname -r)/build" src="$build" ], [ src= for d in $(ls -1d /lib/modules/* 2>/dev/null | sort -Vr); do if test -e "$d/source" && test -e "$d/build"; then src="$d/source" build="$d/build" break fi if test -e "$d/build"; then src="$d/build" build="$d/build" break fi done # the least reliable method if test -z "$src"; then src=$(ls -1d /usr/src/kernels/* /usr/src/linux-* \ 2>/dev/null | grep -v obj | sort -Vr | head -1) build="$src" fi ]) AS_IF([test -n "$src" && test -e "$src"], [ kernelsrc=$(readlink -e "$src") ], [ kernelsrc="[Not found]" ]) AS_IF([test -n "$build" && test -e "$build"], [ kernelbuild=$(readlink -e "$build") ], [ kernelbuild="[Not found]" ]) ], [ AS_IF([test "$kernelsrc" = "NONE"], [ kernsrcver=NONE ]) withlinux=yes ]) AC_MSG_RESULT([done]) AC_MSG_CHECKING([kernel source directory]) AC_MSG_RESULT([$kernelsrc]) AC_MSG_CHECKING([kernel build directory]) AC_MSG_RESULT([$kernelbuild]) AS_IF([test ! -d "$kernelsrc" || test ! -d "$kernelbuild"], [ AC_MSG_ERROR([ *** Please make sure the kernel devel package for your distribution *** is installed and then try again. If that fails, you can specify the *** location of the kernel source and build with the '--with-linux=PATH' and *** '--with-linux-obj=PATH' options respectively.]) ]) AC_MSG_CHECKING([kernel source version]) utsrelease1=$kernelbuild/include/linux/version.h utsrelease2=$kernelbuild/include/linux/utsrelease.h utsrelease3=$kernelbuild/include/generated/utsrelease.h AS_IF([test -r $utsrelease1 && grep -qF UTS_RELEASE $utsrelease1], [ utsrelease=$utsrelease1 ], [test -r $utsrelease2 && grep -qF UTS_RELEASE $utsrelease2], [ utsrelease=$utsrelease2 ], [test -r $utsrelease3 && grep -qF UTS_RELEASE $utsrelease3], [ utsrelease=$utsrelease3 ]) AS_IF([test -n "$utsrelease"], [ kernsrcver=$($AWK '/UTS_RELEASE/ { gsub(/"/, "", $[3]); print $[3] }' $utsrelease) AS_IF([test -z "$kernsrcver"], [ AC_MSG_RESULT([Not found]) AC_MSG_ERROR([ *** Cannot determine kernel version. ]) ]) ], [ AC_MSG_RESULT([Not found]) if test "x$enable_linux_builtin" != xyes; then AC_MSG_ERROR([ *** Cannot find UTS_RELEASE definition. ]) else AC_MSG_ERROR([ *** Cannot find UTS_RELEASE definition. *** Please run 'make prepare' inside the kernel source tree.]) fi ]) AC_MSG_RESULT([$kernsrcver]) AS_VERSION_COMPARE([$kernsrcver], [$ZFS_META_KVER_MIN], [ AC_MSG_ERROR([ *** Cannot build against kernel version $kernsrcver. *** The minimum supported kernel version is $ZFS_META_KVER_MIN. 
]) ]) LINUX=${kernelsrc} LINUX_OBJ=${kernelbuild} LINUX_VERSION=${kernsrcver} AC_SUBST(LINUX) AC_SUBST(LINUX_OBJ) AC_SUBST(LINUX_VERSION) ]) dnl # dnl # Detect the QAT module to be built against, QAT provides hardware dnl # acceleration for data compression: dnl # dnl # https://01.org/intel-quickassist-technology dnl # dnl # 1) Download and install QAT driver from the above link dnl # 2) Start QAT driver in your system: dnl # service qat_service start dnl # 3) Enable QAT in ZFS, e.g.: dnl # ./configure --with-qat=/QAT1.6 dnl # make dnl # 4) Set GZIP compression in ZFS dataset: dnl # zfs set compression = gzip dnl # dnl # Then the data written to this ZFS pool is compressed by QAT accelerator dnl # automatically, and de-compressed by QAT when read from the pool. dnl # dnl # 1) Get QAT hardware statistics with: dnl # cat /proc/icp_dh895xcc_dev/qat dnl # 2) To disable QAT: dnl # insmod zfs.ko zfs_qat_disable=1 dnl # AC_DEFUN([ZFS_AC_QAT], [ AC_ARG_WITH([qat], AS_HELP_STRING([--with-qat=PATH], [Path to qat source]), AS_IF([test "$withval" = "yes"], AC_MSG_ERROR([--with-qat=PATH requires a PATH]), [qatsrc="$withval"])) AC_ARG_WITH([qat-obj], AS_HELP_STRING([--with-qat-obj=PATH], [Path to qat build objects]), [qatbuild="$withval"]) AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat source directory]) AC_MSG_RESULT([$qatsrc]) QAT_SRC="${qatsrc}/quickassist" AS_IF([ test ! -e "$QAT_SRC/include/cpa.h"], [ AC_MSG_ERROR([ *** Please make sure the qat driver package is installed *** and specify the location of the qat source with the *** '--with-qat=PATH' option then try again. Failed to *** find cpa.h in: ${QAT_SRC}/include]) ]) ]) AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat build directory]) AS_IF([test -z "$qatbuild"], [ qatbuild="${qatsrc}/build" ]) AC_MSG_RESULT([$qatbuild]) QAT_OBJ=${qatbuild} AS_IF([ ! test -e "$QAT_OBJ/icp_qa_al.ko" && ! test -e "$QAT_OBJ/qat_api.ko"], [ AC_MSG_ERROR([ *** Please make sure the qat driver is installed then try again. *** Failed to find icp_qa_al.ko or qat_api.ko in: $QAT_OBJ]) ]) AC_SUBST(QAT_SRC) AC_SUBST(QAT_OBJ) AC_DEFINE(HAVE_QAT, 1, [qat is enabled and existed]) ]) dnl # dnl # Detect the name used for the QAT Module.symvers file. dnl # AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat file for module symbols]) QAT_SYMBOLS=$QAT_SRC/lookaside/access_layer/src/Module.symvers AS_IF([test -r $QAT_SYMBOLS], [ AC_MSG_RESULT([$QAT_SYMBOLS]) AC_SUBST(QAT_SYMBOLS) ],[ AC_MSG_ERROR([ *** Please make sure the qat driver is installed then try again. *** Failed to find Module.symvers in: $QAT_SYMBOLS ]) ]) ]) ]) dnl # dnl # ZFS_LINUX_CONFTEST_H dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_H], [ test -d build/$2 || mkdir -p build/$2 cat - <<_ACEOF >build/$2/$2.h $1 _ACEOF ]) dnl # dnl # ZFS_LINUX_CONFTEST_C dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_C], [ test -d build/$2 || mkdir -p build/$2 cat confdefs.h - <<_ACEOF >build/$2/$2.c $1 _ACEOF ]) dnl # dnl # ZFS_LINUX_CONFTEST_MAKEFILE dnl # dnl # $1 - test case name dnl # $2 - add to top-level Makefile dnl # $3 - additional build flags dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_MAKEFILE], [ test -d build || mkdir -p build test -d build/$1 || mkdir -p build/$1 file=build/$1/Makefile dnl # Example command line to manually build source. cat - <<_ACEOF >$file # Example command line to manually build source # make modules -C $LINUX_OBJ $ARCH_UM M=$PWD/build/$1 ccflags-y := -Werror $FRAME_LARGER_THAN _ACEOF dnl # Additional custom CFLAGS as requested. 
m4_ifval($3, [echo "ccflags-y += $3" >>$file], []) dnl # Test case source echo "obj-m := $1.o" >>$file AS_IF([test "x$2" = "xyes"], [echo "obj-m += $1/" >>build/Makefile], []) ]) dnl # dnl # ZFS_LINUX_TEST_PROGRAM(C)([PROLOGUE], [BODY]) dnl # m4_define([ZFS_LINUX_TEST_PROGRAM], [ #include $1 int main (void) { $2 ; return 0; } MODULE_DESCRIPTION("conftest"); MODULE_AUTHOR(ZFS_META_AUTHOR); MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); MODULE_LICENSE($3); ]) dnl # dnl # ZFS_LINUX_TEST_REMOVE dnl # dnl # Removes the specified test source and results. dnl # AC_DEFUN([ZFS_LINUX_TEST_REMOVE], [ test -d build/$1 && rm -Rf build/$1 test -f build/Makefile && sed '/$1/d' build/Makefile ]) dnl # dnl # ZFS_LINUX_COMPILE dnl # dnl # $1 - build dir dnl # $2 - test command dnl # $3 - pass command dnl # $4 - fail command dnl # $5 - set KBUILD_MODPOST_NOFINAL='yes' dnl # $6 - set KBUILD_MODPOST_WARN='yes' dnl # dnl # Used internally by ZFS_LINUX_TEST_{COMPILE,MODPOST} dnl # AC_DEFUN([ZFS_LINUX_COMPILE], [ AC_ARG_VAR([KERNEL_CC], [C compiler for building kernel modules]) AC_ARG_VAR([KERNEL_LD], [Linker for building kernel modules]) AC_ARG_VAR([KERNEL_LLVM], [Binary option to build kernel modules with LLVM/CLANG toolchain]) AC_TRY_COMMAND([ KBUILD_MODPOST_NOFINAL="$5" KBUILD_MODPOST_WARN="$6" make modules -k -j$TEST_JOBS ${KERNEL_CC:+CC=$KERNEL_CC} ${KERNEL_LD:+LD=$KERNEL_LD} ${KERNEL_LLVM:+LLVM=$KERNEL_LLVM} CONFIG_MODULES=y CFLAGS_MODULE=-DCONFIG_MODULES -C $LINUX_OBJ $ARCH_UM M=$PWD/$1 >$1/build.log 2>&1]) AS_IF([AC_TRY_COMMAND([$2])], [$3], [$4]) ]) dnl # dnl # ZFS_LINUX_TEST_COMPILE dnl # dnl # Perform a full compile excluding the final modpost phase. dnl # AC_DEFUN([ZFS_LINUX_TEST_COMPILE], [ ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ mv $2/Makefile $2/Makefile.compile.$1 mv $2/build.log $2/build.log.$1 ],[ AC_MSG_ERROR([ *** Unable to compile test source to determine kernel interfaces.]) ], [yes], []) ]) dnl # dnl # ZFS_LINUX_TEST_MODPOST dnl # dnl # Perform a full compile including the modpost phase. This may dnl # be an incremental build if the objects have already been built. dnl # AC_DEFUN([ZFS_LINUX_TEST_MODPOST], [ ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ mv $2/Makefile $2/Makefile.modpost.$1 cat $2/build.log >>build/build.log.$1 ],[ AC_MSG_ERROR([ *** Unable to modpost test source to determine kernel interfaces.]) ], [], [yes]) ]) dnl # dnl # Perform the compilation of the test cases in two phases. dnl # dnl # Phase 1) attempt to build the object files for all of the tests dnl # defined by the ZFS_LINUX_TEST_SRC macro. But do not dnl # perform the final modpost stage. dnl # dnl # Phase 2) disable all tests which failed the initial compilation, dnl # then invoke the final modpost step for the remaining tests. dnl # dnl # This allows us efficiently build the test cases in parallel while dnl # remaining resilient to build failures which are expected when dnl # detecting the available kernel interfaces. dnl # dnl # The maximum allowed parallelism can be controlled by setting the dnl # TEST_JOBS environment variable. Otherwise, it default to $(nproc). dnl # AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [ dnl # Phase 1 - Compilation only, final linking is skipped. ZFS_LINUX_TEST_COMPILE([$1], [build]) dnl # dnl # Phase 2 - When building external modules disable test cases dnl # which failed to compile and invoke modpost to verify the dnl # final linking. 
dnl # dnl # Test names suffixed with '_license' call modpost independently dnl # to ensure that a single incompatibility does not result in the dnl # modpost phase exiting early. This check is not performed on dnl # every symbol since the majority are compatible and doing so dnl # would significantly slow down this phase. dnl # dnl # When configuring for builtin (--enable-linux-builtin) dnl # fake the linking step artificially create the expected .ko dnl # files for tests which did compile. This is required for dnl # kernels which do not have loadable module support or have dnl # not yet been built. dnl # AS_IF([test "x$enable_linux_builtin" = "xno"], [ for dir in $(awk '/^obj-m/ { print [$]3 }' \ build/Makefile.compile.$1); do name=${dir%/} AS_IF([test -f build/$name/$name.o], [ AS_IF([test "${name##*_}" = "license"], [ ZFS_LINUX_TEST_MODPOST([$1], [build/$name]) echo "obj-n += $dir" >>build/Makefile ], [ echo "obj-m += $dir" >>build/Makefile ]) ], [ echo "obj-n += $dir" >>build/Makefile ]) done ZFS_LINUX_TEST_MODPOST([$1], [build]) ], [ for dir in $(awk '/^obj-m/ { print [$]3 }' \ build/Makefile.compile.$1); do name=${dir%/} AS_IF([test -f build/$name/$name.o], [ touch build/$name/$name.ko ]) done ]) ]) dnl # dnl # ZFS_LINUX_TEST_SRC dnl # dnl # $1 - name dnl # $2 - global dnl # $3 - source dnl # $4 - extra cflags dnl # $5 - check license-compatibility dnl # dnl # Check if the test source is buildable at all and then if it is dnl # license compatible. dnl # dnl # N.B because all of the test cases are compiled in parallel they dnl # must never depend on the results of previous tests. Each test dnl # needs to be entirely independent. dnl # AC_DEFUN([ZFS_LINUX_TEST_SRC], [ ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM([[$2]], [[$3]], [["Dual BSD/GPL"]])], [$1]) ZFS_LINUX_CONFTEST_MAKEFILE([$1], [yes], [$4]) AS_IF([ test -n "$5" ], [ ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM( [[$2]], [[$3]], [[$5]])], [$1_license]) ZFS_LINUX_CONFTEST_MAKEFILE([$1_license], [yes], [$4]) ]) ]) dnl # dnl # ZFS_LINUX_TEST_RESULT dnl # dnl # $1 - name of a test source (ZFS_LINUX_TEST_SRC) dnl # $2 - run on success (valid .ko generated) dnl # $3 - run on failure (unable to compile) dnl # AC_DEFUN([ZFS_LINUX_TEST_RESULT], [ AS_IF([test -d build/$1], [ AS_IF([test -f build/$1/$1.ko], [$2], [$3]) ], [ AC_MSG_ERROR([ *** No matching source for the "$1" test, check that *** both the test source and result macros refer to the same name. ]) ]) ]) dnl # dnl # ZFS_LINUX_TEST_ERROR dnl # dnl # Generic error message which can be used when none of the expected dnl # kernel interfaces were detected. dnl # AC_DEFUN([ZFS_LINUX_TEST_ERROR], [ AC_MSG_ERROR([ *** None of the expected "$1" interfaces were detected. *** This may be because your kernel version is newer than what is *** supported, or you are using a patched custom kernel with *** incompatible modifications. *** *** ZFS Version: $ZFS_META_ALIAS *** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX ]) ]) dnl # dnl # ZFS_LINUX_TEST_RESULT_SYMBOL dnl # dnl # Like ZFS_LINUX_TEST_RESULT except ZFS_CHECK_SYMBOL_EXPORT is called to dnl # verify symbol exports, unless --enable-linux-builtin was provided to dnl # configure. dnl # AC_DEFUN([ZFS_LINUX_TEST_RESULT_SYMBOL], [ AS_IF([ ! 
test -f build/$1/$1.ko], [ $5 ], [ AS_IF([test "x$enable_linux_builtin" != "xyes"], [ ZFS_CHECK_SYMBOL_EXPORT([$2], [$3], [$4], [$5]) ], [ $4 ]) ]) ]) dnl # dnl # ZFS_LINUX_COMPILE_IFELSE dnl # AC_DEFUN([ZFS_LINUX_COMPILE_IFELSE], [ ZFS_LINUX_TEST_REMOVE([conftest]) m4_ifvaln([$1], [ZFS_LINUX_CONFTEST_C([$1], [conftest])]) m4_ifvaln([$5], [ZFS_LINUX_CONFTEST_H([$5], [conftest])], [ZFS_LINUX_CONFTEST_H([], [conftest])]) ZFS_LINUX_CONFTEST_MAKEFILE([conftest], [no], [m4_ifvaln([$5], [-I$PWD/build/conftest], [])]) ZFS_LINUX_COMPILE([build/conftest], [$2], [$3], [$4], [], []) ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE dnl # dnl # $1 - global dnl # $2 - source dnl # $3 - run on success (valid .ko generated) dnl # $4 - run on failure (unable to compile) dnl # dnl # When configuring as builtin (--enable-linux-builtin) for kernels dnl # without loadable module support (CONFIG_MODULES=n) only the object dnl # file is created. See ZFS_LINUX_TEST_COMPILE_ALL for details. dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE], [ AS_IF([test "x$enable_linux_builtin" = "xyes"], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.o], [$3], [$4]) ], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4]) ]) ]) dnl # dnl # ZFS_CHECK_SYMBOL_EXPORT dnl # dnl # Check if a symbol is exported on not by consulting the symbols dnl # file, or optionally the source code. dnl # AC_DEFUN([ZFS_CHECK_SYMBOL_EXPORT], [ grep -q -E '[[[:space:]]]$1[[[:space:]]]' \ $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null rc=$? if test $rc -ne 0; then export=0 for file in $2; do grep -q -E "EXPORT_SYMBOL.*($1)" \ "$LINUX/$file" 2>/dev/null rc=$? if test $rc -eq 0; then export=1 break; fi done if test $export -eq 0; then : $4 else : $3 fi else : $3 fi ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE_SYMBOL dnl # dnl # Like ZFS_LINUX_TRY_COMPILER except ZFS_CHECK_SYMBOL_EXPORT is called dnl # to verify symbol exports, unless --enable-linux-builtin was provided dnl # to configure. dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_SYMBOL], [ ZFS_LINUX_TRY_COMPILE([$1], [$2], [rc=0], [rc=1]) if test $rc -ne 0; then : $6 else if test "x$enable_linux_builtin" != xyes; then ZFS_CHECK_SYMBOL_EXPORT([$3], [$4], [rc=0], [rc=1]) fi if test $rc -ne 0; then : $6 else : $5 fi fi ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE_HEADER dnl # like ZFS_LINUX_TRY_COMPILE, except the contents conftest.h are dnl # provided via the fifth parameter dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_HEADER], [ AS_IF([test "x$enable_linux_builtin" = "xyes"], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.o], [$3], [$4], [$5]) ], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4], [$5]) ]) ]) dnl # dnl # AS_VERSION_COMPARE_LE dnl # like AS_VERSION_COMPARE_LE, but runs $3 if (and only if) $1 <= $2 dnl # AS_VERSION_COMPARE_LE (version-1, version-2, [action-if-less-or-equal], [action-if-greater]) dnl # AC_DEFUN([AS_VERSION_COMPARE_LE], [ AS_VERSION_COMPARE([$1], [$2], [$3], [$3], [$4]) ]) dnl # dnl # ZFS_LINUX_REQUIRE_API dnl # like ZFS_LINUX_TEST_ERROR, except only fails if the kernel is dnl # at least some specified version. dnl # AC_DEFUN([ZFS_LINUX_REQUIRE_API], [ AS_VERSION_COMPARE_LE([$2], [$kernsrcver], [ AC_MSG_ERROR([ *** None of the expected "$1" interfaces were detected. 
This *** interface is expected for kernels version "$2" and above. *** This may be because your kernel version is newer than what is *** supported, or you are using a patched custom kernel with *** incompatible modifications. Newer kernels may have incompatible *** APIs. *** *** ZFS Version: $ZFS_META_ALIAS *** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX ]) ], [ AC_MSG_RESULT(no) ]) ]) diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h index 5948e44daab1..56a0ac96ac19 100644 --- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h @@ -1,312 +1,311 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. */ #ifndef _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H #if __FreeBSD_version >= 1300125 #define TEARDOWN_RMS #endif #if __FreeBSD_version >= 1300109 #define TEARDOWN_INACTIVE_RMS #endif #include #include #include #include #include #include #ifdef TEARDOWN_INACTIVE_RMS #include #endif #include #ifdef __cplusplus extern "C" { #endif #ifdef TEARDOWN_RMS typedef struct rmslock zfs_teardown_lock_t; #else #define zfs_teardown_lock_t rrmlock_t #endif #ifdef TEARDOWN_INACTIVE_RMS typedef struct rmslock zfs_teardown_inactive_lock_t; #else #define zfs_teardown_inactive_lock_t krwlock_t #endif typedef struct zfsvfs zfsvfs_t; struct znode; struct zfsvfs { vfs_t *z_vfs; /* generic fs struct */ zfsvfs_t *z_parent; /* parent fs */ objset_t *z_os; /* objset reference */ uint64_t z_flags; /* super_block flags */ uint64_t z_root; /* id of root znode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_size; /* fuid table size */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ krwlock_t z_fuid_lock; /* fuid lock */ boolean_t z_fuid_loaded; /* fuid tables are loaded */ boolean_t z_fuid_dirty; /* need to sync fuid table ? 
*/ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_type; /* type of acl usable on this fs */ uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_inherit; /* acl inheritance behavior */ zfs_case_t z_case; /* case-sense */ boolean_t z_utf8; /* utf8-only */ int z_norm; /* normalization flags */ boolean_t z_atime; /* enable atimes mount option */ boolean_t z_unmounted; /* unmounted */ zfs_teardown_lock_t z_teardown_lock; zfs_teardown_inactive_lock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all vnodes in the fs */ - uint64_t z_nr_znodes; /* number of znodes in the fs */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ struct zfsctl_root *z_ctldir; /* .zfs directory pointer */ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ boolean_t z_use_sa; /* version allow system attributes */ boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */ boolean_t z_use_namecache; /* make use of FreeBSD name cache */ uint8_t z_xattr; /* xattr type in use */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ dataset_kstats_t z_kstat; /* fs kstats */ kmutex_t z_lock; uint64_t z_userquota_obj; uint64_t z_groupquota_obj; uint64_t z_userobjquota_obj; uint64_t z_groupobjquota_obj; uint64_t z_projectquota_obj; uint64_t z_projectobjquota_obj; uint64_t z_replay_eof; /* New end of file - replay only */ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ struct task z_unlinked_drain_task; }; #ifdef TEARDOWN_RMS #define ZFS_TEARDOWN_INIT(zfsvfs) \ rms_init(&(zfsvfs)->z_teardown_lock, "zfs teardown") #define ZFS_TEARDOWN_DESTROY(zfsvfs) \ rms_destroy(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \ rms_rlock(&(zfsvfs)->z_teardown_lock); #define ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag) \ rms_runlock(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, tag) \ rms_wlock(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_EXIT_WRITE(zfsvfs) \ rms_wunlock(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_EXIT(zfsvfs, tag) \ rms_unlock(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_READ_HELD(zfsvfs) \ rms_rowned(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_WRITE_HELD(zfsvfs) \ rms_wowned(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_HELD(zfsvfs) \ rms_owned_any(&(zfsvfs)->z_teardown_lock) #else #define ZFS_TEARDOWN_INIT(zfsvfs) \ rrm_init(&(zfsvfs)->z_teardown_lock, B_FALSE) #define ZFS_TEARDOWN_DESTROY(zfsvfs) \ rrm_destroy(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \ rrm_enter_read(&(zfsvfs)->z_teardown_lock, tag); #define ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag) \ rrm_exit(&(zfsvfs)->z_teardown_lock, tag) #define ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, tag) \ rrm_enter(&(zfsvfs)->z_teardown_lock, RW_WRITER, tag) #define ZFS_TEARDOWN_EXIT_WRITE(zfsvfs) \ rrm_exit(&(zfsvfs)->z_teardown_lock, tag) #define ZFS_TEARDOWN_EXIT(zfsvfs, tag) \ rrm_exit(&(zfsvfs)->z_teardown_lock, tag) #define ZFS_TEARDOWN_READ_HELD(zfsvfs) \ RRM_READ_HELD(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_WRITE_HELD(zfsvfs) \ RRM_WRITE_HELD(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_HELD(zfsvfs) \ RRM_LOCK_HELD(&(zfsvfs)->z_teardown_lock) #endif #ifdef TEARDOWN_INACTIVE_RMS #define ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs) \ 
rms_init(&(zfsvfs)->z_teardown_inactive_lock, "zfs teardown inactive") #define ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs) \ rms_destroy(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs) \ rms_try_rlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs) \ rms_rlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs) \ rms_runlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs) \ rms_wlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs) \ rms_wunlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs) \ rms_wowned(&(zfsvfs)->z_teardown_inactive_lock) #else #define ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs) \ rw_init(&(zfsvfs)->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL) #define ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs) \ rw_destroy(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs) \ rw_tryenter(&(zfsvfs)->z_teardown_inactive_lock, RW_READER) #define ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs) \ rw_enter(&(zfsvfs)->z_teardown_inactive_lock, RW_READER) #define ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs) \ rw_exit(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs) \ rw_enter(&(zfsvfs)->z_teardown_inactive_lock, RW_WRITER) #define ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs) \ rw_exit(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs) \ RW_WRITE_HELD(&(zfsvfs)->z_teardown_inactive_lock) #endif #define ZSB_XATTR 0x0001 /* Enable user xattrs */ /* * Normal filesystems (those not under .zfs/snapshot) have a total * file ID size limited to 12 bytes (including the length field) due to * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical * reasons, this same limit is being imposed by the Solaris NFSv3 implementation * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It * is not possible to expand beyond 12 bytes without abandoning support * of NFSv2. * * For normal filesystems, we partition up the available space as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * * We reserve only 48 bits for the object number, as this is the limit * currently defined and imposed by the DMU. */ typedef struct zfid_short { uint16_t zf_len; uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_short_t; /* * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes * (including the length field). This makes files under .zfs/snapshot * accessible by NFSv3 and NFSv4, but not NFSv2. * * For files under .zfs/snapshot, we partition up the available space * as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * 6 bytes objset id (48 bits) * 4 bytes[**] currently just zero (32 bits) * * We reserve only 48 bits for the object number and objset id, as these are * the limits currently defined and imposed by the DMU. * * [*] 20 bytes on FreeBSD to fit into the size of struct fid. * [**] 2 bytes on FreeBSD for the above reason. 
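To make the byte layout described in these comments concrete, the stand-alone sketch below packs an object and generation number exactly as the "obj[i] = obj >> (8 * i)" annotations specify; it is illustrative only and is not the zfs_fid() implementation.

#include <stdint.h>

/*
 * Illustration of the encoding documented above: a 48-bit object
 * number and a 32-bit generation number stored least-significant
 * byte first in the short fid.
 */
static void
example_pack_short_fid(uint64_t object, uint32_t gen,
    uint8_t zf_object[6], uint8_t zf_gen[4])
{
	for (int i = 0; i < 6; i++)
		zf_object[i] = (uint8_t)(object >> (8 * i));
	for (int i = 0; i < 4; i++)
		zf_gen[i] = (uint8_t)(gen >> (8 * i));
}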
*/ typedef struct zfid_long { zfid_short_t z_fid; uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */ } zfid_long_t; #define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) extern uint_t zfs_fsyncer_key; extern int zfs_super_owner; extern int zfs_bclone_enabled; extern void zfs_init(void); extern void zfs_fini(void); extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); extern int zfsvfs_create(const char *name, boolean_t readonly, zfsvfs_t **zfvp); extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); extern void zfsvfs_free(zfsvfs_t *zfsvfs); extern int zfs_check_global_label(const char *dsname, const char *hexsl); extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); extern int zfs_get_temporary_prop(struct dsl_dataset *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint); extern int zfs_busy(void); #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h index e0f20ba32008..f111e648ccf7 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h @@ -1,774 +1,781 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2011 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * LLNL-CODE-403049. */ #ifndef _ZFS_BLKDEV_H #define _ZFS_BLKDEV_H #include #include #include #include #include /* for SECTOR_* */ #include #ifdef HAVE_BLK_MQ #include #endif #ifndef HAVE_BLK_QUEUE_FLAG_SET static inline void blk_queue_flag_set(unsigned int flag, struct request_queue *q) { queue_flag_set(flag, q); } #endif #ifndef HAVE_BLK_QUEUE_FLAG_CLEAR static inline void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) { queue_flag_clear(flag, q); } #endif /* * 4.7 API, * The blk_queue_write_cache() interface has replaced blk_queue_flush() * interface. However, the new interface is GPL-only thus we implement * our own trivial wrapper when the GPL-only version is detected. * * 2.6.36 - 4.6 API, * The blk_queue_flush() interface has replaced blk_queue_ordered() * interface. However, while the old interface was available to all the * new one is GPL-only. Thus if the GPL-only version is detected we * implement our own trivial helper. 
*/ static inline void blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua) { #if defined(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY) if (wc) blk_queue_flag_set(QUEUE_FLAG_WC, q); else blk_queue_flag_clear(QUEUE_FLAG_WC, q); if (fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); else blk_queue_flag_clear(QUEUE_FLAG_FUA, q); #elif defined(HAVE_BLK_QUEUE_WRITE_CACHE) blk_queue_write_cache(q, wc, fua); #elif defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY) if (wc) q->flush_flags |= REQ_FLUSH; if (fua) q->flush_flags |= REQ_FUA; #elif defined(HAVE_BLK_QUEUE_FLUSH) blk_queue_flush(q, (wc ? REQ_FLUSH : 0) | (fua ? REQ_FUA : 0)); #else #error "Unsupported kernel" #endif } static inline void blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages) { #if !defined(HAVE_BLK_QUEUE_UPDATE_READAHEAD) && \ !defined(HAVE_DISK_UPDATE_READAHEAD) #ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC q->backing_dev_info->ra_pages = ra_pages; #else q->backing_dev_info.ra_pages = ra_pages; #endif #endif } #ifdef HAVE_BIO_BVEC_ITER #define BIO_BI_SECTOR(bio) (bio)->bi_iter.bi_sector #define BIO_BI_SIZE(bio) (bio)->bi_iter.bi_size #define BIO_BI_IDX(bio) (bio)->bi_iter.bi_idx #define BIO_BI_SKIP(bio) (bio)->bi_iter.bi_bvec_done #define bio_for_each_segment4(bv, bvp, b, i) \ bio_for_each_segment((bv), (b), (i)) typedef struct bvec_iter bvec_iterator_t; #else #define BIO_BI_SECTOR(bio) (bio)->bi_sector #define BIO_BI_SIZE(bio) (bio)->bi_size #define BIO_BI_IDX(bio) (bio)->bi_idx #define BIO_BI_SKIP(bio) (0) #define bio_for_each_segment4(bv, bvp, b, i) \ bio_for_each_segment((bvp), (b), (i)) typedef int bvec_iterator_t; #endif static inline void bio_set_flags_failfast(struct block_device *bdev, int *flags, bool dev, bool transport, bool driver) { #ifdef CONFIG_BUG /* * Disable FAILFAST for loopback devices because of the * following incorrect BUG_ON() in loop_make_request(). * This support is also disabled for md devices because the * test suite layers md devices on top of loopback devices. * This may be removed when the loopback driver is fixed. * * BUG_ON(!lo || (rw != READ && rw != WRITE)); */ if ((MAJOR(bdev->bd_dev) == LOOP_MAJOR) || (MAJOR(bdev->bd_dev) == MD_MAJOR)) return; #ifdef BLOCK_EXT_MAJOR if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) return; #endif /* BLOCK_EXT_MAJOR */ #endif /* CONFIG_BUG */ if (dev) *flags |= REQ_FAILFAST_DEV; if (transport) *flags |= REQ_FAILFAST_TRANSPORT; if (driver) *flags |= REQ_FAILFAST_DRIVER; } /* * Maximum disk label length, it may be undefined for some kernels. 
*/ #if !defined(DISK_NAME_LEN) #define DISK_NAME_LEN 32 #endif /* DISK_NAME_LEN */ #ifdef HAVE_BIO_BI_STATUS static inline int bi_status_to_errno(blk_status_t status) { switch (status) { case BLK_STS_OK: return (0); case BLK_STS_NOTSUPP: return (EOPNOTSUPP); case BLK_STS_TIMEOUT: return (ETIMEDOUT); case BLK_STS_NOSPC: return (ENOSPC); case BLK_STS_TRANSPORT: return (ENOLINK); case BLK_STS_TARGET: return (EREMOTEIO); #ifdef HAVE_BLK_STS_RESV_CONFLICT case BLK_STS_RESV_CONFLICT: #else case BLK_STS_NEXUS: #endif return (EBADE); case BLK_STS_MEDIUM: return (ENODATA); case BLK_STS_PROTECTION: return (EILSEQ); case BLK_STS_RESOURCE: return (ENOMEM); case BLK_STS_AGAIN: return (EAGAIN); case BLK_STS_IOERR: return (EIO); default: return (EIO); } } static inline blk_status_t errno_to_bi_status(int error) { switch (error) { case 0: return (BLK_STS_OK); case EOPNOTSUPP: return (BLK_STS_NOTSUPP); case ETIMEDOUT: return (BLK_STS_TIMEOUT); case ENOSPC: return (BLK_STS_NOSPC); case ENOLINK: return (BLK_STS_TRANSPORT); case EREMOTEIO: return (BLK_STS_TARGET); case EBADE: #ifdef HAVE_BLK_STS_RESV_CONFLICT return (BLK_STS_RESV_CONFLICT); #else return (BLK_STS_NEXUS); #endif case ENODATA: return (BLK_STS_MEDIUM); case EILSEQ: return (BLK_STS_PROTECTION); case ENOMEM: return (BLK_STS_RESOURCE); case EAGAIN: return (BLK_STS_AGAIN); case EIO: return (BLK_STS_IOERR); default: return (BLK_STS_IOERR); } } #endif /* HAVE_BIO_BI_STATUS */ /* * 4.3 API change * The bio_endio() prototype changed slightly. These are helper * macro's to ensure the prototype and invocation are handled. */ #ifdef HAVE_1ARG_BIO_END_IO_T #ifdef HAVE_BIO_BI_STATUS #define BIO_END_IO_ERROR(bio) bi_status_to_errno(bio->bi_status) #define BIO_END_IO_PROTO(fn, x, z) static void fn(struct bio *x) #define BIO_END_IO(bio, error) bio_set_bi_status(bio, error) static inline void bio_set_bi_status(struct bio *bio, int error) { ASSERT3S(error, <=, 0); bio->bi_status = errno_to_bi_status(-error); bio_endio(bio); } #else #define BIO_END_IO_ERROR(bio) (-(bio->bi_error)) #define BIO_END_IO_PROTO(fn, x, z) static void fn(struct bio *x) #define BIO_END_IO(bio, error) bio_set_bi_error(bio, error) static inline void bio_set_bi_error(struct bio *bio, int error) { ASSERT3S(error, <=, 0); bio->bi_error = error; bio_endio(bio); } #endif /* HAVE_BIO_BI_STATUS */ #else #define BIO_END_IO_PROTO(fn, x, z) static void fn(struct bio *x, int z) #define BIO_END_IO(bio, error) bio_endio(bio, error); #endif /* HAVE_1ARG_BIO_END_IO_T */ /* * 5.15 MACRO, * GD_DEAD * * 2.6.36 - 5.14 MACRO, * GENHD_FL_UP * * Check the disk status and return B_TRUE if alive * otherwise B_FALSE */ static inline boolean_t zfs_check_disk_status(struct block_device *bdev) { #if defined(GENHD_FL_UP) return (!!(bdev->bd_disk->flags & GENHD_FL_UP)); #elif defined(GD_DEAD) return (!test_bit(GD_DEAD, &bdev->bd_disk->state)); #else /* * This is encountered if neither GENHD_FL_UP nor GD_DEAD is available in * the kernel - likely due to an MACRO change that needs to be chased down. */ #error "Unsupported kernel: no usable disk status check" #endif } /* * 4.1 API, * 3.10.0 CentOS 7.x API, * blkdev_reread_part() * * For older kernels trigger a re-reading of the partition table by calling * check_disk_change() which calls flush_disk() to invalidate the device. * * For newer kernels (as of 5.10), bdev_check_media_change is used, in favor of * check_disk_change(), with the modification that invalidation is no longer * forced. 
*/ #ifdef HAVE_CHECK_DISK_CHANGE #define zfs_check_media_change(bdev) check_disk_change(bdev) #ifdef HAVE_BLKDEV_REREAD_PART #define vdev_bdev_reread_part(bdev) blkdev_reread_part(bdev) #else #define vdev_bdev_reread_part(bdev) check_disk_change(bdev) #endif /* HAVE_BLKDEV_REREAD_PART */ #else #ifdef HAVE_BDEV_CHECK_MEDIA_CHANGE static inline int zfs_check_media_change(struct block_device *bdev) { #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK struct gendisk *gd = bdev->bd_disk; const struct block_device_operations *bdo = gd->fops; #endif if (!bdev_check_media_change(bdev)) return (0); #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK /* * Force revalidation, to mimic the old behavior of * check_disk_change() */ if (bdo->revalidate_disk) bdo->revalidate_disk(gd); #endif return (0); } #define vdev_bdev_reread_part(bdev) zfs_check_media_change(bdev) #elif defined(HAVE_DISK_CHECK_MEDIA_CHANGE) #define vdev_bdev_reread_part(bdev) disk_check_media_change(bdev->bd_disk) +#define zfs_check_media_change(bdev) disk_check_media_change(bdev->bd_disk) #else /* * This is encountered if check_disk_change() and bdev_check_media_change() * are not available in the kernel - likely due to an API change that needs * to be chased down. */ #error "Unsupported kernel: no usable disk change check" #endif /* HAVE_BDEV_CHECK_MEDIA_CHANGE */ #endif /* HAVE_CHECK_DISK_CHANGE */ /* * 2.6.27 API change * The function was exported for use, prior to this it existed but the * symbol was not exported. * * 4.4.0-6.21 API change for Ubuntu * lookup_bdev() gained a second argument, FMODE_*, to check inode permissions. * * 5.11 API change * Changed to take a dev_t argument which is set on success and return a * non-zero error code on failure. */ static inline int vdev_lookup_bdev(const char *path, dev_t *dev) { #if defined(HAVE_DEVT_LOOKUP_BDEV) return (lookup_bdev(path, dev)); #elif defined(HAVE_1ARG_LOOKUP_BDEV) struct block_device *bdev = lookup_bdev(path); if (IS_ERR(bdev)) return (PTR_ERR(bdev)); *dev = bdev->bd_dev; bdput(bdev); return (0); #elif defined(HAVE_MODE_LOOKUP_BDEV) struct block_device *bdev = lookup_bdev(path, FMODE_READ); if (IS_ERR(bdev)) return (PTR_ERR(bdev)); *dev = bdev->bd_dev; bdput(bdev); return (0); #else #error "Unsupported kernel" #endif } +#if defined(HAVE_BLK_MODE_T) +#define blk_mode_is_open_write(flag) ((flag) & BLK_OPEN_WRITE) +#else +#define blk_mode_is_open_write(flag) ((flag) & FMODE_WRITE) +#endif + /* * Kernels without bio_set_op_attrs use bi_rw for the bio flags. */ #if !defined(HAVE_BIO_SET_OP_ATTRS) static inline void bio_set_op_attrs(struct bio *bio, unsigned rw, unsigned flags) { #if defined(HAVE_BIO_BI_OPF) bio->bi_opf = rw | flags; #else bio->bi_rw |= rw | flags; #endif /* HAVE_BIO_BI_OPF */ } #endif /* * bio_set_flush - Set the appropriate flags in a bio to guarantee * data are on non-volatile media on completion. * * 2.6.37 - 4.8 API, * Introduce WRITE_FLUSH, WRITE_FUA, and WRITE_FLUSH_FUA flags as a * replacement for WRITE_BARRIER to allow expressing richer semantics * to the block layer. It's up to the block layer to implement the * semantics correctly. Use the WRITE_FLUSH_FUA flag combination. * * 4.8 - 4.9 API, * REQ_FLUSH was renamed to REQ_PREFLUSH. For consistency with previous * OpenZFS releases, prefer the WRITE_FLUSH_FUA flag set if it's available. 
* * 4.10 API, * The read/write flags and their modifiers, including WRITE_FLUSH, * WRITE_FUA and WRITE_FLUSH_FUA were removed from fs.h in * torvalds/linux@70fd7614 and replaced by direct flag modification * of the REQ_ flags in bio->bi_opf. Use REQ_PREFLUSH. */ static inline void bio_set_flush(struct bio *bio) { #if defined(HAVE_REQ_PREFLUSH) /* >= 4.10 */ bio_set_op_attrs(bio, 0, REQ_PREFLUSH | REQ_OP_WRITE); #elif defined(WRITE_FLUSH_FUA) /* >= 2.6.37 and <= 4.9 */ bio_set_op_attrs(bio, 0, WRITE_FLUSH_FUA); #else #error "Allowing the build will cause bio_set_flush requests to be ignored." #endif } /* * 4.8 API, * REQ_OP_FLUSH * * 4.8-rc0 - 4.8-rc1, * REQ_PREFLUSH * * 2.6.36 - 4.7 API, * REQ_FLUSH * * in all cases but may have a performance impact for some kernels. It * has the advantage of minimizing kernel specific changes in the zvol code. * */ static inline boolean_t bio_is_flush(struct bio *bio) { #if defined(HAVE_REQ_OP_FLUSH) && defined(HAVE_BIO_BI_OPF) return ((bio_op(bio) == REQ_OP_FLUSH) || (bio->bi_opf & REQ_PREFLUSH)); #elif defined(HAVE_REQ_PREFLUSH) && defined(HAVE_BIO_BI_OPF) return (bio->bi_opf & REQ_PREFLUSH); #elif defined(HAVE_REQ_PREFLUSH) && !defined(HAVE_BIO_BI_OPF) return (bio->bi_rw & REQ_PREFLUSH); #elif defined(HAVE_REQ_FLUSH) return (bio->bi_rw & REQ_FLUSH); #else #error "Unsupported kernel" #endif } /* * 4.8 API, * REQ_FUA flag moved to bio->bi_opf * * 2.6.x - 4.7 API, * REQ_FUA */ static inline boolean_t bio_is_fua(struct bio *bio) { #if defined(HAVE_BIO_BI_OPF) return (bio->bi_opf & REQ_FUA); #elif defined(REQ_FUA) return (bio->bi_rw & REQ_FUA); #else #error "Allowing the build will cause fua requests to be ignored." #endif } /* * 4.8 API, * REQ_OP_DISCARD * * 2.6.36 - 4.7 API, * REQ_DISCARD * * In all cases the normal I/O path is used for discards. The only * difference is how the kernel tags individual I/Os as discards. */ static inline boolean_t bio_is_discard(struct bio *bio) { #if defined(HAVE_REQ_OP_DISCARD) return (bio_op(bio) == REQ_OP_DISCARD); #elif defined(HAVE_REQ_DISCARD) return (bio->bi_rw & REQ_DISCARD); #else #error "Unsupported kernel" #endif } /* * 4.8 API, * REQ_OP_SECURE_ERASE * * 2.6.36 - 4.7 API, * REQ_SECURE */ static inline boolean_t bio_is_secure_erase(struct bio *bio) { #if defined(HAVE_REQ_OP_SECURE_ERASE) return (bio_op(bio) == REQ_OP_SECURE_ERASE); #elif defined(REQ_SECURE) return (bio->bi_rw & REQ_SECURE); #else return (0); #endif } /* * 2.6.33 API change * Discard granularity and alignment restrictions may now be set. For * older kernels which do not support this it is safe to skip it. 
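 *
 * Usage sketch (illustrative only; 'q' is the volume's request queue and
 * 'dg' stands in for whatever granularity, e.g. the volume block size,
 * the caller wants to advertise):
 *
 *	blk_queue_discard_granularity(q, dg);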
*/ static inline void blk_queue_discard_granularity(struct request_queue *q, unsigned int dg) { q->limits.discard_granularity = dg; } /* * 5.19 API, * bdev_max_discard_sectors() * * 2.6.32 API, * blk_queue_discard() */ static inline boolean_t bdev_discard_supported(struct block_device *bdev) { #if defined(HAVE_BDEV_MAX_DISCARD_SECTORS) return (!!bdev_max_discard_sectors(bdev)); #elif defined(HAVE_BLK_QUEUE_DISCARD) return (!!blk_queue_discard(bdev_get_queue(bdev))); #else #error "Unsupported kernel" #endif } /* * 5.19 API, * bdev_max_secure_erase_sectors() * * 4.8 API, * blk_queue_secure_erase() * * 2.6.36 - 4.7 API, * blk_queue_secdiscard() */ static inline boolean_t bdev_secure_discard_supported(struct block_device *bdev) { #if defined(HAVE_BDEV_MAX_SECURE_ERASE_SECTORS) return (!!bdev_max_secure_erase_sectors(bdev)); #elif defined(HAVE_BLK_QUEUE_SECURE_ERASE) return (!!blk_queue_secure_erase(bdev_get_queue(bdev))); #elif defined(HAVE_BLK_QUEUE_SECDISCARD) return (!!blk_queue_secdiscard(bdev_get_queue(bdev))); #else #error "Unsupported kernel" #endif } /* * A common holder for vdev_bdev_open() is used to relax the exclusive open * semantics slightly. Internal vdev disk callers may pass VDEV_HOLDER to * allow them to open the device multiple times. Other kernel callers and * user space processes which don't pass this value will get EBUSY. This is * currently required for the correct operation of hot spares. */ #define VDEV_HOLDER ((void *)0x2401de7) static inline unsigned long blk_generic_start_io_acct(struct request_queue *q __attribute__((unused)), struct gendisk *disk __attribute__((unused)), int rw __attribute__((unused)), struct bio *bio) { #if defined(HAVE_BDEV_IO_ACCT_63) return (bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies)); #elif defined(HAVE_BDEV_IO_ACCT_OLD) return (bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), bio_op(bio), jiffies)); #elif defined(HAVE_DISK_IO_ACCT) return (disk_start_io_acct(disk, bio_sectors(bio), bio_op(bio))); #elif defined(HAVE_BIO_IO_ACCT) return (bio_start_io_acct(bio)); #elif defined(HAVE_GENERIC_IO_ACCT_3ARG) unsigned long start_time = jiffies; generic_start_io_acct(rw, bio_sectors(bio), &disk->part0); return (start_time); #elif defined(HAVE_GENERIC_IO_ACCT_4ARG) unsigned long start_time = jiffies; generic_start_io_acct(q, rw, bio_sectors(bio), &disk->part0); return (start_time); #else /* Unsupported */ return (0); #endif } static inline void blk_generic_end_io_acct(struct request_queue *q __attribute__((unused)), struct gendisk *disk __attribute__((unused)), int rw __attribute__((unused)), struct bio *bio, unsigned long start_time) { #if defined(HAVE_BDEV_IO_ACCT_63) bdev_end_io_acct(bio->bi_bdev, bio_op(bio), bio_sectors(bio), start_time); #elif defined(HAVE_BDEV_IO_ACCT_OLD) bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time); #elif defined(HAVE_DISK_IO_ACCT) disk_end_io_acct(disk, bio_op(bio), start_time); #elif defined(HAVE_BIO_IO_ACCT) bio_end_io_acct(bio, start_time); #elif defined(HAVE_GENERIC_IO_ACCT_3ARG) generic_end_io_acct(rw, &disk->part0, start_time); #elif defined(HAVE_GENERIC_IO_ACCT_4ARG) generic_end_io_acct(q, rw, &disk->part0, start_time); #endif } #ifndef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS static inline struct request_queue * blk_generic_alloc_queue(make_request_fn make_request, int node_id) { #if defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN) return (blk_alloc_queue(make_request, node_id)); #elif defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH) return (blk_alloc_queue_rh(make_request, node_id)); #else struct 
request_queue *q = blk_alloc_queue(GFP_KERNEL); if (q != NULL) blk_queue_make_request(q, make_request); return (q); #endif } #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ /* * All the io_*() helper functions below can operate on a bio, or a rq, but * not both. The older submit_bio() codepath will pass a bio, and the * newer blk-mq codepath will pass a rq. */ static inline int io_data_dir(struct bio *bio, struct request *rq) { #ifdef HAVE_BLK_MQ if (rq != NULL) { if (op_is_write(req_op(rq))) { return (WRITE); } else { return (READ); } } #else ASSERT3P(rq, ==, NULL); #endif return (bio_data_dir(bio)); } static inline int io_is_flush(struct bio *bio, struct request *rq) { #ifdef HAVE_BLK_MQ if (rq != NULL) return (req_op(rq) == REQ_OP_FLUSH); #else ASSERT3P(rq, ==, NULL); #endif return (bio_is_flush(bio)); } static inline int io_is_discard(struct bio *bio, struct request *rq) { #ifdef HAVE_BLK_MQ if (rq != NULL) return (req_op(rq) == REQ_OP_DISCARD); #else ASSERT3P(rq, ==, NULL); #endif return (bio_is_discard(bio)); } static inline int io_is_secure_erase(struct bio *bio, struct request *rq) { #ifdef HAVE_BLK_MQ if (rq != NULL) return (req_op(rq) == REQ_OP_SECURE_ERASE); #else ASSERT3P(rq, ==, NULL); #endif return (bio_is_secure_erase(bio)); } static inline int io_is_fua(struct bio *bio, struct request *rq) { #ifdef HAVE_BLK_MQ if (rq != NULL) return (rq->cmd_flags & REQ_FUA); #else ASSERT3P(rq, ==, NULL); #endif return (bio_is_fua(bio)); } static inline uint64_t io_offset(struct bio *bio, struct request *rq) { #ifdef HAVE_BLK_MQ if (rq != NULL) return (blk_rq_pos(rq) << 9); #else ASSERT3P(rq, ==, NULL); #endif return (BIO_BI_SECTOR(bio) << 9); } static inline uint64_t io_size(struct bio *bio, struct request *rq) { #ifdef HAVE_BLK_MQ if (rq != NULL) return (blk_rq_bytes(rq)); #else ASSERT3P(rq, ==, NULL); #endif return (BIO_BI_SIZE(bio)); } static inline int io_has_data(struct bio *bio, struct request *rq) { #ifdef HAVE_BLK_MQ if (rq != NULL) return (bio_has_data(rq->bio)); #else ASSERT3P(rq, ==, NULL); #endif return (bio_has_data(bio)); } #endif /* _ZFS_BLKDEV_H */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h index fe2b5c07a018..cce097e16fbc 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h @@ -1,176 +1,188 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . 
*/ #ifndef _SPL_UIO_H #define _SPL_UIO_H #include #include #include #include #include #include #include #include #if defined(HAVE_VFS_IOV_ITER) && defined(HAVE_FAULT_IN_IOV_ITER_READABLE) #define iov_iter_fault_in_readable(a, b) fault_in_iov_iter_readable(a, b) #endif typedef struct iovec iovec_t; typedef enum zfs_uio_rw { UIO_READ = 0, UIO_WRITE = 1, } zfs_uio_rw_t; typedef enum zfs_uio_seg { UIO_USERSPACE = 0, UIO_SYSSPACE = 1, UIO_BVEC = 2, #if defined(HAVE_VFS_IOV_ITER) UIO_ITER = 3, #endif } zfs_uio_seg_t; typedef struct zfs_uio { union { const struct iovec *uio_iov; const struct bio_vec *uio_bvec; #if defined(HAVE_VFS_IOV_ITER) struct iov_iter *uio_iter; #endif }; int uio_iovcnt; offset_t uio_loffset; zfs_uio_seg_t uio_segflg; boolean_t uio_fault_disable; uint16_t uio_fmode; uint16_t uio_extflg; ssize_t uio_resid; size_t uio_skip; struct request *rq; /* * Used for saving rq_for_each_segment() state between calls * to zfs_uiomove_bvec_rq(). */ struct req_iterator iter; struct bio_vec bv; } zfs_uio_t; #define zfs_uio_segflg(u) (u)->uio_segflg #define zfs_uio_offset(u) (u)->uio_loffset #define zfs_uio_resid(u) (u)->uio_resid #define zfs_uio_iovcnt(u) (u)->uio_iovcnt #define zfs_uio_iovlen(u, idx) (u)->uio_iov[(idx)].iov_len #define zfs_uio_iovbase(u, idx) (u)->uio_iov[(idx)].iov_base #define zfs_uio_fault_disable(u, set) (u)->uio_fault_disable = set #define zfs_uio_rlimit_fsize(z, u) (0) #define zfs_uio_fault_move(p, n, rw, u) zfs_uiomove((p), (n), (rw), (u)) extern int zfs_uio_prefaultpages(ssize_t, zfs_uio_t *); static inline void zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) { uio->uio_loffset = off; } static inline void zfs_uio_advance(zfs_uio_t *uio, size_t size) { uio->uio_resid -= size; uio->uio_loffset += size; } static inline void zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov, unsigned long nr_segs, offset_t offset, zfs_uio_seg_t seg, ssize_t resid, size_t skip) { ASSERT(seg == UIO_USERSPACE || seg == UIO_SYSSPACE); uio->uio_iov = iov; uio->uio_iovcnt = nr_segs; uio->uio_loffset = offset; uio->uio_segflg = seg; uio->uio_fault_disable = B_FALSE; uio->uio_fmode = 0; uio->uio_extflg = 0; uio->uio_resid = resid; uio->uio_skip = skip; } static inline void zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq) { /* Either bio or rq will be set, but not both */ ASSERT3P(uio, !=, bio); if (bio) { uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; } else { uio->uio_bvec = NULL; uio->uio_iovcnt = 0; memset(&uio->iter, 0, sizeof (uio->iter)); } uio->uio_loffset = io_offset(bio, rq); uio->uio_segflg = UIO_BVEC; uio->uio_fault_disable = B_FALSE; uio->uio_fmode = 0; uio->uio_extflg = 0; uio->uio_resid = io_size(bio, rq); if (bio) { uio->uio_skip = BIO_BI_SKIP(bio); } else { uio->uio_skip = 0; } uio->rq = rq; } #if defined(HAVE_VFS_IOV_ITER) static inline void zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, ssize_t resid, size_t skip) { uio->uio_iter = iter; uio->uio_iovcnt = iter->nr_segs; uio->uio_loffset = offset; uio->uio_segflg = UIO_ITER; uio->uio_fault_disable = B_FALSE; uio->uio_fmode = 0; uio->uio_extflg = 0; uio->uio_resid = resid; uio->uio_skip = skip; } #endif +#if defined(HAVE_ITER_IOV) +#define zfs_uio_iter_iov(iter) iter_iov((iter)) +#else +#define zfs_uio_iter_iov(iter) (iter)->iov +#endif + +#if defined(HAVE_IOV_ITER_TYPE) +#define zfs_uio_iov_iter_type(iter) iov_iter_type((iter)) +#else +#define zfs_uio_iov_iter_type(iter) (iter)->type +#endif + #endif /* SPL_UIO_H 
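 *
 * Usage sketch for the two wrappers above (illustrative only; 'uio' is
 * assumed to be a zfs_uio_t set up with zfs_uio_iov_iter_init()):
 *
 *	const struct iovec *iov = zfs_uio_iter_iov(uio->uio_iter);
 *	unsigned long nseg = uio->uio_iter->nr_segs;
 *
 * These paper over the kernel changes that hid the iov_iter's iovec array
 * behind iter_iov() and replaced the raw 'type' field with the
 * iov_iter_type() accessor.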
*/ diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h index e320b8de4222..b4d5db21f5e5 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -1,256 +1,255 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct zfsvfs zfsvfs_t; struct znode; /* * This structure emulates the vfs_t from other platforms. It's purpose * is to facilitate the handling of mount options and minimize structural * differences between the platforms. */ typedef struct vfs { struct zfsvfs *vfs_data; char *vfs_mntpoint; /* Primary mount point */ uint64_t vfs_xattr; boolean_t vfs_readonly; boolean_t vfs_do_readonly; boolean_t vfs_setuid; boolean_t vfs_do_setuid; boolean_t vfs_exec; boolean_t vfs_do_exec; boolean_t vfs_devices; boolean_t vfs_do_devices; boolean_t vfs_do_xattr; boolean_t vfs_atime; boolean_t vfs_do_atime; boolean_t vfs_relatime; boolean_t vfs_do_relatime; boolean_t vfs_nbmand; boolean_t vfs_do_nbmand; } vfs_t; typedef struct zfs_mnt { const char *mnt_osname; /* Objset name */ char *mnt_data; /* Raw mount options */ } zfs_mnt_t; struct zfsvfs { vfs_t *z_vfs; /* generic fs struct */ struct super_block *z_sb; /* generic super_block */ struct zfsvfs *z_parent; /* parent fs */ objset_t *z_os; /* objset reference */ uint64_t z_flags; /* super_block flags */ uint64_t z_root; /* id of root znode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_size; /* fuid table size */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ krwlock_t z_fuid_lock; /* fuid lock */ boolean_t z_fuid_loaded; /* fuid tables are loaded */ boolean_t z_fuid_dirty; /* need to sync fuid table ? 
*/ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_inherit; /* acl inheritance behavior */ uint_t z_acl_type; /* type of ACL usable on this FS */ zfs_case_t z_case; /* case-sense */ boolean_t z_utf8; /* utf8-only */ int z_norm; /* normalization flags */ boolean_t z_relatime; /* enable relatime mount option */ boolean_t z_unmounted; /* unmounted */ rrmlock_t z_teardown_lock; krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all znodes in the fs */ - uint64_t z_nr_znodes; /* number of znodes in the fs */ unsigned long z_rollback_time; /* last online rollback time */ unsigned long z_snap_defer_time; /* last snapshot unmount deferral */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ arc_prune_t *z_arc_prune; /* called by ARC to prune caches */ struct inode *z_ctldir; /* .zfs directory inode */ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ boolean_t z_use_sa; /* version allow system attributes */ boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */ boolean_t z_draining; /* is true when drain is active */ boolean_t z_drain_cancel; /* signal the unlinked drain to stop */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ dataset_kstats_t z_kstat; /* fs kstats */ kmutex_t z_lock; uint64_t z_userquota_obj; uint64_t z_groupquota_obj; uint64_t z_userobjquota_obj; uint64_t z_groupobjquota_obj; uint64_t z_projectquota_obj; uint64_t z_projectobjquota_obj; uint64_t z_replay_eof; /* New end of file - replay only */ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ uint64_t z_hold_size; /* znode hold array size */ avl_tree_t *z_hold_trees; /* znode hold trees */ kmutex_t *z_hold_locks; /* znode hold locks */ taskqid_t z_drain_task; /* task id for the unlink drain task */ }; #define ZFS_TEARDOWN_INIT(zfsvfs) \ rrm_init(&(zfsvfs)->z_teardown_lock, B_FALSE) #define ZFS_TEARDOWN_DESTROY(zfsvfs) \ rrm_destroy(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \ rrm_enter_read(&(zfsvfs)->z_teardown_lock, tag); #define ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag) \ rrm_exit(&(zfsvfs)->z_teardown_lock, tag) #define ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, tag) \ rrm_enter(&(zfsvfs)->z_teardown_lock, RW_WRITER, tag) #define ZFS_TEARDOWN_EXIT_WRITE(zfsvfs) \ rrm_exit(&(zfsvfs)->z_teardown_lock, tag) #define ZFS_TEARDOWN_EXIT(zfsvfs, tag) \ rrm_exit(&(zfsvfs)->z_teardown_lock, tag) #define ZFS_TEARDOWN_READ_HELD(zfsvfs) \ RRM_READ_HELD(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_WRITE_HELD(zfsvfs) \ RRM_WRITE_HELD(&(zfsvfs)->z_teardown_lock) #define ZFS_TEARDOWN_HELD(zfsvfs) \ RRM_LOCK_HELD(&(zfsvfs)->z_teardown_lock) #define ZSB_XATTR 0x0001 /* Enable user xattrs */ /* * Allow a maximum number of links. While ZFS does not internally limit * this the inode->i_nlink member is defined as an unsigned int. To be * safe we use 2^31-1 as the limit. */ #define ZFS_LINK_MAX ((1U << 31) - 1U) /* * Normal filesystems (those not under .zfs/snapshot) have a total * file ID size limited to 12 bytes (including the length field) due to * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical * reasons, this same limit is being imposed by the Solaris NFSv3 implementation * (although the NFSv3 protocol actually permits a maximum of 64 bytes). 
It * is not possible to expand beyond 12 bytes without abandoning support * of NFSv2. * * For normal filesystems, we partition up the available space as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * * We reserve only 48 bits for the object number, as this is the limit * currently defined and imposed by the DMU. */ typedef struct zfid_short { uint16_t zf_len; uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_short_t; /* * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes * (including the length field). This makes files under .zfs/snapshot * accessible by NFSv3 and NFSv4, but not NFSv2. * * For files under .zfs/snapshot, we partition up the available space * as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * 6 bytes objset id (48 bits) * 4 bytes currently just zero (32 bits) * * We reserve only 48 bits for the object number and objset id, as these are * the limits currently defined and imposed by the DMU. */ typedef struct zfid_long { zfid_short_t z_fid; uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_setgen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_long_t; #define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) extern void zfs_init(void); extern void zfs_fini(void); extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern void zfs_exit_fs(zfsvfs_t *zfsvfs); extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); extern int zfsvfs_create(const char *name, boolean_t readony, zfsvfs_t **zfvp); extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); extern void zfsvfs_free(zfsvfs_t *zfsvfs); extern int zfs_check_global_label(const char *dsname, const char *hexsl); extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); extern int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent); extern void zfs_preumount(struct super_block *sb); extern int zfs_umount(struct super_block *sb); extern int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm); extern int zfs_statvfs(struct inode *ip, struct kstatfs *statp); extern int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp); extern int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects); extern int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint); #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/sys/contrib/openzfs/man/man7/zpoolconcepts.7 b/sys/contrib/openzfs/man/man7/zpoolconcepts.7 index db3fd4926236..98f3ee7cd660 100644 --- a/sys/contrib/openzfs/man/man7/zpoolconcepts.7 +++ b/sys/contrib/openzfs/man/man7/zpoolconcepts.7 @@ -1,514 +1,512 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. 
.\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" .Dd April 7, 2023 .Dt ZPOOLCONCEPTS 7 .Os . .Sh NAME .Nm zpoolconcepts .Nd overview of ZFS storage pools . .Sh DESCRIPTION .Ss Virtual Devices (vdevs) A "virtual device" describes a single device or a collection of devices, organized according to certain performance and fault characteristics. The following virtual devices are supported: .Bl -tag -width "special" .It Sy disk A block device, typically located under .Pa /dev . ZFS can use individual slices or partitions, though the recommended mode of operation is to use whole disks. A disk can be specified by a full path, or it can be a shorthand name .Po the relative portion of the path under .Pa /dev .Pc . A whole disk can be specified by omitting the slice or partition designation. For example, .Pa sda is equivalent to .Pa /dev/sda . When given a whole disk, ZFS automatically labels the disk, if necessary. .It Sy file A regular file. The use of files as a backing store is strongly discouraged. It is designed primarily for experimental purposes, as the fault tolerance of a file is only as good as the file system on which it resides. A file must be specified by a full path. .It Sy mirror A mirror of two or more devices. Data is replicated in an identical fashion across all components of a mirror. A mirror with .Em N No disks of size Em X No can hold Em X No bytes and can withstand Em N-1 devices failing, without losing data. .It Sy raidz , raidz1 , raidz2 , raidz3 A distributed-parity layout, similar to RAID-5/6, with improved distribution of parity, and which does not suffer from the RAID-5/6 .Qq write hole , .Pq in which data and parity become inconsistent after a power loss . Data and parity is striped across all disks within a raidz group, though not necessarily in a consistent stripe width. .Pp A raidz group can have single, double, or triple parity, meaning that the raidz group can sustain one, two, or three failures, respectively, without losing any data. The .Sy raidz1 vdev type specifies a single-parity raidz group; the .Sy raidz2 vdev type specifies a double-parity raidz group; and the .Sy raidz3 vdev type specifies a triple-parity raidz group. The .Sy raidz vdev type is an alias for .Sy raidz1 . .Pp A raidz group with .Em N No disks of size Em X No with Em P No parity disks can hold approximately .Em (N-P)*X No bytes and can withstand Em P No devices failing without losing data . The minimum number of devices in a raidz group is one more than the number of parity disks. The recommended number is between 3 and 9 to help increase performance. .It Sy draid , draid1 , draid2 , draid3 A variant of raidz that provides integrated distributed hot spares, allowing for faster resilvering, while retaining the benefits of raidz. 
A dRAID vdev is constructed from multiple internal raidz groups, each with .Em D No data devices and Em P No parity devices . These groups are distributed over all of the children in order to fully utilize the available disk performance. .Pp Unlike raidz, dRAID uses a fixed stripe width (padding as necessary with zeros) to allow fully sequential resilvering. This fixed stripe width significantly affects both usable capacity and IOPS. For example, with the default .Em D=8 No and Em 4 KiB No disk sectors the minimum allocation size is Em 32 KiB . If using compression, this relatively large allocation size can reduce the effective compression ratio. When using ZFS volumes (zvols) and dRAID, the default of the .Sy volblocksize property is increased to account for the allocation size. If a dRAID pool will hold a significant amount of small blocks, it is recommended to also add a mirrored .Sy special vdev to store those blocks. .Pp In regards to I/O, performance is similar to raidz since, for any read, all .Em D No data disks must be accessed . Delivered random IOPS can be reasonably approximated as .Sy floor((N-S)/(D+P))*single_drive_IOPS . .Pp Like raidz, a dRAID can have single-, double-, or triple-parity. The .Sy draid1 , .Sy draid2 , and .Sy draid3 types can be used to specify the parity level. The .Sy draid vdev type is an alias for .Sy draid1 . .Pp A dRAID with .Em N No disks of size Em X , D No data disks per redundancy group , Em P .No parity level, and Em S No distributed hot spares can hold approximately .Em (N-S)*(D/(D+P))*X No bytes and can withstand Em P devices failing without losing data. .It Sy draid Ns Oo Ar parity Oc Ns Oo Sy \&: Ns Ar data Ns Sy d Oc Ns Oo Sy \&: Ns Ar children Ns Sy c Oc Ns Oo Sy \&: Ns Ar spares Ns Sy s Oc A non-default dRAID configuration can be specified by appending one or more of the following optional arguments to the .Sy draid keyword: .Bl -tag -compact -width "children" .It Ar parity The parity level (1-3). .It Ar data The number of data devices per redundancy group. In general, a smaller value of .Em D No will increase IOPS, improve the compression ratio , and speed up resilvering at the expense of total usable capacity. Defaults to .Em 8 , No unless Em N-P-S No is less than Em 8 . .It Ar children The expected number of children. Useful as a cross-check when listing a large number of devices. An error is returned when the provided number of children differs. .It Ar spares The number of distributed hot spares. Defaults to zero. .El .It Sy spare A pseudo-vdev which keeps track of available hot spares for a pool. For more information, see the .Sx Hot Spares section. .It Sy log A separate intent log device. If more than one log device is specified, then writes are load-balanced between devices. Log devices can be mirrored. However, raidz vdev types are not supported for the intent log. For more information, see the .Sx Intent Log section. .It Sy dedup A device solely dedicated for deduplication tables. The redundancy of this device should match the redundancy of the other normal devices in the pool. If more than one dedup device is specified, then allocations are load-balanced between those devices. .It Sy special A device dedicated solely for allocating various kinds of internal metadata, and optionally small file blocks. The redundancy of this device should match the redundancy of the other normal devices in the pool. If more than one special device is specified, then allocations are load-balanced between those devices. 
.Pp For more information on special allocations, see the .Sx Special Allocation Class section. .It Sy cache A device used to cache storage pool data. A cache device cannot be configured as a mirror or raidz group. For more information, see the .Sx Cache Devices section. .El .Pp -Virtual devices cannot be nested, so a mirror or raidz virtual device can only -contain files or disks. -Mirrors of mirrors -.Pq or other combinations -are not allowed. +Virtual devices cannot be nested arbitrarily. +A mirror, raidz or draid virtual device can only be created with files or disks. +Mirrors of mirrors or other such combinations are not allowed. .Pp A pool can have any number of virtual devices at the top of the configuration .Po known as .Qq root vdevs .Pc . Data is dynamically distributed across all top-level devices to balance data among devices. As new virtual devices are added, ZFS automatically places data on the newly available devices. .Pp Virtual devices are specified one at a time on the command line, separated by whitespace. Keywords like .Sy mirror No and Sy raidz are used to distinguish where a group ends and another begins. For example, the following creates a pool with two root vdevs, each a mirror of two disks: .Dl # Nm zpool Cm create Ar mypool Sy mirror Ar sda sdb Sy mirror Ar sdc sdd . .Ss Device Failure and Recovery ZFS supports a rich set of mechanisms for handling device failure and data corruption. All metadata and data is checksummed, and ZFS automatically repairs bad data from a good copy, when corruption is detected. .Pp In order to take advantage of these features, a pool must make use of some form of redundancy, using either mirrored or raidz groups. While ZFS supports running in a non-redundant configuration, where each root vdev is simply a disk or file, this is strongly discouraged. A single case of bit corruption can render some or all of your data unavailable. .Pp A pool's health status is described by one of three states: .Sy online , degraded , No or Sy faulted . An online pool has all devices operating normally. A degraded pool is one in which one or more devices have failed, but the data is still available due to a redundant configuration. A faulted pool has corrupted metadata, or one or more faulted devices, and insufficient replicas to continue functioning. .Pp The health of the top-level vdev, such as a mirror or raidz device, is potentially impacted by the state of its associated vdevs or component devices. A top-level vdev or component device is in one of the following states: .Bl -tag -width "DEGRADED" .It Sy DEGRADED One or more top-level vdevs is in the degraded state because one or more component devices are offline. Sufficient replicas exist to continue functioning. .Pp One or more component devices is in the degraded or faulted state, but sufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet -compact .It The number of checksum errors exceeds acceptable levels and the device is degraded as an indication that something may be wrong. ZFS continues to use the device as necessary. .It The number of I/O errors exceeds acceptable levels. The device could not be marked as faulted because there are insufficient replicas to continue functioning. .El .It Sy FAULTED One or more top-level vdevs is in the faulted state because one or more component devices are offline. Insufficient replicas exist to continue functioning. 
.Pp One or more component devices is in the faulted state, and insufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet -compact .It The device could be opened, but the contents did not match expected values. .It The number of I/O errors exceeds acceptable levels and the device is faulted to prevent further use of the device. .El .It Sy OFFLINE The device was explicitly taken offline by the .Nm zpool Cm offline command. .It Sy ONLINE The device is online and functioning. .It Sy REMOVED The device was physically removed while the system was running. Device removal detection is hardware-dependent and may not be supported on all platforms. .It Sy UNAVAIL The device could not be opened. If a pool is imported when a device was unavailable, then the device will be identified by a unique identifier instead of its path since the path was never correct in the first place. .El .Pp Checksum errors represent events where a disk returned data that was expected to be correct, but was not. In other words, these are instances of silent data corruption. The checksum errors are reported in .Nm zpool Cm status and .Nm zpool Cm events . When a block is stored redundantly, a damaged block may be reconstructed (e.g. from raidz parity or a mirrored copy). In this case, ZFS reports the checksum error against the disks that contained damaged data. If a block is unable to be reconstructed (e.g. due to 3 disks being damaged in a raidz2 group), it is not possible to determine which disks were silently corrupted. In this case, checksum errors are reported for all disks on which the block is stored. .Pp If a device is removed and later re-attached to the system, ZFS attempts to bring the device online automatically. Device attachment detection is hardware-dependent and might not be supported on all platforms. . .Ss Hot Spares ZFS allows devices to be associated with pools as .Qq hot spares . These devices are not actively used in the pool. But, when an active device fails, it is automatically replaced by a hot spare. To create a pool with hot spares, specify a .Sy spare vdev with any number of devices. For example, .Dl # Nm zpool Cm create Ar pool Sy mirror Ar sda sdb Sy spare Ar sdc sdd .Pp Spares can be shared across multiple pools, and can be added with the .Nm zpool Cm add command and removed with the .Nm zpool Cm remove command. Once a spare replacement is initiated, a new .Sy spare vdev is created within the configuration that will remain there until the original device is replaced. At this point, the hot spare becomes available again, if another device fails. .Pp If a pool has a shared spare that is currently being used, the pool cannot be exported, since other pools may use this shared spare, which may lead to potential data corruption. .Pp Shared spares add some risk. If the pools are imported on different hosts, and both pools suffer a device failure at the same time, both could attempt to use the spare at the same time. This may not be detected, resulting in data corruption. .Pp An in-progress spare replacement can be cancelled by detaching the hot spare. If the original faulted device is detached, then the hot spare assumes its place in the configuration, and is removed from the spare list of all active pools. .Pp The .Sy draid vdev type provides distributed hot spares. 
These hot spares are named after the dRAID vdev they're a part of .Po Sy draid1 Ns - Ns Ar 2 Ns - Ns Ar 3 No specifies spare Ar 3 No of vdev Ar 2 , .No which is a single parity dRAID Pc and may only be used by that dRAID vdev. Otherwise, they behave the same as normal hot spares. .Pp Spares cannot replace log devices. . .Ss Intent Log The ZFS Intent Log (ZIL) satisfies POSIX requirements for synchronous transactions. For instance, databases often require their transactions to be on stable storage devices when returning from a system call. NFS and other applications can also use .Xr fsync 2 to ensure data stability. By default, the intent log is allocated from blocks within the main pool. However, it might be possible to get better performance using separate intent log devices such as NVRAM or a dedicated disk. For example: .Dl # Nm zpool Cm create Ar pool sda sdb Sy log Ar sdc .Pp Multiple log devices can also be specified, and they can be mirrored. See the .Sx EXAMPLES section for an example of mirroring multiple log devices. .Pp Log devices can be added, replaced, attached, detached, and removed. In addition, log devices are imported and exported as part of the pool that contains them. Mirrored devices can be removed by specifying the top-level mirror vdev. . .Ss Cache Devices Devices can be added to a storage pool as .Qq cache devices . These devices provide an additional layer of caching between main memory and disk. For read-heavy workloads, where the working set size is much larger than what can be cached in main memory, using cache devices allows much more of this working set to be served from low latency media. Using cache devices provides the greatest performance improvement for random read-workloads of mostly static content. .Pp To create a pool with cache devices, specify a .Sy cache vdev with any number of devices. For example: .Dl # Nm zpool Cm create Ar pool sda sdb Sy cache Ar sdc sdd .Pp Cache devices cannot be mirrored or part of a raidz configuration. If a read error is encountered on a cache device, that read I/O is reissued to the original storage pool device, which might be part of a mirrored or raidz configuration. .Pp The content of the cache devices is persistent across reboots and restored asynchronously when importing the pool in L2ARC (persistent L2ARC). This can be disabled by setting .Sy l2arc_rebuild_enabled Ns = Ns Sy 0 . For cache devices smaller than .Em 1 GiB , ZFS does not write the metadata structures required for rebuilding the L2ARC, to conserve space. This can be changed with .Sy l2arc_rebuild_blocks_min_l2size . The cache device header .Pq Em 512 B is updated even if no metadata structures are written. Setting .Sy l2arc_headroom Ns = Ns Sy 0 will result in scanning the full-length ARC lists for cacheable content to be written in L2ARC (persistent ARC). If a cache device is added with .Nm zpool Cm add , its label and header will be overwritten and its contents will not be restored in L2ARC, even if the device was previously part of the pool. If a cache device is onlined with .Nm zpool Cm online , its contents will be restored in L2ARC. This is useful in case of memory pressure, where the contents of the cache device are not fully restored in L2ARC. The user can off- and online the cache device when there is less memory pressure, to fully restore its contents to L2ARC. . 
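.Pp
For example, the cache device
.Pa sdc
from the example above can be cycled to fully restore its contents to L2ARC:
.Dl # Nm zpool Cm offline Ar pool sdc
.Dl # Nm zpool Cm online Ar pool sdc
.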
.Ss Pool checkpoint Before starting critical procedures that include destructive actions .Pq like Nm zfs Cm destroy , an administrator can checkpoint the pool's state and, in the case of a mistake or failure, rewind the entire pool back to the checkpoint. Otherwise, the checkpoint can be discarded when the procedure has completed successfully. .Pp A pool checkpoint can be thought of as a pool-wide snapshot and should be used with care as it contains every part of the pool's state, from properties to vdev configuration. Thus, certain operations are not allowed while a pool has a checkpoint. Specifically, vdev removal/attach/detach, mirror splitting, and changing the pool's GUID. Adding a new vdev is supported, but in the case of a rewind it will have to be added again. Finally, users of this feature should keep in mind that scrubs in a pool that has a checkpoint do not repair checkpointed data. .Pp To create a checkpoint for a pool: .Dl # Nm zpool Cm checkpoint Ar pool .Pp To later rewind to its checkpointed state, you need to first export it and then rewind it during import: .Dl # Nm zpool Cm export Ar pool .Dl # Nm zpool Cm import Fl -rewind-to-checkpoint Ar pool .Pp To discard the checkpoint from a pool: .Dl # Nm zpool Cm checkpoint Fl d Ar pool .Pp Dataset reservations (controlled by the .Sy reservation No and Sy refreservation properties) may be unenforceable while a checkpoint exists, because the checkpoint is allowed to consume the dataset's reservation. Finally, data that is part of the checkpoint but has been freed in the current state of the pool won't be scanned during a scrub. . .Ss Special Allocation Class Allocations in the special class are dedicated to specific block types. By default, this includes all metadata, the indirect blocks of user data, and any deduplication tables. The class can also be provisioned to accept small file blocks. .Pp A pool must always have at least one normal .Pq non- Ns Sy dedup Ns /- Ns Sy special vdev before other devices can be assigned to the special class. If the .Sy special class becomes full, then allocations intended for it will spill back into the normal class. .Pp Deduplication tables can be excluded from the special class by unsetting the .Sy zfs_ddt_data_is_special ZFS module parameter. .Pp Inclusion of small file blocks in the special class is opt-in. Each dataset can control the size of small file blocks allowed in the special class by setting the .Sy special_small_blocks property to nonzero. See .Xr zfsprops 7 for more info on this property. diff --git a/sys/contrib/openzfs/module/Makefile.in b/sys/contrib/openzfs/module/Makefile.in index 5b71e1abf79e..9b34b3dfaec7 100644 --- a/sys/contrib/openzfs/module/Makefile.in +++ b/sys/contrib/openzfs/module/Makefile.in @@ -1,171 +1,171 @@ include Kbuild INSTALL_MOD_DIR ?= extra INSTALL_MOD_PATH ?= $(DESTDIR) all: modules distclean maintainer-clean: clean install: modules_install data_install uninstall: modules_uninstall data_uninstall check: .PHONY: all distclean maintainer-clean install uninstall check distdir \ modules modules-Linux modules-FreeBSD modules-unknown \ clean clean-Linux clean-FreeBSD \ modules_install modules_install-Linux modules_install-FreeBSD \ data_install data_install-Linux data_install-FreeBSD \ modules_uninstall modules_uninstall-Linux modules_uninstall-FreeBSD \ data_uninstall data_uninstall-Linux data_uninstall-FreeBSD \ cppcheck cppcheck-Linux cppcheck-FreeBSD # For FreeBSD, use debug options from ./configure if not overridden. 
export WITH_DEBUG ?= @WITH_DEBUG@ export WITH_INVARIANTS ?= @WITH_INVARIANTS@ # Filter out options that FreeBSD make doesn't understand getflags = ( \ set -- \ $(filter-out --%,$(firstword $(MFLAGS))) \ $(filter -I%,$(MFLAGS)) \ $(filter -j%,$(MFLAGS)); \ fmakeflags=""; \ while getopts :deiI:j:knqrstw flag; do \ case $$flag in \ \?) :;; \ :) if [ $$OPTARG = "j" ]; then \ ncpus=$$(sysctl -n kern.smp.cpus 2>/dev/null || :); \ if [ -n "$$ncpus" ]; then fmakeflags="$$fmakeflags -j$$ncpus"; fi; \ fi;; \ d) fmakeflags="$$fmakeflags -dA";; \ *) fmakeflags="$$fmakeflags -$$flag$$OPTARG";; \ esac; \ done; \ echo $$fmakeflags \ ) FMAKEFLAGS = -C @abs_srcdir@ -f Makefile.bsd $(shell $(getflags)) ifneq (@abs_srcdir@,@abs_builddir@) FMAKEFLAGS += MAKEOBJDIR=@abs_builddir@ endif FMAKE = env -u MAKEFLAGS make $(FMAKEFLAGS) modules-Linux: mkdir -p $(sort $(dir $(spl-objs) $(spl-))) mkdir -p $(sort $(dir $(zfs-objs) $(zfs-))) $(MAKE) -C @LINUX_OBJ@ $(if @KERNEL_CC@,CC=@KERNEL_CC@) \ $(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \ M="$$PWD" @KERNEL_MAKE@ CONFIG_ZFS=m modules modules-FreeBSD: +$(FMAKE) modules-unknown: @true modules: modules-@ac_system@ clean-Linux: @# Only cleanup the kernel build directories when CONFIG_KERNEL @# is defined. This indicates that kernel modules should be built. @CONFIG_KERNEL_TRUE@ $(MAKE) -C @LINUX_OBJ@ M="$$PWD" @KERNEL_MAKE@ clean $(RM) @LINUX_SYMBOLS@ Module.markers find . -name '*.ur-safe' -type f -delete clean-FreeBSD: +$(FMAKE) clean clean: clean-@ac_system@ .PHONY: modules_uninstall-Linux-legacy modules_uninstall-Linux-legacy: $(RM) -r $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,spl/ avl/ icp/ lua/ nvpair/ unicode/ zcommon/ zfs/ zstd/) KMODDIR := $(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@ modules_install-Linux: modules_uninstall-Linux-legacy @# Install the kernel modules $(MAKE) -C @LINUX_OBJ@ M="$$PWD" modules_install \ INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) \ INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) \ KERNELRELEASE=@LINUX_VERSION@ @# Remove extraneous build products when packaging if [ -n "$(DESTDIR)" ]; then \ find $(KMODDIR) -name 'modules.*' -delete; \ fi @# Debian ships tiny fake System.map files that are @# syntactically valid but just say @# "if you want system.map go install this package" @# Naturally, depmod is less than amused by this. 
@# So if we find it missing or with one of these present, @# we check for the alternate path for the System.map sysmap=$(INSTALL_MOD_PATH)/boot/System.map-@LINUX_VERSION@; \ { [ -f "$$sysmap" ] && [ $$(wc -l < "$$sysmap") -ge 100 ]; } || \ sysmap=$(INSTALL_MOD_PATH)/usr/lib/debug/boot/System.map-@LINUX_VERSION@; \ if [ -f $$sysmap ]; then \ depmod -ae -F $$sysmap @LINUX_VERSION@; \ fi modules_install-FreeBSD: @# Install the kernel modules +$(FMAKE) install modules_install: modules_install-@ac_system@ data_install-Linux: @mkdir -p $(DESTDIR)/@prefix@/src/zfs-@VERSION@/@LINUX_VERSION@ cp ../zfs.release ../zfs_config.h @LINUX_SYMBOLS@ $(DESTDIR)/@prefix@/src/zfs-@VERSION@/@LINUX_VERSION@ data_install-FreeBSD: @ data_install: data_install-@ac_system@ modules_uninstall-Linux: modules_uninstall-Linux-legacy @# Uninstall the kernel modules $(RM) $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,zfs.ko spl.ko) modules_uninstall-FreeBSD: @false modules_uninstall: modules_uninstall-@ac_system@ data_uninstall-Linux: $(RM) $(addprefix $(DESTDIR)/@prefix@/src/zfs-@VERSION@/@LINUX_VERSION@/,zfs.release zfs_config.h @LINUX_SYMBOLS@) data_uninstall-FreeBSD: @ data_uninstall: data_uninstall-@ac_system@ cppcheck-Linux: @CPPCHECK@ -j@CPU_COUNT@ --std=c99 --quiet --force --error-exitcode=2 \ --inline-suppr \ --suppress=unmatchedSuppression \ --suppress=noValidConfiguration \ --enable=warning,information -D_KERNEL \ --include=@LINUX_OBJ@/include/generated/autoconf.h \ --include=@top_builddir@/zfs_config.h \ --config-exclude=@LINUX_OBJ@/include \ -i zstd/lib \ -I @LINUX_OBJ@/include \ -I @top_srcdir@/include/os/linux/kernel \ -I @top_srcdir@/include/os/linux/spl \ -I @top_srcdir@/include/os/linux/zfs \ -I @top_srcdir@/include \ avl icp lua nvpair unicode zcommon zfs zstd os/linux cppcheck-FreeBSD: @true cppcheck: cppcheck-@ac_system@ distdir: cd @srcdir@ && find . -name '*.[chS]' -exec sh -c 'for f; do mkdir -p $$distdir/$${f%/*}; cp @srcdir@/$$f $$distdir/$$f; done' _ {} + cp @srcdir@/Makefile.bsd $$distdir/Makefile.bsd gen-zstd-symbols: for obj in $(addprefix zstd/,$(ZSTD_UPSTREAM_OBJS)); do echo; echo "/* $${obj#zstd/}: */"; @OBJDUMP@ -t $$obj | awk '$$2 == "g" && !/ zfs_/ {print "#define\t" $$6 " zfs_" $$6}' | sort; done >> zstd/include/zstd_compat_wrapper.h check-zstd-symbols: - @OBJDUMP@ -t $(addprefix zstd/,$(ZSTD_UPSTREAM_OBJS)) | awk '/file format/ {print} $$2 == "g" && !/ zfs_/ {++ret; print} END {exit ret}' + @OBJDUMP@ -t $(addprefix zstd/,$(ZSTD_UPSTREAM_OBJS)) | awk '/file format/ {print} $$2 == "g" && (!/ zfs_/ && !/ __pfx_zfs_/) {++ret; print} END {exit ret}' diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c index 49b97ae8f590..8969fd6a54bd 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c @@ -1,2521 +1,2519 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" #ifndef MNTK_VMSETSIZE_BUG #define MNTK_VMSETSIZE_BUG 0 #endif #ifndef MNTK_NOMSYNC #define MNTK_NOMSYNC 8 #endif struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); int zfs_super_owner; SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, "File system owners can perform privileged operation on file systems"); int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); int zfs_bclone_enabled; SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN, &zfs_bclone_enabled, 0, "Enable block cloning"); struct zfs_jailparam { int mount_snapshot; }; static struct zfs_jailparam zfs_jailparam0 = { .mount_snapshot = 0, }; static int zfs_jailparam_slot; SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters"); SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I", "Allow mounting snapshots in the .zfs directory for unjailed datasets"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); static int zfs_version_acl = ZFS_ACL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, "ZFS_ACL_VERSION"); static int zfs_version_spa = SPA_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, "SPA_VERSION"); static int zfs_version_zpl = ZPL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, "ZPL_VERSION"); #if __FreeBSD_version >= 1400018 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy); #else static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); #endif static int zfs_mount(vfs_t *vfsp); static int zfs_umount(vfs_t *vfsp, int fflag); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); static int zfs_sync(vfs_t *vfsp, int waitfor); #if __FreeBSD_version >= 1300098 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, struct ucred **credanonp, int *numsecflavors, int *secflavors); #else static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); #endif static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); static void zfs_freevfs(vfs_t *vfsp); struct vfsops zfs_vfsops = { .vfs_mount = zfs_mount, .vfs_unmount = zfs_umount, #if __FreeBSD_version >= 
1300049 .vfs_root = vfs_cache_root, .vfs_cachedroot = zfs_root, #else .vfs_root = zfs_root, #endif .vfs_statfs = zfs_statfs, .vfs_vget = zfs_vget, .vfs_sync = zfs_sync, .vfs_checkexp = zfs_checkexp, .vfs_fhtovp = zfs_fhtovp, .vfs_quotactl = zfs_quotactl, }; #ifdef VFCF_CROSS_COPY_FILE_RANGE VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE); #else VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL); #endif /* * We need to keep a count of active fs's. * This is necessary to prevent our module * from being unloaded after a umount -f */ static uint32_t zfs_active_fs_count = 0; int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint) { int error; zfsvfs_t *zfvp; vfs_t *vfsp; objset_t *os; uint64_t tmp = *val; error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, &zfvp); if (error != 0) return (error); if (zfvp == NULL) return (ENOENT); vfsp = zfvp->z_vfs; switch (zfs_prop) { case ZFS_PROP_ATIME: if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) tmp = 1; break; case ZFS_PROP_DEVICES: if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) tmp = 1; break; case ZFS_PROP_EXEC: if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) tmp = 1; break; case ZFS_PROP_SETUID: if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) tmp = 1; break; case ZFS_PROP_READONLY: if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) tmp = 1; break; case ZFS_PROP_XATTR: if (zfvp->z_flags & ZSB_XATTR) tmp = zfvp->z_xattr; break; case ZFS_PROP_NBMAND: if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) tmp = 1; break; default: vfs_unbusy(vfsp); return (ENOENT); } vfs_unbusy(vfsp); if (tmp != *val) { if (setpoint) (void) strcpy(setpoint, "temporary"); *val = tmp; } return (0); } static int zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) { int error = 0; char buf[32]; uint64_t usedobj, quotaobj; uint64_t quota, used = 0; timespec_t now; usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; if (quotaobj == 0 || zfsvfs->z_replay) { error = ENOENT; goto done; } (void) sprintf(buf, "%llx", (longlong_t)id); if ((error = zap_lookup(zfsvfs->z_os, quotaobj, buf, sizeof (quota), 1, "a)) != 0) { dprintf("%s(%d): quotaobj lookup failed\n", __FUNCTION__, __LINE__); goto done; } /* * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". * So we set them to be the same. */ dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used); if (error && error != ENOENT) { dprintf("%s(%d): usedobj failed; %d\n", __FUNCTION__, __LINE__, error); goto done; } dqp->dqb_curblocks = btodb(used); dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; vfs_timestamp(&now); /* * Setting this to 0 causes FreeBSD quota(8) to print * the number of days since the epoch, which isn't * particularly useful. 
*/ dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; done: return (error); } static int #if __FreeBSD_version >= 1400018 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy) #else zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) #endif { zfsvfs_t *zfsvfs = vfsp->vfs_data; struct thread *td; int cmd, type, error = 0; int bitsize; zfs_userquota_prop_t quota_type; struct dqblk64 dqblk = { 0 }; td = curthread; cmd = cmds >> SUBCMDSHIFT; type = cmds & SUBCMDMASK; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (id == -1) { switch (type) { case USRQUOTA: id = td->td_ucred->cr_ruid; break; case GRPQUOTA: id = td->td_ucred->cr_rgid; break; default: error = EINVAL; #if __FreeBSD_version < 1400018 if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) vfs_unbusy(vfsp); #endif goto done; } } /* * Map BSD type to: * ZFS_PROP_USERUSED, * ZFS_PROP_USERQUOTA, * ZFS_PROP_GROUPUSED, * ZFS_PROP_GROUPQUOTA */ switch (cmd) { case Q_SETQUOTA: case Q_SETQUOTA32: if (type == USRQUOTA) quota_type = ZFS_PROP_USERQUOTA; else if (type == GRPQUOTA) quota_type = ZFS_PROP_GROUPQUOTA; else error = EINVAL; break; case Q_GETQUOTA: case Q_GETQUOTA32: if (type == USRQUOTA) quota_type = ZFS_PROP_USERUSED; else if (type == GRPQUOTA) quota_type = ZFS_PROP_GROUPUSED; else error = EINVAL; break; } /* * Depending on the cmd, we may need to get * the ruid and domain (see fuidstr_to_sid?), * the fuid (how?), or other information. * Create fuid using zfs_fuid_create(zfsvfs, id, * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? * I think I can use just the id? * * Look at zfs_id_overquota() to look up a quota. * zap_lookup(something, quotaobj, fuidstring, * sizeof (long long), 1, &quota) * * See zfs_set_userquota() to set a quota. */ if ((uint32_t)type >= MAXQUOTAS) { error = EINVAL; goto done; } switch (cmd) { case Q_GETQUOTASIZE: bitsize = 64; error = copyout(&bitsize, arg, sizeof (int)); break; case Q_QUOTAON: // As far as I can tell, you can't turn quotas on or off on zfs error = 0; #if __FreeBSD_version < 1400018 vfs_unbusy(vfsp); #endif break; case Q_QUOTAOFF: error = ENOTSUP; #if __FreeBSD_version < 1400018 vfs_unbusy(vfsp); #endif break; case Q_SETQUOTA: error = copyin(arg, &dqblk, sizeof (dqblk)); if (error == 0) error = zfs_set_userquota(zfsvfs, quota_type, "", id, dbtob(dqblk.dqb_bhardlimit)); break; case Q_GETQUOTA: error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); if (error == 0) error = copyout(&dqblk, arg, sizeof (dqblk)); break; default: error = EINVAL; break; } done: zfs_exit(zfsvfs, FTAG); return (error); } boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs) { return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)); } static int zfs_sync(vfs_t *vfsp, int waitfor) { /* * Data integrity is job one. We don't want a compromised kernel * writing to the storage pool, so we never sync during panic. */ if (panicstr) return (0); /* * Ignore the system syncher. ZFS already commits async data * at zfs_txg_timeout intervals. */ if (waitfor == MNT_LAZY) return (0); if (vfsp != NULL) { /* * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; dsl_pool_t *dp; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. */ if (rebooting && spa_suspended(dp->dp_spa)) { zfs_exit(zfsvfs, FTAG); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); zfs_exit(zfsvfs, FTAG); } else { /* * Sync all ZFS filesystems. 
This is what happens when you * run sync(8). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { zfsvfs->z_atime = TRUE; zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); } else { zfsvfs->z_atime = FALSE; zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); } } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == ZFS_XATTR_OFF) { zfsvfs->z_flags &= ~ZSB_XATTR; } else { zfsvfs->z_flags |= ZSB_XATTR; if (newval == ZFS_XATTR_SA) zfsvfs->z_xattr_sa = B_TRUE; else zfsvfs->z_xattr_sa = B_FALSE; } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; zfsvfs->z_vfs->mnt_stat.f_iosize = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval) { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); } else { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); } } static void setuid_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); } } static void exec_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); } } /* * The nbmand mount option can be changed at mount time. 
* We can't allow it to be toggled on live file systems or incorrect * behavior may be seen from cifs clients * * This property isn't registered via dsl_prop_register(), but this callback * will be called when a file system is first mounted */ static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); } else { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); } } static void snapdir_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_show_ctldir = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_inherit = newval; } static void acl_type_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_type = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; boolean_t readonly = B_FALSE; boolean_t do_readonly = B_FALSE; boolean_t setuid = B_FALSE; boolean_t do_setuid = B_FALSE; boolean_t exec = B_FALSE; boolean_t do_exec = B_FALSE; boolean_t xattr = B_FALSE; boolean_t atime = B_FALSE; boolean_t do_atime = B_FALSE; boolean_t do_xattr = B_FALSE; int error = 0; ASSERT3P(vfsp, !=, NULL); zfsvfs = vfsp->vfs_data; ASSERT3P(zfsvfs, !=, NULL); os = zfsvfs->z_os; /* * This function can be called for a snapshot when we update snapshot's * mount point, which isn't really supported. */ if (dmu_objset_is_snapshot(os)) return (EOPNOTSUPP); /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { readonly = B_FALSE; do_readonly = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { setuid = B_TRUE; do_setuid = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { exec = B_FALSE; do_exec = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { exec = B_TRUE; do_exec = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_SA; do_xattr = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { atime = B_FALSE; do_atime = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { atime = B_TRUE; do_atime = B_TRUE; } /* * We need to enter pool configuration here, so that we can use * dsl_prop_get_int_ds() to handle the special nbmand property below. * dsl_prop_get_integer() can not be used, because it has to acquire * spa_namespace_lock and we can not do that because we already hold * z_teardown_lock. 
The problem is that spa_write_cachefile() is called * with spa_namespace_lock held and the function calls ZFS vnode * operations to write the cache file and thus z_teardown_lock is * acquired after spa_namespace_lock. */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); /* * nbmand is a special property. It can only be changed at * mount time. * * This is weird, but it is documented to only be changeable * at mount time. */ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { nbmand = B_FALSE; } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { nbmand = B_TRUE; } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) { dsl_pool_config_exit(dmu_objset_pool(os), FTAG); return (error); } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (do_readonly) readonly_changed_cb(zfsvfs, readonly); if (do_setuid) setuid_changed_cb(zfsvfs, setuid); if (do_exec) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); if (do_atime) atime_changed_cb(zfsvfs, atime); nbmand_changed_cb(zfsvfs, nbmand); return (0); unregister: dsl_prop_unregister_all(ds, zfsvfs); return (error); } /* * Associate this zfsvfs with the given objset, which must be owned. * This will cache a bunch of on-disk state from the objset in the * zfsvfs. */ static int zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) { int error; uint64_t val; zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error != 0) return (error); if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printf("Can't mount a version %lld file system " "on a version %lld pool\n. 
Pool must be upgraded to mount " "this file system.", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); return (SET_ERROR(ENOTSUP)); } error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); if (error != 0) return (error); zfsvfs->z_norm = (int)val; error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); if (error != 0) return (error); zfsvfs->z_utf8 = (val != 0); error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); if (error != 0) return (error); zfsvfs->z_case = (uint_t)val; error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val); if (error != 0) return (error); zfsvfs->z_acl_type = (uint_t)val; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); uint64_t sa_obj = 0; if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0) return (error); error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); if (error == 0 && val == ZFS_XATTR_SA) zfsvfs->z_xattr_sa = B_TRUE; } error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error != 0) return (error); if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) return (error); ASSERT3U(zfsvfs->z_root, !=, 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error == ENOENT) zfsvfs->z_userquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error == ENOENT) zfsvfs->z_groupquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 8, 1, &zfsvfs->z_projectquota_obj); if (error == ENOENT) zfsvfs->z_projectquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 8, 1, &zfsvfs->z_userobjquota_obj); if (error == ENOENT) zfsvfs->z_userobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 8, 1, &zfsvfs->z_groupobjquota_obj); if (error == ENOENT) zfsvfs->z_groupobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 8, 1, &zfsvfs->z_projectobjquota_obj); if (error == ENOENT) zfsvfs->z_projectobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error == ENOENT) zfsvfs->z_fuid_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if (error == ENOENT) zfsvfs->z_shares_dir = 0; else if (error != 0) return (error); /* * Only use the name cache if we are looking for a * name on a file system that does not require normalization * or case folding. 
We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name (which is always the case on * FreeBSD). */ zfsvfs->z_use_namecache = !zfsvfs->z_norm || ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); return (0); } taskq_t *zfsvfs_taskq; static void zfsvfs_task_unlinked_drain(void *context, int pending __unused) { zfs_unlinked_drain((zfsvfs_t *)context); } int zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; int error; boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); /* * XXX: Fix struct statfs so this isn't necessary! * * The 'osname' is used as the filesystem's special node, which means * it must fit in statfs.f_mntfromname, or else it can't be * enumerated, so libzfs_mnttab_find() returns NULL, which causes * 'zfs unmount' to think it's not mounted when it is. */ if (strlen(osname) >= MNAMELEN) return (SET_ERROR(ENAMETOOLONG)); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); if (error != 0) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } error = zfsvfs_create_impl(zfvp, zfsvfs, os); return (error); } int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) { int error; zfsvfs->z_vfs = NULL; zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, zfsvfs_task_unlinked_drain, zfsvfs); ZFS_TEARDOWN_INIT(zfsvfs); ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); error = zfsvfs_init(zfsvfs, os); if (error != 0) { dmu_objset_disown(os, B_TRUE, zfsvfs); *zfvp = NULL; kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } *zfvp = zfsvfs; return (0); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; /* * Check for a bad on-disk format version now since we * lied about owning the dataset readonly before. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) return (SET_ERROR(EROFS)); error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { boolean_t readonly; ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); if (error) return (error); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, &zfsvfs->z_kstat.dk_zil_sums); /* * During replay we remove the read only flag to * allow replays to succeed. 
*/ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; if (readonly != 0) { zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; } else { dsl_dir_t *dd; zap_stats_t zs; if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, &zs) == 0) { dataset_kstats_update_nunlinks_kstat( &zfsvfs->z_kstat, zs.zs_num_entries); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "num_entries in unlinked set: %llu", (u_longlong_t)zs.zs_num_entries); } zfs_unlinked_drain(zfsvfs); dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dd->dd_activity_cancelled = B_FALSE; } /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { boolean_t use_nc = zfsvfs->z_use_namecache; zfsvfs->z_use_namecache = B_FALSE; zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; zfsvfs->z_use_namecache = use_nc; } } /* restore readonly bit */ if (readonly != 0) zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; } else { ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, &zfsvfs->z_kstat.dk_zil_sums); } /* * Set the objset user_ptr to track its zfsvfs. 
*/ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); return (0); } void zfsvfs_free(zfsvfs_t *zfsvfs) { int i; zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); - ASSERT3U(zfsvfs->z_nr_znodes, ==, 0); list_destroy(&zfsvfs->z_all_znodes); ZFS_TEARDOWN_DESTROY(zfsvfs); ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); dataset_kstats_destroy(&zfsvfs->z_kstat); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int zfs_domount(vfs_t *vfsp, char *osname) { uint64_t recordsize, fsid_guid; int error = 0; zfsvfs_t *zfsvfs; ASSERT3P(vfsp, !=, NULL); ASSERT3P(osname, !=, NULL); error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs); if (error) return (error); zfsvfs->z_vfs = vfsp; if ((error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL))) goto out; zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; vfsp->vfs_data = zfsvfs; vfsp->mnt_flag |= MNT_LOCAL; vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; /* * This can cause a loss of coherence between ARC and page cache * on ZoF - unclear if the problem is in FreeBSD or ZoF */ vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ vfsp->mnt_kern_flag |= MNTK_NOMSYNC; vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; #if defined(_KERNEL) && !defined(KMEM_DEBUG) vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; #endif /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a * 56-bit objset unique ID. The objset unique ID is unique to * all objsets open on this system, provided by unique_create(). * The 8-bit fs type must be put in the low bits of fsid[1] * because that's where other Solaris filesystems put it. */ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0); vfsp->vfs_fsid.val[0] = fsid_guid; vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) | (vfsp->mnt_vfc->vfc_typenum & 0xFF); /* * Set features for file system. 
*/ zfs_set_fuid_feature(zfsvfs); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if ((error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))) goto out; xattr_changed_cb(zfsvfs, pval); if ((error = dsl_prop_get_integer(osname, "acltype", &pval, NULL))) goto out; acl_type_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) goto out; } vfs_mountedfrom(vfsp, osname); if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: if (error) { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } else { atomic_inc_32(&zfs_active_fs_count); } return (error); } static void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; if (!dmu_objset_is_snapshot(os)) dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } static int getpoolname(const char *osname, char *poolname) { char *p; p = strchr(osname, '/'); if (p == NULL) { if (strlen(osname) >= MAXNAMELEN) return (ENAMETOOLONG); (void) strcpy(poolname, osname); } else { if (p - osname >= MAXNAMELEN) return (ENAMETOOLONG); (void) strlcpy(poolname, osname, p - osname + 1); } return (0); } static void fetch_osname_options(char *name, bool *checkpointrewind) { if (name[0] == '!') { *checkpointrewind = true; memmove(name, name + 1, strlen(name)); } else { *checkpointrewind = false; } } static int zfs_mount(vfs_t *vfsp) { kthread_t *td = curthread; vnode_t *mvp = vfsp->mnt_vnodecovered; cred_t *cr = td->td_ucred; char *osname; int error = 0; int canwrite; bool checkpointrewind, isctlsnap = false; if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) return (SET_ERROR(EINVAL)); /* * If full-owner-access is enabled and delegated administration is * turned on, we must set nosuid. */ if (zfs_super_owner && dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { secpolicy_fs_mount_clearopts(cr, vfsp); } fetch_osname_options(osname, &checkpointrewind); isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) && strchr(osname, '@') != NULL); /* * Check for mount privilege? * * If we don't have privilege then see if * we have local permission to allow it */ error = secpolicy_fs_mount(cr, mvp, vfsp); if (error && isctlsnap) { secpolicy_fs_mount_clearopts(cr, vfsp); } else if (error) { if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) goto out; if (!(vfsp->vfs_flag & MS_REMOUNT)) { vattr_t vattr; /* * Make sure user is the owner of the mount point * or has sufficient privileges. */ vattr.va_mask = AT_UID; vn_lock(mvp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(mvp, &vattr, cr)) { VOP_UNLOCK1(mvp); goto out; } if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { VOP_UNLOCK1(mvp); goto out; } VOP_UNLOCK1(mvp); } secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Refuse to mount a filesystem if we are in a local zone and the * dataset is not visible. */ if (!INGLOBALZONE(curproc) && (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { boolean_t mount_snapshot = B_FALSE; /* * Snapshots may be mounted in .zfs for unjailed datasets * if allowed by the jail param zfs.mount_snapshot. 
*/ if (isctlsnap) { struct prison *pr; struct zfs_jailparam *zjp; pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); zjp = osd_jail_get(pr, zfs_jailparam_slot); mtx_unlock(&pr->pr_mtx); if (zjp && zjp->mount_snapshot) mount_snapshot = B_TRUE; } if (!mount_snapshot) { error = SET_ERROR(EPERM); goto out; } } vfsp->vfs_flag |= MNT_NFS4ACLS; /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ if (vfsp->vfs_flag & MS_REMOUNT) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * Refresh mount options with z_teardown_lock blocking I/O while * the filesystem is in an inconsistent state. * The lock also serializes this code with filesystem * manipulations between entry to zfs_suspend_fs() and return * from zfs_resume_fs(). */ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); zfs_unregister_callbacks(zfsvfs); error = zfs_register_callbacks(vfsp); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); goto out; } /* Initial root mount: try hard to import the requested root pool. */ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && (vfsp->vfs_flag & MNT_UPDATE) == 0) { char pname[MAXNAMELEN]; error = getpoolname(osname, pname); if (error == 0) error = spa_import_rootpool(pname, checkpointrewind); if (error) goto out; } DROP_GIANT(); error = zfs_domount(vfsp, osname); PICKUP_GIANT(); out: return (error); } static int zfs_statfs(vfs_t *vfsp, struct statfs *statp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; uint64_t refdbytes, availbytes, usedobjs, availobjs; int error; statp->f_version = STATFS_VERSION; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); /* * The underlying storage pool actually uses multiple block sizes. * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ statp->f_bsize = SPA_MINBLOCKSIZE; statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; /* * The following report "total" blocks of various kinds in the * file system, but reported in terms of f_frsize - the * "fragment" size. */ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of object available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, statp->f_bfree); statp->f_files = statp->f_ffree + usedobjs; /* * We're a zfs filesystem. 
*/ strlcpy(statp->f_fstypename, "zfs", sizeof (statp->f_fstypename)); strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, sizeof (statp->f_mntfromname)); strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, sizeof (statp->f_mntonname)); statp->f_namemax = MAXNAMELEN - 1; zfs_exit(zfsvfs, FTAG); return (0); } static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); zfs_exit(zfsvfs, FTAG); if (error == 0) { error = vn_lock(*vpp, flags); if (error != 0) { VN_RELE(*vpp); *vpp = NULL; } } return (error); } /* * Teardown the zfsvfs::z_os. * * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; dsl_dir_t *dd; /* * If someone has not already unmounted this file system, * drain the zrele_taskq to ensure all active references to the * zfsvfs_t have been handled only then can it be safely destroyed. */ if (zfsvfs->z_os) { /* * If we're unmounting we have to wait for the list to * drain completely. * * If we're not unmounting there's no guarantee the list * will drain completely, but zreles run from the taskq * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * - * We can safely read z_nr_znodes without locking because the - * VFS has already blocked operations which add to the - * z_all_znodes list and thus increment z_nr_znodes. + * We can safely check z_all_znodes for being empty because the + * VFS has already blocked operations which add to it. */ int round = 0; - while (zfsvfs->z_nr_znodes > 0) { + while (!list_is_empty(&zfsvfs->z_all_znodes)) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) break; } } ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); if (!unmounting) { /* * We purge the parent filesystem's vfsp as the parent * filesystem and all of its snapshots have their vnode's * v_vfsp set to the parent's filesystem's vfsp. Note, * 'z_parent' is self referential for non-snapshots. */ #ifdef FREEBSD_NAMECACHE #if __FreeBSD_version >= 1300117 cache_purgevfs(zfsvfs->z_parent->z_vfs); #else cache_purgevfs(zfsvfs->z_parent->z_vfs, true); #endif #endif } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); return (SET_ERROR(EIO)); } /* * At this point there are no vops active, and any new vops will * fail with EIO since we have z_teardown_lock for writer (only * relevant for forced unmount). * * Release all holds on dbufs. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) { if (zp->z_sa_hdl != NULL) { zfs_znode_dmu_fini(zp); } } mutex_exit(&zfsvfs->z_znodes_lock); /* * If we are unmounting, set the unmounted flag and let new vops * unblock. 
zfs_inactive will have the unmounted behavior, and all * other vops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data */ if (!zfs_is_readonly(zfsvfs)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); dmu_objset_evict_dbufs(zfsvfs->z_os); dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dsl_dir_cancel_waiters(dd); return (0); } static int zfs_umount(vfs_t *vfsp, int fflag) { kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; objset_t *os; cred_t *cr = td->td_ucred; int ret; ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { if (dsl_deleg_access((char *)vfsp->vfs_resource, ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } /* * Unmount any snapshots mounted under .zfs before unmounting the * dataset itself. */ if (zfsvfs->z_ctldir != NULL) { if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) return (ret); } if (fflag & MS_FORCE) { /* * Mark file system as unmounted before calling * vflush(FORCECLOSE). This way we ensure no future vnops * will be called and risk operating on DOOMED vnodes. */ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); zfsvfs->z_unmounted = B_TRUE; ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); } /* * Flush all the files. */ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); if (ret != 0) return (ret); while (taskqueue_cancel(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task, NULL) != 0) taskqueue_drain(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task); VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE)); os = zfsvfs->z_os; /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, B_TRUE, zfsvfs); } /* * We can now safely destroy the '.zfs' directory node. */ if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); zfs_freevfs(vfsp); return (0); } static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; int err; /* * zfs_zget() can't operate on virtual entries like .zfs/ or * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. * This will make NFS to switch to LOOKUP instead of using VGET. 
*/ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) return (EOPNOTSUPP); if ((err = zfs_enter(zfsvfs, FTAG)) != 0) return (err); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { vrele(ZTOV(zp)); err = EINVAL; } if (err == 0) *vpp = ZTOV(zp); zfs_exit(zfsvfs, FTAG); if (err == 0) { err = vn_lock(*vpp, flags); if (err != 0) vrele(*vpp); } if (err != 0) *vpp = NULL; return (err); } static int #if __FreeBSD_version >= 1300098 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, struct ucred **credanonp, int *numsecflavors, int *secflavors) #else zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors) #endif { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * If this is regular file system vfsp is the same as * zfsvfs->z_parent->z_vfs, but if it is snapshot, * zfsvfs->z_parent->z_vfs represents parent file system * which we have to use here, because only this file system * has mnt_export configured. */ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, credanonp, numsecflavors, secflavors)); } _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN, "struct fid bigger than SHORT_FID_LEN"); _Static_assert(sizeof (struct fid) >= LONG_FID_LEN, "struct fid bigger than LONG_FID_LEN"); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) { struct componentname cn; zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; vnode_t *dvp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t setgen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *vpp = NULL; if ((err = zfs_enter(zfsvfs, FTAG)) != 0) return (err); /* * On FreeBSD we can get snapshot's mount point or its parent file * system mount point depending if snapshot is already mounted or not. */ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); zfs_exit(zfsvfs, FTAG); err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) return (SET_ERROR(EINVAL)); if ((err = zfs_enter(zfsvfs, FTAG)) != 0) return (err); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (fidp->fid_len == LONG_FID_LEN && setgen != 0) { zfs_exit(zfsvfs, FTAG); dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n", (u_longlong_t)fid_gen, (u_longlong_t)setgen); return (SET_ERROR(EINVAL)); } /* * A zero fid_gen means we are in .zfs or the .zfs/snapshot * directory tree. If the object == zfsvfs->z_shares_dir, then * we are in the .zfs/shares directory tree. 
*/ if ((fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { zfs_exit(zfsvfs, FTAG); VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); if (object == ZFSCTL_INO_SNAPDIR) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN | LOCKLEAF; cn.cn_lkflags = flags; VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); vput(dvp); } else if (object == zfsvfs->z_shares_dir) { /* * XXX This branch must not be taken, * if it is, then the lookup below will * explode. */ cn.cn_nameptr = "shares"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN; cn.cn_lkflags = flags; VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); vput(dvp); } else { *vpp = dvp; } return (err); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object, (u_longlong_t)fid_gen, (u_longlong_t)gen_mask); if ((err = zfs_zget(zfsvfs, object, &zp))) { zfs_exit(zfsvfs, FTAG); return (err); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%llu) != fid gen (%llu)\n", (u_longlong_t)zp_gen, (u_longlong_t)fid_gen); vrele(ZTOV(zp)); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } *vpp = ZTOV(zp); zfs_exit(zfsvfs, FTAG); err = vn_lock(*vpp, flags); if (err == 0) vnode_create_vobject(*vpp, zp->z_size, curthread); else *vpp = NULL; return (err); } /* * Block out VOPs and close zfsvfs_t::z_os * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying * dataset and objset intact so that they can be atomically handed off during * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); return (0); } /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the * filesystem was suspended. Whether it succeeded or failed, the preconditions * are the same: the relevant objset and associated dataset are owned by * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { int err; znode_t *zp; ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); /* * We already own this, so just update the objset_t, as the one we * had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; ds->ds_dir->dd_activity_cancelled = B_FALSE; VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE)); zfs_set_fuid_feature(zfsvfs); /* * Attempt to re-establish all the active znodes with * their dbufs. If a zfs_rezget() fails, then we'll let * any potential callers discover that via zfs_enter_verify_zp * when they try to use their znode. 
*/ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { (void) zfs_rezget(zp); } mutex_exit(&zfsvfs->z_znodes_lock); bail: /* release the VOPs */ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); if (err) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. */ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { vfs_ref(zfsvfs->z_vfs); (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); } } return (err); } static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; zfsvfs_free(zfsvfs); atomic_dec_32(&zfs_active_fs_count); } #ifdef __i386__ static int desiredvnodes_backup; #include #include #include #include #include #endif static void zfs_vnodes_adjust(void) { #ifdef __i386__ int newdesiredvnodes; desiredvnodes_backup = desiredvnodes; /* * We calculate newdesiredvnodes the same way it is done in * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof (struct vm_object) + sizeof (struct vnode)))); if (newdesiredvnodes == desiredvnodes) desiredvnodes = (3 * newdesiredvnodes) / 4; #endif } static void zfs_vnodes_adjust_back(void) { #ifdef __i386__ desiredvnodes = desiredvnodes_backup; #endif } void zfs_init(void) { printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); /* * Initialize .zfs directory structures */ zfsctl_init(); /* * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ zfs_vnodes_adjust(); dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); } void zfs_fini(void) { taskq_destroy(zfsvfs_taskq); zfsctl_fini(); zfs_znode_fini(); zfs_vnodes_adjust_back(); } int zfs_busy(void) { return (zfs_active_fs_count != 0); } /* * Release VOPs and unmount a suspended filesystem. */ int zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); /* * We already own this, so just hold and rele it to update the * objset_t, as the one we had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); zfsvfs->z_os = os; /* release the VOPs */ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); /* * Try to force unmount this file system. 
*/ (void) zfs_umount(zfsvfs->z_vfs, 0); zfsvfs->z_unmounted = B_TRUE; return (0); } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) return (SET_ERROR(EINVAL)); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY0(sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, "from %ju to %ju", (uintmax_t)zfsvfs->z_version, (uintmax_t)newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; os->os_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. * If this function returns true we know VFS unmount has been initiated. */ boolean_t zfs_get_vfs_flag_unmounted(objset_t *os) { zfsvfs_t *zfvp; boolean_t unmounted = B_FALSE; ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); if (zfvp != NULL && zfvp->z_vfs != NULL && (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) unmounted = B_TRUE; mutex_exit(&os->os_user_ptr_lock); return (unmounted); } #ifdef _KERNEL void zfsvfs_update_fromname(const char *oldname, const char *newname) { char tmpbuf[MAXPATHLEN]; struct mount *mp; char *fromname; size_t oldlen; oldlen = strlen(oldname); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { fromname = mp->mnt_stat.f_mntfromname; if (strcmp(fromname, oldname) == 0) { (void) strlcpy(fromname, newname, sizeof (mp->mnt_stat.f_mntfromname)); continue; } if (strncmp(fromname, oldname, oldlen) == 0 && (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s", newname, fromname + oldlen); (void) strlcpy(fromname, tmpbuf, sizeof (mp->mnt_stat.f_mntfromname)); continue; } } mtx_unlock(&mountlist_mtx); } #endif /* * Find a prison with ZFS info. * Return the ZFS info and the (locked) prison. */ static struct zfs_jailparam * zfs_jailparam_find(struct prison *spr, struct prison **prp) { struct prison *pr; struct zfs_jailparam *zjp; for (pr = spr; ; pr = pr->pr_parent) { mtx_lock(&pr->pr_mtx); if (pr == &prison0) { zjp = &zfs_jailparam0; break; } zjp = osd_jail_get(pr, zfs_jailparam_slot); if (zjp != NULL) break; mtx_unlock(&pr->pr_mtx); } *prp = pr; return (zjp); } /* * Ensure a prison has its own ZFS info. If zjpp is non-null, point it to the * ZFS info and lock the prison. 
*/ static void zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp) { struct prison *ppr; struct zfs_jailparam *zjp, *nzjp; void **rsv; /* If this prison already has ZFS info, return that. */ zjp = zfs_jailparam_find(pr, &ppr); if (ppr == pr) goto done; /* * Allocate a new info record. Then check again, in case something * changed during the allocation. */ mtx_unlock(&ppr->pr_mtx); nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK); rsv = osd_reserve(zfs_jailparam_slot); zjp = zfs_jailparam_find(pr, &ppr); if (ppr == pr) { free(nzjp, M_PRISON); osd_free_reserved(rsv); goto done; } /* Inherit the initial values from the ancestor. */ mtx_lock(&pr->pr_mtx); (void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp); (void) memcpy(nzjp, zjp, sizeof (*zjp)); zjp = nzjp; mtx_unlock(&ppr->pr_mtx); done: if (zjpp != NULL) *zjpp = zjp; else mtx_unlock(&pr->pr_mtx); } /* * Jail OSD methods for ZFS VFS info. */ static int zfs_jailparam_create(void *obj, void *data) { struct prison *pr = obj; struct vfsoptlist *opts = data; int jsys; if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 && jsys == JAIL_SYS_INHERIT) return (0); /* * Inherit a prison's initial values from its parent * (different from JAIL_SYS_INHERIT which also inherits changes). */ zfs_jailparam_alloc(pr, NULL); return (0); } static int zfs_jailparam_get(void *obj, void *data) { struct prison *ppr, *pr = obj; struct vfsoptlist *opts = data; struct zfs_jailparam *zjp; int jsys, error; zjp = zfs_jailparam_find(pr, &ppr); jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys)); if (error != 0 && error != ENOENT) goto done; if (jsys == JAIL_SYS_NEW) { error = vfs_setopt(opts, "zfs.mount_snapshot", &zjp->mount_snapshot, sizeof (zjp->mount_snapshot)); if (error != 0 && error != ENOENT) goto done; } else { /* * If this prison is inheriting its ZFS info, report * empty/zero parameters. */ static int mount_snapshot = 0; error = vfs_setopt(opts, "zfs.mount_snapshot", &mount_snapshot, sizeof (mount_snapshot)); if (error != 0 && error != ENOENT) goto done; } error = 0; done: mtx_unlock(&ppr->pr_mtx); return (error); } static int zfs_jailparam_set(void *obj, void *data) { struct prison *pr = obj; struct prison *ppr; struct vfsoptlist *opts = data; int error, jsys, mount_snapshot; /* Set the parameters, which should be correct. */ error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)); if (error == ENOENT) jsys = -1; error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot, sizeof (mount_snapshot)); if (error == ENOENT) mount_snapshot = -1; else jsys = JAIL_SYS_NEW; switch (jsys) { case JAIL_SYS_NEW: { /* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */ struct zfs_jailparam *zjp; /* * A child jail cannot have more permissions than its parent */ if (pr->pr_parent != &prison0) { zjp = zfs_jailparam_find(pr->pr_parent, &ppr); mtx_unlock(&ppr->pr_mtx); if (zjp->mount_snapshot < mount_snapshot) { return (EPERM); } } zfs_jailparam_alloc(pr, &zjp); if (mount_snapshot != -1) zjp->mount_snapshot = mount_snapshot; mtx_unlock(&pr->pr_mtx); break; } case JAIL_SYS_INHERIT: /* "zfs=inherit": inherit the parent's ZFS info. */ mtx_lock(&pr->pr_mtx); osd_jail_del(pr, zfs_jailparam_slot); mtx_unlock(&pr->pr_mtx); break; case -1: /* * If the setting being changed is not ZFS related * then do nothing. 
*/ break; } return (0); } static int zfs_jailparam_check(void *obj __unused, void *data) { struct vfsoptlist *opts = data; int error, jsys, mount_snapshot; /* Check that the parameters are correct. */ error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)); if (error != ENOENT) { if (error != 0) return (error); if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT) return (EINVAL); } error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot, sizeof (mount_snapshot)); if (error != ENOENT) { if (error != 0) return (error); if (mount_snapshot != 0 && mount_snapshot != 1) return (EINVAL); } return (0); } static void zfs_jailparam_destroy(void *data) { free(data, M_PRISON); } static void zfs_jailparam_sysinit(void *arg __unused) { struct prison *pr; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CREATE] = zfs_jailparam_create, [PR_METHOD_GET] = zfs_jailparam_get, [PR_METHOD_SET] = zfs_jailparam_set, [PR_METHOD_CHECK] = zfs_jailparam_check, }; zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods); /* Copy the defaults to any existing prisons. */ sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) zfs_jailparam_alloc(pr, NULL); sx_sunlock(&allprison_lock); } static void zfs_jailparam_sysuninit(void *arg __unused) { osd_jail_deregister(zfs_jailparam_slot); } SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zfs_jailparam_sysinit, NULL); SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zfs_jailparam_sysuninit, NULL); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c index c4f2b722ef4e..0d4c94555c6b 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c @@ -1,2228 +1,2226 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2011 Martin Matuska */ #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_comutil.h" /* Used by fstat(1). */ SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)"); /* * Define ZNODE_STATS to turn on statistic gathering. By default, it is only * turned on when DEBUG is also defined. 
*/ #ifdef ZFS_DEBUG #define ZNODE_STATS #endif /* DEBUG */ #ifdef ZNODE_STATS #define ZNODE_STAT_ADD(stat) ((stat)++) #else #define ZNODE_STAT_ADD(stat) /* nothing */ #endif /* ZNODE_STATS */ /* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL #if !defined(KMEM_DEBUG) && __FreeBSD_version >= 1300102 #define _ZFS_USE_SMR static uma_zone_t znode_uma_zone; #else static kmem_cache_t *znode_cache = NULL; #endif extern struct vop_vector zfs_vnodeops; extern struct vop_vector zfs_fifoops; extern struct vop_vector zfs_shareops; /* * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on * z_rangelock. It will modify the offset and length of the lock to reflect * znode-specific information, and convert RL_APPEND to RL_WRITER. This is * called with the rangelock_t's rl_lock held, which avoids races. */ static void zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) { znode_t *zp = arg; /* * If in append mode, convert to writer and lock starting at the * current end of file. */ if (new->lr_type == RL_APPEND) { new->lr_offset = zp->z_size; new->lr_type = RL_WRITER; } /* * If we need to grow the block size then lock the whole file range. */ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { new->lr_offset = 0; new->lr_length = UINT64_MAX; } } static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { znode_t *zp = buf; POINTER_INVALIDATE(&zp->z_zfsvfs); list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_vnode = NULL; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; return (0); } static void zfs_znode_cache_destructor(void *buf, void *arg) { (void) arg; znode_t *zp = buf; ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); ASSERT3P(zp->z_vnode, ==, NULL); ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); mutex_destroy(&zp->z_acl_lock); rw_destroy(&zp->z_xattr_lock); zfs_rangelock_fini(&zp->z_rangelock); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } #ifdef _ZFS_USE_SMR VFS_SMR_DECLARE; static int zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private, int flags) { return (zfs_znode_cache_constructor(mem, private, flags)); } static void zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private) { zfs_znode_cache_destructor(mem, private); } void zfs_znode_init(void) { /* * Initialize zcache */ ASSERT3P(znode_uma_zone, ==, NULL); znode_uma_zone = uma_zcreate("zfs_znode_cache", sizeof (znode_t), zfs_znode_cache_constructor_smr, zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0); VFS_SMR_ZONE_SET(znode_uma_zone); } static znode_t * zfs_znode_alloc_kmem(int flags) { return (uma_zalloc_smr(znode_uma_zone, flags)); } static void zfs_znode_free_kmem(znode_t *zp) { if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } uma_zfree_smr(znode_uma_zone, zp); } #else void zfs_znode_init(void) { /* * Initialize zcache */ 
ASSERT3P(znode_cache, ==, NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, 0); } static znode_t * zfs_znode_alloc_kmem(int flags) { return (kmem_cache_alloc(znode_cache, flags)); } static void zfs_znode_free_kmem(znode_t *zp) { if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } kmem_cache_free(znode_cache, zp); } #endif void zfs_znode_fini(void) { /* * Cleanup zcache */ #ifdef _ZFS_USE_SMR if (znode_uma_zone) { uma_zdestroy(znode_uma_zone); znode_uma_zone = NULL; } #else if (znode_cache) { kmem_cache_destroy(znode_cache); znode_cache = NULL; } #endif } static int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) { zfs_acl_ids_t acl_ids; vattr_t vattr; znode_t *sharezp; znode_t *zp; int error; vattr.va_mask = AT_MODE|AT_UID|AT_GID; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0555; vattr.va_uid = crgetuid(kcred); vattr.va_gid = crgetgid(kcred); sharezp = zfs_znode_alloc_kmem(KM_SLEEP); ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); sharezp->z_unlinked = 0; sharezp->z_atime_dirty = 0; sharezp->z_zfsvfs = zfsvfs; sharezp->z_is_sa = zfsvfs->z_use_sa; VERIFY0(zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, kcred, NULL, &acl_ids, NULL)); zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, sharezp); POINTER_INVALIDATE(&sharezp->z_zfsvfs); error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); zfsvfs->z_shares_dir = sharezp->z_id; zfs_acl_ids_free(&acl_ids); sa_handle_destroy(sharezp->z_sa_hdl); zfs_znode_free_kmem(sharezp); return (error); } /* * define a couple of values we need available * for both 64 and 32 bit environments. */ #ifndef NBITSMINOR64 #define NBITSMINOR64 32 #endif #ifndef MAXMAJ64 #define MAXMAJ64 0xffffffffUL #endif #ifndef MAXMIN64 #define MAXMIN64 0xffffffffUL #endif /* * Create special expldev for ZFS private use. * Can't use standard expldev since it doesn't do * what we want. The standard expldev() takes a * dev32_t in LP64 and expands it to a long dev_t. * We need an interface that takes a dev32_t in ILP32 * and expands it to a long dev_t. */ static uint64_t zfs_expldev(dev_t dev) { return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); } /* * Special cmpldev for ZFS private use. * Can't use standard cmpldev since it takes * a long dev_t and compresses it to dev32_t in * LP64. We need to do a compaction of a long dev_t * to a dev32_t in ILP32. */ dev_t zfs_cmpldev(uint64_t dev) { return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); } static void zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) { ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); ASSERT3P(zp->z_sa_hdl, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); if (sa_hdl == NULL) { VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; sa_set_userp(sa_hdl, zp); } zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; /* * Slap on VROOT if we are the root znode unless we are the root * node of a snapshot mounted under .zfs. 
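As a standalone illustration of the zfs_expldev()/zfs_cmpldev() pair above, the userland sketch below packs a (major, minor) pair into the upper and lower 32 bits of a uint64_t and splits it apart again. The expldev_sketch()/cmpldev_sketch() names and the plain shifts used in place of the kernel's major()/minor()/makedev() macros are illustrative assumptions, not OpenZFS code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	NBITSMINOR64	32
#define	MAXMIN64	0xffffffffUL

/* Pack a (major, minor) pair the way zfs_expldev() does. */
static uint64_t
expldev_sketch(uint32_t maj, uint32_t min)
{
	return (((uint64_t)maj << NBITSMINOR64) | min);
}

/* Split a packed 64-bit device number apart again, as zfs_cmpldev() does. */
static void
cmpldev_sketch(uint64_t dev, uint32_t *maj, uint32_t *min)
{
	*maj = (uint32_t)(dev >> NBITSMINOR64);
	*min = (uint32_t)(dev & MAXMIN64);
}

int
main(void)
{
	uint32_t maj, min;
	uint64_t dev = expldev_sketch(259, 7);

	cmpldev_sketch(dev, &maj, &min);
	assert(maj == 259 && min == 7);
	printf("packed %#llx -> major %u minor %u\n",
	    (unsigned long long)dev, maj, min);
	return (0);
}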
*/ if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) ZTOV(zp)->v_flag |= VROOT; vn_exists(ZTOV(zp)); } void zfs_znode_dmu_fini(znode_t *zp) { ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zp->z_zfsvfs)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; } static void zfs_vnode_forget(vnode_t *vp) { /* copied from insmntque_stddtr */ vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Construct a new znode/vnode and initialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; vnode_t *vp; uint64_t mode; uint64_t parent; #ifdef notyet uint64_t mtime[2], ctime[2]; #endif uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[9]; int count = 0; int error; zp = zfs_znode_alloc_kmem(KM_SLEEP); #ifndef _ZFS_USE_SMR KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0, ("%s: fast path lookup enabled without smr", __func__)); #endif #if __FreeBSD_version >= 1300076 KASSERT(curthread->td_vp_reserved != NULL, ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); #else KASSERT(curthread->td_vp_reserv > 0, ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); #endif error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp); if (error != 0) { zfs_znode_free_kmem(zp); return (NULL); } zp->z_vnode = vp; vp->v_data = zp; /* * Acquire the vnode lock before any possible interaction with the * outside world. Specifically, there is an error path that calls * zfs_vnode_forget() and the vnode should be exclusively locked. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); zp->z_sa_hdl = NULL; zp->z_unlinked = 0; zp->z_atime_dirty = 0; zp->z_mapcnt = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; #if __FreeBSD_version >= 1300139 atomic_store_ptr(&zp->z_cached_symlink, NULL); #endif zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, 16); #ifdef notyet SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); #endif SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &zp->z_uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &zp->z_gid, 8); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 || (dmu_objset_projectquota_enabled(zfsvfs->z_os) && (zp->z_pflags & ZFS_PROJID) && sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { if (hdl == NULL) sa_handle_destroy(zp->z_sa_hdl); zfs_vnode_forget(vp); zp->z_vnode = NULL; zfs_znode_free_kmem(zp); return (NULL); } zp->z_projid = projid; zp->z_mode = mode; /* Cache the xattr parent id */ if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; vp->v_type = 
IFTOVT((mode_t)mode); switch (vp->v_type) { case VDIR: zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ break; case VFIFO: vp->v_op = &zfs_fifoops; break; case VREG: if (parent == zfsvfs->z_shares_dir) { ASSERT0(zp->z_uid); ASSERT0(zp->z_gid); vp->v_op = &zfs_shareops; } break; default: break; } mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes++; zp->z_zfsvfs = zfsvfs; mutex_exit(&zfsvfs->z_znodes_lock); #if __FreeBSD_version >= 1400077 vn_set_state(vp, VSTATE_CONSTRUCTED); #endif VN_LOCK_AREC(vp); if (vp->v_type != VFIFO) VN_LOCK_ASHARE(vp); return (zp); } static uint64_t empty_xattr; static uint64_t pad[4]; static zfs_acl_phys_t acl_phys; /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_XATTR - new object is an attribute * bonuslen - length of bonus buffer * setaclp - File/Dir initial ACL * fuidp - Tracks fuid allocation. * * OUT: zpp - allocated znode * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; uint64_t dzp_pflags = 0; uint64_t rdev = 0; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; dmu_buf_t *db; timestruc_t now; uint64_t gen, obj; int bonuslen; int dnodesize; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; sa_bulk_attr_t *sa_attrs; int cnt = 0; zfs_acl_locator_cb_t locate = { 0 }; ASSERT3P(vap, !=, NULL); ASSERT3U((vap->va_mask & AT_MODE), ==, AT_MODE); if (zfsvfs->z_replay) { obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ dnodesize = vap->va_fsid; /* ditto */ } else { obj = 0; vfs_timestamp(&now); gen = dmu_tx_get_txg(tx); dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); } if (dnodesize == 0) dnodesize = DNODE_MIN_SIZE; obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; bonuslen = (obj_type == DMU_OT_SA) ? DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (vap->va_type == VDIR) { if (zfsvfs->z_replay) { VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx)); } else { obj = zap_create_norm_dnsize(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx); } } else { if (zfsvfs->z_replay) { VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx)); } else { obj = dmu_object_alloc_dnsize(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx); } } ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_id = obj; } else { dzp_pflags = dzp->z_pflags; } /* * If parent is an xattr, so am I. 
*/ if (dzp_pflags & ZFS_XATTR) { flag |= IS_XATTR; } if (zfsvfs->z_use_fuids) pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; else pflags = 0; if (vap->va_type == VDIR) { size = 2; /* contents ("." and "..") */ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; } else { size = links = 0; } if (vap->va_type == VBLK || vap->va_type == VCHR) { rdev = zfs_expldev(vap->va_rdev); } parent = dzp->z_id; mode = acl_ids->z_mode; if (flag & IS_XATTR) pflags |= ZFS_XATTR; /* * No execs denied will be determined when zfs_mode_compute() is called. */ pflags |= acl_ids->z_aclp->z_hints & (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); ZFS_TIME_ENCODE(&now, crtime); ZFS_TIME_ENCODE(&now, ctime); if (vap->va_mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, atime); } else { ZFS_TIME_ENCODE(&now, atime); } if (vap->va_mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); } else { ZFS_TIME_ENCODE(&now, mtime); } /* Now add in all of the "SA" attributes */ VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* * Setup the array of attributes to be replaced/set on the new file * * order for DMU_OT_ZNODE is critical since it needs to be constructed * in the old znode_phys_t format. Don't change this ordering */ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); } else { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); } SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, &empty_xattr, 8); } if (obj_type == DMU_OT_ZNODE || (vap->va_type == VBLK || vap->va_type == VCHR)) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, sizeof (uint64_t) * 4); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof 
(zfs_acl_phys_t)); } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &acl_ids->z_aclp->z_acl_count, 8); locate.cb_aclp = acl_ids->z_aclp; SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, acl_ids->z_aclp->z_acl_bytes); mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, acl_ids->z_fuid, acl_ids->z_fgid); } VERIFY0(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx)); if (!(flag & IS_ROOT_NODE)) { *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); ASSERT3P(*zpp, !=, NULL); } else { /* * If we are creating the root node, the "parent" we * passed in is the znode for the root. */ *zpp = dzp; (*zpp)->z_sa_hdl = sa_hdl; } (*zpp)->z_pflags = pflags; (*zpp)->z_mode = mode; (*zpp)->z_dnodesize = dnodesize; if (vap->va_mask & AT_XVATTR) zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } if (!(flag & IS_ROOT_NODE)) { vnode_t *vp = ZTOV(*zpp); vp->v_vflag |= VV_FORCEINSMQ; int err = insmntque(vp, zfsvfs->z_vfs); vp->v_vflag &= ~VV_FORCEINSMQ; (void) err; KASSERT(err == 0, ("insmntque() failed: error %d", err)); } kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } /* * Update in-core attributes. It is assumed the caller will be doing an * sa_bulk_update to push the changes out. */ void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; xoap = xva_getxoptattr(xvap); ASSERT3P(xoap, !=, NULL); if (zp->z_zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_IN_SEQC(ZTOV(zp)); } if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), &times, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, xoap->xoa_av_quarantined, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); }
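Each ZFS_ATTR_SET() call in zfs_xvattr_set() follows the same set-or-clear idiom: one bit of the 64-bit z_pflags word is turned on or off according to a boolean attribute. A minimal userland sketch of that idiom, using a hypothetical attr_set() helper and made-up flag bits rather than the kernel macro:

#include <assert.h>
#include <stdint.h>

#define	SK_READONLY	(1ULL << 0)
#define	SK_HIDDEN	(1ULL << 1)

/* Set or clear a single flag bit based on a boolean, ZFS_ATTR_SET-style. */
static void
attr_set(uint64_t *pflags, uint64_t bit, int on)
{
	if (on)
		*pflags |= bit;
	else
		*pflags &= ~bit;
}

int
main(void)
{
	uint64_t pflags = 0;

	attr_set(&pflags, SK_READONLY, 1);
	attr_set(&pflags, SK_HIDDEN, 1);
	attr_set(&pflags, SK_HIDDEN, 0);
	assert(pflags == SK_READONLY);
	return (0);
}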
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { zfs_sa_set_scanstamp(zp, xvap, tx); XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; vnode_t *vp; sa_handle_t *hdl; int locked; int err; getnewvnode_reserve_(); again: *zpp = NULL; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_unlinked) { err = SET_ERROR(ENOENT); } else { vp = ZTOV(zp); /* * Don't let the vnode disappear after * ZFS_OBJ_HOLD_EXIT. */ VN_HOLD(vp); *zpp = zp; err = 0; } sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); if (err) { getnewvnode_drop_reserve(); return (err); } locked = VOP_ISLOCKED(vp); VI_LOCK(vp); if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) { /* * The vnode is doomed and this thread doesn't * hold the exclusive lock on it, so the vnode * must be being reclaimed by another thread. * Otherwise the doomed vnode is being reclaimed * by this thread and zfs_zget is called from * ZIL internals. */ VI_UNLOCK(vp); /* * XXX vrele() locks the vnode when the last reference * is dropped. Although in this case the vnode is * doomed / dead and so no inactivation is required, * the vnode lock is still acquired. That could result * in a LOR with z_teardown_lock if another thread holds * the vnode's lock and tries to take z_teardown_lock. * But that is only possible if the other thread performs * a ZFS vnode operation on the vnode. That either * should not happen if the vnode is dead or the thread * should also have a reference to the vnode and thus * our reference is not last. */ VN_RELE(vp); goto again; } VI_UNLOCK(vp); getnewvnode_drop_reserve(); return (err); } /* * Not found; create a new znode/vnode, * but only if the file exists. * * There is a small window where zfs_vget() could * find this object while a file create is still in * progress. This is checked for in zfs_znode_alloc(). * * If zfs_znode_alloc() fails it will drop the hold on the * bonus buffer.
*/ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { *zpp = zp; } if (err == 0) { vnode_t *vp = ZTOV(zp); err = insmntque(vp, zfsvfs->z_vfs); if (err == 0) { vp->v_hash = obj_num; VOP_UNLOCK1(vp); } else { zp->z_vnode = NULL; zfs_znode_dmu_fini(zp); zfs_znode_free(zp); *zpp = NULL; } } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (err); } int zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_object_info_t doi; dmu_buf_t *db; vnode_t *vp; uint64_t obj_num = zp->z_id; uint64_t mode, size; sa_bulk_attr_t bulk[8]; int err; int count = 0; uint64_t gen; /* * Remove cached pages before reloading the znode, so that they are not * lingering after we run into any error. Ideally, we should vgone() * the vnode in case of error, but currently we cannot do that * because of the LOR between the vnode lock and z_teardown_lock. * So, instead, we have to "doom" the znode in the illumos style. * * Ignore invalid pages during the scan. This is to avoid deadlocks * between page busying and the teardown lock, as pages are busied prior * to a VOP_GETPAGES operation, which acquires the teardown read lock. * Such pages will be invalid and can safely be skipped here. */ vp = ZTOV(zp); #if __FreeBSD_version >= 1400042 vn_pages_remove_valid(vp, 0, 0); #else vn_pages_remove(vp, 0, 0); #endif ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); mutex_enter(&zp->z_acl_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } mutex_exit(&zp->z_acl_lock); rw_enter(&zp->z_xattr_lock, RW_WRITER); if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } rw_exit(&zp->z_xattr_lock); ASSERT3P(zp->z_sa_hdl, ==, NULL); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EINVAL)); } zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); size = zp->z_size; /* reload cached values */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, sizeof (gen)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &zp->z_uid, sizeof (zp->z_uid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &zp->z_gid, sizeof (zp->z_gid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } zp->z_mode = mode; if (gen != zp->z_gen) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } /* * It is highly improbable but still quite possible that two * objects in different datasets are created with the same * object numbers and in transaction groups with the same * numbers. 
znodes corresponding to those objects would * have the same z_id and z_gen, but their other attributes * may be different. * zfs recv -F may replace one such object with the other. * As a result, file properties recorded in the replaced * object's vnode may no longer match the received object's * properties. At present the only cached property is the * file's type recorded in v_type. * So, handle this case by leaving the old vnode and znode * disassociated from the actual object. A new vnode and a * znode will be created if the object is accessed * (e.g. via a look-up). The old vnode and znode will be * recycled when the last vnode reference is dropped. */ if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } /* * If the file has zero links, then it has been unlinked on the send * side and it must be in the received unlinked set. * We call zfs_znode_dmu_fini() now to prevent any accesses to the * stale data and to prevent automatic removal of the file in * zfs_zinactive(). The file will be removed either when it is removed * on the send side and the next incremental stream is received or * when the unlinked set gets processed. */ zp->z_unlinked = (zp->z_links == 0); if (zp->z_unlinked) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (0); } zp->z_blksz = doi.doi_data_block_size; if (zp->z_size != size) vnode_pager_setsize(vp, zp->z_size); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; uint64_t acl_obj = zfs_external_acl(zp); ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); VERIFY0(dmu_object_free(os, acl_obj, tx)); } VERIFY0(dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); zfs_znode_free(zp); } void zfs_zinactive(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t z_id = zp->z_id; ASSERT3P(zp->z_sa_hdl, !=, NULL); /* * Don't allow a zfs_zget() while we're trying to release this znode. */ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); /* * If this was the last reference to a file with no links, remove * the file from the file system unless the file system is mounted * read-only. That can happen, for example, if the file system was * originally read-write, the file was opened, then unlinked and * the file system was made read-only before the file was finally * closed. The file will remain in the unlinked set.
*/ if (zp->z_unlinked) { ASSERT(!zfsvfs->z_issnap); if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) { ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_rmnode(zp); return; } } zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_znode_free(zp); } void zfs_znode_free(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; #if __FreeBSD_version >= 1300139 char *symlink; #endif ASSERT3P(zp->z_sa_hdl, ==, NULL); zp->z_vnode = NULL; mutex_enter(&zfsvfs->z_znodes_lock); POINTER_INVALIDATE(&zp->z_zfsvfs); list_remove(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes--; mutex_exit(&zfsvfs->z_znodes_lock); #if __FreeBSD_version >= 1300139 symlink = atomic_load_ptr(&zp->z_cached_symlink); if (symlink != NULL) { atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink, (uintptr_t)NULL); cache_symlink_free(symlink, strlen(symlink) + 1); } #endif if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } zfs_znode_free_kmem(zp); } void zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2], boolean_t have_tx) { timestruc_t now; vfs_timestamp(&now); if (have_tx) { /* will sa_bulk_update happen really soon? */ zp->z_atime_dirty = 0; zp->z_seq++; } else { zp->z_atime_dirty = 1; } if (flag & AT_ATIME) { ZFS_TIME_ENCODE(&now, zp->z_atime); } if (flag & AT_MTIME) { ZFS_TIME_ENCODE(&now, mtime); if (zp->z_zfsvfs->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); } } if (flag & AT_CTIME) { ZFS_TIME_ENCODE(&now, ctime); if (zp->z_zfsvfs->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } } void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE); } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. */ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_size > zp->z_blksz) return; error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT0(error); /* What blocksize did we actually get? */ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); } /* * Increase the file length * * IN: zp - znode of file to free data in. * end - new end-of-file * * RETURN: 0 on success, error code on failure */ static int zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_tx_t *tx; zfs_locked_range_t *lr; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { zfs_rangelock_exit(lr); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. 
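As a rough, standalone illustration of the new-blocksize choice made here in zfs_extend() (ignoring the surrounding checks for whether the block size needs to grow at all): when the current block size already exceeds the dataset's maximum block size it may only round up to the next power of two, otherwise it may grow up to that maximum, and in both cases it is capped at the new end of file. highbit64_sketch() and new_blksz_sketch() are local stand-ins for the kernel helpers, not OpenZFS code.

#include <assert.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

/* Position of the highest set bit, counting from 1; 0 if no bits are set. */
static int
highbit64_sketch(uint64_t x)
{
	int h = 0;

	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

/* Pick the new block size the way zfs_extend() does. */
static uint64_t
new_blksz_sketch(uint64_t end, uint64_t blksz, uint64_t max_blksz)
{
	if (blksz > max_blksz)
		return (MIN(end, 1ULL << highbit64_sketch(blksz)));
	return (MIN(end, max_blksz));
}

int
main(void)
{
	/* A 192K block with a 128K maximum rounds up to the next power of 2. */
	assert(new_blksz_sketch(1 << 20, 192 << 10, 128 << 10) == (256 << 10));
	/* A small file growing to 40K with a 128K maximum gets a 40K block. */
	assert(new_blksz_sketch(40 << 10, 4 << 10, 128 << 10) == (40 << 10));
	return (0);
}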
*/ ASSERT(!ISP2(zp->z_blksz)); newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); } else { newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); } else { newblksz = 0; } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } if (newblksz) zfs_grow_blocksize(zp, newblksz, tx); zp->z_size = end; VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), &zp->z_size, sizeof (zp->z_size), tx)); vnode_pager_setsize(ZTOV(zp), end); zfs_rangelock_exit(lr); dmu_tx_commit(tx); return (0); } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free. * * RETURN: 0 on success, error code on failure */ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfs_locked_range_t *lr; int error; /* * Lock the range being freed. */ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } if (off + len > zp->z_size) len = zp->z_size - off; error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); if (error == 0) { #if __FreeBSD_version >= 1400032 vnode_pager_purge_range(ZTOV(zp), off, off + len); #else /* * Before __FreeBSD_version 1400032 we cannot free block in the * middle of a file, but only at the end of a file, so this code * path should never happen. */ vnode_pager_setsize(ZTOV(zp), off); #endif } zfs_rangelock_exit(lr); return (error); } /* * Truncate a file * * IN: zp - znode of file to free data in. * end - new end-of-file. * * RETURN: 0 on success, error code on failure */ static int zfs_trunc(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; int error; sa_bulk_attr_t bulk[2]; int count = 0; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { zfs_rangelock_exit(lr); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } zp->z_size = end; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); if (end == 0) { zp->z_pflags &= ~ZFS_SPARSE; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); dmu_tx_commit(tx); /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of * a deadlock with someone trying to push a page that we are * about to invalidate. */ vnode_pager_setsize(vp, end); zfs_rangelock_exit(lr); return (0); } /* * Free space in a file * * IN: zp - znode of file to free data in. * off - start of range * len - end of range (0 => EOF) * flag - current file open mode flags. 
* log - TRUE if this action should be logged * * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; uint64_t mode; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; int error; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, sizeof (mode))) != 0) return (error); if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; else return (error); } if (len == 0) { error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) return (error); log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(error); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); return (0); } void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { uint64_t moid, obj, sa_obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int error; int i; znode_t *rootzp = NULL; zfsvfs_t *zfsvfs; vattr_t vattr; znode_t *zp; zfs_acl_ids_t acl_ids; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT0(error); /* * Set starting attributes. */ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ uint64_t val; const char *name; ASSERT3S(nvpair_type(elem), ==, DATA_TYPE_UINT64); val = fnvpair_value_uint64(elem); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } ASSERT0(error); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) sense = val; } ASSERT3U(version, !=, 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); ASSERT0(error); /* * Create zap object used for SA attribute registration */ if (version >= ZPL_VERSION_SA) { sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); } else { sa_obj = 0; } /* * Create a delete queue. */ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT0(error); /* * Create root znode. Create minimal znode/vnode/zfsvfs * to allow zfs_mknode to work. 
*/ VATTR_NULL(&vattr); vattr.va_mask = AT_MODE|AT_UID|AT_GID; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = crgetuid(cr); vattr.va_gid = crgetgid(cr); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); rootzp = zfs_znode_alloc_kmem(KM_SLEEP); ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; rootzp->z_is_sa = USE_SA(version, os); zfsvfs->z_os = os; zfsvfs->z_parent = zfsvfs; zfsvfs->z_version = version; zfsvfs->z_use_fuids = USE_FUIDS(version, os); zfsvfs->z_use_sa = USE_SA(version, os); zfsvfs->z_norm = norm; error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); ASSERT0(error); /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); rootzp->z_zfsvfs = zfsvfs; VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids, NULL)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT0(error); zfs_acl_ids_free(&acl_ids); POINTER_INVALIDATE(&rootzp->z_zfsvfs); sa_handle_destroy(rootzp->z_sa_hdl); zfs_znode_free_kmem(rootzp); /* * Create shares directory */ error = zfs_create_share_dir(zfsvfs, tx); ASSERT0(error); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } #endif /* _KERNEL */ static int zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) { uint64_t sa_obj = 0; int error; error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0 && error != ENOENT) return (error); error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); return (error); } static int zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, dmu_buf_t **db, const void *tag) { dmu_object_info_t doi; int error; if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) return (error); dmu_object_info_from_db(*db, &doi); if ((doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t))) { sa_buf_rele(*db, tag); return (SET_ERROR(ENOTSUP)); } error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); if (error != 0) { sa_buf_rele(*db, tag); return (error); } return (0); } static void zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, const void *tag) { sa_handle_destroy(hdl); sa_buf_rele(db, tag); } /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. 
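zfs_obj_to_path_impl(), a little further below, assembles the object's path right to left: it starts at the last byte of the caller's buffer and prepends one "/component" at a time while walking parent object numbers toward the root. A standalone sketch of that prepend-into-a-fixed-buffer technique; prepend_component() is a hypothetical helper, not part of OpenZFS.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/*
 * Prepend "/name" in front of *pathp, which points into buf and already
 * holds the tail of the path.  Returns 0, or -1 if buf would overflow.
 */
static int
prepend_component(char *buf, char **pathp, const char *name)
{
	size_t len = strlen(name) + 1;	/* component plus leading '/' */

	if (*pathp - buf < (ptrdiff_t)len)
		return (-1);
	*pathp -= len;
	(*pathp)[0] = '/';
	memcpy(*pathp + 1, name, len - 1);
	return (0);
}

int
main(void)
{
	char buf[64];
	char *path = buf + sizeof (buf) - 1;

	*path = '\0';
	/* Walk from the object up toward the root, leaf component first. */
	assert(prepend_component(buf, &path, "file.txt") == 0);
	assert(prepend_component(buf, &path, "dir") == 0);
	printf("%s\n", path);	/* prints /dir/file.txt */
	return (0);
}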
*/ static int zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, int *is_xattrdir) { uint64_t parent; uint64_t pflags; uint64_t mode; uint64_t parent_mode; sa_bulk_attr_t bulk[3]; sa_handle_t *sa_hdl; dmu_buf_t *sa_db; int count = 0; int error; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, &parent, sizeof (parent)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, &pflags, sizeof (pflags)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &mode, sizeof (mode)); if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) return (error); /* * When a link is removed its parent pointer is not changed and will * be invalid. There are two cases where a link is removed but the * file stays around, when it goes to the delete queue and when there * are additional links. */ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); if (error != 0) return (error); error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); if (error != 0) return (error); *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); /* * Extended attributes can be applied to files, directories, etc. * Otherwise the parent must be a directory. */ if (!*is_xattrdir && !S_ISDIR(parent_mode)) return (SET_ERROR(EINVAL)); *pobjp = parent; return (0); } /* * Given an object number, return some zpl level statistics */ static int zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, zfs_stat_t *sb) { sa_bulk_attr_t bulk[4]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &sb->zs_mode, sizeof (sb->zs_mode)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, &sb->zs_gen, sizeof (sb->zs_gen)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, &sb->zs_links, sizeof (sb->zs_links)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, &sb->zs_ctime, sizeof (sb->zs_ctime)); return (sa_bulk_lookup(hdl, bulk, count)); } static int zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, sa_attr_type_t *sa_table, char *buf, int len) { sa_handle_t *sa_hdl; sa_handle_t *prevhdl = NULL; dmu_buf_t *prevdb = NULL; dmu_buf_t *sa_db = NULL; char *path = buf + len - 1; int error; *path = '\0'; sa_hdl = hdl; uint64_t deleteq_obj; VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); error = zap_lookup_int(osp, deleteq_obj, obj); if (error == 0) { return (ESTALE); } else if (error != ENOENT) { return (error); } for (;;) { uint64_t pobj; char component[MAXNAMELEN + 2]; size_t complen; int is_xattrdir; if (prevdb) { ASSERT3P(prevhdl, !=, NULL); zfs_release_sa_handle(prevhdl, prevdb, FTAG); } if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) break; if (pobj == obj) { if (path[0] != '/') *--path = '/'; break; } component[0] = '/'; if (is_xattrdir) { (void) sprintf(component + 1, ""); } else { error = zap_value_search(osp, pobj, obj, ZFS_DIRENT_OBJ(-1ULL), component + 1); if (error != 0) break; } complen = strlen(component); path -= complen; ASSERT3P(path, >=, buf); memcpy(path, component, complen); obj = pobj; if (sa_hdl != hdl) { prevhdl = sa_hdl; prevdb = sa_db; } error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); if (error != 0) { sa_hdl = prevhdl; sa_db = prevdb; break; } } if (sa_hdl != NULL && sa_hdl != hdl) { ASSERT3P(sa_db, !=, NULL); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); } if (error == 0) (void) memmove(buf, path, buf + len - path); return (error); } int 
zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) { sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, char *buf, int len) { char *path = buf + len - 1; sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; *path = '\0'; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_stats_impl(hdl, sa_table, sb); if (error != 0) { zfs_release_sa_handle(hdl, db, FTAG); return (error); } error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } /* * Read a property stored within the master node. */ int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { uint64_t *cached_copy = NULL; /* * Figure out where in the objset_t the cached copy would live, if it * is available for the requested property. */ if (os != NULL) { switch (prop) { case ZFS_PROP_VERSION: cached_copy = &os->os_version; break; case ZFS_PROP_NORMALIZE: cached_copy = &os->os_normalization; break; case ZFS_PROP_UTF8ONLY: cached_copy = &os->os_utf8only; break; case ZFS_PROP_CASE: cached_copy = &os->os_casesensitivity; break; default: break; } } if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { *value = *cached_copy; return (0); } /* * If the property wasn't cached, look up the file system's value for * the property. For the version property, we look up a slightly * different string. */ const char *pname; int error = ENOENT; if (prop == ZFS_PROP_VERSION) { pname = ZPL_VERSION_STR; } else { pname = zfs_prop_to_name(prop); } if (os != NULL) { ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); } if (error == ENOENT) { /* No value set, use the default value */ switch (prop) { case ZFS_PROP_VERSION: *value = ZPL_VERSION; break; case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: *value = 0; break; case ZFS_PROP_CASE: *value = ZFS_CASE_SENSITIVE; break; case ZFS_PROP_ACLTYPE: *value = ZFS_ACLTYPE_NFSV4; break; default: return (error); } error = 0; } /* * If one of the methods for getting the property value above worked, * copy it into the objset_t's cache. */ if (error == 0 && cached_copy != NULL) { *cached_copy = *value; } return (error); } void zfs_znode_update_vfs(znode_t *zp) { vm_object_t object; if ((object = ZTOV(zp)->v_object) == NULL || zp->z_size == object->un_pager.vnp.vnp_size) return; vnode_pager_setsize(ZTOV(zp), zp->z_size); } #ifdef _KERNEL int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t parent; int is_xattrdir; int err; /* Extended attributes should not be visible as regular files. */ if ((zp->z_pflags & ZFS_XATTR) != 0) return (SET_ERROR(EINVAL)); err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table, &parent, &is_xattrdir); if (err != 0) return (err); ASSERT0(is_xattrdir); /* No name as this is a root object. 
*/ if (parent == zp->z_id) return (SET_ERROR(EINVAL)); err = zap_value_search(zfsvfs->z_os, parent, zp->z_id, ZFS_DIRENT_OBJ(-1ULL), buf); if (err != 0) return (err); err = zfs_zget(zfsvfs, parent, dzpp); return (err); } #endif /* _KERNEL */ #ifdef _KERNEL int zfs_rlimit_fsize(off_t fsize) { struct thread *td = curthread; off_t lim; if (td == NULL) return (0); lim = lim_cur(td, RLIMIT_FSIZE); if (__predict_true((uoff_t)fsize <= lim)) return (0); /* * The limit is reached. */ PROC_LOCK(td->td_proc); kern_psignal(td->td_proc, SIGXFSZ); PROC_UNLOCK(td->td_proc); return (EFBIG); } #endif /* _KERNEL */ diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c index 01f5619e1893..f0f929d3ce90 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c @@ -1,731 +1,766 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . * * Solaris Porting Layer (SPL) Proc Implementation. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_gitrev.h" #if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) typedef struct ctl_table __no_const spl_ctl_table; #else typedef struct ctl_table spl_ctl_table; #endif static unsigned long table_min = 0; static unsigned long table_max = ~0; static struct ctl_table_header *spl_header = NULL; +#ifndef HAVE_REGISTER_SYSCTL_TABLE +static struct ctl_table_header *spl_kmem = NULL; +static struct ctl_table_header *spl_kstat = NULL; +#endif static struct proc_dir_entry *proc_spl = NULL; static struct proc_dir_entry *proc_spl_kmem = NULL; static struct proc_dir_entry *proc_spl_kmem_slab = NULL; static struct proc_dir_entry *proc_spl_taskq_all = NULL; static struct proc_dir_entry *proc_spl_taskq = NULL; struct proc_dir_entry *proc_spl_kstat = NULL; #ifdef DEBUG_KMEM static int proc_domemused(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int rc = 0; unsigned long val; spl_ctl_table dummy = *table; dummy.data = &val; dummy.proc_handler = &proc_dointvec; dummy.extra1 = &table_min; dummy.extra2 = &table_max; if (write) { *ppos += *lenp; } else { #ifdef HAVE_ATOMIC64_T val = atomic64_read((atomic64_t *)table->data); #else val = atomic_read((atomic_t *)table->data); #endif /* HAVE_ATOMIC64_T */ rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); } return (rc); } #endif /* DEBUG_KMEM */ static int proc_doslab(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int rc = 0; unsigned long val = 0, mask; spl_ctl_table dummy = *table; spl_kmem_cache_t *skc = NULL; dummy.data = &val; dummy.proc_handler = &proc_dointvec; dummy.extra1 = &table_min; dummy.extra2 = &table_max; if (write) { *ppos += *lenp; } else { down_read(&spl_kmem_cache_sem); mask = (unsigned long)table->data; list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { /* Only use slabs of the correct kmem/vmem type */ if (!(skc->skc_flags & mask)) continue; /* Sum the specified field for selected slabs */ switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) { case KMC_TOTAL: val += skc->skc_slab_size * skc->skc_slab_total; break; case KMC_ALLOC: val += skc->skc_obj_size * skc->skc_obj_alloc; break; case KMC_MAX: val += skc->skc_obj_size * skc->skc_obj_max; break; } } up_read(&spl_kmem_cache_sem); rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); } return (rc); } static int proc_dohostid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char *end, str[32]; unsigned long hid; spl_ctl_table dummy = *table; dummy.data = str; dummy.maxlen = sizeof (str) - 1; if (!write) snprintf(str, sizeof (str), "%lx", (unsigned long) zone_get_hostid(NULL)); /* always returns 0 */ proc_dostring(&dummy, write, buffer, lenp, ppos); if (write) { /* * We can't use proc_doulongvec_minmax() in the write * case here because hostid, while a hex value, has no * leading 0x, which confuses the helper function. 
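The write path that follows parses the hostid as bare hexadecimal (no leading 0x), which is why it goes through simple_strtoul() with an explicit base instead of proc_doulongvec_minmax(). A minimal userland sketch of the same parse, assuming ordinary strtoul() and a hypothetical parse_hostid() wrapper:

#include <assert.h>
#include <stdlib.h>

/* Parse a hostid written as bare hex (no leading 0x); -1 on bad input. */
static int
parse_hostid(const char *str, unsigned long *hostid)
{
	char *end;
	unsigned long hid = strtoul(str, &end, 16);

	if (str == end)
		return (-1);
	*hostid = hid;
	return (0);
}

int
main(void)
{
	unsigned long hid;

	assert(parse_hostid("007f0101", &hid) == 0 && hid == 0x7f0101UL);
	assert(parse_hostid("zzz", &hid) == -1);
	return (0);
}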
*/ hid = simple_strtoul(str, &end, 16); if (str == end) return (-EINVAL); spl_hostid = hid; } return (0); } static void taskq_seq_show_headers(struct seq_file *f) { seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n", "taskq", "act", "nthr", "spwn", "maxt", "pri", "mina", "maxa", "cura", "flags"); } /* indices into the lheads array below */ #define LHEAD_PEND 0 #define LHEAD_PRIO 1 #define LHEAD_DELAY 2 #define LHEAD_WAIT 3 #define LHEAD_ACTIVE 4 #define LHEAD_SIZE 5 static unsigned int spl_max_show_tasks = 512; /* CSTYLED */ module_param(spl_max_show_tasks, uint, 0644); MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc"); static int taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag) { taskq_t *tq = p; taskq_thread_t *tqt = NULL; spl_wait_queue_entry_t *wq; struct task_struct *tsk; taskq_ent_t *tqe; char name[100]; struct list_head *lheads[LHEAD_SIZE], *lh; static char *list_names[LHEAD_SIZE] = {"pend", "prio", "delay", "wait", "active" }; int i, j, have_lheads = 0; unsigned long wflags, flags; spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags); /* get the various lists and check whether they're empty */ lheads[LHEAD_PEND] = &tq->tq_pend_list; lheads[LHEAD_PRIO] = &tq->tq_prio_list; lheads[LHEAD_DELAY] = &tq->tq_delay_list; #ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head; #else lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list; #endif lheads[LHEAD_ACTIVE] = &tq->tq_active_list; for (i = 0; i < LHEAD_SIZE; ++i) { if (list_empty(lheads[i])) lheads[i] = NULL; else ++have_lheads; } /* early return in non-"all" mode if lists are all empty */ if (!allflag && !have_lheads) { spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); spin_unlock_irqrestore(&tq->tq_lock, flags); return (0); } /* unlock the waitq quickly */ if (!lheads[LHEAD_WAIT]) spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); /* show the base taskq contents */ snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance); seq_printf(f, "%-25s ", name); seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n", tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn, tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc, tq->tq_nalloc, tq->tq_flags); /* show the active list */ if (lheads[LHEAD_ACTIVE]) { j = 0; list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) { if (j == 0) seq_printf(f, "\t%s:", list_names[LHEAD_ACTIVE]); else if (j == 2) { seq_printf(f, "\n\t "); j = 0; } seq_printf(f, " [%d]%pf(%ps)", tqt->tqt_thread->pid, tqt->tqt_task->tqent_func, tqt->tqt_task->tqent_arg); ++j; } seq_printf(f, "\n"); } for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i) if (lheads[i]) { j = 0; list_for_each(lh, lheads[i]) { if (spl_max_show_tasks != 0 && j >= spl_max_show_tasks) { seq_printf(f, "\n\t(truncated)"); break; } /* show the wait waitq list */ if (i == LHEAD_WAIT) { #ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY wq = list_entry(lh, spl_wait_queue_entry_t, entry); #else wq = list_entry(lh, spl_wait_queue_entry_t, task_list); #endif if (j == 0) seq_printf(f, "\t%s:", list_names[i]); else if (j % 8 == 0) seq_printf(f, "\n\t "); tsk = wq->private; seq_printf(f, " %d", tsk->pid); /* pend, prio and delay lists */ } else { tqe = list_entry(lh, taskq_ent_t, tqent_list); if (j == 0) seq_printf(f, "\t%s:", list_names[i]); else if (j % 2 == 0) seq_printf(f, "\n\t "); seq_printf(f, " %pf(%ps)", tqe->tqent_func, tqe->tqent_arg); } ++j; } seq_printf(f, "\n"); } if (lheads[LHEAD_WAIT]) 
spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); spin_unlock_irqrestore(&tq->tq_lock, flags); return (0); } static int taskq_all_seq_show(struct seq_file *f, void *p) { return (taskq_seq_show_impl(f, p, B_TRUE)); } static int taskq_seq_show(struct seq_file *f, void *p) { return (taskq_seq_show_impl(f, p, B_FALSE)); } static void * taskq_seq_start(struct seq_file *f, loff_t *pos) { struct list_head *p; loff_t n = *pos; down_read(&tq_list_sem); if (!n) taskq_seq_show_headers(f); p = tq_list.next; while (n--) { p = p->next; if (p == &tq_list) return (NULL); } return (list_entry(p, taskq_t, tq_taskqs)); } static void * taskq_seq_next(struct seq_file *f, void *p, loff_t *pos) { taskq_t *tq = p; ++*pos; return ((tq->tq_taskqs.next == &tq_list) ? NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs)); } static void slab_seq_show_headers(struct seq_file *f) { seq_printf(f, "--------------------- cache ----------" "--------------------------------------------- " "----- slab ------ " "---- object ----- " "--- emergency ---\n"); seq_printf(f, "name " " flags size alloc slabsize objsize " "total alloc max " "total alloc max " "dlock alloc max\n"); } static int slab_seq_show(struct seq_file *f, void *p) { spl_kmem_cache_t *skc = p; ASSERT(skc->skc_magic == SKC_MAGIC); if (skc->skc_flags & KMC_SLAB) { /* * This cache is backed by a generic Linux kmem cache which * has its own accounting. For these caches we only track * the number of active allocated objects that exist within * the underlying Linux slabs. For the overall statistics of * the underlying Linux cache please refer to /proc/slabinfo. */ spin_lock(&skc->skc_lock); uint64_t objs_allocated = percpu_counter_sum(&skc->skc_linux_alloc); seq_printf(f, "%-36s ", skc->skc_name); seq_printf(f, "0x%05lx %9s %9lu %8s %8u " "%5s %5s %5s %5s %5lu %5s %5s %5s %5s\n", (long unsigned)skc->skc_flags, "-", (long unsigned)(skc->skc_obj_size * objs_allocated), "-", (unsigned)skc->skc_obj_size, "-", "-", "-", "-", (long unsigned)objs_allocated, "-", "-", "-", "-"); spin_unlock(&skc->skc_lock); return (0); } spin_lock(&skc->skc_lock); seq_printf(f, "%-36s ", skc->skc_name); seq_printf(f, "0x%05lx %9lu %9lu %8u %8u " "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n", (long unsigned)skc->skc_flags, (long unsigned)(skc->skc_slab_size * skc->skc_slab_total), (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc), (unsigned)skc->skc_slab_size, (unsigned)skc->skc_obj_size, (long unsigned)skc->skc_slab_total, (long unsigned)skc->skc_slab_alloc, (long unsigned)skc->skc_slab_max, (long unsigned)skc->skc_obj_total, (long unsigned)skc->skc_obj_alloc, (long unsigned)skc->skc_obj_max, (long unsigned)skc->skc_obj_deadlock, (long unsigned)skc->skc_obj_emergency, (long unsigned)skc->skc_obj_emergency_max); spin_unlock(&skc->skc_lock); return (0); } static void * slab_seq_start(struct seq_file *f, loff_t *pos) { struct list_head *p; loff_t n = *pos; down_read(&spl_kmem_cache_sem); if (!n) slab_seq_show_headers(f); p = spl_kmem_cache_list.next; while (n--) { p = p->next; if (p == &spl_kmem_cache_list) return (NULL); } return (list_entry(p, spl_kmem_cache_t, skc_list)); } static void * slab_seq_next(struct seq_file *f, void *p, loff_t *pos) { spl_kmem_cache_t *skc = p; ++*pos; return ((skc->skc_list.next == &spl_kmem_cache_list) ? 
NULL : list_entry(skc->skc_list.next, spl_kmem_cache_t, skc_list)); } static void slab_seq_stop(struct seq_file *f, void *v) { up_read(&spl_kmem_cache_sem); } static const struct seq_operations slab_seq_ops = { .show = slab_seq_show, .start = slab_seq_start, .next = slab_seq_next, .stop = slab_seq_stop, }; static int proc_slab_open(struct inode *inode, struct file *filp) { return (seq_open(filp, &slab_seq_ops)); } static const kstat_proc_op_t proc_slab_operations = { #ifdef HAVE_PROC_OPS_STRUCT .proc_open = proc_slab_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, #else .open = proc_slab_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, #endif }; static void taskq_seq_stop(struct seq_file *f, void *v) { up_read(&tq_list_sem); } static const struct seq_operations taskq_all_seq_ops = { .show = taskq_all_seq_show, .start = taskq_seq_start, .next = taskq_seq_next, .stop = taskq_seq_stop, }; static const struct seq_operations taskq_seq_ops = { .show = taskq_seq_show, .start = taskq_seq_start, .next = taskq_seq_next, .stop = taskq_seq_stop, }; static int proc_taskq_all_open(struct inode *inode, struct file *filp) { return (seq_open(filp, &taskq_all_seq_ops)); } static int proc_taskq_open(struct inode *inode, struct file *filp) { return (seq_open(filp, &taskq_seq_ops)); } static const kstat_proc_op_t proc_taskq_all_operations = { #ifdef HAVE_PROC_OPS_STRUCT .proc_open = proc_taskq_all_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, #else .open = proc_taskq_all_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, #endif }; static const kstat_proc_op_t proc_taskq_operations = { #ifdef HAVE_PROC_OPS_STRUCT .proc_open = proc_taskq_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release, #else .open = proc_taskq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, #endif }; static struct ctl_table spl_kmem_table[] = { #ifdef DEBUG_KMEM { .procname = "kmem_used", .data = &kmem_alloc_used, #ifdef HAVE_ATOMIC64_T .maxlen = sizeof (atomic64_t), #else .maxlen = sizeof (atomic_t), #endif /* HAVE_ATOMIC64_T */ .mode = 0444, .proc_handler = &proc_domemused, }, { .procname = "kmem_max", .data = &kmem_alloc_max, .maxlen = sizeof (unsigned long), .extra1 = &table_min, .extra2 = &table_max, .mode = 0444, .proc_handler = &proc_doulongvec_minmax, }, #endif /* DEBUG_KMEM */ { .procname = "slab_kvmem_total", .data = (void *)(KMC_KVMEM | KMC_TOTAL), .maxlen = sizeof (unsigned long), .extra1 = &table_min, .extra2 = &table_max, .mode = 0444, .proc_handler = &proc_doslab, }, { .procname = "slab_kvmem_alloc", .data = (void *)(KMC_KVMEM | KMC_ALLOC), .maxlen = sizeof (unsigned long), .extra1 = &table_min, .extra2 = &table_max, .mode = 0444, .proc_handler = &proc_doslab, }, { .procname = "slab_kvmem_max", .data = (void *)(KMC_KVMEM | KMC_MAX), .maxlen = sizeof (unsigned long), .extra1 = &table_min, .extra2 = &table_max, .mode = 0444, .proc_handler = &proc_doslab, }, {}, }; static struct ctl_table spl_kstat_table[] = { {}, }; static struct ctl_table spl_table[] = { /* * NB No .strategy entries have been provided since * sysctl(8) prefers to go via /proc for portability. 
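/*
 * [Editor's sketch -- not part of this patch]  The spl_table defined here is
 * registered in spl_proc_init() below: with register_sysctl_table() on
 * kernels that still provide it, and with register_sysctl("kernel/spl", ...)
 * on Linux 6.5+, where the .child-based table nesting was removed.  A
 * condensed illustration of that compat pattern follows; the my_* names are
 * hypothetical, while HAVE_REGISTER_SYSCTL_TABLE, spl_hostid and
 * proc_dohostid come from this file.
 */
static struct ctl_table my_table[] = {
    {
        .procname       = "hostid",
        .data           = &spl_hostid,
        .maxlen         = sizeof (unsigned long),
        .mode           = 0644,
        .proc_handler   = &proc_dohostid,
    },
    {},
};

#ifdef HAVE_REGISTER_SYSCTL_TABLE
/* Pre-6.5 kernels: directories are expressed with nested .child tables. */
static struct ctl_table my_dir[] = {
    { .procname = "spl", .mode = 0555, .child = my_table, },
    {},
};
static struct ctl_table my_root[] = {
    { .procname = "kernel", .mode = 0555, .child = my_dir, },
    {},
};
#endif

static struct ctl_table_header *my_header;

static int
my_sysctl_init(void)
{
#ifdef HAVE_REGISTER_SYSCTL_TABLE
    my_header = register_sysctl_table(my_root);
#else
    /* 6.5+: the directory path is passed as a string instead. */
    my_header = register_sysctl("kernel/spl", my_table);
#endif
    return (my_header == NULL ? -EUNATCH : 0);
}

static void
my_sysctl_fini(void)
{
    if (my_header != NULL)
        unregister_sysctl_table(my_header);
}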
*/ { .procname = "gitrev", .data = (char *)ZFS_META_GITREV, .maxlen = sizeof (ZFS_META_GITREV), .mode = 0444, .proc_handler = &proc_dostring, }, { .procname = "hostid", .data = &spl_hostid, .maxlen = sizeof (unsigned long), .mode = 0644, .proc_handler = &proc_dohostid, }, +#ifdef HAVE_REGISTER_SYSCTL_TABLE { .procname = "kmem", .mode = 0555, .child = spl_kmem_table, }, { .procname = "kstat", .mode = 0555, .child = spl_kstat_table, }, +#endif {}, }; +#ifdef HAVE_REGISTER_SYSCTL_TABLE static struct ctl_table spl_dir[] = { { .procname = "spl", .mode = 0555, .child = spl_table, }, {} }; static struct ctl_table spl_root[] = { { - .procname = "kernel", - .mode = 0555, - .child = spl_dir, + .procname = "kernel", + .mode = 0555, + .child = spl_dir, }, {} }; +#endif + +static void spl_proc_cleanup(void) +{ + remove_proc_entry("kstat", proc_spl); + remove_proc_entry("slab", proc_spl_kmem); + remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); + remove_proc_entry("spl", NULL); + +#ifndef HAVE_REGISTER_SYSCTL_TABLE + if (spl_kstat) { + unregister_sysctl_table(spl_kstat); + spl_kstat = NULL; + } + if (spl_kmem) { + unregister_sysctl_table(spl_kmem); + spl_kmem = NULL; + } +#endif + if (spl_header) { + unregister_sysctl_table(spl_header); + spl_header = NULL; + } +} int spl_proc_init(void) { int rc = 0; +#ifdef HAVE_REGISTER_SYSCTL_TABLE spl_header = register_sysctl_table(spl_root); if (spl_header == NULL) return (-EUNATCH); +#else + spl_header = register_sysctl("kernel/spl", spl_table); + if (spl_header == NULL) + return (-EUNATCH); + + spl_kmem = register_sysctl("kernel/spl/kmem", spl_kmem_table); + if (spl_kmem == NULL) { + rc = -EUNATCH; + goto out; + } + spl_kstat = register_sysctl("kernel/spl/kstat", spl_kstat_table); + if (spl_kstat == NULL) { + rc = -EUNATCH; + goto out; + } +#endif proc_spl = proc_mkdir("spl", NULL); if (proc_spl == NULL) { rc = -EUNATCH; goto out; } proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl, &proc_taskq_all_operations, NULL); if (proc_spl_taskq_all == NULL) { rc = -EUNATCH; goto out; } proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl, &proc_taskq_operations, NULL); if (proc_spl_taskq == NULL) { rc = -EUNATCH; goto out; } proc_spl_kmem = proc_mkdir("kmem", proc_spl); if (proc_spl_kmem == NULL) { rc = -EUNATCH; goto out; } proc_spl_kmem_slab = proc_create_data("slab", 0444, proc_spl_kmem, &proc_slab_operations, NULL); if (proc_spl_kmem_slab == NULL) { rc = -EUNATCH; goto out; } proc_spl_kstat = proc_mkdir("kstat", proc_spl); if (proc_spl_kstat == NULL) { rc = -EUNATCH; goto out; } out: - if (rc) { - remove_proc_entry("kstat", proc_spl); - remove_proc_entry("slab", proc_spl_kmem); - remove_proc_entry("kmem", proc_spl); - remove_proc_entry("taskq-all", proc_spl); - remove_proc_entry("taskq", proc_spl); - remove_proc_entry("spl", NULL); - unregister_sysctl_table(spl_header); - } + if (rc) + spl_proc_cleanup(); return (rc); } void spl_proc_fini(void) { - remove_proc_entry("kstat", proc_spl); - remove_proc_entry("slab", proc_spl_kmem); - remove_proc_entry("kmem", proc_spl); - remove_proc_entry("taskq-all", proc_spl); - remove_proc_entry("taskq", proc_spl); - remove_proc_entry("spl", NULL); - - ASSERT(spl_header != NULL); - unregister_sysctl_table(spl_header); + spl_proc_cleanup(); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index 925ee9d9fe9c..48ac55f07034 100644 --- 
a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -1,1056 +1,1105 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_LINUX_BLK_CGROUP_HEADER #include #endif typedef struct vdev_disk { struct block_device *vd_bdev; krwlock_t vd_lock; } vdev_disk_t; /* * Unique identifier for the exclusive vdev holder. */ static void *zfs_vdev_holder = VDEV_HOLDER; /* * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the * device is missing. The missing path may be transient since the links * can be briefly removed and recreated in response to udev events. */ static uint_t zfs_vdev_open_timeout_ms = 1000; /* * Size of the "reserved" partition, in blocks. */ #define EFI_MIN_RESV_SIZE (16 * 1024) /* * Virtual device vector for disks. */ typedef struct dio_request { zio_t *dr_zio; /* Parent ZIO */ atomic_t dr_ref; /* References */ int dr_error; /* Bio error */ int dr_bio_count; /* Count of bio's */ struct bio *dr_bio[]; /* Attached bio's */ } dio_request_t; /* * BIO request failfast mask. */ static unsigned int zfs_vdev_failfast_mask = 1; +#ifdef HAVE_BLK_MODE_T +static blk_mode_t +#else static fmode_t +#endif vdev_bdev_mode(spa_mode_t spa_mode) { +#ifdef HAVE_BLK_MODE_T + blk_mode_t mode = 0; + + if (spa_mode & SPA_MODE_READ) + mode |= BLK_OPEN_READ; + + if (spa_mode & SPA_MODE_WRITE) + mode |= BLK_OPEN_WRITE; +#else fmode_t mode = 0; if (spa_mode & SPA_MODE_READ) mode |= FMODE_READ; if (spa_mode & SPA_MODE_WRITE) mode |= FMODE_WRITE; +#endif return (mode); } /* * Returns the usable capacity (in bytes) for the partition or disk. */ static uint64_t bdev_capacity(struct block_device *bdev) { return (i_size_read(bdev->bd_inode)); } #if !defined(HAVE_BDEV_WHOLE) static inline struct block_device * bdev_whole(struct block_device *bdev) { return (bdev->bd_contains); } #endif #if defined(HAVE_BDEVNAME) #define vdev_bdevname(bdev, name) bdevname(bdev, name) #else static inline void vdev_bdevname(struct block_device *bdev, char *name) { snprintf(name, BDEVNAME_SIZE, "%pg", bdev); } #endif /* * Returns the maximum expansion capacity of the block device (in bytes). * * It is possible to expand a vdev when it has been created as a wholedisk * and the containing block device has increased in capacity. Or when the * partition containing the pool has been manually increased in size. 
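/*
 * [Editor's sketch -- not part of this patch]  Illustration of the mode
 * mapping in vdev_bdev_mode() above: a read-write pool open translates to
 * BLK_OPEN_* flags on kernels with blk_mode_t (Linux 6.5+) and to FMODE_*
 * flags on older kernels.  "vdev_bdev_mode_example" is a hypothetical name.
 */
static void
vdev_bdev_mode_example(void)
{
#ifdef HAVE_BLK_MODE_T
    blk_mode_t m = vdev_bdev_mode(SPA_MODE_READ | SPA_MODE_WRITE);
    ASSERT3U(m, ==, BLK_OPEN_READ | BLK_OPEN_WRITE);
#else
    fmode_t m = vdev_bdev_mode(SPA_MODE_READ | SPA_MODE_WRITE);
    ASSERT3U(m, ==, FMODE_READ | FMODE_WRITE);
#endif
}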
* * This function is only responsible for calculating the potential expansion * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is * responsible for verifying the expected partition layout in the wholedisk * case, and updating the partition table if appropriate. Once the partition * size has been increased the additional capacity will be visible using * bdev_capacity(). * * The returned maximum expansion capacity is always expected to be larger, or * at the very least equal, to its usable capacity to prevent overestimating * the pool expandsize. */ static uint64_t bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) { uint64_t psize; int64_t available; if (wholedisk && bdev != bdev_whole(bdev)) { /* * When reporting maximum expansion capacity for a wholedisk * deduct any capacity which is expected to be lost due to * alignment restrictions. Over reporting this value isn't * harmful and would only result in slightly less capacity * than expected post expansion. * The estimated available space may be slightly smaller than * bdev_capacity() for devices where the number of sectors is * not a multiple of the alignment size and the partition layout * is keeping less than PARTITION_END_ALIGNMENT bytes after the * "reserved" EFI partition: in such cases return the device * usable capacity. */ available = i_size_read(bdev_whole(bdev)->bd_inode) - ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + PARTITION_END_ALIGNMENT) << SECTOR_BITS); psize = MAX(available, bdev_capacity(bdev)); } else { psize = bdev_capacity(bdev); } return (psize); } static void vdev_disk_error(zio_t *zio) { /* * This function can be called in interrupt context, for instance while * handling IRQs coming from a misbehaving disk device; use printk() * which is safe from any context. */ printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa), zio->io_vd->vdev_path, zio->io_error, zio->io_type, (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, zio->io_flags); } static void vdev_disk_kobj_evt_post(vdev_t *v) { vdev_disk_t *vd = v->vdev_tsd; if (vd && vd->vd_bdev) { spl_signal_kobj_evt(vd->vd_bdev); } else { vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n", v->vdev_path); } } +#if !defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) +/* + * Define a dummy struct blk_holder_ops for kernel versions + * prior to 6.5. + */ +struct blk_holder_ops {}; +#endif + +static struct block_device * +vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder, + const struct blk_holder_ops *hops) +{ +#ifdef HAVE_BLKDEV_GET_BY_PATH_4ARG + return (blkdev_get_by_path(path, + vdev_bdev_mode(mode) | BLK_OPEN_EXCL, holder, hops)); +#else + return (blkdev_get_by_path(path, + vdev_bdev_mode(mode) | FMODE_EXCL, holder)); +#endif +} + +static void +vdev_blkdev_put(struct block_device *bdev, spa_mode_t mode, void *holder) +{ +#ifdef HAVE_BLKDEV_PUT_HOLDER + return (blkdev_put(bdev, holder)); +#else + return (blkdev_put(bdev, vdev_bdev_mode(mode) | FMODE_EXCL)); +#endif +} + static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { struct block_device *bdev; +#ifdef HAVE_BLK_MODE_T + blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); +#else fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); +#endif hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; /* Must have a pathname and it must be absolute. 
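/*
 * [Editor's sketch -- not part of this patch]  Typical use of the two
 * compat wrappers above: open the device exclusively with the ZFS vdev
 * holder, inspect it, then drop it with the matching put.  The function
 * name and the probing itself are hypothetical; the wrappers, holder and
 * bdev_capacity() come from this file.
 */
static int
example_probe(const char *path, spa_mode_t mode)
{
    struct block_device *bdev;

    /* NULL blk_holder_ops; the dummy struct above covers pre-6.5 builds. */
    bdev = vdev_blkdev_get_by_path(path, mode, zfs_vdev_holder, NULL);
    if (IS_ERR(bdev))
        return (SET_ERROR(-PTR_ERR(bdev)));

    zfs_dbgmsg("%s is %llu bytes", path,
        (u_longlong_t)bdev_capacity(bdev));

    vdev_blkdev_put(bdev, mode, zfs_vdev_holder);
    return (0);
}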
*/ if (v->vdev_path == NULL || v->vdev_path[0] != '/') { v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; vdev_dbgmsg(v, "invalid vdev_path"); return (SET_ERROR(EINVAL)); } /* * Reopen the device if it is currently open. When expanding a * partition force re-scanning the partition table if userland * did not take care of this already. We need to do this while closed * in order to get an accurate updated block device size. Then * since udev may need to recreate the device links increase the * open retry timeout before reporting the device as unavailable. */ vd = v->vdev_tsd; if (vd) { char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; boolean_t reread_part = B_FALSE; rw_enter(&vd->vd_lock, RW_WRITER); bdev = vd->vd_bdev; vd->vd_bdev = NULL; if (bdev) { if (v->vdev_expanding && bdev != bdev_whole(bdev)) { vdev_bdevname(bdev_whole(bdev), disk_name + 5); /* * If userland has BLKPG_RESIZE_PARTITION, * then it should have updated the partition * table already. We can detect this by * comparing our current physical size * with that of the device. If they are * the same, then we must not have * BLKPG_RESIZE_PARTITION or it failed to * update the partition table online. We * fallback to rescanning the partition * table from the kernel below. However, * if the capacity already reflects the * updated partition, then we skip * rescanning the partition table here. */ if (v->vdev_psize == bdev_capacity(bdev)) reread_part = B_TRUE; } - blkdev_put(bdev, mode | FMODE_EXCL); + vdev_blkdev_put(bdev, mode, zfs_vdev_holder); } if (reread_part) { - bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL, - zfs_vdev_holder); + bdev = vdev_blkdev_get_by_path(disk_name, mode, + zfs_vdev_holder, NULL); if (!IS_ERR(bdev)) { int error = vdev_bdev_reread_part(bdev); - blkdev_put(bdev, mode | FMODE_EXCL); + vdev_blkdev_put(bdev, mode, zfs_vdev_holder); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); } } } } else { vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); rw_enter(&vd->vd_lock, RW_WRITER); } /* * Devices are always opened by the path provided at configuration * time. This means that if the provided path is a udev by-id path * then drives may be re-cabled without an issue. If the provided * path is a udev by-path path, then the physical location information * will be preserved. This can be critical for more complicated * configurations where drives are located in specific physical * locations to maximize the systems tolerance to component failure. * * Alternatively, you can provide your own udev rule to flexibly map * the drives as you see fit. It is not advised that you use the * /dev/[hd]d devices which may be reordered due to probing order. * Devices in the wrong locations will be detected by the higher * level vdev validation. * * The specified paths may be briefly removed and recreated in * response to udev events. This should be exceptionally unlikely * because the zpool command makes every effort to verify these paths * have already settled prior to reaching this point. Therefore, * a ENOENT failure at this point is highly likely to be transient * and it is reasonable to sleep and retry before giving up. In * practice delays have been observed to be on the order of 100ms. * * When ERESTARTSYS is returned it indicates the block device is * a zvol which could not be opened due to the deadlock detection * logic in zvol_open(). Extend the timeout and retry the open * subsequent attempts are expected to eventually succeed. 
*/ hrtime_t start = gethrtime(); bdev = ERR_PTR(-ENXIO); while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) { - bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL, - zfs_vdev_holder); + bdev = vdev_blkdev_get_by_path(v->vdev_path, mode, + zfs_vdev_holder, NULL); if (unlikely(PTR_ERR(bdev) == -ENOENT)) { /* * There is no point of waiting since device is removed * explicitly */ if (v->vdev_removed) break; schedule_timeout(MSEC_TO_TICK(10)); } else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) { timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); continue; } else if (IS_ERR(bdev)) { break; } } if (IS_ERR(bdev)) { int error = -PTR_ERR(bdev); vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, (u_longlong_t)(gethrtime() - start), (u_longlong_t)timeout); vd->vd_bdev = NULL; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); return (SET_ERROR(error)); } else { vd->vd_bdev = bdev; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); } /* Determine the physical block size */ int physical_block_size = bdev_physical_block_size(vd->vd_bdev); /* Determine the logical block size */ int logical_block_size = bdev_logical_block_size(vd->vd_bdev); /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ v->vdev_nowritecache = B_FALSE; /* Set when device reports it supports TRIM. */ v->vdev_has_trim = bdev_discard_supported(vd->vd_bdev); /* Set when device reports it supports secure TRIM. */ v->vdev_has_securetrim = bdev_secure_discard_supported(vd->vd_bdev); /* Inform the ZIO pipeline that we are non-rotational */ v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev)); /* Physical volume size in bytes for the partition */ *psize = bdev_capacity(vd->vd_bdev); /* Physical volume size in bytes including possible expansion space */ *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk); /* Based on the minimum sector size set the block size */ *physical_ashift = highbit64(MAX(physical_block_size, SPA_MINBLOCKSIZE)) - 1; *logical_ashift = highbit64(MAX(logical_block_size, SPA_MINBLOCKSIZE)) - 1; return (0); } static void vdev_disk_close(vdev_t *v) { vdev_disk_t *vd = v->vdev_tsd; if (v->vdev_reopening || vd == NULL) return; if (vd->vd_bdev != NULL) { - blkdev_put(vd->vd_bdev, - vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); + vdev_blkdev_put(vd->vd_bdev, spa_mode(v->vdev_spa), + zfs_vdev_holder); } rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); v->vdev_tsd = NULL; } static dio_request_t * vdev_disk_dio_alloc(int bio_count) { dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + sizeof (struct bio *) * bio_count, KM_SLEEP); atomic_set(&dr->dr_ref, 0); dr->dr_bio_count = bio_count; dr->dr_error = 0; for (int i = 0; i < dr->dr_bio_count; i++) dr->dr_bio[i] = NULL; return (dr); } static void vdev_disk_dio_free(dio_request_t *dr) { int i; for (i = 0; i < dr->dr_bio_count; i++) if (dr->dr_bio[i]) bio_put(dr->dr_bio[i]); kmem_free(dr, sizeof (dio_request_t) + sizeof (struct bio *) * dr->dr_bio_count); } static void vdev_disk_dio_get(dio_request_t *dr) { atomic_inc(&dr->dr_ref); } static void vdev_disk_dio_put(dio_request_t *dr) { int rc = atomic_dec_return(&dr->dr_ref); /* * Free the dio_request when the last reference is dropped and * ensure zio_interpret is called only once with the correct zio */ if (rc == 0) { zio_t *zio = dr->dr_zio; int error = dr->dr_error; vdev_disk_dio_free(dr); if (zio) { zio->io_error = error; ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); zio_delay_interrupt(zio); } } } BIO_END_IO_PROTO(vdev_disk_physio_completion, 
bio, error) { dio_request_t *dr = bio->bi_private; if (dr->dr_error == 0) { #ifdef HAVE_1ARG_BIO_END_IO_T dr->dr_error = BIO_END_IO_ERROR(bio); #else if (error) dr->dr_error = -(error); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) dr->dr_error = EIO; #endif } /* Drop reference acquired by __vdev_disk_physio */ vdev_disk_dio_put(dr); } static inline void vdev_submit_bio_impl(struct bio *bio) { #ifdef HAVE_1ARG_SUBMIT_BIO (void) submit_bio(bio); #else (void) submit_bio(bio_data_dir(bio), bio); #endif } /* * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so * replace it with preempt_schedule under the following condition: */ #if defined(CONFIG_ARM64) && \ defined(CONFIG_PREEMPTION) && \ defined(CONFIG_BLK_CGROUP) #define preempt_schedule_notrace(x) preempt_schedule(x) #endif /* * As for the Linux 5.18 kernel bio_alloc() expects a block_device struct * as an argument removing the need to set it with bio_set_dev(). This * removes the need for all of the following compatibility code. */ #if !defined(HAVE_BIO_ALLOC_4ARG) #ifdef HAVE_BIO_SET_DEV #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) /* * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). * As a side effect the function was converted to GPL-only. Define our * own version when needed which uses rcu_read_lock_sched(). * * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public * part, moving blkg_tryget into the private one. Define our own version. */ #if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET) static inline bool vdev_blkg_tryget(struct blkcg_gq *blkg) { struct percpu_ref *ref = &blkg->refcnt; unsigned long __percpu *count; bool rc; rcu_read_lock_sched(); if (__ref_is_percpu(ref, &count)) { this_cpu_inc(*count); rc = true; } else { #ifdef ZFS_PERCPU_REF_COUNT_IN_DATA rc = atomic_long_inc_not_zero(&ref->data->count); #else rc = atomic_long_inc_not_zero(&ref->count); #endif } rcu_read_unlock_sched(); return (rc); } #else #define vdev_blkg_tryget(bg) blkg_tryget(bg) #endif #ifdef HAVE_BIO_SET_DEV_MACRO /* * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the * GPL-only bio_associate_blkg() symbol thus inadvertently converting * the entire macro. Provide a minimal version which always assigns the * request queue's root_blkg to the bio. */ static inline void vdev_bio_associate_blkg(struct bio *bio) { #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bio->bi_bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif ASSERT3P(q, !=, NULL); ASSERT3P(bio->bi_blkg, ==, NULL); if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) bio->bi_blkg = q->root_blkg; } #define bio_associate_blkg vdev_bio_associate_blkg #else static inline void vdev_bio_set_dev(struct bio *bio, struct block_device *bdev) { #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif bio_clear_flag(bio, BIO_REMAPPED); if (bio->bi_bdev != bdev) bio_clear_flag(bio, BIO_THROTTLED); bio->bi_bdev = bdev; ASSERT3P(q, !=, NULL); ASSERT3P(bio->bi_blkg, ==, NULL); if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) bio->bi_blkg = q->root_blkg; } #define bio_set_dev vdev_bio_set_dev #endif #endif #else /* * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels. 
*/ static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) { bio->bi_bdev = bdev; } #endif /* HAVE_BIO_SET_DEV */ #endif /* !HAVE_BIO_ALLOC_4ARG */ static inline void vdev_submit_bio(struct bio *bio) { struct bio_list *bio_list = current->bio_list; current->bio_list = NULL; vdev_submit_bio_impl(bio); current->bio_list = bio_list; } static inline struct bio * vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, unsigned short nr_vecs) { struct bio *bio; #ifdef HAVE_BIO_ALLOC_4ARG bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask); #else bio = bio_alloc(gfp_mask, nr_vecs); if (likely(bio != NULL)) bio_set_dev(bio, bdev); #endif return (bio); } static inline unsigned int vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) { unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, bio_size, abd_offset); #ifdef HAVE_BIO_MAX_SEGS return (bio_max_segs(nr_segs)); #else return (MIN(nr_segs, BIO_MAX_PAGES)); #endif } static int __vdev_disk_physio(struct block_device *bdev, zio_t *zio, size_t io_size, uint64_t io_offset, int rw, int flags) { dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; int bio_size; int bio_count = 16; int error = 0; struct blk_plug plug; unsigned short nr_vecs; /* * Accessing outside the block device is never allowed. */ if (io_offset + io_size > bdev->bd_inode->i_size) { vdev_dbgmsg(zio->io_vd, "Illegal access %llu size %llu, device size %llu", (u_longlong_t)io_offset, (u_longlong_t)io_size, (u_longlong_t)i_size_read(bdev->bd_inode)); return (SET_ERROR(EIO)); } retry: dr = vdev_disk_dio_alloc(bio_count); if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && zio->io_vd->vdev_failfast == B_TRUE) { bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); } dr->dr_zio = zio; /* * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio * can cover at least 128KB and at most 1MB. When the required number * of iovec's exceeds this, we are forced to break the IO in multiple * bio's and wait for them all to complete. This is likely if the * recordsize property is increased beyond 1MB. The default * bio_count=16 should typically accommodate the maximum-size zio of * 16MB. */ abd_offset = 0; bio_offset = io_offset; bio_size = io_size; for (int i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ if (bio_size <= 0) break; /* * If additional bio's are required, we have to retry, but * this should be rare - see the comment above. 
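/*
 * [Editor's sketch -- not part of this patch]  Rough arithmetic behind the
 * comment above, assuming 4K pages: 256 iovecs x 4K = 1M per bio in the
 * best case, so a 16M zio fits in the default bio_count of 16.  When a
 * buffer needs more bios than were allocated, the count is doubled and
 * construction restarts, mirroring the "goto retry" path below.
 */
static int
example_bio_count(uint64_t io_size)
{
    uint64_t per_bio = 256ULL * PAGE_SIZE;  /* best case per-bio coverage */
    int count = 16;                         /* initial dio_request size */

    while ((uint64_t)count * per_bio < io_size)
        count *= 2;

    return (count);
}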
*/ if (dr->dr_bio_count == i) { vdev_disk_dio_free(dr); bio_count *= 2; goto retry; } nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset); dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); if (unlikely(dr->dr_bio[i] == NULL)) { vdev_disk_dio_free(dr); return (SET_ERROR(ENOMEM)); } /* Matching put called by vdev_disk_physio_completion */ vdev_disk_dio_get(dr); BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; dr->dr_bio[i]->bi_private = dr; bio_set_op_attrs(dr->dr_bio[i], rw, flags); /* Remaining size is returned to become the new size */ bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, bio_size, abd_offset); /* Advance in buffer and construct another bio if needed */ abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); } /* Extra reference to protect dio_request during vdev_submit_bio */ vdev_disk_dio_get(dr); if (dr->dr_bio_count > 1) blk_start_plug(&plug); /* Submit all bio's associated with this dio */ for (int i = 0; i < dr->dr_bio_count; i++) { if (dr->dr_bio[i]) vdev_submit_bio(dr->dr_bio[i]); } if (dr->dr_bio_count > 1) blk_finish_plug(&plug); vdev_disk_dio_put(dr); return (error); } BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) { zio_t *zio = bio->bi_private; #ifdef HAVE_1ARG_BIO_END_IO_T zio->io_error = BIO_END_IO_ERROR(bio); #else zio->io_error = -error; #endif if (zio->io_error && (zio->io_error == EOPNOTSUPP)) zio->io_vd->vdev_nowritecache = B_TRUE; bio_put(bio); ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); zio_interrupt(zio); } static int vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) { struct request_queue *q; struct bio *bio; q = bdev_get_queue(bdev); if (!q) return (SET_ERROR(ENXIO)); bio = vdev_bio_alloc(bdev, GFP_NOIO, 0); if (unlikely(bio == NULL)) return (SET_ERROR(ENOMEM)); bio->bi_end_io = vdev_disk_io_flush_completion; bio->bi_private = zio; bio_set_flush(bio); vdev_submit_bio(bio); invalidate_bdev(bdev); return (0); } static int vdev_disk_io_trim(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) if (zio->io_trim_flags & ZIO_TRIM_SECURE) { return (-blkdev_issue_secure_erase(vd->vd_bdev, zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); } else { return (-blkdev_issue_discard(vd->vd_bdev, zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); } #elif defined(HAVE_BLKDEV_ISSUE_DISCARD) unsigned long trim_flags = 0; #if defined(BLKDEV_DISCARD_SECURE) if (zio->io_trim_flags & ZIO_TRIM_SECURE) trim_flags |= BLKDEV_DISCARD_SECURE; #endif return (-blkdev_issue_discard(vd->vd_bdev, zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); #else #error "Unsupported kernel" #endif } static void vdev_disk_io_start(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; int rw, error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. * Nothing to be done here but return failure. */ if (vd == NULL) { zio->io_error = ENXIO; zio_interrupt(zio); return; } rw_enter(&vd->vd_lock, RW_READER); /* * If the vdev is closed, it's likely due to a failed reopen and is * in the UNAVAIL state. Nothing to be done here but return failure. 
*/ if (vd->vd_bdev == NULL) { rw_exit(&vd->vd_lock); zio->io_error = ENXIO; zio_interrupt(zio); return; } switch (zio->io_type) { case ZIO_TYPE_IOCTL: if (!vdev_readable(v)) { rw_exit(&vd->vd_lock); zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: if (zfs_nocacheflush) break; if (v->vdev_nowritecache) { zio->io_error = SET_ERROR(ENOTSUP); break; } error = vdev_disk_io_flush(vd->vd_bdev, zio); if (error == 0) { rw_exit(&vd->vd_lock); return; } zio->io_error = error; break; default: zio->io_error = SET_ERROR(ENOTSUP); } rw_exit(&vd->vd_lock); zio_execute(zio); return; case ZIO_TYPE_WRITE: rw = WRITE; break; case ZIO_TYPE_READ: rw = READ; break; case ZIO_TYPE_TRIM: zio->io_error = vdev_disk_io_trim(zio); rw_exit(&vd->vd_lock); zio_interrupt(zio); return; default: rw_exit(&vd->vd_lock); zio->io_error = SET_ERROR(ENOTSUP); zio_interrupt(zio); return; } zio->io_target_timestamp = zio_handle_io_delay(zio); error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_size, zio->io_offset, rw, 0); rw_exit(&vd->vd_lock); if (error) { zio->io_error = error; zio_interrupt(zio); return; } } static void vdev_disk_io_done(zio_t *zio) { /* * If the device returned EIO, we revalidate the media. If it is * determined the media has changed this triggers the asynchronous * removal of the device from the configuration. */ if (zio->io_error == EIO) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; if (!zfs_check_disk_status(vd->vd_bdev)) { invalidate_bdev(vd->vd_bdev); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } } } static void vdev_disk_hold(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); /* We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') return; /* * Only prefetch path and devid info if the device has * never been opened. */ if (vd->vdev_tsd != NULL) return; } static void vdev_disk_rele(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); /* XXX: Implement me as a vnode rele for the device */ } vdev_ops_t vdev_disk_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, .vdev_op_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_disk_io_start, .vdev_op_io_done = vdev_disk_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = vdev_disk_hold, .vdev_op_rele = vdev_disk_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ .vdev_op_leaf = B_TRUE, /* leaf vdev */ .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post }; /* * The zfs_vdev_scheduler module option has been deprecated. Setting this * value no longer has any effect. It has not yet been entirely removed * to allow the module to be loaded if this option is specified in the * /etc/modprobe.d/zfs.conf file. The following warning will be logged. 
*/ static int param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) { int error = param_set_charp(val, kp); if (error == 0) { printk(KERN_INFO "The 'zfs_vdev_scheduler' module option " "is not supported.\n"); } return (error); } static const char *zfs_vdev_scheduler = "unused"; module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, param_get_charp, &zfs_vdev_scheduler, 0644); MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); int param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) { uint_t val; int error; error = kstrtouint(buf, 0, &val); if (error < 0) return (SET_ERROR(error)); if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) return (SET_ERROR(-EINVAL)); error = param_set_uint(buf, kp); if (error < 0) return (SET_ERROR(error)); return (0); } int param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) { uint_t val; int error; error = kstrtouint(buf, 0, &val); if (error < 0) return (SET_ERROR(error)); if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) return (SET_ERROR(-EINVAL)); error = param_set_uint(buf, kp); if (error < 0) return (SET_ERROR(error)); return (0); } ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, "Timeout before determining that a device is missing"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c index c45a3eb5a4eb..02cb379ea840 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c @@ -1,1318 +1,1317 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (C) 2011 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * LLNL-CODE-403049. * Rewritten for Linux by: * Rohan Puri * Brian Behlendorf * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2018 George Melikov. All Rights Reserved. * Copyright (c) 2019 Datto, Inc. All rights reserved. * Copyright (c) 2020 The MathWorks, Inc. All rights reserved. */ /* * ZFS control directory (a.k.a. ".zfs") * * This directory provides a common location for all ZFS meta-objects. * Currently, this is only the 'snapshot' and 'shares' directory, but this may * expand in the future. The elements are built dynamically, as the hierarchy * does not actually exist on disk. * * For 'snapshot', we don't want to have all snapshots always mounted, because * this would take up a huge amount of space in /etc/mnttab. 
We have three * types of objects: * * ctldir ------> snapshotdir -------> snapshot * | * | * V * mounted fs * * The 'snapshot' node contains just enough information to lookup '..' and act * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we * perform an automount of the underlying filesystem and return the * corresponding inode. * * All mounts are handled automatically by an user mode helper which invokes * the mount procedure. Unmounts are handled by allowing the mount * point to expire so the kernel may automatically unmount it. * * The '.zfs', '.zfs/snapshot', and all directories created under * '.zfs/snapshot' (ie: '.zfs/snapshot/') all share the same * zfsvfs_t as the head filesystem (what '.zfs' lives under). * * File systems mounted on top of the '.zfs/snapshot/' paths * (ie: snapshots) are complete ZFS filesystems and have their own unique * zfsvfs_t. However, the fsid reported by these mounts will be the same * as that used by the parent zfsvfs_t to make NFS happy. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" /* * Two AVL trees are maintained which contain all currently automounted * snapshots. Every automounted snapshots maps to a single zfs_snapentry_t * entry which MUST: * * - be attached to both trees, and * - be unique, no duplicate entries are allowed. * * The zfs_snapshots_by_name tree is indexed by the full dataset name * while the zfs_snapshots_by_objsetid tree is indexed by the unique * objsetid. This allows for fast lookups either by name or objsetid. */ static avl_tree_t zfs_snapshots_by_name; static avl_tree_t zfs_snapshots_by_objsetid; static krwlock_t zfs_snapshot_lock; /* * Control Directory Tunables (.zfs) */ int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT; static int zfs_admin_snapshot = 0; typedef struct { char *se_name; /* full snapshot name */ char *se_path; /* full mount path */ spa_t *se_spa; /* pool spa */ uint64_t se_objsetid; /* snapshot objset id */ struct dentry *se_root_dentry; /* snapshot root dentry */ krwlock_t se_taskqid_lock; /* scheduled unmount taskqid lock */ taskqid_t se_taskqid; /* scheduled unmount taskqid */ avl_node_t se_node_name; /* zfs_snapshots_by_name link */ avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */ zfs_refcount_t se_refcount; /* reference count */ } zfs_snapentry_t; static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay); /* * Allocate a new zfs_snapentry_t being careful to make a copy of the * the snapshot name and provided mount point. No reference is taken. */ static zfs_snapentry_t * zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa, uint64_t objsetid, struct dentry *root_dentry) { zfs_snapentry_t *se; se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP); se->se_name = kmem_strdup(full_name); se->se_path = kmem_strdup(full_path); se->se_spa = spa; se->se_objsetid = objsetid; se->se_root_dentry = root_dentry; se->se_taskqid = TASKQID_INVALID; rw_init(&se->se_taskqid_lock, NULL, RW_DEFAULT, NULL); zfs_refcount_create(&se->se_refcount); return (se); } /* * Free a zfs_snapentry_t the caller must ensure there are no active * references. 
*/ static void zfsctl_snapshot_free(zfs_snapentry_t *se) { zfs_refcount_destroy(&se->se_refcount); kmem_strfree(se->se_name); kmem_strfree(se->se_path); rw_destroy(&se->se_taskqid_lock); kmem_free(se, sizeof (zfs_snapentry_t)); } /* * Hold a reference on the zfs_snapentry_t. */ static void zfsctl_snapshot_hold(zfs_snapentry_t *se) { zfs_refcount_add(&se->se_refcount, NULL); } /* * Release a reference on the zfs_snapentry_t. When the number of * references drops to zero the structure will be freed. */ static void zfsctl_snapshot_rele(zfs_snapentry_t *se) { if (zfs_refcount_remove(&se->se_refcount, NULL) == 0) zfsctl_snapshot_free(se); } /* * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and * zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part * of the trees a reference is held. */ static void zfsctl_snapshot_add(zfs_snapentry_t *se) { ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); zfsctl_snapshot_hold(se); avl_add(&zfs_snapshots_by_name, se); avl_add(&zfs_snapshots_by_objsetid, se); } /* * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and * zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped, * this can result in the structure being freed if that was the last * remaining reference. */ static void zfsctl_snapshot_remove(zfs_snapentry_t *se) { ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); avl_remove(&zfs_snapshots_by_name, se); avl_remove(&zfs_snapshots_by_objsetid, se); zfsctl_snapshot_rele(se); } /* * Snapshot name comparison function for the zfs_snapshots_by_name. */ static int snapentry_compare_by_name(const void *a, const void *b) { const zfs_snapentry_t *se_a = a; const zfs_snapentry_t *se_b = b; int ret; ret = strcmp(se_a->se_name, se_b->se_name); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); } /* * Snapshot name comparison function for the zfs_snapshots_by_objsetid. */ static int snapentry_compare_by_objsetid(const void *a, const void *b) { const zfs_snapentry_t *se_a = a; const zfs_snapentry_t *se_b = b; if (se_a->se_spa != se_b->se_spa) return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1); if (se_a->se_objsetid < se_b->se_objsetid) return (-1); else if (se_a->se_objsetid > se_b->se_objsetid) return (1); else return (0); } /* * Find a zfs_snapentry_t in zfs_snapshots_by_name. If the snapname * is found a pointer to the zfs_snapentry_t is returned and a reference * taken on the structure. The caller is responsible for dropping the * reference with zfsctl_snapshot_rele(). If the snapname is not found * NULL will be returned. */ static zfs_snapentry_t * zfsctl_snapshot_find_by_name(const char *snapname) { zfs_snapentry_t *se, search; ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); search.se_name = (char *)snapname; se = avl_find(&zfs_snapshots_by_name, &search, NULL); if (se) zfsctl_snapshot_hold(se); return (se); } /* * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id * rather than the snapname. In all other respects it behaves the same * as zfsctl_snapshot_find_by_name(). */ static zfs_snapentry_t * zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid) { zfs_snapentry_t *se, search; ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); search.se_spa = spa; search.se_objsetid = objsetid; se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL); if (se) zfsctl_snapshot_hold(se); return (se); } /* * Rename a zfs_snapentry_t in the zfs_snapshots_by_name. The structure is * removed, renamed, and added back to the new correct location in the tree. 
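/*
 * [Editor's sketch -- not part of this patch]  Typical lookup pattern for
 * the AVL trees above: take zfs_snapshot_lock, find the entry (returned
 * held), drop the lock, use the entry, then release the reference.  The
 * helper name is hypothetical; the lock, lookup and rele come from this
 * file, and the same pattern appears in zfsctl_snapshot_unmount() below.
 */
static int
example_snapshot_path(const char *snapname, char *path, size_t len)
{
    zfs_snapentry_t *se;

    rw_enter(&zfs_snapshot_lock, RW_READER);
    se = zfsctl_snapshot_find_by_name(snapname);
    rw_exit(&zfs_snapshot_lock);

    if (se == NULL)
        return (SET_ERROR(ENOENT));

    /* Safe after dropping the lock because the entry is held. */
    (void) strlcpy(path, se->se_path, len);
    zfsctl_snapshot_rele(se);

    return (0);
}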
*/ static int zfsctl_snapshot_rename(const char *old_snapname, const char *new_snapname) { zfs_snapentry_t *se; ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); se = zfsctl_snapshot_find_by_name(old_snapname); if (se == NULL) return (SET_ERROR(ENOENT)); zfsctl_snapshot_remove(se); kmem_strfree(se->se_name); se->se_name = kmem_strdup(new_snapname); zfsctl_snapshot_add(se); zfsctl_snapshot_rele(se); return (0); } /* * Delayed task responsible for unmounting an expired automounted snapshot. */ static void snapentry_expire(void *data) { zfs_snapentry_t *se = (zfs_snapentry_t *)data; spa_t *spa = se->se_spa; uint64_t objsetid = se->se_objsetid; if (zfs_expire_snapshot <= 0) { zfsctl_snapshot_rele(se); return; } rw_enter(&se->se_taskqid_lock, RW_WRITER); se->se_taskqid = TASKQID_INVALID; rw_exit(&se->se_taskqid_lock); (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE); zfsctl_snapshot_rele(se); /* * Reschedule the unmount if the zfs_snapentry_t wasn't removed. * This can occur when the snapshot is busy. */ rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); zfsctl_snapshot_rele(se); } rw_exit(&zfs_snapshot_lock); } /* * Cancel an automatic unmount of a snapname. This callback is responsible * for dropping the reference on the zfs_snapentry_t which was taken when * during dispatch. */ static void zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se) { int err = 0; rw_enter(&se->se_taskqid_lock, RW_WRITER); err = taskq_cancel_id(system_delay_taskq, se->se_taskqid); /* * if we get ENOENT, the taskq couldn't be found to be * canceled, so we can just mark it as invalid because * it's already gone. If we got EBUSY, then we already * blocked until it was gone _anyway_, so we don't care. */ se->se_taskqid = TASKQID_INVALID; rw_exit(&se->se_taskqid_lock); if (err == 0) { zfsctl_snapshot_rele(se); } } /* * Dispatch the unmount task for delayed handling with a hold protecting it. */ static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) { if (delay <= 0) return; zfsctl_snapshot_hold(se); rw_enter(&se->se_taskqid_lock, RW_WRITER); /* * If this condition happens, we managed to: * - dispatch once * - want to dispatch _again_ before it returned * * So let's just return - if that task fails at unmounting, * we'll eventually dispatch again, and if it succeeds, * no problem. */ if (se->se_taskqid != TASKQID_INVALID) { rw_exit(&se->se_taskqid_lock); zfsctl_snapshot_rele(se); return; } se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); rw_exit(&se->se_taskqid_lock); } /* * Schedule an automatic unmount of objset id to occur in delay seconds from * now. Any previous delayed unmount will be cancelled in favor of the * updated deadline. A reference is taken by zfsctl_snapshot_find_by_name() * and held until the outstanding task is handled or cancelled. */ int zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay) { zfs_snapentry_t *se; int error = ENOENT; rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { zfsctl_snapshot_unmount_cancel(se); zfsctl_snapshot_unmount_delay_impl(se, delay); zfsctl_snapshot_rele(se); error = 0; } rw_exit(&zfs_snapshot_lock); return (error); } /* * Check if snapname is currently mounted. Returned non-zero when mounted * and zero when unmounted. 
*/ static boolean_t zfsctl_snapshot_ismounted(const char *snapname) { zfs_snapentry_t *se; boolean_t ismounted = B_FALSE; rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) { zfsctl_snapshot_rele(se); ismounted = B_TRUE; } rw_exit(&zfs_snapshot_lock); return (ismounted); } /* * Check if the given inode is a part of the virtual .zfs directory. */ boolean_t zfsctl_is_node(struct inode *ip) { return (ITOZ(ip)->z_is_ctldir); } /* * Check if the given inode is a .zfs/snapshots/snapname directory. */ boolean_t zfsctl_is_snapdir(struct inode *ip) { return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS)); } /* * Allocate a new inode with the passed id and ops. */ static struct inode * zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, const struct file_operations *fops, const struct inode_operations *ops, uint64_t creation) { struct inode *ip; znode_t *zp; inode_timespec_t now = {.tv_sec = creation}; ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); if (!creation) now = current_time(ip); zp = ITOZ(ip); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_id = id; zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; zp->z_zn_prefetch = B_FALSE; zp->z_is_sa = B_FALSE; #if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) zp->z_is_mapped = B_FALSE; #endif zp->z_is_ctldir = B_TRUE; zp->z_sa_hdl = NULL; zp->z_blksz = 0; zp->z_seq = 0; zp->z_mapcnt = 0; zp->z_size = 0; zp->z_pflags = 0; zp->z_mode = 0; zp->z_sync_cnt = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; ip->i_generation = 0; ip->i_ino = id; ip->i_mode = (S_IFDIR | S_IRWXUGO); ip->i_uid = SUID_TO_KUID(0); ip->i_gid = SGID_TO_KGID(0); ip->i_blkbits = SPA_MINBLOCKSHIFT; ip->i_atime = now; ip->i_mtime = now; ip->i_ctime = now; ip->i_fop = fops; ip->i_op = ops; #if defined(IOP_XATTR) ip->i_opflags &= ~IOP_XATTR; #endif if (insert_inode_locked(ip)) { unlock_new_inode(ip); iput(ip); return (NULL); } mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes++; membar_producer(); mutex_exit(&zfsvfs->z_znodes_lock); unlock_new_inode(ip); return (ip); } /* * Lookup the inode with given id, it will be allocated if needed. */ static struct inode * zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, const struct file_operations *fops, const struct inode_operations *ops) { struct inode *ip = NULL; uint64_t creation = 0; dsl_dataset_t *snap_ds; dsl_pool_t *pool; while (ip == NULL) { ip = ilookup(zfsvfs->z_sb, (unsigned long)id); if (ip) break; if (id <= ZFSCTL_INO_SNAPDIRS && !creation) { pool = dmu_objset_pool(zfsvfs->z_os); dsl_pool_config_enter(pool, FTAG); if (!dsl_dataset_hold_obj(pool, ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) { creation = dsl_get_creation(snap_ds); dsl_dataset_rele(snap_ds, FTAG); } dsl_pool_config_exit(pool, FTAG); } /* May fail due to concurrent zfsctl_inode_alloc() */ ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation); } return (ip); } /* * Create the '.zfs' directory. This directory is cached as part of the VFS * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount() * therefore checks against a vfs_count of 2 instead of 1. This reference * is removed when the ctldir is destroyed in the unmount. All other entities * under the '.zfs' directory are created dynamically as needed. 
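/*
 * [Editor's sketch -- not part of this patch]  The snapshot directories
 * above derive their inode numbers from the snapshot object id: inodes are
 * allocated as ZFSCTL_INO_SNAPDIRS - id (see zfsctl_inode_lookup() above
 * and zfsctl_snapdir_lookup() below), and zfsctl_snapdir_fid() reverses the
 * mapping.  Hypothetical helpers spelling out the two directions:
 */
static inline uint64_t
example_objsetid_to_ino(uint64_t objsetid)
{
    return (ZFSCTL_INO_SNAPDIRS - objsetid);
}

static inline uint64_t
example_ino_to_objsetid(struct inode *ip)
{
    ASSERT(zfsctl_is_snapdir(ip));
    return (ZFSCTL_INO_SNAPDIRS - ip->i_ino);
}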
* * Because the dynamically created '.zfs' directory entries assume the use * of 64-bit inode numbers this support must be disabled on 32-bit systems. */ int zfsctl_create(zfsvfs_t *zfsvfs) { ASSERT(zfsvfs->z_ctldir == NULL); zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, &zpl_fops_root, &zpl_ops_root, 0); if (zfsvfs->z_ctldir == NULL) return (SET_ERROR(ENOENT)); return (0); } /* * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name. * Only called when the filesystem is unmounted. */ void zfsctl_destroy(zfsvfs_t *zfsvfs) { if (zfsvfs->z_issnap) { zfs_snapentry_t *se; spa_t *spa = zfsvfs->z_os->os_spa; uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); rw_enter(&zfs_snapshot_lock, RW_WRITER); se = zfsctl_snapshot_find_by_objsetid(spa, objsetid); if (se != NULL) zfsctl_snapshot_remove(se); rw_exit(&zfs_snapshot_lock); if (se != NULL) { zfsctl_snapshot_unmount_cancel(se); zfsctl_snapshot_rele(se); } } else if (zfsvfs->z_ctldir) { iput(zfsvfs->z_ctldir); zfsvfs->z_ctldir = NULL; } } /* * Given a root znode, retrieve the associated .zfs directory. * Add a hold to the vnode and return it. */ struct inode * zfsctl_root(znode_t *zp) { ASSERT(zfs_has_ctldir(zp)); /* Must have an existing ref, so igrab() cannot return NULL */ VERIFY3P(igrab(ZTOZSB(zp)->z_ctldir), !=, NULL); return (ZTOZSB(zp)->z_ctldir); } /* * Generate a long fid to indicate a snapdir. We encode whether snapdir is * already mounted in gen field. We do this because nfsd lookup will not * trigger automount. Next time the nfsd does fh_to_dentry, we will notice * this and do automount and return ESTALE to force nfsd revalidate and follow * mount. */ static int zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp) { zfid_short_t *zfid = (zfid_short_t *)fidp; zfid_long_t *zlfid = (zfid_long_t *)fidp; uint32_t gen = 0; uint64_t object; uint64_t objsetid; int i; struct dentry *dentry; if (fidp->fid_len < LONG_FID_LEN) { fidp->fid_len = LONG_FID_LEN; return (SET_ERROR(ENOSPC)); } object = ip->i_ino; objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino; zfid->zf_len = LONG_FID_LEN; dentry = d_obtain_alias(igrab(ip)); if (!IS_ERR(dentry)) { gen = !!d_mountpoint(dentry); dput(dentry); } for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; return (0); } /* * Generate an appropriate fid for an entry in the .zfs directory. 
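/*
 * [Editor's sketch -- not part of this patch]  Inverse of the byte packing
 * done by zfsctl_snapdir_fid() above: reassemble the inode number and the
 * objset id from a long fid.  The function name is hypothetical; the fid
 * layout (zf_object, zf_setid byte arrays) is the one used above.
 */
static void
example_decode_snapdir_fid(const fid_t *fidp, uint64_t *object,
    uint64_t *objsetid)
{
    const zfid_short_t *zfid = (const zfid_short_t *)fidp;
    const zfid_long_t *zlfid = (const zfid_long_t *)fidp;
    int i;

    *object = 0;
    *objsetid = 0;

    for (i = 0; i < sizeof (zfid->zf_object); i++)
        *object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

    for (i = 0; i < sizeof (zlfid->zf_setid); i++)
        *objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
}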
*/ int zfsctl_fid(struct inode *ip, fid_t *fidp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t object = zp->z_id; zfid_short_t *zfid; int i; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (zfsctl_is_snapdir(ip)) { zfs_exit(zfsvfs, FTAG); return (zfsctl_snapdir_fid(ip, fidp)); } if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOSPC)); } zfid = (zfid_short_t *)fidp; zfid->zf_len = SHORT_FID_LEN; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* .zfs znodes always have a generation number of 0 */ for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = 0; zfs_exit(zfsvfs, FTAG); return (0); } /* * Construct a full dataset name in full_name: "pool/dataset@snap_name" */ static int zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len, char *full_name) { objset_t *os = zfsvfs->z_os; if (zfs_component_namecheck(snap_name, NULL, NULL) != 0) return (SET_ERROR(EILSEQ)); dmu_objset_name(os, full_name); if ((strlen(full_name) + 1 + strlen(snap_name)) >= len) return (SET_ERROR(ENAMETOOLONG)); (void) strcat(full_name, "@"); (void) strcat(full_name, snap_name); return (0); } /* * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/" */ static int zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid, int path_len, char *full_path) { objset_t *os = zfsvfs->z_os; fstrans_cookie_t cookie; char *snapname; boolean_t case_conflict; uint64_t id, pos = 0; int error = 0; if (zfsvfs->z_vfs->vfs_mntpoint == NULL) return (SET_ERROR(ENOENT)); cookie = spl_fstrans_mark(); snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); while (error == 0) { dsl_pool_config_enter(dmu_objset_pool(os), FTAG); error = dmu_snapshot_list_next(zfsvfs->z_os, ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos, &case_conflict); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto out; if (id == objsetid) break; } snprintf(full_path, path_len, "%s/.zfs/snapshot/%s", zfsvfs->z_vfs->vfs_mntpoint, snapname); out: kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); spl_fstrans_unmark(cookie); return (error); } /* * Special case the handling of "..". */ int zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ITOZSB(dip); int error = 0; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (strcmp(name, "..") == 0) { *ipp = dip->i_sb->s_root->d_inode; } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) { *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR, &zpl_fops_snapdir, &zpl_ops_snapdir); } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) { *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES, &zpl_fops_shares, &zpl_ops_shares); } else { *ipp = NULL; } if (*ipp == NULL) error = SET_ERROR(ENOENT); zfs_exit(zfsvfs, FTAG); return (error); } /* * Lookup entry point for the 'snapshot' directory. Try to open the * snapshot if it exist, creating the pseudo filesystem inode as necessary. 
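/*
 * [Editor's sketch -- not part of this patch]  Example of the name helper
 * above: for a filesystem whose objset is "tank/home" and the snapshot
 * component "monday", zfsctl_snapshot_name() produces the full dataset name
 * "tank/home@monday", which corresponds to the mount path
 * "/tank/home/.zfs/snapshot/monday" built by zfsctl_snapshot_path_objset().
 * The dataset names and the helper below are hypothetical.
 */
static int
example_full_snapshot_name(zfsvfs_t *zfsvfs, char *buf)
{
    /* buf must provide ZFS_MAX_DATASET_NAME_LEN bytes */
    return (zfsctl_snapshot_name(zfsvfs, "monday",
        ZFS_MAX_DATASET_NAME_LEN, buf));
}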
*/ int zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ITOZSB(dip); uint64_t id; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id); if (error) { zfs_exit(zfsvfs, FTAG); return (error); } *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id, &simple_dir_operations, &simple_dir_inode_operations); if (*ipp == NULL) error = SET_ERROR(ENOENT); zfs_exit(zfsvfs, FTAG); return (error); } /* * Renaming a directory under '.zfs/snapshot' will automatically trigger * a rename of the snapshot to the new given name. The rename is confined * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere. */ int zfsctl_snapdir_rename(struct inode *sdip, const char *snm, struct inode *tdip, const char *tnm, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = ITOZSB(sdip); char *to, *from, *real, *fsname; int error; if (!zfs_admin_snapshot) return (SET_ERROR(EACCES)); if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { error = dmu_snapshot_realname(zfsvfs->z_os, snm, real, ZFS_MAX_DATASET_NAME_LEN, NULL); if (error == 0) { snm = real; } else if (error != ENOTSUP) { goto out; } } dmu_objset_name(zfsvfs->z_os, fsname); error = zfsctl_snapshot_name(ITOZSB(sdip), snm, ZFS_MAX_DATASET_NAME_LEN, from); if (error == 0) error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, ZFS_MAX_DATASET_NAME_LEN, to); if (error == 0) error = zfs_secpolicy_rename_perms(from, to, cr); if (error != 0) goto out; /* * Cannot move snapshots out of the snapdir. */ if (sdip != tdip) { error = SET_ERROR(EINVAL); goto out; } /* * No-op when names are identical. */ if (strcmp(snm, tnm) == 0) { error = 0; goto out; } rw_enter(&zfs_snapshot_lock, RW_WRITER); error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); if (error == 0) (void) zfsctl_snapshot_rename(snm, tnm); rw_exit(&zfs_snapshot_lock); out: kmem_free(from, ZFS_MAX_DATASET_NAME_LEN); kmem_free(to, ZFS_MAX_DATASET_NAME_LEN); kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); zfs_exit(zfsvfs, FTAG); return (error); } /* * Removing a directory under '.zfs/snapshot' will automatically trigger * the removal of the snapshot with the given name. 
*/ int zfsctl_snapdir_remove(struct inode *dip, const char *name, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = ITOZSB(dip); char *snapname, *real; int error; if (!zfs_admin_snapshot) return (SET_ERROR(EACCES)); if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { error = dmu_snapshot_realname(zfsvfs->z_os, name, real, ZFS_MAX_DATASET_NAME_LEN, NULL); if (error == 0) { name = real; } else if (error != ENOTSUP) { goto out; } } error = zfsctl_snapshot_name(ITOZSB(dip), name, ZFS_MAX_DATASET_NAME_LEN, snapname); if (error == 0) error = zfs_secpolicy_destroy_perms(snapname, cr); if (error != 0) goto out; error = zfsctl_snapshot_unmount(snapname, MNT_FORCE); if ((error == 0) || (error == ENOENT)) error = dsl_destroy_snapshot(snapname, B_FALSE); out: kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); zfs_exit(zfsvfs, FTAG); return (error); } /* * Creating a directory under '.zfs/snapshot' will automatically trigger * the creation of a new snapshot with the given name. */ int zfsctl_snapdir_mkdir(struct inode *dip, const char *dirname, vattr_t *vap, struct inode **ipp, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = ITOZSB(dip); char *dsname; int error; if (!zfs_admin_snapshot) return (SET_ERROR(EACCES)); dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zfs_component_namecheck(dirname, NULL, NULL) != 0) { error = SET_ERROR(EILSEQ); goto out; } dmu_objset_name(zfsvfs->z_os, dsname); error = zfs_secpolicy_snapshot_perms(dsname, cr); if (error != 0) goto out; if (error == 0) { error = dmu_objset_snapshot_one(dsname, dirname); if (error != 0) goto out; error = zfsctl_snapdir_lookup(dip, dirname, ipp, 0, cr, NULL, NULL); } out: kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); return (error); } /* * Flush everything out of the kernel's export table and such. * This is needed as once the snapshot is used over NFS, its * entries in svc_export and svc_expkey caches hold reference * to the snapshot mount point. There is no known way of flushing * only the entries related to the snapshot. */ static void exportfs_flush(void) { char *argv[] = { "/usr/sbin/exportfs", "-f", NULL }; char *envp[] = { NULL }; (void) call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); } /* * Attempt to unmount a snapshot by making a call to user space. * There is no assurance that this can or will succeed, is just a * best effort. In the case where it does fail, perhaps because * it's in use, the unmount will fail harmlessly. */ int zfsctl_snapshot_unmount(const char *snapname, int flags) { char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL, NULL }; char *envp[] = { NULL }; zfs_snapentry_t *se; int error; rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) { rw_exit(&zfs_snapshot_lock); return (SET_ERROR(ENOENT)); } rw_exit(&zfs_snapshot_lock); exportfs_flush(); if (flags & MNT_FORCE) argv[4] = "-fn"; argv[5] = se->se_path; dprintf("unmount; path=%s\n", se->se_path); error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); zfsctl_snapshot_rele(se); /* * The umount system utility will return 256 on error. We must * assume this error is because the file system is busy so it is * converted to the more sensible EBUSY. 
*/ if (error) error = SET_ERROR(EBUSY); return (error); } int zfsctl_snapshot_mount(struct path *path, int flags) { struct dentry *dentry = path->dentry; struct inode *ip = dentry->d_inode; zfsvfs_t *zfsvfs; zfsvfs_t *snap_zfsvfs; zfs_snapentry_t *se; char *full_name, *full_path; char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, NULL }; char *envp[] = { NULL }; int error; struct path spath; if (ip == NULL) return (SET_ERROR(EISDIR)); zfsvfs = ITOZSB(ip); if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP); error = zfsctl_snapshot_name(zfsvfs, dname(dentry), ZFS_MAX_DATASET_NAME_LEN, full_name); if (error) goto error; /* * Construct a mount point path from sb of the ctldir inode and dirent * name, instead of from d_path(), so that chroot'd process doesn't fail * on mount.zfs(8). */ snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s", zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "", dname(dentry)); /* * Multiple concurrent automounts of a snapshot are never allowed. * The snapshot may be manually mounted as many times as desired. */ if (zfsctl_snapshot_ismounted(full_name)) { error = 0; goto error; } /* * Attempt to mount the snapshot from user space. Normally this * would be done using the vfs_kern_mount() function, however that * function is marked GPL-only and cannot be used. On error we * careful to log the real error to the console and return EISDIR * to safely abort the automount. This should be very rare. * * If the user mode helper happens to return EBUSY, a concurrent * mount is already in progress in which case the error is ignored. * Take note that if the program was executed successfully the return * value from call_usermodehelper() will be (exitcode << 8 + signal). */ dprintf("mount; name=%s path=%s\n", full_name, full_path); argv[5] = full_name; argv[6] = full_path; error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { if (!(error & MOUNT_BUSY << 8)) { zfs_dbgmsg("Unable to automount %s error=%d", full_path, error); error = SET_ERROR(EISDIR); } else { /* * EBUSY, this could mean a concurrent mount, or the * snapshot has already been mounted at completely * different place. We return 0 so VFS will retry. For * the latter case the VFS will retry several times * and return ELOOP, which is probably not a very good * behavior. */ error = 0; } goto error; } /* * Follow down in to the mounted snapshot and set MNT_SHRINKABLE * to identify this as an automounted filesystem. 
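/*
 * Illustrative sketch (standalone C, not from the upstream sources): reading
 * the shifted status returned by the user-mode helper, per the comments
 * above -- (exitcode << 8) + signal.  umount(8) exiting with 1 shows up as
 * 256 (mapped to EBUSY in the unmount path), and a mount helper exiting with
 * MOUNT_BUSY shows up as MOUNT_BUSY << 8 (ignored in the mount path).
 * EXAMPLE_MOUNT_BUSY is a stand-in; the real MOUNT_BUSY value isn't assumed.
 */
#include <stdio.h>

#define	EXAMPLE_MOUNT_BUSY	16	/* placeholder exit code */

static void
describe_helper_status(int status)
{
	int exitcode = (status >> 8) & 0xff;
	int sig = status & 0xff;

	if (status == 0)
		printf("helper succeeded\n");
	else if (sig != 0)
		printf("helper killed by signal %d\n", sig);
	else if (exitcode == EXAMPLE_MOUNT_BUSY)
		printf("already mounted (mount path treats this as success)\n");
	else
		printf("helper failed, exit %d (unmount path maps to EBUSY)\n",
		    exitcode);
}

int
main(void)
{
	describe_helper_status(256);			/* umount exit 1 */
	describe_helper_status(EXAMPLE_MOUNT_BUSY << 8);
	return (0);
}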
*/ spath = *path; path_get(&spath); if (follow_down_one(&spath)) { snap_zfsvfs = ITOZSB(spath.dentry->d_inode); snap_zfsvfs->z_parent = zfsvfs; dentry = spath.dentry; spath.mnt->mnt_flags |= MNT_SHRINKABLE; rw_enter(&zfs_snapshot_lock, RW_WRITER); se = zfsctl_snapshot_alloc(full_name, full_path, snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os), dentry); zfsctl_snapshot_add(se); zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); rw_exit(&zfs_snapshot_lock); } path_put(&spath); error: kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN); kmem_free(full_path, MAXPATHLEN); zfs_exit(zfsvfs, FTAG); return (error); } /* * Get the snapdir inode from fid */ int zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen, struct inode **ipp) { int error; struct path path; char *mnt; struct dentry *dentry; mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP); error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid, MAXPATHLEN, mnt); if (error) goto out; /* Trigger automount */ error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path); if (error) goto out; path_put(&path); /* * Get the snapdir inode. Note, we don't want to use the above * path because it contains the root of the snapshot rather * than the snapdir. */ *ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid); if (*ipp == NULL) { error = SET_ERROR(ENOENT); goto out; } /* check gen, see zfsctl_snapdir_fid */ dentry = d_obtain_alias(igrab(*ipp)); if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) { iput(*ipp); *ipp = NULL; error = SET_ERROR(ENOENT); } if (!IS_ERR(dentry)) dput(dentry); out: kmem_free(mnt, MAXPATHLEN); return (error); } int zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ITOZSB(dip); znode_t *zp; znode_t *dzp; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (zfsvfs->z_shares_dir == 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { error = zfs_lookup(dzp, name, &zp, 0, cr, NULL, NULL); zrele(dzp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Initialize the various pieces we'll need to create and manipulate .zfs * directories. Currently this is unused but available. */ void zfsctl_init(void) { avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name, sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node_name)); avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid, sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node_objsetid)); rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL); } /* * Cleanup the various pieces we needed for .zfs directories. In particular * ensure the expiry timer is canceled safely. 
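/*
 * Illustrative sketch (standalone C, not from the upstream sources): the
 * dual-index idea behind zfs_snapshots_by_name and zfs_snapshots_by_objsetid
 * created in zfsctl_init() above -- one entry, two comparators, so a mounted
 * snapshot can be found either by name (unmounts) or by objset id (NFS fid
 * lookups).  The struct and field names here are hypothetical stand-ins for
 * zfs_snapentry_t, and qsort() stands in for the AVL trees.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct snapentry_example {
	char		se_name[64];
	uint64_t	se_objsetid;
};

static int
compare_by_name(const void *a, const void *b)
{
	const struct snapentry_example *sa = a, *sb = b;
	int cmp = strcmp(sa->se_name, sb->se_name);

	return (cmp < 0 ? -1 : cmp > 0 ? 1 : 0);
}

static int
compare_by_objsetid(const void *a, const void *b)
{
	const struct snapentry_example *sa = a, *sb = b;

	if (sa->se_objsetid < sb->se_objsetid)
		return (-1);
	return (sa->se_objsetid > sb->se_objsetid ? 1 : 0);
}

int
main(void)
{
	struct snapentry_example snaps[3] = {
		{ "tank/fs@c", 300 },
		{ "tank/fs@a", 100 },
		{ "tank/fs@b", 200 },
	};

	qsort(snaps, 3, sizeof (snaps[0]), compare_by_objsetid);
	printf("lowest objsetid: %s\n", snaps[0].se_name);
	qsort(snaps, 3, sizeof (snaps[0]), compare_by_name);
	printf("first by name:   %s\n", snaps[0].se_name);
	return (0);
}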
*/ void zfsctl_fini(void) { avl_destroy(&zfs_snapshots_by_name); avl_destroy(&zfs_snapshots_by_objsetid); rw_destroy(&zfs_snapshot_lock); } module_param(zfs_admin_snapshot, int, 0644); MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot"); module_param(zfs_expire_snapshot, int, 0644); MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot"); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index 464c12e1108d..a1db5c57c18b 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -1,2130 +1,2129 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" enum { TOKEN_RO, TOKEN_RW, TOKEN_SETUID, TOKEN_NOSETUID, TOKEN_EXEC, TOKEN_NOEXEC, TOKEN_DEVICES, TOKEN_NODEVICES, TOKEN_DIRXATTR, TOKEN_SAXATTR, TOKEN_XATTR, TOKEN_NOXATTR, TOKEN_ATIME, TOKEN_NOATIME, TOKEN_RELATIME, TOKEN_NORELATIME, TOKEN_NBMAND, TOKEN_NONBMAND, TOKEN_MNTPOINT, TOKEN_LAST, }; static const match_table_t zpl_tokens = { { TOKEN_RO, MNTOPT_RO }, { TOKEN_RW, MNTOPT_RW }, { TOKEN_SETUID, MNTOPT_SETUID }, { TOKEN_NOSETUID, MNTOPT_NOSETUID }, { TOKEN_EXEC, MNTOPT_EXEC }, { TOKEN_NOEXEC, MNTOPT_NOEXEC }, { TOKEN_DEVICES, MNTOPT_DEVICES }, { TOKEN_NODEVICES, MNTOPT_NODEVICES }, { TOKEN_DIRXATTR, MNTOPT_DIRXATTR }, { TOKEN_SAXATTR, MNTOPT_SAXATTR }, { TOKEN_XATTR, MNTOPT_XATTR }, { TOKEN_NOXATTR, MNTOPT_NOXATTR }, { TOKEN_ATIME, MNTOPT_ATIME }, { TOKEN_NOATIME, MNTOPT_NOATIME }, { TOKEN_RELATIME, MNTOPT_RELATIME }, { TOKEN_NORELATIME, MNTOPT_NORELATIME }, { TOKEN_NBMAND, MNTOPT_NBMAND }, { TOKEN_NONBMAND, MNTOPT_NONBMAND }, { TOKEN_MNTPOINT, MNTOPT_MNTPOINT "=%s" }, { TOKEN_LAST, NULL }, }; static void zfsvfs_vfs_free(vfs_t *vfsp) { if (vfsp != NULL) { if (vfsp->vfs_mntpoint != NULL) kmem_strfree(vfsp->vfs_mntpoint); kmem_free(vfsp, sizeof (vfs_t)); } } static int zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp) { switch (token) { case TOKEN_RO: vfsp->vfs_readonly = B_TRUE; vfsp->vfs_do_readonly = B_TRUE; break; case TOKEN_RW: vfsp->vfs_readonly = B_FALSE; vfsp->vfs_do_readonly = B_TRUE; break; case TOKEN_SETUID: vfsp->vfs_setuid = B_TRUE; vfsp->vfs_do_setuid = B_TRUE; break; case 
TOKEN_NOSETUID: vfsp->vfs_setuid = B_FALSE; vfsp->vfs_do_setuid = B_TRUE; break; case TOKEN_EXEC: vfsp->vfs_exec = B_TRUE; vfsp->vfs_do_exec = B_TRUE; break; case TOKEN_NOEXEC: vfsp->vfs_exec = B_FALSE; vfsp->vfs_do_exec = B_TRUE; break; case TOKEN_DEVICES: vfsp->vfs_devices = B_TRUE; vfsp->vfs_do_devices = B_TRUE; break; case TOKEN_NODEVICES: vfsp->vfs_devices = B_FALSE; vfsp->vfs_do_devices = B_TRUE; break; case TOKEN_DIRXATTR: vfsp->vfs_xattr = ZFS_XATTR_DIR; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_SAXATTR: vfsp->vfs_xattr = ZFS_XATTR_SA; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_XATTR: vfsp->vfs_xattr = ZFS_XATTR_DIR; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_NOXATTR: vfsp->vfs_xattr = ZFS_XATTR_OFF; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_ATIME: vfsp->vfs_atime = B_TRUE; vfsp->vfs_do_atime = B_TRUE; break; case TOKEN_NOATIME: vfsp->vfs_atime = B_FALSE; vfsp->vfs_do_atime = B_TRUE; break; case TOKEN_RELATIME: vfsp->vfs_relatime = B_TRUE; vfsp->vfs_do_relatime = B_TRUE; break; case TOKEN_NORELATIME: vfsp->vfs_relatime = B_FALSE; vfsp->vfs_do_relatime = B_TRUE; break; case TOKEN_NBMAND: vfsp->vfs_nbmand = B_TRUE; vfsp->vfs_do_nbmand = B_TRUE; break; case TOKEN_NONBMAND: vfsp->vfs_nbmand = B_FALSE; vfsp->vfs_do_nbmand = B_TRUE; break; case TOKEN_MNTPOINT: vfsp->vfs_mntpoint = match_strdup(&args[0]); if (vfsp->vfs_mntpoint == NULL) return (SET_ERROR(ENOMEM)); break; default: break; } return (0); } /* * Parse the raw mntopts and return a vfs_t describing the options. */ static int zfsvfs_parse_options(char *mntopts, vfs_t **vfsp) { vfs_t *tmp_vfsp; int error; tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP); if (mntopts != NULL) { substring_t args[MAX_OPT_ARGS]; char *tmp_mntopts, *p, *t; int token; tmp_mntopts = t = kmem_strdup(mntopts); if (tmp_mntopts == NULL) return (SET_ERROR(ENOMEM)); while ((p = strsep(&t, ",")) != NULL) { if (!*p) continue; args[0].to = args[0].from = NULL; token = match_token(p, zpl_tokens, args); error = zfsvfs_parse_option(p, token, args, tmp_vfsp); if (error) { kmem_strfree(tmp_mntopts); zfsvfs_vfs_free(tmp_vfsp); return (error); } } kmem_strfree(tmp_mntopts); } *vfsp = tmp_vfsp; return (0); } boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs) { return (!!(zfsvfs->z_sb->s_flags & SB_RDONLY)); } int zfs_sync(struct super_block *sb, int wait, cred_t *cr) { (void) cr; zfsvfs_t *zfsvfs = sb->s_fs_info; /* * Semantically, the only requirement is that the sync be initiated. * The DMU syncs out txgs frequently, so there's nothing to do. */ if (!wait) return (0); if (zfsvfs != NULL) { /* * Sync a specific filesystem. */ dsl_pool_t *dp; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. */ if (spa_suspended(dp->dp_spa)) { zfs_exit(zfsvfs, FTAG); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); zfs_exit(zfsvfs, FTAG); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(1). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; struct super_block *sb = zfsvfs->z_sb; if (sb == NULL) return; /* * Update SB_NOATIME bit in VFS super block. 
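/*
 * Illustrative sketch (standalone C, not from the upstream sources): the
 * comma-splitting loop of zfsvfs_parse_options() above.  The kernel code
 * hands each token to match_token() against zpl_tokens; here plain
 * strcmp()/strncmp() stand in for that table lookup, and the vfs_t flags are
 * reduced to a few fields.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct opts_example {
	int	readonly;
	int	atime;
	char	mntpoint[128];
};

static void
parse_mntopts(const char *mntopts, struct opts_example *o)
{
	char *copy = strdup(mntopts);
	char *t = copy, *p;

	while ((p = strsep(&t, ",")) != NULL) {
		if (*p == '\0')
			continue;		/* skip empty tokens, as above */
		if (strcmp(p, "ro") == 0)
			o->readonly = 1;
		else if (strcmp(p, "rw") == 0)
			o->readonly = 0;
		else if (strcmp(p, "noatime") == 0)
			o->atime = 0;
		else if (strncmp(p, "mntpoint=", 9) == 0)
			snprintf(o->mntpoint, sizeof (o->mntpoint), "%s", p + 9);
	}
	free(copy);
}

int
main(void)
{
	struct opts_example o = { .atime = 1 };

	parse_mntopts("ro,noatime,mntpoint=/tank/fs", &o);
	printf("ro=%d atime=%d mntpoint=%s\n", o.readonly, o.atime, o.mntpoint);
	return (0);
}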
Since atime update is * determined by atime_needs_update(), atime_needs_update() needs to * return false if atime is turned off, and not unconditionally return * false if atime is turned on. */ if (newval) sb->s_flags &= ~SB_NOATIME; else sb->s_flags |= SB_NOATIME; } static void relatime_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_relatime = newval; } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == ZFS_XATTR_OFF) { zfsvfs->z_flags &= ~ZSB_XATTR; } else { zfsvfs->z_flags |= ZSB_XATTR; if (newval == ZFS_XATTR_SA) zfsvfs->z_xattr_sa = B_TRUE; else zfsvfs->z_xattr_sa = B_FALSE; } } static void acltype_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; switch (newval) { case ZFS_ACLTYPE_NFSV4: case ZFS_ACLTYPE_OFF: zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; break; case ZFS_ACLTYPE_POSIX: #ifdef CONFIG_FS_POSIX_ACL zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIX; zfsvfs->z_sb->s_flags |= SB_POSIXACL; #else zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; #endif /* CONFIG_FS_POSIX_ACL */ break; default: break; } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; struct super_block *sb = zfsvfs->z_sb; if (sb == NULL) return; if (newval) sb->s_flags |= SB_RDONLY; else sb->s_flags &= ~SB_RDONLY; } static void devices_changed_cb(void *arg, uint64_t newval) { } static void setuid_changed_cb(void *arg, uint64_t newval) { } static void exec_changed_cb(void *arg, uint64_t newval) { } static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; struct super_block *sb = zfsvfs->z_sb; if (sb == NULL) return; if (newval == TRUE) sb->s_flags |= SB_MANDLOCK; else sb->s_flags &= ~SB_MANDLOCK; } static void snapdir_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_show_ctldir = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_acl_inherit = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; int error = 0; ASSERT(vfsp); zfsvfs = vfsp->vfs_data; ASSERT(zfsvfs); os = zfsvfs->z_os; /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (zfs_is_readonly(zfsvfs) || !spa_writeable(dmu_objset_spa(os))) { vfsp->vfs_do_readonly = B_TRUE; vfsp->vfs_readonly = B_TRUE; } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? 
error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (vfsp->vfs_do_readonly) readonly_changed_cb(zfsvfs, vfsp->vfs_readonly); if (vfsp->vfs_do_setuid) setuid_changed_cb(zfsvfs, vfsp->vfs_setuid); if (vfsp->vfs_do_exec) exec_changed_cb(zfsvfs, vfsp->vfs_exec); if (vfsp->vfs_do_devices) devices_changed_cb(zfsvfs, vfsp->vfs_devices); if (vfsp->vfs_do_xattr) xattr_changed_cb(zfsvfs, vfsp->vfs_xattr); if (vfsp->vfs_do_atime) atime_changed_cb(zfsvfs, vfsp->vfs_atime); if (vfsp->vfs_do_relatime) relatime_changed_cb(zfsvfs, vfsp->vfs_relatime); if (vfsp->vfs_do_nbmand) nbmand_changed_cb(zfsvfs, vfsp->vfs_nbmand); return (0); unregister: dsl_prop_unregister_all(ds, zfsvfs); return (error); } /* * Takes a dataset, a property, a value and that value's setpoint as * found in the ZAP. Checks if the property has been changed in the vfs. * If so, val and setpoint will be overwritten with updated content. * Otherwise, they are left unchanged. */ int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint) { int error; zfsvfs_t *zfvp; vfs_t *vfsp; objset_t *os; uint64_t tmp = *val; error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) return (EINVAL); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); mutex_exit(&os->os_user_ptr_lock); if (zfvp == NULL) return (ESRCH); vfsp = zfvp->z_vfs; switch (zfs_prop) { case ZFS_PROP_ATIME: if (vfsp->vfs_do_atime) tmp = vfsp->vfs_atime; break; case ZFS_PROP_RELATIME: if (vfsp->vfs_do_relatime) tmp = vfsp->vfs_relatime; break; case ZFS_PROP_DEVICES: if (vfsp->vfs_do_devices) tmp = vfsp->vfs_devices; break; case ZFS_PROP_EXEC: if (vfsp->vfs_do_exec) tmp = vfsp->vfs_exec; break; case ZFS_PROP_SETUID: if (vfsp->vfs_do_setuid) tmp = vfsp->vfs_setuid; break; case ZFS_PROP_READONLY: if (vfsp->vfs_do_readonly) tmp = vfsp->vfs_readonly; break; case ZFS_PROP_XATTR: if (vfsp->vfs_do_xattr) tmp = vfsp->vfs_xattr; break; case ZFS_PROP_NBMAND: if (vfsp->vfs_do_nbmand) tmp = vfsp->vfs_nbmand; break; default: return (ENOENT); } if (tmp != *val) { if (setpoint) (void) strcpy(setpoint, "temporary"); *val = tmp; } return (0); } /* * Associate this zfsvfs with the given objset, which must be owned. * This will cache a bunch of on-disk state from the objset in the * zfsvfs. 
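/*
 * Illustrative sketch (standalone C, not from the upstream sources): the
 * "error = error ? error : next_call()" chaining used when registering the
 * dsl property callbacks above.  Once one registration fails the remaining
 * calls are skipped, the first error is preserved, and a single unregister
 * path undoes whatever was registered.  The functions here are hypothetical.
 */
#include <stdio.h>

static int
register_cb_example(const char *prop, int fail)
{
	printf("register %s\n", prop);
	return (fail);
}

int
main(void)
{
	int error = 0;

	error = register_cb_example("atime", 0);
	error = error ? error : register_cb_example("relatime", 0);
	error = error ? error : register_cb_example("xattr", 5);	/* fails */
	error = error ? error : register_cb_example("readonly", 0);	/* skipped */

	if (error)
		printf("unregister all, error=%d\n", error);
	return (0);
}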
*/ static int zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) { int error; uint64_t val; zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error != 0) return (error); if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printk("Can't mount a version %lld file system " "on a version %lld pool\n. Pool must be upgraded to mount " "this file system.\n", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); return (SET_ERROR(ENOTSUP)); } error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); if (error != 0) return (error); zfsvfs->z_norm = (int)val; error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); if (error != 0) return (error); zfsvfs->z_utf8 = (val != 0); error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); if (error != 0) return (error); zfsvfs->z_case = (uint_t)val; if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val)) != 0) return (error); zfsvfs->z_acl_type = (uint_t)val; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); uint64_t sa_obj = 0; if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0) return (error); error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); if ((error == 0) && (val == ZFS_XATTR_SA)) zfsvfs->z_xattr_sa = B_TRUE; } error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) return (error); ASSERT(zfsvfs->z_root != 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error == ENOENT) zfsvfs->z_userquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error == ENOENT) zfsvfs->z_groupquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 8, 1, &zfsvfs->z_projectquota_obj); if (error == ENOENT) zfsvfs->z_projectquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 8, 1, &zfsvfs->z_userobjquota_obj); if (error == ENOENT) zfsvfs->z_userobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 8, 1, &zfsvfs->z_groupobjquota_obj); if (error == ENOENT) zfsvfs->z_groupobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 8, 1, &zfsvfs->z_projectobjquota_obj); if (error == ENOENT) zfsvfs->z_projectobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error == ENOENT) zfsvfs->z_fuid_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if 
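/*
 * Illustrative sketch (standalone C, not from the upstream sources): the
 * repeated "missing is fine" pattern zfsvfs_init() above uses for the
 * optional master-node entries (quota objects, FUID table, shares dir) --
 * ENOENT just means the object doesn't exist yet, so 0 is stored and init
 * continues; any other error aborts.  lookup_example() is hypothetical.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int
lookup_example(const char *name, uint64_t *obj)
{
	/* Pretend only the userquota object exists on this dataset. */
	if (strcmp(name, "userquota") == 0) {
		*obj = 42;
		return (0);
	}
	return (ENOENT);
}

static int
lookup_optional(const char *name, uint64_t *obj)
{
	int error = lookup_example(name, obj);

	if (error == ENOENT) {
		*obj = 0;		/* feature not in use; not an error */
		return (0);
	}
	return (error);
}

int
main(void)
{
	uint64_t uq, pq;

	(void) lookup_optional("userquota", &uq);
	(void) lookup_optional("projectquota", &pq);
	printf("userquota obj=%llu projectquota obj=%llu\n",
	    (unsigned long long)uq, (unsigned long long)pq);
	return (0);
}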
(error == ENOENT) zfsvfs->z_shares_dir = 0; else if (error != 0) return (error); error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error != 0) return (error); if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); return (0); } int zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; int error; boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); if (error != 0) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } error = zfsvfs_create_impl(zfvp, zfsvfs, os); return (error); } /* * Note: zfsvfs is assumed to be malloc'd, and will be freed by this function * on a failure. Do not pass in a statically allocated zfsvfs. */ int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) { int error; zfsvfs->z_vfs = NULL; zfsvfs->z_sb = NULL; zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); ZFS_TEARDOWN_INIT(zfsvfs); rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1), ZFS_OBJ_MTX_MAX); zfsvfs->z_hold_size = size; zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, KM_SLEEP); zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); for (int i = 0; i != size; i++) { avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); } error = zfsvfs_init(zfsvfs, os); if (error != 0) { dmu_objset_disown(os, B_TRUE, zfsvfs); *zfvp = NULL; zfsvfs_free(zfsvfs); return (error); } zfsvfs->z_drain_task = TASKQID_INVALID; zfsvfs->z_draining = B_FALSE; zfsvfs->z_drain_cancel = B_TRUE; *zfvp = zfsvfs; return (0); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; boolean_t readonly = zfs_is_readonly(zfsvfs); error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); if (error) return (error); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, &zfsvfs->z_kstat.dk_zil_sums); /* * During replay we remove the read only flag to * allow replays to succeed. */ if (readonly != 0) { readonly_changed_cb(zfsvfs, B_FALSE); } else { zap_stats_t zs; if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, &zs) == 0) { dataset_kstats_update_nunlinks_kstat( &zfsvfs->z_kstat, zs.zs_num_entries); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "num_entries in unlinked set: %llu", zs.zs_num_entries); } zfs_unlinked_drain(zfsvfs); dsl_dir_t *dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dd->dd_activity_cancelled = B_FALSE; } /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) 
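/*
 * Illustrative sketch (standalone C, not from the upstream sources): the
 * z_hold_size computation in zfsvfs_create_impl() above, which rounds the
 * zfs_object_mutex_size tunable down to a power of two and caps it at
 * ZFS_OBJ_MTX_MAX.  highbit64() returns the 1-based index of the highest set
 * bit; the cap value below is only a placeholder.
 */
#include <stdint.h>
#include <stdio.h>

#define	EXAMPLE_MTX_MAX	1024	/* stand-in for ZFS_OBJ_MTX_MAX */
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

static int
highbit64(uint64_t v)
{
	int bit = 0;

	while (v != 0) {
		bit++;
		v >>= 1;
	}
	return (bit);
}

int
main(void)
{
	uint64_t tunable = 100;		/* e.g. zfs_object_mutex_size */
	int size = MIN(1 << (highbit64(tunable) - 1), EXAMPLE_MTX_MAX);

	printf("%d\n", size);		/* 64: largest power of two <= 100 */
	return (0);
}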
This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; } } /* restore readonly bit */ if (readonly != 0) readonly_changed_cb(zfsvfs, B_TRUE); } else { ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, &zfsvfs->z_kstat.dk_zil_sums); } /* * Set the objset user_ptr to track its zfsvfs. */ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); return (0); } void zfsvfs_free(zfsvfs_t *zfsvfs) { int i, size = zfsvfs->z_hold_size; zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); ZFS_TEARDOWN_DESTROY(zfsvfs); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != size; i++) { avl_destroy(&zfsvfs->z_hold_trees[i]); mutex_destroy(&zfsvfs->z_hold_locks[i]); } vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); zfsvfs_vfs_free(zfsvfs->z_vfs); dataset_kstats_destroy(&zfsvfs->z_kstat); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; if (!dmu_objset_is_snapshot(os)) dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } #ifdef HAVE_MLSLABEL /* * Check that the hex label string is appropriate for the dataset being * mounted into the global_zone proper. * * Return an error if the hex label string is not default or * admin_low/admin_high. For admin_low labels, the corresponding * dataset must be readonly. */ int zfs_check_global_label(const char *dsname, const char *hexsl) { if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); if (strcasecmp(hexsl, ADMIN_HIGH) == 0) return (0); if (strcasecmp(hexsl, ADMIN_LOW) == 0) { /* must be readonly */ uint64_t rdonly; if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) return (SET_ERROR(EACCES)); return (rdonly ? 
0 : SET_ERROR(EACCES)); } return (SET_ERROR(EACCES)); } #endif /* HAVE_MLSLABEL */ static int zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp, uint32_t bshift) { char buf[20 + DMU_OBJACCT_PREFIX_LEN]; uint64_t offset = DMU_OBJACCT_PREFIX_LEN; uint64_t quota; uint64_t used; int err; strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1); err = zfs_id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset, sizeof (buf) - offset, B_FALSE); if (err) return (err); if (zfsvfs->z_projectquota_obj == 0) goto objs; err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj, buf + offset, 8, 1, "a); if (err == ENOENT) goto objs; else if (err) return (err); err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, buf + offset, 8, 1, &used); if (unlikely(err == ENOENT)) { uint32_t blksize; u_longlong_t nblocks; /* * Quota accounting is async, so it is possible race case. * There is at least one object with the given project ID. */ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); if (unlikely(zp->z_blksz == 0)) blksize = zfsvfs->z_max_blksz; used = blksize * nblocks; } else if (err) { return (err); } statp->f_blocks = quota >> bshift; statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0; statp->f_bavail = statp->f_bfree; objs: if (zfsvfs->z_projectobjquota_obj == 0) return (0); err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj, buf + offset, 8, 1, "a); if (err == ENOENT) return (0); else if (err) return (err); err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, buf, 8, 1, &used); if (unlikely(err == ENOENT)) { /* * Quota accounting is async, so it is possible race case. * There is at least one object with the given project ID. */ used = 1; } else if (err) { return (err); } statp->f_files = quota; statp->f_ffree = (quota > used) ? (quota - used) : 0; return (0); } int zfs_statvfs(struct inode *ip, struct kstatfs *statp) { zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t refdbytes, availbytes, usedobjs, availobjs; int err = 0; if ((err = zfs_enter(zfsvfs, FTAG)) != 0) return (err); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); uint64_t fsid = dmu_objset_fsid_guid(zfsvfs->z_os); /* * The underlying storage pool actually uses multiple block * size. Under Solaris frsize (fragment size) is reported as * the smallest block size we support, and bsize (block size) * as the filesystem's maximum block size. Unfortunately, * under Linux the fragment size and block size are often used * interchangeably. Thus we are forced to report both of them * as the filesystem's maximum block size. */ statp->f_frsize = zfsvfs->z_max_blksz; statp->f_bsize = zfsvfs->z_max_blksz; uint32_t bshift = fls(statp->f_bsize) - 1; /* * The following report "total" blocks of various kinds in * the file system, but reported in terms of f_bsize - the * "preferred" size. */ /* Round up so we never have a filesystem using 0 blocks. */ refdbytes = P2ROUNDUP(refdbytes, statp->f_bsize); statp->f_blocks = (refdbytes + availbytes) >> bshift; statp->f_bfree = availbytes >> bshift; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of objects available * and the number of blocks (each object will take at least a block). 
*/ statp->f_ffree = MIN(availobjs, availbytes >> DNODE_SHIFT); statp->f_files = statp->f_ffree + usedobjs; statp->f_fsid.val[0] = (uint32_t)fsid; statp->f_fsid.val[1] = (uint32_t)(fsid >> 32); statp->f_type = ZFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; /* * We have all of 40 characters to stuff a string here. * Is there anything useful we could/should provide? */ memset(statp->f_spare, 0, sizeof (statp->f_spare)); if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && dmu_objset_projectquota_present(zfsvfs->z_os)) { znode_t *zp = ITOZ(ip); if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid && zpl_is_valid_projid(zp->z_projid)) err = zfs_statfs_project(zfsvfs, zp, statp, bshift); } zfs_exit(zfsvfs, FTAG); return (err); } static int zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) { znode_t *rootzp; int error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *ipp = ZTOI(rootzp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Linux kernels older than 3.1 do not support a per-filesystem shrinker. * To accommodate this we must improvise and manually walk the list of znodes * attempting to prune dentries in order to be able to drop the inodes. * * To avoid scanning the same znodes multiple times they are always rotated * to the end of the z_all_znodes list. New znodes are inserted at the * end of the list so we're always scanning the oldest znodes first. */ static int zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) { znode_t **zp_array, *zp; int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); int objects = 0; int i = 0, j = 0; zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); mutex_enter(&zfsvfs->z_znodes_lock); while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { if ((i++ > nr_to_scan) || (j >= max_array)) break; ASSERT(list_link_active(&zp->z_link_node)); list_remove(&zfsvfs->z_all_znodes, zp); list_insert_tail(&zfsvfs->z_all_znodes, zp); /* Skip active znodes and .zfs entries */ if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) continue; if (igrab(ZTOI(zp)) == NULL) continue; zp_array[j] = zp; j++; } mutex_exit(&zfsvfs->z_znodes_lock); for (i = 0; i < j; i++) { zp = zp_array[i]; ASSERT3P(zp, !=, NULL); d_prune_aliases(ZTOI(zp)); if (atomic_read(&ZTOI(zp)->i_count) == 1) objects++; zrele(zp); } vmem_free(zp_array, max_array * sizeof (znode_t *)); return (objects); } /* * The ARC has requested that the filesystem drop entries from the dentry * and inode caches. This can occur when the ARC needs to free meta data * blocks but can't because they are all pinned by entries in these caches. 
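/*
 * Illustrative sketch (standalone C, not from the upstream sources): the
 * statvfs arithmetic above.  With f_bsize set to the dataset's maximum block
 * size, byte counts become "blocks" by shifting right by
 * bshift = fls(f_bsize) - 1, and f_ffree is capped by both the available
 * objects and the space available at one dnode (1 << DNODE_SHIFT bytes) per
 * object.  DNODE_SHIFT is assumed to be 9 here.
 */
#include <stdint.h>
#include <stdio.h>

#define	EXAMPLE_DNODE_SHIFT	9
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

static int
fls_example(uint64_t v)		/* 1-based index of the highest set bit */
{
	int bit = 0;

	while (v != 0) {
		bit++;
		v >>= 1;
	}
	return (bit);
}

int
main(void)
{
	uint64_t bsize = 131072;			/* recordsize=128K */
	uint64_t refdbytes = 10ULL << 30;		/* 10 GiB referenced */
	uint64_t availbytes = 90ULL << 30;		/* 90 GiB available */
	uint64_t availobjs = 1ULL << 40;
	uint32_t bshift = fls_example(bsize) - 1;	/* 17 for 128K */

	printf("f_blocks=%llu f_bfree=%llu f_ffree=%llu\n",
	    (unsigned long long)((refdbytes + availbytes) >> bshift),
	    (unsigned long long)(availbytes >> bshift),
	    (unsigned long long)MIN(availobjs,
	    availbytes >> EXAMPLE_DNODE_SHIFT));
	return (0);
}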
*/ int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) { zfsvfs_t *zfsvfs = sb->s_fs_info; int error = 0; struct shrinker *shrinker = &sb->s_shrink; struct shrink_control sc = { .nr_to_scan = nr_to_scan, .gfp_mask = GFP_KERNEL, }; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); #if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \ defined(SHRINK_CONTROL_HAS_NID) && \ defined(SHRINKER_NUMA_AWARE) if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) { *objects = 0; for_each_online_node(sc.nid) { *objects += (*shrinker->scan_objects)(shrinker, &sc); /* * reset sc.nr_to_scan, modified by * scan_objects == super_cache_scan */ sc.nr_to_scan = nr_to_scan; } } else { *objects = (*shrinker->scan_objects)(shrinker, &sc); } #elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) *objects = (*shrinker->scan_objects)(shrinker, &sc); #elif defined(HAVE_SINGLE_SHRINKER_CALLBACK) *objects = (*shrinker->shrink)(shrinker, &sc); #elif defined(HAVE_D_PRUNE_ALIASES) #define D_PRUNE_ALIASES_IS_DEFAULT *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); #else #error "No available dentry and inode cache pruning mechanism." #endif #if defined(HAVE_D_PRUNE_ALIASES) && !defined(D_PRUNE_ALIASES_IS_DEFAULT) #undef D_PRUNE_ALIASES_IS_DEFAULT /* * Fall back to zfs_prune_aliases if the kernel's per-superblock * shrinker couldn't free anything, possibly due to the inodes being * allocated in a different memcg. */ if (*objects == 0) *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); #endif zfs_exit(zfsvfs, FTAG); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "pruning, nr_to_scan=%lu objects=%d error=%d\n", nr_to_scan, *objects, error); return (error); } /* * Teardown the zfsvfs_t. * * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; zfs_unlinked_drain_stop_wait(zfsvfs); /* * If someone has not already unmounted this file system, * drain the zrele_taskq to ensure all active references to the * zfsvfs_t have been handled only then can it be safely destroyed. */ if (zfsvfs->z_os) { /* * If we're unmounting we have to wait for the list to * drain completely. * * If we're not unmounting there's no guarantee the list * will drain completely, but iputs run from the taskq * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * - * We can safely read z_nr_znodes without locking because the - * VFS has already blocked operations which add to the - * z_all_znodes list and thus increment z_nr_znodes. + * We can safely check z_all_znodes for being empty because the + * VFS has already blocked operations which add to it. */ int round = 0; - while (zfsvfs->z_nr_znodes > 0) { + while (!list_is_empty(&zfsvfs->z_all_znodes)) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) break; } } ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); if (!unmounting) { /* * We purge the parent filesystem's super block as the * parent filesystem and all of its snapshots have their * inode's super block set to the parent's filesystem's * super block. Note, 'z_parent' is self referential * for non-snapshots. */ shrink_dcache_sb(zfsvfs->z_parent->z_sb); } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. 
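/*
 * Illustrative sketch (standalone C, not from the upstream sources): the
 * drain loop in zfsvfs_teardown() above, as changed by this hunk -- it now
 * keys off the emptiness of the z_all_znodes list rather than the old
 * z_nr_znodes counter.  The list and wait primitives below are trivial
 * stand-ins for list_is_empty() and taskq_wait_outstanding().
 */
#include <stdio.h>

static int pending_znodes = 3;		/* stand-in for the znode list */

static int
list_is_empty_example(void)
{
	return (pending_znodes == 0);
}

static void
taskq_wait_example(void)
{
	/* Each pass lets queued zrele work release one more znode. */
	if (pending_znodes > 0)
		pending_znodes--;
}

static void
drain_znodes(int unmounting)
{
	int round = 0;

	while (!list_is_empty_example()) {
		taskq_wait_example();
		/* When not unmounting, one extra round is all we wait. */
		if (++round > 1 && !unmounting)
			break;
	}
}

int
main(void)
{
	drain_znodes(1);
	printf("remaining=%d\n", pending_znodes);
	return (0);
}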
*/ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { rw_exit(&zfsvfs->z_teardown_inactive_lock); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); return (SET_ERROR(EIO)); } /* * At this point there are no VFS ops active, and any new VFS ops * will fail with EIO since we have z_teardown_lock for writer (only * relevant for forced unmount). * * Release all holds on dbufs. We also grab an extra reference to all * the remaining inodes so that the kernel does not attempt to free * any inodes of a suspended fs. This can cause deadlocks since the * zfs_resume_fs() process may involve starting threads, which might * attempt to free unreferenced inodes to free up memory for the new * thread. */ if (!unmounting) { mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) { if (zp->z_sa_hdl) zfs_znode_dmu_fini(zp); if (igrab(ZTOI(zp)) != NULL) zp->z_suspended = B_TRUE; } mutex_exit(&zfsvfs->z_znodes_lock); } /* * If we are unmounting, set the unmounted flag and let new VFS ops * unblock. zfs_inactive will have the unmounted behavior, and all * other VFS ops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; rw_exit(&zfsvfs->z_teardown_inactive_lock); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data. We must write out any dirty data before * disowning the dataset. */ objset_t *os = zfsvfs->z_os; boolean_t os_dirty = B_FALSE; for (int t = 0; t < TXG_SIZE; t++) { if (dmu_objset_is_dirty(os, t)) { os_dirty = B_TRUE; break; } } if (!zfs_is_readonly(zfsvfs) && os_dirty) { txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); } dmu_objset_evict_dbufs(zfsvfs->z_os); dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; dsl_dir_cancel_waiters(dd); return (0); } #if defined(HAVE_SUPER_SETUP_BDI_NAME) atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0); #endif int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) { const char *osname = zm->mnt_osname; struct inode *root_inode = NULL; uint64_t recordsize; int error = 0; zfsvfs_t *zfsvfs = NULL; vfs_t *vfs = NULL; int canwrite; int dataset_visible_zone; ASSERT(zm); ASSERT(osname); dataset_visible_zone = zone_dataset_visible(osname, &canwrite); /* * Refuse to mount a filesystem if we are in a namespace and the * dataset is not visible or writable in that namespace. */ if (!INGLOBALZONE(curproc) && (!dataset_visible_zone || !canwrite)) { return (SET_ERROR(EPERM)); } error = zfsvfs_parse_options(zm->mnt_data, &vfs); if (error) return (error); /* * If a non-writable filesystem is being mounted without the * read-only flag, pretend it was set, as done for snapshots. 
*/ if (!canwrite) vfs->vfs_readonly = true; error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs); if (error) { zfsvfs_vfs_free(vfs); goto out; } if ((error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL))) { zfsvfs_vfs_free(vfs); goto out; } vfs->vfs_data = zfsvfs; zfsvfs->z_vfs = vfs; zfsvfs->z_sb = sb; sb->s_fs_info = zfsvfs; sb->s_magic = ZFS_SUPER_MAGIC; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; sb->s_blocksize = recordsize; sb->s_blocksize_bits = ilog2(recordsize); error = -zpl_bdi_setup(sb, "zfs"); if (error) goto out; sb->s_bdi->ra_pages = 0; /* Set callback operations for the file system. */ sb->s_op = &zpl_super_operations; sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; /* Set features for file system. */ zfs_set_fuid_feature(zfsvfs); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if ((error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))) goto out; xattr_changed_cb(zfsvfs, pval); if ((error = dsl_prop_get_integer(osname, "acltype", &pval, NULL))) goto out; acltype_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; zfsvfs->z_snap_defer_time = jiffies; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) goto out; } /* Allocate a root inode for the filesystem. */ error = zfs_root(zfsvfs, &root_inode); if (error) { (void) zfs_umount(sb); zfsvfs = NULL; /* avoid double-free; first in zfs_umount */ goto out; } /* Allocate a root dentry for the filesystem */ sb->s_root = d_make_root(root_inode); if (sb->s_root == NULL) { (void) zfs_umount(sb); zfsvfs = NULL; /* avoid double-free; first in zfs_umount */ error = SET_ERROR(ENOMEM); goto out; } if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb); out: if (error) { if (zfsvfs != NULL) { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } /* * make sure we don't have dangling sb->s_fs_info which * zfs_preumount will use. */ sb->s_fs_info = NULL; } return (error); } /* * Called when an unmount is requested and certain sanity checks have * already passed. At this point no dentries or inodes have been reclaimed * from their respective caches. We drop the extra reference on the .zfs * control directory to allow everything to be reclaimed. All snapshots * must already have been unmounted to reach this point. */ void zfs_preumount(struct super_block *sb) { zfsvfs_t *zfsvfs = sb->s_fs_info; /* zfsvfs is NULL when zfs_domount fails during mount */ if (zfsvfs) { zfs_unlinked_drain_stop_wait(zfsvfs); zfsctl_destroy(sb->s_fs_info); /* * Wait for zrele_async before entering evict_inodes in * generic_shutdown_super. The reason we must finish before * evict_inodes is when lazytime is on, or when zfs_purgedir * calls zfs_zget, zrele would bump i_count from 0 to 1. This * would race with the i_count check in evict_inodes. This means * it could destroy the inode while we are still using it. * * We wait for two passes. xattr directories in the first pass * may add xattr entries in zfs_purgedir, so in the second pass * we wait for them. We don't use taskq_wait here because it is * a pool wide taskq. Other mounted filesystems can constantly * do zrele_async and there's no guarantee when taskq will be * empty. 
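/*
 * Illustrative sketch (standalone C, not from the upstream sources): the
 * ownership hand-off in zfs_domount()'s error paths above.  Once zfs_umount()
 * has torn the zfsvfs down, the local pointer is set to NULL so the shared
 * error exit doesn't free it a second time.  All names here are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

struct ctx_example {
	int	cx_dummy;
};

static void
teardown_example(struct ctx_example *ctx)
{
	free(ctx);		/* full teardown owns and frees the context */
}

static int
mount_example(int fail_late)
{
	struct ctx_example *ctx = calloc(1, sizeof (*ctx));
	int error = 0;

	if (ctx == NULL)
		return (-1);

	if (fail_late) {
		teardown_example(ctx);
		ctx = NULL;	/* ownership gone; avoid a double free below */
		error = -1;
		goto out;
	}

	/* On success the context would be handed off to the caller here. */
out:
	if (error != 0 && ctx != NULL)
		free(ctx);	/* only free what teardown didn't already */
	return (error);
}

int
main(void)
{
	printf("mount_example: %d\n", mount_example(1));
	return (0);
}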
*/ taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); } } /* * Called once all other unmount released tear down has occurred. * It is our responsibility to release any remaining infrastructure. */ int zfs_umount(struct super_block *sb) { zfsvfs_t *zfsvfs = sb->s_fs_info; objset_t *os; if (zfsvfs->z_arc_prune != NULL) arc_remove_prune_callback(zfsvfs->z_arc_prune); VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); os = zfsvfs->z_os; zpl_bdi_destroy(sb); /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, B_TRUE, zfsvfs); } zfsvfs_free(zfsvfs); sb->s_fs_info = NULL; return (0); } int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm) { zfsvfs_t *zfsvfs = sb->s_fs_info; vfs_t *vfsp; boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os); int error; if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) && !(*flags & SB_RDONLY)) { *flags |= SB_RDONLY; return (EROFS); } error = zfsvfs_parse_options(zm->mnt_data, &vfsp); if (error) return (error); if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); zfs_unregister_callbacks(zfsvfs); zfsvfs_vfs_free(zfsvfs->z_vfs); vfsp->vfs_data = zfsvfs; zfsvfs->z_vfs = vfsp; if (!issnap) (void) zfs_register_callbacks(vfsp); return (error); } int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) { zfsvfs_t *zfsvfs = sb->s_fs_info; znode_t *zp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *ipp = NULL; if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { return (SET_ERROR(EINVAL)); } /* LONG_FID_LEN means snapdirs */ if (fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); if (objsetid != ZFSCTL_INO_SNAPDIRS - object) { dprintf("snapdir fid: objsetid (%llu) != " "ZFSCTL_INO_SNAPDIRS (%llu) - object (%llu)\n", objsetid, ZFSCTL_INO_SNAPDIRS, object); return (SET_ERROR(EINVAL)); } if (fid_gen > 1 || setgen != 0) { dprintf("snapdir fid: fid_gen (%llu) and setgen " "(%llu)\n", fid_gen, setgen); return (SET_ERROR(EINVAL)); } return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp)); } if ((err = zfs_enter(zfsvfs, FTAG)) != 0) return (err); /* A zero fid_gen means we are in the .zfs control directories */ if (fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { *ipp = zfsvfs->z_ctldir; ASSERT(*ipp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, 0, kcred, NULL, NULL) == 0); } else { /* * Must have an existing ref, so igrab() * cannot return NULL */ VERIFY3P(igrab(*ipp), !=, NULL); } zfs_exit(zfsvfs, FTAG); return (0); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%llu mask %llx]\n", object, 
fid_gen, gen_mask); if ((err = zfs_zget(zfsvfs, object, &zp))) { zfs_exit(zfsvfs, FTAG); return (err); } /* Don't export xattr stuff */ if (zp->z_pflags & ZFS_XATTR) { zrele(zp); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOENT)); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if ((fid_gen == 0) && (zfsvfs->z_root == object)) fid_gen = zp_gen; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen, fid_gen); zrele(zp); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOENT)); } *ipp = ZTOI(zp); if (*ipp) zfs_znode_update_vfs(ITOZ(*ipp)); zfs_exit(zfsvfs, FTAG); return (0); } /* * Block out VFS ops and close zfsvfs_t * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying * dataset and objset intact so that they can be atomically handed off during * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); return (0); } /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the * filesystem was suspended. Whether it succeeded or failed, the preconditions * are the same: the relevant objset and associated dataset are owned by * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { int err, err2; znode_t *zp; ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just update the objset_t, as the one we * had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; ds->ds_dir->dd_activity_cancelled = B_FALSE; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); zfs_set_fuid_feature(zfsvfs); zfsvfs->z_rollback_time = jiffies; /* * Attempt to re-establish all the active inodes with their * dbufs. If a zfs_rezget() fails, then we unhash the inode * and mark it stale. This prevents a collision if a new * inode/object is created which must use the same inode * number. The stale inode will be be released when the * VFS prunes the dentry holding the remaining references * on the stale inode. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { err2 = zfs_rezget(zp); if (err2) { zpl_d_drop_aliases(ZTOI(zp)); remove_inode_hash(ZTOI(zp)); } /* see comment in zfs_suspend_fs() */ if (zp->z_suspended) { zfs_zrele_async(zp); zp->z_suspended = B_FALSE; } } mutex_exit(&zfsvfs->z_znodes_lock); if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) { /* * zfs_suspend_fs() could have interrupted freeing * of dnodes. We need to restart this freeing so * that we don't "leak" the space. */ zfs_unlinked_drain(zfsvfs); } /* * Most of the time zfs_suspend_fs is used for changing the contents * of the underlying dataset. ZFS rollback and receive operations * might create files for which negative dentries are present in * the cache. 
Since walking the dcache would require a lot of GPL-only * code duplication, it's much easier on these rather rare occasions * just to flush the whole dcache for the given dataset/filesystem. */ shrink_dcache_sb(zfsvfs->z_sb); bail: if (err != 0) zfsvfs->z_unmounted = B_TRUE; /* release the VFS ops */ rw_exit(&zfsvfs->z_teardown_inactive_lock); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); if (err != 0) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. */ if (zfsvfs->z_os) (void) zfs_umount(zfsvfs->z_sb); } return (err); } /* * Release VOPs and unmount a suspended filesystem. */ int zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just hold and rele it to update the * objset_t, as the one we had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); zfsvfs->z_os = os; /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); /* * Try to force unmount this file system. */ (void) zfs_umount(zfsvfs->z_sb); zfsvfs->z_unmounted = B_TRUE; return (0); } /* * Automounted snapshots rely on periodic revalidation * to defer snapshots from being automatically unmounted. */ inline void zfs_exit_fs(zfsvfs_t *zfsvfs) { if (!zfsvfs->z_issnap) return; if (time_after(jiffies, zfsvfs->z_snap_defer_time + MAX(zfs_expire_snapshot * HZ / 2, HZ))) { zfsvfs->z_snap_defer_time = jiffies; zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa, dmu_objset_id(zfsvfs->z_os), zfs_expire_snapshot); } } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) return (SET_ERROR(EINVAL)); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY(0 == sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, "from %llu to %llu", zfsvfs->z_version, newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; os->os_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. * If this function returns true we know VFS unmount has been initiated. 
*/ boolean_t zfs_get_vfs_flag_unmounted(objset_t *os) { zfsvfs_t *zfvp; boolean_t unmounted = B_FALSE; ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); if (zfvp != NULL && zfvp->z_unmounted) unmounted = B_TRUE; mutex_exit(&os->os_user_ptr_lock); return (unmounted); } void zfsvfs_update_fromname(const char *oldname, const char *newname) { /* * We don't need to do anything here, the devname is always current by * virtue of zfsvfs->z_sb->s_op->show_devname. */ (void) oldname, (void) newname; } void zfs_init(void) { zfsctl_init(); zfs_znode_init(); dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); register_filesystem(&zpl_fs_type); #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND register_fo_extend(&zpl_file_operations); #endif } void zfs_fini(void) { /* * we don't use outstanding because zpl_posix_acl_free might add more. */ taskq_wait(system_delay_taskq); taskq_wait(system_taskq); #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND unregister_fo_extend(&zpl_file_operations); #endif unregister_filesystem(&zpl_fs_type); zfs_znode_fini(); zfsctl_fini(); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_suspend_fs); EXPORT_SYMBOL(zfs_resume_fs); EXPORT_SYMBOL(zfs_set_version); EXPORT_SYMBOL(zfsvfs_create); EXPORT_SYMBOL(zfsvfs_free); EXPORT_SYMBOL(zfs_is_readonly); EXPORT_SYMBOL(zfs_domount); EXPORT_SYMBOL(zfs_preumount); EXPORT_SYMBOL(zfs_umount); EXPORT_SYMBOL(zfs_remount); EXPORT_SYMBOL(zfs_statvfs); EXPORT_SYMBOL(zfs_vget); EXPORT_SYMBOL(zfs_prune); #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 234c4d5ef0e0..33baac9db06b 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -1,4232 +1,4232 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. 
To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using zfs_enter(zfsvfs). * A zfs_exit(zfsvfs) is needed before all returns. Any znodes * must be checked with zfs_verify_zp(zp). Both of these macros * can return EIO from the calling function. * * (2) zrele() should always be the last thing except for zil_commit() (if * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the * last reference, the vnode/znode can be freed, so the zp may point to * freed memory. Second, the last reference will call zfs_zinactive(), * which may induce a lot of work -- pushing cached pages (which acquires * range locks) and syncing out cached atime changes. Third, * zfs_zinactive() may require a new tx, which could deadlock the system * if you were already holding one. This deadlock occurs because the tx * currently being operated on prevents a txg from syncing, which * prevents the new tx from progressing, resulting in a deadlock. If you * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() * is a synonym for zrele(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. 
* * In general, this is how things should be ordered in each vnode op: * * zfs_enter(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * zfs_exit(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * zil_commit(zilog, foid); // synchronous when necessary * zfs_exit(zfsvfs); // finished in zfs * return (error); // done, report error */ int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* Honor ZFS_APPENDONLY file attribute */ - if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } /* Keep a count of the synchronous opens in the znode */ if (flag & O_SYNC) atomic_inc_32(&zp->z_sync_cnt); zfs_exit(zfsvfs, FTAG); return (0); } int zfs_close(struct inode *ip, int flag, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* Decrement the synchronous opens in the znode */ if (flag & O_SYNC) atomic_dec_32(&zp->z_sync_cnt); zfs_exit(zfsvfs, FTAG); return (0); } #if defined(_KERNEL) static int zfs_fillpage(struct inode *ip, struct page *pp); /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. Update all mapped * pages with the contents of the coresponding dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { struct address_space *mp = ZTOI(zp)->i_mapping; int64_t off = start & (PAGE_SIZE - 1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { uint64_t nbytes = MIN(PAGE_SIZE - off, len); struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); void *pb = kmap(pp); int error = dmu_read(os, zp->z_id, start + off, nbytes, pb + off, DMU_READ_PREFETCH); kunmap(pp); if (error) { SetPageError(pp); ClearPageUptodate(pp); } else { ClearPageError(pp); SetPageUptodate(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); } unlock_page(pp); put_page(pp); } len -= nbytes; off = 0; } } /* * When a file is memory mapped, we must keep the I/O data synchronized * between the DMU cache and the memory mapped pages. Preferentially read * from memory mapped pages, otherwise fallback to reading through the dmu. 
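 *
 * For example (illustrative numbers, assuming 4 KiB pages): a 6000 byte
 * read starting at file offset 1000 is serviced in two chunks, the first
 * PAGE_SIZE - 1000 = 3096 bytes against page 0 and the remaining 2904
 * bytes against page 1, each chunk copied either from an up-to-date page
 * cache page or read through the dmu.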
*/ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; int64_t start = uio->uio_loffset; int64_t off = start & (PAGE_SIZE - 1); int len = nbytes; int error = 0; for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { uint64_t bytes = MIN(PAGE_SIZE - off, len); struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { /* * If filemap_fault() retries there exists a window * where the page will be unlocked and not up to date. * In this case we must try and fill the page. */ if (unlikely(!PageUptodate(pp))) { error = zfs_fillpage(ip, pp); if (error) { unlock_page(pp); put_page(pp); return (error); } } ASSERT(PageUptodate(pp) || PageDirty(pp)); unlock_page(pp); void *pb = kmap(pp); error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); put_page(pp); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); } len -= bytes; off = 0; if (error) break; } return (error); } #endif /* _KERNEL */ static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; /* * Write the bytes to a file. * * IN: zp - znode of file to be written to * data - bytes to write * len - number of bytes to write * pos - offset to start writing at * * OUT: resid - remaining bytes to write * * RETURN: 0 if success * positive error code if failure. EIO is returned * for a short write when residp isn't provided. * * Timestamps: * zp - ctime|mtime updated if byte count > 0 */ int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *residp) { fstrans_cookie_t cookie; int error; struct iovec iov; iov.iov_base = (void *)data; iov.iov_len = len; zfs_uio_t uio; zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); cookie = spl_fstrans_mark(); error = zfs_write(zp, &uio, 0, kcred); spl_fstrans_unmark(cookie); if (error == 0) { if (residp != NULL) *residp = zfs_uio_resid(&uio); else if (zfs_uio_resid(&uio) != 0) error = SET_ERROR(EIO); } return (error); } static void zfs_rele_async_task(void *arg) { iput(arg); } void zfs_zrele_async(znode_t *zp) { struct inode *ip = ZTOI(zp); objset_t *os = ITOZSB(ip)->z_os; ASSERT(atomic_read(&ip->i_count) > 0); ASSERT(os != NULL); /* * If decrementing the count would put us at 0, we can't do it inline * here, because that would be synchronous. Instead, dispatch an iput * to run later. * * For more information on the dangers of a synchronous iput, see the * header comment of this file. */ if (!atomic_add_unless(&ip->i_count, -1, 1)) { VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held inode reference for it. * * IN: zdp - znode of directory to search. * nm - name of entry to lookup. * flags - LOOKUP_XATTR set if looking for an attribute. * cr - credentials of caller. * direntflags - directory lookup flags * realpnp - returned pathname. * * OUT: zpp - znode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ int zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ZTOZSB(zdp); int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. 
This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { if (!S_ISDIR(ZTOI(zdp)->i_mode)) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { error = zfs_fastaccesschk_execute(zdp, cr); if (!error) { *zpp = zdp; zhold(*zpp); return (0); } return (error); } } if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) return (error); *zpp = NULL; if (flags & LOOKUP_XATTR) { /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { zfs_exit(zfsvfs, FTAG); return (error); } /* * Do we have permission to get into attribute directory? */ if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, B_TRUE, cr, zfs_init_idmap))) { zrele(*zpp); *zpp = NULL; } zfs_exit(zfsvfs, FTAG); return (error); } if (!S_ISDIR(ZTOI(zdp)->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTDIR)); } /* * Check accessibility of directory. */ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); if ((error == 0) && (*zpp)) zfs_znode_update_vfs(*zpp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the ip of the created or trunc'd file. * * IN: dzp - znode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - file flag. * vsecp - ACL to be set * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated if new entry created * zp - ctime|mtime always, atime if new */ int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; objset_t *os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } top: *zpp = NULL; if (*name == '\0') { /* * Null component name refers to the directory itself. */ zhold(dzp); zp = dzp; dl = NULL; error = 0; } else { /* possible igrab(zp) */ int zflg = 0; if (flag & FIGNORECASE) zflg |= ZCILOOK; error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { if (have_acl) zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = SET_ERROR(EISDIR); zfs_exit(zfsvfs, FTAG); return (error); } } if (zp == NULL) { uint64_t txtype; uint64_t projid = ZFS_DEFAULT_PROJID; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr, mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { if (have_acl) zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EINVAL); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { /* * Since, we failed to add the directory entry for it, * delete the newly created dnode. */ zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & O_APPEND) ? V_APPEND : 0; if (have_acl) zfs_acl_ids_free(&acl_ids); /* * A directory entry already exists for this name. */ /* * Can't truncate an existing file if in exclusive mode. 
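	 * An O_CREAT|O_EXCL open of an existing name must fail with EEXIST
	 * rather than fall through to the truncation path below.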
*/ if (excl) { error = SET_ERROR(EEXIST); goto out; } /* * Can't open a directory for writing. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EISDIR); goto out; } /* * Verify requested access to file. */ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr, mnt_ns))) { goto out; } mutex_enter(&dzp->z_lock); dzp->z_seq++; mutex_exit(&dzp->z_lock); /* * Truncate regular files if requested. */ if (S_ISREG(ZTOI(zp)->i_mode) && (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { /* we can't hold any locks when calling zfs_freesp() */ if (dl) { zfs_dirent_unlock(dl); dl = NULL; } error = zfs_freesp(zp, 0, 0, mode, TRUE); } } out: if (dl) zfs_dirent_unlock(dl); if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *zpp = zp; } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns) { (void) excl, (void) mode, (void) flag; znode_t *zp = NULL, *dzp = ITOZ(dip); zfsvfs_t *zfsvfs = ITOZSB(dip); objset_t *os; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; uint64_t projid = ZFS_DEFAULT_PROJID; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); os = zfsvfs->z_os; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } top: *ipp = NULL; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* Add to unlinked set */ zp->z_unlinked = B_TRUE; zfs_unlinked_add(zp, tx); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); out: if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *ipp = ZTOI(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove an entry from a directory. 
* * IN: dzp - znode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime * ip - ctime (if nlink > 0) */ static uint64_t null_xattr = 0; int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) { znode_t *zp; znode_t *xzp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; uint64_t links; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; pathname_t *realnmp = NULL; pathname_t realnm; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) { zflg |= ZCILOOK; pn_alloc(&realnm); realnmp = &realnm; } top: xattr_obj = 0; xzp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, realnmp))) { if (realnmp) pn_free(realnmp); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { goto out; } /* * Need to use rmdir for removing directories. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EPERM); goto out; } mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && !zn_has_cached_data(zp, 0, LLONG_MAX); mutex_exit(&zp->z_lock); /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the inode. So we dmu_tx_hold() the right things to * allow for either case. */ obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (may_delete_now) { toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; /* if the file is too big, only hold_free a token amount */ dmu_tx_hold_free(tx, zp->z_id, 0, (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); } /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } mutex_enter(&zp->z_lock); if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); mutex_exit(&zp->z_lock); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); goto top; } if (realnmp) pn_free(realnmp); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { /* * Hold z_lock so that we can make sure that the ACL obj * hasn't changed. 
Could have been deleted due to * zfs_sa_upgrade(). */ mutex_enter(&zp->z_lock); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && atomic_read(&ZTOI(zp)->i_count) == 1 && !zn_has_cached_data(zp, 0, LLONG_MAX) && xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; VERIFY_IMPLY(xattr_obj_unlinked, xzp); } if (delete_now) { if (xattr_obj_unlinked) { ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; clear_nlink(ZTOI(xzp)); links = 0; error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &links, sizeof (links), tx); ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); if (zp->z_is_sa) error = sa_remove(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), tx); else error = sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &null_xattr, sizeof (uint64_t), tx); ASSERT0(error); } /* * Add to the unlinked set because a new reference could be * taken concurrently resulting in a deferred destruction. */ zfs_unlinked_add(zp, tx); mutex_exit(&zp->z_lock); } else if (unlinked) { mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); } txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); dmu_tx_commit(tx); out: if (realnmp) pn_free(realnmp); zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); if (delete_now) zrele(zp); else zfs_zrele_async(zp); if (xzp) { zfs_znode_update_vfs(xzp); zfs_zrele_async(xzp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create a new directory and insert it into dzp using the name * provided. Return a pointer to the inserted directory. * * IN: dzp - znode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * flags - case flags. * vsecp - ACL to be set * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created directory. 
* * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime updated * zpp - ctime|mtime|atime updated */ int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; int zf = ZNEW; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t waited = B_FALSE; ASSERT(S_ISDIR(vap->va_mode)); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (dirname == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ top: *zpp = NULL; if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL))) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); /* * Now put new name in parent dir. 
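	 * zfs_link_create() inserts the directory entry and updates the link
	 * counts; if it fails, the half-created znode is deleted below before
	 * the transaction is committed.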
*/ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); *zpp = zp; txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, acl_ids.z_fuidp, vap); out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (error != 0) { zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dzp - znode of directory to remove from. * name - name of directory to be removed. * cwd - inode of current working directory. * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated */ int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) zflg |= ZCILOOK; top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL))) { zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { goto out; } if (!S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(ENOTDIR); goto out; } if (zp == cwd) { error = SET_ERROR(EINVAL); goto out; } /* * Grab a lock on the directory to make sure that no one is * trying to add (or lookup) entries while we are removing it. */ rw_enter(&zp->z_name_lock, RW_WRITER); /* * Grab a lock on the parent pointer to make sure we play well * with the treewalk and directory rename code. */ rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); goto top; } dmu_tx_abort(tx); zrele(zp); zfs_exit(zfsvfs, FTAG); return (error); } error = zfs_link_destroy(dl, zp, tx, zflg, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, B_FALSE); } dmu_tx_commit(tx); rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); out: zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); zrele(zp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } /* * Read directory entries from the given directory cursor position and emit * name and position for each entry. * * IN: ip - inode of directory to read. * ctx - directory entry context. * cr - credentials of caller. 
* * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; zap_cursor_t zc; zap_attribute_t zap; int error; uint8_t prefetch; uint8_t type; int done = 0; uint64_t parent; uint64_t offset; /* must be unsigned; checks for < 1 */ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) goto out; /* * Quit if directory has been removed (posix) */ if (zp->z_unlinked) goto out; error = 0; os = zfsvfs->z_os; offset = ctx->pos; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Transform to file-system independent format */ while (!done) { uint64_t objnum; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if ((error = zap_cursor_retrieve(&zc, &zap))) { if (error == ENOENT) break; else goto update; } /* * Allow multiple entries provided the first entry is * the object id. Non-zpl consumers may safely make * use of the additional space. * * XXX: This should be a feature flag for compatibility */ if (zap.za_integer_length != 8 || zap.za_num_integers == 0) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld, " "length = %d, num = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset, zap.za_integer_length, (u_longlong_t)zap.za_num_integers); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); type = ZFS_DIRENT_TYPE(zap.za_first_integer); } done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), objnum, type); if (done) break; /* Prefetch znode */ if (prefetch) { dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); } /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } ctx->pos = offset; } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ update: zap_cursor_fini(&zc); if (error == ENOENT) error = 0; out: zfs_exit(zfsvfs, FTAG); return (error); } /* * Get the basic file attributes and place them in the provided kstat * structure. The inode is assumed to be the authoritative source * for most of the attributes. However, the znode currently has the * authoritative atime, blksize, and block count. * * IN: ip - inode of file. * * OUT: sp - kstat values. 
* * RETURN: 0 (always succeeds) */ int zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t blksize; u_longlong_t nblocks; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); mutex_enter(&zp->z_lock); zpl_generic_fillattr(user_ns, ip, sp); /* * +1 link count for root inode with visible '.zfs' directory. */ if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) if (sp->nlink < ZFS_LINK_MAX) sp->nlink++; sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); sp->blksize = blksize; sp->blocks = nblocks; if (unlikely(zp->z_blksz == 0)) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ sp->blksize = zfsvfs->z_max_blksz; } mutex_exit(&zp->z_lock); /* * Required to prevent NFS client from detecting different inode * numbers of snapshot root dentry before and after snapshot mount. */ if (zfsvfs->z_issnap) { if (ip->i_sb->s_root->d_inode == ip) sp->ino = ZFSCTL_INO_SNAPDIRS - dmu_objset_id(zfsvfs->z_os); } zfs_exit(zfsvfs, FTAG); return (0); } /* * For the operation of changing file's user/group/project, we need to * handle not only the main object that is assigned to the file directly, * but also the ones that are used by the file via hidden xattr directory. * * Because the xattr directory may contains many EA entries, as to it may * be impossible to change all of them via the transaction of changing the * main object's user/group/project attributes. Then we have to change them * via other multiple independent transactions one by one. It may be not good * solution, but we have no better idea yet. */ static int zfs_setattr_dir(znode_t *dzp) { struct inode *dxip = ZTOI(dzp); struct inode *xip = NULL; zfsvfs_t *zfsvfs = ZTOZSB(dzp); objset_t *os = zfsvfs->z_os; zap_cursor_t zc; zap_attribute_t zap; zfs_dirlock_t *dl; znode_t *zp = NULL; dmu_tx_t *tx = NULL; uint64_t uid, gid; sa_bulk_attr_t bulk[4]; int count; int err; zap_cursor_init(&zc, os, dzp->z_id); while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { count = 0; if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { err = ENXIO; break; } err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, ZEXISTS, NULL, NULL); if (err == ENOENT) goto next; if (err) break; xip = ZTOI(zp); if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && zp->z_projid == dzp->z_projid) goto next; tx = dmu_tx_create(os); if (!(zp->z_pflags & ZFS_PROJID)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); err = dmu_tx_assign(tx, TXG_WAIT); if (err) break; mutex_enter(&dzp->z_lock); if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { xip->i_uid = dxip->i_uid; uid = zfs_uid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, sizeof (uid)); } if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { xip->i_gid = dxip->i_gid; gid = zfs_gid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, sizeof (gid)); } if (zp->z_projid != dzp->z_projid) { if (!(zp->z_pflags & ZFS_PROJID)) { zp->z_pflags |= ZFS_PROJID; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); } zp->z_projid = dzp->z_projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } mutex_exit(&dzp->z_lock); if (likely(count > 0)) { err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } 
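		/*
		 * The transaction was either committed or aborted above, so
		 * clear the pointer to keep the cleanup path after the loop
		 * from aborting it a second time.
		 */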
tx = NULL; if (err != 0 && err != ENOENT) break; next: if (zp) { zrele(zp); zp = NULL; zfs_dirent_unlock(dl); } zap_cursor_advance(&zc); } if (tx) dmu_tx_abort(tx); if (zp) { zrele(zp); zfs_dirent_unlock(dl); } zap_cursor_fini(&zc); return (err == ENOENT ? 0 : err); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: zp - znode of file to be modified. * vap - new attribute values. * If ATTR_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * mnt_ns - user namespace of the mount * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime updated, mtime updated if size changed. */ int zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) { struct inode *ip; zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t *tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; int trim_mask = 0; uint64_t new_mode; uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2], atime[2]; uint64_t projid = ZFS_INVALID_PROJID; znode_t *attrzp; int need_policy = FALSE; int err, err2 = 0; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; boolean_t handle_eadir = B_FALSE; sa_bulk_attr_t *bulk, *xattr_bulk; int count = 0, xattr_count = 0, bulks = 8; if (mask == 0) return (0); if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); ip = ZTOI(zp); /* * If this is a xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. 
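	 * (xva_getxoptattr() is expected to return NULL unless va_mask
	 * includes ATTR_XVATTR and the structure carries the xvattr magic,
	 * which is why the NULL check below is paired with the explicit
	 * ATTR_XVATTR test.)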
*/ xoap = xva_getxoptattr(xvap); if (xoap != NULL && (mask & ATTR_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) projid = ZFS_INVALID_PROJID; else need_policy = TRUE; } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } } zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & ATTR_XVATTR))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); xva_init(tmpxvattr); bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { err = SET_ERROR(EPERM); goto out3; } if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { err = SET_ERROR(EPERM); goto out3; } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (ATTR_ATIME | ATTR_MTIME)) { if (((mask & ATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & ATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { err = SET_ERROR(EOVERFLOW); goto out3; } } top: attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfs_is_readonly(zfsvfs)) { err = SET_ERROR(EROFS); goto out3; } /* * First validate permissions */ if (mask & ATTR_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, mnt_ns); if (err) goto out3; /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? 
*/ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) goto out3; } if (mask & (ATTR_ATIME|ATTR_MTIME) || ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr, mnt_ns); } if (mask & (ATTR_UID|ATTR_GID)) { int idmask = (mask & (ATTR_UID|ATTR_GID)); int take_owner; int take_group; uid_t uid; gid_t gid; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & ATTR_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip), vap->va_uid); gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip), vap->va_gid); take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); take_group = (mask & ATTR_GID) && zfs_groupmember(zfsvfs, gid, cr); /* * If both ATTR_UID and ATTR_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (ATTR_UID|ATTR_GID)) && take_owner && take_group) || ((idmask == ATTR_UID) && take_owner) || ((idmask == ATTR_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr, mnt_ns) == 0) { /* * Remove setuid/setgid for non-privileged users */ (void) secpolicy_setid_clear(vap, cr); trim_mask = (mask & (ATTR_UID|ATTR_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & ATTR_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. 
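	 *
	 * For example, a request to set XAT_APPENDONLY to the value it
	 * already has is cleared from the working mask here (no policy check
	 * is triggered for it and nothing is rewritten on disk), but it is
	 * remembered in tmpxvattr so it can be re-set for the caller once
	 * the attributes have been committed.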
*/ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { if (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_PROJINHERIT); XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((!S_ISREG(ip->i_mode) && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { mutex_exit(&zp->z_lock); err = SET_ERROR(EPERM); goto out3; } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } mutex_exit(&zp->z_lock); if (mask & ATTR_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, mnt_ns) == 0) { err = secpolicy_setid_setsticky_clear(ip, vap, &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); if (err) goto out3; trim_mask |= ATTR_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. 
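	 *
	 * For example, an owner who chgrps a file to a group they belong to
	 * has already been granted the change via ACE_WRITE_OWNER above, so
	 * ATTR_GID is masked off here and restored after
	 * secpolicy_vnode_setattr() runs.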
*/ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, zfs_zaccess_unix, zp); if (err) goto out3; if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { handle_eadir = B_TRUE; err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); if (err) goto out2; } if (mask & ATTR_UID) { new_kuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_kuid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & ATTR_GID) { new_kgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_kgid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (projid != ZFS_INVALID_PROJID && zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { if (attrzp) zrele(attrzp); err = EDQUOT; goto out2; } } tx = dmu_tx_create(os); if (mask & ATTR_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = EPERM; goto out; } if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) goto out; mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if (((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID))) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { /* * For the existed object that is upgraded from old system, * its on-disk layout has no slot for the project ID attribute. * But quota accounting logic needs to access related slots by * offset directly. So we need to adjust old objects' layout * to make the project ID to some unified and fixed offset. 
*/ if (attrzp) err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); if (err == 0) err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) err = 0; else if (err != 0) goto out; else projid = ZFS_INVALID_PROJID; } if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); if (projid != ZFS_INVALID_PROJID) { attrzp->z_projid = projid; SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, sizeof (attrzp->z_projid)); } } if (mask & (ATTR_UID|ATTR_GID)) { if (mask & ATTR_UID) { ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); new_uid = zfs_uid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); } } if (mask & ATTR_GID) { ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); new_gid = zfs_gid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); } } if (!(mask & ATTR_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & ATTR_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = ZTOI(zp)->i_mode = new_mode; ASSERT3P(aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; ZFS_TIME_ENCODE(&ip->i_atime, atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } if (mask & (ATTR_MTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( vap->va_mtime, ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } if (mask & (ATTR_CTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_ctime, ctime); ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } if (attrzp && mask) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & ATTR_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. 
*/ if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { XVA_SET_REQ(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(S_ISREG(ip->i_mode)); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&attrzp->z_acl_lock); mutex_exit(&attrzp->z_lock); } out: if (err == 0 && xattr_count > 0) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); if (attrzp) zrele(attrzp); if (err == ERESTART) goto top; } else { if (count > 0) err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); if (attrzp) { if (err2 == 0 && handle_eadir) err = zfs_setattr_dir(attrzp); zrele(attrzp); } zfs_znode_update_vfs(zp); } out2: if (os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); out3: kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(tmpxvattr, sizeof (xvattr_t)); zfs_exit(zfsvfs, FTAG); return (err); } typedef struct zfs_zlock { krwlock_t *zl_rwlock; /* lock we acquired */ znode_t *zl_znode; /* znode we held */ struct zfs_zlock *zl_next; /* next in list */ } zfs_zlock_t; /* * Drop locks and release vnodes that were held by zfs_rename_lock(). */ static void zfs_rename_unlock(zfs_zlock_t **zlpp) { zfs_zlock_t *zl; while ((zl = *zlpp) != NULL) { if (zl->zl_znode != NULL) zfs_zrele_async(zl->zl_znode); rw_exit(zl->zl_rwlock); *zlpp = zl->zl_next; kmem_free(zl, sizeof (*zl)); } } /* * Search back through the directory tree, using the ".." entries. * Lock each directory in the chain to prevent concurrent renames. * Fail any attempt to move a directory into one of its own descendants. * XXX - z_parent_lock can overlap with map or grow locks */ static int zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = ZTOZSB(zp)->z_root; uint64_t oidp = zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; /* * First pass write-locks szp and compares to zp->z_id. * Later passes read-lock zp and compare to zp->z_parent. */ do { if (!rw_tryenter(rwlp, rw)) { /* * Another thread is renaming in this path. * Note that if we are a WRITER, we don't have any * parent_locks held yet. 
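 *
 * The z_id comparison below gives two contending renames a
 * consistent ordering: exactly one of the threads backs off,
 * dropping the locks it already holds and restarting from tdzp,
 * while the other is allowed to block in rw_enter() and make
 * progress.  Without that tie-break both could wait on each
 * other's parent locks and deadlock.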
*/ if (rw == RW_READER && zp->z_id > szp->z_id) { /* * Drop our locks and restart */ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; oidp = zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; } else { /* * Wait for other thread to drop its locks */ rw_enter(rwlp, rw); } } zl = kmem_alloc(sizeof (*zl), KM_SLEEP); zl->zl_rwlock = rwlp; zl->zl_znode = NULL; zl->zl_next = *zlpp; *zlpp = zl; if (oidp == szp->z_id) /* We're a descendant of szp */ return (SET_ERROR(EINVAL)); if (oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ int error = zfs_zget(ZTOZSB(zp), oidp, &zp); if (error) return (error); zl->zl_znode = zp; } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), &oidp, sizeof (oidp)); rwlp = &zp->z_parent_lock; rw = RW_READER; } while (zp->z_id != sdzp->z_id); return (0); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdzp - Source directory containing the "old entry". * snm - Old entry name. * tdzp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * flags - case flags * rflags - RENAME_* flags * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). * mnt_ns - user namespace of the mount * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdzp,tdzp - ctime|mtime updated */ int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) { znode_t *szp, *tzp; zfsvfs_t *zfsvfs = ZTOZSB(sdzp); zilog_t *zilog; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; int cmp, serr, terr; int error = 0; int zflg = 0; boolean_t waited = B_FALSE; /* Needed for whiteout inode creation. */ boolean_t fuid_dirtied; zfs_acl_ids_t acl_ids; boolean_t have_acl = B_FALSE; znode_t *wzp = NULL; if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return (SET_ERROR(EINVAL)); /* Already checked by Linux VFS, but just to make sure. */ if (rflags & RENAME_EXCHANGE && (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) return (SET_ERROR(EINVAL)); /* * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the * right kind of vattr_t for the whiteout file. These are set * internally by ZFS so should never be incorrect. */ VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if ((error = zfs_verify_zp(tdzp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || zfsctl_is_node(ZTOI(tdzp))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; top: szp = NULL; tzp = NULL; zl = NULL; /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. 
*/ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Lock source and target directory entries. To prevent deadlock, * a lock ordering must be defined. We lock the directory with * the smallest object id first, or if it's a tie, the one with * the lexically first name. */ if (sdzp->z_id < tdzp->z_id) { cmp = -1; } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { /* * First compare the two name arguments without * considering any case folding. */ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); ASSERT(error == 0 || !zfsvfs->z_utf8); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument * both refer to links to the same existing file, * the rename() function shall return successfully * and perform no other action." */ zfs_exit(zfsvfs, FTAG); return (0); } /* * If the file system is case-folding, then we may * have some more checking to do. A case-folding file * system is either supporting mixed case sensitivity * access or is completely case-insensitive. Note * that the file system is always case preserving. * * In mixed sensitivity mode case sensitive behavior * is the default. FIGNORECASE must be used to * explicitly request case insensitive behavior. * * If the source and target names provided differ only * by case (e.g., a request to rename 'tim' to 'Tim'), * we will treat this as a special case in the * case-insensitive mode: as long as the source name * is an exact match, we will allow this to proceed as * a name-change request. */ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || (zfsvfs->z_case == ZFS_CASE_MIXED && flags & FIGNORECASE)) && u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, &error) == 0) { /* * case preserving rename request, require exact * name matches */ zflg |= ZCIEXACT; zflg &= ~ZCILOOK; } } /* * If the source and destination directories are the same, we should * grab the z_name_lock of that directory only once. */ if (sdzp == tdzp) { zflg |= ZHAVELOCK; rw_enter(&sdzp->z_name_lock, RW_READER); } if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | zflg, NULL, NULL); terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); } else { terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, zflg, NULL, NULL); serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, NULL, NULL); } if (serr) { /* * Source entry invalid or not there. */ if (!terr) { zfs_dirent_unlock(tdl); if (tzp) zrele(tzp); } if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(snm, "..") == 0) serr = EINVAL; zfs_exit(zfsvfs, FTAG); return (serr); } if (terr) { zfs_dirent_unlock(sdl); zrele(szp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(tnm, "..") == 0) terr = EINVAL; zfs_exit(zfsvfs, FTAG); return (terr); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow renames into our tree when the project * IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { error = SET_ERROR(EXDEV); goto out; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. 
*/ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) goto out; if (S_ISDIR(ZTOI(szp)->i_mode)) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) goto out; } /* * Does target exist? */ if (tzp) { if (rflags & RENAME_NOREPLACE) { error = SET_ERROR(EEXIST); goto out; } /* * Source and target must be the same type (unless exchanging). */ if (!(rflags & RENAME_EXCHANGE)) { boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; if (s_is_dir != t_is_dir) { error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); goto out; } } /* * POSIX dictates that when the source and target * entries refer to the same file object, rename * must do nothing and exit without error. */ if (szp->z_id == tzp->z_id) { error = 0; goto out; } } else if (rflags & RENAME_EXCHANGE) { /* Target must exist for RENAME_EXCHANGE. */ error = SET_ERROR(ENOENT); goto out; } /* Set up inode creation for RENAME_WHITEOUT. */ if (rflags & RENAME_WHITEOUT) { /* * Whiteout files are not regular files or directories, so to * match zfs_create() we do not inherit the project id. */ uint64_t wo_projid = ZFS_DEFAULT_PROJID; error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); if (error) goto out; if (!have_acl) { error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, &acl_ids, mnt_ns); if (error) goto out; have_acl = B_TRUE; } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { error = SET_ERROR(EDQUOT); goto out; } } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } if (rflags & RENAME_WHITEOUT) { dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); goto top; } dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Unlink the source. */ szp->z_pflags |= ZFS_AV_MODIFIED; if (tdzp->z_pflags & ZFS_PROJINHERIT) szp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); VERIFY0(error); error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); if (error) goto commit; /* * Unlink the target. */ if (tzp) { int tzflg = zflg; if (rflags & RENAME_EXCHANGE) { /* This inode will be re-linked soon. 
*/ tzflg |= ZRENAMING; tzp->z_pflags |= ZFS_AV_MODIFIED; if (sdzp->z_pflags & ZFS_PROJINHERIT) tzp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&tzp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); } error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); if (error) goto commit_link_szp; } /* * Create the new target links: * * We always link the target. * * RENAME_EXCHANGE: Link the old target to the source. * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. */ error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error) { /* * If we have removed the existing target, a subsequent call to * zfs_link_create() to add back the same entry, but with a new * dnode (szp), should not fail. */ ASSERT3P(tzp, ==, NULL); goto commit_link_tzp; } switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { case RENAME_EXCHANGE: error = zfs_link_create(sdl, tzp, tx, ZRENAMING); /* * The same argument as zfs_link_create() failing for * szp applies here, since the source directory must * have had an entry we are replacing. */ ASSERT0(error); if (error) goto commit_unlink_td_szp; break; case RENAME_WHITEOUT: zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); error = zfs_link_create(sdl, wzp, tx, ZNEW); if (error) { zfs_znode_delete(wzp, tx); remove_inode_hash(ZTOI(wzp)); goto commit_unlink_td_szp; } break; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { case RENAME_EXCHANGE: zfs_log_rename_exchange(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); break; case RENAME_WHITEOUT: zfs_log_rename_whiteout(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp, wzp); break; default: ASSERT0(rflags & ~RENAME_NOREPLACE); zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); break; } commit: dmu_tx_commit(tx); out: if (have_acl) zfs_acl_ids_free(&acl_ids); zfs_znode_update_vfs(sdzp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (sdzp != tdzp) zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); zrele(szp); if (wzp) { zfs_znode_update_vfs(wzp); zrele(wzp); } if (tzp) { zfs_znode_update_vfs(tzp); zrele(tzp); } if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); /* * Clean-up path for broken link state. * * At this point we are in a (very) bad state, so we need to do our * best to correct the state. In particular, all of the nlinks are * wrong because we were destroying and creating links with ZRENAMING. * * In some form, all of these operations have to resolve the state: * * * link_destroy() *must* succeed. Fortunately, this is very likely * since we only just created it. * * * link_create()s are allowed to fail (though they shouldn't because * we only just unlinked them and are putting the entries back * during clean-up). But if they fail, we can just forcefully drop * the nlink value to (at the very least) avoid broken nlink values * -- though in the case of non-empty directories we will have to * panic (otherwise we'd have a leaked directory with a broken ..). 
*/ commit_unlink_td_szp: VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); commit_link_tzp: if (tzp) { if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); } commit_link_szp: if (zfs_link_create(sdl, szp, tx, ZRENAMING)) VERIFY0(zfs_drop_nlink(szp, tx, NULL)); goto commit; } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dzp - Directory to contain new symbolic link. * name - Name of directory entry in dip. * vap - Attributes of new entry. * link - Name for new symlink entry. * cr - credentials of caller. * flags - case flags * mnt_ns - user namespace of the mount * * OUT: zpp - Znode for new symbolic link. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dip - ctime|mtime updated */ int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) { znode_t *zp; zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t len = strlen(link); int error; int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; boolean_t waited = B_FALSE; ASSERT(S_ISLNK(vap->va_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; if (len > MAXPATHLEN) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } top: *zpp = NULL; /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create a new object for the symlink. 
* for version 4 ZPL datasets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (error == 0) { *zpp = zp; if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); } else { zrele(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by ip. * * IN: ip - inode of symbolic link * uio - structure to contain the link path. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated */ int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); mutex_exit(&zp->z_lock); zfs_exit(zfsvfs, FTAG); return (error); } /* * Insert a new entry into directory tdzp referencing szp. * * IN: tdzp - Directory to contain new entry. * szp - znode of new entry. * name - name of new entry. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * tdzp - ctime|mtime updated * szp - ctime updated */ int zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, int flags) { struct inode *sip = ZTOI(szp); znode_t *tzp; zfsvfs_t *zfsvfs = ZTOZSB(tdzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zf = ZNEW; uint64_t parent; uid_t owner; boolean_t waited = B_FALSE; boolean_t is_tmpfile = 0; uint64_t txg; #ifdef HAVE_TMPFILE is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); #endif ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (S_ISDIR(sip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_verify_zp(szp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow hard link creation in our tree when the * project IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. 
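 *
 * A hard link whose source lives under a different super block
 * (for example a file in a snapshot, or a node in the .zfs
 * control directory) is therefore rejected with EXDEV just below.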
*/ if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } if (parent == zfsvfs->z_shares_dir) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); if (error) { zfs_exit(zfsvfs, FTAG); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); if (is_tmpfile) dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, tdzp); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* unmark z_unlinked so zfs_link_create will not reject */ if (is_tmpfile) szp->z_unlinked = B_FALSE; error = zfs_link_create(dl, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; /* * tmpfile is created to be in z_unlinkedobj, so remove it. * Also, we don't log in ZIL, because all previous file * operation on the tmpfile are ignored by ZIL. Instead we * always wait for txg to sync to make sure all previous * operation are sync safe. */ if (is_tmpfile) { VERIFY(zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_link(zilog, tx, txtype, tdzp, szp, name); } } else if (is_tmpfile) { /* restore z_unlinked since when linking failed */ szp->z_unlinked = B_TRUE; } txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); zfs_exit(zfsvfs, FTAG); return (error); } static void zfs_putpage_sync_commit_cb(void *arg) { struct page *pp = arg; ClearPageError(pp); end_page_writeback(pp); } static void zfs_putpage_async_commit_cb(void *arg) { struct page *pp = arg; znode_t *zp = ITOZ(pp->mapping->host); ClearPageError(pp); end_page_writeback(pp); atomic_dec_32(&zp->z_async_writes_cnt); } /* * Push a page out to disk, once the page is on stable storage the * registered commit callback will be run as notification of completion. 
* * IN: ip - page mapped for inode. * pp - page to push (page is locked) * wbc - writeback control data * for_sync - does the caller intend to wait synchronously for the * page writeback to complete? * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated */ int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, boolean_t for_sync) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t offset; loff_t pgoff; unsigned int pglen; dmu_tx_t *tx; caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); ASSERT(PageLocked(pp)); pgoff = page_offset(pp); /* Page byte-offset in file */ offset = i_size_read(ip); /* File length in bytes */ pglen = MIN(PAGE_SIZE, /* Page length in bytes */ P2ROUNDUP(offset, PAGE_SIZE)-pgoff); /* Page is beyond end of file */ if (pgoff >= offset) { unlock_page(pp); zfs_exit(zfsvfs, FTAG); return (0); } /* Truncate page length to end of file */ if (pgoff + pglen > offset) pglen = offset - pgoff; #if 0 /* * FIXME: Allow mmap writes past its quota. The correct fix * is to register a page_mkwrite() handler to count the page * against its quota when it is about to be dirtied. */ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, KUID_TO_SUID(ip->i_uid)) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, KGID_TO_SGID(ip->i_gid)) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { err = EDQUOT; } #endif /* * The ordering here is critical and must adhere to the following * rules in order to avoid deadlocking in either zfs_read() or * zfs_free_range() due to a lock inversion. * * 1) The page must be unlocked prior to acquiring the range lock. * This is critical because zfs_read() calls find_lock_page() * which may block on the page lock while holding the range lock. * * 2) Before setting or clearing write back on a page the range lock * must be held in order to prevent a lock inversion with the * zfs_free_range() function. * * This presents a problem because upon entering this function the * page lock is already held. To safely acquire the range lock the * page lock must be dropped. This creates a window where another * process could truncate, invalidate, dirty, or write out the page. * * Therefore, after successfully reacquiring the range and page locks * the current page state is checked. In the common case everything * will be as is expected and it can be written out. However, if * the page state has changed it must be handled accordingly. */ mapping = pp->mapping; redirty_page_for_writepage(wbc, pp); unlock_page(pp); zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, pgoff, pglen, RL_WRITER); lock_page(pp); /* Page mapping changed or it was no longer dirty, we're done */ if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { unlock_page(pp); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (0); } /* Another process started write block if required */ if (PageWriteback(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { /* * Speed up any non-sync page writebacks since * they may take several seconds to complete. * Refer to the comment in zpl_fsync() (when * HAVE_FSYNC_RANGE is defined) for details. 
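 *
 * z_async_writes_cnt counts pages submitted by WB_SYNC_NONE
 * writeback whose commit callback has not yet run.  Forcing a
 * zil_commit() here makes those callbacks fire promptly, so the
 * wait on PG_writeback below does not have to sit out the rest
 * of the txg interval.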
*/ if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { zil_commit(zfsvfs->z_log, zp->z_id); } if (PageWriteback(pp)) #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT folio_wait_bit(page_folio(pp), PG_writeback); #else wait_on_page_bit(pp, PG_writeback); #endif } zfs_exit(zfsvfs, FTAG); return (0); } /* Clear the dirty flag the required locks are held */ if (!clear_page_dirty_for_io(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (0); } /* * Counterpart for redirty_page_for_writepage() above. This page * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; if (!for_sync) atomic_inc_32(&zp->z_async_writes_cnt); set_page_writeback(pp); unlock_page(pp); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { if (err == ERESTART) dmu_tx_wait(tx); dmu_tx_abort(tx); #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO filemap_dirty_folio(page_mapping(pp), page_folio(pp)); #else __set_page_dirty_nobuffers(pp); #endif ClearPageError(pp); end_page_writeback(pp); if (!for_sync) atomic_dec_32(&zp->z_async_writes_cnt); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (err); } va = kmap(pp); ASSERT3U(pglen, <=, PAGE_SIZE); dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); kunmap(pp); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* Preserve the mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_mtime, mtime); ZFS_TIME_ENCODE(&ip->i_ctime, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, for_sync ? zfs_putpage_sync_commit_cb : zfs_putpage_async_commit_cb, pp); dmu_tx_commit(tx); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { /* * Note that this is rarely called under writepages(), because * writepages() normally handles the entire commit for * performance reasons. */ zil_commit(zfsvfs->z_log, zp->z_id); } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { /* * If the caller does not intend to wait synchronously * for this page writeback to complete and there are active * synchronous calls on this file, do a commit so that * the latter don't accidentally end up waiting for * our writeback to complete. Refer to the comment in * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. */ zil_commit(zfsvfs->z_log, zp->z_id); } dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); zfs_exit(zfsvfs, FTAG); return (err); } /* * Update the system attributes when the inode has been dirtied. For the * moment we only update the mode, atime, mtime, and ctime. */ int zfs_dirty_inode(struct inode *ip, int flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return (0); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); #ifdef I_DIRTY_TIME /* * This is the lazytime semantic introduced in Linux 4.0 * This flag will only be called from update_time when lazytime is set. 
* (Note, I_DIRTY_SYNC will also set if not lazytime) * Fortunately mtime and ctime are managed within ZFS itself, so we * only need to dirty atime. */ if (flags == I_DIRTY_TIME) { zp->z_atime_dirty = B_TRUE; goto out; } #endif tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } mutex_enter(&zp->z_lock); zp->z_atime_dirty = B_FALSE; SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); /* Preserve the mode, mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_atime, atime); ZFS_TIME_ENCODE(&ip->i_mtime, mtime); ZFS_TIME_ENCODE(&ip->i_ctime, ctime); mode = ip->i_mode; zp->z_mode = mode; error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); mutex_exit(&zp->z_lock); dmu_tx_commit(tx); out: zfs_exit(zfsvfs, FTAG); return (error); } void zfs_inactive(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t atime[2]; int error; int need_unlock = 0; /* Only read lock if we haven't already write locked, e.g. rollback */ if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { need_unlock = 1; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); } if (zp->z_sa_hdl == NULL) { if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); return; } if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { ZFS_TIME_ENCODE(&ip->i_atime, atime); mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&atime, sizeof (atime), tx); zp->z_atime_dirty = B_FALSE; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } zfs_zinactive(zp); if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * Fill pages with data from the disk. */ static int zfs_fillpage(struct inode *ip, struct page *pp) { zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t i_size = i_size_read(ip); u_offset_t io_off = page_offset(pp); size_t io_len = PAGE_SIZE; ASSERT3U(io_off, <, i_size); if (io_off + io_len > i_size) io_len = i_size - io_off; void *va = kmap(pp); int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, io_len, va, DMU_READ_PREFETCH); if (io_len != PAGE_SIZE) memset((char *)va + io_len, 0, PAGE_SIZE - io_len); kunmap(pp); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); SetPageError(pp); ClearPageUptodate(pp); } else { ClearPageError(pp); SetPageUptodate(pp); } return (error); } /* * Uses zfs_fillpage to read data from the file and fill the page. * * IN: ip - inode of file to get data from. * pp - page to read * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ int zfs_getpage(struct inode *ip, struct page *pp) { zfsvfs_t *zfsvfs = ITOZSB(ip); znode_t *zp = ITOZ(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); error = zfs_fillpage(ip, pp); if (error == 0) dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); zfs_exit(zfsvfs, FTAG); return (error); } /* * Check ZFS specific permissions to memory map a section of a file. 
* * IN: ip - inode of the file to mmap * off - file offset * addrp - start address in memory region * len - length of memory region * vm_flags- address flags * * RETURN: 0 if success * error code if failure */ int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, unsigned long vm_flags) { (void) addrp; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if ((vm_flags & VM_WRITE) && (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((vm_flags & (VM_READ | VM_EXEC)) && (zp->z_pflags & ZFS_AV_QUARANTINED)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EACCES)); } if (off < 0 || len > MAXOFFSET_T - off) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENXIO)); } zfs_exit(zfsvfs, FTAG); return (0); } /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. However, this command is somewhat * misnamed, as its functionality includes the ability to allocate as * well as free space. * * IN: zp - znode of file to free data in. * cmd - action to take (only F_FREESP supported). * bfp - section of file to free/alloc. * flag - current file open mode flags. * offset - current file offset. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * zp - ctime|mtime updated */ int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr) { (void) offset; zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t off, len; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if (cmd != F_FREESP) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Permissions aren't checked on Solaris because on this OS * zfs_space() can only be called with an opened file handle. * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. 
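 *
 * The request itself arrives as a flock64_t: bfp->l_start is the
 * offset of the region and bfp->l_len its length, with a length
 * of zero meaning "from l_start to the end of file".  zfs_freesp()
 * interprets those fields; only ACE_WRITE_DATA needs to be
 * verified here first.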
*/ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ error = zfs_freesp(zp, off, len, flag, TRUE); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_fid(struct inode *ip, fid_t *fidp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOSPC)); } if ((error = zfs_verify_zp(zp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } gen = (uint32_t)gen64; size = SHORT_FID_LEN; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); zfs_exit(zfsvfs, FTAG); return (0); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_open); EXPORT_SYMBOL(zfs_close); EXPORT_SYMBOL(zfs_lookup); EXPORT_SYMBOL(zfs_create); EXPORT_SYMBOL(zfs_tmpfile); EXPORT_SYMBOL(zfs_remove); EXPORT_SYMBOL(zfs_mkdir); EXPORT_SYMBOL(zfs_rmdir); EXPORT_SYMBOL(zfs_readdir); EXPORT_SYMBOL(zfs_getattr_fast); EXPORT_SYMBOL(zfs_setattr); EXPORT_SYMBOL(zfs_rename); EXPORT_SYMBOL(zfs_symlink); EXPORT_SYMBOL(zfs_readlink); EXPORT_SYMBOL(zfs_link); EXPORT_SYMBOL(zfs_inactive); EXPORT_SYMBOL(zfs_space); EXPORT_SYMBOL(zfs_fid); EXPORT_SYMBOL(zfs_getpage); EXPORT_SYMBOL(zfs_putpage); EXPORT_SYMBOL(zfs_dirty_inode); EXPORT_SYMBOL(zfs_map); /* CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c index 335ae3460c58..52c8e51df659 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c @@ -1,2360 +1,2358 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
*/ /* Portions Copyright 2007 Jeremy Teo */ #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_comutil.h" /* * Functions needed for userland (ie: libzpool) are not put under * #ifdef_KERNEL; the rest of the functions have dependencies * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL static kmem_cache_t *znode_cache = NULL; static kmem_cache_t *znode_hold_cache = NULL; unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; /* * This is used by the test suite so that it can delay znodes from being * freed in order to inspect the unlinked set. */ static int zfs_unlink_suspend_progress = 0; /* * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on * z_rangelock. It will modify the offset and length of the lock to reflect * znode-specific information, and convert RL_APPEND to RL_WRITER. This is * called with the rangelock_t's rl_lock held, which avoids races. */ static void zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) { znode_t *zp = arg; /* * If in append mode, convert to writer and lock starting at the * current end of file. */ if (new->lr_type == RL_APPEND) { new->lr_offset = zp->z_size; new->lr_type = RL_WRITER; } /* * If we need to grow the block size then lock the whole file range. */ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { new->lr_offset = 0; new->lr_length = UINT64_MAX; } } static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { (void) arg, (void) kmflags; znode_t *zp = buf; inode_init_once(ZTOI(zp)); list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; return (0); } static void zfs_znode_cache_destructor(void *buf, void *arg) { (void) arg; znode_t *zp = buf; ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); rw_destroy(&zp->z_xattr_lock); zfs_rangelock_fini(&zp->z_rangelock); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } static int zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags) { (void) arg, (void) kmflags; znode_hold_t *zh = buf; mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL); zh->zh_refcount = 0; return (0); } static void zfs_znode_hold_cache_destructor(void *buf, void *arg) { (void) arg; znode_hold_t *zh = buf; mutex_destroy(&zh->zh_lock); } void zfs_znode_init(void) { /* * Initialize zcache. 
The KMC_SLAB hint is used in order that it be * backed by kmalloc() when on the Linux slab in order that any * wait_on_bit() operations on the related inode operate properly. */ ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB); ASSERT(znode_hold_cache == NULL); znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); } void zfs_znode_fini(void) { /* * Cleanup zcache */ if (znode_cache) kmem_cache_destroy(znode_cache); znode_cache = NULL; if (znode_hold_cache) kmem_cache_destroy(znode_hold_cache); znode_hold_cache = NULL; } /* * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to * serialize access to a znode and its SA buffer while the object is being * created or destroyed. This kind of locking would normally reside in the * znode itself but in this case that's impossible because the znode and SA * buffer may not yet exist. Therefore the locking is handled externally * with an array of mutexes and AVLs trees which contain per-object locks. * * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted * in to the correct AVL tree and finally the per-object lock is held. In * zfs_znode_hold_exit() the process is reversed. The per-object lock is * released, removed from the AVL tree and destroyed if there are no waiters. * * This scheme has two important properties: * * 1) No memory allocations are performed while holding one of the z_hold_locks. * This ensures evict(), which can be called from direct memory reclaim, will * never block waiting on a z_hold_locks which just happens to have hashed * to the same index. * * 2) All locks used to serialize access to an object are per-object and never * shared. This minimizes lock contention without creating a large number * of dedicated locks. * * On the downside it does require znode_lock_t structures to be frequently * allocated and freed. However, because these are backed by a kmem cache * and very short lived this cost is minimal. */ int zfs_znode_hold_compare(const void *a, const void *b) { const znode_hold_t *zh_a = (const znode_hold_t *)a; const znode_hold_t *zh_b = (const znode_hold_t *)b; return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj)); } static boolean_t __maybe_unused zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, search; int i = ZFS_OBJ_HASH(zfsvfs, obj); boolean_t held; search.zh_obj = obj; mutex_enter(&zfsvfs->z_hold_locks[i]); zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); held = (zh && MUTEX_HELD(&zh->zh_lock)) ? 
B_TRUE : B_FALSE; mutex_exit(&zfsvfs->z_hold_locks[i]); return (held); } znode_hold_t * zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, *zh_new, search; int i = ZFS_OBJ_HASH(zfsvfs, obj); boolean_t found = B_FALSE; zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP); search.zh_obj = obj; mutex_enter(&zfsvfs->z_hold_locks[i]); zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); if (likely(zh == NULL)) { zh = zh_new; zh->zh_obj = obj; avl_add(&zfsvfs->z_hold_trees[i], zh); } else { ASSERT3U(zh->zh_obj, ==, obj); found = B_TRUE; } zh->zh_refcount++; ASSERT3S(zh->zh_refcount, >, 0); mutex_exit(&zfsvfs->z_hold_locks[i]); if (found == B_TRUE) kmem_cache_free(znode_hold_cache, zh_new); ASSERT(MUTEX_NOT_HELD(&zh->zh_lock)); mutex_enter(&zh->zh_lock); return (zh); } void zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) { int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj); boolean_t remove = B_FALSE; ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj)); mutex_exit(&zh->zh_lock); mutex_enter(&zfsvfs->z_hold_locks[i]); ASSERT3S(zh->zh_refcount, >, 0); if (--zh->zh_refcount == 0) { avl_remove(&zfsvfs->z_hold_trees[i], zh); remove = B_TRUE; } mutex_exit(&zfsvfs->z_hold_locks[i]); if (remove == B_TRUE) kmem_cache_free(znode_hold_cache, zh); } dev_t zfs_cmpldev(uint64_t dev) { return (dev); } static void zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) { ASSERT(zfs_znode_held(zfsvfs, zp->z_id)); mutex_enter(&zp->z_lock); ASSERT(zp->z_sa_hdl == NULL); ASSERT(zp->z_acl_cached == NULL); if (sa_hdl == NULL) { VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; sa_set_userp(sa_hdl, zp); } zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; mutex_exit(&zp->z_lock); } void zfs_znode_dmu_fini(znode_t *zp) { ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; } /* * Called by new_inode() to allocate a new inode. */ int zfs_inode_alloc(struct super_block *sb, struct inode **ip) { znode_t *zp; zp = kmem_cache_alloc(znode_cache, KM_SLEEP); *ip = ZTOI(zp); return (0); } /* * Called in multiple places when an inode should be destroyed. */ void zfs_inode_destroy(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); mutex_enter(&zfsvfs->z_znodes_lock); if (list_link_active(&zp->z_link_node)) { list_remove(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes--; } mutex_exit(&zfsvfs->z_znodes_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } kmem_cache_free(znode_cache, zp); } static void zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) { uint64_t rdev = 0; switch (ip->i_mode & S_IFMT) { case S_IFREG: ip->i_op = &zpl_inode_operations; #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND ip->i_fop = &zpl_file_operations.kabi_fops; #else ip->i_fop = &zpl_file_operations; #endif ip->i_mapping->a_ops = &zpl_address_space_operations; break; case S_IFDIR: #ifdef HAVE_RENAME2_OPERATIONS_WRAPPER ip->i_flags |= S_IOPS_WRAPPER; ip->i_op = &zpl_dir_inode_operations.ops; #else ip->i_op = &zpl_dir_inode_operations; #endif ip->i_fop = &zpl_dir_file_operations; ITOZ(ip)->z_zn_prefetch = B_TRUE; break; case S_IFLNK: ip->i_op = &zpl_symlink_inode_operations; break; /* * rdev is only stored in a SA only for device files. 
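 *
 * S_IFIFO and S_IFSOCK have no stored rdev, so they skip the
 * lookup and reach init_special_inode() with rdev == 0; only
 * S_IFCHR and S_IFBLK read SA_ZPL_RDEV first and fall through.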
*/ case S_IFCHR: case S_IFBLK: (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, sizeof (rdev)); zfs_fallthrough; case S_IFIFO: case S_IFSOCK: init_special_inode(ip, ip->i_mode, rdev); ip->i_op = &zpl_special_inode_operations; break; default: zfs_panic_recover("inode %llu has invalid mode: 0x%x\n", (u_longlong_t)ip->i_ino, ip->i_mode); /* Assume the inode is a file and attempt to continue */ ip->i_mode = S_IFREG | 0644; ip->i_op = &zpl_inode_operations; #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND ip->i_fop = &zpl_file_operations.kabi_fops; #else ip->i_fop = &zpl_file_operations; #endif ip->i_mapping->a_ops = &zpl_address_space_operations; break; } } static void zfs_set_inode_flags(znode_t *zp, struct inode *ip) { /* * Linux and Solaris have different sets of file attributes, so we * restrict this conversion to the intersection of the two. */ #ifdef HAVE_INODE_SET_FLAGS unsigned int flags = 0; if (zp->z_pflags & ZFS_IMMUTABLE) flags |= S_IMMUTABLE; if (zp->z_pflags & ZFS_APPENDONLY) flags |= S_APPEND; inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND); #else if (zp->z_pflags & ZFS_IMMUTABLE) ip->i_flags |= S_IMMUTABLE; else ip->i_flags &= ~S_IMMUTABLE; if (zp->z_pflags & ZFS_APPENDONLY) ip->i_flags |= S_APPEND; else ip->i_flags &= ~S_APPEND; #endif } /* * Update the embedded inode given the znode. */ void zfs_znode_update_vfs(znode_t *zp) { struct inode *ip; uint32_t blksize; u_longlong_t i_blocks; ASSERT(zp != NULL); ip = ZTOI(zp); /* Skip .zfs control nodes which do not exist on disk. */ if (zfsctl_is_node(ip)) return; dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); spin_lock(&ip->i_lock); ip->i_mode = zp->z_mode; ip->i_blocks = i_blocks; i_size_write(ip, zp->z_size); spin_unlock(&ip->i_lock); } /* * Construct a znode+inode and initialize. 
* * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; struct inode *ip; uint64_t mode; uint64_t parent; uint64_t tmp_gen; uint64_t links; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[12]; int count = 0; ASSERT(zfsvfs != NULL); ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); zp = ITOZ(ip); ASSERT(zp->z_dirlocks == NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; #if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) zp->z_is_mapped = B_FALSE; #endif zp->z_is_ctldir = B_FALSE; zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; zp->z_mapcnt = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 || (dmu_objset_projectquota_enabled(zfsvfs->z_os) && (zp->z_pflags & ZFS_PROJID) && sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { if (hdl == NULL) sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; goto error; } zp->z_projid = projid; zp->z_mode = ip->i_mode = mode; ip->i_generation = (uint32_t)tmp_gen; ip->i_blkbits = SPA_MINBLOCKSHIFT; set_nlink(ip, (uint32_t)links); zfs_uid_write(ip, z_uid); zfs_gid_write(ip, z_gid); zfs_set_inode_flags(zp, ip); /* Cache the xattr parent id */ if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; ZFS_TIME_DECODE(&ip->i_atime, atime); ZFS_TIME_DECODE(&ip->i_mtime, mtime); ZFS_TIME_DECODE(&ip->i_ctime, ctime); ZFS_TIME_DECODE(&zp->z_btime, btime); ip->i_ino = zp->z_id; zfs_znode_update_vfs(zp); zfs_inode_set_ops(zfsvfs, ip); /* * The only way insert_inode_locked() can fail is if the ip->i_ino * number is already hashed for this super block. This can never * happen because the inode numbers map 1:1 with the object numbers. * * Exceptions include rolling back a mounted file system, either * from the zfs rollback or zfs recv command. * * Active inodes are unhashed during the rollback, but since zrele * can happen asynchronously, we can't guarantee they've been * unhashed. This can cause hash collisions in unlinked drain * processing so do not hash unlinked znodes. 
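 *
 * That is why both insert_inode_locked() and the matching
 * unlock_new_inode() below are gated on links > 0: a znode with
 * no links (i.e. one sitting on the unlinked set) is instantiated
 * without ever being added to the inode hash.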
*/ if (links > 0) VERIFY3S(insert_inode_locked(ip), ==, 0); mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes++; mutex_exit(&zfsvfs->z_znodes_lock); if (links > 0) unlock_new_inode(ip); return (zp); error: iput(ip); return (NULL); } /* * Safely mark an inode dirty. Inodes which are part of a read-only * file system or snapshot may not be dirtied. */ void zfs_mark_inode_dirty(struct inode *ip) { zfsvfs_t *zfsvfs = ITOZSB(ip); if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return; mark_inode_dirty(ip); } static uint64_t empty_xattr; static uint64_t pad[4]; static zfs_acl_phys_t acl_phys; /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_TMPFILE - new object is of O_TMPFILE * IS_XATTR - new object is an attribute * acl_ids - ACL related attributes * * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE) * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; uint64_t projid = ZFS_DEFAULT_PROJID; uint64_t rdev = 0; zfsvfs_t *zfsvfs = ZTOZSB(dzp); dmu_buf_t *db; inode_timespec_t now; uint64_t gen, obj; int bonuslen; int dnodesize; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; sa_bulk_attr_t *sa_attrs; int cnt = 0; zfs_acl_locator_cb_t locate = { 0 }; znode_hold_t *zh; if (zfsvfs->z_replay) { obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ dnodesize = vap->va_fsid; /* ditto */ } else { obj = 0; gethrestime(&now); gen = dmu_tx_get_txg(tx); dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); } if (dnodesize == 0) dnodesize = DNODE_MIN_SIZE; obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; bonuslen = (obj_type == DMU_OT_SA) ? DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (S_ISDIR(vap->va_mode)) { if (zfsvfs->z_replay) { VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx)); } else { obj = zap_create_norm_dnsize(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx); } } else { if (zfsvfs->z_replay) { VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx)); } else { obj = dmu_object_alloc_dnsize(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx); } } zh = zfs_znode_hold_enter(zfsvfs, obj); VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_id = obj; } /* * If parent is an xattr, so am I. */ if (dzp->z_pflags & ZFS_XATTR) { flag |= IS_XATTR; } if (zfsvfs->z_use_fuids) pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; else pflags = 0; if (S_ISDIR(vap->va_mode)) { size = 2; /* contents ("." and "..") */ links = 2; } else { size = 0; links = (flag & IS_TMPFILE) ? 
0 : 1; } if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) rdev = vap->va_rdev; parent = dzp->z_id; mode = acl_ids->z_mode; if (flag & IS_XATTR) pflags |= ZFS_XATTR; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { /* * With ZFS_PROJID flag, we can easily know whether there is * project ID stored on disk or not. See zfs_space_delta_cb(). */ if (obj_type != DMU_OT_ZNODE && dmu_objset_projectquota_enabled(zfsvfs->z_os)) pflags |= ZFS_PROJID; /* * Inherit project ID from parent if required. */ projid = zfs_inherit_projid(dzp); if (dzp->z_pflags & ZFS_PROJINHERIT) pflags |= ZFS_PROJINHERIT; } /* * No execs denied will be determined when zfs_mode_compute() is called. */ pflags |= acl_ids->z_aclp->z_hints & (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); ZFS_TIME_ENCODE(&now, crtime); ZFS_TIME_ENCODE(&now, ctime); if (vap->va_mask & ATTR_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, atime); } else { ZFS_TIME_ENCODE(&now, atime); } if (vap->va_mask & ATTR_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); } else { ZFS_TIME_ENCODE(&now, mtime); } /* Now add in all of the "SA" attributes */ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* * Setup the array of attributes to be replaced/set on the new file * * order for DMU_OT_ZNODE is critical since it needs to be constructed * in the old znode_phys_t format. Don't change this ordering */ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); } else { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); } SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, &empty_xattr, 8); } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && pflags & ZFS_PROJID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8); } if (obj_type == DMU_OT_ZNODE || (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); 
SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, sizeof (uint64_t) * 4); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (zfs_acl_phys_t)); } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &acl_ids->z_aclp->z_acl_count, 8); locate.cb_aclp = acl_ids->z_aclp; SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, acl_ids->z_aclp->z_acl_bytes); mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, acl_ids->z_fuid, acl_ids->z_fgid); } VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); if (!(flag & IS_ROOT_NODE)) { /* * The call to zfs_znode_alloc() may fail if memory is low * via the call path: alloc_inode() -> inode_init_always() -> * security_inode_alloc() -> inode_alloc_security(). Since * the existing code is written such that zfs_mknode() can * not fail retry until sufficient memory has been reclaimed. */ do { *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); } while (*zpp == NULL); VERIFY(*zpp != NULL); VERIFY(dzp != NULL); } else { /* * If we are creating the root node, the "parent" we * passed in is the znode for the root. */ *zpp = dzp; (*zpp)->z_sa_hdl = sa_hdl; } (*zpp)->z_pflags = pflags; (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode; (*zpp)->z_dnodesize = dnodesize; (*zpp)->z_projid = projid; if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); zfs_znode_hold_exit(zfsvfs, zh); } /* * Update in-core attributes. It is assumed the caller will be doing an * sa_bulk_update to push the changes out. 
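The retry loop in zfs_mknode() above treats a failed zfs_znode_alloc() as transient memory pressure and simply tries again until the allocation succeeds. A hedged user-space sketch of that same shape, with sched_yield() standing in for whatever reclaim progress the kernel path relies on:

#include <sched.h>
#include <stdlib.h>
#include <string.h>

/* Retry a transient allocation failure instead of propagating it. */
static void *
demo_alloc_retry(size_t size)
{
    void *p;

    do {
        p = malloc(size);
        if (p == NULL)
            (void) sched_yield();   /* let reclaim make progress */
    } while (p == NULL);

    return (memset(p, 0, size));
}

int
main(void)
{
    free(demo_alloc_retry(4096));
    return (0);
}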
*/ void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; boolean_t update_inode = B_FALSE; xoap = xva_getxoptattr(xvap); ASSERT(xoap); if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), ×, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); update_inode = B_TRUE; } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); update_inode = B_TRUE; } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, xoap->xoa_av_quarantined, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { zfs_sa_set_scanstamp(zp, xvap, tx); XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_PROJINHERIT); } if (update_inode) zfs_set_inode_flags(zp, ZTOI(zp)); } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; znode_hold_t *zh; int err; sa_handle_t *hdl; *zpp = NULL; again: zh = zfs_znode_hold_enter(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * 
should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); /* * If zp->z_unlinked is set, the znode is already marked * for deletion and should not be discovered. Check this * after checking igrab() due to fsetxattr() & O_TMPFILE. * * If igrab() returns NULL the VFS has independently * determined the inode should be evicted and has * called iput_final() to start the eviction process. * The SA handle is still valid but because the VFS * requires that the eviction succeed we must drop * our locks and references to allow the eviction to * complete. The zfs_zget() may then be retried. * * This unlikely case could be optimized by registering * a sops->drop_inode() callback. The callback would * need to detect the active SA hold thereby informing * the VFS that this inode should not be evicted. */ if (igrab(ZTOI(zp)) == NULL) { if (zp->z_unlinked) err = SET_ERROR(ENOENT); else err = SET_ERROR(EAGAIN); } else { *zpp = zp; err = 0; } mutex_exit(&zp->z_lock); sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); if (err == EAGAIN) { /* inode might need this to finish evict */ cond_resched(); goto again; } return (err); } /* * Not found create new znode/vnode but only if file exists. * * There is a small window where zfs_vget() could * find this object while a file create is still in * progress. This is checked for in zfs_znode_alloc() * * if zfs_znode_alloc() fails it will drop the hold on the * bonus buffer. */ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { *zpp = zp; } zfs_znode_hold_exit(zfsvfs, zh); return (err); } int zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_object_info_t doi; dmu_buf_t *db; uint64_t obj_num = zp->z_id; uint64_t mode; uint64_t links; sa_bulk_attr_t bulk[11]; int err; int count = 0; uint64_t gen; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; uint64_t projid = ZFS_DEFAULT_PROJID; znode_hold_t *zh; /* * skip ctldir, otherwise they will always get invalidated. This will * cause funny behaviour for the mounted snapdirs. Especially for * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent * anyone automount it again as long as someone is still using the * detached mount. 
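When igrab() fails in zfs_zget() above, the inode is part-way through eviction, so the function drops every lock and reference, yields, and retries from the top rather than blocking the eviction. A simplified sketch of that retry-on-transient-failure pattern (the names are illustrative, not the real ZFS API):

#include <errno.h>
#include <sched.h>
#include <stdio.h>

/* Illustrative lookup: returns 0, or EAGAIN while the object is "evicting". */
static int
demo_lookup(int attempt)
{
    return (attempt < 3 ? EAGAIN : 0);
}

static int
demo_zget_like(void)
{
    int attempt = 0;
    int err;

again:
    err = demo_lookup(attempt++);
    if (err == EAGAIN) {
        /* Drop locks/references here, then let the evictor finish. */
        (void) sched_yield();
        goto again;
    }
    return (err);
}

int
main(void)
{
    printf("err=%d\n", demo_zget_like());
    return (0);
}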
*/ if (zp->z_is_ctldir) return (0); zh = zfs_znode_hold_enter(zfsvfs, obj_num); mutex_enter(&zp->z_acl_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } mutex_exit(&zp->z_acl_lock); rw_enter(&zp->z_xattr_lock, RW_WRITER); if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } rw_exit(&zp->z_xattr_lock); ASSERT(zp->z_sa_hdl == NULL); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EINVAL)); } zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); /* reload cached values */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, sizeof (gen)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, sizeof (z_uid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, sizeof (z_gid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EIO)); } if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8); if (err != 0 && err != ENOENT) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(err)); } } zp->z_projid = projid; zp->z_mode = ZTOI(zp)->i_mode = mode; zfs_uid_write(ZTOI(zp), z_uid); zfs_gid_write(ZTOI(zp), z_gid); ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime); ZFS_TIME_DECODE(&zp->z_btime, btime); if ((uint32_t)gen != ZTOI(zp)->i_generation) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EIO)); } set_nlink(ZTOI(zp), (uint32_t)links); zfs_set_inode_flags(zp, ZTOI(zp)); zp->z_blksz = doi.doi_data_block_size; zp->z_atime_dirty = B_FALSE; zfs_znode_update_vfs(zp); /* * If the file has zero links, then it has been unlinked on the send * side and it must be in the received unlinked set. * We call zfs_znode_dmu_fini() now to prevent any accesses to the * stale data and to prevent automatic removal of the file in * zfs_zinactive(). The file will be removed either when it is removed * on the send side and the next incremental stream is received or * when the unlinked set gets processed. 
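zfs_rezget() above applies two validation rules after reloading the SA attributes: a generation mismatch means the object number was recycled behind the cached znode (for example after a rollback) and is reported as EIO, while a link count of zero marks the znode as unlinked so no further on-disk access is attempted. A small sketch of those rules with hypothetical field names:

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct demo_znode {
    uint32_t cached_gen;    /* generation remembered by the in-core inode */
    bool     unlinked;
};

/* Returns 0 when the on-disk object still matches the cached znode. */
static int
demo_revalidate(struct demo_znode *zp, uint32_t disk_gen, uint32_t disk_links)
{
    if (disk_gen != zp->cached_gen)
        return (EIO);   /* object was reused; cached state is stale */

    /* Zero links: keep the znode but fence off further on-disk access. */
    zp->unlinked = (disk_links == 0);
    return (0);
}

int
main(void)
{
    struct demo_znode zp = { .cached_gen = 7, .unlinked = false };

    printf("%d %d\n", demo_revalidate(&zp, 7, 0), (int)zp.unlinked);
    printf("%d\n", demo_revalidate(&zp, 8, 1));
    return (0);
}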
*/ zp->z_unlinked = (ZTOI(zp)->i_nlink == 0); if (zp->z_unlinked) zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; uint64_t acl_obj = zfs_external_acl(zp); znode_hold_t *zh; zh = zfs_znode_hold_enter(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); VERIFY(0 == dmu_object_free(os, acl_obj, tx)); } VERIFY(0 == dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } void zfs_zinactive(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t z_id = zp->z_id; znode_hold_t *zh; ASSERT(zp->z_sa_hdl); /* * Don't allow a zfs_zget() while were trying to release this znode. */ zh = zfs_znode_hold_enter(zfsvfs, z_id); mutex_enter(&zp->z_lock); /* * If this was the last reference to a file with no links, remove * the file from the file system unless the file system is mounted * read-only. That can happen, for example, if the file system was * originally read-write, the file was opened, then unlinked and * the file system was made read-only before the file was finally * closed. The file will remain in the unlinked set. */ if (zp->z_unlinked) { ASSERT(!zfsvfs->z_issnap); if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) { mutex_exit(&zp->z_lock); zfs_znode_hold_exit(zfsvfs, zh); zfs_rmnode(zp); return; } } mutex_exit(&zp->z_lock); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } #if defined(HAVE_INODE_TIMESPEC64_TIMES) #define zfs_compare_timespec timespec64_compare #else #define zfs_compare_timespec timespec_compare #endif /* * Determine whether the znode's atime must be updated. The logic mostly * duplicates the Linux kernel's relatime_need_update() functionality. * This function is only called if the underlying filesystem actually has * atime updates enabled. */ boolean_t zfs_relatime_need_update(const struct inode *ip) { inode_timespec_t now; gethrestime(&now); /* * In relatime mode, only update the atime if the previous atime * is earlier than either the ctime or mtime or if at least a day * has passed since the last update of atime. */ if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) return (B_TRUE); if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0) return (B_TRUE); if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) return (B_TRUE); return (B_FALSE); } /* * Prepare to update znode time stamps. * * IN: zp - znode requiring timestamp update * flag - ATTR_MTIME, ATTR_CTIME flags * * OUT: zp - z_seq * mtime - new mtime * ctime - new ctime * * Note: We don't update atime here, because we rely on Linux VFS to do * atime updating. */ void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { inode_timespec_t now; gethrestime(&now); zp->z_seq++; if (flag & ATTR_MTIME) { ZFS_TIME_ENCODE(&now, mtime); ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); if (ZTOZSB(zp)->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); } } if (flag & ATTR_CTIME) { ZFS_TIME_ENCODE(&now, ctime); ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime); if (ZTOZSB(zp)->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. 
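zfs_relatime_need_update() above mirrors the kernel's relatime policy: refresh atime only when it is not newer than mtime or ctime, or when the cached atime is at least a day old. The same decision in a stand-alone sketch:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define DEMO_DAY_SECONDS    (24 * 60 * 60)

/* Compare two timespecs: <0, 0, >0, like the kernel's timespec64_compare(). */
static int
demo_ts_cmp(const struct timespec *a, const struct timespec *b)
{
    if (a->tv_sec != b->tv_sec)
        return (a->tv_sec < b->tv_sec ? -1 : 1);
    if (a->tv_nsec != b->tv_nsec)
        return (a->tv_nsec < b->tv_nsec ? -1 : 1);
    return (0);
}

static bool
demo_relatime_need_update(const struct timespec *atime,
    const struct timespec *mtime, const struct timespec *ctime_,
    const struct timespec *now)
{
    /* atime is not ahead of the last content or metadata change. */
    if (demo_ts_cmp(mtime, atime) >= 0)
        return (true);
    if (demo_ts_cmp(ctime_, atime) >= 0)
        return (true);
    /* Or the cached atime is at least a day old. */
    if (now->tv_sec - atime->tv_sec >= DEMO_DAY_SECONDS)
        return (true);
    return (false);
}

int
main(void)
{
    struct timespec at = { .tv_sec = 1000 }, mt = { .tv_sec = 1200 };
    struct timespec ct = { .tv_sec = 900 }, now = { .tv_sec = 1500 };

    printf("%d\n", demo_relatime_need_update(&at, &mt, &ct, &now));
    return (0);
}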
*/ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_size > zp->z_blksz) return; error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT0(error); /* What blocksize did we actually get? */ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); } /* * Increase the file length * * IN: zp - znode of file to free data in. * end - new end-of-file * * RETURN: 0 on success, error code on failure */ static int zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { zfs_rangelock_exit(lr); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); } else { newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); } else { newblksz = 0; } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } if (newblksz) zfs_grow_blocksize(zp, newblksz, tx); zp->z_size = end; VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), &zp->z_size, sizeof (zp->z_size), tx)); zfs_rangelock_exit(lr); dmu_tx_commit(tx); return (0); } /* * zfs_zero_partial_page - Modeled after update_pages() but * with different arguments and semantics for use by zfs_freesp(). * * Zeroes a piece of a single page cache entry for zp at offset * start and length len. * * Caller must acquire a range lock on the file for the region * being zeroed in order that the ARC and page cache stay in sync. */ static void zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) { struct address_space *mp = ZTOI(zp)->i_mapping; struct page *pp; int64_t off; void *pb; ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK)); off = start & (PAGE_SIZE - 1); start &= PAGE_MASK; pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); pb = kmap(pp); memset(pb + off, 0, len); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); SetPageUptodate(pp); ClearPageError(pp); unlock_page(pp); put_page(pp); } } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free. * * RETURN: 0 on success, error code on failure */ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zfs_locked_range_t *lr; int error; /* * Lock the range being freed. */ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. 
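The block-size growth in zfs_extend() above only applies while the file still fits in a single block: the block grows toward the dataset's recordsize ceiling, and a block that is already larger than recordsize (and therefore not a power of two) is only rounded up to the next power of two. A sketch of that size selection under those assumptions, with a hypothetical power-of-two helper:

#include <stdint.h>
#include <stdio.h>

#define DEMO_MIN(a, b)  ((a) < (b) ? (a) : (b))

/* Smallest power of two strictly greater than x (x > 0). */
static uint64_t
demo_next_pow2(uint64_t x)
{
    uint64_t p = 1;

    while (p <= x)
        p <<= 1;
    return (p);
}

/*
 * Pick the new block size when extending a single-block file to 'end' bytes.
 * 'cur' is the current block size, 'max' the recordsize ceiling.
 */
static uint64_t
demo_pick_blocksize(uint64_t end, uint64_t cur, uint64_t max)
{
    if (cur > max) {
        /* Already past recordsize: only round up to the next power of 2. */
        return (DEMO_MIN(end, demo_next_pow2(cur)));
    }
    return (DEMO_MIN(end, max));
}

int
main(void)
{
    printf("%llu\n", (unsigned long long)demo_pick_blocksize(200000, 4096, 131072));
    printf("%llu\n", (unsigned long long)demo_pick_blocksize(400000, 192512, 131072));
    return (0);
}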
*/ if (off >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } if (off + len > zp->z_size) len = zp->z_size - off; error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); /* * Zero partial page cache entries. This must be done under a * range lock in order to keep the ARC and page cache in sync. */ if (zn_has_cached_data(zp, off, off + len - 1)) { loff_t first_page, last_page, page_len; loff_t first_page_offset, last_page_offset; /* first possible full page in hole */ first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT; /* last page of hole */ last_page = (off + len) >> PAGE_SHIFT; /* offset of first_page */ first_page_offset = first_page << PAGE_SHIFT; /* offset of last_page */ last_page_offset = last_page << PAGE_SHIFT; /* truncate whole pages */ if (last_page_offset > first_page_offset) { truncate_inode_pages_range(ZTOI(zp)->i_mapping, first_page_offset, last_page_offset - 1); } /* truncate sub-page ranges */ if (first_page > last_page) { /* entire punched area within a single page */ zfs_zero_partial_page(zp, off, len); } else { /* beginning of punched area at the end of a page */ page_len = first_page_offset - off; if (page_len > 0) zfs_zero_partial_page(zp, off, page_len); /* end of punched area at the beginning of a page */ page_len = off + len - last_page_offset; if (page_len > 0) zfs_zero_partial_page(zp, last_page_offset, page_len); } } zfs_rangelock_exit(lr); return (error); } /* * Truncate a file * * IN: zp - znode of file to free data in. * end - new end-of-file. * * RETURN: 0 on success, error code on failure */ static int zfs_trunc(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; int error; sa_bulk_attr_t bulk[2]; int count = 0; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { zfs_rangelock_exit(lr); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } zp->z_size = end; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); if (end == 0) { zp->z_pflags &= ~ZFS_SPARSE; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); dmu_tx_commit(tx); zfs_rangelock_exit(lr); return (0); } /* * Free space in a file * * IN: zp - znode of file to free data in. * off - start of range * len - end of range (0 => EOF) * flag - current file open mode flags. 
* log - TRUE if this action should be logged * * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(zp); zilog_t *zilog = zfsvfs->z_log; uint64_t mode; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; int error; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, sizeof (mode))) != 0) return (error); if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; goto out; } if (len == 0) { error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) goto out; log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); zfs_znode_update_vfs(zp); error = 0; out: /* * Truncate the page cache - for file truncate operations, use * the purpose-built API for truncations. For punching operations, * the truncation is handled under a range lock in zfs_free_range. */ if (len == 0) truncate_setsize(ZTOI(zp), off); return (error); } void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { struct super_block *sb; zfsvfs_t *zfsvfs; uint64_t moid, obj, sa_obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int size; int error; int i; znode_t *rootzp = NULL; vattr_t vattr; znode_t *zp; zfs_acl_ids_t acl_ids; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT(error == 0); /* * Set starting attributes. */ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ uint64_t val; const char *name; ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); VERIFY(nvpair_value_uint64(elem, &val) == 0); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } ASSERT(error == 0); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) sense = val; } ASSERT(version != 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); ASSERT(error == 0); /* * Create zap object used for SA attribute registration */ if (version >= ZPL_VERSION_SA) { sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT(error == 0); } else { sa_obj = 0; } /* * Create a delete queue. 
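zfs_freesp() above is essentially a dispatcher: an offset beyond EOF becomes an extend, a zero length becomes a truncate, and anything else is a hole punch that may still need an extend if the punched range reaches past EOF. A compact sketch of just that routing decision, with stub operations in place of zfs_extend()/zfs_trunc()/zfs_free_range():

#include <stdint.h>
#include <stdio.h>

static int
demo_extend(uint64_t end)
{
    printf("extend to %llu\n", (unsigned long long)end);
    return (0);
}

static int
demo_trunc(uint64_t end)
{
    printf("truncate to %llu\n", (unsigned long long)end);
    return (0);
}

static int
demo_free_range(uint64_t off, uint64_t len)
{
    printf("punch %llu+%llu\n", (unsigned long long)off, (unsigned long long)len);
    return (0);
}

static int
demo_freesp(uint64_t size, uint64_t off, uint64_t len)
{
    int error;

    if (off > size)                     /* range starts past EOF */
        return (demo_extend(off + len));
    if (len == 0)                       /* len == 0 means "truncate at off" */
        return (demo_trunc(off));

    error = demo_free_range(off, len);  /* punch a hole */
    if (error == 0 && off + len > size) /* hole reaches past EOF: grow */
        error = demo_extend(off + len);
    return (error);
}

int
main(void)
{
    (void) demo_freesp(1000, 2000, 10);
    (void) demo_freesp(1000, 100, 0);
    (void) demo_freesp(1000, 900, 300);
    return (0);
}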
*/ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT(error == 0); /* * Create root znode. Create minimal znode/inode/zfsvfs/sb * to allow zfs_mknode to work. */ vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = crgetuid(cr); vattr.va_gid = crgetgid(cr); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); rootzp->z_unlinked = B_FALSE; rootzp->z_atime_dirty = B_FALSE; rootzp->z_is_sa = USE_SA(version, os); rootzp->z_pflags = 0; zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs->z_os = os; zfsvfs->z_parent = zfsvfs; zfsvfs->z_version = version; zfsvfs->z_use_fuids = USE_FUIDS(version, os); zfsvfs->z_use_sa = USE_SA(version, os); zfsvfs->z_norm = norm; sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP); sb->s_fs_info = zfsvfs; ZTOI(rootzp)->i_sb = sb; error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); ASSERT(error == 0); /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); zfsvfs->z_hold_size = size; zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, KM_SLEEP); zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); for (i = 0; i != size; i++) { avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); } VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids, zfs_init_idmap)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); zfs_acl_ids_free(&acl_ids); atomic_set(&ZTOI(rootzp)->i_count, 0); sa_handle_destroy(rootzp->z_sa_hdl); kmem_cache_free(znode_cache, rootzp); for (i = 0; i != size; i++) { avl_destroy(&zfsvfs->z_hold_trees[i]); mutex_destroy(&zfsvfs->z_hold_locks[i]); } mutex_destroy(&zfsvfs->z_znodes_lock); vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); kmem_free(sb, sizeof (struct super_block)); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } #endif /* _KERNEL */ static int zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) { uint64_t sa_obj = 0; int error; error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0 && error != ENOENT) return (error); error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); return (error); } static int zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, dmu_buf_t **db, const void *tag) { dmu_object_info_t doi; int error; if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) return (error); dmu_object_info_from_db(*db, &doi); if ((doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t))) { sa_buf_rele(*db, tag); return (SET_ERROR(ENOTSUP)); } error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); if (error != 0) { sa_buf_rele(*db, tag); return (error); } return (0); } static void zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, 
const void *tag) { sa_handle_destroy(hdl); sa_buf_rele(db, tag); } /* * Given an object number, return its parent object number and whether * or not the object is an extended attribute directory. */ static int zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, int *is_xattrdir) { uint64_t parent; uint64_t pflags; uint64_t mode; uint64_t parent_mode; sa_bulk_attr_t bulk[3]; sa_handle_t *sa_hdl; dmu_buf_t *sa_db; int count = 0; int error; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, &parent, sizeof (parent)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, &pflags, sizeof (pflags)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &mode, sizeof (mode)); if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) return (error); /* * When a link is removed its parent pointer is not changed and will * be invalid. There are two cases where a link is removed but the * file stays around, when it goes to the delete queue and when there * are additional links. */ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); if (error != 0) return (error); error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); if (error != 0) return (error); *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); /* * Extended attributes can be applied to files, directories, etc. * Otherwise the parent must be a directory. */ if (!*is_xattrdir && !S_ISDIR(parent_mode)) return (SET_ERROR(EINVAL)); *pobjp = parent; return (0); } /* * Given an object number, return some zpl level statistics */ static int zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, zfs_stat_t *sb) { sa_bulk_attr_t bulk[4]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, &sb->zs_mode, sizeof (sb->zs_mode)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, &sb->zs_gen, sizeof (sb->zs_gen)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, &sb->zs_links, sizeof (sb->zs_links)); SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, &sb->zs_ctime, sizeof (sb->zs_ctime)); return (sa_bulk_lookup(hdl, bulk, count)); } static int zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, sa_attr_type_t *sa_table, char *buf, int len) { sa_handle_t *sa_hdl; sa_handle_t *prevhdl = NULL; dmu_buf_t *prevdb = NULL; dmu_buf_t *sa_db = NULL; char *path = buf + len - 1; int error; *path = '\0'; sa_hdl = hdl; uint64_t deleteq_obj; VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); error = zap_lookup_int(osp, deleteq_obj, obj); if (error == 0) { return (ESTALE); } else if (error != ENOENT) { return (error); } for (;;) { uint64_t pobj = 0; char component[MAXNAMELEN + 2]; size_t complen; int is_xattrdir = 0; if (prevdb) { ASSERT(prevhdl != NULL); zfs_release_sa_handle(prevhdl, prevdb, FTAG); } if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) break; if (pobj == obj) { if (path[0] != '/') *--path = '/'; break; } component[0] = '/'; if (is_xattrdir) { strcpy(component + 1, ""); } else { error = zap_value_search(osp, pobj, obj, ZFS_DIRENT_OBJ(-1ULL), component + 1); if (error != 0) break; } complen = strlen(component); path -= complen; ASSERT(path >= buf); memcpy(path, component, complen); obj = pobj; if (sa_hdl != hdl) { prevhdl = sa_hdl; prevdb = sa_db; } error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); if (error != 0) { sa_hdl = prevhdl; sa_db = prevdb; break; } } if (sa_hdl != NULL && 
sa_hdl != hdl) { ASSERT(sa_db != NULL); zfs_release_sa_handle(sa_hdl, sa_db, FTAG); } if (error == 0) (void) memmove(buf, path, buf + len - path); return (error); } int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) { sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, char *buf, int len) { char *path = buf + len - 1; sa_attr_type_t *sa_table; sa_handle_t *hdl; dmu_buf_t *db; int error; *path = '\0'; error = zfs_sa_setup(osp, &sa_table); if (error != 0) return (error); error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); if (error != 0) return (error); error = zfs_obj_to_stats_impl(hdl, sa_table, sb); if (error != 0) { zfs_release_sa_handle(hdl, db, FTAG); return (error); } error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); zfs_release_sa_handle(hdl, db, FTAG); return (error); } /* * Read a property stored within the master node. */ int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { uint64_t *cached_copy = NULL; /* * Figure out where in the objset_t the cached copy would live, if it * is available for the requested property. */ if (os != NULL) { switch (prop) { case ZFS_PROP_VERSION: cached_copy = &os->os_version; break; case ZFS_PROP_NORMALIZE: cached_copy = &os->os_normalization; break; case ZFS_PROP_UTF8ONLY: cached_copy = &os->os_utf8only; break; case ZFS_PROP_CASE: cached_copy = &os->os_casesensitivity; break; default: break; } } if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { *value = *cached_copy; return (0); } /* * If the property wasn't cached, look up the file system's value for * the property. For the version property, we look up a slightly * different string. */ const char *pname; int error = ENOENT; if (prop == ZFS_PROP_VERSION) pname = ZPL_VERSION_STR; else pname = zfs_prop_to_name(prop); if (os != NULL) { ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); } if (error == ENOENT) { /* No value set, use the default value */ switch (prop) { case ZFS_PROP_VERSION: *value = ZPL_VERSION; break; case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: *value = 0; break; case ZFS_PROP_CASE: *value = ZFS_CASE_SENSITIVE; break; case ZFS_PROP_ACLTYPE: *value = ZFS_ACLTYPE_OFF; break; default: return (error); } error = 0; } /* * If one of the methods for getting the property value above worked, * copy it into the objset_t's cache. 
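zfs_get_zplprop() above tries a per-objset cached copy first, then the on-disk master node, and finally falls back to a built-in default when the property was never written, refreshing the cache with whichever value it found. The same lookup order in a small sketch with a hypothetical cache/ZAP interface:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_PROP_UNCACHED  UINT64_MAX  /* sentinel: cache slot not filled */

struct demo_objset {
    uint64_t cached_version;    /* hypothetical cached copy */
};

/* Pretend ZAP lookup: returns ENOENT when the property was never stored. */
static int
demo_zap_lookup(const char *name, uint64_t *value)
{
    (void) name;
    (void) value;
    return (ENOENT);
}

static int
demo_get_version(struct demo_objset *os, uint64_t *value)
{
    int error;

    /* 1. Cached copy, if it has been initialized. */
    if (os != NULL && os->cached_version != DEMO_PROP_UNCACHED) {
        *value = os->cached_version;
        return (0);
    }

    /* 2. On-disk value from the master node. */
    error = demo_zap_lookup("VERSION", value);

    /* 3. Fall back to the built-in default, then refresh the cache. */
    if (error == ENOENT) {
        *value = 5;     /* stand-in for the ZPL_VERSION default */
        error = 0;
    }
    if (error == 0 && os != NULL)
        os->cached_version = *value;
    return (error);
}

int
main(void)
{
    struct demo_objset os = { .cached_version = DEMO_PROP_UNCACHED };
    uint64_t v;

    (void) demo_get_version(&os, &v);
    printf("version=%llu cached=%llu\n", (unsigned long long)v,
        (unsigned long long)os.cached_version);
    return (0);
}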
*/ if (error == 0 && cached_copy != NULL) { *cached_copy = *value; } return (error); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_create_fs); EXPORT_SYMBOL(zfs_obj_to_path); /* CSTYLED */ module_param(zfs_object_mutex_size, uint, 0644); MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array"); module_param(zfs_unlink_suspend_progress, int, 0644); MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks " "(debug - leaks space into the unlinked set)"); #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c index 68a7de78f471..7786444fea35 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c @@ -1,664 +1,664 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2011 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * LLNL-CODE-403049. * Rewritten for Linux by: * Rohan Puri * Brian Behlendorf */ #include #include #include #include #include #include #include #include /* * Common open routine. Disallow any write access. */ static int zpl_common_open(struct inode *ip, struct file *filp) { - if (filp->f_mode & FMODE_WRITE) + if (blk_mode_is_open_write(filp->f_mode)) return (-EACCES); return (generic_file_open(ip, filp)); } /* * Get root directory contents. */ static int zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) { zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); int error = 0; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); if (!zpl_dir_emit_dots(filp, ctx)) goto out; if (ctx->pos == 2) { if (!zpl_dir_emit(ctx, ZFS_SNAPDIR_NAME, strlen(ZFS_SNAPDIR_NAME), ZFSCTL_INO_SNAPDIR, DT_DIR)) goto out; ctx->pos++; } if (ctx->pos == 3) { if (!zpl_dir_emit(ctx, ZFS_SHAREDIR_NAME, strlen(ZFS_SHAREDIR_NAME), ZFSCTL_INO_SHARES, DT_DIR)) goto out; ctx->pos++; } out: zpl_exit(zfsvfs, FTAG); return (error); } #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) static int zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir) { zpl_dir_context_t ctx = ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); int error; error = zpl_root_iterate(filp, &ctx); filp->f_pos = ctx.pos; return (error); } #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ /* * Get root directory attributes. 
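zpl_root_iterate() above uses the directory position as a tiny state machine: positions 0 and 1 are the dot entries, position 2 is the snapshot directory, position 3 is the shares directory, and anything beyond that means the listing is complete. A sketch of the same resumable, cursor-driven emission (demo names only, and a fixed entry table standing in for the real ZFS_SNAPDIR_NAME/ZFS_SHAREDIR_NAME constants):

#include <stdio.h>

/* Emit one entry; a return of 0 means the caller's buffer is full, stop early. */
static int
demo_emit(const char *name)
{
    printf("%s\n", name);
    return (1);
}

/* Resumable directory listing keyed off a position cursor. */
static void
demo_root_iterate(long *pos)
{
    static const char *entries[] = { ".", "..", "snapshot", "shares" };

    while (*pos < 4) {
        if (!demo_emit(entries[*pos]))
            return;     /* resume from *pos on the next call */
        (*pos)++;
    }
}

int
main(void)
{
    long pos = 0;

    demo_root_iterate(&pos);    /* may be called repeatedly by readdir */
    return (0);
}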
*/ static int #ifdef HAVE_IDMAP_IOPS_GETATTR zpl_root_getattr_impl(struct mnt_idmap *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #elif defined(HAVE_USERNS_IOPS_GETATTR) zpl_root_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #else zpl_root_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #endif { (void) request_mask, (void) query_flags; struct inode *ip = path->dentry->d_inode; #if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, ip, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, ip, stat); #else (void) user_ns; #endif #else generic_fillattr(ip, stat); #endif stat->atime = current_time(ip); return (0); } ZPL_GETATTR_WRAPPER(zpl_root_getattr); static struct dentry * zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { cred_t *cr = CRED(); struct inode *ip; int error; crhold(cr); error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); ASSERT3S(error, <=, 0); crfree(cr); if (error) { if (error == -ENOENT) return (d_splice_alias(NULL, dentry)); else return (ERR_PTR(error)); } return (d_splice_alias(ip, dentry)); } /* * The '.zfs' control directory file and inode operations. */ const struct file_operations zpl_fops_root = { .open = zpl_common_open, .llseek = generic_file_llseek, .read = generic_read_dir, #ifdef HAVE_VFS_ITERATE_SHARED .iterate_shared = zpl_root_iterate, #elif defined(HAVE_VFS_ITERATE) .iterate = zpl_root_iterate, #else .readdir = zpl_root_readdir, #endif }; const struct inode_operations zpl_ops_root = { .lookup = zpl_root_lookup, .getattr = zpl_root_getattr, }; static struct vfsmount * zpl_snapdir_automount(struct path *path) { int error; error = -zfsctl_snapshot_mount(path, 0); if (error) return (ERR_PTR(error)); /* * Rather than returning the new vfsmount for the snapshot we must * return NULL to indicate a mount collision. This is done because * the user space mount calls do_add_mount() which adds the vfsmount * to the name space. If we returned the new mount here it would be * added again to the vfsmount list resulting in list corruption. */ return (NULL); } /* * Negative dentries must always be revalidated so newly created snapshots * can be detected and automounted. Normal dentries should be kept because * as of the 3.18 kernel revaliding the mountpoint dentry will result in * the snapshot being immediately unmounted. */ static int #ifdef HAVE_D_REVALIDATE_NAMEIDATA zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i) #else zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) #endif { return (!!dentry->d_inode); } static dentry_operations_t zpl_dops_snapdirs = { /* * Auto mounting of snapshots is only supported for 2.6.37 and * newer kernels. Prior to this kernel the ops->follow_link() * callback was used as a hack to trigger the mount. The * resulting vfsmount was then explicitly grafted in to the * name space. While it might be possible to add compatibility * code to accomplish this it would require considerable care. 
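zpl_snapdir_revalidate() above boils down to one rule: a negative dentry (no inode attached) must always be looked up again so newly created snapshots become visible, while a positive dentry is trusted so the automounted snapshot is not unmounted by a revalidation. Expressed as a stand-alone predicate with a hypothetical dentry struct:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct demo_dentry {
    void *d_inode;      /* NULL for a negative dentry */
};

/* Returns true when the cached dentry may be reused without a new lookup. */
static bool
demo_snapdir_revalidate(const struct demo_dentry *dentry)
{
    return (dentry->d_inode != NULL);
}

int
main(void)
{
    struct demo_dentry hit = { .d_inode = (void *)1 };
    struct demo_dentry miss = { .d_inode = NULL };

    printf("%d %d\n", (int)demo_snapdir_revalidate(&hit),
        (int)demo_snapdir_revalidate(&miss));
    return (0);
}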
*/ .d_automount = zpl_snapdir_automount, .d_revalidate = zpl_snapdir_revalidate, }; static struct dentry * zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { fstrans_cookie_t cookie; cred_t *cr = CRED(); struct inode *ip = NULL; int error; crhold(cr); cookie = spl_fstrans_mark(); error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); ASSERT3S(error, <=, 0); spl_fstrans_unmark(cookie); crfree(cr); if (error && error != -ENOENT) return (ERR_PTR(error)); ASSERT(error == 0 || ip == NULL); d_clear_d_op(dentry); d_set_d_op(dentry, &zpl_dops_snapdirs); dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; return (d_splice_alias(ip, dentry)); } static int zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) { zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); fstrans_cookie_t cookie; char snapname[MAXNAMELEN]; boolean_t case_conflict; uint64_t id, pos; int error = 0; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); cookie = spl_fstrans_mark(); if (!zpl_dir_emit_dots(filp, ctx)) goto out; /* Start the position at 0 if it already emitted . and .. */ pos = (ctx->pos == 2 ? 0 : ctx->pos); while (error == 0) { dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, &pos, &case_conflict); dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); if (error) goto out; if (!zpl_dir_emit(ctx, snapname, strlen(snapname), ZFSCTL_INO_SHARES - id, DT_DIR)) goto out; ctx->pos = pos; } out: spl_fstrans_unmark(cookie); zpl_exit(zfsvfs, FTAG); if (error == -ENOENT) return (0); return (error); } #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) static int zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir) { zpl_dir_context_t ctx = ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); int error; error = zpl_snapdir_iterate(filp, &ctx); filp->f_pos = ctx.pos; return (error); } #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ static int #ifdef HAVE_IOPS_RENAME_USERNS zpl_snapdir_rename2(struct user_namespace *user_ns, struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) #elif defined(HAVE_IOPS_RENAME_IDMAP) zpl_snapdir_rename2(struct mnt_idmap *user_ns, struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) #else zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) #endif { cred_t *cr = CRED(); int error; /* We probably don't want to support renameat2(2) in ctldir */ if (flags) return (-EINVAL); crhold(cr); error = -zfsctl_snapdir_rename(sdip, dname(sdentry), tdip, dname(tdentry), cr, 0); ASSERT3S(error, <=, 0); crfree(cr); return (error); } #if (!defined(HAVE_RENAME_WANTS_FLAGS) && \ !defined(HAVE_IOPS_RENAME_USERNS) && \ !defined(HAVE_IOPS_RENAME_IDMAP)) static int zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) { return (zpl_snapdir_rename2(sdip, sdentry, tdip, tdentry, 0)); } #endif static int zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry) { cred_t *cr = CRED(); int error; crhold(cr); error = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0); ASSERT3S(error, <=, 0); crfree(cr); return (error); } static int #ifdef HAVE_IOPS_MKDIR_USERNS zpl_snapdir_mkdir(struct user_namespace *user_ns, struct inode *dip, struct dentry *dentry, umode_t mode) #elif defined(HAVE_IOPS_MKDIR_IDMAP) 
zpl_snapdir_mkdir(struct mnt_idmap *user_ns, struct inode *dip, struct dentry *dentry, umode_t mode) #else zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) #endif { cred_t *cr = CRED(); vattr_t *vap; struct inode *ip; int error; crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); #if (defined(HAVE_IOPS_MKDIR_USERNS) || defined(HAVE_IOPS_MKDIR_IDMAP)) zpl_vap_init(vap, dip, mode | S_IFDIR, cr, user_ns); #else zpl_vap_init(vap, dip, mode | S_IFDIR, cr, zfs_init_idmap); #endif error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); if (error == 0) { d_clear_d_op(dentry); d_set_d_op(dentry, &zpl_dops_snapdirs); d_instantiate(dentry, ip); } kmem_free(vap, sizeof (vattr_t)); ASSERT3S(error, <=, 0); crfree(cr); return (error); } /* * Get snapshot directory attributes. */ static int #ifdef HAVE_IDMAP_IOPS_GETATTR zpl_snapdir_getattr_impl(struct mnt_idmap *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #elif defined(HAVE_USERNS_IOPS_GETATTR) zpl_snapdir_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #else zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #endif { (void) request_mask, (void) query_flags; struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); #if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, ip, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, ip, stat); #else (void) user_ns; #endif #else generic_fillattr(ip, stat); #endif stat->nlink = stat->size = 2; dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os); if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { uint64_t snap_count; int err = zap_count( dmu_objset_pool(ds->ds_objset)->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (err != 0) { zpl_exit(zfsvfs, FTAG); return (-err); } stat->nlink += snap_count; } stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); stat->atime = current_time(ip); zpl_exit(zfsvfs, FTAG); return (0); } ZPL_GETATTR_WRAPPER(zpl_snapdir_getattr); /* * The '.zfs/snapshot' directory file operations. These mainly control * generating the list of available snapshots when doing an 'ls' in the * directory. See zpl_snapdir_readdir(). */ const struct file_operations zpl_fops_snapdir = { .open = zpl_common_open, .llseek = generic_file_llseek, .read = generic_read_dir, #ifdef HAVE_VFS_ITERATE_SHARED .iterate_shared = zpl_snapdir_iterate, #elif defined(HAVE_VFS_ITERATE) .iterate = zpl_snapdir_iterate, #else .readdir = zpl_snapdir_readdir, #endif }; /* * The '.zfs/snapshot' directory inode operations. These mainly control * creating an inode for a snapshot directory and initializing the needed * infrastructure to automount the snapshot. See zpl_snapdir_lookup(). 
*/ const struct inode_operations zpl_ops_snapdir = { .lookup = zpl_snapdir_lookup, .getattr = zpl_snapdir_getattr, #if (defined(HAVE_RENAME_WANTS_FLAGS) || \ defined(HAVE_IOPS_RENAME_USERNS) || \ defined(HAVE_IOPS_RENAME_IDMAP)) .rename = zpl_snapdir_rename2, #else .rename = zpl_snapdir_rename, #endif .rmdir = zpl_snapdir_rmdir, .mkdir = zpl_snapdir_mkdir, }; static struct dentry * zpl_shares_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { fstrans_cookie_t cookie; cred_t *cr = CRED(); struct inode *ip = NULL; int error; crhold(cr); cookie = spl_fstrans_mark(); error = -zfsctl_shares_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); ASSERT3S(error, <=, 0); spl_fstrans_unmark(cookie); crfree(cr); if (error) { if (error == -ENOENT) return (d_splice_alias(NULL, dentry)); else return (ERR_PTR(error)); } return (d_splice_alias(ip, dentry)); } static int zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) { fstrans_cookie_t cookie; cred_t *cr = CRED(); zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); znode_t *dzp; int error = 0; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); cookie = spl_fstrans_mark(); if (zfsvfs->z_shares_dir == 0) { zpl_dir_emit_dots(filp, ctx); goto out; } error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error) goto out; crhold(cr); error = -zfs_readdir(ZTOI(dzp), ctx, cr); crfree(cr); iput(ZTOI(dzp)); out: spl_fstrans_unmark(cookie); zpl_exit(zfsvfs, FTAG); ASSERT3S(error, <=, 0); return (error); } #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) static int zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir) { zpl_dir_context_t ctx = ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); int error; error = zpl_shares_iterate(filp, &ctx); filp->f_pos = ctx.pos; return (error); } #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ static int #ifdef HAVE_USERNS_IOPS_GETATTR zpl_shares_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #elif defined(HAVE_IDMAP_IOPS_GETATTR) zpl_shares_getattr_impl(struct mnt_idmap *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #else zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #endif { (void) request_mask, (void) query_flags; struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); znode_t *dzp; int error; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); if (zfsvfs->z_shares_dir == 0) { #if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, path->dentry->d_inode, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, path->dentry->d_inode, stat); #else (void) user_ns; #endif #else generic_fillattr(path->dentry->d_inode, stat); #endif stat->nlink = stat->size = 2; stat->atime = current_time(ip); zpl_exit(zfsvfs, FTAG); return (0); } error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error == 0) { #if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) error = -zfs_getattr_fast(user_ns, ZTOI(dzp), stat); #else error = -zfs_getattr_fast(kcred->user_ns, ZTOI(dzp), stat); #endif iput(ZTOI(dzp)); } zpl_exit(zfsvfs, FTAG); ASSERT3S(error, <=, 0); return (error); } ZPL_GETATTR_WRAPPER(zpl_shares_getattr); /* * The '.zfs/shares' directory file operations. 
*/ const struct file_operations zpl_fops_shares = { .open = zpl_common_open, .llseek = generic_file_llseek, .read = generic_read_dir, #ifdef HAVE_VFS_ITERATE_SHARED .iterate_shared = zpl_shares_iterate, #elif defined(HAVE_VFS_ITERATE) .iterate = zpl_shares_iterate, #else .readdir = zpl_shares_readdir, #endif }; /* * The '.zfs/shares' directory inode operations. */ const struct inode_operations zpl_ops_shares = { .lookup = zpl_shares_lookup, .getattr = zpl_shares_getattr, }; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index 73526db731c4..3caa0fc6c214 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -1,1392 +1,1391 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. */ #ifdef CONFIG_COMPAT #include #endif #include #include #include #include #include #include #include #if defined(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS) || \ defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO) #include #endif #ifdef HAVE_FILE_FADVISE #include #endif #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO #include #endif /* * When using fallocate(2) to preallocate space, inflate the requested * capacity check by 10% to account for the required metadata blocks. 
*/ static unsigned int zfs_fallocate_reserve_percent = 110; static int zpl_open(struct inode *ip, struct file *filp) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; error = generic_file_open(ip, filp); if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_release(struct inode *ip, struct file *filp) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); if (ITOZ(ip)->z_atime_dirty) zfs_mark_inode_dirty(ip); crhold(cr); error = -zfs_close(ip, filp->f_flags, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_iterate(struct file *filp, zpl_dir_context_t *ctx) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_readdir(file_inode(filp), ctx, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) static int zpl_readdir(struct file *filp, void *dirent, filldir_t filldir) { zpl_dir_context_t ctx = ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); int error; error = zpl_iterate(filp, &ctx); filp->f_pos = ctx.pos; return (error); } #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ #if defined(HAVE_FSYNC_WITHOUT_DENTRY) /* * Linux 2.6.35 - 3.0 API, * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed * redundant. The dentry is still accessible via filp->f_path.dentry, * and we are guaranteed that filp will never be NULL. */ static int zpl_fsync(struct file *filp, int datasync) { struct inode *inode = filp->f_mapping->host; cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(ITOZ(inode), datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_FILE_AIO_FSYNC static int zpl_aio_fsync(struct kiocb *kiocb, int datasync) { return (zpl_fsync(kiocb->ki_filp, datasync)); } #endif #elif defined(HAVE_FSYNC_RANGE) /* * Linux 3.1 API, * As of 3.1 the responsibility to call filemap_write_and_wait_range() has * been pushed down in to the .fsync() vfs hook. Additionally, the i_mutex * lock is no longer held by the caller, for zfs we don't require the lock * to be held so we don't acquire it. */ static int zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; znode_t *zp = ITOZ(inode); zfsvfs_t *zfsvfs = ITOZSB(inode); cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; /* * The variables z_sync_writes_cnt and z_async_writes_cnt work in * tandem so that sync writes can detect if there are any non-sync * writes going on and vice-versa. The "vice-versa" part to this logic * is located in zfs_putpage() where non-sync writes check if there are * any ongoing sync writes. If any sync and non-sync writes overlap, * we do a commit to complete the non-sync writes since the latter can * potentially take several seconds to complete and thus block sync * writes in the upcoming call to filemap_write_and_wait_range(). */ atomic_inc_32(&zp->z_sync_writes_cnt); /* * If the following check does not detect an overlapping non-sync write * (say because it's just about to start), then it is guaranteed that * the non-sync write will detect this sync write. 
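
A minimal userspace sketch of the handshake described in this comment, using C11 atomics and two pthreads in place of the kernel's z_sync_writes_cnt / z_async_writes_cnt fields. Because each side increments its own counter before reading the other's, at least one of the two must observe the overlap; the decrements are left out here to keep the assertion simple, and all names and the loop count are illustrative.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint sync_writes_cnt;	/* stands in for z_sync_writes_cnt */
static atomic_uint async_writes_cnt;	/* stands in for z_async_writes_cnt */
static atomic_uint observed;		/* how many sides saw the other */

static void *
sync_side(void *arg)
{
	(void) arg;
	atomic_fetch_add(&sync_writes_cnt, 1);	/* announce first ... */
	if (atomic_load(&async_writes_cnt) > 0)	/* ... then look */
		atomic_fetch_add(&observed, 1);
	return (NULL);
}

static void *
async_side(void *arg)
{
	(void) arg;
	atomic_fetch_add(&async_writes_cnt, 1);
	if (atomic_load(&sync_writes_cnt) > 0)
		atomic_fetch_add(&observed, 1);
	return (NULL);
}

int
main(void)
{
	for (int i = 0; i < 100000; i++) {
		pthread_t a, b;

		atomic_store(&sync_writes_cnt, 0);
		atomic_store(&async_writes_cnt, 0);
		atomic_store(&observed, 0);
		pthread_create(&a, NULL, sync_side, NULL);
		pthread_create(&b, NULL, async_side, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		/* Sequentially consistent "inc then check" forbids a double miss. */
		if (atomic_load(&observed) == 0) {
			printf("double miss after %d rounds\n", i);
			return (1);
		}
	}
	printf("at least one side detected the overlap in every round\n");
	return (0);
}

The kernel code additionally decrements the counters when each write finishes, so the guarantee only applies to writes whose windows actually overlap, which is exactly the case the comment is concerned with.
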
This is because we * always increment z_sync_writes_cnt / z_async_writes_cnt before doing * the check on z_async_writes_cnt / z_sync_writes_cnt here and in * zfs_putpage() respectively. */ if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { if ((error = zpl_enter(zfsvfs, FTAG)) != 0) { atomic_dec_32(&zp->z_sync_writes_cnt); return (error); } zil_commit(zfsvfs->z_log, zp->z_id); zpl_exit(zfsvfs, FTAG); } error = filemap_write_and_wait_range(inode->i_mapping, start, end); /* * The sync write is not complete yet but we decrement * z_sync_writes_cnt since zfs_fsync() increments and decrements * it internally. If a non-sync write starts just after the decrement * operation but before we call zfs_fsync(), it may not detect this * overlapping sync write but it does not matter since we have already * gone past filemap_write_and_wait_range() and we won't block due to * the non-sync write. */ atomic_dec_32(&zp->z_sync_writes_cnt); if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(zp, datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_FILE_AIO_FSYNC static int zpl_aio_fsync(struct kiocb *kiocb, int datasync) { return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync)); } #endif #else #error "Unsupported fops->fsync() implementation" #endif static inline int zfs_io_flags(struct kiocb *kiocb) { int flags = 0; #if defined(IOCB_DSYNC) if (kiocb->ki_flags & IOCB_DSYNC) flags |= O_DSYNC; #endif #if defined(IOCB_SYNC) if (kiocb->ki_flags & IOCB_SYNC) flags |= O_SYNC; #endif #if defined(IOCB_APPEND) if (kiocb->ki_flags & IOCB_APPEND) flags |= O_APPEND; #endif #if defined(IOCB_DIRECT) if (kiocb->ki_flags & IOCB_DIRECT) flags |= O_DIRECT; #endif return (flags); } /* * If relatime is enabled, call file_accessed() if zfs_relatime_need_update() * is true. This is needed since datasets with inherited "relatime" property * aren't necessarily mounted with the MNT_RELATIME flag (e.g. after * `zfs set relatime=...`), which is what relatime test in VFS by * relatime_need_update() is based on. */ static inline void zpl_file_accessed(struct file *filp) { struct inode *ip = filp->f_mapping->host; if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) { if (zfs_relatime_need_update(ip)) file_accessed(filp); } else { file_accessed(filp); } } #if defined(HAVE_VFS_RW_ITERATE) /* * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports * iovecs, kvevs, bvecs and pipes, plus all the required interfaces to * manipulate the iov_iter are available. In which case the full iov_iter * can be attached to the uio and correctly handled in the lower layers. * Otherwise, for older kernels extract the iovec and pass it instead. */ static void zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, loff_t pos, ssize_t count, size_t skip) { #if defined(HAVE_VFS_IOV_ITER) zfs_uio_iov_iter_init(uio, to, pos, count, skip); #else -#ifdef HAVE_IOV_ITER_TYPE - zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos, - iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, + zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, + zfs_uio_iov_iter_type(to) & ITER_KVEC ? + UIO_SYSSPACE : UIO_USERSPACE, count, skip); -#else - zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos, - to->type & ITER_KVEC ? 
UIO_SYSSPACE : UIO_USERSPACE, - count, skip); -#endif #endif } static ssize_t zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) { cred_t *cr = CRED(); fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; ssize_t count = iov_iter_count(to); zfs_uio_t uio; zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); crhold(cr); cookie = spl_fstrans_mark(); int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); if (error < 0) return (error); ssize_t read = count - uio.uio_resid; kiocb->ki_pos += read; zpl_file_accessed(filp); return (read); } static inline ssize_t zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, size_t *countp) { #ifdef HAVE_GENERIC_WRITE_CHECKS_KIOCB ssize_t ret = generic_write_checks(kiocb, from); if (ret <= 0) return (ret); *countp = ret; #else struct file *file = kiocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *ip = mapping->host; int isblk = S_ISBLK(ip->i_mode); *countp = iov_iter_count(from); ssize_t ret = generic_write_checks(file, &kiocb->ki_pos, countp, isblk); if (ret) return (ret); #endif return (0); } static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { cred_t *cr = CRED(); fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; zfs_uio_t uio; size_t count = 0; ssize_t ret; ret = zpl_generic_write_checks(kiocb, from, &count); if (ret) return (ret); zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); crhold(cr); cookie = spl_fstrans_mark(); int error = -zfs_write(ITOZ(ip), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); if (error < 0) return (error); ssize_t wrote = count - uio.uio_resid; kiocb->ki_pos += wrote; return (wrote); } #else /* !HAVE_VFS_RW_ITERATE */ static ssize_t zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { cred_t *cr = CRED(); fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; size_t count; ssize_t ret; ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (ret) return (ret); zfs_uio_t uio; zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, count, 0); crhold(cr); cookie = spl_fstrans_mark(); int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); if (error < 0) return (error); ssize_t read = count - uio.uio_resid; kiocb->ki_pos += read; zpl_file_accessed(filp); return (read); } static ssize_t zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { cred_t *cr = CRED(); fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; size_t count; ssize_t ret; ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); if (ret) return (ret); ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode)); if (ret) return (ret); kiocb->ki_pos = pos; zfs_uio_t uio; zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, count, 0); crhold(cr); cookie = spl_fstrans_mark(); int error = -zfs_write(ITOZ(ip), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); if (error < 0) return (error); ssize_t wrote = count - uio.uio_resid; kiocb->ki_pos += wrote; return (wrote); } #endif /* HAVE_VFS_RW_ITERATE */ #if defined(HAVE_VFS_RW_ITERATE) static ssize_t zpl_direct_IO_impl(int rw, 
struct kiocb *kiocb, struct iov_iter *iter) { if (rw == WRITE) return (zpl_iter_write(kiocb, iter)); else return (zpl_iter_read(kiocb, iter)); } #if defined(HAVE_VFS_DIRECT_IO_ITER) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) { return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { ASSERT3S(pos, ==, kiocb->ki_pos); return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { ASSERT3S(pos, ==, kiocb->ki_pos); return (zpl_direct_IO_impl(rw, kiocb, iter)); } #else #error "Unknown direct IO interface" #endif #else /* HAVE_VFS_RW_ITERATE */ #if defined(HAVE_VFS_DIRECT_IO_IOVEC) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { if (rw == WRITE) return (zpl_aio_write(kiocb, iov, nr_segs, pos)); else return (zpl_aio_read(kiocb, iov, nr_segs, pos)); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { const struct iovec *iovp = iov_iter_iovec(iter); unsigned long nr_segs = iter->nr_segs; ASSERT3S(pos, ==, kiocb->ki_pos); if (rw == WRITE) return (zpl_aio_write(kiocb, iovp, nr_segs, pos)); else return (zpl_aio_read(kiocb, iovp, nr_segs, pos)); } #else #error "Unknown direct IO interface" #endif #endif /* HAVE_VFS_RW_ITERATE */ static loff_t zpl_llseek(struct file *filp, loff_t offset, int whence) { #if defined(SEEK_HOLE) && defined(SEEK_DATA) fstrans_cookie_t cookie; if (whence == SEEK_DATA || whence == SEEK_HOLE) { struct inode *ip = filp->f_mapping->host; loff_t maxbytes = ip->i_sb->s_maxbytes; loff_t error; spl_inode_lock_shared(ip); cookie = spl_fstrans_mark(); error = -zfs_holey(ITOZ(ip), whence, &offset); spl_fstrans_unmark(cookie); if (error == 0) error = lseek_execute(filp, ip, offset, maxbytes); spl_inode_unlock_shared(ip); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ return (generic_file_llseek(filp, offset, whence)); } /* * It's worth taking a moment to describe how mmap is implemented * for zfs because it differs considerably from other Linux filesystems. * However, this issue is handled the same way under OpenSolaris. * * The issue is that by design zfs bypasses the Linux page cache and * leaves all caching up to the ARC. This has been shown to work * well for the common read(2)/write(2) case. However, mmap(2) * is problem because it relies on being tightly integrated with the * page cache. To handle this we cache mmap'ed files twice, once in * the ARC and a second time in the page cache. The code is careful * to keep both copies synchronized. * * When a file with an mmap'ed region is written to using write(2) * both the data in the ARC and existing pages in the page cache * are updated. For a read(2) data will be read first from the page * cache then the ARC if needed. Neither a write(2) or read(2) will * will ever result in new pages being added to the page cache. * * New pages are added to the page cache only via .readpage() which * is called when the vfs needs to read a page off disk to back the * virtual memory region. These pages may be modified without * notifying the ARC and will be written out periodically via * .writepage(). This will occur due to either a sync or the usual * page aging behavior. 
Note because a read(2) of a mmap'ed file * will always check the page cache first even when the ARC is out * of date correct data will still be returned. * * While this implementation ensures correct behavior it does have * have some drawbacks. The most obvious of which is that it * increases the required memory footprint when access mmap'ed * files. It also adds additional complexity to the code keeping * both caches synchronized. * * Longer term it may be possible to cleanly resolve this wart by * mapping page cache pages directly on to the ARC buffers. The * Linux address space operations are flexible enough to allow * selection of which pages back a particular index. The trick * would be working out the details of which subsystem is in * charge, the ARC, the page cache, or both. It may also prove * helpful to move the ARC buffers to a scatter-gather lists * rather than a vmalloc'ed region. */ static int zpl_mmap(struct file *filp, struct vm_area_struct *vma) { struct inode *ip = filp->f_mapping->host; int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start, (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags); spl_fstrans_unmark(cookie); if (error) return (error); error = generic_file_mmap(filp, vma); if (error) return (error); #if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) znode_t *zp = ITOZ(ip); mutex_enter(&zp->z_lock); zp->z_is_mapped = B_TRUE; mutex_exit(&zp->z_lock); #endif return (error); } /* * Populate a page with data for the Linux page cache. This function is * only used to support mmap(2). There will be an identical copy of the * data in the ARC which is kept up to date via .write() and .writepage(). */ static inline int zpl_readpage_common(struct page *pp) { fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); cookie = spl_fstrans_mark(); int error = -zfs_getpage(pp->mapping->host, pp); spl_fstrans_unmark(cookie); unlock_page(pp); return (error); } #ifdef HAVE_VFS_READ_FOLIO static int zpl_read_folio(struct file *filp, struct folio *folio) { return (zpl_readpage_common(&folio->page)); } #else static int zpl_readpage(struct file *filp, struct page *pp) { return (zpl_readpage_common(pp)); } #endif static int zpl_readpage_filler(void *data, struct page *pp) { return (zpl_readpage_common(pp)); } /* * Populate a set of pages with data for the Linux page cache. This * function will only be called for read ahead and never for demand * paging. For simplicity, the code relies on read_cache_pages() to * correctly lock each page for IO and call zpl_readpage(). 
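
One user-visible consequence of the double caching described above is that data written with write(2) must show up through an existing mmap(2) mapping of the same range, and data stored through the mapping must be visible to read(2). The short POSIX program below checks that property; it is not ZFS-specific, the file name is arbitrary, and it only illustrates the behavior the synchronization described above preserves.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "coherence-test.dat";	/* illustrative path */
	char buf[16];
	int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0 || ftruncate(fd, 4096) != 0) {
		perror("setup");
		return (1);
	}

	char *map = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return (1);
	}

	/* Update through write(2); the mapping must observe it. */
	if (pwrite(fd, "via write(2)", 12, 0) != 12) {
		perror("pwrite");
		return (1);
	}
	printf("mapping sees: %.12s\n", map);

	/* Update through the mapping; read(2) must observe it. */
	memcpy(map, "via mmap(2)!", 12);
	if (pread(fd, buf, 12, 0) != 12) {
		perror("pread");
		return (1);
	}
	printf("pread sees:   %.12s\n", buf);

	munmap(map, 4096);
	close(fd);
	unlink(path);
	return (0);
}
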
*/ #ifdef HAVE_VFS_READPAGES static int zpl_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL)); } #else static void zpl_readahead(struct readahead_control *ractl) { struct page *page; while ((page = readahead_page(ractl)) != NULL) { int ret; ret = zpl_readpage_filler(NULL, page); put_page(page); if (ret) break; } } #endif static int zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) { boolean_t *for_sync = data; fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); ASSERT(!PageWriteback(pp)); cookie = spl_fstrans_mark(); (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); spl_fstrans_unmark(cookie); return (0); } #ifdef HAVE_WRITEPAGE_T_FOLIO static int zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) { (void) zpl_putpage(&pp->page, wbc, data); return (0); } #endif static inline int zpl_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, void *data) { int result; #ifdef HAVE_WRITEPAGE_T_FOLIO result = write_cache_pages(mapping, wbc, zpl_putfolio, data); #else result = write_cache_pages(mapping, wbc, zpl_putpage, data); #endif return (result); } static int zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) { znode_t *zp = ITOZ(mapping->host); zfsvfs_t *zfsvfs = ITOZSB(mapping->host); enum writeback_sync_modes sync_mode; int result; if ((result = zpl_enter(zfsvfs, FTAG)) != 0) return (result); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; zpl_exit(zfsvfs, FTAG); sync_mode = wbc->sync_mode; /* * We don't want to run write_cache_pages() in SYNC mode here, because * that would make putpage() wait for a single page to be committed to * disk every single time, resulting in atrocious performance. Instead * we run it once in non-SYNC mode so that the ZIL gets all the data, * and then we commit it all in one go. */ boolean_t for_sync = (sync_mode == WB_SYNC_ALL); wbc->sync_mode = WB_SYNC_NONE; result = zpl_write_cache_pages(mapping, wbc, &for_sync); if (sync_mode != wbc->sync_mode) { if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (result); if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, zp->z_id); zpl_exit(zfsvfs, FTAG); /* * We need to call write_cache_pages() again (we can't just * return after the commit) because the previous call in * non-SYNC mode does not guarantee that we got all the dirty * pages (see the implementation of write_cache_pages() for * details). That being said, this is a no-op in most cases. */ wbc->sync_mode = sync_mode; result = zpl_write_cache_pages(mapping, wbc, &for_sync); } return (result); } /* * Write out dirty pages to the ARC, this function is only required to * support mmap(2). Mapped pages may be dirtied by memory operations * which never call .write(). These dirty pages are kept in sync with * the ARC buffers via this hook. */ static int zpl_writepage(struct page *pp, struct writeback_control *wbc) { if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL); return (zpl_putpage(pp, wbc, &for_sync)); } /* * The flag combination which matches the behavior of zfs_space() is * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE * flag was introduced in the 2.6.38 kernel. 
* * The original mode=0 (allocate space) behavior can be reasonably emulated * by checking if enough space exists and creating a sparse file, as real * persistent space reservation is not possible due to COW, snapshots, etc. */ static long zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len) { cred_t *cr = CRED(); loff_t olen; fstrans_cookie_t cookie; int error = 0; int test_mode = FALLOC_FL_PUNCH_HOLE; #ifdef HAVE_FALLOC_FL_ZERO_RANGE test_mode |= FALLOC_FL_ZERO_RANGE; #endif if ((mode & ~(FALLOC_FL_KEEP_SIZE | test_mode)) != 0) return (-EOPNOTSUPP); if (offset < 0 || len <= 0) return (-EINVAL); spl_inode_lock(ip); olen = i_size_read(ip); crhold(cr); cookie = spl_fstrans_mark(); if (mode & (test_mode)) { flock64_t bf; if (mode & FALLOC_FL_KEEP_SIZE) { if (offset > olen) goto out_unmark; if (offset + len > olen) len = olen - offset; } bf.l_type = F_WRLCK; bf.l_whence = SEEK_SET; bf.l_start = offset; bf.l_len = len; bf.l_pid = 0; error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr); } else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) { unsigned int percent = zfs_fallocate_reserve_percent; struct kstatfs statfs; /* Legacy mode, disable fallocate compatibility. */ if (percent == 0) { error = -EOPNOTSUPP; goto out_unmark; } /* * Use zfs_statvfs() instead of dmu_objset_space() since it * also checks project quota limits, which are relevant here. */ error = zfs_statvfs(ip, &statfs); if (error) goto out_unmark; /* * Shrink available space a bit to account for overhead/races. * We know the product previously fit into availbytes from * dmu_objset_space(), so the smaller product will also fit. */ if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) { error = -ENOSPC; goto out_unmark; } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen) error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE); } out_unmark: spl_fstrans_unmark(cookie); spl_inode_unlock(ip); crfree(cr); return (error); } static long zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len) { return zpl_fallocate_common(file_inode(filp), mode, offset, len); } static int zpl_ioctl_getversion(struct file *filp, void __user *arg) { uint32_t generation = file_inode(filp)->i_generation; return (copy_to_user(arg, &generation, sizeof (generation))); } #ifdef HAVE_FILE_FADVISE static int zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) { struct inode *ip = file_inode(filp); znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os = zfsvfs->z_os; int error = 0; if (S_ISFIFO(ip->i_mode)) return (-ESPIPE); if (offset < 0 || len < 0) return (-EINVAL); if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_WILLNEED: #ifdef HAVE_GENERIC_FADVISE if (zn_has_cached_data(zp, offset, offset + len - 1)) error = generic_fadvise(filp, offset, len, advice); #endif /* * Pass on the caller's size directly, but note that * dmu_prefetch_max will effectively cap it. If there * really is a larger sequential access pattern, perhaps * dmu_zfetch will detect it. 
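
Coming back to the capacity test in zpl_fallocate_common() above: with the default zfs_fallocate_reserve_percent of 110, a request only passes if it fits into roughly 100/110 of the space statfs reports as free. The sketch below reruns that arithmetic in userspace with made-up statfs numbers; only the shape of the comparison is taken from the code above.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t f_bavail = 1000000;	/* free blocks (example value) */
	uint64_t f_bsize = 4096;	/* block size (example value) */
	unsigned int percent = 110;	/* zfs_fallocate_reserve_percent */
	uint64_t len = 3900000000ULL;	/* requested fallocate length */

	/*
	 * Same shape as the kernel test:
	 *   len > f_bavail * (f_bsize * 100 / percent)
	 * i.e. each free block only counts for ~90.9% of its size.
	 */
	uint64_t usable_per_block = f_bsize * 100 / percent;	/* 3723 */
	uint64_t usable = f_bavail * usable_per_block;	/* ~3.72 GB */

	printf("raw available: %llu bytes\n",
	    (unsigned long long)(f_bavail * f_bsize));
	printf("after %u%% reserve: %llu bytes\n", percent,
	    (unsigned long long)usable);
	printf("request of %llu bytes %s\n", (unsigned long long)len,
	    len > usable ? "would get ENOSPC" : "passes the check");
	return (0);
}

As the code above notes, setting zfs_fallocate_reserve_percent to 0 switches to the legacy behavior and disables this emulation entirely.
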
*/ if (len == 0) len = i_size_read(ip) - offset; dmu_prefetch(os, zp->z_id, 0, offset, len, ZIO_PRIORITY_ASYNC_READ); break; case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: case POSIX_FADV_DONTNEED: case POSIX_FADV_NOREUSE: /* ignored for now */ break; default: error = -EINVAL; break; } zfs_exit(zfsvfs, FTAG); return (error); } #endif /* HAVE_FILE_FADVISE */ #define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) #define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) static uint32_t __zpl_ioctl_getflags(struct inode *ip) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; uint32_t ioctl_flags = 0; if (zfs_flags & ZFS_IMMUTABLE) ioctl_flags |= FS_IMMUTABLE_FL; if (zfs_flags & ZFS_APPENDONLY) ioctl_flags |= FS_APPEND_FL; if (zfs_flags & ZFS_NODUMP) ioctl_flags |= FS_NODUMP_FL; if (zfs_flags & ZFS_PROJINHERIT) ioctl_flags |= ZFS_PROJINHERIT_FL; return (ioctl_flags & ZFS_FL_USER_VISIBLE); } /* * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file * attributes common to both Linux and Solaris are mapped. */ static int zpl_ioctl_getflags(struct file *filp, void __user *arg) { uint32_t flags; int err; flags = __zpl_ioctl_getflags(file_inode(filp)); err = copy_to_user(arg, &flags, sizeof (flags)); return (err); } /* * fchange() is a helper macro to detect if we have been asked to change a * flag. This is ugly, but the requirement that we do this is a consequence of * how the Linux file attribute interface was designed. Another consequence is * that concurrent modification of files suffers from a TOCTOU race. Neither * are things we can fix without modifying the kernel-userland interface, which * is outside of our jurisdiction. */ #define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1))) static int __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; xoptattr_t *xoap; if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | ZFS_PROJINHERIT_FL)) return (-EOPNOTSUPP); if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE) return (-EACCES); if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) || fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) && !capable(CAP_LINUX_IMMUTABLE)) return (-EPERM); if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EACCES); xva_init(xva); xoap = xva_getxoptattr(xva); #define FLAG_CHANGE(iflag, zflag, xflag, xfield) do { \ if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) || \ ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) { \ XVA_SET_REQ(xva, (xflag)); \ (xfield) = ((ioctl_flags & (iflag)) != 0); \ } \ } while (0) FLAG_CHANGE(FS_IMMUTABLE_FL, ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable); FLAG_CHANGE(FS_APPEND_FL, ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly); FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump); FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT, xoap->xoa_projinherit); #undef FLAG_CHANGE return (0); } static int zpl_ioctl_setflags(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); uint32_t flags; cred_t *cr = CRED(); xvattr_t xva; int err; fstrans_cookie_t cookie; if (copy_from_user(&flags, arg, sizeof (flags))) return (-EFAULT); err = __zpl_ioctl_setflags(ip, flags, &xva); if (err) return (err); crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); return (err); } static int zpl_ioctl_getxattr(struct file *filp, void __user 
*arg) { zfsxattr_t fsx = { 0 }; struct inode *ip = file_inode(filp); int err; fsx.fsx_xflags = __zpl_ioctl_getflags(ip); fsx.fsx_projid = ITOZ(ip)->z_projid; err = copy_to_user(arg, &fsx, sizeof (fsx)); return (err); } static int zpl_ioctl_setxattr(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); zfsxattr_t fsx; cred_t *cr = CRED(); xvattr_t xva; xoptattr_t *xoap; int err; fstrans_cookie_t cookie; if (copy_from_user(&fsx, arg, sizeof (fsx))) return (-EFAULT); if (!zpl_is_valid_projid(fsx.fsx_projid)) return (-EINVAL); err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva); if (err) return (err); xoap = xva_getxoptattr(&xva); XVA_SET_REQ(&xva, XAT_PROJID); xoap->xoa_projid = fsx.fsx_projid; crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); return (err); } /* * Expose Additional File Level Attributes of ZFS. */ static int zpl_ioctl_getdosflags(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); uint64_t dosflags = ITOZ(ip)->z_pflags; dosflags &= ZFS_DOS_FL_USER_VISIBLE; int err = copy_to_user(arg, &dosflags, sizeof (dosflags)); return (err); } static int __zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; xoptattr_t *xoap; if (ioctl_flags & (~ZFS_DOS_FL_USER_VISIBLE)) return (-EOPNOTSUPP); if ((fchange(ioctl_flags, zfs_flags, ZFS_IMMUTABLE, ZFS_IMMUTABLE) || fchange(ioctl_flags, zfs_flags, ZFS_APPENDONLY, ZFS_APPENDONLY)) && !capable(CAP_LINUX_IMMUTABLE)) return (-EPERM); if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EACCES); xva_init(xva); xoap = xva_getxoptattr(xva); #define FLAG_CHANGE(iflag, xflag, xfield) do { \ if (((ioctl_flags & (iflag)) && !(zfs_flags & (iflag))) || \ ((zfs_flags & (iflag)) && !(ioctl_flags & (iflag)))) { \ XVA_SET_REQ(xva, (xflag)); \ (xfield) = ((ioctl_flags & (iflag)) != 0); \ } \ } while (0) FLAG_CHANGE(ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable); FLAG_CHANGE(ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly); FLAG_CHANGE(ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump); FLAG_CHANGE(ZFS_READONLY, XAT_READONLY, xoap->xoa_readonly); FLAG_CHANGE(ZFS_HIDDEN, XAT_HIDDEN, xoap->xoa_hidden); FLAG_CHANGE(ZFS_SYSTEM, XAT_SYSTEM, xoap->xoa_system); FLAG_CHANGE(ZFS_ARCHIVE, XAT_ARCHIVE, xoap->xoa_archive); FLAG_CHANGE(ZFS_NOUNLINK, XAT_NOUNLINK, xoap->xoa_nounlink); FLAG_CHANGE(ZFS_REPARSE, XAT_REPARSE, xoap->xoa_reparse); FLAG_CHANGE(ZFS_OFFLINE, XAT_OFFLINE, xoap->xoa_offline); FLAG_CHANGE(ZFS_SPARSE, XAT_SPARSE, xoap->xoa_sparse); #undef FLAG_CHANGE return (0); } /* * Set Additional File Level Attributes of ZFS. 
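
The fchange() macro and the FLAG_CHANGE() helpers above reduce to one question per bit: is this flag set on one side and clear on the other? The standalone sketch below exercises the same macro with two made-up flag bits; in the DOS-flags case above the same bit value appears on both sides of the comparison, which is what the demo mimics.

#include <stdio.h>

#define	fchange(f0, f1, b0, b1)	(!((f0) & (b0)) != !((f1) & (b1)))

#define	DEMO_IMMUTABLE	0x1	/* arbitrary stand-ins for real flag bits */
#define	DEMO_APPEND	0x2

int
main(void)
{
	unsigned int current = DEMO_IMMUTABLE;	/* flags on disk today */
	unsigned int requested = DEMO_APPEND;	/* what userland asked for */

	/* IMMUTABLE: set now, clear in the request -> change detected. */
	printf("immutable changed: %d\n",
	    fchange(requested, current, DEMO_IMMUTABLE, DEMO_IMMUTABLE));

	/* APPEND: clear now, set in the request -> change detected. */
	printf("append changed:    %d\n",
	    fchange(requested, current, DEMO_APPEND, DEMO_APPEND));

	/* Same state on both sides -> no change, nothing to update. */
	requested |= DEMO_IMMUTABLE;
	printf("immutable changed: %d\n",
	    fchange(requested, current, DEMO_IMMUTABLE, DEMO_IMMUTABLE));
	return (0);
}
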
*/ static int zpl_ioctl_setdosflags(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); uint64_t dosflags; cred_t *cr = CRED(); xvattr_t xva; int err; fstrans_cookie_t cookie; if (copy_from_user(&dosflags, arg, sizeof (dosflags))) return (-EFAULT); err = __zpl_ioctl_setdosflags(ip, dosflags, &xva); if (err) return (err); crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); return (err); } static long zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case FS_IOC_GETVERSION: return (zpl_ioctl_getversion(filp, (void *)arg)); case FS_IOC_GETFLAGS: return (zpl_ioctl_getflags(filp, (void *)arg)); case FS_IOC_SETFLAGS: return (zpl_ioctl_setflags(filp, (void *)arg)); case ZFS_IOC_FSGETXATTR: return (zpl_ioctl_getxattr(filp, (void *)arg)); case ZFS_IOC_FSSETXATTR: return (zpl_ioctl_setxattr(filp, (void *)arg)); case ZFS_IOC_GETDOSFLAGS: return (zpl_ioctl_getdosflags(filp, (void *)arg)); case ZFS_IOC_SETDOSFLAGS: return (zpl_ioctl_setdosflags(filp, (void *)arg)); case ZFS_IOC_COMPAT_FICLONE: return (zpl_ioctl_ficlone(filp, (void *)arg)); case ZFS_IOC_COMPAT_FICLONERANGE: return (zpl_ioctl_ficlonerange(filp, (void *)arg)); case ZFS_IOC_COMPAT_FIDEDUPERANGE: return (zpl_ioctl_fideduperange(filp, (void *)arg)); default: return (-ENOTTY); } } #ifdef CONFIG_COMPAT static long zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; break; case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; default: return (-ENOTTY); } return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg))); } #endif /* CONFIG_COMPAT */ const struct address_space_operations zpl_address_space_operations = { #ifdef HAVE_VFS_READPAGES .readpages = zpl_readpages, #else .readahead = zpl_readahead, #endif #ifdef HAVE_VFS_READ_FOLIO .read_folio = zpl_read_folio, #else .readpage = zpl_readpage, #endif .writepage = zpl_writepage, .writepages = zpl_writepages, .direct_IO = zpl_direct_IO, #ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS .set_page_dirty = __set_page_dirty_nobuffers, #endif #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO .dirty_folio = filemap_dirty_folio, #endif }; #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND const struct file_operations_extend zpl_file_operations = { .kabi_fops = { #else const struct file_operations zpl_file_operations = { #endif .open = zpl_open, .release = zpl_release, .llseek = zpl_llseek, #ifdef HAVE_VFS_RW_ITERATE #ifdef HAVE_NEW_SYNC_READ .read = new_sync_read, .write = new_sync_write, #endif .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, #ifdef HAVE_VFS_IOV_ITER +#ifdef HAVE_COPY_SPLICE_READ + .splice_read = copy_splice_read, +#else .splice_read = generic_file_splice_read, +#endif .splice_write = iter_file_splice_write, #endif #else .read = do_sync_read, .write = do_sync_write, .aio_read = zpl_aio_read, .aio_write = zpl_aio_write, #endif .mmap = zpl_mmap, .fsync = zpl_fsync, #ifdef HAVE_FILE_AIO_FSYNC .aio_fsync = zpl_aio_fsync, #endif .fallocate = zpl_fallocate, #ifdef HAVE_VFS_COPY_FILE_RANGE .copy_file_range = zpl_copy_file_range, #endif #ifdef HAVE_VFS_CLONE_FILE_RANGE .clone_file_range = zpl_clone_file_range, #endif #ifdef HAVE_VFS_REMAP_FILE_RANGE .remap_file_range = zpl_remap_file_range, #endif #ifdef HAVE_VFS_DEDUPE_FILE_RANGE .dedupe_file_range = zpl_dedupe_file_range, #endif #ifdef HAVE_FILE_FADVISE .fadvise = 
zpl_fadvise, #endif .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND }, /* kabi_fops */ .copy_file_range = zpl_copy_file_range, .clone_file_range = zpl_clone_file_range, #endif }; const struct file_operations zpl_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, #if defined(HAVE_VFS_ITERATE_SHARED) .iterate_shared = zpl_iterate, #elif defined(HAVE_VFS_ITERATE) .iterate = zpl_iterate, #else .readdir = zpl_readdir, #endif .fsync = zpl_fsync, .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif }; /* CSTYLED */ module_param(zfs_fallocate_reserve_percent, uint, 0644); MODULE_PARM_DESC(zfs_fallocate_reserve_percent, "Percentage of length to use for the available capacity check"); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 38bc8e2c4eeb..7a95b54bdf0d 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -1,1620 +1,1640 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_BLK_MQ #include #endif static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, boolean_t force_sync); static unsigned int zvol_major = ZVOL_MAJOR; static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS static unsigned int zvol_open_timeout_ms = 1000; #endif static unsigned int zvol_threads = 0; #ifdef HAVE_BLK_MQ static unsigned int zvol_blk_mq_threads = 0; static unsigned int zvol_blk_mq_actual_threads; static boolean_t zvol_use_blk_mq = B_FALSE; /* * The maximum number of volblocksize blocks to process per thread. Typically, * write heavy workloads preform better with higher values here, and read * heavy workloads preform better with lower values, but that's not a hard * and fast rule. It's basically a knob to tune between "less overhead with * less parallelism" and "more overhead, but more parallelism". * * '8' was chosen as a reasonable, balanced, default based off of sequential * read and write tests to a zvol in an NVMe pool (with 16 CPUs). */ static unsigned int zvol_blk_mq_blocks_per_thread = 8; #endif #ifndef BLKDEV_DEFAULT_RQ /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ #endif /* * Finalize our BIO or request. 
*/ #ifdef HAVE_BLK_MQ #define END_IO(zv, bio, rq, error) do { \ if (bio) { \ BIO_END_IO(bio, error); \ } else { \ blk_mq_end_request(rq, errno_to_bi_status(error)); \ } \ } while (0) #else #define END_IO(zv, bio, rq, error) BIO_END_IO(bio, error) #endif #ifdef HAVE_BLK_MQ static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; static unsigned int zvol_actual_blk_mq_queue_depth; #endif struct zvol_state_os { struct gendisk *zvo_disk; /* generic disk */ struct request_queue *zvo_queue; /* request queue */ dev_t zvo_dev; /* device id */ #ifdef HAVE_BLK_MQ struct blk_mq_tag_set tag_set; #endif /* Set from the global 'zvol_use_blk_mq' at zvol load */ boolean_t use_blk_mq; }; static taskq_t *zvol_taskq; static struct ida zvol_ida; typedef struct zv_request_stack { zvol_state_t *zv; struct bio *bio; struct request *rq; } zv_request_t; typedef struct zv_work { struct request *rq; struct work_struct work; } zv_work_t; typedef struct zv_request_task { zv_request_t zvr; taskq_ent_t ent; } zv_request_task_t; static zv_request_task_t * zv_request_task_create(zv_request_t zvr) { zv_request_task_t *task; task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); taskq_init_ent(&task->ent); task->zvr = zvr; return (task); } static void zv_request_task_free(zv_request_task_t *task) { kmem_free(task, sizeof (*task)); } #ifdef HAVE_BLK_MQ /* * This is called when a new block multiqueue request comes in. A request * contains one or more BIOs. */ static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; zvol_state_t *zv = rq->q->queuedata; /* Tell the kernel that we are starting to process this request */ blk_mq_start_request(rq); if (blk_rq_is_passthrough(rq)) { /* Skip non filesystem request */ blk_mq_end_request(rq, BLK_STS_IOERR); return (BLK_STS_IOERR); } zvol_request_impl(zv, NULL, rq, 0); /* Acknowledge to the kernel that we got this request */ return (BLK_STS_OK); } static struct blk_mq_ops zvol_blk_mq_queue_ops = { .queue_rq = zvol_mq_queue_rq, }; /* Initialize our blk-mq struct */ static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) { struct zvol_state_os *zso = zv->zv_zso; memset(&zso->tag_set, 0, sizeof (zso->tag_set)); /* Initialize tag set. */ zso->tag_set.ops = &zvol_blk_mq_queue_ops; zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; zso->tag_set.numa_node = NUMA_NO_NODE; zso->tag_set.cmd_size = 0; /* * We need BLK_MQ_F_BLOCKING here since we do blocking calls in * zvol_request_impl() */ zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; zso->tag_set.driver_data = zv; return (blk_mq_alloc_tag_set(&zso->tag_set)); } #endif /* HAVE_BLK_MQ */ /* * Given a path, return TRUE if path is a ZVOL. 
*/ boolean_t zvol_os_is_zvol(const char *path) { dev_t dev = 0; if (vdev_lookup_bdev(path, &dev) != 0) return (B_FALSE); if (MAJOR(dev) == zvol_major) return (B_TRUE); return (B_FALSE); } static void zvol_write(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; zvol_state_t *zv = zvr->zv; struct request_queue *q; struct gendisk *disk; unsigned long start_time = 0; boolean_t acct = B_FALSE; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); q = zv->zv_zso->zvo_queue; disk = zv->zv_zso->zvo_disk; /* bio marked as FLUSH need to flush before write */ if (io_is_flush(bio, rq)) zil_commit(zv->zv_zilog, ZVOL_OBJ); /* Some requests are just for flush and nothing else. */ if (io_size(bio, rq) == 0) { rw_exit(&zv->zv_suspend_lock); END_IO(zv, bio, rq, 0); return; } zfs_uio_bvec_init(&uio, bio, rq); ssize_t start_resid = uio.uio_resid; /* * With use_blk_mq, accounting is done by blk_mq_start_request() * and blk_mq_end_request(), so we can skip it here. */ if (bio) { acct = blk_queue_io_stat(q); if (acct) { start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); } } boolean_t sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_WRITER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio.uio_loffset; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); break; } error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); if (error == 0) { zvol_log_write(zv, tx, off, bytes, sync); } dmu_tx_commit(tx); if (error) break; } zfs_rangelock_exit(lr); int64_t nwritten = start_resid - uio.uio_resid; dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); task_io_account_write(nwritten); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); } END_IO(zv, bio, rq, -error); } static void zvol_write_task(void *arg) { zv_request_task_t *task = arg; zvol_write(&task->zvr); zv_request_task_free(task); } static void zvol_discard(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; zvol_state_t *zv = zvr->zv; uint64_t start = io_offset(bio, rq); uint64_t size = io_size(bio, rq); uint64_t end = start + size; boolean_t sync; int error = 0; dmu_tx_t *tx; struct request_queue *q = zv->zv_zso->zvo_queue; struct gendisk *disk = zv->zv_zso->zvo_disk; unsigned long start_time = 0; boolean_t acct = B_FALSE; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); if (bio) { acct = blk_queue_io_stat(q); if (acct) { start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); } } sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; if (end > zv->zv_volsize) { error = SET_ERROR(EIO); goto unlock; } /* * Align the request to volume block boundaries when a secure erase is * not required. This will prevent dnode_free_range() from zeroing out * the unaligned parts which is slow (read-modify-write) and useless * since we are not freeing any space by doing so. 
*/ if (!io_is_secure_erase(bio, rq)) { start = P2ROUNDUP(start, zv->zv_volblocksize); end = P2ALIGN(end, zv->zv_volblocksize); size = end - start; } if (start >= end) goto unlock; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, start, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, start, size, B_TRUE); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size); } zfs_rangelock_exit(lr); if (error == 0 && sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); unlock: rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); } END_IO(zv, bio, rq, -error); } static void zvol_discard_task(void *arg) { zv_request_task_t *task = arg; zvol_discard(&task->zvr); zv_request_task_free(task); } static void zvol_read(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; boolean_t acct = B_FALSE; zvol_state_t *zv = zvr->zv; struct request_queue *q; struct gendisk *disk; unsigned long start_time = 0; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); zfs_uio_bvec_init(&uio, bio, rq); q = zv->zv_zso->zvo_queue; disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; /* * When blk-mq is being used, accounting is done by * blk_mq_start_request() and blk_mq_end_request(). */ if (bio) { acct = blk_queue_io_stat(q); if (acct) start_time = blk_generic_start_io_acct(q, disk, READ, bio); } zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_READER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); /* don't read past the end */ if (bytes > volsize - uio.uio_loffset) bytes = volsize - uio.uio_loffset; error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } } zfs_rangelock_exit(lr); int64_t nread = start_resid - uio.uio_resid; dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); task_io_account_read(nread); rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, READ, bio, start_time); } END_IO(zv, bio, rq, -error); } static void zvol_read_task(void *arg) { zv_request_task_t *task = arg; zvol_read(&task->zvr); zv_request_task_free(task); } /* * Process a BIO or request * * Either 'bio' or 'rq' should be set depending on if we are processing a * bio or a request (both should not be set). 
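
The alignment step in zvol_discard() above shrinks a trim request inward so that only whole volblocksize blocks are freed, avoiding the slow read-modify-write of partial blocks the comment warns about. The sketch below performs the same rounding with local helpers standing in for P2ROUNDUP()/P2ALIGN(); it assumes a power-of-two block size, as volblocksize always is, and the request numbers are arbitrary.

#include <stdint.h>
#include <stdio.h>

/* Local stand-ins for the P2ROUNDUP()/P2ALIGN() macros used above. */
static uint64_t
align_up(uint64_t x, uint64_t align)
{
	return ((x + align - 1) & ~(align - 1));
}

static uint64_t
align_down(uint64_t x, uint64_t align)
{
	return (x & ~(align - 1));
}

int
main(void)
{
	uint64_t volblocksize = 16 * 1024;	/* example: 16K zvol blocks */
	uint64_t start = 10000;			/* unaligned trim start */
	uint64_t end = 100000;			/* unaligned trim end */

	uint64_t astart = align_up(start, volblocksize);	/* 16384 */
	uint64_t aend = align_down(end, volblocksize);		/* 98304 */

	if (astart >= aend) {
		printf("request too small, nothing to free\n");
		return (0);
	}
	printf("trim [%llu, %llu) shrinks to [%llu, %llu), %llu bytes freed\n",
	    (unsigned long long)start, (unsigned long long)end,
	    (unsigned long long)astart, (unsigned long long)aend,
	    (unsigned long long)(aend - astart));
	return (0);
}
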
* * force_sync: Set to 0 to defer processing to a background taskq * Set to 1 to process data synchronously */ static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, boolean_t force_sync) { fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = io_offset(bio, rq); uint64_t size = io_size(bio, rq); int rw = io_data_dir(bio, rq); if (zvol_request_sync) force_sync = 1; zv_request_t zvr = { .zv = zv, .bio = bio, .rq = rq, }; if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", zv->zv_zso->zvo_disk->disk_name, (long long unsigned)offset, (long unsigned)size); END_IO(zv, bio, rq, -SET_ERROR(EIO)); goto out; } zv_request_task_t *task; if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { END_IO(zv, bio, rq, -SET_ERROR(EROFS)); goto out; } /* * Prevents the zvol from being suspended, or the ZIL being * concurrently opened. Will be released after the i/o * completes. */ rw_enter(&zv->zv_suspend_lock, RW_READER); /* * Open a ZIL if this is the first time we have written to this * zvol. We protect zv->zv_zilog with zv_suspend_lock rather * than zv_state_lock so that we don't need to acquire an * additional lock in this path. */ if (zv->zv_zilog == NULL) { rw_exit(&zv->zv_suspend_lock); rw_enter(&zv->zv_suspend_lock, RW_WRITER); if (zv->zv_zilog == NULL) { zv->zv_zilog = zil_open(zv->zv_objset, zvol_get_data, &zv->zv_kstat.dk_zil_sums); zv->zv_flags |= ZVOL_WRITTEN_TO; /* replay / destroy done in zvol_create_minor */ VERIFY0((zv->zv_zilog->zl_header->zh_flags & ZIL_REPLAY_NEEDED)); } rw_downgrade(&zv->zv_suspend_lock); } /* * We don't want this thread to be blocked waiting for i/o to * complete, so we instead wait from a taskq callback. The * i/o may be a ZIL write (via zil_commit()), or a read of an * indirect block, or a read of a data block (if this is a * partial-block write). We will indicate that the i/o is * complete by calling END_IO() from the taskq callback. * * This design allows the calling thread to continue and * initiate more concurrent operations by calling * zvol_request() again. There are typically only a small * number of threads available to call zvol_request() (e.g. * one per iSCSI target), so keeping the latency of * zvol_request() low is important for performance. * * The zvol_request_sync module parameter allows this * behavior to be altered, for performance evaluation * purposes. If the callback blocks, setting * zvol_request_sync=1 will result in much worse performance. * * We can have up to zvol_threads concurrent i/o's being * processed for all zvols on the system. This is typically * a vast improvement over the zvol_request_sync=1 behavior * of one i/o at a time per zvol. However, an even better * design would be for zvol_request() to initiate the zio * directly, and then be notified by the zio_done callback, * which would call END_IO(). Unfortunately, the DMU/ZIL * interfaces lack this functionality (they block waiting for * the i/o to complete). */ if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { if (force_sync) { zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, zvol_discard_task, task, 0, &task->ent); } } else { if (force_sync) { zvol_write(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, zvol_write_task, task, 0, &task->ent); } } } else { /* * The SCST driver, and possibly others, may issue READ I/Os * with a length of zero bytes. 
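
The dispatch policy spelled out in the WRITE branch above comes down to this: the submitting thread never waits for the I/O, it hands the work to a taskq and completion is signaled from the taskq callback, unless force_sync (or the zvol_request_sync tunable) requests inline processing. Below is a small userspace model of that shape, with a detached pthread standing in for the taskq and a callback standing in for END_IO(); every name in it is illustrative.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

typedef struct request {
	int	id;
	void	(*end_io)(struct request *, int);	/* END_IO stand-in */
} request_t;

static void
end_io(request_t *rq, int error)
{
	printf("request %d complete, error %d\n", rq->id, error);
	free(rq);
}

static void *
worker(void *arg)	/* taskq callback stand-in */
{
	request_t *rq = arg;

	usleep(1000);		/* pretend to do the actual I/O */
	rq->end_io(rq, 0);
	return (NULL);
}

static void
submit(request_t *rq, int force_sync)
{
	if (force_sync) {
		worker(rq);	/* process inline, caller waits */
		return;
	}
	pthread_t tid;
	pthread_attr_t attr;
	pthread_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
	pthread_create(&tid, &attr, worker, rq);	/* returns at once */
	pthread_attr_destroy(&attr);
}

int
main(void)
{
	for (int i = 0; i < 4; i++) {
		request_t *rq = malloc(sizeof (*rq));
		rq->id = i;
		rq->end_io = end_io;
		submit(rq, /* force_sync */ 0);
	}
	sleep(1);	/* crude: let the detached workers finish */
	return (0);
}

As the comment above notes, forcing synchronous processing (zvol_request_sync=1) collapses this back to one I/O at a time per zvol, which is mainly useful for performance evaluation.
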
These empty I/Os contain no * data and require no additional handling. */ if (size == 0) { END_IO(zv, bio, rq, 0); goto out; } rw_enter(&zv->zv_suspend_lock, RW_READER); /* See comment in WRITE case above. */ if (force_sync) { zvol_read(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, zvol_read_task, task, 0, &task->ent); } } out: spl_fstrans_unmark(cookie); } #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID static void zvol_submit_bio(struct bio *bio) #else static blk_qc_t zvol_submit_bio(struct bio *bio) #endif #else static MAKE_REQUEST_FN_RET zvol_request(struct request_queue *q, struct bio *bio) #endif { #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bio->bi_bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif #endif zvol_state_t *zv = q->queuedata; zvol_request_impl(zv, bio, NULL, 0); #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) return (BLK_QC_T_NONE); #endif } static int +#ifdef HAVE_BLK_MODE_T +zvol_open(struct gendisk *disk, blk_mode_t flag) +#else zvol_open(struct block_device *bdev, fmode_t flag) +#endif { zvol_state_t *zv; int error = 0; boolean_t drop_suspend = B_FALSE; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); hrtime_t start = gethrtime(); retry: #endif rw_enter(&zvol_state_lock, RW_READER); /* * Obtain a copy of private_data under the zvol_state_lock to make * sure that either the result of zvol free code path setting - * bdev->bd_disk->private_data to NULL is observed, or zvol_os_free() + * disk->private_data to NULL is observed, or zvol_os_free() * is not called on this zv because of the positive zv_open_count. */ +#ifdef HAVE_BLK_MODE_T + zv = disk->private_data; +#else zv = bdev->bd_disk->private_data; +#endif if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(-ENXIO)); } mutex_enter(&zv->zv_state_lock); /* * Make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); } else { drop_suspend = B_TRUE; } } else { drop_suspend = B_TRUE; } } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { boolean_t drop_namespace = B_FALSE; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); /* * In all other call paths the spa_namespace_lock is taken * before the bdev->bd_mutex lock. However, on open(2) * the __blkdev_get() function calls fops->open() with the * bdev->bd_mutex lock held. This can result in a deadlock * when zvols from one pool are used as vdevs in another. * * To prevent a lock inversion deadlock we preemptively * take the spa_namespace_lock. Normally the lock will not * be contended and this is safe because spa_open_common() * handles the case where the caller already holds the * spa_namespace_lock. * * When the lock cannot be aquired after multiple retries * this must be the vdev on zvol deadlock case and we have * no choice but to return an error. 
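
On kernels where the -ERESTARTSYS behavior described next is unavailable, the open path above implements its own bounded retry: try the lock, back off briefly, and give up with an error once zvol_open_timeout_ms has passed. The userspace sketch below shows the same pattern with a pthread mutex and a monotonic clock standing in for the kernel primitives; the names, the 10ms backoff, and the -EAGAIN return are illustrative.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t namespace_lock = PTHREAD_MUTEX_INITIALIZER;

static long long
now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((long long)ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
}

/* Returns 0 with the lock held, or -EAGAIN after timeout_ms of retries. */
static int
lock_with_timeout(pthread_mutex_t *mp, int timeout_ms)
{
	long long start = now_ms();

	while (pthread_mutex_trylock(mp) != 0) {
		if (now_ms() - start > timeout_ms)
			return (-EAGAIN);	/* caller must handle it */
		usleep(10 * 1000);		/* back off 10ms and retry */
	}
	return (0);
}

int
main(void)
{
	int error = lock_with_timeout(&namespace_lock, 1000);

	if (error != 0) {
		printf("could not take the lock: %d\n", error);
		return (1);
	}
	printf("lock acquired, proceeding with open\n");
	pthread_mutex_unlock(&namespace_lock);
	return (0);
}
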
For 5.12 and older * kernels returning -ERESTARTSYS will result in the * bdev->bd_mutex being dropped, then reacquired, and * fops->open() being called again. This process can be * repeated safely until both locks are acquired. For 5.13 * and newer the -ERESTARTSYS retry logic was removed from * the kernel so the only option is to return the error for * the caller to handle it. */ if (!mutex_owned(&spa_namespace_lock)) { if (!mutex_tryenter(&spa_namespace_lock)) { mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); #ifdef HAVE_BLKDEV_GET_ERESTARTSYS schedule(); return (SET_ERROR(-ERESTARTSYS)); #else if ((gethrtime() - start) > timeout) return (SET_ERROR(-ERESTARTSYS)); schedule_timeout(MSEC_TO_TICK(10)); goto retry; #endif } else { drop_namespace = B_TRUE; } } - error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); + error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); if (drop_namespace) mutex_exit(&spa_namespace_lock); } if (error == 0) { - if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + if ((blk_mode_is_open_write(flag)) && + (zv->zv_flags & ZVOL_RDONLY)) { if (zv->zv_open_count == 0) zvol_last_close(zv); error = SET_ERROR(-EROFS); } else { zv->zv_open_count++; } } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); if (error == 0) +#ifdef HAVE_BLK_MODE_T + disk_check_media_change(disk); +#else zfs_check_media_change(bdev); +#endif return (error); } static void -zvol_release(struct gendisk *disk, fmode_t mode) +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG +zvol_release(struct gendisk *disk) +#else +zvol_release(struct gendisk *disk, fmode_t unused) +#endif { +#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) + (void) unused; +#endif zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, RW_READER); zv = disk->private_data; mutex_enter(&zv->zv_state_lock); ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 1) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); zv->zv_open_count--; if (zv->zv_open_count == 0) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); } static int zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; ASSERT3U(zv->zv_open_count, >, 0); switch (cmd) { case BLKFLSBUF: fsync_bdev(bdev); invalidate_bdev(bdev); rw_enter(&zv->zv_suspend_lock, RW_READER); if (!(zv->zv_flags & ZVOL_RDONLY)) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); rw_exit(&zv->zv_suspend_lock); break; case BLKZNAME: mutex_enter(&zv->zv_state_lock); error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); mutex_exit(&zv->zv_state_lock); break; default: error = -ENOTTY; break; } return (SET_ERROR(error)); } #ifdef CONFIG_COMPAT static int zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { return (zvol_ioctl(bdev, mode, cmd, arg)); } #else 
#define zvol_compat_ioctl NULL #endif static unsigned int zvol_check_events(struct gendisk *disk, unsigned int clearing) { unsigned int mask = 0; rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; zv->zv_changed = 0; mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (mask); } static int zvol_revalidate_disk(struct gendisk *disk) { rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> SECTOR_BITS); mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (0); } int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) { struct gendisk *disk = zv->zv_zso->zvo_disk; #if defined(HAVE_REVALIDATE_DISK_SIZE) revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); #elif defined(HAVE_REVALIDATE_DISK) revalidate_disk(disk); #else zvol_revalidate_disk(disk); #endif return (0); } void zvol_os_clear_private(zvol_state_t *zv) { /* * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. */ zv->zv_zso->zvo_disk->private_data = NULL; } /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very * tiny devices. For devices over 1 Mib a standard head and sector count * is used to keep the cylinders count reasonable. */ static int zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { zvol_state_t *zv = bdev->bd_disk->private_data; sector_t sectors; ASSERT3U(zv->zv_open_count, >, 0); sectors = get_capacity(zv->zv_zso->zvo_disk); if (sectors > 2048) { geo->heads = 16; geo->sectors = 63; } else { geo->heads = 2; geo->sectors = 4; } geo->start = 0; geo->cylinders = sectors / (geo->heads * geo->sectors); return (0); } /* * Why have two separate block_device_operations structs? * * Normally we'd just have one, and assign 'submit_bio' as needed. However, * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we * can't just change submit_bio dynamically at runtime. So just create two * separate structs to get around this. 
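The zvol_getgeo() handler above fabricates a CHS geometry purely from the capacity in 512-byte sectors. A minimal user-space sketch of the same arithmetic, kept separate from the patch itself (the helper name and sample sizes are illustrative only):

/* Stand-alone sketch of the virtual-geometry arithmetic used by zvol_getgeo(). */
#include <stdint.h>
#include <stdio.h>

struct fake_geometry {
	uint32_t heads;
	uint32_t sectors;
	uint64_t cylinders;
};

/* 'capacity' is the device size in 512-byte sectors. */
static struct fake_geometry
virtual_geometry(uint64_t capacity)
{
	struct fake_geometry g;

	if (capacity > 2048) {		/* larger than 1 MiB */
		g.heads = 16;
		g.sectors = 63;
	} else {			/* very small device */
		g.heads = 2;
		g.sectors = 4;
	}
	g.cylinders = capacity / (g.heads * g.sectors);
	return (g);
}

int
main(void)
{
	uint64_t sizes[] = { 1024, 2048, 2097152 };	/* 512 KiB, 1 MiB, 1 GiB */

	for (int i = 0; i < 3; i++) {
		struct fake_geometry g = virtual_geometry(sizes[i]);
		printf("%llu sectors -> C/H/S %llu/%u/%u\n",
		    (unsigned long long)sizes[i],
		    (unsigned long long)g.cylinders, g.heads, g.sectors);
	}
	return (0);
}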
*/ static const struct block_device_operations zvol_ops_blk_mq = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, .check_events = zvol_check_events, #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK .revalidate_disk = zvol_revalidate_disk, #endif .getgeo = zvol_getgeo, .owner = THIS_MODULE, }; static const struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, .check_events = zvol_check_events, #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK .revalidate_disk = zvol_revalidate_disk, #endif .getgeo = zvol_getgeo, .owner = THIS_MODULE, #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS .submit_bio = zvol_submit_bio, #endif }; static int zvol_alloc_non_blk_mq(struct zvol_state_os *zso) { #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) #if defined(HAVE_BLK_ALLOC_DISK) zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); if (zso->zvo_disk == NULL) return (1); zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; #else zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); if (zso->zvo_queue == NULL) return (1); zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); return (1); } zso->zvo_disk->queue = zso->zvo_queue; #endif /* HAVE_BLK_ALLOC_DISK */ #else zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); if (zso->zvo_queue == NULL) return (1); zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); return (1); } zso->zvo_disk->queue = zso->zvo_queue; #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ return (0); } static int zvol_alloc_blk_mq(zvol_state_t *zv) { #ifdef HAVE_BLK_MQ struct zvol_state_os *zso = zv->zv_zso; /* Allocate our blk-mq tag_set */ if (zvol_blk_mq_alloc_tag_set(zv) != 0) return (1); #if defined(HAVE_BLK_ALLOC_DISK) zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); if (zso->zvo_disk == NULL) { blk_mq_free_tag_set(&zso->tag_set); return (1); } zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_disk->minors = ZVOL_MINORS; #else zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); blk_mq_free_tag_set(&zso->tag_set); return (1); } /* Allocate queue */ zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); if (IS_ERR(zso->zvo_queue)) { blk_mq_free_tag_set(&zso->tag_set); return (1); } /* Our queue is now created, assign it to our disk */ zso->zvo_disk->queue = zso->zvo_queue; #endif #endif return (0); } /* * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. */ static zvol_state_t * zvol_alloc(dev_t dev, const char *name) { zvol_state_t *zv; struct zvol_state_os *zso; uint64_t volmode; int ret; if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) return (NULL); if (volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; if (volmode == ZFS_VOLMODE_NONE) return (NULL); zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso = zso; zv->zv_volmode = volmode; list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); #ifdef HAVE_BLK_MQ zv->zv_zso->use_blk_mq = zvol_use_blk_mq; #endif /* * The block layer has 3 interfaces for getting BIOs: * * 1. blk-mq request queues (new) * 2. submit_bio() (oldest) * 3. regular request queues (old). 
* * Each of those interfaces has two permutations: * * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates * both the disk and its queue (5.14 kernel or newer) * * b) We don't have blk_*alloc_disk(), and have to allocate the * disk and the queue separately. (5.13 kernel or older) */ if (zv->zv_zso->use_blk_mq) { ret = zvol_alloc_blk_mq(zv); zso->zvo_disk->fops = &zvol_ops_blk_mq; } else { ret = zvol_alloc_non_blk_mq(zso); zso->zvo_disk->fops = &zvol_ops; } if (ret != 0) goto out_kmem; blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zso->zvo_queue, 1); if (!zv->zv_zso->use_blk_mq) { /* Disable write merging in favor of the ZIO pipeline. */ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); } /* Enable /proc/diskstats */ blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue); zso->zvo_queue->queuedata = zv; zso->zvo_dev = dev; zv->zv_open_count = 0; strlcpy(zv->zv_name, name, MAXNAMELEN); zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zso->zvo_disk->major = zvol_major; zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; /* * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. * This is accomplished by limiting the number of minors for the * device to one and explicitly disabling partition scanning. */ if (volmode == ZFS_VOLMODE_DEV) { zso->zvo_disk->minors = 1; zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT; zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART; } zso->zvo_disk->first_minor = (dev & MINORMASK); zso->zvo_disk->private_data = zv; snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", ZVOL_DEV_NAME, (dev & MINORMASK)); return (zv); out_kmem: kmem_free(zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); return (NULL); } /* * Cleanup then free a zvol_state_t which was created by zvol_alloc(). * At this time, the structure is not opened by anyone, is taken off * the zvol_state_list, and has its private data set to NULL. * The zvol_state_lock is dropped. * * This function may take many milliseconds to complete (e.g. we've seen * it take over 256ms), due to the calls to "blk_cleanup_queue" and * "del_gendisk". Thus, consumers need to be careful to account for this * latency when calling this function. */ void zvol_os_free(zvol_state_t *zv) { ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); del_gendisk(zv->zv_zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ defined(HAVE_BLK_ALLOC_DISK) #if defined(HAVE_BLK_CLEANUP_DISK) blk_cleanup_disk(zv->zv_zso->zvo_disk); #else put_disk(zv->zv_zso->zvo_disk); #endif #else blk_cleanup_queue(zv->zv_zso->zvo_queue); put_disk(zv->zv_zso->zvo_disk); #endif #ifdef HAVE_BLK_MQ if (zv->zv_zso->use_blk_mq) blk_mq_free_tag_set(&zv->zv_zso->tag_set); #endif ida_simple_remove(&zvol_ida, MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); } void zvol_wait_close(zvol_state_t *zv) { } /* * Create a block device minor node and setup the linkage between it * and the specified volume. Once this function returns the block * device is live and ready for use. 
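zvol_alloc() above derives the disk name and first minor from the dev_t it is handed, and zvol_os_create_minor() below builds that dev_t from an IDA index shifted by ZVOL_MINOR_BITS. A small user-space sketch of that numbering; MKDEV/MINORBITS are re-defined locally, and ZVOL_MINOR_BITS = 4 plus a major of 230 are assumed defaults used purely for illustration:

/* Sketch of how a zvol index becomes a device number and a zdN name. */
#include <stdint.h>
#include <stdio.h>

#define MINORBITS	20
#define MINORMASK	((1U << MINORBITS) - 1)
#define MKDEV(ma, mi)	(((unsigned int)(ma) << MINORBITS) | (unsigned int)(mi))

#define ZVOL_MINOR_BITS	4	/* assumed: 16 minors reserved per zvol */

int
main(void)
{
	unsigned int zvol_major = 230;	/* assumed default zvol major */

	for (unsigned int idx = 0; idx < 3; idx++) {
		unsigned int minor = idx << ZVOL_MINOR_BITS;
		unsigned int dev = MKDEV(zvol_major, minor);

		/* The disk name is built from the minor part of the dev_t. */
		printf("idx %u -> minor %u, dev 0x%x, name zd%u\n",
		    idx, minor, dev, dev & MINORMASK);
	}
	return (0);
}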
*/ int zvol_os_create_minor(const char *name) { zvol_state_t *zv; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; uint64_t len; unsigned minor = 0; int error = 0; int idx; uint64_t hash = zvol_name_hash(name); bool replayed_zil = B_FALSE; if (zvol_inhibit_dev) return (0); idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); ida_simple_remove(&zvol_ida, idx); return (SET_ERROR(EEXIST)); } doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; zv = zvol_alloc(MKDEV(zvol_major, minor), name); if (zv == NULL) { error = SET_ERROR(EAGAIN); goto out_dmu_objset_disown; } zv->zv_hash = hash; if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; zv->zv_volblocksize = doi->doi_data_block_size; zv->zv_volsize = volsize; zv->zv_objset = os; set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, (DMU_MAX_ACCESS / 4) >> 9); if (zv->zv_zso->use_blk_mq) { /* * IO requests can be really big (1MB). When an IO request * comes in, it is passed off to zvol_read() or zvol_write() * in a new thread, where it is chunked up into 'volblocksize' * sized pieces and processed. So for example, if the request * is a 1MB write and your volblocksize is 128k, one zvol_write * thread will take that request and sequentially do ten 128k * IOs. This is due to the fact that the thread needs to lock * each volblocksize sized block. So you might be wondering: * "instead of passing the whole 1MB request to one thread, * why not pass ten individual 128k chunks to ten threads and * process the whole write in parallel?" The short answer is * that there's a sweet spot number of chunks that balances * the greater parallelism with the added overhead of more * threads. The sweet spot can be different depending on if you * have a read or write heavy workload. Writes typically want * high chunk counts while reads typically want lower ones. On * a test pool with 6 NVMe drives in a 3x 2-disk mirror * configuration, with volblocksize=8k, the sweet spot for good * sequential reads and writes was at 8 chunks. */ /* * Below we tell the kernel how big we want our requests * to be. You would think that blk_queue_io_opt() would be * used to do this since it is used to "set optimal request * size for the queue", but that doesn't seem to do * anything - the kernel still gives you huge requests * with tons of little PAGE_SIZE segments contained within it. * * Knowing that the kernel will just give you PAGE_SIZE segments * no matter what, you can say "ok, I want PAGE_SIZE byte * segments, and I want 'N' of them per request", where N is * the correct number of segments for the volblocksize and * number of chunks you want. */ #ifdef HAVE_BLK_MQ if (zvol_blk_mq_blocks_per_thread != 0) { unsigned int chunks; chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); blk_queue_max_segment_size(zv->zv_zso->zvo_queue, PAGE_SIZE); blk_queue_max_segments(zv->zv_zso->zvo_queue, (zv->zv_volblocksize * chunks) / PAGE_SIZE); } else { /* * Special case: zvol_blk_mq_blocks_per_thread = 0 * Max everything out. 
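The sizing rule described above bounds each blk-mq request by fixing the segment size to PAGE_SIZE and capping the segment count at (volblocksize * chunks) / PAGE_SIZE. A stand-alone sketch of that calculation; the page size, volblocksize, and chunk values are examples only:

/* Sketch of the blk-mq request-size arithmetic described in the comment above. */
#include <stdio.h>

int
main(void)
{
	unsigned long page_size = 4096;			/* example PAGE_SIZE */
	unsigned long volblocksizes[] = { 8192, 131072 };	/* 8k and 128k */
	unsigned long chunks = 8;	/* stands in for zvol_blk_mq_blocks_per_thread */

	for (int i = 0; i < 2; i++) {
		unsigned long max_segments =
		    (volblocksizes[i] * chunks) / page_size;

		printf("volblocksize %lu: %lu segments of %lu bytes "
		    "-> ~%lu KiB per request\n",
		    volblocksizes[i], max_segments, page_size,
		    (max_segments * page_size) / 1024);
	}
	return (0);
}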
*/ blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); } #endif } else { blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); } blk_queue_physical_block_size(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); blk_queue_discard_granularity(zv->zv_zso->zvo_queue, zv->zv_volblocksize); #ifdef QUEUE_FLAG_DISCARD blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); #endif #ifdef QUEUE_FLAG_NONROT blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); #endif #ifdef QUEUE_FLAG_ADD_RANDOM blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); #endif /* This flag was introduced in kernel version 4.12. */ #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); #endif ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); if (error) goto out_dmu_objset_disown; ASSERT3P(zv->zv_zilog, ==, NULL); zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); else replayed_zil = zil_replay(os, zv, zvol_replay_vector); } if (replayed_zil) zil_close(zv->zv_zilog); zv->zv_zilog = NULL; /* * When udev detects the addition of the device it will immediately * invoke blkid(8) to determine the type of content on the device. * Prefetching the blocks commonly scanned by blkid(8) will speed * up this process. */ len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); if (len > 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, ZIO_PRIORITY_SYNC_READ); } zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); /* * Keep in mind that once add_disk() is called, the zvol is * announced to the world, and zvol_open()/zvol_release() can * be called at any time. Incidentally, add_disk() itself calls * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() * directly as well. */ if (error == 0) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); rw_exit(&zvol_state_lock); #ifdef HAVE_ADD_DISK_RET error = add_disk(zv->zv_zso->zvo_disk); #else add_disk(zv->zv_zso->zvo_disk); #endif } else { ida_simple_remove(&zvol_ida, idx); } return (error); } void zvol_os_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_zso->zvo_disk); ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); /* move to new hashtable entry */ zv->zv_hash = zvol_name_hash(zv->zv_name); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); /* * The block device's read-only state is briefly changed causing * a KOBJ_CHANGE uevent to be issued. This ensures udev detects * the name change and fixes the symlinks. This does not change * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never * changes. This would normally be done using kobject_uevent() but * that is a GPL-only symbol which is why we need this workaround. 
*/ set_disk_ro(zv->zv_zso->zvo_disk, !readonly); set_disk_ro(zv->zv_zso->zvo_disk, readonly); } void zvol_os_set_disk_ro(zvol_state_t *zv, int flags) { set_disk_ro(zv->zv_zso->zvo_disk, flags); } void zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) { set_capacity(zv->zv_zso->zvo_disk, capacity); } int zvol_init(void) { int error; /* * zvol_threads is the module param the user passes in. * * zvol_actual_threads is what we use internally, since the user can * pass zvol_thread = 0 to mean "use all the CPUs" (the default). */ static unsigned int zvol_actual_threads; if (zvol_threads == 0) { /* * See dde9380a1 for why 32 was chosen here. This should * probably be refined to be some multiple of the number * of CPUs. */ zvol_actual_threads = MAX(num_online_cpus(), 32); } else { zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); } error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } #ifdef HAVE_BLK_MQ if (zvol_blk_mq_queue_depth == 0) { zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; } else { zvol_actual_blk_mq_queue_depth = MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); } if (zvol_blk_mq_threads == 0) { zvol_blk_mq_actual_threads = num_online_cpus(); } else { zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), 1024); } #endif zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (zvol_taskq == NULL) { unregister_blkdev(zvol_major, ZVOL_DRIVER); return (-ENOMEM); } zvol_init_impl(); ida_init(&zvol_ida); return (0); } void zvol_fini(void) { zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); taskq_destroy(zvol_taskq); ida_destroy(&zvol_ida); } /* BEGIN CSTYLED */ module_param(zvol_inhibit_dev, uint, 0644); MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_threads, uint, 0444); MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. 
Set" "to 0 to use all active CPUs"); module_param(zvol_request_sync, uint, 0644); MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); module_param(zvol_volmode, uint, 0644); MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); #ifdef HAVE_BLK_MQ module_param(zvol_blk_mq_queue_depth, uint, 0644); MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); module_param(zvol_use_blk_mq, uint, 0644); MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, "Process volblocksize blocks per thread"); #endif #ifndef HAVE_BLKDEV_GET_ERESTARTSYS module_param(zvol_open_timeout_ms, uint, 0644); MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); #endif /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c index 2e5c22c11490..5dd08f597f33 100644 --- a/sys/contrib/openzfs/module/zfs/spa_errlog.c +++ b/sys/contrib/openzfs/module/zfs/spa_errlog.c @@ -1,1489 +1,1498 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2014, Delphix. All rights reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2021, 2022, George Amanakis. All rights reserved. */ /* * Routines to manage the on-disk persistent error log. * * Each pool stores a log of all logical data errors seen during normal * operation. This is actually the union of two distinct logs: the last log, * and the current log. All errors seen are logged to the current log. When a * scrub completes, the current log becomes the last log, the last log is thrown * out, and the current log is reinitialized. This way, if an error is somehow * corrected, a new scrub will show that it no longer exists, and will be * deleted from the log when the scrub completes. * * The log is stored using a ZAP object whose key is a string form of the * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an * optional 'objset:object' human-readable string describing the data. When an * error is first logged, this string will be empty, indicating that no name is * known. This prevents us from having to issue a potentially large amount of * I/O to discover the object name during an error path. Instead, we do the * calculation when the data is requested, storing the result so future queries * will be faster. 
* * If the head_errlog feature is enabled, a different on-disk format is used. * The error log of each head dataset is stored separately in the zap object * and keyed by the head id. This enables listing every dataset affected in * userland. In order to be able to track whether an error block has been * modified or added to snapshots since it was marked as an error, a new tuple * is introduced: zbookmark_err_phys_t. It allows the storage of the birth * transaction group of an error block on-disk. The birth transaction group is * used by check_filesystem() to assess whether this block was freed, * re-written or added to a snapshot since its marking as an error. * * This log is then shipped into an nvlist where the key is the dataset name and * the value is the object name. Userland is then responsible for uniquifying * this list and displaying it to the user. */ #include #include #include #include #include #include #include #include #include #define NAME_MAX_LEN 64 typedef struct clones { uint64_t clone_ds; list_node_t node; } clones_t; /* * spa_upgrade_errlog_limit : A zfs module parameter that controls the number * of on-disk error log entries that will be converted to the new * format when enabling head_errlog. Defaults to 0 which converts * all log entries. */ static uint_t spa_upgrade_errlog_limit = 0; /* * Convert a bookmark to a string. */ static void bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) { (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid); } /* * Convert an err_phys to a string. */ static void errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len) { (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level, (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth); } /* * Convert a string to a err_phys. */ void name_to_errphys(char *buf, zbookmark_err_phys_t *zep) { zep->zb_object = zfs_strtonum(buf, &buf); ASSERT(*buf == ':'); zep->zb_level = (int)zfs_strtonum(buf + 1, &buf); ASSERT(*buf == ':'); zep->zb_blkid = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == ':'); zep->zb_birth = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == '\0'); } /* * Convert a string to a bookmark. */ static void name_to_bookmark(char *buf, zbookmark_phys_t *zb) { zb->zb_objset = zfs_strtonum(buf, &buf); ASSERT(*buf == ':'); zb->zb_object = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == ':'); zb->zb_level = (int)zfs_strtonum(buf + 1, &buf); ASSERT(*buf == ':'); zb->zb_blkid = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == '\0'); } void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) { zb->zb_objset = dataset; zb->zb_object = zep->zb_object; zb->zb_level = zep->zb_level; zb->zb_blkid = zep->zb_blkid; } static void name_to_object(char *buf, uint64_t *obj) { *obj = zfs_strtonum(buf, &buf); ASSERT(*buf == '\0'); } /* * Retrieve the head filesystem. */ static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds) { dsl_dataset_t *ds; int error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, dsobj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); ASSERT(head_ds); *head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } /* * Log an uncorrectable error to the persistent error log. We add it to the * spa's list of pending errors. The changes are actually synced out to disk * during spa_errlog_sync(). 
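The conversion helpers above round-trip a bookmark through the colon-separated hex string used as the ZAP key. A user-space approximation of that round trip, with strtoull() standing in for zfs_strtonum(); the struct layout and sample values are illustrative only:

/* Sketch of the errlog ZAP key format: format a bookmark, then parse it back. */
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

struct bookmark {
	uint64_t objset, object, level, blkid;
};

int
main(void)
{
	struct bookmark zb = { 0x36, 0x2, 0x0, 0x1a4 };
	char name[64];

	(void) snprintf(name, sizeof (name), "%llx:%llx:%llx:%llx",
	    (unsigned long long)zb.objset, (unsigned long long)zb.object,
	    (unsigned long long)zb.level, (unsigned long long)zb.blkid);
	printf("ZAP key: %s\n", name);

	/* Parse it back, one field per ':' separator. */
	char *p = name;
	struct bookmark out;
	out.objset = strtoull(p, &p, 16);
	out.object = strtoull(p + 1, &p, 16);
	out.level = strtoull(p + 1, &p, 16);
	out.blkid = strtoull(p + 1, &p, 16);
	printf("parsed: objset=%" PRIx64 " object=%" PRIx64
	    " level=%" PRIx64 " blkid=%" PRIx64 "\n",
	    out.objset, out.object, out.level, out.blkid);
	return (0);
}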
*/ void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth) { spa_error_entry_t search; spa_error_entry_t *new; avl_tree_t *tree; avl_index_t where; /* * If we are trying to import a pool, ignore any errors, as we won't be * writing to the pool any time soon. */ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) return; mutex_enter(&spa->spa_errlist_lock); /* * If we have had a request to rotate the log, log it to the next list * instead of the current one. */ if (spa->spa_scrub_active || spa->spa_scrub_finished) tree = &spa->spa_errlist_scrub; else tree = &spa->spa_errlist_last; search.se_bookmark = *zb; if (avl_find(tree, &search, &where) != NULL) { mutex_exit(&spa->spa_errlist_lock); return; } new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); new->se_bookmark = *zb; /* * If the head_errlog feature is enabled, store the birth txg now. In * case the file is deleted before spa_errlog_sync() runs, we will not * be able to retrieve the birth txg. */ if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { new->se_zep.zb_object = zb->zb_object; new->se_zep.zb_level = zb->zb_level; new->se_zep.zb_blkid = zb->zb_blkid; /* * birth may end up being NULL, e.g. in zio_done(). We * will handle this in process_error_block(). */ if (birth != NULL) new->se_zep.zb_birth = *birth; } avl_insert(tree, new, where); mutex_exit(&spa->spa_errlist_lock); } int find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, uint64_t *birth_txg) { objset_t *os; int error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); dnode_t *dn; blkptr_t bp; error = dnode_hold(os, zep->zb_object, FTAG, &dn); if (error != 0) return (error); rw_enter(&dn->dn_struct_rwlock, RW_READER); error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, NULL); if (error == 0 && BP_IS_HOLE(&bp)) error = SET_ERROR(ENOENT); *birth_txg = bp.blk_birth; rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * This function finds the oldest affected filesystem containing an error * block. */ int find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, uint64_t *top_affected_fs) { uint64_t oldest_dsobj; int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth, &oldest_dsobj); if (error != 0) return (error); dsl_dataset_t *ds; error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); *top_affected_fs = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (0); } #ifdef _KERNEL /* * Copy the bookmark to the end of the user-space buffer which starts at * uaddr and has *count unused entries, and decrement *count by 1. */ static int copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count) { if (*count == 0) return (SET_ERROR(ENOMEM)); *count -= 1; if (copyout(zb, (char *)uaddr + (*count) * sizeof (zbookmark_phys_t), sizeof (zbookmark_phys_t)) != 0) return (SET_ERROR(EFAULT)); return (0); } /* * Each time the error block is referenced by a snapshot or clone, add a * zbookmark_phys_t entry to the userspace array at uaddr. The array is * filled from the back and the in-out parameter *count is modified to be the * number of unused entries at the beginning of the array. The function * scrub_filesystem() is modelled after this one. 
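copyout_entry() above fills the caller's buffer from the back, so *count always names the number of unused slots at the front of the array. A user-space model of that convention, with memcpy() standing in for copyout() and an illustrative bookmark type:

/* Sketch of the fill-from-the-back convention used by copyout_entry(). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct {
	uint64_t objset, object, level, blkid;
} bookmark_t;

static int
copy_entry(const bookmark_t *zb, bookmark_t *buf, uint64_t *count)
{
	if (*count == 0)
		return (-1);		/* buffer full; the kernel returns ENOMEM */
	*count -= 1;
	memcpy(&buf[*count], zb, sizeof (*zb));
	return (0);
}

int
main(void)
{
	bookmark_t buf[4];
	uint64_t count = 4;

	for (uint64_t i = 0; i < 3; i++) {
		bookmark_t zb = { .objset = 0x36, .blkid = i };
		(void) copy_entry(&zb, buf, &count);
	}

	/* Three entries were reported; buf[0] is still unused. */
	printf("%llu unused slots, first entry at index %llu (blkid %llx)\n",
	    (unsigned long long)count, (unsigned long long)count,
	    (unsigned long long)buf[count].blkid);
	return (0);
}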
*/ static int check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, void *uaddr, uint64_t *count, list_t *clones_list) { dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; int error = dsl_dataset_hold_obj_flags(dp, head_ds, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); uint64_t latest_txg; uint64_t txg_to_consider = spa->spa_syncing_txg; boolean_t check_snapshot = B_TRUE; error = find_birth_txg(ds, zep, &latest_txg); /* * If find_birth_txg() errors out otherwise, let txg_to_consider be * equal to the spa's syncing txg: if check_filesystem() errors out * then affected snapshots or clones will not be checked. */ if (error == 0 && zep->zb_birth == latest_txg) { /* Block neither free nor rewritten. */ zbookmark_phys_t zb; zep_to_zb(head_ds, zep, &zb); error = copyout_entry(&zb, uaddr, count); if (error != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } check_snapshot = B_FALSE; } else if (error == 0) { txg_to_consider = latest_txg; } /* * Retrieve the number of snapshots if the dataset is not a snapshot. */ uint64_t snap_count = 0; if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { error = zap_count(spa->spa_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (error != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } } if (snap_count == 0) { /* Filesystem without snapshots. */ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (0); } uint64_t *snap_obj_array = kmem_zalloc(snap_count * sizeof (uint64_t), KM_SLEEP); int aff_snap_count = 0; uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones; dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); /* Check only snapshots created from this file system. */ while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && snap_obj_txg <= txg_to_consider) { error = dsl_dataset_hold_obj_flags(dp, snap_obj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) goto out; if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) { snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } boolean_t affected = B_TRUE; if (check_snapshot) { uint64_t blk_txg; error = find_birth_txg(ds, zep, &blk_txg); affected = (error == 0 && zep->zb_birth == blk_txg); } /* Report errors in snapshots. */ if (affected) { snap_obj_array[aff_snap_count] = snap_obj; aff_snap_count++; zbookmark_phys_t zb; zep_to_zb(snap_obj, zep, &zb); error = copyout_entry(&zb, uaddr, count); if (error != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); goto out; } } snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); } if (zap_clone == 0 || aff_snap_count == 0) return (0); /* Check clones. */ zap_cursor_t *zc; zap_attribute_t *za; zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(zc, spa->spa_meta_objset, zap_clone); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dataset_t *clone; error = dsl_dataset_hold_obj_flags(dp, za->za_first_integer, DS_HOLD_FLAG_DECRYPT, FTAG, &clone); if (error != 0) break; /* * Only clones whose origins were affected could also * have affected snapshots. 
*/ boolean_t found = B_FALSE; for (int i = 0; i < snap_count; i++) { if (dsl_dir_phys(clone->ds_dir)->dd_origin_obj == snap_obj_array[i]) found = B_TRUE; } dsl_dataset_rele_flags(clone, DS_HOLD_FLAG_DECRYPT, FTAG); if (!found) continue; clones_t *ct = kmem_zalloc(sizeof (*ct), KM_SLEEP); ct->clone_ds = za->za_first_integer; list_insert_tail(clones_list, ct); } zap_cursor_fini(zc); kmem_free(za, sizeof (*za)); kmem_free(zc, sizeof (*zc)); out: kmem_free(snap_obj_array, sizeof (*snap_obj_array)); return (error); } static int process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, void *uaddr, uint64_t *count) { /* * If zb_birth == 0 or head_ds == 0 it means we failed to retrieve the * birth txg or the head filesystem of the block pointer. This may * happen e.g. when an encrypted filesystem is not mounted or when * the key is not loaded. In this case do not proceed to * check_filesystem(), instead do the accounting here. */ if (zep->zb_birth == 0 || head_ds == 0) { zbookmark_phys_t zb; zep_to_zb(head_ds, zep, &zb); int error = copyout_entry(&zb, uaddr, count); if (error != 0) { return (error); } return (0); } uint64_t top_affected_fs; uint64_t init_count = *count; int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs); if (error == 0) { clones_t *ct; list_t clones_list; list_create(&clones_list, sizeof (clones_t), offsetof(clones_t, node)); error = check_filesystem(spa, top_affected_fs, zep, uaddr, count, &clones_list); while ((ct = list_remove_head(&clones_list)) != NULL) { error = check_filesystem(spa, ct->clone_ds, zep, uaddr, count, &clones_list); kmem_free(ct, sizeof (*ct)); if (error) { while (!list_is_empty(&clones_list)) { ct = list_remove_head(&clones_list); kmem_free(ct, sizeof (*ct)); } break; } } list_destroy(&clones_list); } if (error == 0 && init_count == *count) { /* * If we reach this point, no errors have been detected * in the checked filesystems/snapshots. Before returning mark * the error block to be removed from the error lists and logs. */ zbookmark_phys_t zb; zep_to_zb(head_ds, zep, &zb); spa_remove_error(spa, &zb, &zep->zb_birth); } return (error); } #endif /* Return the number of errors in the error log */ uint64_t spa_get_last_errlog_size(spa_t *spa) { uint64_t total = 0, count; mutex_enter(&spa->spa_errlog_lock); if (spa->spa_errlog_last != 0 && zap_count(spa->spa_meta_objset, spa->spa_errlog_last, &count) == 0) total += count; mutex_exit(&spa->spa_errlog_lock); return (total); } /* * If a healed bookmark matches an entry in the error log we stash it in a tree * so that we can later remove the related log entries in sync context. 
*/ static void spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb, const uint64_t *birth) { char name[NAME_MAX_LEN]; if (obj == 0) return; boolean_t held_list = B_FALSE; boolean_t held_log = B_FALSE; if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { bookmark_to_name(healed_zb, name, sizeof (name)); if (zap_contains(spa->spa_meta_objset, healed_zb->zb_objset, name) == 0) { if (!MUTEX_HELD(&spa->spa_errlog_lock)) { mutex_enter(&spa->spa_errlog_lock); held_log = B_TRUE; } /* * Found an error matching healed zb, add zb to our * tree of healed errors */ avl_tree_t *tree = &spa->spa_errlist_healed; spa_error_entry_t search; spa_error_entry_t *new; avl_index_t where; search.se_bookmark = *healed_zb; if (!MUTEX_HELD(&spa->spa_errlist_lock)) { mutex_enter(&spa->spa_errlist_lock); held_list = B_TRUE; } if (avl_find(tree, &search, &where) != NULL) { if (held_list) mutex_exit(&spa->spa_errlist_lock); if (held_log) mutex_exit(&spa->spa_errlog_lock); return; } new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); new->se_bookmark = *healed_zb; avl_insert(tree, new, where); if (held_list) mutex_exit(&spa->spa_errlist_lock); if (held_log) mutex_exit(&spa->spa_errlog_lock); } return; } zbookmark_err_phys_t healed_zep; healed_zep.zb_object = healed_zb->zb_object; healed_zep.zb_level = healed_zb->zb_level; healed_zep.zb_blkid = healed_zb->zb_blkid; if (birth != NULL) healed_zep.zb_birth = *birth; else healed_zep.zb_birth = 0; errphys_to_name(&healed_zep, name, sizeof (name)); zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_last); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { if (zap_contains(spa->spa_meta_objset, za.za_first_integer, name) == 0) { if (!MUTEX_HELD(&spa->spa_errlog_lock)) { mutex_enter(&spa->spa_errlog_lock); held_log = B_TRUE; } avl_tree_t *tree = &spa->spa_errlist_healed; spa_error_entry_t search; spa_error_entry_t *new; avl_index_t where; search.se_bookmark = *healed_zb; if (!MUTEX_HELD(&spa->spa_errlist_lock)) { mutex_enter(&spa->spa_errlist_lock); held_list = B_TRUE; } if (avl_find(tree, &search, &where) != NULL) { if (held_list) mutex_exit(&spa->spa_errlist_lock); if (held_log) mutex_exit(&spa->spa_errlog_lock); continue; } new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); new->se_bookmark = *healed_zb; new->se_zep = healed_zep; avl_insert(tree, new, where); if (held_list) mutex_exit(&spa->spa_errlist_lock); if (held_log) mutex_exit(&spa->spa_errlog_lock); } } zap_cursor_fini(&zc); } /* * If this error exists in the given tree remove it. 
*/ static void remove_error_from_list(spa_t *spa, avl_tree_t *t, const zbookmark_phys_t *zb) { spa_error_entry_t search, *found; avl_index_t where; mutex_enter(&spa->spa_errlist_lock); search.se_bookmark = *zb; if ((found = avl_find(t, &search, &where)) != NULL) { avl_remove(t, found); kmem_free(found, sizeof (spa_error_entry_t)); } mutex_exit(&spa->spa_errlist_lock); } /* * Removes all of the recv healed errors from both on-disk error logs */ static void spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) { char name[NAME_MAX_LEN]; spa_error_entry_t *se; void *cookie = NULL; ASSERT(MUTEX_HELD(&spa->spa_errlog_lock)); while ((se = avl_destroy_nodes(&spa->spa_errlist_healed, &cookie)) != NULL) { remove_error_from_list(spa, s, &se->se_bookmark); remove_error_from_list(spa, l, &se->se_bookmark); if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { bookmark_to_name(&se->se_bookmark, name, sizeof (name)); (void) zap_remove(spa->spa_meta_objset, spa->spa_errlog_last, name, tx); (void) zap_remove(spa->spa_meta_objset, spa->spa_errlog_scrub, name, tx); } else { errphys_to_name(&se->se_zep, name, sizeof (name)); zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_last); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { zap_remove(spa->spa_meta_objset, za.za_first_integer, name, tx); } zap_cursor_fini(&zc); for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_scrub); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { zap_remove(spa->spa_meta_objset, za.za_first_integer, name, tx); } zap_cursor_fini(&zc); } kmem_free(se, sizeof (spa_error_entry_t)); } } /* * Stash away healed bookmarks to remove them from the on-disk error logs * later in spa_remove_healed_errors(). */ void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, const uint64_t *birth) { spa_add_healed_error(spa, spa->spa_errlog_last, zb, birth); spa_add_healed_error(spa, spa->spa_errlog_scrub, zb, birth); } static uint64_t approx_errlog_size_impl(spa_t *spa, uint64_t spa_err_obj) { if (spa_err_obj == 0) return (0); uint64_t total = 0; zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t count; if (zap_count(spa->spa_meta_objset, za.za_first_integer, &count) == 0) total += count; } zap_cursor_fini(&zc); return (total); } /* * Return the approximate number of errors currently in the error log. This * will be nonzero if there are some errors, but otherwise it may be more * or less than the number of entries returned by spa_get_errlog(). 
*/ uint64_t spa_approx_errlog_size(spa_t *spa) { uint64_t total = 0; if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { mutex_enter(&spa->spa_errlog_lock); uint64_t count; if (spa->spa_errlog_scrub != 0 && zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, &count) == 0) total += count; if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && zap_count(spa->spa_meta_objset, spa->spa_errlog_last, &count) == 0) total += count; mutex_exit(&spa->spa_errlog_lock); } else { mutex_enter(&spa->spa_errlog_lock); total += approx_errlog_size_impl(spa, spa->spa_errlog_last); total += approx_errlog_size_impl(spa, spa->spa_errlog_scrub); mutex_exit(&spa->spa_errlog_lock); } mutex_enter(&spa->spa_errlist_lock); total += avl_numnodes(&spa->spa_errlist_last); total += avl_numnodes(&spa->spa_errlist_scrub); mutex_exit(&spa->spa_errlist_lock); return (total); } /* * This function sweeps through an on-disk error log and stores all bookmarks * as error bookmarks in a new ZAP object. At the end we discard the old one, * and spa_update_errlog() will set the spa's on-disk error log to new ZAP * object. */ static void sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, dmu_tx_t *tx) { zap_cursor_t zc; zap_attribute_t za; zbookmark_phys_t zb; uint64_t count; *newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); /* * If we cannnot perform the upgrade we should clear the old on-disk * error logs. */ if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) { VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); return; } for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { if (spa_upgrade_errlog_limit != 0 && zc.zc_cd == spa_upgrade_errlog_limit) break; name_to_bookmark(za.za_name, &zb); zbookmark_err_phys_t zep; zep.zb_object = zb.zb_object; zep.zb_level = zb.zb_level; zep.zb_blkid = zb.zb_blkid; zep.zb_birth = 0; /* * In case of an error we should simply continue instead of * returning prematurely. See the next comment. */ uint64_t head_ds; dsl_pool_t *dp = spa->spa_dsl_pool; dsl_dataset_t *ds; objset_t *os; int error = dsl_dataset_hold_obj_flags(dp, zb.zb_objset, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) continue; head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; /* * The objset and the dnode are required for getting the block * pointer, which is used to determine if BP_IS_HOLE(). If * getting the objset or the dnode fails, do not create a * zap entry (presuming we know the dataset) as this may create * spurious errors that we cannot ever resolve. If an error is * truly persistent, it should re-appear after a scan. 
*/ if (dmu_objset_from_ds(ds, &os) != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } dnode_t *dn; blkptr_t bp; if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } rw_enter(&dn->dn_struct_rwlock, RW_READER); error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp, NULL, NULL); if (error == EACCES) error = 0; else if (!error) zep.zb_birth = bp.blk_birth; rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); if (error != 0 || BP_IS_HOLE(&bp)) continue; uint64_t err_obj; error = zap_lookup_int_key(spa->spa_meta_objset, *newobj, head_ds, &err_obj); if (error == ENOENT) { err_obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); (void) zap_update_int_key(spa->spa_meta_objset, *newobj, head_ds, err_obj, tx); } char buf[64]; errphys_to_name(&zep, buf, sizeof (buf)); const char *name = ""; (void) zap_update(spa->spa_meta_objset, err_obj, buf, 1, strlen(name) + 1, name, tx); } zap_cursor_fini(&zc); VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); } void spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx) { uint64_t newobj = 0; mutex_enter(&spa->spa_errlog_lock); if (spa->spa_errlog_last != 0) { sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx); spa->spa_errlog_last = newobj; + + (void) zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, + sizeof (uint64_t), 1, &spa->spa_errlog_last, tx); } if (spa->spa_errlog_scrub != 0) { sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx); spa->spa_errlog_scrub = newobj; + + (void) zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, + sizeof (uint64_t), 1, &spa->spa_errlog_scrub, tx); } + mutex_exit(&spa->spa_errlog_lock); } #ifdef _KERNEL /* * If an error block is shared by two datasets it will be counted twice. 
*/ static int process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) { if (obj == 0) return (0); zap_cursor_t *zc; zap_attribute_t *za; zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (zap_cursor_init(zc, spa->spa_meta_objset, obj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { if (*count == 0) { zap_cursor_fini(zc); kmem_free(zc, sizeof (*zc)); kmem_free(za, sizeof (*za)); return (SET_ERROR(ENOMEM)); } zbookmark_phys_t zb; name_to_bookmark(za->za_name, &zb); int error = copyout_entry(&zb, uaddr, count); if (error != 0) { zap_cursor_fini(zc); kmem_free(zc, sizeof (*zc)); kmem_free(za, sizeof (*za)); return (error); } } zap_cursor_fini(zc); kmem_free(zc, sizeof (*zc)); kmem_free(za, sizeof (*za)); return (0); } for (zap_cursor_init(zc, spa->spa_meta_objset, obj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { zap_cursor_t *head_ds_cursor; zap_attribute_t *head_ds_attr; head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); uint64_t head_ds_err_obj = za->za_first_integer; uint64_t head_ds; name_to_object(za->za_name, &head_ds); for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { zbookmark_err_phys_t head_ds_block; name_to_errphys(head_ds_attr->za_name, &head_ds_block); int error = process_error_block(spa, head_ds, &head_ds_block, uaddr, count); if (error != 0) { zap_cursor_fini(head_ds_cursor); kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); kmem_free(head_ds_attr, sizeof (*head_ds_attr)); zap_cursor_fini(zc); kmem_free(za, sizeof (*za)); kmem_free(zc, sizeof (*zc)); return (error); } } zap_cursor_fini(head_ds_cursor); kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); kmem_free(head_ds_attr, sizeof (*head_ds_attr)); } zap_cursor_fini(zc); kmem_free(za, sizeof (*za)); kmem_free(zc, sizeof (*zc)); return (0); } static int process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count) { spa_error_entry_t *se; if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { int error = copyout_entry(&se->se_bookmark, uaddr, count); if (error != 0) { return (error); } } return (0); } for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { uint64_t head_ds = 0; int error = get_head_ds(spa, se->se_bookmark.zb_objset, &head_ds); /* * If get_head_ds() errors out, set the head filesystem * to the filesystem stored in the bookmark of the * error block. */ if (error != 0) head_ds = se->se_bookmark.zb_objset; error = process_error_block(spa, head_ds, &se->se_zep, uaddr, count); if (error != 0) return (error); } return (0); } #endif /* * Copy all known errors to userland as an array of bookmarks. This is * actually a union of the on-disk last log and current log, as well as any * pending error requests. * * Because the act of reading the on-disk log could cause errors to be * generated, we have two separate locks: one for the error log and one for the * in-core error lists. We only need the error list lock to log and error, so * we grab the error log lock while we read the on-disk logs, and only pick up * the error list lock when we are finished. 
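The locking rule described above (hold the error-log lock while reading the on-disk logs, and only then pick up the error-list lock) amounts to a fixed acquisition order between the two mutexes. A small pthread sketch of that discipline; the function names are illustrative and pthread mutexes merely stand in for the kernel mutexes:

/* Sketch of the errlog-before-errlist lock ordering described above. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t errlog_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t errlist_lock = PTHREAD_MUTEX_INITIALIZER;

static void
read_logs_then_lists(void)
{
	pthread_mutex_lock(&errlog_lock);	/* first: on-disk log lock */
	/* ... walk the persistent error logs ... */
	pthread_mutex_lock(&errlist_lock);	/* second: in-core list lock */
	/* ... append the pending in-core errors ... */
	pthread_mutex_unlock(&errlist_lock);
	pthread_mutex_unlock(&errlog_lock);
}

static void
log_one_error(void)
{
	/* Logging an error only needs the list lock, never the log lock. */
	pthread_mutex_lock(&errlist_lock);
	/* ... insert into the in-core AVL tree ... */
	pthread_mutex_unlock(&errlist_lock);
}

int
main(void)
{
	read_logs_then_lists();
	log_one_error();
	printf("lock ordering respected\n");
	return (0);
}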
*/ int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) { int ret = 0; #ifdef _KERNEL /* * The pool config lock is needed to hold a dataset_t via (among other * places) process_error_list() -> process_error_block()-> * find_top_affected_fs(), and lock ordering requires that we get it * before the spa_errlog_lock. */ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); mutex_enter(&spa->spa_errlog_lock); ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); if (!ret && !spa->spa_scrub_finished) ret = process_error_log(spa, spa->spa_errlog_last, uaddr, count); mutex_enter(&spa->spa_errlist_lock); if (!ret) ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr, count); if (!ret) ret = process_error_list(spa, &spa->spa_errlist_last, uaddr, count); mutex_exit(&spa->spa_errlist_lock); mutex_exit(&spa->spa_errlog_lock); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); #else (void) spa, (void) uaddr, (void) count; #endif return (ret); } /* * Called when a scrub completes. This simply set a bit which tells which AVL * tree to add new errors. spa_errlog_sync() is responsible for actually * syncing the changes to the underlying objects. */ void spa_errlog_rotate(spa_t *spa) { mutex_enter(&spa->spa_errlist_lock); spa->spa_scrub_finished = B_TRUE; mutex_exit(&spa->spa_errlist_lock); } /* * Discard any pending errors from the spa_t. Called when unloading a faulted * pool, as the errors encountered during the open cannot be synced to disk. */ void spa_errlog_drain(spa_t *spa) { spa_error_entry_t *se; void *cookie; mutex_enter(&spa->spa_errlist_lock); cookie = NULL; while ((se = avl_destroy_nodes(&spa->spa_errlist_last, &cookie)) != NULL) kmem_free(se, sizeof (spa_error_entry_t)); cookie = NULL; while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, &cookie)) != NULL) kmem_free(se, sizeof (spa_error_entry_t)); mutex_exit(&spa->spa_errlist_lock); } /* * Process a list of errors into the current on-disk log. */ void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) { spa_error_entry_t *se; char buf[NAME_MAX_LEN]; void *cookie; if (avl_numnodes(t) == 0) return; /* create log if necessary */ if (*obj == 0) *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); /* add errors to the current log */ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); const char *name = se->se_name ? se->se_name : ""; (void) zap_update(spa->spa_meta_objset, *obj, buf, 1, strlen(name) + 1, name, tx); } } else { for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { zbookmark_err_phys_t zep; zep.zb_object = se->se_zep.zb_object; zep.zb_level = se->se_zep.zb_level; zep.zb_blkid = se->se_zep.zb_blkid; zep.zb_birth = se->se_zep.zb_birth; uint64_t head_ds = 0; int error = get_head_ds(spa, se->se_bookmark.zb_objset, &head_ds); /* * If get_head_ds() errors out, set the head filesystem * to the filesystem stored in the bookmark of the * error block. */ if (error != 0) head_ds = se->se_bookmark.zb_objset; uint64_t err_obj; error = zap_lookup_int_key(spa->spa_meta_objset, *obj, head_ds, &err_obj); if (error == ENOENT) { err_obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); (void) zap_update_int_key(spa->spa_meta_objset, *obj, head_ds, err_obj, tx); } errphys_to_name(&zep, buf, sizeof (buf)); const char *name = se->se_name ? 
se->se_name : ""; (void) zap_update(spa->spa_meta_objset, err_obj, buf, 1, strlen(name) + 1, name, tx); } } /* purge the error list */ cookie = NULL; while ((se = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(se, sizeof (spa_error_entry_t)); } static void delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx) { if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { VERIFY0(dmu_object_free(spa->spa_meta_objset, za.za_first_integer, tx)); } zap_cursor_fini(&zc); } VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); } /* * Sync the error log out to disk. This is a little tricky because the act of * writing the error log requires the spa_errlist_lock. So, we need to lock the * error lists, take a copy of the lists, and then reinitialize them. Then, we * drop the error list lock and take the error log lock, at which point we * do the errlog processing. Then, if we encounter an I/O error during this * process, we can successfully add the error to the list. Note that this will * result in the perpetual recycling of errors, but it is an unlikely situation * and not a performance critical operation. */ void spa_errlog_sync(spa_t *spa, uint64_t txg) { dmu_tx_t *tx; avl_tree_t scrub, last; int scrub_finished; mutex_enter(&spa->spa_errlist_lock); /* * Bail out early under normal circumstances. */ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && avl_numnodes(&spa->spa_errlist_last) == 0 && avl_numnodes(&spa->spa_errlist_healed) == 0 && !spa->spa_scrub_finished) { mutex_exit(&spa->spa_errlist_lock); return; } spa_get_errlists(spa, &last, &scrub); scrub_finished = spa->spa_scrub_finished; spa->spa_scrub_finished = B_FALSE; mutex_exit(&spa->spa_errlist_lock); /* * The pool config lock is needed to hold a dataset_t via * sync_error_list() -> get_head_ds(), and lock ordering * requires that we get it before the spa_errlog_lock. */ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); mutex_enter(&spa->spa_errlog_lock); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); /* * Remove healed errors from errors. */ spa_remove_healed_errors(spa, &last, &scrub, tx); /* * Sync out the current list of errors. */ sync_error_list(spa, &last, &spa->spa_errlog_last, tx); /* * Rotate the log if necessary. */ if (scrub_finished) { if (spa->spa_errlog_last != 0) delete_errlog(spa, spa->spa_errlog_last, tx); spa->spa_errlog_last = spa->spa_errlog_scrub; spa->spa_errlog_scrub = 0; sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); } /* * Sync out any pending scrub errors. */ sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); /* * Update the MOS to reflect the new values. 
*/ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, &spa->spa_errlog_last, tx); (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, &spa->spa_errlog_scrub, tx); dmu_tx_commit(tx); mutex_exit(&spa->spa_errlog_lock); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); } static void delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds, dmu_tx_t *tx) { if (spa_err_obj == 0) return; zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t head_ds; name_to_object(za.za_name, &head_ds); if (head_ds == ds) { (void) zap_remove(spa->spa_meta_objset, spa_err_obj, za.za_name, tx); VERIFY0(dmu_object_free(spa->spa_meta_objset, za.za_first_integer, tx)); break; } } zap_cursor_fini(&zc); } void spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx) { mutex_enter(&spa->spa_errlog_lock); delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx); delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx); mutex_exit(&spa->spa_errlog_lock); } static int find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, uint64_t *txg) { dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; int error = dsl_dataset_hold_obj_flags(dp, old_head, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; while (prev_obj != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); if ((error = dsl_dataset_hold_obj_flags(dp, prev_obj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) == 0 && dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head) break; if (error != 0) return (error); prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; } dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); ASSERT(prev_obj != 0); *txg = prev_obj_txg; return (0); } static void swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t old_head, dmu_tx_t *tx) { if (spa_err_obj == 0) return; uint64_t old_head_errlog; int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, old_head, &old_head_errlog); /* If no error log, then there is nothing to do. */ if (error != 0) return; uint64_t txg; error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg); if (error != 0) return; /* * Create an error log if the file system being promoted does not * already have one. 
*/ uint64_t new_head_errlog; error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head, &new_head_errlog); if (error != 0) { new_head_errlog = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj, new_head, new_head_errlog, tx); } zap_cursor_t zc; zap_attribute_t za; zbookmark_err_phys_t err_block; for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { const char *name = ""; name_to_errphys(za.za_name, &err_block); if (err_block.zb_birth < txg) { (void) zap_update(spa->spa_meta_objset, new_head_errlog, za.za_name, 1, strlen(name) + 1, name, tx); (void) zap_remove(spa->spa_meta_objset, old_head_errlog, za.za_name, tx); } } zap_cursor_fini(&zc); } void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, dmu_tx_t *tx) { mutex_enter(&spa->spa_errlog_lock); swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx); swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx); mutex_exit(&spa->spa_errlog_lock); } #if defined(_KERNEL) /* error handling */ EXPORT_SYMBOL(spa_log_error); EXPORT_SYMBOL(spa_approx_errlog_size); EXPORT_SYMBOL(spa_get_last_errlog_size); EXPORT_SYMBOL(spa_get_errlog); EXPORT_SYMBOL(spa_errlog_rotate); EXPORT_SYMBOL(spa_errlog_drain); EXPORT_SYMBOL(spa_errlog_sync); EXPORT_SYMBOL(spa_get_errlists); EXPORT_SYMBOL(spa_delete_dataset_errlog); EXPORT_SYMBOL(spa_swap_errlog); EXPORT_SYMBOL(sync_error_list); EXPORT_SYMBOL(spa_upgrade_errlog); EXPORT_SYMBOL(find_top_affected_fs); EXPORT_SYMBOL(find_birth_txg); EXPORT_SYMBOL(zep_to_zb); EXPORT_SYMBOL(name_to_errphys); #endif /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, UINT, ZMOD_RW, "Limit the number of errors which will be upgraded to the new " "on-disk error log when enabling head_errlog"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index b30676b42d88..9e9c9c22549d 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -1,4219 +1,4228 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2018 Datto Inc. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system * calls that change the file system. 
Each itx has enough information to * be able to replay them after a system crash, power loss, or * equivalent failure mode. These are stored in memory until either: * * 1. they are committed to the pool by the DMU transaction group * (txg), at which point they can be discarded; or * 2. they are committed to the on-disk ZIL for the dataset being * modified (e.g. due to an fsync, O_DSYNC, or other synchronous * requirement). * * In the event of a crash or power loss, the itxs contained by each * dataset's on-disk ZIL will be replayed when that dataset is first * instantiated (e.g. if the dataset is a normal filesystem, when it is * first mounted). * * As hinted at above, there is one ZIL per dataset (both the in-memory * representation, and the on-disk representation). The on-disk format * consists of 3 parts: * * - a single, per-dataset, ZIL header; which points to a chain of * - zero or more ZIL blocks; each of which contains * - zero or more ZIL records * * A ZIL record holds the information necessary to replay a single * system call transaction. A ZIL block can hold many ZIL records, and * the blocks are chained together, similarly to a singly linked list. * * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL * block in the chain, and the ZIL header points to the first block in * the chain. * * Note, there is not a fixed place in the pool to hold these ZIL * blocks; they are dynamically allocated and freed as needed from the * blocks available on the pool, though they can be preferentially * allocated from a dedicated "log" vdev. */ /* * This controls the amount of time that a ZIL block (lwb) will remain * "open" when it isn't "full", and it has a thread waiting for it to be * committed to stable storage. Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ static uint_t zfs_commit_timeout_pct = 5; /* * Minimal time we care to delay commit waiting for more ZIL records. * At least FreeBSD kernel can't sleep for less than 2us at its best. * So requests to sleep for less then 5us is a waste of CPU time with * a risk of significant log latency increase due to oversleep. */ static uint64_t zil_min_commit_timeout = 5000; /* * See zil.h for more information about these fields. */ static zil_kstat_values_t zil_stats = { { "zil_commit_count", KSTAT_DATA_UINT64 }, { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }, }; static zil_sums_t zil_sums_global; static kstat_t *zil_kstats_global; /* * Disable intent logging replay. This global ZIL switch affects all pools. */ int zil_replay_disable = 0; /* * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to * the disk(s) by the ZIL after an LWB write has completed. 
Setting this * will cause ZIL corruption on power loss if a volatile out-of-order * write cache is enabled. */ static int zil_nocacheflush = 0; /* * Limit SLOG write size per commit executed with synchronous priority. * Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. */ static uint64_t zil_slog_bulk = 768 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static itx_t *zil_itx_clone(itx_t *oitx); static int zil_bp_compare(const void *x1, const void *x2) { const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (likely(cmp)) return (cmp); return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void zil_bp_tree_init(zilog_t *zilog) { avl_create(&zilog->zl_bp_tree, zil_bp_compare, sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); } static void zil_bp_tree_fini(zilog_t *zilog) { avl_tree_t *t = &zilog->zl_bp_tree; zil_bp_node_t *zn; void *cookie = NULL; while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zn, sizeof (zil_bp_node_t)); avl_destroy(t); } int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_bp_tree; const dva_t *dva; zil_bp_node_t *zn; avl_index_t where; if (BP_IS_EMBEDDED(bp)) return (0); dva = BP_IDENTITY(bp); if (avl_find(t, dva, &where) != NULL) return (SET_ERROR(EEXIST)); zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); zn->zn_dva = *dva; avl_insert(t, zn, where); return (0); } static zil_header_t * zil_header_in_syncing_context(zilog_t *zilog) { return ((zil_header_t *)zilog->zl_header); } static void zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) { zio_cksum_t *zc = &bp->blk_cksum; (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_0], sizeof (zc->zc_word[ZIL_ZC_GUID_0])); (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_1], sizeof (zc->zc_word[ZIL_ZC_GUID_1])); zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); zc->zc_word[ZIL_ZC_SEQ] = 1ULL; } static int zil_kstats_global_update(kstat_t *ksp, int rw) { zil_kstat_values_t *zs = ksp->ks_data; ASSERT3P(&zil_stats, ==, zs); if (rw == KSTAT_WRITE) { return (SET_ERROR(EACCES)); } zil_kstat_values_update(zs, &zil_sums_global); return (0); } /* * Read a log block and make sure it's valid. */ static int zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; int error; if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) zio_flags |= ZIO_FLAG_SPECULATIVE; if (!decrypt) zio_flags |= ZIO_FLAG_RAW; SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; /* * Validate the checksummed log block. * * Sequence numbers should be... sequential. The checksum * verifier for the next block should be bp's checksum plus 1. * * Also check the log chain linkage and size used. 
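 *
 * Illustrative aside (ex_* names are hypothetical, not ZFS API): a minimal
 * sketch of the check described above, assuming a four-word checksum whose
 * last word carries the sequence number.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *	typedef struct ex_cksum { uint64_t word[4]; } ex_cksum_t;
 *	#define	EX_SEQ	3		// index of the sequence word
 *
 *	static int
 *	ex_next_blk_is_valid(const ex_cksum_t *cur, const ex_cksum_t *stored)
 *	{
 *		ex_cksum_t expect = *cur;
 *
 *		expect.word[EX_SEQ] += 1;	// sequence must advance by one
 *		return (memcmp(&expect, stored, sizeof (expect)) == 0);
 *	}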
*/ cksum.zc_word[ZIL_ZC_SEQ]++; uint64_t size = BP_GET_LSIZE(bp); if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = (*abuf)->b_data; char *lr = (char *)(zilc + 1); if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || zilc->zc_nused < sizeof (*zilc) || zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); } else { *begin = lr; *end = lr + zilc->zc_nused - sizeof (*zilc); *nbp = zilc->zc_next_blk; } } else { char *lr = (*abuf)->b_data; zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { *begin = lr; *end = lr + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } } return (error); } /* * Read a TX_WRITE log data block. */ static int zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; if (BP_IS_HOLE(bp)) { if (wbuf != NULL) memset(wbuf, 0, MAX(BP_GET_LSIZE(bp), lr->lr_length)); return (0); } if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; /* * If we are not using the resulting data, we are just checking that * it hasn't been corrupted so we don't need to waste CPU time * decompressing and decrypting it. */ if (wbuf == NULL) zio_flags |= ZIO_FLAG_RAW; ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { if (wbuf != NULL) memcpy(wbuf, abuf->b_data, arc_buf_size(abuf)); arc_buf_destroy(abuf, &abuf); } return (error); } void zil_sums_init(zil_sums_t *zs) { wmsum_init(&zs->zil_commit_count, 0); wmsum_init(&zs->zil_commit_writer_count, 0); wmsum_init(&zs->zil_itx_count, 0); wmsum_init(&zs->zil_itx_indirect_count, 0); wmsum_init(&zs->zil_itx_indirect_bytes, 0); wmsum_init(&zs->zil_itx_copied_count, 0); wmsum_init(&zs->zil_itx_copied_bytes, 0); wmsum_init(&zs->zil_itx_needcopy_count, 0); wmsum_init(&zs->zil_itx_needcopy_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_count, 0); wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_write, 0); wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0); wmsum_init(&zs->zil_itx_metaslab_slog_count, 0); wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_slog_write, 0); wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0); } void zil_sums_fini(zil_sums_t *zs) { wmsum_fini(&zs->zil_commit_count); wmsum_fini(&zs->zil_commit_writer_count); wmsum_fini(&zs->zil_itx_count); wmsum_fini(&zs->zil_itx_indirect_count); wmsum_fini(&zs->zil_itx_indirect_bytes); wmsum_fini(&zs->zil_itx_copied_count); wmsum_fini(&zs->zil_itx_copied_bytes); wmsum_fini(&zs->zil_itx_needcopy_count); wmsum_fini(&zs->zil_itx_needcopy_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_count); wmsum_fini(&zs->zil_itx_metaslab_normal_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_write); wmsum_fini(&zs->zil_itx_metaslab_normal_alloc); wmsum_fini(&zs->zil_itx_metaslab_slog_count); wmsum_fini(&zs->zil_itx_metaslab_slog_bytes); wmsum_fini(&zs->zil_itx_metaslab_slog_write); wmsum_fini(&zs->zil_itx_metaslab_slog_alloc); } void zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums) { 
zs->zil_commit_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_count); zs->zil_commit_writer_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_writer_count); zs->zil_itx_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_count); zs->zil_itx_indirect_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_indirect_count); zs->zil_itx_indirect_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_indirect_bytes); zs->zil_itx_copied_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_copied_count); zs->zil_itx_copied_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_copied_bytes); zs->zil_itx_needcopy_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_needcopy_count); zs->zil_itx_needcopy_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_needcopy_bytes); zs->zil_itx_metaslab_normal_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_count); zs->zil_itx_metaslab_normal_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes); zs->zil_itx_metaslab_normal_write.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_write); zs->zil_itx_metaslab_normal_alloc.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc); zs->zil_itx_metaslab_slog_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_count); zs->zil_itx_metaslab_slog_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes); zs->zil_itx_metaslab_slog_write.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_write); zs->zil_itx_metaslab_slog_alloc.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc); } /* * Parse the intent log, and call parse_func for each valid record within. */ int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg, boolean_t decrypt) { const zil_header_t *zh = zilog->zl_header; boolean_t claimed = !!zh->zh_claim_txg; uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; uint64_t max_blk_seq = 0; uint64_t max_lr_seq = 0; uint64_t blk_count = 0; uint64_t lr_count = 0; blkptr_t blk, next_blk = {{{{0}}}}; int error = 0; /* * Old logs didn't record the maximum zh_claim_lr_seq. */ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) claim_lr_seq = UINT64_MAX; /* * Starting at the block pointed to by zh_log we read the log chain. * For each block in the chain we strongly check that block to * ensure its validity. We stop when an invalid block is found. * For each block pointer in the chain we call parse_blk_func(). * For each record in each valid block we call parse_lr_func(). * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. 
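 *
 * Illustrative aside (hypothetical ex_* names; record parsing and actual I/O
 * are omitted): a minimal sketch of the walk described above, stopping at the
 * first invalid block or at a sequence number beyond the claimed maximum.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	typedef struct ex_blk {
 *		struct ex_blk *next;	// "next block pointer" in the chain
 *		uint64_t seq;		// strictly increasing sequence number
 *	} ex_blk_t;
 *	typedef int (*ex_blk_cb_t)(const ex_blk_t *, void *);
 *
 *	static int
 *	ex_walk_chain(const ex_blk_t *head, uint64_t max_seq,
 *	    ex_blk_cb_t on_blk, void *arg)
 *	{
 *		int error = 0;
 *
 *		for (const ex_blk_t *b = head; b != NULL; b = b->next) {
 *			if (b->seq > max_seq)	// past the claimed range
 *				break;
 *			if ((error = on_blk(b, arg)) != 0)
 *				break;		// invalid block ends the walk
 *		}
 *		return (error);
 *	}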
*/ zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; char *lrp, *end; arc_buf_t *abuf = NULL; if (blk_seq > claim_blk_seq) break; error = parse_blk_func(zilog, &blk, arg, txg); if (error != 0) break; ASSERT3U(max_blk_seq, <, blk_seq); max_blk_seq = blk_seq; blk_count++; if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) break; error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, &lrp, &end, &abuf); if (error != 0) { if (abuf) arc_buf_destroy(abuf, &abuf); if (claimed) { char name[ZFS_MAX_DATASET_NAME_LEN]; dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS read log block error %d, " "dataset %s, seq 0x%llx\n", error, name, (u_longlong_t)blk_seq); } break; } for (; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); if (lr->lrc_seq > claim_lr_seq) { arc_buf_destroy(abuf, &abuf); goto done; } error = parse_lr_func(zilog, lr, arg, txg); if (error != 0) { arc_buf_destroy(abuf, &abuf); goto done; } ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; } arc_buf_destroy(abuf, &abuf); } done: zilog->zl_parse_error = error; zilog->zl_parse_blk_seq = max_blk_seq; zilog->zl_parse_lr_seq = max_lr_seq; zilog->zl_parse_blk_count = blk_count; zilog->zl_parse_lr_count = lr_count; zil_bp_tree_fini(zilog); return (error); } static int zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { (void) tx; ASSERT(!BP_IS_HOLE(bp)); /* * As we call this function from the context of a rewind to a * checkpoint, each ZIL block whose txg is later than the txg * that we rewind to is invalid. Thus, we return -1 so * zil_parse() doesn't attempt to read it. */ if (bp->blk_birth >= first_txg) return (-1); if (zil_bp_tree_add(zilog, bp) != 0) return (0); zio_free(zilog->zl_spa, first_txg, bp); return (0); } static int zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { (void) zilog, (void) lrc, (void) tx, (void) first_txg; return (0); } static int zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { /* * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); return (zio_wait(zio_claim(NULL, zilog->zl_spa, tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); } static int zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; ASSERT(lrc->lrc_txtype == TX_WRITE); /* * If the block is not readable, don't claim it. This can happen * in normal operation when a log block is written to disk before * some of the dmu_sync() blocks it points to. In this case, the * transaction cannot have been committed to anyone (we would have * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. 
*/ if (lr->lr_blkptr.blk_birth >= first_txg) { error = zil_read_log_data(zilog, lr, NULL); if (error != 0) return (error); } return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } static int zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; spa_t *spa; uint_t ii; ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); if (tx == NULL) { return (0); } /* * XXX: Do we need to byteswap lr? */ spa = zilog->zl_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; /* * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. */ if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { brt_pending_add(spa, bp, tx); } } return (0); } static int zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { switch (lrc->lrc_txtype) { case TX_WRITE: return (zil_claim_write(zilog, lrc, tx, first_txg)); case TX_CLONE_RANGE: return (zil_claim_clone_range(zilog, lrc, tx)); default: return (0); } } static int zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t claim_txg) { (void) claim_txg; zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static int zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; ASSERT(lrc->lrc_txtype == TX_WRITE); /* * If we previously claimed it, we need to free it. */ if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); } return (0); } static int zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; spa_t *spa; uint_t ii; ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); if (tx == NULL) { return (0); } spa = zilog->zl_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; if (!BP_IS_HOLE(bp)) { zio_free(spa, dmu_tx_get_txg(tx), bp); } } return (0); } static int zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { if (claim_txg == 0) { return (0); } switch (lrc->lrc_txtype) { case TX_WRITE: return (zil_free_write(zilog, lrc, tx, claim_txg)); case TX_CLONE_RANGE: return (zil_free_clone_range(zilog, lrc, tx)); default: return (0); } } static int zil_lwb_vdev_compare(const void *x1, const void *x2) { const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; return (TREE_CMP(v1, v2)); } /* * Allocate a new lwb. We may already have a block pointer for it, in which * case we get size and version from there. Or we may not yet, in which case * we choose them here and later make the block allocation match. 
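 *
 * Illustrative aside (a simplified sketch, not the lwb code itself): the two
 * layouts chosen below differ only in where the zil_chain_t header lives, so
 * the usable record space comes out the same either way.
 *
 *	//  slim:    [ chain header | records .............. ]  limit = size
 *	//  classic: [ records .............. | chain trailer ]  limit = size - header
 *
 *	#include <stddef.h>
 *	static size_t
 *	ex_usable_bytes(size_t blk_size, size_t hdr_size, int slim)
 *	{
 *		size_t limit = slim ? blk_size : blk_size - hdr_size;
 *		size_t start = slim ? hdr_size : 0;	// first record offset
 *		return (limit - start);		// identical for both layouts
 *	}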
*/ static lwb_t * zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, uint64_t txg, lwb_state_t state) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; if (bp) { lwb->lwb_blk = *bp; lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); sz = BP_GET_LSIZE(bp); } else { BP_ZERO(&lwb->lwb_blk); lwb->lwb_slim = (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL); } lwb->lwb_slog = slog; lwb->lwb_error = 0; if (lwb->lwb_slim) { lwb->lwb_nmax = sz; lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); } else { lwb->lwb_nmax = sz - sizeof (zil_chain_t); lwb->lwb_nused = lwb->lwb_nfilled = 0; } lwb->lwb_sz = sz; lwb->lwb_state = state; lwb->lwb_buf = zio_buf_alloc(sz); lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; lwb->lwb_alloc_txg = txg; lwb->lwb_max_txg = 0; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); if (state != LWB_STATE_NEW) zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); return (lwb); } static void zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_FLUSH_DONE); ASSERT3P(lwb->lwb_child_zio, ==, NULL); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); VERIFY(list_is_empty(&lwb->lwb_itxs)); VERIFY(list_is_empty(&lwb->lwb_waiters)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); /* * Clear the zilog's field to indicate this lwb is no longer * valid, and prevent use-after-free errors. */ if (zilog->zl_last_lwb_opened == lwb) zilog->zl_last_lwb_opened = NULL; kmem_cache_free(zil_lwb_cache, lwb); } /* * Called when we create in-memory log transactions so that we know * to cleanup the itxs at the end of spa_sync(). */ static void zilog_dirty(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(spa_writeable(zilog->zl_spa)); if (ds->ds_is_snapshot) panic("dirtying snapshot!"); if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, zilog); zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg); } } /* * Determine if the zil is dirty in the specified txg. Callers wanting to * ensure that the dirty state does not change must hold the itxg_lock for * the specified txg. Holding the lock will ensure that the zil cannot be * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current * state. */ static boolean_t __maybe_unused zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK)) return (B_TRUE); return (B_FALSE); } /* * Determine if the zil is dirty. The zil is considered dirty if it has * any pending itx records that have not been cleaned by zil_clean(). */ static boolean_t zilog_is_dirty(zilog_t *zilog) { dsl_pool_t *dp = zilog->zl_dmu_pool; for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) return (B_TRUE); } return (B_FALSE); } /* * Its called in zil_commit context (zil_process_commit_list()/zil_create()). * It activates SPA_FEATURE_ZILSAXATTR feature, if its enabled. 
* Check dsl_dataset_feature_is_active to avoid txg_wait_synced() on every * zil_commit. */ static void zil_commit_activate_saxattr_feature(zilog_t *zilog) { dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); uint64_t txg = 0; dmu_tx_t *tx = NULL; if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL && !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(ds, tx); txg = dmu_tx_get_txg(tx); mutex_enter(&ds->ds_lock); ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } } /* * Create an on-disk intent log. */ static lwb_t * zil_create(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb = NULL; uint64_t txg = 0; dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; boolean_t slog = FALSE; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); ASSERT(zh->zh_claim_txg == 0); ASSERT(zh->zh_replay_seq == 0); blk = zh->zh_log; /* * Allocate an initial log block if: * - there isn't one already * - the existing block is the wrong endianness */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); if (!BP_IS_HOLE(&blk)) { zio_free(zilog->zl_spa, txg, &blk); BP_ZERO(&blk); } error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, ZIL_MIN_BLKSZ, &slog); if (error == 0) zil_init_log_chain(zilog, &blk); } /* * Allocate a log write block (lwb) for the first log block. */ if (error == 0) lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); /* * If we just allocated the first log block, commit our transaction * and wait for zil_sync() to stuff the block pointer into zh_log. * (zh is part of the MOS, so we cannot modify it in open context.) */ if (tx != NULL) { /* * If "zilsaxattr" feature is enabled on zpool, then activate * it now when we're creating the ZIL chain. We can't wait with * this until we write the first xattr log record because we * need to wait for the feature activation to sync out. */ if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL) { mutex_enter(&ds->ds_lock); ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); } dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } else { /* * This branch covers the case where we enable the feature on a * zpool that has existing ZIL headers. */ zil_commit_activate_saxattr_feature(zilog); } IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL, dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)); ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); IMPLY(error == 0, lwb != NULL); return (lwb); } /* * In one tx, free all log blocks and clear the log header. If keep_first * is set, then we're replaying a log with no content. We want to keep the * first block, however, so that the first synchronous transaction doesn't * require a txg_wait_synced() in zil_create(). 
We don't need to * txg_wait_synced() here either when keep_first is set, because both * zil_create() and zil_destroy() will wait for any in-progress destroys * to complete. * Return B_TRUE if there were any entries to replay. */ boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb; dmu_tx_t *tx; uint64_t txg; /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_old_header = *zh; /* debugging aid */ if (BP_IS_HOLE(&zh->zh_log)) return (B_FALSE); tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); mutex_enter(&zilog->zl_lock); ASSERT3U(zilog->zl_destroy_txg, <, txg); zilog->zl_destroy_txg = txg; zilog->zl_keep_first = keep_first; if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); if (!BP_IS_HOLE(&lwb->lwb_blk)) zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); } } else if (!keep_first) { zil_destroy_sync(zilog, tx); } mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); return (B_TRUE); } void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE); } int zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) { dmu_tx_t *tx = txarg; zilog_t *zilog; uint64_t first_txg; zil_header_t *zh; objset_t *os; int error; error = dmu_objset_own_obj(dp, ds->ds_object, DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os); if (error != 0) { /* * EBUSY indicates that the objset is inconsistent, in which * case it can not have a ZIL. */ if (error != EBUSY) { cmn_err(CE_WARN, "can't open objset for %llu, error %u", (unsigned long long)ds->ds_object, error); } return (0); } zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); first_txg = spa_min_claim_txg(zilog->zl_spa); /* * If the spa_log_state is not set to be cleared, check whether * the current uberblock is a checkpoint one and if the current * header has been claimed before moving on. * * If the current uberblock is a checkpointed uberblock then * one of the following scenarios took place: * * 1] We are currently rewinding to the checkpoint of the pool. * 2] We crashed in the middle of a checkpoint rewind but we * did manage to write the checkpointed uberblock to the * vdev labels, so when we tried to import the pool again * the checkpointed uberblock was selected from the import * procedure. * * In both cases we want to zero out all the ZIL blocks, except * the ones that have been claimed at the time of the checkpoint * (their zh_claim_txg != 0). The reason is that these blocks * may be corrupted since we may have reused their locations on * disk after we took the checkpoint. * * We could try to set spa_log_state to SPA_LOG_CLEAR earlier * when we first figure out whether the current uberblock is * checkpointed or not. Unfortunately, that would discard all * the logs, including the ones that are claimed, and we would * leak space. 
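 *
 * Illustrative aside (hypothetical ex_* helper, merely a condensation of the
 * test made just below, not new logic):
 *
 *	#include <stdint.h>
 *	static int
 *	ex_should_clear_zil(int log_state_is_clear, uint64_t checkpoint_txg,
 *	    uint64_t claim_txg)
 *	{
 *		// Clear when explicitly requested, or when the uberblock is
 *		// checkpointed and this ZIL header was never claimed.
 *		return (log_state_is_clear ||
 *		    (checkpoint_txg != 0 && claim_txg == 0));
 *	}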
*/ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0)) { if (!BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_clear_log_block, zil_noop_log_record, tx, first_txg, B_FALSE); } BP_ZERO(&zh->zh_log); if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); dmu_objset_disown(os, B_FALSE, FTAG); return (0); } /* * If we are not rewinding and opening the pool normally, then * the min_claim_txg should be equal to the first txg of the pool. */ ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can * read only part of the log now (e.g. due to a missing device), * but we can read the entire log later, we will not try to replay * or destroy beyond the last block we successfully claimed. */ ASSERT3U(zh->zh_claim_txg, <=, first_txg); if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, first_txg, B_FALSE); zh->zh_claim_txg = first_txg; zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) zh->zh_flags |= ZIL_REPLAY_NEEDED; zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); dmu_objset_disown(os, B_FALSE, FTAG); return (0); } /* * Check the log by walking the log chain. * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. */ int zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) { (void) dp; zilog_t *zilog; objset_t *os; blkptr_t *bp; int error; ASSERT(tx == NULL); error = dmu_objset_from_ds(ds, &os); if (error != 0) { cmn_err(CE_WARN, "can't open objset %llu, error %d", (unsigned long long)ds->ds_object, error); return (0); } zilog = dmu_objset_zil(os); bp = (blkptr_t *)&zilog->zl_header->zh_log; if (!BP_IS_HOLE(bp)) { vdev_t *vd; boolean_t valid = B_TRUE; /* * Check the first block and determine if it's on a log device * which may have been removed or faulted prior to loading this * pool. If so, there's no point in checking the rest of the * log as its content should have already been synced to the * pool. */ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); if (vd->vdev_islog && vdev_is_dead(vd)) valid = vdev_log_state_valid(vd); spa_config_exit(os->os_spa, SCL_STATE, FTAG); if (!valid) return (0); /* * Check whether the current uberblock is checkpointed (e.g. * we are rewinding) and whether the current header has been * claimed or not. If it hasn't then skip verifying it. We * do this because its ZIL blocks may be part of the pool's * state before the rewind, which is no longer valid. */ zil_header_t *zh = zil_header_in_syncing_context(zilog); if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0) return (0); } /* * Because tx == NULL, zil_claim_log_block() will not actually claim * any blocks, but just determine whether it is possible to do so. 
* In addition to checking the log chain, zil_claim_log_block() * will invoke zio_claim() with a done func of spa_claim_notify(), * which will update spa_max_claim_txg. See spa_load() for details. */ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, zilog->zl_header->zh_claim_txg ? -1ULL : spa_min_claim_txg(os->os_spa), B_FALSE); return ((error == ECKSUM || error == ENOENT) ? 0 : error); } /* * When an itx is "skipped", this function is used to properly mark the * waiter as "done, and signal any thread(s) waiting on it. An itx can * be skipped (and not committed to an lwb) for a variety of reasons, * one of them being that the itx was committed via spa_sync(), prior to * it being committed to an lwb; this can happen if a thread calling * zil_commit() is racing with spa_sync(). */ static void zil_commit_waiter_skip(zil_commit_waiter_t *zcw) { mutex_enter(&zcw->zcw_lock); ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } /* * This function is used when the given waiter is to be linked into an * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb. * At this point, the waiter will no longer be referenced by the itx, * and instead, will be referenced by the lwb. */ static void zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { /* * The lwb_waiters field of the lwb is protected by the zilog's * zl_issuer_lock while the lwb is open and zl_lock otherwise. * zl_issuer_lock also protects leaving the open state. * zcw_lwb setting is protected by zl_issuer_lock and state != * flush_done, which transition is protected by zl_lock. */ ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); IMPLY(lwb->lwb_state != LWB_STATE_OPENED, MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(&lwb->lwb_waiters, zcw); ASSERT3P(zcw->zcw_lwb, ==, NULL); zcw->zcw_lwb = lwb; } /* * This function is used when zio_alloc_zil() fails to allocate a ZIL * block, and the given waiter must be linked to the "nolwb waiters" * list inside of zil_process_commit_list(). 
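 *
 * Illustrative aside: the waiter handshake used throughout this file, as a
 * minimal userland sketch assuming pthreads (ex_waiter_t is hypothetical and
 * stands in for the kernel zcw/mutex/cv types).
 *
 *	#include <pthread.h>
 *	typedef struct ex_waiter {
 *		pthread_mutex_t lock;
 *		pthread_cond_t cv;
 *		int done;
 *	} ex_waiter_t;
 *
 *	static void
 *	ex_waiter_signal(ex_waiter_t *w)	// cf. marking a waiter "done"
 *	{
 *		pthread_mutex_lock(&w->lock);
 *		w->done = 1;
 *		pthread_cond_broadcast(&w->cv);
 *		pthread_mutex_unlock(&w->lock);
 *	}
 *
 *	static void
 *	ex_waiter_wait(ex_waiter_t *w)		// cf. zil_commit() blocking
 *	{
 *		pthread_mutex_lock(&w->lock);
 *		while (!w->done)
 *			pthread_cond_wait(&w->cv, &w->lock);
 *		pthread_mutex_unlock(&w->lock);
 *	}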
*/ static void zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(nolwb, zcw); ASSERT3P(zcw->zcw_lwb, ==, NULL); } void zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) { avl_tree_t *t = &lwb->lwb_vdev_tree; avl_index_t where; zil_vdev_node_t *zv, zvsearch; int ndvas = BP_GET_NDVAS(bp); int i; ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); if (zil_nocacheflush) return; mutex_enter(&lwb->lwb_vdev_lock); for (i = 0; i < ndvas; i++) { zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); if (avl_find(t, &zvsearch, &where) == NULL) { zv = kmem_alloc(sizeof (*zv), KM_SLEEP); zv->zv_vdev = zvsearch.zv_vdev; avl_insert(t, zv, where); } } mutex_exit(&lwb->lwb_vdev_lock); } static void zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) { avl_tree_t *src = &lwb->lwb_vdev_tree; avl_tree_t *dst = &nlwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); /* * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does * not need the protection of lwb_vdev_lock (it will only be modified * while holding zilog->zl_lock) as its writes and those of its * children have all completed. The younger 'nlwb' may be waiting on * future writes to additional vdevs. */ mutex_enter(&nlwb->lwb_vdev_lock); /* * Tear down the 'lwb' vdev tree, ensuring that entries which do not * exist in 'nlwb' are moved to it, freeing any would-be duplicates. */ while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) { avl_index_t where; if (avl_find(dst, zv, &where) == NULL) { avl_insert(dst, zv, where); } else { kmem_free(zv, sizeof (*zv)); } } mutex_exit(&nlwb->lwb_vdev_lock); } void zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) { lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); } /* * This function is a called after all vdevs associated with a given lwb * write have completed their DKIOCFLUSHWRITECACHE command; or as soon * as the lwb write completes, if "zil_nocacheflush" is set. Further, * all "previous" lwb's will have completed before this function is * called; i.e. this function is called for all previous lwbs before * it's called for "this" lwb (enforced via zio the dependencies * configured in zil_lwb_set_zio_dependency()). * * The intention is for this function to be called as soon as the * contents of an lwb are considered "stable" on disk, and will survive * any sudden loss of power. At this point, any threads waiting for the * lwb to reach this state are signalled, and the "waiter" structures * are marked "done". */ static void zil_lwb_flush_vdevs_done(zio_t *zio) { lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; zil_commit_waiter_t *zcw; itx_t *itx; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8; lwb->lwb_root_zio = NULL; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); lwb->lwb_state = LWB_STATE_FLUSH_DONE; if (zilog->zl_last_lwb_opened == lwb) { /* * Remember the highest committed log sequence number * for ztest. We only update this value when all the log * writes succeeded, because ztest wants to ASSERT that * it got the whole log chain. 
*/ zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been * propagated "up" to this specific LWB's root ZIO, in * order for this error handling to work correctly. This * includes ZIO errors from either this LWB's write or * flush, as well as any errors from other dependent LWBs * (e.g. a root LWB ZIO that might be a child of this LWB). * * With that said, it's important to note that LWB flush * errors are not propagated up to the LWB root ZIO. * This is incorrect behavior, and results in VDEV flush * errors not being handled correctly here. See the * comment above the call to "zio_flush" for details. */ zcw->zcw_zio_error = zio->io_error; ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } uint64_t txg = lwb->lwb_issued_txg; /* Once we drop the lock, lwb may be freed by zil_sync(). */ mutex_exit(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); zilog->zl_lwb_inflight[txg & TXG_MASK]--; if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0) cv_broadcast(&zilog->zl_lwb_io_cv); mutex_exit(&zilog->zl_lwb_io_lock); } /* * Wait for the completion of all issued write/flush of that txg provided. * It guarantees zil_lwb_flush_vdevs_done() is called and returned. */ static void zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) { ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa)); mutex_enter(&zilog->zl_lwb_io_lock); while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0) cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock); mutex_exit(&zilog->zl_lwb_io_lock); #ifdef ZFS_DEBUG mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); lwb_t *lwb = list_head(&zilog->zl_lwb_list); while (lwb != NULL) { if (lwb->lwb_issued_txg <= txg) { ASSERT(lwb->lwb_state != LWB_STATE_ISSUED); ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE); IMPLY(lwb->lwb_issued_txg > 0, lwb->lwb_state == LWB_STATE_FLUSH_DONE); } IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE, lwb->lwb_buf == NULL); lwb = list_next(&zilog->zl_lwb_list, lwb); } mutex_exit(&zilog->zl_lwb_io_lock); mutex_exit(&zilog->zl_lock); #endif } /* * This is called when an lwb's write zio completes. The callback's * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved * in writing out this specific lwb's data, and in the case that cache * flushes have been deferred, vdevs involved in writing the data for * previous lwbs. The writes corresponding to all the vdevs in the * lwb_vdev_tree will have completed by the time this is called, due to * the zio dependencies configured in zil_lwb_set_zio_dependency(), * which takes deferred flushes into account. The lwb will be "done" * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio * completion callback for the lwb's root zio. 
*/ static void zil_lwb_write_done(zio_t *zio) { lwb_t *lwb = zio->io_private; spa_t *spa = zio->io_spa; zilog_t *zilog = lwb->lwb_zilog; avl_tree_t *t = &lwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; lwb_t *nlwb; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); abd_free(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); lwb->lwb_buf = NULL; mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; + + /* + * If nlwb is not yet issued, zil_lwb_set_zio_dependency() has not + * been called for it yet, and when it is, it won't be able to make + * its write ZIO a parent of this ZIO. In that case we cannot defer + * our flushes, or there may be a race between the done callbacks below. + */ nlwb = list_next(&zilog->zl_lwb_list, lwb); + if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) + nlwb = NULL; mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0) return; /* * If there was an IO error, we're not going to call zio_flush() * on these vdevs, so we simply empty the tree and free the * nodes. We avoid calling zio_flush() since there isn't any * good reason for doing so, after the lwb block failed to be * written out. * * Additionally, we don't perform any further error handling at * this point (e.g. setting "zcw_zio_error" appropriately), as * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus, * we expect any error seen here, to have been propagated to * that function). */ if (zio->io_error != 0) { while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zv, sizeof (*zv)); return; } /* * If this lwb does not have any threads waiting for it to * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE * command to the vdevs written to by "this" lwb, and instead * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE * command for those vdevs. Thus, we merge the vdev tree of * "this" lwb with the vdev tree of the "next" lwb in the list, * and assume the "next" lwb will handle flushing the vdevs (or * deferring the flush(s) again). * * This is a useful performance optimization, especially for * workloads with lots of async write activity and few sync * write and/or fsync activity, as it has the potential to * coalesce multiple flush commands to a vdev into one. */ if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { zil_lwb_flush_defer(lwb, nlwb); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); return; } while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); if (vd != NULL && !vd->vdev_nowritecache) { /* * The "ZIO_FLAG_DONT_PROPAGATE" is currently * always used within "zio_flush". This means, * any errors when flushing the vdev(s), will * (unfortunately) not be handled correctly, * since these "zio_flush" errors will not be * propagated up to "zil_lwb_flush_vdevs_done". */ zio_flush(lwb->lwb_root_zio, vd); } kmem_free(zv, sizeof (*zv)); } } /* * Build the zio dependency chain, which is used to preserve the ordering of * lwb completions that is required by the semantics of the ZIL. Each new lwb * zio becomes a parent of the previous lwb zio, such that the new lwb's zio * cannot complete until the previous lwb's zio completes. * * This is required by the semantics of zil_commit(): the commit waiters * attached to the lwbs will be woken in the lwb zio's completion callback, * so this zio dependency graph ensures the waiters are woken in the correct * order (the same order the lwbs were created).
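 *
 * Illustrative aside (hypothetical ex_* types, not the zio API): the
 * invariant those parent/child edges are meant to provide, in its simplest
 * form.
 *
 *	#include <stddef.h>
 *	typedef struct ex_lwb {
 *		struct ex_lwb *prev;	// lwb issued immediately before this one
 *		int done;		// set by its completion callback
 *	} ex_lwb_t;
 *
 *	static int
 *	ex_may_complete(const ex_lwb_t *lwb)
 *	{
 *		// No lwb "completes" (and wakes its waiters) before its
 *		// predecessor has; waiters therefore wake in issue order.
 *		return (lwb->prev == NULL || lwb->prev->done);
 *	}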
*/ static void zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb); if (prev_lwb == NULL || prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE) return; /* * If the previous lwb's write hasn't already completed, we also want * to order the completion of the lwb write zios (above, we only order * the completion of the lwb root zios). This is required because of * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb. * * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous * lwb will rely on this lwb to flush the vdevs written to by that * previous lwb. Thus, we need to ensure this lwb doesn't issue the * flush until after the previous lwb's write completes. We ensure * this ordering by setting the zio parent/child relationship here. * * Without this relationship on the lwb's write zio, it's possible * for this lwb's write to complete prior to the previous lwb's write * completing; and thus, the vdevs for the previous lwb would be * flushed prior to that lwb's data being written to those vdevs (the * vdevs are flushed in the lwb write zio's completion handler, * zil_lwb_write_done()). */ if (prev_lwb->lwb_state == LWB_STATE_ISSUED) { ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL); zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio); } else { ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); } ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL); zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio); } /* * This function's purpose is to "open" an lwb such that it is ready to * accept new itxs being committed to it. This function is idempotent; if * the passed in lwb has already been opened, it is essentially a no-op. */ static void zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); if (lwb->lwb_state != LWB_STATE_NEW) { ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); return; } mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); } /* * Define a limited set of intent log block sizes. * * These must be a multiple of 4KB. Note only the amount used (again * aligned to 4KB) actually gets written. However, we can't always just * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. */ static const struct { uint64_t limit; uint64_t blksz; } zil_block_buckets[] = { { 4096, 4096 }, /* non TX_WRITE */ { 8192 + 4096, 8192 + 4096 }, /* database */ { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ { 131072, 131072 }, /* < 128KB writes */ { 131072 +4096, 65536 + 4096 }, /* 128KB writes */ { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ }; /* * Maximum block size used by the ZIL. This is picked up when the ZIL is * initialized. Otherwise this should not be used directly; see * zl_max_block_size instead. */ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; /* * Close the log block for being issued and allocate the next one. * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { int i; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); lwb->lwb_state = LWB_STATE_CLOSED; /* * If there was an allocation failure then returned NULL will trigger * zil_commit_writer_stall() at the caller. This is inherently racy, * since allocation may not have happened yet. 
*/ if (lwb->lwb_error != 0) return (NULL); /* * Log blocks are pre-allocated. Here we select the size of the next * block, based on size used in the last block. * - first find the smallest bucket that will fit the block from a * limited set of block sizes. This is because it's faster to write * blocks allocated from the same metaslab as they are adjacent or * close. * - next find the maximum from the new suggested size and an array of * previous sizes. This lessens a picket fence effect of wrongly * guessing the size if we have a stream of say 2k, 64k, 2k, 64k * requests. * * Note we only write what is used, but we can't just allocate * the maximum block size because we can exhaust the available * pool log space. */ uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) continue; zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; for (i = 0; i < ZIL_PREV_BLKS; i++) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, zil_blksz, uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); } /* * Finalize previously closed block and issue the write zio. */ static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) { spa_t *spa = zilog->zl_spa; zil_chain_t *zilc; boolean_t slog; zbookmark_phys_t zb; zio_priority_t prio; int error; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); /* Actually fill the lwb with the data. */ for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; itx = list_next(&lwb->lwb_itxs, itx)) zil_lwb_commit(zilog, lwb, itx); lwb->lwb_nused = lwb->lwb_nfilled; lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); /* * The lwb is now ready to be issued, but it can be only if it already * got its block pointer allocated or the allocation has failed. * Otherwise leave it as-is, relying on some other thread to issue it * after allocating its block pointer via calling zil_lwb_write_issue() * for the previous lwb(s) in the chain. */ mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_READY; if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { mutex_exit(&zilog->zl_lock); return; } mutex_exit(&zilog->zl_lock); next_lwb: if (lwb->lwb_slim) zilc = (zil_chain_t *)lwb->lwb_buf; else zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); int wsz = lwb->lwb_sz; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb); zil_lwb_add_block(lwb, &lwb->lwb_blk); if (lwb->lwb_slim) { /* For Slim ZIL only write what is used. 
*/ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); ASSERT3S(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); wsz = lwb->lwb_write_zio->io_size; } memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); zilc->zc_pad = 0; zilc->zc_nused = lwb->lwb_nused; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; } else { /* * We can't write the lwb if there was an allocation failure, * so create a null zio instead just to maintain dependencies. */ lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL, zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL); lwb->lwb_write_zio->io_error = lwb->lwb_error; } if (lwb->lwb_child_zio) zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio); /* * Open transaction to allocate the next block pointer. */ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); uint64_t txg = dmu_tx_get_txg(tx); /* * Allocate next the block pointer unless we are already in error. */ lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb); blkptr_t *bp = &zilc->zc_next_blk; BP_ZERO(bp); error = lwb->lwb_error; if (error == 0) { error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, &slog); } if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; } /* * Reduce TXG open time by incrementing inflight counter and committing * the transaciton. zil_sync() will wait for it to return to zero. */ mutex_enter(&zilog->zl_lwb_io_lock); lwb->lwb_issued_txg = txg; zilog->zl_lwb_inflight[txg & TXG_MASK]++; zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); mutex_exit(&zilog->zl_lwb_io_lock); dmu_tx_commit(tx); spa_config_enter(spa, SCL_STATE, lwb, RW_READER); /* * We've completed all potentially blocking operations. Update the * nlwb and allow it proceed without possible lock order reversals. */ mutex_enter(&zilog->zl_lock); zil_lwb_set_zio_dependency(zilog, lwb); lwb->lwb_state = LWB_STATE_ISSUED; if (nlwb) { nlwb->lwb_blk = *bp; nlwb->lwb_error = error; nlwb->lwb_slog = slog; nlwb->lwb_alloc_txg = txg; if (nlwb->lwb_state != LWB_STATE_READY) nlwb = NULL; } mutex_exit(&zilog->zl_lock); if (lwb->lwb_slog) { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, lwb->lwb_nused); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write, wsz); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } else { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, lwb->lwb_nused); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write, wsz); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } lwb->lwb_issued_timestamp = gethrtime(); if (lwb->lwb_child_zio) zio_nowait(lwb->lwb_child_zio); zio_nowait(lwb->lwb_write_zio); zio_nowait(lwb->lwb_root_zio); /* * If nlwb was ready when we gave it the block pointer, * it is on us to issue it and possibly following ones. */ lwb = nlwb; if (lwb) goto next_lwb; } /* * Maximum amount of data that can be put into single log block. */ uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize) { return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize); } /* * Maximum amount of log space we agree to waste to reduce number of * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%). 
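 *
 * Illustrative aside (hypothetical ex_* names; the real policy, including the
 * zl_max_block_size cap, lives in zil_lwb_write_close() above): a minimal
 * sketch of "smallest bucket that fits, but never below recent history".
 *
 *	#include <stdint.h>
 *	static uint64_t
 *	ex_pick_blksz(uint64_t need, const uint64_t *buckets, int nbuckets,
 *	    const uint64_t *prev, int nprev)
 *	{
 *		uint64_t sz = buckets[nbuckets - 1];	// largest as fallback
 *		int i;
 *
 *		for (i = 0; i < nbuckets; i++) {	// smallest bucket that fits
 *			if (need <= buckets[i]) {
 *				sz = buckets[i];
 *				break;
 *			}
 *		}
 *		for (i = 0; i < nprev; i++) {		// damp the "picket fence"
 *			if (prev[i] > sz)		// oscillation between sizes
 *				sz = prev[i];
 *		}
 *		return (sz);
 *	}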
*/ static inline uint64_t zil_max_waste_space(zilog_t *zilog) { return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 8); } /* * Maximum amount of write data for WR_COPIED. For correctness, consumers * must fall back to WR_NEED_COPY if we can't fit the entire record into one * maximum sized log block, because each WR_COPIED record must fit in a * single log block. For space efficiency, we want to fit two records into a * max-sized log block. */ uint64_t zil_max_copied_data(zilog_t *zilog) { return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t)); } /* * Estimate space needed in the lwb for the itx. Allocate more lwbs or * split the itx as needed, but don't touch the actual transaction data. * Has to be called under zl_issuer_lock to call zil_lwb_write_close() * to chain more lwbs. */ static lwb_t * zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) { itx_t *citx; lr_t *lr, *clr; lr_write_t *lrw; uint64_t dlen, dnow, lwb_sp, reclen, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); ASSERT3P(lwb->lwb_buf, !=, NULL); zil_lwb_write_open(zilog, lwb); lr = &itx->itx_lr; lrw = (lr_write_t *)lr; /* * A commit itx doesn't represent any on-disk state; instead * it's simply used as a place holder on the commit list, and * provides a mechanism for attaching a "commit waiter" onto the * correct lwb (such that the waiter can be signalled upon * completion of that lwb). Thus, we don't process this itx's * log record if it's a commit itx (these itx's don't have log * records), and instead link the itx's waiter onto the lwb's * list of waiters. * * For more details, see the comment above zil_commit(). */ if (lr->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_lwb(itx->itx_private, lwb); list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { dlen = 0; } reclen = lr->lrc_reclen; zilog->zl_cur_used += (reclen + dlen); cont: /* * If this record won't fit in the current log block, start a new one. * For WR_NEED_COPY optimize layout for minimal number of chunks. */ lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t)); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); if (lwb == NULL) return (NULL); lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; /* * There must be enough space in the new, empty log block to * hold reclen. For WR_COPIED, we need to fit the whole * record in one block, and reclen is the header size + the * data size. For WR_NEED_COPY, we can create multiple * records, splitting the data into multiple blocks, so we * only need to fit one word of data per block; in this case * reclen is just the header size (no data). */ ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); } dnow = MIN(dlen, lwb_sp - reclen); if (dlen > dnow) { ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY); citx = zil_itx_clone(itx); clr = &citx->itx_lr; lr_write_t *clrw = (lr_write_t *)clr; clrw->lr_length = dnow; lrw->lr_offset += dnow; lrw->lr_length -= dnow; } else { citx = itx; clr = lr; } /* * We're actually making an entry, so update lrc_seq to be the * log record sequence number. 
Note that this is generally not * equal to the itx sequence number because not all transactions * are synchronous, and sometimes spa_sync() gets there first. */ clr->lrc_seq = ++zilog->zl_lr_seq; lwb->lwb_nused += reclen + dnow; ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); zil_lwb_add_txg(lwb, lr->lrc_txg); list_insert_tail(&lwb->lwb_itxs, citx); dlen -= dnow; if (dlen > 0) { zilog->zl_cur_used += reclen; goto cont; } if (lr->lrc_txtype == TX_WRITE && lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); return (lwb); } /* * Fill the actual transaction data into the lwb, following zil_lwb_assign(). * Does not require locking. */ static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) { lr_t *lr, *lrb; lr_write_t *lrw, *lrwb; char *lr_buf; uint64_t dlen, reclen; lr = &itx->itx_lr; lrw = (lr_write_t *)lr; if (lr->lrc_txtype == TX_COMMIT) return; if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { dlen = 0; } reclen = lr->lrc_reclen; ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; memcpy(lr_buf, lr, reclen); lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */ lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */ ZIL_STAT_BUMP(zilog, zil_itx_count); /* * If it's a write, fetch the data or get its blkptr as appropriate. */ if (lr->lrc_txtype == TX_WRITE) { if (itx->itx_wr_state == WR_COPIED) { ZIL_STAT_BUMP(zilog, zil_itx_copied_count); ZIL_STAT_INCR(zilog, zil_itx_copied_bytes, lrw->lr_length); } else { char *dbuf; int error; if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; lrb->lrc_reclen += dlen; ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count); ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes, dlen); } else { ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); dbuf = NULL; ZIL_STAT_BUMP(zilog, zil_itx_indirect_count); ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); if (lwb->lwb_child_zio == NULL) { lwb->lwb_child_zio = zio_root( zilog->zl_spa, NULL, NULL, ZIO_FLAG_CANFAIL); } } /* * The "lwb_child_zio" we pass in will become a child of * "lwb_write_zio", when one is created, so one will be * a parent of any zio's created by the "zl_get_data". * This way "lwb_write_zio" will first wait for children * block pointers before own writing, and then for their * writing completion before the vdev cache flushing. */ error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, lwb->lwb_child_zio); if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. */ memset((char *)dbuf + lrwb->lr_length, 0, dlen - lrwb->lr_length); } /* * Typically, the only return values we should see from * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or * EALREADY. However, it is also possible to see other * error values such as ENOSPC or EINVAL from * dmu_read() -> dnode_hold() -> dnode_hold_impl() or * ENXIO as well as a multitude of others from the * block layer through dmu_buf_hold() -> dbuf_read() * -> zio_wait(), as well as through dmu_read() -> * dnode_hold() -> dnode_hold_impl() -> dbuf_read() -> * zio_wait(). When these errors happen, we can assume * that neither an immediate write nor an indirect * write occurred, so we need to fall back to * txg_wait_synced(). This is unusual, so we print to * dmesg whenever one of these errors occurs. 
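 *
 * The WR_NEED_COPY splitting arithmetic in zil_lwb_assign() above can be
 * pictured with a small stand-alone sketch (illustrative only; ex_split()
 * is hypothetical, the sizes are made up, and the waste/space heuristics
 * that decide when to open a new block are intentionally omitted).
 */

#include <stdint.h>
#include <stdio.h>

static void
ex_split(uint64_t reclen, uint64_t dlen, uint64_t blk_space)
{
	uint64_t off = 0;

	if (blk_space <= reclen)	/* each chunk re-emits the header */
		return;

	while (dlen > 0) {
		/* Data that still fits after the header in this block. */
		uint64_t avail = blk_space - reclen;
		uint64_t dnow = dlen < avail ? dlen : avail;

		printf("chunk at offset %llu: %llu data bytes\n",
		    (unsigned long long)off, (unsigned long long)dnow);
		off += dnow;
		dlen -= dnow;
		/* A following chunk would go into a fresh, empty block. */
	}
}

int
main(void)
{
	ex_split(192, 40000, 16384);	/* header, payload, space per block */
	return (0);
}

/*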
*/ switch (error) { case 0: break; default: cmn_err(CE_WARN, "zil_lwb_commit() received " "unexpected error %d from ->zl_get_data()" ". Falling back to txg_wait_synced().", error); zfs_fallthrough; case EIO: txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); zfs_fallthrough; case ENOENT: zfs_fallthrough; case EEXIST: zfs_fallthrough; case EALREADY: return; } } } lwb->lwb_nfilled += reclen + dlen; ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); } itx_t * zil_itx_create(uint64_t txtype, size_t olrsize) { size_t itxsize, lrsize; itx_t *itx; lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t); itxsize = offsetof(itx_t, itx_lr) + lrsize; itx = zio_data_buf_alloc(itxsize); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; itx->itx_lr.lrc_seq = 0; /* defensive */ memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize); itx->itx_sync = B_TRUE; /* default is synchronous */ itx->itx_callback = NULL; itx->itx_callback_data = NULL; itx->itx_size = itxsize; return (itx); } static itx_t * zil_itx_clone(itx_t *oitx) { itx_t *itx = zio_data_buf_alloc(oitx->itx_size); memcpy(itx, oitx, oitx->itx_size); itx->itx_callback = NULL; itx->itx_callback_data = NULL; return (itx); } void zil_itx_destroy(itx_t *itx) { IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL); IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); if (itx->itx_callback != NULL) itx->itx_callback(itx->itx_callback_data); zio_data_buf_free(itx, itx->itx_size); } /* * Free up the sync and async itxs. The itxs_t has already been detached * so no locks are needed. */ static void zil_itxg_clean(void *arg) { itx_t *itx; list_t *list; avl_tree_t *t; void *cookie; itxs_t *itxs = arg; itx_async_node_t *ian; list = &itxs->i_sync_list; while ((itx = list_remove_head(list)) != NULL) { /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via * zil_lwb_assign(), and free'd in that function. Having * said that, it is still possible for commit itxs to be * found here, due to the following race: * * - a thread calls zil_commit() which assigns the * commit itx to a per-txg i_sync_list * - zil_itxg_clean() is called (e.g. via spa_sync()) * while the waiter is still on the i_sync_list * * There's nothing to prevent syncing the txg while the * waiter is on the i_sync_list. This normally doesn't * happen because spa_sync() is slower than zil_commit(), * but if zil_commit() calls txg_wait_synced() (e.g. * because zil_create() or zil_commit_writer_stall() is * called) we will hit this case. */ if (itx->itx_lr.lrc_txtype == TX_COMMIT) zil_commit_waiter_skip(itx->itx_private); zil_itx_destroy(itx); } cookie = NULL; t = &itxs->i_async_tree; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list = &ian->ia_list; while ((itx = list_remove_head(list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(list); kmem_free(ian, sizeof (itx_async_node_t)); } avl_destroy(t); kmem_free(itxs, sizeof (itxs_t)); } static int zil_aitx_compare(const void *x1, const void *x2) { const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; return (TREE_CMP(o1, o2)); } /* * Remove all async itx with the given oid. 
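 *
 * A sketch of the allocation layout used by zil_itx_create() above
 * (illustrative only; ex_itx_t is a hypothetical stand-in for itx_t):
 * the record length is rounded up to an 8-byte multiple and the padding
 * is zeroed so no uninitialized bytes can ever reach a log block.
 */

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

typedef struct ex_itx {
	size_t	ex_size;	/* total allocation size, kept for the free path */
	char	ex_lr[];	/* log record, padded to 8 bytes */
} ex_itx_t;

static ex_itx_t *
ex_itx_create(size_t olrsize)
{
	size_t lrsize = (olrsize + 7) & ~(size_t)7;	/* round up to 8 */
	ex_itx_t *itx = malloc(offsetof(ex_itx_t, ex_lr) + lrsize);

	if (itx == NULL)
		return (NULL);
	itx->ex_size = offsetof(ex_itx_t, ex_lr) + lrsize;
	memset(itx->ex_lr + olrsize, 0, lrsize - olrsize);	/* zero the pad */
	return (itx);
}

/*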
*/ void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; list_t clean_list; itx_t *itx; ASSERT(oid != 0); list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; ian = avl_find(t, &oid, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); } while ((itx = list_remove_head(&clean_list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(&clean_list); } void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) { uint64_t txg; itxg_t *itxg; itxs_t *itxs, *clean = NULL; /* * Ensure the data of a renamed file is committed before the rename. */ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) zil_async_to_sync(zilog, itx->itx_oid); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) txg = ZILTEST_TXG; else txg = dmu_tx_get_txg(tx); itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); itxs = itxg->itxg_itxs; if (itxg->itxg_txg != txg) { if (itxs != NULL) { /* * The zil_clean callback hasn't got around to cleaning * this itxg. Save the itxs for release below. * This should be rare. */ zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " "txg %llu", (u_longlong_t)itxg->itxg_txg); clean = itxg->itxg_itxs; } itxg->itxg_txg = txg; itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); list_create(&itxs->i_sync_list, sizeof (itx_t), offsetof(itx_t, itx_node)); avl_create(&itxs->i_async_tree, zil_aitx_compare, sizeof (itx_async_node_t), offsetof(itx_async_node_t, ia_node)); } if (itx->itx_sync) { list_insert_tail(&itxs->i_sync_list, itx); } else { avl_tree_t *t = &itxs->i_async_tree; uint64_t foid = LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid); itx_async_node_t *ian; avl_index_t where; ian = avl_find(t, &foid, &where); if (ian == NULL) { ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); list_create(&ian->ia_list, sizeof (itx_t), offsetof(itx_t, itx_node)); ian->ia_foid = foid; avl_insert(t, ian, where); } list_insert_tail(&ian->ia_list, itx); } itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); /* * We don't want to dirty the ZIL using ZILTEST_TXG, because * zil_clean() will never be called using ZILTEST_TXG. Thus, we * need to be careful to always dirty the ZIL using the "real" * TXG (not itxg_txg) even when the SPA is frozen. */ zilog_dirty(zilog, dmu_tx_get_txg(tx)); mutex_exit(&itxg->itxg_lock); /* Release the old itxs now we've dropped the lock */ if (clean != NULL) zil_itxg_clean(clean); } /* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. We should only do this after we * have written out the uberblocks (i.e. txg has been committed) so that * don't inadvertently clean out in-memory log records that would be required * by zil_commit(). 
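 *
 * A sketch of the per-txg bucket selection used by zil_remove_async()
 * and zil_itx_assign() above (illustrative only; EX_TXG_SIZE and the
 * ex_ names are hypothetical): a txg maps onto a small ring of buckets,
 * and a non-empty bucket tagged with a different txg is leftover state
 * from an older txg that was never cleaned.
 */

#include <stdbool.h>
#include <stdint.h>

#define	EX_TXG_SIZE	4		/* small power of two, like TXG_SIZE */
#define	EX_TXG_MASK	(EX_TXG_SIZE - 1)

typedef struct ex_itxg {
	uint64_t	ex_txg;		/* txg this bucket currently holds */
	void		*ex_itxs;	/* opaque per-txg itx collection */
} ex_itxg_t;

static bool
ex_bucket_missed_cleanup(ex_itxg_t *buckets, uint64_t txg)
{
	ex_itxg_t *itxg = &buckets[txg & EX_TXG_MASK];

	return (itxg->ex_itxs != NULL && itxg->ex_txg != txg);
}

/*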
*/ void zil_clean(zilog_t *zilog, uint64_t synced_txg) { itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; itxs_t *clean_me; ASSERT3U(synced_txg, <, ZILTEST_TXG); mutex_enter(&itxg->itxg_lock); if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { mutex_exit(&itxg->itxg_lock); return; } ASSERT3U(itxg->itxg_txg, <=, synced_txg); ASSERT3U(itxg->itxg_txg, !=, 0); clean_me = itxg->itxg_itxs; itxg->itxg_itxs = NULL; itxg->itxg_txg = 0; mutex_exit(&itxg->itxg_lock); /* * Preferably start a task queue to free up the old itxs but * if taskq_dispatch can't allocate resources to do that then * free it in-line. This should be rare. Note, using TQ_SLEEP * created a bad performance problem. */ ASSERT3P(zilog->zl_dmu_pool, !=, NULL); ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, zil_itxg_clean, clean_me, TQ_NOSLEEP); if (id == TASKQID_INVALID) zil_itxg_clean(clean_me); } /* * This function will traverse the queue of itxs that need to be * committed, and move them onto the ZIL's zl_itx_commit_list. */ static uint64_t zil_get_commit_list(zilog_t *zilog) { uint64_t otxg, txg, wtxg = 0; list_t *commit_list = &zilog->zl_itx_commit_list; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. That's okay since we'll * only commit things in the future. */ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If we're adding itx records to the zl_itx_commit_list, * then the zil better be dirty in this "txg". We can assert * that here since we're holding the itxg_lock which will * prevent spa_sync from cleaning it. Once we add the itxs * to the zl_itx_commit_list we must commit it to disk even * if it's unnecessary (i.e. the txg was synced). */ ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); list_t *sync_list = &itxg->itxg_itxs->i_sync_list; if (unlikely(zilog->zl_suspend > 0)) { /* * ZIL was just suspended, but we lost the race. * Allow all earlier itxs to be committed, but ask * caller to do txg_wait_synced(txg) for any new. */ if (!list_is_empty(sync_list)) wtxg = MAX(wtxg, txg); } else { list_move_tail(commit_list, sync_list); } mutex_exit(&itxg->itxg_lock); } return (wtxg); } /* * Move the async itxs for a specified object to commit into sync lists. */ void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. */ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If a foid is specified then find that node and append its * list. Otherwise walk the tree appending all the lists * to the sync list. We add to the end rather than the * beginning to ensure the create has happened. 
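 *
 * The dispatch-or-do-it-inline pattern used by zil_clean() above, as a
 * stand-alone sketch (illustrative only; ex_dispatch() is a hypothetical
 * non-blocking dispatcher stub, not the real taskq_dispatch() API).
 */

#include <stdbool.h>

static bool
ex_dispatch(void (*fn)(void *), void *arg)
{
	/* Stub: pretend the queue is full so the caller falls back. */
	(void) fn;
	(void) arg;
	return (false);
}

static void
ex_clean_async(void (*cleaner)(void *), void *work)
{
	/*
	 * Prefer handing the work to a background thread, but never
	 * sleep waiting for dispatch resources; if the queue is full,
	 * do the cleanup in the caller's context instead.
	 */
	if (!ex_dispatch(cleaner, work))
		cleaner(work);
}

/*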
*/ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { ian = avl_find(t, &foid, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); } } else { void *cookie = NULL; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); list_destroy(&ian->ia_list); kmem_free(ian, sizeof (itx_async_node_t)); } } mutex_exit(&itxg->itxg_lock); } } /* * This function will prune commit itxs that are at the head of the * commit list (it won't prune past the first non-commit itx), and * either: a) attach them to the last lwb that's still pending * completion, or b) skip them altogether. * * This is used as a performance optimization to prevent commit itxs * from generating new lwbs when it's unnecessary to do so. */ static void zil_prune_commit_list(zilog_t *zilog) { itx_t *itx; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; if (lrc->lrc_txtype != TX_COMMIT) break; mutex_enter(&zilog->zl_lock); lwb_t *last_lwb = zilog->zl_last_lwb_opened; if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) { /* * All of the itxs this waiter was waiting on * must have already completed (or there were * never any itx's for it to wait on), so it's * safe to skip this waiter and mark it done. */ zil_commit_waiter_skip(itx->itx_private); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); } mutex_exit(&zilog->zl_lock); list_remove(&zilog->zl_itx_commit_list, itx); zil_itx_destroy(itx); } IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); } static void zil_commit_writer_stall(zilog_t *zilog) { /* * When zio_alloc_zil() fails to allocate the next lwb block on * disk, we must call txg_wait_synced() to ensure all of the * lwbs in the zilog's zl_lwb_list are synced and then freed (in * zil_sync()), such that any subsequent ZIL writer (i.e. a call * to zil_process_commit_list()) will have to call zil_create(), * and start a new ZIL chain. * * Since zil_alloc_zil() failed, the lwb that was previously * issued does not have a pointer to the "next" lwb on disk. * Thus, if another ZIL writer thread was to allocate the "next" * on-disk lwb, that block could be leaked in the event of a * crash (because the previous lwb on-disk would not point to * it). * * We must hold the zilog's zl_issuer_lock while we do this, to * ensure no new threads enter zil_process_commit_list() until * all lwb's in the zl_lwb_list have been synced and freed * (which is achieved via the txg_wait_synced() call). */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); txg_wait_synced(zilog->zl_dmu_pool, 0); ASSERT(list_is_empty(&zilog->zl_lwb_list)); } /* * This function will traverse the commit list, creating new lwbs as * needed, and committing the itxs from the commit list to these newly * created lwbs. Additionally, as a new lwb is created, the previous * lwb will be issued to the zio layer to be written to disk. */ static void zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) { spa_t *spa = zilog->zl_spa; list_t nolwb_itxs; list_t nolwb_waiters; lwb_t *lwb, *plwb; itx_t *itx; boolean_t first = B_TRUE; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); /* * Return if there's nothing to commit before we dirty the fs by * calling zil_create(). 
*/ if (list_is_empty(&zilog->zl_itx_commit_list)) return; list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) { lwb = zil_create(zilog); } else { /* * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will * have already been created (zl_lwb_list not empty). */ zil_commit_activate_saxattr_feature(zilog); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); first = (lwb->lwb_state == LWB_STATE_NEW) && ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL || plwb->lwb_state == LWB_STATE_FLUSH_DONE); } while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; uint64_t txg = lrc->lrc_txg; ASSERT3U(txg, !=, 0); if (lrc->lrc_txtype == TX_COMMIT) { DTRACE_PROBE2(zil__process__commit__itx, zilog_t *, zilog, itx_t *, itx); } else { DTRACE_PROBE2(zil__process__normal__itx, zilog_t *, zilog, itx_t *, itx); } boolean_t synced = txg <= spa_last_synced_txg(spa); boolean_t frozen = txg > spa_freeze_txg(spa); /* * If the txg of this itx has already been synced out, then * we don't need to commit this itx to an lwb. This is * because the data of this itx will have already been * written to the main pool. This is inherently racy, and * it's still ok to commit an itx whose txg has already * been synced; this will result in a write that's * unnecessary, but will do no harm. * * With that said, we always want to commit TX_COMMIT itxs * to an lwb, regardless of whether or not that itx's txg * has been synced out. We do this to ensure any OPENED lwb * will always have at least one zil_commit_waiter_t linked * to the lwb. * * As a counter-example, if we skipped TX_COMMIT itx's * whose txg had already been synced, the following * situation could occur if we happened to be racing with * spa_sync: * * 1. We commit a non-TX_COMMIT itx to an lwb, where the * itx's txg is 10 and the last synced txg is 9. * 2. spa_sync finishes syncing out txg 10. * 3. We move to the next itx in the list, it's a TX_COMMIT * whose txg is 10, so we skip it rather than committing * it to the lwb used in (1). * * If the itx that is skipped in (3) is the last TX_COMMIT * itx in the commit list, than it's possible for the lwb * used in (1) to remain in the OPENED state indefinitely. * * To prevent the above scenario from occurring, ensuring * that once an lwb is OPENED it will transition to ISSUED * and eventually DONE, we always commit TX_COMMIT itx's to * an lwb here, even if that itx's txg has already been * synced. * * Finally, if the pool is frozen, we _always_ commit the * itx. The point of freezing the pool is to prevent data * from being written to the main pool via spa_sync, and * instead rely solely on the ZIL to persistently store the * data; i.e. when the pool is frozen, the last synced txg * value can't be trusted. */ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs); if (lwb == NULL) { list_insert_tail(&nolwb_itxs, itx); } else if ((zcw->zcw_lwb != NULL && zcw->zcw_lwb != lwb) || zcw->zcw_done) { /* * Our lwb is done, leave the rest of * itx list to somebody else who care. 
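 *
 * The decision spelled out above reduces to a small predicate; here it
 * is in isolation (illustrative only, the ex_ names are hypothetical):
 */

#include <stdbool.h>
#include <stdint.h>

static bool
ex_itx_needs_lwb(uint64_t itx_txg, uint64_t last_synced_txg,
    uint64_t freeze_txg, bool is_commit_itx)
{
	bool synced = itx_txg <= last_synced_txg;
	bool frozen = itx_txg > freeze_txg;

	/*
	 * Frozen pools rely on the ZIL alone, unsynced txgs still need
	 * their records persisted, and TX_COMMIT itxs are always taken
	 * so an opened lwb is guaranteed at least one waiter.
	 */
	return (frozen || !synced || is_commit_itx);
}

/*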
*/ first = B_FALSE; break; } } else { if (lrc->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } list_insert_tail(&nolwb_itxs, itx); } } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } } if (lwb == NULL) { /* * This indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this happens, we must stall * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. */ while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); /* * Additionally, we have to signal and mark the "nolwb" * waiters as "done" here, since without an lwb, we * can't do this via zil_lwb_flush_vdevs_done() like * normal. */ zil_commit_waiter_t *zcw; while ((zcw = list_remove_head(&nolwb_waiters)) != NULL) zil_commit_waiter_skip(zcw); /* * And finally, we have to destroy the itx's that * couldn't be committed to an lwb; this will also call * the itx's callback if one exists for the itx. */ while ((itx = list_remove_head(&nolwb_itxs)) != NULL) zil_itx_destroy(itx); } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); /* * At this point, the ZIL block pointed at by the "lwb" * variable is in "new" or "opened" state. * * If it's "new", then no itxs have been committed to it, so * there's no point in issuing its zio (i.e. it's "empty"). * * If it's "opened", then it contains one or more itxs that * eventually need to be committed to stable storage. In * this case we intentionally do not issue the lwb's zio * to disk yet, and instead rely on one of the following * two mechanisms for issuing the zio: * * 1. Ideally, there will be more ZIL activity occurring on * the system, such that this function will be immediately * called again by different thread and this lwb will be * closed by zil_lwb_assign(). This way, the lwb will be * "full" when it is issued to disk, and we'll make use of * the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on * the system, zil_commit_waiter() will close it and issue * the zio. If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount * of ZIL activity occurring on the system at that time. * * We do this for a couple of reasons: * * 1. To try and reduce the number of IOPs needed to * write the same number of itxs. If an lwb has space * available in its buffer for more itxs, and more itxs * will be committed relatively soon (relative to the * latency of performing a write), then it's beneficial * to wait for these "next" itxs. This way, more itxs * can be committed to stable storage with fewer writes. * * 2. To try and use the largest lwb block size that the * incoming rate of itxs can support. Again, this is to * try and pack as many itxs into as few lwbs as * possible, without significantly impacting the latency * of each individual itx. * * If we had no already running or open LWBs, it can be * the workload is single-threaded. And if the ZIL write * latency is very small or if the LWB is almost full, it * may be cheaper to bypass the delay. 
*/ if (lwb->lwb_state == LWB_STATE_OPENED && first) { hrtime_t sleep = zilog->zl_last_lwb_latency * zfs_commit_timeout_pct / 100; if (sleep < zil_min_commit_timeout || lwb->lwb_nmax - lwb->lwb_nused < lwb->lwb_nmax / 8) { list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); zilog->zl_cur_used = 0; if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); } } } } } /* * This function is responsible for ensuring the passed in commit waiter * (and associated commit itx) is committed to an lwb. If the waiter is * not already committed to an lwb, all itxs in the zilog's queue of * itxs will be processed. The assumption is the passed in waiter's * commit itx will found in the queue just like the other non-commit * itxs, such that when the entire queue is processed, the waiter will * have been committed to an lwb. * * The lwb associated with the passed in waiter is not guaranteed to * have been issued by the time this function completes. If the lwb is * not issued, we rely on future calls to zil_commit_writer() to issue * the lwb, or the timeout mechanism found in zil_commit_waiter(). */ static uint64_t zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { list_t ilwbs; lwb_t *lwb; uint64_t wtxg = 0; ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node)); mutex_enter(&zilog->zl_issuer_lock); if (zcw->zcw_lwb != NULL || zcw->zcw_done) { /* * It's possible that, while we were waiting to acquire * the "zl_issuer_lock", another thread committed this * waiter to an lwb. If that occurs, we bail out early, * without processing any of the zilog's queue of itxs. * * On certain workloads and system configurations, the * "zl_issuer_lock" can become highly contended. In an * attempt to reduce this contention, we immediately drop * the lock if the waiter has already been processed. * * We've measured this optimization to reduce CPU spent * contending on this lock by up to 5%, using a system * with 32 CPUs, low latency storage (~50 usec writes), * and 1024 threads performing sync writes. */ goto out; } ZIL_STAT_BUMP(zilog, zil_commit_writer_count); wtxg = zil_get_commit_list(zilog); zil_prune_commit_list(zilog); zil_process_commit_list(zilog, zcw, &ilwbs); out: mutex_exit(&zilog->zl_issuer_lock); while ((lwb = list_remove_head(&ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); list_destroy(&ilwbs); return (wtxg); } static void zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zcw->zcw_lock)); ASSERT3B(zcw->zcw_done, ==, B_FALSE); lwb_t *lwb = zcw->zcw_lwb; ASSERT3P(lwb, !=, NULL); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); /* * If the lwb has already been issued by another thread, we can * immediately return since there's no work to be done (the * point of this function is to issue the lwb). Additionally, we * do this prior to acquiring the zl_issuer_lock, to avoid * acquiring it when it's not necessary to do so. */ if (lwb->lwb_state != LWB_STATE_OPENED) return; /* * In order to call zil_lwb_write_close() we must hold the * zilog's "zl_issuer_lock". We can't simply acquire that lock, * since we're already holding the commit waiter's "zcw_lock", * and those two locks are acquired in the opposite order * elsewhere. 
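 *
 * The drop-and-reacquire sequence that follows is a general pattern; a
 * user-space sketch (illustrative only; pthread is used as a stand-in
 * for the kernel mutex API and the ex_ names are hypothetical):
 */

#include <pthread.h>
#include <stdbool.h>

typedef struct ex_waiter {
	pthread_mutex_t	ex_lock;
	bool		ex_done;
} ex_waiter_t;

/* Called with zcw->ex_lock held; returns with it still held. */
static void
ex_take_locks_in_order(pthread_mutex_t *issuer_lock, ex_waiter_t *zcw)
{
	/*
	 * We need issuer_lock, but it must be taken before ex_lock.
	 * Drop ours, take both in the documented order, then re-check
	 * the state we relied on, since it may have changed while we
	 * held nothing.
	 */
	pthread_mutex_unlock(&zcw->ex_lock);
	pthread_mutex_lock(issuer_lock);
	pthread_mutex_lock(&zcw->ex_lock);

	if (zcw->ex_done) {
		/* Someone else finished the work in the window; bail. */
		pthread_mutex_unlock(issuer_lock);
		return;
	}

	/* ... do the work that needed both locks ... */
	pthread_mutex_unlock(issuer_lock);
}

/*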
*/ mutex_exit(&zcw->zcw_lock); mutex_enter(&zilog->zl_issuer_lock); mutex_enter(&zcw->zcw_lock); /* * Since we just dropped and re-acquired the commit waiter's * lock, we have to re-check to see if the waiter was marked * "done" during that process. If the waiter was marked "done", * the "lwb" pointer is no longer valid (it can be free'd after * the waiter is marked "done"), so without this check we could * wind up with a use-after-free error below. */ if (zcw->zcw_done) { mutex_exit(&zilog->zl_issuer_lock); return; } ASSERT3P(lwb, ==, zcw->zcw_lwb); /* * We've already checked this above, but since we hadn't acquired * the zilog's zl_issuer_lock, we have to perform this check a * second time while holding the lock. * * We don't need to hold the zl_lock since the lwb cannot transition * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb * _can_ transition from CLOSED to DONE, but it's OK to race with * that transition since we treat the lwb the same, whether it's in * the CLOSED, ISSUED or DONE states. * * The important thing, is we treat the lwb differently depending on * if it's OPENED or CLOSED, and block any other threads that might * attempt to close/issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call * zil_lwb_write_close() if the lwb had already been closed/issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ if (lwb->lwb_state != LWB_STATE_OPENED) { mutex_exit(&zilog->zl_issuer_lock); return; } /* * We do not need zcw_lock once we hold zl_issuer_lock and know lwb * is still open. But we have to drop it to avoid a deadlock in case * callback of zio issued by zil_lwb_write_issue() try to get it, * while zil_lwb_write_issue() is blocked on attempt to issue next * lwb it found in LWB_STATE_READY state. */ mutex_exit(&zcw->zcw_lock); /* * As described in the comments above zil_commit_waiter() and * zil_process_commit_list(), we need to issue this lwb's zio * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); /* * Since the lwb's zio hadn't been issued by the time this thread * reached its timeout, we reset the zilog's "zl_cur_used" field * to influence the zil block size selection algorithm. * * By having to issue the lwb's zio here, it means the size of the * lwb was too large, given the incoming throughput of itxs. By * setting "zl_cur_used" to zero, we communicate this fact to the * block size selection algorithm, so it can take this information * into account, and potentially select a smaller size for the * next lwb block that is allocated. */ zilog->zl_cur_used = 0; if (nlwb == NULL) { /* * When zil_lwb_write_close() returns NULL, this * indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the * zil_commit_writer_stall() function for more details. */ zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); mutex_exit(&zilog->zl_issuer_lock); } else { mutex_exit(&zilog->zl_issuer_lock); zil_lwb_write_issue(zilog, lwb); } mutex_enter(&zcw->zcw_lock); } /* * This function is responsible for performing the following two tasks: * * 1. its primary responsibility is to block until the given "commit * waiter" is considered "done". * * 2. 
its secondary responsibility is to issue the zio for the lwb that * the given "commit waiter" is waiting on, if this function has * waited "long enough" and the lwb is still in the "open" state. * * Given a sufficient amount of itxs being generated and written using * the ZIL, the lwb's zio will be issued via the zil_lwb_assign() * function. If this does not occur, this secondary responsibility will * ensure the lwb is issued even if there is not other synchronous * activity on the system. * * For more details, see zil_process_commit_list(); more specifically, * the comment at the bottom of that function. */ static void zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(spa_writeable(zilog->zl_spa)); mutex_enter(&zcw->zcw_lock); /* * The timeout is scaled based on the lwb latency to avoid * significantly impacting the latency of each individual itx. * For more details, see the comment at the bottom of the * zil_process_commit_list() function. */ int pct = MAX(zfs_commit_timeout_pct, 1); hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100; hrtime_t wakeup = gethrtime() + sleep; boolean_t timedout = B_FALSE; while (!zcw->zcw_done) { ASSERT(MUTEX_HELD(&zcw->zcw_lock)); lwb_t *lwb = zcw->zcw_lwb; /* * Usually, the waiter will have a non-NULL lwb field here, * but it's possible for it to be NULL as a result of * zil_commit() racing with spa_sync(). * * When zil_clean() is called, it's possible for the itxg * list (which may be cleaned via a taskq) to contain * commit itxs. When this occurs, the commit waiters linked * off of these commit itxs will not be committed to an * lwb. Additionally, these commit waiters will not be * marked done until zil_commit_waiter_skip() is called via * zil_itxg_clean(). * * Thus, it's possible for this commit waiter (i.e. the * "zcw" variable) to be found in this "in between" state; * where it's "zcw_lwb" field is NULL, and it hasn't yet * been skipped, so it's "zcw_done" field is still B_FALSE. */ IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW); if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { ASSERT3B(timedout, ==, B_FALSE); /* * If the lwb hasn't been issued yet, then we * need to wait with a timeout, in case this * function needs to issue the lwb after the * timeout is reached; responsibility (2) from * the comment above this function. */ int rc = cv_timedwait_hires(&zcw->zcw_cv, &zcw->zcw_lock, wakeup, USEC2NSEC(1), CALLOUT_FLAG_ABSOLUTE); if (rc != -1 || zcw->zcw_done) continue; timedout = B_TRUE; zil_commit_waiter_timeout(zilog, zcw); if (!zcw->zcw_done) { /* * If the commit waiter has already been * marked "done", it's possible for the * waiter's lwb structure to have already * been freed. Thus, we can only reliably * make these assertions if the waiter * isn't done. */ ASSERT3P(lwb, ==, zcw->zcw_lwb); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); } } else { /* * If the lwb isn't open, then it must have already * been issued. In that case, there's no need to * use a timeout when waiting for the lwb to * complete. * * Additionally, if the lwb is NULL, the waiter * will soon be signaled and marked done via * zil_clean() and zil_itxg_clean(), so no timeout * is required. 
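 *
 * The timed wait above has a direct user-space analogue; a rough sketch
 * (illustrative only; ex_wait_until_done() is hypothetical, the caller
 * holds *lk, and the deadline is absolute, mirroring
 * CALLOUT_FLAG_ABSOLUTE).  Returns true once done, false if the
 * deadline passes first and the caller must act on its own.
 */

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <time.h>

static bool
ex_wait_until_done(pthread_mutex_t *lk, pthread_cond_t *cv,
    const bool *done, const struct timespec *deadline)
{
	while (!*done) {
		if (pthread_cond_timedwait(cv, lk, deadline) == ETIMEDOUT)
			return (false);
	}
	return (true);
}

/*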
*/ IMPLY(lwb != NULL, lwb->lwb_state == LWB_STATE_CLOSED || lwb->lwb_state == LWB_STATE_READY || lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE); cv_wait(&zcw->zcw_cv, &zcw->zcw_lock); } } mutex_exit(&zcw->zcw_lock); } static zil_commit_waiter_t * zil_alloc_commit_waiter(void) { zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP); cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL); mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&zcw->zcw_node); zcw->zcw_lwb = NULL; zcw->zcw_done = B_FALSE; zcw->zcw_zio_error = 0; return (zcw); } static void zil_free_commit_waiter(zil_commit_waiter_t *zcw) { ASSERT(!list_link_active(&zcw->zcw_node)); ASSERT3P(zcw->zcw_lwb, ==, NULL); ASSERT3B(zcw->zcw_done, ==, B_TRUE); mutex_destroy(&zcw->zcw_lock); cv_destroy(&zcw->zcw_cv); kmem_cache_free(zil_zcw_cache, zcw); } /* * This function is used to create a TX_COMMIT itx and assign it. This * way, it will be linked into the ZIL's list of synchronous itxs, and * then later committed to an lwb (or skipped) when * zil_process_commit_list() is called. */ static void zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); /* * Since we are not going to create any new dirty data, and we * can even help with clearing the existing dirty data, we * should not be subject to the dirty data based delays. We * use TXG_NOTHROTTLE to bypass the delay mechanism. */ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; itx->itx_private = zcw; zil_itx_assign(zilog, itx, tx); dmu_tx_commit(tx); } /* * Commit ZFS Intent Log transactions (itxs) to stable storage. * * When writing ZIL transactions to the on-disk representation of the * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple * itxs can be committed to a single lwb. Once a lwb is written and * committed to stable storage (i.e. the lwb is written, and vdevs have * been flushed), each itx that was committed to that lwb is also * considered to be committed to stable storage. * * When an itx is committed to an lwb, the log record (lr_t) contained * by the itx is copied into the lwb's zio buffer, and once this buffer * is written to disk, it becomes an on-disk ZIL block. * * As itxs are generated, they're inserted into the ZIL's queue of * uncommitted itxs. The semantics of zil_commit() are such that it will * block until all itxs that were in the queue when it was called, are * committed to stable storage. * * If "foid" is zero, this means all "synchronous" and "asynchronous" * itxs, for all objects in the dataset, will be committed to stable * storage prior to zil_commit() returning. If "foid" is non-zero, all * "synchronous" itxs for all objects, but only "asynchronous" itxs * that correspond to the foid passed in, will be committed to stable * storage prior to zil_commit() returning. * * Generally speaking, when zil_commit() is called, the consumer doesn't * actually care about _all_ of the uncommitted itxs. Instead, they're * simply trying to waiting for a specific itx to be committed to disk, * but the interface(s) for interacting with the ZIL don't allow such * fine-grained communication. 
A better interface would allow a consumer * to create and assign an itx, and then pass a reference to this itx to * zil_commit(); such that zil_commit() would return as soon as that * specific itx was committed to disk (instead of waiting for _all_ * itxs to be committed). * * When a thread calls zil_commit() a special "commit itx" will be * generated, along with a corresponding "waiter" for this commit itx. * zil_commit() will wait on this waiter's CV, such that when the waiter * is marked done, and signaled, zil_commit() will return. * * This commit itx is inserted into the queue of uncommitted itxs. This * provides an easy mechanism for determining which itxs were in the * queue prior to zil_commit() having been called, and which itxs were * added after zil_commit() was called. * * The commit itx is special; it doesn't have any on-disk representation. * When a commit itx is "committed" to an lwb, the waiter associated * with it is linked onto the lwb's list of waiters. Then, when that lwb * completes, each waiter on the lwb's list is marked done and signaled * -- allowing the thread waiting on the waiter to return from zil_commit(). * * It's important to point out a few critical factors that allow us * to make use of the commit itxs, commit waiters, per-lwb lists of * commit waiters, and zio completion callbacks like we're doing: * * 1. The list of waiters for each lwb is traversed, and each commit * waiter is marked "done" and signaled, in the zio completion * callback of the lwb's zio[*]. * * * Actually, the waiters are signaled in the zio completion * callback of the root zio for the DKIOCFLUSHWRITECACHE commands * that are sent to the vdevs upon completion of the lwb zio. * * 2. When the itxs are inserted into the ZIL's queue of uncommitted * itxs, the order in which they are inserted is preserved[*]; as * itxs are added to the queue, they are added to the tail of * in-memory linked lists. * * When committing the itxs to lwbs (to be written to disk), they * are committed in the same order in which the itxs were added to * the uncommitted queue's linked list(s); i.e. the linked list of * itxs to commit is traversed from head to tail, and each itx is * committed to an lwb in that order. * * * To clarify: * * - the order of "sync" itxs is preserved w.r.t. other * "sync" itxs, regardless of the corresponding objects. * - the order of "async" itxs is preserved w.r.t. other * "async" itxs corresponding to the same object. * - the order of "async" itxs is *not* preserved w.r.t. other * "async" itxs corresponding to different objects. * - the order of "sync" itxs w.r.t. "async" itxs (or vice * versa) is *not* preserved, even for itxs that correspond * to the same object. * * For more details, see: zil_itx_assign(), zil_async_to_sync(), * zil_get_commit_list(), and zil_process_commit_list(). * * 3. The lwbs represent a linked list of blocks on disk. Thus, any * lwb cannot be considered committed to stable storage, until its * "previous" lwb is also committed to stable storage. This fact, * coupled with the fact described above, means that itxs are * committed in (roughly) the order in which they were generated. * This is essential because itxs are dependent on prior itxs. * Thus, we *must not* deem an itx as being committed to stable * storage, until *all* prior itxs have also been committed to * stable storage. * * To enforce this ordering of lwb zio's, while still leveraging as * much of the underlying storage performance as possible, we rely * on two fundamental concepts: * * 1. 
The creation and issuance of lwb zio's is protected by * the zilog's "zl_issuer_lock", which ensures only a single * thread is creating and/or issuing lwb's at a time * 2. The "previous" lwb is a child of the "current" lwb * (leveraging the zio parent-child dependency graph) * * By relying on this parent-child zio relationship, we can have * many lwb zio's concurrently issued to the underlying storage, * but the order in which they complete will be the same order in * which they were created. */ void zil_commit(zilog_t *zilog, uint64_t foid) { /* * We should never attempt to call zil_commit on a snapshot for * a couple of reasons: * * 1. A snapshot may never be modified, thus it cannot have any * in-flight itxs that would have modified the dataset. * * 2. By design, when zil_commit() is called, a commit itx will * be assigned to this zilog; as a result, the zilog will be * dirtied. We must not dirty the zilog of a snapshot; there's * checks in the code that enforce this invariant, and will * cause a panic if it's not upheld. */ ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE); if (zilog->zl_sync == ZFS_SYNC_DISABLED) return; if (!spa_writeable(zilog->zl_spa)) { /* * If the SPA is not writable, there should never be any * pending itxs waiting to be committed to disk. If that * weren't true, we'd skip writing those itxs out, and * would break the semantics of zil_commit(); thus, we're * verifying that truth before we return to the caller. */ ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); for (int i = 0; i < TXG_SIZE; i++) ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL); return; } /* * If the ZIL is suspended, we don't want to dirty it by calling * zil_commit_itx_assign() below, nor can we write out * lwbs like would be done in zil_commit_write(). Thus, we * simply rely on txg_wait_synced() to maintain the necessary * semantics, and avoid calling those functions altogether. */ if (zilog->zl_suspend > 0) { txg_wait_synced(zilog->zl_dmu_pool, 0); return; } zil_commit_impl(zilog, foid); } void zil_commit_impl(zilog_t *zilog, uint64_t foid) { ZIL_STAT_BUMP(zilog, zil_commit_count); /* * Move the "async" itxs for the specified foid to the "sync" * queues, such that they will be later committed (or skipped) * to an lwb when zil_process_commit_list() is called. * * Since these "async" itxs must be committed prior to this * call to zil_commit returning, we must perform this operation * before we call zil_commit_itx_assign(). */ zil_async_to_sync(zilog, foid); /* * We allocate a new "waiter" structure which will initially be * linked to the commit itx using the itx's "itx_private" field. * Since the commit itx doesn't represent any on-disk state, * when it's committed to an lwb, rather than copying the its * lr_t into the lwb's buffer, the commit itx's "waiter" will be * added to the lwb's list of waiters. Then, when the lwb is * committed to stable storage, each waiter in the lwb's list of * waiters will be marked "done", and signalled. * * We must create the waiter and assign the commit itx prior to * calling zil_commit_writer(), or else our specific commit itx * is not guaranteed to be committed to an lwb prior to calling * zil_commit_waiter(). 
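 *
 * Schematically, a consumer pairs the two halves like this (a hedged
 * sketch only, assuming the ZIL headers are in scope; ex_log_write()
 * and ex_fsync_path() are hypothetical wrappers and the lr_write_t
 * payload setup is elided):
 */

static void
ex_log_write(zilog_t *zilog, dmu_tx_t *tx)
{
	itx_t *itx = zil_itx_create(TX_WRITE, sizeof (lr_write_t));

	itx->itx_wr_state = WR_NEED_COPY;
	/* ... fill in the lr_write_t fields (object, offset, length) ... */
	zil_itx_assign(zilog, itx, tx);	/* queue it under the open tx */
}

static void
ex_fsync_path(zilog_t *zilog, uint64_t foid)
{
	/* Blocks until the object's queued itxs are on stable storage. */
	zil_commit(zilog, foid);
}

/*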
*/ zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); zil_commit_itx_assign(zilog, zcw); uint64_t wtxg = zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); if (zcw->zcw_zio_error != 0) { /* * If there was an error writing out the ZIL blocks that * this thread is waiting on, then we fallback to * relying on spa_sync() to write out the data this * thread is waiting on. Obviously this has performance * implications, but the expectation is for this to be * an exceptional case, and shouldn't occur often. */ DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); txg_wait_synced(zilog->zl_dmu_pool, 0); } else if (wtxg != 0) { txg_wait_synced(zilog->zl_dmu_pool, wtxg); } zil_free_commit_waiter(zcw); } /* * Called in syncing context to free committed log blocks and update log header. */ void zil_sync(zilog_t *zilog, dmu_tx_t *tx) { zil_header_t *zh = zil_header_in_syncing_context(zilog); uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = zilog->zl_spa; uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; lwb_t *lwb; /* * We don't zero out zl_destroy_txg, so make sure we don't try * to destroy it twice. */ if (spa_sync_pass(spa) != 1) return; zil_lwb_flush_wait_all(zilog, txg); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); if (*replayed_seq != 0) { ASSERT(zh->zh_replay_seq < *replayed_seq); zh->zh_replay_seq = *replayed_seq; *replayed_seq = 0; } if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(list_is_empty(&zilog->zl_lwb_list)); memset(zh, 0, sizeof (zil_header_t)); memset(zilog->zl_replayed_seq, 0, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* * If this block was part of log chain that couldn't * be claimed because a device was missing during * zil_claim(), but that device later returns, * then this block could erroneously appear valid. * To guard against this, assign a new GUID to the new * log chain so it doesn't matter what blk points to. */ zil_init_log_chain(zilog, &blk); zh->zh_log = blk; } else { /* * A destroyed ZIL chain can't contain any TX_SETSAXATTR * records. So, deactivate the feature for this dataset. * We activate it again when we start a new ZIL chain. */ if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) dsl_dataset_deactivate_feature(ds, SPA_FEATURE_ZILSAXATTR, tx); } } while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_state != LWB_STATE_FLUSH_DONE || lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); if (!BP_IS_HOLE(&lwb->lwb_blk)) zio_free(spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); /* * If we don't have anything left in the lwb list then * we've had an allocation failure and we need to zero * out the zil_header blkptr so that we don't end * up freeing the same block twice. 
*/ if (list_is_empty(&zilog->zl_lwb_list)) BP_ZERO(&zh->zh_log); } mutex_exit(&zilog->zl_lock); } static int zil_lwb_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; lwb_t *lwb = vbuf; list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } static void zil_lwb_dest(void *vbuf, void *unused) { (void) unused; lwb_t *lwb = vbuf; mutex_destroy(&lwb->lwb_vdev_lock); avl_destroy(&lwb->lwb_vdev_tree); list_destroy(&lwb->lwb_waiters); list_destroy(&lwb->lwb_itxs); } void zil_init(void) { zil_lwb_cache = kmem_cache_create("zil_lwb_cache", sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); zil_zcw_cache = kmem_cache_create("zil_zcw_cache", sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zil_sums_init(&zil_sums_global); zil_kstats_global = kstat_create("zfs", 0, "zil", "misc", KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zil_kstats_global != NULL) { zil_kstats_global->ks_data = &zil_stats; zil_kstats_global->ks_update = zil_kstats_global_update; zil_kstats_global->ks_private = NULL; kstat_install(zil_kstats_global); } } void zil_fini(void) { kmem_cache_destroy(zil_zcw_cache); kmem_cache_destroy(zil_lwb_cache); if (zil_kstats_global != NULL) { kstat_delete(zil_kstats_global); zil_kstats_global = NULL; } zil_sums_fini(&zil_sums_global); } void zil_set_sync(zilog_t *zilog, uint64_t sync) { zilog->zl_sync = sync; } void zil_set_logbias(zilog_t *zilog, uint64_t logbias) { zilog->zl_logbias = logbias; } zilog_t * zil_alloc(objset_t *os, zil_header_t *zh_phys) { zilog_t *zilog; zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); zilog->zl_header = zh_phys; zilog->zl_os = os; zilog->zl_spa = dmu_objset_spa(os); zilog->zl_dmu_pool = dmu_objset_pool(os); zilog->zl_destroy_txg = TXG_INITIAL - 1; zilog->zl_logbias = dmu_objset_logbias(os); zilog->zl_sync = dmu_objset_syncprop(os); zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; zilog->zl_max_block_size = zil_maxblocksize; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL); for (int i = 0; i < TXG_SIZE; i++) { mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, MUTEX_DEFAULT, NULL); } list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), offsetof(itx_t, itx_node)); cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); return (zilog); } void zil_free(zilog_t *zilog) { int i; zilog->zl_stop_sync = 1; ASSERT0(zilog->zl_suspend); ASSERT0(zilog->zl_suspending); ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); list_destroy(&zilog->zl_itx_commit_list); for (i = 0; i < TXG_SIZE; i++) { /* * It's possible for an itx to be generated that doesn't dirty * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() * callback to remove the entry. We remove those here. * * Also free up the ziltest itxs. 
*/ if (zilog->zl_itxg[i].itxg_itxs) zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); mutex_destroy(&zilog->zl_itxg[i].itxg_lock); } mutex_destroy(&zilog->zl_issuer_lock); mutex_destroy(&zilog->zl_lock); mutex_destroy(&zilog->zl_lwb_io_lock); cv_destroy(&zilog->zl_cv_suspend); cv_destroy(&zilog->zl_lwb_io_cv); kmem_free(zilog, sizeof (zilog_t)); } /* * Open an intent log. */ zilog_t * zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums) { zilog_t *zilog = dmu_objset_zil(os); ASSERT3P(zilog->zl_get_data, ==, NULL); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; zilog->zl_sums = zil_sums; return (zilog); } /* * Close an intent log. */ void zil_close(zilog_t *zilog) { lwb_t *lwb; uint64_t txg; if (!dmu_objset_is_snapshot(zilog->zl_os)) { zil_commit(zilog, 0); } else { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT0(zilog->zl_dirty_max_txg); ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); } mutex_enter(&zilog->zl_lock); txg = zilog->zl_dirty_max_txg; lwb = list_tail(&zilog->zl_lwb_list); if (lwb != NULL) { txg = MAX(txg, lwb->lwb_alloc_txg); txg = MAX(txg, lwb->lwb_max_txg); } mutex_exit(&zilog->zl_lock); /* * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * on the time when the dmu_tx transaction is assigned in * zil_lwb_write_issue(). */ mutex_enter(&zilog->zl_lwb_io_lock); txg = MAX(zilog->zl_lwb_max_issued_txg, txg); mutex_exit(&zilog->zl_lwb_io_lock); /* * We need to use txg_wait_synced() to wait until that txg is synced. * zil_sync() will guarantee all lwbs up to that txg have been * written out, flushed, and cleaned. */ if (txg != 0) txg_wait_synced(zilog->zl_dmu_pool, txg); if (zilog_is_dirty(zilog)) zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, (u_longlong_t)txg); if (txg < spa_freeze_txg(zilog->zl_spa)) VERIFY(!zilog_is_dirty(zilog)); zilog->zl_get_data = NULL; /* * We should have only one lwb left on the list; remove it now. */ mutex_enter(&zilog->zl_lock); lwb = list_remove_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); } static const char *suspend_tag = "zil suspending"; /* * Suspend an intent log. While in suspended mode, we still honor * synchronous semantics, but we rely on txg_wait_synced() to do it. * On old version pools, we suspend the log briefly when taking a * snapshot so that it will have an empty intent log. * * Long holds are not really intended to be used the way we do here -- * held for such a short time. A concurrent caller of dsl_dataset_long_held() * could fail. Therefore we take pains to only put a long hold if it is * actually necessary. Fortunately, it will only be necessary if the * objset is currently mounted (or the ZVOL equivalent). In that case it * will already have a long hold, so we are not really making things any worse. * * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or * zvol_state_t), and use their mechanism to prevent their hold from being * dropped (e.g. VFS_HOLD()). However, that would be even more pain for * very little gain. * * if cookiep == NULL, this does both the suspend & resume. * Otherwise, it returns with the dataset "long held", and the cookie * should be passed into zil_resume(). 
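 *
 * Usage of the cookie form described above, as a sketch (illustrative
 * only; error handling is trimmed and ex_offline_work() is a
 * hypothetical placeholder for whatever needs the log quiesced):
 */

static void ex_offline_work(void);

static int
ex_with_suspended_zil(const char *osname)
{
	void *cookie;
	int error;

	/* Suspend; the dataset stays "long held" through the cookie. */
	error = zil_suspend(osname, &cookie);
	if (error != 0)
		return (error);

	ex_offline_work();	/* the intent log is empty and stays empty */

	zil_resume(cookie);	/* drops the hold taken by zil_suspend() */
	return (0);
}

/*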
*/ int zil_suspend(const char *osname, void **cookiep) { objset_t *os; zilog_t *zilog; const zil_header_t *zh; int error; error = dmu_objset_hold(osname, suspend_tag, &os); if (error != 0) return (error); zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } /* * Don't put a long hold in the cases where we can avoid it. This * is when there is no cookie so we are doing a suspend & resume * (i.e. called from zil_vdev_offline()), and there's nothing to do * for the suspend because it's already suspended, or there's no ZIL. */ if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (0); } dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; if (zilog->zl_suspend > 1) { /* * Someone else is already suspending it. * Just wait for them to finish. */ while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } /* * If there is no pointer to an on-disk block, this ZIL must not * be active (e.g. filesystem not mounted), so there's nothing * to clean up. */ if (BP_IS_HOLE(&zh->zh_log)) { ASSERT(cookiep != NULL); /* fast path already handled */ *cookiep = os; mutex_exit(&zilog->zl_lock); return (0); } /* * The ZIL has work to do. Ensure that the associated encryption * key will remain mapped while we are committing the log by * grabbing a reference to it. If the key isn't loaded we have no * choice but to return an error until the wrapping key is loaded. */ if (os->os_encrypted && dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) { zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); return (SET_ERROR(EACCES)); } zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); /* * We need to use zil_commit_impl to ensure we wait for all * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed * to disk before proceeding. If we used zil_commit instead, it * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be * LWB_STATE_FLUSH_DONE before returning. */ zil_commit_impl(zilog, 0); /* * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we * use txg_wait_synced() to ensure the data from the zilog has * migrated to the main pool before calling zil_destroy(). 
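 *
 * In outline (for illustration), the quiesce sequence around this point
 * is:
 *
 *	zil_commit_impl(zilog, 0);		// drain in-flight lwbs
 *	txg_wait_synced(zilog->zl_dmu_pool, 0);	// data reaches main pool
 *	zil_destroy(zilog, B_FALSE);		// free the now-empty log chain
 *
 * after which zl_suspending is cleared and any waiters are broadcast.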
*/ txg_wait_synced(zilog->zl_dmu_pool, 0); zil_destroy(zilog, B_FALSE); mutex_enter(&zilog->zl_lock); zilog->zl_suspending = B_FALSE; cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); if (os->os_encrypted) dsl_dataset_remove_key_mapping(dmu_objset_ds(os)); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } void zil_resume(void *cookie) { objset_t *os = cookie; zilog_t *zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_suspend != 0); zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); } typedef struct zil_replay_arg { zil_replay_func_t *const *zr_replay; void *zr_arg; boolean_t zr_byteswap; char *zr_lr; } zil_replay_arg_t; static int zil_replay_error(zilog_t *zilog, const lr_t *lr, int error) { char name[ZFS_MAX_DATASET_NAME_LEN]; zilog->zl_replaying_seq--; /* didn't actually replay this one */ dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)(lr->lrc_txtype & ~TX_CI), (lr->lrc_txtype & TX_CI) ? "CI" : ""); return (error); } static int zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra, uint64_t claim_txg) { zil_replay_arg_t *zr = zra; const zil_header_t *zh = zilog->zl_header; uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; int error = 0; zilog->zl_replaying_seq = lr->lrc_seq; if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ return (0); if (lr->lrc_txg < claim_txg) /* already committed */ return (0); /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; if (txtype == 0 || txtype >= TX_MAX_TYPE) return (zil_replay_error(zilog, lr, EINVAL)); /* * If this record type can be logged out of order, the object * (lr_foid) may no longer exist. That's legitimate, not an error. */ if (TX_OOO(txtype)) { error = dmu_object_info(zilog->zl_os, LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); if (error == ENOENT || error == EEXIST) return (0); } /* * Make a copy of the data so we can revise and extend it. */ memcpy(zr->zr_lr, lr, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. */ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { error = zil_read_log_data(zilog, (lr_write_t *)lr, zr->zr_lr + reclen); if (error != 0) return (zil_replay_error(zilog, lr, error)); } /* * The log block containing this lr may have been byteswapped * so that we can easily examine common fields like lrc_txtype. * However, the log is a mix of different record types, and only the * replay vectors know how to byteswap their records. Therefore, if * the lr was byteswapped, undo it before invoking the replay vector. */ if (zr->zr_byteswap) byteswap_uint64_array(zr->zr_lr, reclen); /* * We must now do two things atomically: replay this log record, * and update the log header sequence number to reflect the fact that * we did so. At the end of each replay function the sequence number * is updated if we are in replay mode. */ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); if (error != 0) { /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with * EEXIST. So if we receive any error we try syncing out * any removes then retry the transaction. Note that we * specify B_FALSE for byteswap now, so we don't do it twice. 
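 *
 * For context, an illustrative sketch (names prefixed my_ are made up,
 * not this file's code) of how replay vectors are supplied, indexed by
 * txtype:
 *
 *	zil_replay_func_t *const my_replay[TX_MAX_TYPE] = {
 *		[TX_CREATE] = my_replay_create,
 *		[TX_WRITE] = my_replay_write,
 *		// remaining txtypes as needed
 *	};
 *	zil_replay(os, my_arg, my_replay);
 *
 * Each vector handles any byteswapping of fields only it understands,
 * which is why the retry below passes B_FALSE rather than
 * zr->zr_byteswap.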
*/ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); if (error != 0) return (zil_replay_error(zilog, lr, error)); } return (0); } static int zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { (void) bp, (void) arg, (void) claim_txg; zilog->zl_replay_blks++; return (0); } /* * If this dataset has a non-empty intent log, replay it and destroy it. * Return B_TRUE if there were any entries to replay. */ boolean_t zil_replay(objset_t *os, void *arg, zil_replay_func_t *const replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { return (zil_destroy(zilog, B_TRUE)); } zr.zr_replay = replay_func; zr.zr_arg = arg; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); /* * Wait for in-progress removes to sync before starting replay. */ txg_wait_synced(zilog->zl_dmu_pool, 0); zilog->zl_replay = B_TRUE; zilog->zl_replay_time = ddi_get_lbolt(); ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, zh->zh_claim_txg, B_TRUE); vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_replay = B_FALSE; return (B_TRUE); } boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx) { if (zilog->zl_sync == ZFS_SYNC_DISABLED) return (B_TRUE); if (zilog->zl_replay) { dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = zilog->zl_replaying_seq; return (B_TRUE); } return (B_FALSE); } int zil_reset(const char *osname, void *arg) { (void) arg; int error = zil_suspend(osname, NULL); /* EACCES means crypto key not loaded */ if ((error == EACCES) || (error == EBUSY)) return (SET_ERROR(error)); if (error != 0) return (SET_ERROR(EEXIST)); return (0); } EXPORT_SYMBOL(zil_alloc); EXPORT_SYMBOL(zil_free); EXPORT_SYMBOL(zil_open); EXPORT_SYMBOL(zil_close); EXPORT_SYMBOL(zil_replay); EXPORT_SYMBOL(zil_replaying); EXPORT_SYMBOL(zil_destroy); EXPORT_SYMBOL(zil_destroy_sync); EXPORT_SYMBOL(zil_itx_create); EXPORT_SYMBOL(zil_itx_destroy); EXPORT_SYMBOL(zil_itx_assign); EXPORT_SYMBOL(zil_commit); EXPORT_SYMBOL(zil_claim); EXPORT_SYMBOL(zil_check_log_chain); EXPORT_SYMBOL(zil_sync); EXPORT_SYMBOL(zil_clean); EXPORT_SYMBOL(zil_suspend); EXPORT_SYMBOL(zil_resume); EXPORT_SYMBOL(zil_lwb_add_block); EXPORT_SYMBOL(zil_bp_tree_add); EXPORT_SYMBOL(zil_set_sync); EXPORT_SYMBOL(zil_set_logbias); EXPORT_SYMBOL(zil_sums_init); EXPORT_SYMBOL(zil_sums_fini); EXPORT_SYMBOL(zil_kstat_values_update); ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, "ZIL block open timeout percentage"); ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW, "Minimum delay we care for ZIL block commit"); ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, "Disable intent logging replay"); ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW, "Disable ZIL cache flushes"); ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW, "Limit in bytes slog sync writes per commit"); ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, "Limit in bytes of ZIL log block size"); diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh index 
0a4d24fa695a..cfa26f54b11f 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh @@ -1,269 +1,269 @@ #!/bin/ksh -p # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright (c) 2017 by Delphix. All rights reserved. # Copyright (c) 2020 by Lawrence Livermore National Security LLC. . $STF_SUITE/include/libtest.shlib # # DESCRIPTION: # Create a pool and populate it with files of various # recordsizes # # STRATEGY: # 1. Create pool # 2. Populate it # 3. Run zdb -Pbbb on pool # 4. Verify variance on blocksizes # function cleanup { datasetexists $TESTPOOL && destroy_pool $TESTPOOL } SPA_MAXBLOCKSHIFT=24 function histo_populate_test_pool { if [ $# -ne 1 ]; then log_note "histo_populate_test_pool: insufficient parameters" log_fail "hptp: 1 requested $# received" fi typeset pool=$1 set -A recordsizes typeset -i min_rsbits=9 #512 typeset -i max_rsbits=SPA_MAXBLOCKSHIFT #16 MiB typeset -i sum_filesizes=0 re_number='^[0-9]+$' let histo_pool_size=$(get_pool_prop size ${pool}) if [[ ! ${histo_pool_size} =~ ${re_number} ]]; then log_fail "histo_pool_size is not numeric ${pool_size}" fi let max_pool_record_size=$(get_prop recordsize ${pool}) if [[ ! ${max_pool_record_size} =~ ${re_number} ]]; then log_fail "hptp: max_pool_record_size is not numeric ${max_pool_record_size}" fi sum_filesizes=$(echo "2^21"|bc) ((min_pool_size=12*sum_filesizes)) if [ ${histo_pool_size} -lt ${min_pool_size} ]; then log_note "hptp: Your pool size ${histo_pool_size}" log_fail "hptp: is less than minimum ${min_pool_size}" fi this_ri=min_rsbits file_num=0 total_count=0 ################### # generate 10% + 20% + 30% + 31% = 91% of the filespace # attempting to use 100% will lead to no space left on device # Heuristic testing showed that 91% was the practical upper # bound on the default 4G zpool (mirrored) that is used in # testing. # # In order to expedite testing, we will only fill 2G (of 4G) # of the test pool. You may want to modify this for # standalone testing. # # In filling only 50% of the pool, we create one object on # each "pass" below to achieve multiple objects per record # size. Creating one file per object would lead to # excessive file creation time. ################### # for pass in 10 20 30 31 # 91% for pass in 20 20 10 # 50% do ((thiscount=(((histo_pool_size*pass)/100)/sum_filesizes))) ((total_count+=thiscount)) for rb in $(seq ${min_rsbits} ${max_rsbits}) do this_rs=$(echo "2^${rb}" | bc) if [ ${this_rs} -gt ${max_pool_record_size} ]; then continue fi if [ ! -d /${pool}/B_${this_rs} ]; then zfs create ${pool}/B_${this_rs} zfs set recordsize=${this_rs} \ ${pool}/B_${this_rs} fi #################### # Create the files in the devices and datasets # of the right size. 
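#	(For example, with illustrative values: the 8 KiB pass ends up
#	running roughly "dd if=/dev/urandom of=/${pool}/B_8192/file_1
#	bs=8192 count=${thiscount}" against a dataset created with
#	recordsize=8192.)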
The files are filled # with random data to defeat the compression # # Note that the dd output is suppressed unless # there are errors #################### dd if=/dev/urandom \ of=/${pool}/B_${this_rs}/file_${filenum} \ bs=${this_rs} count=${thiscount} \ iflag=fullblock 2>&1 | \ grep -ve "records in" -e "records out" -e "bytes.*copied" ((filenum+=1)) done done #################### # Testing showed that on some devices, unless the pool is # synchronized, that the block counts will be below the # anticipated sizes since not all of the blocks will be flushed # to the device. This 'sync' command prevents that from # happening. #################### sync_pool ${pool} } function histo_check_test_pool { if [ $# -ne 1 ]; then log_note "histo_check_test_pool: insufficient parameters" log_fail "hctp: 1 requested $# received" fi typeset pool=$1 set -A recordsizes set -A recordcounts typeset -i rb typeset -i min_rsbits=9 #512 typeset -i max_rsbits=SPA_MAXBLOCKSHIFT+1 typeset -i this_rs typeset -i this_ri typeset -i sum_filesizes=0 let histo_check_pool_size=$(get_pool_prop size ${pool}) if [[ ! ${histo_check_pool_size} =~ ${re_number} ]]; then log_fail "histo_check_pool_size is not numeric ${histo_check_pool_size}" fi let max_pool_record_size=$(get_prop recordsize ${pool}) if [[ ! ${max_pool_record_size} =~ ${re_number} ]]; then log_fail "hctp: max_pool_record_size is not numeric ${max_pool_record_size}" fi stripped="${TEST_BASE_DIR}/${pool}_stripped.txt" zdb -Pbbb ${pool} | \ sed -e '1,/^block[ ][ ]*psize[ ][ ]*lsize.*$/d' \ -e '/^size[ ]*Count/d' -e '/^$/,$d' \ > ${stripped} sum_filesizes=$(echo "2^21"|bc) ################### # generate 10% + 20% + 30% + 31% = 91% of the filespace # attempting to use 100% will lead to no space left on device # attempting to use 100% will lead to no space left on device # Heuristic testing showed that 91% was the practical upper # bound on the default 4G zpool (mirrored) that is used in # testing. # # In order to expedite testing, we will only fill 2G (of 4G) # of the test pool. You may want to modify this for # standalone testing. # # In filling only 50% of the pool, we create one object on # each "pass" below to achieve multiple objects per record # size. Creating one file per object would lead to # excessive file creation time. ################### # for pass in 10 20 30 31 # 91% for pass in 20 20 10 # 50% do ((thiscount=(((histo_check_pool_size*pass)/100)/sum_filesizes))) for rb in $(seq ${min_rsbits} ${max_rsbits}) do blksize=$(echo "2^$rb"|bc) if [ $blksize -le $max_pool_record_size ]; then ((recordcounts[$blksize]+=thiscount)) fi done done ################### # compare the above computed counts for blocks against # lsize count. Since some devices have a minimum hardware # blocksize > 512, we cannot compare against the asize count. # E.G., if the HWBlocksize = 4096, then the asize counts for # 512, 1024 and 2048 will be zero and rolled up into the # 4096 blocksize count for asize. For verification we stick # to just lsize counts. # - # The max_variance is hard-coded here at 12% to leave us some - # margin. Testing has shown this normally to be in the range - # of 2%-8%, but it may be as large as 11%. + # Variances are expected since this test does not account for + # metadata. The hardcoded limit here is empirical and should + # not be construed as deterministic. 
################### - let max_variance=12 + let max_variance=15 let fail_value=0 let error_count=0 log_note "Comparisons for ${pool}" log_note "Bsize is the blocksize, Count is predicted value" log_note "Bsize\tCount\tpsize\tlsize\tasize" while read -r blksize pc pl pm lc ll lm ac al am do if [ $blksize -gt $max_pool_record_size ]; then continue fi log_note \ "$blksize\t${recordcounts[${blksize}]}\t$pc\t$lc\t$ac" ################### # get the computed record count and compute the # difference percentage in integer arithmetic ################### rc=${recordcounts[${blksize}]} ((rclc=(rc-lc)<0?lc-rc:rc-lc)) # absolute value ((dp=(rclc*100)/rc)) ################### # Check against the allowed variance ################### if [ $dp -gt ${max_variance} ]; then log_note \ "Expected variance < ${max_variance}% observed ${dp}%" if [ ${dp} -gt ${fail_value} ]; then fail_value=${dp} ((error_count++)) fi fi done < ${stripped} rm "${stripped}" if [ ${fail_value} -gt 0 ]; then if [ ${error_count} -eq 1 ]; then log_note "hctp: There was ${error_count} error" else log_note "hctp: There were a total of ${error_count} errors" fi log_fail \ "hctp: Max variance of ${max_variance}% exceeded, saw ${fail_value}%" fi } log_assert "Verify zdb -Pbbb (block histogram) works as expected" log_onexit cleanup verify_runnable "global" verify_disk_count "$DISKS" 2 default_mirror_setup_noexit $DISKS histo_populate_test_pool $TESTPOOL histo_check_test_pool $TESTPOOL log_pass "Histogram for zdb"
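# (Worked example of the variance check above, with illustrative numbers:
#  a predicted count rc=1000 and an observed lsize count lc=920 give
#  rclc=80 and dp=(80*100)/1000=8, which is within the 15% limit.)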