Index: head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c	(revision 329627)
+++ head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c	(revision 329628)
@@ -1,3307 +1,3307 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/debug.h>
#include <sys/nvpair.h>
#include <sys/nvpair_impl.h>
#include <rpc/types.h>
#include <rpc/xdr.h>

#if defined(_KERNEL) && !defined(_BOOT)
#include <sys/varargs.h>
#include <sys/sunddi.h>
#else
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#endif

#ifndef	offsetof
#define	offsetof(s, m)		((size_t)(&(((s *)0)->m)))
#endif
#define	skip_whitespace(p)	while ((*(p) == ' ') || (*(p) == '\t')) p++

#if defined(__FreeBSD__) && !defined(_KERNEL)
/*
 * libnvpair is the lowest common denominator for ZFS related libraries,
 * defining aok here makes it usable by all ZFS related libraries
 */
int aok;
#endif

/*
 * nvpair.c - Provides kernel & userland interfaces for manipulating
 * name-value pairs.
 *
 * Overview Diagram
 *
 *  +--------------+
 *  |  nvlist_t    |
 *  |--------------|
 *  | nvl_version  |
 *  | nvl_nvflag   |
 *  | nvl_priv    -+-+
 *  | nvl_flag     | |
 *  | nvl_pad      | |
 *  +--------------+ |
 *                   V
 *      +--------------+      last i_nvp in list
 *      | nvpriv_t     |  +--------------------->
 *      |--------------|  |
 *   +--+- nvp_list    |  |   +------------+
 *   |  |  nvp_last   -+--+   + nv_alloc_t |
 *   |  |  nvp_curr    |      |------------|
 *   |  |  nvp_nva    -+----> | nva_ops    |
 *   |  |  nvp_stat    |      | nva_arg    |
 *   |  +--------------+      +------------+
 *   |
 *   +-------+
 *           V
 *   +---------------------+      +-------------------+
 *   |  i_nvp_t            |  +-->|  i_nvp_t          |  +-->
 *   |---------------------|  |   |-------------------|  |
 *   | nvi_next           -+--+   | nvi_next         -+--+
 *   | nvi_prev (NULL)     | <----+ nvi_prev          |
 *   | . . . . . . . . . . |      | . . . . . . . . . |
 *   | nvp (nvpair_t)      |      | nvp (nvpair_t)    |
 *   |  - nvp_size         |      |  - nvp_size       |
 *   |  - nvp_name_sz      |      |  - nvp_name_sz    |
 *   |  - nvp_value_elem   |      |  - nvp_value_elem |
 *   |  - nvp_type         |      |  - nvp_type       |
 *   |  - data ...         |      |  - data ...       |
 *   +---------------------+      +-------------------+
 *
 *
 *
 *   +---------------------+              +---------------------+
 *   |  i_nvp_t            |  +-->  +-->  |  i_nvp_t (last)     |
 *   |---------------------|  |     |     |---------------------|
 *   |  nvi_next          -+--+ ... --+   | nvi_next (NULL)     |
 * <-+- nvi_prev           |<-- ...  <----+ nvi_prev            |
 *   | . . . . . . . . .   |              | . . . . . . . . .   |
 *   | nvp (nvpair_t)      |              | nvp (nvpair_t)      |
 *   |  - nvp_size         |              |  - nvp_size         |
 *   |  - nvp_name_sz      |              |  - nvp_name_sz      |
 *   |  - nvp_value_elem   |              |  - nvp_value_elem   |
 *   |  - DATA_TYPE_NVLIST |              |  - nvp_type         |
 *   |  - data (embedded)  |              |  - data ...         |
 *   |  nvlist name        |              +---------------------+
 *   |  +--------------+   |
 *   |  |  nvlist_t    |   |
 *   |  |--------------|   |
 *   |  | nvl_version  |   |
 *   |  | nvl_nvflag   |   |
 *   |  | nvl_priv   --+---+---->
 *   |  | nvl_flag     |   |
 *   |  | nvl_pad      |   |
 *   |  +--------------+   |
 *   +---------------------+
 *
 *
 * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will
 * allow value to be aligned on 8 byte boundary
 *
 * name_len is the length of the name string including the null terminator
 * so it must be >= 1
 */
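/*
 * Editor's illustrative sketch (not part of the original source, compiled
 * out): walking the i_nvp_t chain pictured above through the public
 * iterator.  Assumes a userland build against libnvpair.
 */
#if 0
#include <stdio.h>
#include <libnvpair.h>

static void
dump_names(nvlist_t *nvl)
{
	nvpair_t *nvp = NULL;

	/* each call follows one nvi_next link from the diagram above */
	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL)
		(void) printf("%s (type %d)\n", nvpair_name(nvp),
		    (int)nvpair_type(nvp));
}
#endif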
#define	NVP_SIZE_CALC(name_len, data_len) \
	(NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len))

static int i_get_value_size(data_type_t type, const void *data, uint_t nelem);
static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
    uint_t nelem, const void *data);

#define	NV_STAT_EMBEDDED	0x1
#define	EMBEDDED_NVL(nvp)	((nvlist_t *)(void *)NVP_VALUE(nvp))
#define	EMBEDDED_NVL_ARRAY(nvp)	((nvlist_t **)(void *)NVP_VALUE(nvp))

#define	NVP_VALOFF(nvp)	(NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz))

#define	NVPAIR2I_NVP(nvp) \
	((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp)))

int
nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...)
{
	va_list valist;
	int err = 0;

	nva->nva_ops = nvo;
	nva->nva_arg = NULL;
	va_start(valist, nvo);
	if (nva->nva_ops->nv_ao_init != NULL)
		err = nva->nva_ops->nv_ao_init(nva, valist);
	va_end(valist);

	return (err);
}

void
nv_alloc_reset(nv_alloc_t *nva)
{
	if (nva->nva_ops->nv_ao_reset != NULL)
		nva->nva_ops->nv_ao_reset(nva);
}

void
nv_alloc_fini(nv_alloc_t *nva)
{
	if (nva->nva_ops->nv_ao_fini != NULL)
		nva->nva_ops->nv_ao_fini(nva);
}

nv_alloc_t *
nvlist_lookup_nv_alloc(nvlist_t *nvl)
{
	nvpriv_t *priv;

	if (nvl == NULL ||
	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
		return (NULL);

	return (priv->nvp_nva);
}

static void *
nv_mem_zalloc(nvpriv_t *nvp, size_t size)
{
	nv_alloc_t *nva = nvp->nvp_nva;
	void *buf;

	if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL)
		bzero(buf, size);

	return (buf);
}

static void
nv_mem_free(nvpriv_t *nvp, void *buf, size_t size)
{
	nv_alloc_t *nva = nvp->nvp_nva;

	nva->nva_ops->nv_ao_free(nva, buf, size);
}

static void
nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat)
{
	bzero(priv, sizeof (nvpriv_t));

	priv->nvp_nva = nva;
	priv->nvp_stat = stat;
}

static nvpriv_t *
nv_priv_alloc(nv_alloc_t *nva)
{
	nvpriv_t *priv;

	/*
	 * nv_mem_alloc() cannot be called here because it needs the priv
	 * argument.
	 */
	if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL)
		return (NULL);

	nv_priv_init(priv, nva, 0);

	return (priv);
}

/*
 * Embedded lists need their own nvpriv_t's.  We create a new
 * nvpriv_t using the parameters and allocator from the parent
 * list's nvpriv_t.
 */
static nvpriv_t *
nv_priv_alloc_embedded(nvpriv_t *priv)
{
	nvpriv_t *emb_priv;

	if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL)
		return (NULL);

	nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED);

	return (emb_priv);
}

static void
nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv)
{
	nvl->nvl_version = NV_VERSION;
	nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE);
	nvl->nvl_priv = (uint64_t)(uintptr_t)priv;
	nvl->nvl_flag = 0;
	nvl->nvl_pad = 0;
}

uint_t
nvlist_nvflag(nvlist_t *nvl)
{
	return (nvl->nvl_nvflag);
}

/*
 * nvlist_alloc - Allocate nvlist.
 */
/*ARGSUSED1*/
int
nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag)
{
#if defined(_KERNEL) && !defined(_BOOT)
	return (nvlist_xalloc(nvlp, nvflag,
	    (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
#else
	return (nvlist_xalloc(nvlp, nvflag, nv_alloc_nosleep));
#endif
}

int
nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva)
{
	nvpriv_t *priv;

	if (nvlp == NULL || nva == NULL)
		return (EINVAL);

	if ((priv = nv_priv_alloc(nva)) == NULL)
		return (ENOMEM);

	if ((*nvlp = nv_mem_zalloc(priv,
	    NV_ALIGN(sizeof (nvlist_t)))) == NULL) {
		nv_mem_free(priv, priv, sizeof (nvpriv_t));
		return (ENOMEM);
	}

	nvlist_init(*nvlp, nvflag, priv);

	return (0);
}
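/*
 * Editor's illustrative sketch (not part of the original source, compiled
 * out): typical allocation and teardown through the wrappers above.  The
 * kmflag argument is ignored in userland builds, so 0 is passed here.
 */
#if 0
#include <libnvpair.h>

static int
make_list(nvlist_t **nvlp)
{
	int err;

	/* NV_UNIQUE_NAME: a later add with the same name replaces the pair */
	if ((err = nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0)) != 0)
		return (err);
	if ((err = nvlist_add_uint64(*nvlp, "size", 42)) != 0) {
		nvlist_free(*nvlp);
		*nvlp = NULL;
	}
	return (err);
}
#endif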
/*
 * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair.
 */
static nvpair_t *
nvp_buf_alloc(nvlist_t *nvl, size_t len)
{
	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
	i_nvp_t *buf;
	nvpair_t *nvp;
	size_t nvsize;

	/*
	 * Allocate the buffer
	 */
	nvsize = len + offsetof(i_nvp_t, nvi_nvp);

	if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL)
		return (NULL);

	nvp = &buf->nvi_nvp;
	nvp->nvp_size = len;

	return (nvp);
}

/*
 * nvp_buf_free - de-Allocate an i_nvp_t.
 */
static void
nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp)
{
	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
	size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp);

	nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize);
}

/*
 * nvp_buf_link - link a new nv pair into the nvlist.
 */
static void
nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp)
{
	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
	i_nvp_t *curr = NVPAIR2I_NVP(nvp);

	/* Put element at end of nvlist */
	if (priv->nvp_list == NULL) {
		priv->nvp_list = priv->nvp_last = curr;
	} else {
		curr->nvi_prev = priv->nvp_last;
		priv->nvp_last->nvi_next = curr;
		priv->nvp_last = curr;
	}
}

/*
 * nvp_buf_unlink - unlink a removed nvpair from the nvlist.
 */
static void
nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp)
{
	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
	i_nvp_t *curr = NVPAIR2I_NVP(nvp);

	/*
	 * protect nvlist_next_nvpair() against walking on freed memory.
	 */
	if (priv->nvp_curr == curr)
		priv->nvp_curr = curr->nvi_next;

	if (curr == priv->nvp_list)
		priv->nvp_list = curr->nvi_next;
	else
		curr->nvi_prev->nvi_next = curr->nvi_next;

	if (curr == priv->nvp_last)
		priv->nvp_last = curr->nvi_prev;
	else
		curr->nvi_next->nvi_prev = curr->nvi_prev;
}

/*
 * take an nvpair type and number of elements and make sure they are valid
 */
static int
i_validate_type_nelem(data_type_t type, uint_t nelem)
{
	switch (type) {
	case DATA_TYPE_BOOLEAN:
		if (nelem != 0)
			return (EINVAL);
		break;
	case DATA_TYPE_BOOLEAN_VALUE:
	case DATA_TYPE_BYTE:
	case DATA_TYPE_INT8:
	case DATA_TYPE_UINT8:
	case DATA_TYPE_INT16:
	case DATA_TYPE_UINT16:
	case DATA_TYPE_INT32:
	case DATA_TYPE_UINT32:
	case DATA_TYPE_INT64:
	case DATA_TYPE_UINT64:
	case DATA_TYPE_STRING:
	case DATA_TYPE_HRTIME:
	case DATA_TYPE_NVLIST:
#if !defined(_KERNEL)
	case DATA_TYPE_DOUBLE:
#endif
		if (nelem != 1)
			return (EINVAL);
		break;
	case DATA_TYPE_BOOLEAN_ARRAY:
	case DATA_TYPE_BYTE_ARRAY:
	case DATA_TYPE_INT8_ARRAY:
	case DATA_TYPE_UINT8_ARRAY:
	case DATA_TYPE_INT16_ARRAY:
	case DATA_TYPE_UINT16_ARRAY:
	case DATA_TYPE_INT32_ARRAY:
	case DATA_TYPE_UINT32_ARRAY:
	case DATA_TYPE_INT64_ARRAY:
	case DATA_TYPE_UINT64_ARRAY:
	case DATA_TYPE_STRING_ARRAY:
	case DATA_TYPE_NVLIST_ARRAY:
		/* we allow arrays with 0 elements */
		break;
	default:
		return (EINVAL);
	}
	return (0);
}
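/*
 * Editor's illustrative sketch (not part of the original source, compiled
 * out): the nelem rules enforced above.  DATA_TYPE_BOOLEAN carries no
 * value at all, scalars require exactly one element, and array types may
 * legally be empty.
 */
#if 0
#include <libnvpair.h>

static int
add_examples(nvlist_t *nvl)
{
	uint64_t dummy = 0;
	int err;

	if ((err = nvlist_add_boolean(nvl, "present")) != 0)	/* nelem 0 */
		return (err);
	if ((err = nvlist_add_boolean_value(nvl, "on", B_TRUE)) != 0)
		return (err);
	/* arrays with 0 elements are accepted */
	return (nvlist_add_uint64_array(nvl, "empty", &dummy, 0));
}
#endif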
/*
 * Verify nvp_name_sz and check the name string length.
 */
static int
i_validate_nvpair_name(nvpair_t *nvp)
{
	if ((nvp->nvp_name_sz <= 0) ||
	    (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0)))
		return (EFAULT);

	/* verify the name string, make sure it's terminated */
	if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0')
		return (EFAULT);

	return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 0 : EFAULT);
}

static int
i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data)
{
	switch (type) {
	case DATA_TYPE_BOOLEAN_VALUE:
		if (*(boolean_t *)data != B_TRUE &&
		    *(boolean_t *)data != B_FALSE)
			return (EINVAL);
		break;
	case DATA_TYPE_BOOLEAN_ARRAY: {
		int i;

		for (i = 0; i < nelem; i++)
			if (((boolean_t *)data)[i] != B_TRUE &&
			    ((boolean_t *)data)[i] != B_FALSE)
				return (EINVAL);
		break;
	}
	default:
		break;
	}

	return (0);
}

/*
 * This function takes a pointer to what should be an nvpair and its size
 * and then verifies that all the nvpair fields make sense and can be
 * trusted.  This function is used when decoding packed nvpairs.
 */
static int
i_validate_nvpair(nvpair_t *nvp)
{
	data_type_t type = NVP_TYPE(nvp);
	int size1, size2;

	/* verify nvp_name_sz, check the name string length */
	if (i_validate_nvpair_name(nvp) != 0)
		return (EFAULT);

	if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0)
		return (EFAULT);

	/*
	 * verify nvp_type, nvp_value_elem, and also possibly
	 * verify string values and get the value size.
	 */
	size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp));
	size1 = nvp->nvp_size - NVP_VALOFF(nvp);
	if (size2 < 0 || size1 != NV_ALIGN(size2))
		return (EFAULT);

	return (0);
}

static int
nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl)
{
	nvpriv_t *priv;
	i_nvp_t *curr;

	if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL)
		return (EINVAL);

	for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
		nvpair_t *nvp = &curr->nvi_nvp;
		int err;

		if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp),
		    NVP_TYPE(nvp), NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0)
			return (err);
	}

	return (0);
}

/*
 * Frees all memory allocated for an nvpair (like embedded lists) with
 * the exception of the nvpair buffer itself.
 */
static void
nvpair_free(nvpair_t *nvp)
{
	switch (NVP_TYPE(nvp)) {
	case DATA_TYPE_NVLIST:
		nvlist_free(EMBEDDED_NVL(nvp));
		break;
	case DATA_TYPE_NVLIST_ARRAY: {
		nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
		int i;

		for (i = 0; i < NVP_NELEM(nvp); i++)
			nvlist_free(nvlp[i]);
		break;
	}
	default:
		break;
	}
}

/*
 * nvlist_free - free an unpacked nvlist
 */
void
nvlist_free(nvlist_t *nvl)
{
	nvpriv_t *priv;
	i_nvp_t *curr;

	if (nvl == NULL ||
	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
		return;

	/*
	 * Unpacked nvlists are linked through i_nvp_t
	 */
	curr = priv->nvp_list;
	while (curr != NULL) {
		nvpair_t *nvp = &curr->nvi_nvp;
		curr = curr->nvi_next;

		nvpair_free(nvp);
		nvp_buf_free(nvl, nvp);
	}

	if (!(priv->nvp_stat & NV_STAT_EMBEDDED))
		nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t)));
	else
		nvl->nvl_priv = 0;

	nv_mem_free(priv, priv, sizeof (nvpriv_t));
}

static int
nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp)
{
	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
	i_nvp_t *curr;

	if (nvp == NULL)
		return (0);

	for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
		if (&curr->nvi_nvp == nvp)
			return (1);

	return (0);
}

/*
 * Make a copy of nvlist
 */
/*ARGSUSED1*/
int
nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag)
{
#if defined(_KERNEL) && !defined(_BOOT)
	return (nvlist_xdup(nvl, nvlp,
	    (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
#else
	return (nvlist_xdup(nvl, nvlp, nv_alloc_nosleep));
#endif
}

int
nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva)
{
	int err;
	nvlist_t *ret;

	if (nvl == NULL || nvlp == NULL)
		return (EINVAL);

	if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0)
		return (err);

	if ((err = nvlist_copy_pairs(nvl, ret)) != 0)
		nvlist_free(ret);
	else
		*nvlp = ret;

	return (err);
}
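/*
 * Editor's illustrative sketch (not part of the original source, compiled
 * out): nvlist_dup() deep-copies every pair, including embedded lists, so
 * the two lists can be freed independently.
 */
#if 0
#include <libnvpair.h>

static int
clone_list(nvlist_t *src, nvlist_t **dstp)
{
	return (nvlist_dup(src, dstp, 0));	/* kmflag unused in userland */
}
#endif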
/*
 * Remove all with matching name
 */
int
nvlist_remove_all(nvlist_t *nvl, const char *name)
{
	nvpriv_t *priv;
	i_nvp_t *curr;
	int error = ENOENT;

	if (nvl == NULL || name == NULL ||
	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
		return (EINVAL);

	curr = priv->nvp_list;
	while (curr != NULL) {
		nvpair_t *nvp = &curr->nvi_nvp;

		curr = curr->nvi_next;
		if (strcmp(name, NVP_NAME(nvp)) != 0)
			continue;

		nvp_buf_unlink(nvl, nvp);
		nvpair_free(nvp);
		nvp_buf_free(nvl, nvp);

		error = 0;
	}

	return (error);
}

/*
 * Remove first one with matching name and type
 */
int
nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
{
	nvpriv_t *priv;
	i_nvp_t *curr;

	if (nvl == NULL || name == NULL ||
	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
		return (EINVAL);

	curr = priv->nvp_list;
	while (curr != NULL) {
		nvpair_t *nvp = &curr->nvi_nvp;

		if (strcmp(name, NVP_NAME(nvp)) == 0 &&
		    NVP_TYPE(nvp) == type) {
			nvp_buf_unlink(nvl, nvp);
			nvpair_free(nvp);
			nvp_buf_free(nvl, nvp);

			return (0);
		}
		curr = curr->nvi_next;
	}

	return (ENOENT);
}

int
nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
{
	if (nvl == NULL || nvp == NULL)
		return (EINVAL);

	nvp_buf_unlink(nvl, nvp);
	nvpair_free(nvp);
	nvp_buf_free(nvl, nvp);
	return (0);
}
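/*
 * Editor's illustrative sketch (not part of the original source, compiled
 * out): nvlist_remove_all() drops every pair with a matching name, while
 * nvlist_remove() drops only the first pair matching both name and type.
 */
#if 0
#include <libnvpair.h>

static void
drop_pairs(nvlist_t *nvl)
{
	(void) nvlist_remove_all(nvl, "size");
	(void) nvlist_remove(nvl, "flags", DATA_TYPE_UINT64);
}
#endif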
/*
 * This function calculates the size of an nvpair value.
 *
 * The data argument controls the behavior in case of the data types
 *	DATA_TYPE_STRING	and
 *	DATA_TYPE_STRING_ARRAY
 * If data == NULL, the size of the string(s) is excluded.
 */
static int
i_get_value_size(data_type_t type, const void *data, uint_t nelem)
{
	uint64_t value_sz;

	if (i_validate_type_nelem(type, nelem) != 0)
		return (-1);

	/* Calculate required size for holding value */
	switch (type) {
	case DATA_TYPE_BOOLEAN:
		value_sz = 0;
		break;
	case DATA_TYPE_BOOLEAN_VALUE:
		value_sz = sizeof (boolean_t);
		break;
	case DATA_TYPE_BYTE:
		value_sz = sizeof (uchar_t);
		break;
	case DATA_TYPE_INT8:
		value_sz = sizeof (int8_t);
		break;
	case DATA_TYPE_UINT8:
		value_sz = sizeof (uint8_t);
		break;
	case DATA_TYPE_INT16:
		value_sz = sizeof (int16_t);
		break;
	case DATA_TYPE_UINT16:
		value_sz = sizeof (uint16_t);
		break;
	case DATA_TYPE_INT32:
		value_sz = sizeof (int32_t);
		break;
	case DATA_TYPE_UINT32:
		value_sz = sizeof (uint32_t);
		break;
	case DATA_TYPE_INT64:
		value_sz = sizeof (int64_t);
		break;
	case DATA_TYPE_UINT64:
		value_sz = sizeof (uint64_t);
		break;
#if !defined(_KERNEL)
	case DATA_TYPE_DOUBLE:
		value_sz = sizeof (double);
		break;
#endif
	case DATA_TYPE_STRING:
		if (data == NULL)
			value_sz = 0;
		else
			value_sz = strlen(data) + 1;
		break;
	case DATA_TYPE_BOOLEAN_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (boolean_t);
		break;
	case DATA_TYPE_BYTE_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uchar_t);
		break;
	case DATA_TYPE_INT8_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (int8_t);
		break;
	case DATA_TYPE_UINT8_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uint8_t);
		break;
	case DATA_TYPE_INT16_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (int16_t);
		break;
	case DATA_TYPE_UINT16_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uint16_t);
		break;
	case DATA_TYPE_INT32_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (int32_t);
		break;
	case DATA_TYPE_UINT32_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uint32_t);
		break;
	case DATA_TYPE_INT64_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (int64_t);
		break;
	case DATA_TYPE_UINT64_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uint64_t);
		break;
	case DATA_TYPE_STRING_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uint64_t);

		if (data != NULL) {
			char *const *strs = data;
			uint_t i;

			/* no alignment requirement for strings */
			for (i = 0; i < nelem; i++) {
				if (strs[i] == NULL)
					return (-1);
				value_sz += strlen(strs[i]) + 1;
			}
		}
		break;
	case DATA_TYPE_HRTIME:
		value_sz = sizeof (hrtime_t);
		break;
	case DATA_TYPE_NVLIST:
		value_sz = NV_ALIGN(sizeof (nvlist_t));
		break;
	case DATA_TYPE_NVLIST_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uint64_t) +
		    (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t));
		break;
	default:
		return (-1);
	}

	return (value_sz > INT32_MAX ? -1 : (int)value_sz);
}

static int
nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl)
{
	nvpriv_t *priv;
	int err;

	if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t)
	    nvl->nvl_priv)) == NULL)
		return (ENOMEM);

	nvlist_init(emb_nvl, onvl->nvl_nvflag, priv);

	if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) {
		nvlist_free(emb_nvl);
		emb_nvl->nvl_priv = 0;
	}

	return (err);
}

/*
 * nvlist_add_common - Add new pair to nvlist
 */
static int
nvlist_add_common(nvlist_t *nvl, const char *name,
    data_type_t type, uint_t nelem, const void *data)
{
	nvpair_t *nvp;
	uint_t i;

	int nvp_sz, name_sz, value_sz;
	int err = 0;

	if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
		return (EINVAL);

	if (nelem != 0 && data == NULL)
		return (EINVAL);

	/*
	 * Verify type and nelem and get the value size.
	 * For data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY,
	 * the size of the string(s) is included.
*/ if ((value_sz = i_get_value_size(type, data, nelem)) < 0) return (EINVAL); if (i_validate_nvpair_value(type, nelem, data) != 0) return (EINVAL); /* * If we're adding an nvlist or nvlist array, ensure that we are not * adding the input nvlist to itself, which would cause recursion, * and ensure that no NULL nvlist pointers are present. */ switch (type) { case DATA_TYPE_NVLIST: if (data == nvl || data == NULL) return (EINVAL); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **onvlp = (nvlist_t **)data; for (i = 0; i < nelem; i++) { if (onvlp[i] == nvl || onvlp[i] == NULL) return (EINVAL); } break; } default: break; } /* calculate sizes of the nvpair elements and the nvpair itself */ name_sz = strlen(name) + 1; nvp_sz = NVP_SIZE_CALC(name_sz, value_sz); if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL) return (ENOMEM); ASSERT(nvp->nvp_size == nvp_sz); nvp->nvp_name_sz = name_sz; nvp->nvp_value_elem = nelem; nvp->nvp_type = type; bcopy(name, NVP_NAME(nvp), name_sz); switch (type) { case DATA_TYPE_BOOLEAN: break; case DATA_TYPE_STRING_ARRAY: { char *const *strs = data; char *buf = NVP_VALUE(nvp); char **cstrs = (void *)buf; /* skip pre-allocated space for pointer array */ buf += nelem * sizeof (uint64_t); for (i = 0; i < nelem; i++) { int slen = strlen(strs[i]) + 1; bcopy(strs[i], buf, slen); cstrs[i] = buf; buf += slen; } break; } case DATA_TYPE_NVLIST: { nvlist_t *nnvl = EMBEDDED_NVL(nvp); nvlist_t *onvl = (nvlist_t *)data; if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) { nvp_buf_free(nvl, nvp); return (err); } break; } case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **onvlp = (nvlist_t **)data; nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); nvlist_t *embedded = (nvlist_t *) ((uintptr_t)nvlp + nelem * sizeof (uint64_t)); for (i = 0; i < nelem; i++) { if ((err = nvlist_copy_embedded(nvl, onvlp[i], embedded)) != 0) { /* * Free any successfully created lists */ nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (err); } nvlp[i] = embedded++; } break; } default: bcopy(data, NVP_VALUE(nvp), value_sz); } /* if unique name, remove before add */ if (nvl->nvl_nvflag & NV_UNIQUE_NAME) (void) nvlist_remove_all(nvl, name); else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE) (void) nvlist_remove(nvl, name, type); nvp_buf_link(nvl, nvp); return (0); } int nvlist_add_boolean(nvlist_t *nvl, const char *name) { return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL)); } int nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val)); } int nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val)); } int nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val)); } int nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val)); } int nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val)); } int nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val)); } int nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val)); } int nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val)); } int nvlist_add_int64(nvlist_t *nvl, const char *name, 
int64_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val)); } int nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val)); } #if !defined(_KERNEL) int nvlist_add_double(nvlist_t *nvl, const char *name, double val) { return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val)); } #endif int nvlist_add_string(nvlist_t *nvl, const char *name, const char *val) { return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val)); } int nvlist_add_boolean_array(nvlist_t *nvl, const char *name, boolean_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); } int nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); } int nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); } int nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); } int nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); } int nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); } int nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); } int nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); } int nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); } int nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); } int nvlist_add_string_array(nvlist_t *nvl, const char *name, char *const *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); } int nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val)); } int nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) { return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val)); } int nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); } /* reading name-value pairs */ nvpair_t * nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv; i_nvp_t *curr; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (NULL); curr = NVPAIR2I_NVP(nvp); /* * Ensure that nvp is a valid nvpair on this nvlist. * NB: nvp_curr is used only as a hint so that we don't always * have to walk the list to determine if nvp is still on the list. */ if (nvp == NULL) curr = priv->nvp_list; else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) curr = curr->nvi_next; else curr = NULL; priv->nvp_curr = curr; return (curr != NULL ? 
&curr->nvi_nvp : NULL); } nvpair_t * nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv; i_nvp_t *curr; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (NULL); curr = NVPAIR2I_NVP(nvp); if (nvp == NULL) curr = priv->nvp_last; else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) curr = curr->nvi_prev; else curr = NULL; priv->nvp_curr = curr; return (curr != NULL ? &curr->nvi_nvp : NULL); } boolean_t nvlist_empty(nvlist_t *nvl) { nvpriv_t *priv; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (B_TRUE); return (priv->nvp_list == NULL); } char * nvpair_name(nvpair_t *nvp) { return (NVP_NAME(nvp)); } data_type_t nvpair_type(nvpair_t *nvp) { return (NVP_TYPE(nvp)); } int nvpair_type_is_array(nvpair_t *nvp) { data_type_t type = NVP_TYPE(nvp); if ((type == DATA_TYPE_BYTE_ARRAY) || (type == DATA_TYPE_INT8_ARRAY) || (type == DATA_TYPE_UINT8_ARRAY) || (type == DATA_TYPE_INT16_ARRAY) || (type == DATA_TYPE_UINT16_ARRAY) || (type == DATA_TYPE_INT32_ARRAY) || (type == DATA_TYPE_UINT32_ARRAY) || (type == DATA_TYPE_INT64_ARRAY) || (type == DATA_TYPE_UINT64_ARRAY) || (type == DATA_TYPE_BOOLEAN_ARRAY) || (type == DATA_TYPE_STRING_ARRAY) || (type == DATA_TYPE_NVLIST_ARRAY)) return (1); return (0); } static int nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data) { if (nvp == NULL || nvpair_type(nvp) != type) return (EINVAL); /* * For non-array types, we copy the data. * For array types (including string), we set a pointer. */ switch (type) { case DATA_TYPE_BOOLEAN: if (nelem != NULL) *nelem = 0; break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: case DATA_TYPE_INT16: case DATA_TYPE_UINT16: case DATA_TYPE_INT32: case DATA_TYPE_UINT32: case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_HRTIME: #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: #endif if (data == NULL) return (EINVAL); bcopy(NVP_VALUE(nvp), data, (size_t)i_get_value_size(type, NULL, 1)); if (nelem != NULL) *nelem = 1; break; case DATA_TYPE_NVLIST: case DATA_TYPE_STRING: if (data == NULL) return (EINVAL); *(void **)data = (void *)NVP_VALUE(nvp); if (nelem != NULL) *nelem = 1; break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: case DATA_TYPE_INT16_ARRAY: case DATA_TYPE_UINT16_ARRAY: case DATA_TYPE_INT32_ARRAY: case DATA_TYPE_UINT32_ARRAY: case DATA_TYPE_INT64_ARRAY: case DATA_TYPE_UINT64_ARRAY: case DATA_TYPE_STRING_ARRAY: case DATA_TYPE_NVLIST_ARRAY: if (nelem == NULL || data == NULL) return (EINVAL); if ((*nelem = NVP_NELEM(nvp)) != 0) *(void **)data = (void *)NVP_VALUE(nvp); else *(void **)data = NULL; break; default: return (ENOTSUP); } return (0); } static int nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type, uint_t *nelem, void *data) { nvpriv_t *priv; nvpair_t *nvp; i_nvp_t *curr; if (name == NULL || nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (EINVAL); if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE))) return (ENOTSUP); for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { nvp = &curr->nvi_nvp; if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) return (nvpair_value_common(nvp, type, nelem, data)); } return (ENOENT); } int nvlist_lookup_boolean(nvlist_t *nvl, const char *name) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL)); } int nvlist_lookup_boolean_value(nvlist_t *nvl, const char 
*name, boolean_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, NULL, val)); } int nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val)); } int nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val)); } int nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val)); } int nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val)); } int nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val)); } int nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val)); } int nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val)); } int nvlist_lookup_int64(nvlist_t *nvl, const char *name, int64_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val)); } int nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val)); } #if !defined(_KERNEL) int nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val)); } #endif int nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val)); } int nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val)); } int nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name, boolean_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); } int nvlist_lookup_byte_array(nvlist_t *nvl, const char *name, uchar_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); } int nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); } int nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name, uint8_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); } int nvlist_lookup_int16_array(nvlist_t *nvl, const char *name, int16_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); } int nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name, uint16_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); } int nvlist_lookup_int32_array(nvlist_t *nvl, const char *name, int32_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); } int nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name, uint32_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); } int nvlist_lookup_int64_array(nvlist_t *nvl, const char *name, int64_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); } int nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name, uint64_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); } int nvlist_lookup_string_array(nvlist_t 
    *nvl, const char *name, char ***a, uint_t *n)
{
	return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
}

int
nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name,
    nvlist_t ***a, uint_t *n)
{
	return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
}

int
nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val)
{
	return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val));
}

int
nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...)
{
	va_list ap;
	char *name;
	int noentok = (flag & NV_FLAG_NOENTOK ? 1 : 0);
	int ret = 0;

	va_start(ap, flag);
	while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
		data_type_t type;
		void *val;
		uint_t *nelem;

		switch (type = va_arg(ap, data_type_t)) {
		case DATA_TYPE_BOOLEAN:
			ret = nvlist_lookup_common(nvl, name, type,
			    NULL, NULL);
			break;

		case DATA_TYPE_BOOLEAN_VALUE:
		case DATA_TYPE_BYTE:
		case DATA_TYPE_INT8:
		case DATA_TYPE_UINT8:
		case DATA_TYPE_INT16:
		case DATA_TYPE_UINT16:
		case DATA_TYPE_INT32:
		case DATA_TYPE_UINT32:
		case DATA_TYPE_INT64:
		case DATA_TYPE_UINT64:
		case DATA_TYPE_HRTIME:
		case DATA_TYPE_STRING:
		case DATA_TYPE_NVLIST:
#if !defined(_KERNEL)
		case DATA_TYPE_DOUBLE:
#endif
			val = va_arg(ap, void *);
			ret = nvlist_lookup_common(nvl, name, type, NULL, val);
			break;

		case DATA_TYPE_BYTE_ARRAY:
		case DATA_TYPE_BOOLEAN_ARRAY:
		case DATA_TYPE_INT8_ARRAY:
		case DATA_TYPE_UINT8_ARRAY:
		case DATA_TYPE_INT16_ARRAY:
		case DATA_TYPE_UINT16_ARRAY:
		case DATA_TYPE_INT32_ARRAY:
		case DATA_TYPE_UINT32_ARRAY:
		case DATA_TYPE_INT64_ARRAY:
		case DATA_TYPE_UINT64_ARRAY:
		case DATA_TYPE_STRING_ARRAY:
		case DATA_TYPE_NVLIST_ARRAY:
			val = va_arg(ap, void *);
			nelem = va_arg(ap, uint_t *);
			ret = nvlist_lookup_common(nvl, name, type, nelem, val);
			break;

		default:
			ret = EINVAL;
		}

		if (ret == ENOENT && noentok)
			ret = 0;
	}
	va_end(ap);

	return (ret);
}

/*
 * Find the 'name'ed nvpair in the nvlist 'nvl'.  If 'name' found, the function
 * returns zero and a pointer to the matching nvpair is returned in '*ret'
 * (given 'ret' is non-NULL).  If 'sep' is specified then 'name' will penetrate
 * multiple levels of embedded nvlists, with 'sep' as the separator.  As an
 * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or
 * "a.d[3].e[1]".  This matches the C syntax for array embed (for convenience,
 * code also supports "a.d[3]e[1]" syntax).
 *
 * If 'ip' is non-NULL and the last name component is an array, return the
 * value of the "...[index]" array index in *ip.  For an array reference that
 * is not indexed, *ip will be returned as -1.  If there is a syntax error in
 * 'name', and 'ep' is non-NULL then *ep will be set to point to the location
 * inside the 'name' string where the syntax error was detected.
 */
static int
nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep,
    nvpair_t **ret, int *ip, char **ep)
{
	nvpair_t *nvp;
	const char *np;
	char *sepp;
	char *idxp, *idxep;
	nvlist_t **nva;
	long idx;
	int n;

	if (ip)
		*ip = -1;			/* not indexed */
	if (ep)
		*ep = NULL;

	if ((nvl == NULL) || (name == NULL))
		return (EINVAL);

	sepp = NULL;
	idx = 0;
	/* step through components of name */
	for (np = name; np && *np; np = sepp) {
		/* ensure unique names */
		if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME))
			return (ENOTSUP);

		/* skip white space */
		skip_whitespace(np);
		if (*np == 0)
			break;

		/* set 'sepp' to end of current component 'np' */
		if (sep)
			sepp = strchr(np, sep);
		else
			sepp = NULL;
		/* find start of next "[ index ]..." */
		idxp = strchr(np, '[');

		/* if sepp comes first, set idxp to NULL */
		if (sepp && idxp && (sepp < idxp))
			idxp = NULL;

		/*
		 * At this point 'idxp' is set if there is an index
		 * expected for the current component.
		 */
		if (idxp) {
			/* set 'n' to length of current 'np' name component */
			n = idxp++ - np;

			/* keep sepp up to date for *ep use as we advance */
			skip_whitespace(idxp);
			sepp = idxp;

			/* determine the index value */
#if defined(_KERNEL) && !defined(_BOOT)
			if (ddi_strtol(idxp, &idxep, 0, &idx))
				goto fail;
#else
			idx = strtol(idxp, &idxep, 0);
#endif
			if (idxep == idxp)
				goto fail;

			/* keep sepp up to date for *ep use as we advance */
			sepp = idxep;

			/* skip white space after index value, check for ']' */
			skip_whitespace(sepp);
			if (*sepp++ != ']')
				goto fail;

			/* for embedded arrays, support C syntax: "a[1].b" */
			skip_whitespace(sepp);
			if (sep && (*sepp == sep))
				sepp++;
		} else if (sepp) {
			n = sepp++ - np;
		} else {
			n = strlen(np);
		}

		/* trim trailing whitespace by reducing length of 'np' */
		if (n == 0)
			goto fail;
		for (n--; (np[n] == ' ') || (np[n] == '\t'); n--)
			;
		n++;

		/* skip whitespace, and set sepp to NULL if complete */
		if (sepp) {
			skip_whitespace(sepp);
			if (*sepp == 0)
				sepp = NULL;
		}

		/*
		 * At this point:
		 * o  'n' is the length of current 'np' component.
		 * o  'idxp' is set if there was an index, and value 'idx'.
		 * o  'sepp' is set to the beginning of the next component,
		 *    and set to NULL if we have no more components.
		 *
		 * Search for nvpair with matching component name.
		 */
		for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL;
		    nvp = nvlist_next_nvpair(nvl, nvp)) {

			/* continue if no match on name */
			if (strncmp(np, nvpair_name(nvp), n) ||
			    (strlen(nvpair_name(nvp)) != n))
				continue;

			/* if indexed, verify type is array oriented */
			if (idxp && !nvpair_type_is_array(nvp))
				goto fail;

			/*
			 * Full match found, return nvp and idx if this
			 * was the last component.
			 */
			if (sepp == NULL) {
				if (ret)
					*ret = nvp;
				if (ip && idxp)
					*ip = (int)idx;	/* return index */
				return (0);		/* found */
			}

			/*
			 * More components: current match must be
			 * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY
			 * to support going deeper.
			 */
			if (nvpair_type(nvp) == DATA_TYPE_NVLIST) {
				nvl = EMBEDDED_NVL(nvp);
				break;
			} else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) {
				(void) nvpair_value_nvlist_array(nvp,
				    &nva, (uint_t *)&n);
				if ((n < 0) || (idx >= n))
					goto fail;
				nvl = nva[idx];
				break;
			}

			/* type does not support more levels */
			goto fail;
		}
		if (nvp == NULL)
			goto fail;		/* 'name' not found */

		/* search for match of next component in embedded 'nvl' list */
	}

fail:	if (ep && sepp)
		*ep = sepp;
	return (EINVAL);
}

/*
 * Return pointer to nvpair with specified 'name'.
 */
int
nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret)
{
	return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL));
}
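/*
 * Editor's illustrative sketch (not part of the original source, compiled
 * out): the '.'-separated syntax accepted by
 * nvlist_lookup_nvpair_embedded_index() below, e.g. "a.d[3].e[1]".
 */
#if 0
#include <libnvpair.h>

static int
find_deep(nvlist_t *nvl)
{
	nvpair_t *nvp;
	int idx;
	char *errp;

	/* on success nvp names "e" and idx holds the trailing index, 1 */
	return (nvlist_lookup_nvpair_embedded_index(nvl, "a.d[3].e[1]",
	    &nvp, &idx, &errp));
}
#endif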
/*
 * Determine if named nvpair exists in nvlist (use embedded separator of '.'
 * and return array index).  See nvlist_lookup_nvpair_ei_sep for more detailed
 * description.
 */
int
nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl,
    const char *name, nvpair_t **ret, int *ip, char **ep)
{
	return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep));
}

boolean_t
nvlist_exists(nvlist_t *nvl, const char *name)
{
	nvpriv_t *priv;
	nvpair_t *nvp;
	i_nvp_t *curr;

	if (name == NULL || nvl == NULL ||
	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
		return (B_FALSE);

	for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
		nvp = &curr->nvi_nvp;

		if (strcmp(name, NVP_NAME(nvp)) == 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}

int
nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val));
}

int
nvpair_value_byte(nvpair_t *nvp, uchar_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val));
}

int
nvpair_value_int8(nvpair_t *nvp, int8_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val));
}

int
nvpair_value_uint8(nvpair_t *nvp, uint8_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val));
}

int
nvpair_value_int16(nvpair_t *nvp, int16_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val));
}

int
nvpair_value_uint16(nvpair_t *nvp, uint16_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val));
}

int
nvpair_value_int32(nvpair_t *nvp, int32_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val));
}

int
nvpair_value_uint32(nvpair_t *nvp, uint32_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val));
}

int
nvpair_value_int64(nvpair_t *nvp, int64_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val));
}

int
nvpair_value_uint64(nvpair_t *nvp, uint64_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val));
}

#if !defined(_KERNEL)
int
nvpair_value_double(nvpair_t *nvp, double *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val));
}
#endif

int
nvpair_value_string(nvpair_t *nvp, char **val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val));
}

int
nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val));
}

int
nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val));
}

int
nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val));
}

int
nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val));
}

int
nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val));
}

int
nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val));
}

int
nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val));
}

int
nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val));
}

int
nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val));
}

int
nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val));
}

int
nvpair_value_uint64_array(nvpair_t *nvp,
uint64_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val)); } int nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val)); } int nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val)); } int nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val)); } /* * Add specified pair to the list. */ int nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) { if (nvl == NULL || nvp == NULL) return (EINVAL); return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp), NVP_NELEM(nvp), NVP_VALUE(nvp))); } /* * Merge the supplied nvlists and put the result in dst. * The merged list will contain all names specified in both lists, * the values are taken from nvl in the case of duplicates. * Return 0 on success. */ /*ARGSUSED*/ int nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag) { if (nvl == NULL || dst == NULL) return (EINVAL); if (dst != nvl) return (nvlist_copy_pairs(nvl, dst)); return (0); } /* * Encoding related routines */ #define NVS_OP_ENCODE 0 #define NVS_OP_DECODE 1 #define NVS_OP_GETSIZE 2 typedef struct nvs_ops nvs_ops_t; typedef struct { int nvs_op; const nvs_ops_t *nvs_ops; void *nvs_private; nvpriv_t *nvs_priv; } nvstream_t; /* * nvs operations are: * - nvs_nvlist * encoding / decoding of a nvlist header (nvlist_t) * calculates the size used for header and end detection * * - nvs_nvpair * responsible for the first part of encoding / decoding of an nvpair * calculates the decoded size of an nvpair * * - nvs_nvp_op * second part of encoding / decoding of an nvpair * * - nvs_nvp_size * calculates the encoding size of an nvpair * * - nvs_nvl_fini * encodes the end detection mark (zeros). 
*/ struct nvs_ops { int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *); int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *); int (*nvs_nvp_op)(nvstream_t *, nvpair_t *); int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *); int (*nvs_nvl_fini)(nvstream_t *); }; typedef struct { char nvh_encoding; /* nvs encoding method */ char nvh_endian; /* nvs endian */ char nvh_reserved1; /* reserved for future use */ char nvh_reserved2; /* reserved for future use */ } nvs_header_t; static int nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *curr; /* * Walk nvpair in list and encode each nvpair */ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0) return (EFAULT); return (nvs->nvs_ops->nvs_nvl_fini(nvs)); } static int nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl) { nvpair_t *nvp; size_t nvsize; int err; /* * Get decoded size of next pair in stream, alloc * memory for nvpair_t, then decode the nvpair */ while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) { if (nvsize == 0) /* end of list */ break; /* make sure len makes sense */ if (nvsize < NVP_SIZE_CALC(1, 0)) return (EFAULT); if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL) return (ENOMEM); if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) { nvp_buf_free(nvl, nvp); return (err); } if (i_validate_nvpair(nvp) != 0) { nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (EFAULT); } nvp_buf_link(nvl, nvp); } return (err); } static int nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *curr; uint64_t nvsize = *buflen; size_t size; /* * Get encoded size of nvpairs in nvlist */ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0) return (EINVAL); if ((nvsize += size) > INT32_MAX) return (EINVAL); } *buflen = nvsize; return (0); } static int nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) { int err; if (nvl->nvl_priv == 0) return (EFAULT); /* * Perform the operation, starting with header, then each nvpair */ if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0) return (err); switch (nvs->nvs_op) { case NVS_OP_ENCODE: err = nvs_encode_pairs(nvs, nvl); break; case NVS_OP_DECODE: err = nvs_decode_pairs(nvs, nvl); break; case NVS_OP_GETSIZE: err = nvs_getsize_pairs(nvs, nvl, buflen); break; default: err = EINVAL; } return (err); } static int nvs_embedded(nvstream_t *nvs, nvlist_t *embedded) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: return (nvs_operation(nvs, embedded, NULL)); case NVS_OP_DECODE: { nvpriv_t *priv; int err; if (embedded->nvl_version != NV_VERSION) return (ENOTSUP); if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL) return (ENOMEM); nvlist_init(embedded, embedded->nvl_nvflag, priv); if ((err = nvs_operation(nvs, embedded, NULL)) != 0) nvlist_free(embedded); return (err); } default: break; } return (EINVAL); } static int nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { size_t nelem = NVP_NELEM(nvp); nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); int i; switch (nvs->nvs_op) { case NVS_OP_ENCODE: for (i = 0; i < nelem; i++) if (nvs_embedded(nvs, nvlp[i]) != 0) return (EFAULT); break; case NVS_OP_DECODE: { size_t len = nelem * sizeof (uint64_t); nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len); bzero(nvlp, len); /* don't trust packed data */ for (i = 0; i < nelem; i++) { if 
(nvs_embedded(nvs, embedded) != 0) { nvpair_free(nvp); return (EFAULT); } nvlp[i] = embedded++; } break; } case NVS_OP_GETSIZE: { uint64_t nvsize = 0; for (i = 0; i < nelem; i++) { size_t nvp_sz = 0; if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0) return (EINVAL); if ((nvsize += nvp_sz) > INT32_MAX) return (EINVAL); } *size = nvsize; break; } default: return (EINVAL); } return (0); } static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *); static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *); /* * Common routine for nvlist operations: * encode, decode, getsize (encoded size). */ static int nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, int nvs_op) { int err = 0; nvstream_t nvs; int nvl_endian; #if BYTE_ORDER == _LITTLE_ENDIAN int host_endian = 1; #else int host_endian = 0; #endif /* _LITTLE_ENDIAN */ nvs_header_t *nvh = (void *)buf; if (buflen == NULL || nvl == NULL || (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (EINVAL); nvs.nvs_op = nvs_op; /* * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and * a buffer is allocated. The first 4 bytes in the buffer are * used for encoding method and host endian. */ switch (nvs_op) { case NVS_OP_ENCODE: if (buf == NULL || *buflen < sizeof (nvs_header_t)) return (EINVAL); nvh->nvh_encoding = encoding; nvh->nvh_endian = nvl_endian = host_endian; nvh->nvh_reserved1 = 0; nvh->nvh_reserved2 = 0; break; case NVS_OP_DECODE: if (buf == NULL || *buflen < sizeof (nvs_header_t)) return (EINVAL); /* get method of encoding from first byte */ encoding = nvh->nvh_encoding; nvl_endian = nvh->nvh_endian; break; case NVS_OP_GETSIZE: nvl_endian = host_endian; /* * add the size for encoding */ *buflen = sizeof (nvs_header_t); break; default: return (ENOTSUP); } /* * Create an nvstream with proper encoding method */ switch (encoding) { case NV_ENCODE_NATIVE: /* * check endianness, in case we are unpacking * from a file */ if (nvl_endian != host_endian) return (ENOTSUP); err = nvs_native(&nvs, nvl, buf, buflen); break; case NV_ENCODE_XDR: err = nvs_xdr(&nvs, nvl, buf, buflen); break; default: err = ENOTSUP; break; } return (err); } int nvlist_size(nvlist_t *nvl, size_t *size, int encoding) { return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE)); } /* * Pack nvlist into contiguous memory */ /*ARGSUSED1*/ int nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, int kmflag) { #if defined(_KERNEL) && !defined(_BOOT) return (nvlist_xpack(nvl, bufp, buflen, encoding, (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); #else return (nvlist_xpack(nvl, bufp, buflen, encoding, nv_alloc_nosleep)); #endif } int nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, nv_alloc_t *nva) { nvpriv_t nvpriv; size_t alloc_size; char *buf; int err; if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL) return (EINVAL); if (*bufp != NULL) return (nvlist_common(nvl, *bufp, buflen, encoding, NVS_OP_ENCODE)); /* * Here is a difficult situation: * 1. The nvlist has fixed allocator properties. * All other nvlist routines (like nvlist_add_*, ...) use * these properties. - * 2. When using nvlist_pack() the user can specify his own + * 2. When using nvlist_pack() the user can specify their own * allocator properties (e.g. by using KM_NOSLEEP). * * We use the user specified properties (2). A clearer solution * will be to remove the kmflag from nvlist_pack(), but we will * not change the interface. 
*/ nv_priv_init(&nvpriv, nva, 0); if ((err = nvlist_size(nvl, &alloc_size, encoding))) return (err); if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL) return (ENOMEM); if ((err = nvlist_common(nvl, buf, &alloc_size, encoding, NVS_OP_ENCODE)) != 0) { nv_mem_free(&nvpriv, buf, alloc_size); } else { *buflen = alloc_size; *bufp = buf; } return (err); } /* * Unpack buf into an nvlist_t */ /*ARGSUSED1*/ int nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag) { #if defined(_KERNEL) && !defined(_BOOT) return (nvlist_xunpack(buf, buflen, nvlp, (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); #else return (nvlist_xunpack(buf, buflen, nvlp, nv_alloc_nosleep)); #endif } int nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva) { nvlist_t *nvl; int err; if (nvlp == NULL) return (EINVAL); if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0) return (err); if ((err = nvlist_common(nvl, buf, &buflen, 0, NVS_OP_DECODE)) != 0) nvlist_free(nvl); else *nvlp = nvl; return (err); } /* * Native encoding functions */ typedef struct { /* * This structure is used when decoding a packed nvpair in * the native format. n_base points to a buffer containing the * packed nvpair. n_end is a pointer to the end of the buffer. * (n_end actually points to the first byte past the end of the * buffer.) n_curr is a pointer that lies between n_base and n_end. * It points to the current data that we are decoding. * The amount of data left in the buffer is equal to n_end - n_curr. * n_flag is used to recognize a packed embedded list. */ caddr_t n_base; caddr_t n_end; caddr_t n_curr; uint_t n_flag; } nvs_native_t; static int nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf, size_t buflen) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: nvs->nvs_private = native; native->n_curr = native->n_base = buf; native->n_end = buf + buflen; native->n_flag = 0; return (0); case NVS_OP_GETSIZE: nvs->nvs_private = native; native->n_curr = native->n_base = native->n_end = NULL; native->n_flag = 0; return (0); default: return (EINVAL); } } /*ARGSUSED*/ static void nvs_native_destroy(nvstream_t *nvs) { } static int native_cp(nvstream_t *nvs, void *buf, size_t size) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; if (native->n_curr + size > native->n_end) return (EFAULT); /* * The bcopy() below eliminates alignment requirement * on the buffer (stream) and is preferred over direct access. 
*/ switch (nvs->nvs_op) { case NVS_OP_ENCODE: bcopy(buf, native->n_curr, size); break; case NVS_OP_DECODE: bcopy(native->n_curr, buf, size); break; default: return (EINVAL); } native->n_curr += size; return (0); } /* * operate on nvlist_t header */ static int nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) { nvs_native_t *native = nvs->nvs_private; switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: if (native->n_flag) return (0); /* packed embedded list */ native->n_flag = 1; /* copy version and nvflag of the nvlist_t */ if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 || native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0) return (EFAULT); return (0); case NVS_OP_GETSIZE: /* * if calculate for packed embedded list * 4 for end of the embedded list * else * 2 * sizeof (int32_t) for nvl_version and nvl_nvflag * and 4 for end of the entire list */ if (native->n_flag) { *size += 4; } else { native->n_flag = 1; *size += 2 * sizeof (int32_t) + 4; } return (0); default: return (EINVAL); } } static int nvs_native_nvl_fini(nvstream_t *nvs) { if (nvs->nvs_op == NVS_OP_ENCODE) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; /* * Add 4 zero bytes at end of nvlist. They are used * for end detection by the decode routine. */ if (native->n_curr + sizeof (int) > native->n_end) return (EFAULT); bzero(native->n_curr, sizeof (int)); native->n_curr += sizeof (int); } return (0); } static int nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp) { if (nvs->nvs_op == NVS_OP_ENCODE) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; char *packed = (void *) (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); /* * Null out the pointer that is meaningless in the packed * structure. The address may not be aligned, so we have * to use bzero. */ bzero(packed + offsetof(nvlist_t, nvl_priv), sizeof(((nvlist_t *)NULL)->nvl_priv)); } return (nvs_embedded(nvs, EMBEDDED_NVL(nvp))); } static int nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp) { if (nvs->nvs_op == NVS_OP_ENCODE) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp); size_t len = NVP_NELEM(nvp) * sizeof (uint64_t); int i; /* * Null out pointers that are meaningless in the packed * structure. The addresses may not be aligned, so we have * to use bzero. */ bzero(value, len); value += len; for (i = 0; i < NVP_NELEM(nvp); i++) { /* * Null out the pointer that is meaningless in the * packed structure. The address may not be aligned, * so we have to use bzero. */ bzero(value + offsetof(nvlist_t, nvl_priv), sizeof(((nvlist_t *)NULL)->nvl_priv)); value += sizeof(nvlist_t); } } return (nvs_embedded_nvl_array(nvs, nvp, NULL)); } static void nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; uint64_t *strp = (void *) (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); /* * Null out pointers that are meaningless in the packed * structure. The addresses may not be aligned, so we have * to use bzero. 
*/ bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t)); break; } case NVS_OP_DECODE: { char **strp = (void *)NVP_VALUE(nvp); char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t)); int i; for (i = 0; i < NVP_NELEM(nvp); i++) { strp[i] = buf; buf += strlen(buf) + 1; } break; } } } static int nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp) { data_type_t type; int value_sz; int ret = 0; /* * We do the initial bcopy of the data before we look at * the nvpair type, because when we're decoding, we won't * have the correct values for the pair until we do the bcopy. */ switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: if (native_cp(nvs, nvp, nvp->nvp_size) != 0) return (EFAULT); break; default: return (EINVAL); } /* verify nvp_name_sz, check the name string length */ if (i_validate_nvpair_name(nvp) != 0) return (EFAULT); type = NVP_TYPE(nvp); /* * Verify type and nelem and get the value size. * For data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY * the size of the string(s) is excluded. */ if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0) return (EFAULT); if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size) return (EFAULT); switch (type) { case DATA_TYPE_NVLIST: ret = nvpair_native_embedded(nvs, nvp); break; case DATA_TYPE_NVLIST_ARRAY: ret = nvpair_native_embedded_array(nvs, nvp); break; case DATA_TYPE_STRING_ARRAY: nvpair_native_string_array(nvs, nvp); break; default: break; } return (ret); } static int nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { uint64_t nvp_sz = nvp->nvp_size; switch (NVP_TYPE(nvp)) { case DATA_TYPE_NVLIST: { size_t nvsize = 0; if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0) return (EINVAL); nvp_sz += nvsize; break; } case DATA_TYPE_NVLIST_ARRAY: { size_t nvsize; if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0) return (EINVAL); nvp_sz += nvsize; break; } default: break; } if (nvp_sz > INT32_MAX) return (EINVAL); *size = nvp_sz; return (0); } static int nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: return (nvs_native_nvp_op(nvs, nvp)); case NVS_OP_DECODE: { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; int32_t decode_len; /* try to read the size value from the stream */ if (native->n_curr + sizeof (int32_t) > native->n_end) return (EFAULT); bcopy(native->n_curr, &decode_len, sizeof (int32_t)); /* sanity check the size value */ if (decode_len < 0 || decode_len > native->n_end - native->n_curr) return (EFAULT); *size = decode_len; /* * If at the end of the stream then move the cursor * forward, otherwise nvs_native_nvp_op() will read * the entire nvpair at the same cursor position.
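/*
 * The decode path above never trusts a length it read from the
 * stream.  A minimal standalone version of that sanity check
 * (illustrative names; memcpy stands in for bcopy):
 */
#include <stdint.h>
#include <string.h>

static int
read_len(const char *curr, const char *end, int32_t *lenp)
{
	int32_t len;

	if (curr + sizeof (int32_t) > end)
		return (-1);			/* truncated stream */
	memcpy(&len, curr, sizeof (int32_t));	/* alignment-safe read */
	if (len < 0 || len > end - curr)
		return (-1);			/* implausible size */
	*lenp = len;
	return (0);
}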
*/ if (*size == 0) native->n_curr += sizeof (int32_t); break; } default: return (EINVAL); } return (0); } static const nvs_ops_t nvs_native_ops = { nvs_native_nvlist, nvs_native_nvpair, nvs_native_nvp_op, nvs_native_nvp_size, nvs_native_nvl_fini }; static int nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) { nvs_native_t native; int err; nvs->nvs_ops = &nvs_native_ops; if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t), *buflen - sizeof (nvs_header_t))) != 0) return (err); err = nvs_operation(nvs, nvl, buflen); nvs_native_destroy(nvs); return (err); } /* * XDR encoding functions * * An xdr packed nvlist is encoded as: * * - encoding method and host endian (4 bytes) * - nvl_version (4 bytes) * - nvl_nvflag (4 bytes) * * - encoded nvpairs, the format of one xdr encoded nvpair is: * - encoded size of the nvpair (4 bytes) * - decoded size of the nvpair (4 bytes) * - name string, (4 + sizeof(NV_ALIGN4(string))) * a string is coded as size (4 bytes) and data * - data type (4 bytes) * - number of elements in the nvpair (4 bytes) * - data * * - 2 zeros for end of the entire list (8 bytes) */ static int nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen) { /* xdr data must be 4 byte aligned */ if ((ulong_t)buf % 4 != 0) return (EFAULT); switch (nvs->nvs_op) { case NVS_OP_ENCODE: xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE); nvs->nvs_private = xdr; return (0); case NVS_OP_DECODE: xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE); nvs->nvs_private = xdr; return (0); case NVS_OP_GETSIZE: nvs->nvs_private = NULL; return (0); default: return (EINVAL); } } static void nvs_xdr_destroy(nvstream_t *nvs) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: xdr_destroy((XDR *)nvs->nvs_private); break; default: break; } } static int nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: { XDR *xdr = nvs->nvs_private; if (!xdr_int(xdr, &nvl->nvl_version) || !xdr_u_int(xdr, &nvl->nvl_nvflag)) return (EFAULT); break; } case NVS_OP_GETSIZE: { /* * 2 * 4 for nvl_version + nvl_nvflag * and 8 for end of the entire list */ *size += 2 * 4 + 8; break; } default: return (EINVAL); } return (0); } static int nvs_xdr_nvl_fini(nvstream_t *nvs) { if (nvs->nvs_op == NVS_OP_ENCODE) { XDR *xdr = nvs->nvs_private; int zero = 0; if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero)) return (EFAULT); } return (0); } /* * The format of an xdr encoded nvpair is: * encode_size, decode_size, name string, data type, nelem, data */ static int nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) { data_type_t type; char *buf; char *buf_end = (char *)nvp + nvp->nvp_size; int value_sz; uint_t nelem, buflen; bool_t ret = FALSE; XDR *xdr = nvs->nvs_private; ASSERT(xdr != NULL && nvp != NULL); /* name string */ if ((buf = NVP_NAME(nvp)) >= buf_end) return (EFAULT); buflen = buf_end - buf; if (!xdr_string(xdr, &buf, buflen - 1)) return (EFAULT); nvp->nvp_name_sz = strlen(buf) + 1; /* type and nelem */ if (!xdr_int(xdr, (int *)&nvp->nvp_type) || !xdr_int(xdr, &nvp->nvp_value_elem)) return (EFAULT); type = NVP_TYPE(nvp); nelem = nvp->nvp_value_elem; /* * Verify type and nelem and get the value size. * For data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY * the size of the string(s) is excluded.
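/*
 * A small userland sketch of the list header layout documented above,
 * using the standard Sun RPC xdrmem stream (rpc/rpc.h).  It encodes
 * nvl_version and nvl_nvflag as two 4-byte big-endian words, exactly
 * the 2 * 4 bytes that nvs_xdr_nvlist() accounts for, followed by the
 * two zero words that terminate the list.  Values are illustrative.
 */
#include <rpc/rpc.h>

static int
encode_header(char *buf, unsigned int buflen)
{
	XDR xdr;
	int version = 0;		/* NV_VERSION */
	unsigned int nvflag = 1;	/* e.g. NV_UNIQUE_NAME */
	int zero = 0;
	int ok;

	xdrmem_create(&xdr, buf, buflen, XDR_ENCODE);
	ok = xdr_int(&xdr, &version) &&
	    xdr_u_int(&xdr, &nvflag) &&
	    xdr_int(&xdr, &zero) && xdr_int(&xdr, &zero);
	xdr_destroy(&xdr);
	return (ok ? 0 : -1);	/* 16 bytes consumed on success */
}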
*/ if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0) return (EFAULT); /* if there is no data to extract then return */ if (nelem == 0) return (0); /* value */ if ((buf = NVP_VALUE(nvp)) >= buf_end) return (EFAULT); buflen = buf_end - buf; if (buflen < value_sz) return (EFAULT); switch (type) { case DATA_TYPE_NVLIST: if (nvs_embedded(nvs, (void *)buf) == 0) return (0); break; case DATA_TYPE_NVLIST_ARRAY: if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0) return (0); break; case DATA_TYPE_BOOLEAN: ret = TRUE; break; case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: ret = xdr_char(xdr, buf); break; case DATA_TYPE_INT16: ret = xdr_short(xdr, (void *)buf); break; case DATA_TYPE_UINT16: ret = xdr_u_short(xdr, (void *)buf); break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_INT32: ret = xdr_int(xdr, (void *)buf); break; case DATA_TYPE_UINT32: ret = xdr_u_int(xdr, (void *)buf); break; case DATA_TYPE_INT64: ret = xdr_longlong_t(xdr, (void *)buf); break; case DATA_TYPE_UINT64: ret = xdr_u_longlong_t(xdr, (void *)buf); break; case DATA_TYPE_HRTIME: /* * NOTE: must expose the definition of hrtime_t here */ ret = xdr_longlong_t(xdr, (void *)buf); break; #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: ret = xdr_double(xdr, (void *)buf); break; #endif case DATA_TYPE_STRING: ret = xdr_string(xdr, &buf, buflen - 1); break; case DATA_TYPE_BYTE_ARRAY: ret = xdr_opaque(xdr, buf, nelem); break; case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t), (xdrproc_t)xdr_char); break; case DATA_TYPE_INT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t), sizeof (int16_t), (xdrproc_t)xdr_short); break; case DATA_TYPE_UINT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t), sizeof (uint16_t), (xdrproc_t)xdr_u_short); break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_INT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t), sizeof (int32_t), (xdrproc_t)xdr_int); break; case DATA_TYPE_UINT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t), sizeof (uint32_t), (xdrproc_t)xdr_u_int); break; case DATA_TYPE_INT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t), sizeof (int64_t), (xdrproc_t)xdr_longlong_t); break; case DATA_TYPE_UINT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t), sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t); break; case DATA_TYPE_STRING_ARRAY: { size_t len = nelem * sizeof (uint64_t); char **strp = (void *)buf; int i; if (nvs->nvs_op == NVS_OP_DECODE) bzero(buf, len); /* don't trust packed data */ for (i = 0; i < nelem; i++) { if (buflen <= len) return (EFAULT); buf += len; buflen -= len; if (xdr_string(xdr, &buf, buflen - 1) != TRUE) return (EFAULT); if (nvs->nvs_op == NVS_OP_DECODE) strp[i] = buf; len = strlen(buf) + 1; } ret = TRUE; break; } default: break; } return (ret == TRUE ? 
0 : EFAULT); } static int nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { data_type_t type = NVP_TYPE(nvp); /* * encode_size + decode_size + name string size + data type + nelem * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) */ uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4; switch (type) { case DATA_TYPE_BOOLEAN: break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: case DATA_TYPE_INT16: case DATA_TYPE_UINT16: case DATA_TYPE_INT32: case DATA_TYPE_UINT32: nvp_sz += 4; /* 4 is the minimum xdr unit */ break; case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_HRTIME: #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: #endif nvp_sz += 8; break; case DATA_TYPE_STRING: nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp))); break; case DATA_TYPE_BYTE_ARRAY: nvp_sz += NV_ALIGN4(NVP_NELEM(nvp)); break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: case DATA_TYPE_INT16_ARRAY: case DATA_TYPE_UINT16_ARRAY: case DATA_TYPE_INT32_ARRAY: case DATA_TYPE_UINT32_ARRAY: nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp); break; case DATA_TYPE_INT64_ARRAY: case DATA_TYPE_UINT64_ARRAY: nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp); break; case DATA_TYPE_STRING_ARRAY: { int i; char **strs = (void *)NVP_VALUE(nvp); for (i = 0; i < NVP_NELEM(nvp); i++) nvp_sz += 4 + NV_ALIGN4(strlen(strs[i])); break; } case DATA_TYPE_NVLIST: case DATA_TYPE_NVLIST_ARRAY: { size_t nvsize = 0; int old_nvs_op = nvs->nvs_op; int err; nvs->nvs_op = NVS_OP_GETSIZE; if (type == DATA_TYPE_NVLIST) err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize); else err = nvs_embedded_nvl_array(nvs, nvp, &nvsize); nvs->nvs_op = old_nvs_op; if (err != 0) return (EINVAL); nvp_sz += nvsize; break; } default: return (EINVAL); } if (nvp_sz > INT32_MAX) return (EINVAL); *size = nvp_sz; return (0); } /* * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates * the largest nvpair that could be encoded in the buffer. * * See comments above nvs_xdr_nvp_op() for the format of xdr encoding. * The size of an xdr packed nvpair without any data is 5 words. * * Using the size of the data directly as an estimate would be ok * in all cases except one. If the data type is of DATA_TYPE_STRING_ARRAY * then the actual nvpair has space for an array of pointers to index * the strings. These pointers are not encoded into the packed xdr buffer. * * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are * of length 0, then each string is encoded in xdr format as a single word. * Therefore when expanded to an nvpair there will be 2.25 words used for * each string. (an int64_t allocated for pointer usage, and a single char * for the null termination.) * * This is the calculation performed by the NVS_XDR_MAX_LEN macro. */ #define NVS_XDR_HDR_LEN ((size_t)(5 * 4)) #define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ?
\ 0 : ((size_t)(y) - NVS_XDR_HDR_LEN)) #define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \ (NVS_XDR_DATA_LEN(x) * 2) + \ NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4))) static int nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { XDR *xdr = nvs->nvs_private; int32_t encode_len, decode_len; switch (nvs->nvs_op) { case NVS_OP_ENCODE: { size_t nvsize; if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0) return (EFAULT); decode_len = nvp->nvp_size; encode_len = nvsize; if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) return (EFAULT); return (nvs_xdr_nvp_op(nvs, nvp)); } case NVS_OP_DECODE: { struct xdr_bytesrec bytesrec; /* get the encode and decode size */ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) return (EFAULT); *size = decode_len; /* are we at the end of the stream? */ if (*size == 0) return (0); /* sanity check the size parameter */ if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec)) return (EFAULT); if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail)) return (EFAULT); break; } default: return (EINVAL); } return (0); } static const struct nvs_ops nvs_xdr_ops = { nvs_xdr_nvlist, nvs_xdr_nvpair, nvs_xdr_nvp_op, nvs_xdr_nvp_size, nvs_xdr_nvl_fini }; static int nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) { XDR xdr; int err; nvs->nvs_ops = &nvs_xdr_ops; if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t), *buflen - sizeof (nvs_header_t))) != 0) return (err); err = nvs_operation(nvs, nvl, buflen); nvs_xdr_destroy(nvs); return (err); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c (revision 329627) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c (revision 329628) @@ -1,922 +1,922 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups * of names after deciding which is the appropriate lookup interface.
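/*
 * A compact userland restatement of the lookup-interface decision
 * described above (simplified, illustrative names): the zap match
 * flags depend only on whether the filesystem normalizes names and,
 * if so, whether it was created case-mixed, in which case the match
 * must still honor the case handed in by the caller.
 */
#define	DEMO_MT_EXACT		0u	/* plain zap_lookup() */
#define	DEMO_MT_NORMALIZE	1u	/* zap_lookup_norm() */
#define	DEMO_MT_MATCH_CASE	2u	/* ... without case folding */

static unsigned
demo_match_flags(int z_norm, int case_mixed)
{
	unsigned mt = DEMO_MT_EXACT;

	if (z_norm) {
		mt = DEMO_MT_NORMALIZE;
		if (case_mixed)
			mt |= DEMO_MT_MATCH_CASE;
	}
	return (mt);
}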
*/ static int zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, matchtype_t mt, uint64_t *zoid) { int error; if (zfsvfs->z_norm) { /* * In the non-mixed case we only expect there would ever * be one match, but we need to use the normalizing lookup. */ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid, mt, NULL, 0, NULL); } else { error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); } *zoid = ZFS_DIRENT_OBJ(*zoid); return (error); } /* * Look up a directory entry under a locked vnode. * dvp being locked gives us a guarantee that there are no concurrent * modifications of the directory and, thus, if a node can be found in * the directory, then it must not be unlinked. * * Input arguments: * dzp - znode for directory * name - name of entry to lock * flag - ZNEW: if the entry already exists, fail with EEXIST. * ZEXISTS: if the entry does not exist, fail with ENOENT. * ZXATTR: we want dzp's xattr directory * * Output arguments: * zpp - pointer to the znode for the entry (NULL if there isn't one) * * Return value: 0 on success or errno on failure. * * NOTE: Always checks for, and rejects, '.' and '..'. */ int zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; matchtype_t mt = 0; uint64_t zoid; vnode_t *vp = NULL; int error = 0; ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); *zpp = NULL; /* * Verify that we are not trying to lock '.', '..', or '.zfs' */ if (name[0] == '.' && (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) || zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) return (SET_ERROR(EEXIST)); /* * Case sensitivity and normalization preferences are set when * the file system is created. These are stored in the * zfsvfs->z_case and zfsvfs->z_norm fields. These choices * affect how we perform zap lookups. * * When matching we may need to normalize & change case according to * FS settings. * * Note that a normalized match is necessary for a case insensitive * filesystem when the lookup request is not exact because normalization * can fold case independent of normalizing code point sequences. * * See the table above zfs_dropname(). */ if (zfsvfs->z_norm != 0) { mt = MT_NORMALIZE; /* * Determine if the match needs to honor the case specified in * lookup, and if so keep track of that so that during * normalization we don't fold case. */ if (zfsvfs->z_case == ZFS_CASE_MIXED) { mt |= MT_MATCH_CASE; } } /* * Only look in or update the DNLC if we are looking for the * name on a file system that does not require normalization * or case folding. We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name. * * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE * because in that case MT_EXACT and MT_FIRST should produce exactly * the same result. */ if (dzp->z_unlinked && !(flag & ZXATTR)) return (ENOENT); if (flag & ZXATTR) { error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, sizeof (zoid)); if (error == 0) error = (zoid == 0 ?
ENOENT : 0); } else { error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid); } if (error) { if (error != ENOENT || (flag & ZEXISTS)) { return (error); } } else { if (flag & ZNEW) { return (SET_ERROR(EEXIST)); } error = zfs_zget(zfsvfs, zoid, zpp); if (error) return (error); ASSERT(!(*zpp)->z_unlinked); } return (0); } static int zfs_dd_lookup(znode_t *dzp, znode_t **zpp) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; znode_t *zp; uint64_t parent; int error; ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); if (dzp->z_unlinked) return (ENOENT); if ((error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) return (error); error = zfs_zget(zfsvfs, parent, &zp); if (error == 0) *zpp = zp; return (error); } int zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; znode_t *zp; int error = 0; ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); if (dzp->z_unlinked) return (SET_ERROR(ENOENT)); if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { *zpp = dzp; } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { error = zfs_dd_lookup(dzp, zpp); } else { error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS); if (error == 0) { dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ *zpp = zp; } } return (error); } /* * unlinked Set (formerly known as the "delete queue") Error Handling * * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we * don't specify the name of the entry that we will be manipulating. We * also fib and say that we won't be adding any new entries to the * unlinked set, even though we might (this is to lower the minimum file * size that can be deleted in a full filesystem). So on the small * chance that the nlink list is using a fat zap (ie. has more than * 2000 entries), we *may* not pre-read a block that's needed. * Therefore it is remotely possible for some of the assertions * regarding the unlinked set below to fail due to i/o error. On a * nondebug system, this will result in the space being leaked. */ void zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp->z_unlinked); ASSERT(zp->z_links == 0); VERIFY3U(0, ==, zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); } /* * Clean up any znodes that had no links when we either crashed or * (force) umounted the file system. */ void zfs_unlinked_drain(zfsvfs_t *zfsvfs) { zap_cursor_t zc; zap_attribute_t zap; dmu_object_info_t doi; znode_t *zp; int error; /* * Iterate over the contents of the unlinked set. */ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); zap_cursor_retrieve(&zc, &zap) == 0; zap_cursor_advance(&zc)) { /* * See what kind of object we have in list */ error = dmu_object_info(zfsvfs->z_os, zap.za_first_integer, &doi); if (error != 0) continue; ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); /* * We need to re-mark these list entries for deletion, * so we pull them back into core and set zp->z_unlinked. */ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); /* * We may pick up znodes that are already marked for deletion. * This could happen during the purge of an extended attribute * directory. All we need to do is skip over them, since they * are already in the system marked z_unlinked.
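/*
 * The name classification performed by zfs_dirlook() above is plain
 * string inspection; a standalone version of the same test
 * (illustrative name and return values):
 */
static int	/* 0 = ordinary, 1 = self ("" or "."), 2 = parent ("..") */
demo_classify_name(const char *name)
{
	if (name[0] == '\0' || (name[0] == '.' && name[1] == '\0'))
		return (1);
	if (name[0] == '.' && name[1] == '.' && name[2] == '\0')
		return (2);
	return (0);
}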
*/ if (error != 0) continue; vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); zp->z_unlinked = B_TRUE; vput(ZTOV(zp)); } zap_cursor_fini(&zc); } /* * Delete the entire contents of a directory. Return a count * of the number of entries that could not be deleted. If we encounter * an error, return a count of at least one so that the directory stays * in the unlinked set. * * NOTE: this function assumes that the directory is inactive, * so there is no need to lock its entries before deletion. * Also, it assumes the directory contents are *only* regular * files. */ static int zfs_purgedir(znode_t *dzp) { zap_cursor_t zc; zap_attribute_t zap; znode_t *xzp; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; int skipped = 0; int error; for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); (error = zap_cursor_retrieve(&zc, &zap)) == 0; zap_cursor_advance(&zc)) { error = zfs_zget(zfsvfs, ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); if (error) { skipped += 1; continue; } vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); ASSERT((ZTOV(xzp)->v_type == VREG) || (ZTOV(xzp)->v_type == VLNK)); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* Is this really needed? */ zfs_sa_upgrade_txholds(tx, xzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); vput(ZTOV(xzp)); skipped += 1; continue; } error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL); if (error) skipped += 1; dmu_tx_commit(tx); vput(ZTOV(xzp)); } zap_cursor_fini(&zc); if (error != ENOENT) skipped += 1; return (skipped); } void zfs_rmnode(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; znode_t *xzp = NULL; dmu_tx_t *tx; uint64_t acl_obj; uint64_t xattr_obj; int error; ASSERT(zp->z_links == 0); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); /* * If this is an attribute directory, purge its contents. */ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && (zp->z_pflags & ZFS_XATTR)) { if (zfs_purgedir(zp) != 0) { /* * Not enough space to delete some xattrs. * Leave it in the unlinked set. */ zfs_znode_dmu_fini(zp); zfs_znode_free(zp); return; } } else { /* * Free up all the data in the file. We don't do this for * XATTR directories because we need truncate and remove to be * in the same tx, like in zfs_znode_delete(). Otherwise, if * we crash here we'll end up with an inconsistent truncated * zap object in the delete queue. Note a truncated file is * harmless since it only contains user data. */ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); if (error) { /* * Not enough space or we were interrupted by unmount. * Leave the file in the unlinked set. */ zfs_znode_dmu_fini(zp); zfs_znode_free(zp); return; } } /* * If the file has extended attributes, we're going to unlink * the xattr dir. */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT3S(error, ==, 0); vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); } acl_obj = zfs_external_acl(zp); /* * Set up the final transaction.
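/*
 * A standalone sketch of the 64-bit directory-entry value seen above:
 * ZFS_DIRENT_OBJ() keeps the low 48 bits (the object number), and on
 * ZPL versions with typed dirents zfs_dirent() below ORs the d_type
 * nibble into bits 60-63.  The mask here is an illustrative stand-in
 * for the real macro.
 */
#include <stdint.h>

#define	DEMO_OBJ_MASK	((1ULL << 48) - 1)	/* cf. ZFS_DIRENT_OBJ() */

static uint64_t
demo_pack_dirent(uint64_t object, uint64_t dtype)
{
	return ((object & DEMO_OBJ_MASK) | (dtype << 60));
}

static uint64_t
demo_unpack_object(uint64_t de)
{
	return (de & DEMO_OBJ_MASK);	/* drop the type bits */
}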
*/ tx = dmu_tx_create(os); dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); if (xzp) { dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } if (acl_obj) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { /* * Not enough space to delete the file. Leave it in the * unlinked set, leaking it until the fs is remounted (at * which point we'll call zfs_unlinked_drain() to process it). */ dmu_tx_abort(tx); zfs_znode_dmu_fini(zp); zfs_znode_free(zp); goto out; } if (xzp) { ASSERT(error == 0); xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ xzp->z_links = 0; /* no more links to it */ VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &xzp->z_links, sizeof (xzp->z_links), tx)); zfs_unlinked_add(xzp, tx); } /* Remove this znode from the unlinked set */ VERIFY3U(0, ==, zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); zfs_znode_delete(zp, tx); dmu_tx_commit(tx); out: if (xzp) vput(ZTOV(xzp)); } static uint64_t zfs_dirent(znode_t *zp, uint64_t mode) { uint64_t de = zp->z_id; if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) de |= IFTODT(mode) << 60; return (de); } /* * Link zp into dzp. Can only fail if zp has been unlinked. */ int zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); uint64_t value; int zp_is_dir = (vp->v_type == VDIR); sa_bulk_attr_t bulk[5]; uint64_t mtime[2], ctime[2]; int count = 0; int error; ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); #ifdef __FreeBSD__ if (zp_is_dir) { if (dzp->z_links >= ZFS_LINK_MAX) return (SET_ERROR(EMLINK)); } #endif if (!(flag & ZRENAMING)) { if (zp->z_unlinked) { /* no new links to unlinked zp */ ASSERT(!(flag & (ZNEW | ZEXISTS))); return (SET_ERROR(ENOENT)); } #ifdef __FreeBSD__ if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) { return (SET_ERROR(EMLINK)); } #endif zp->z_links++; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); } else { ASSERT(zp->z_unlinked == 0); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &dzp->z_id, sizeof (dzp->z_id)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (!(flag & ZNEW)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); } error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(error); dzp->z_size++; dzp->z_links += zp_is_dir; count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &dzp->z_size, sizeof (dzp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &dzp->z_links, sizeof (dzp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT0(error); value = zfs_dirent(zp, zp->z_mode); error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name, 8, 1, &value, tx); VERIFY0(error); return (0); } /* * The match type in the code for this function should conform to: * * 
------------------------------------------------------------------------ * fs type | z_norm | lookup type | match type * ---------|-------------|-------------|---------------------------------- * CS !norm | 0 | 0 | 0 (exact) * CS norm | formX | 0 | MT_NORMALIZE * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE * CM !norm | upper | ZCILOOK | MT_NORMALIZE * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE * * Abbreviations: * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) * formX = unicode normalization form set on fs creation */ static int zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag) { int error; if (zp->z_zfsvfs->z_norm) { matchtype_t mt = MT_NORMALIZE; if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) { mt |= MT_MATCH_CASE; } error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id, name, mt, tx); } else { error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx); } return (error); } /* * Unlink zp from dzp, and mark zp for deletion if this was the last link. * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. * If it's non-NULL, we use it to indicate whether the znode needs deletion, * and it's the caller's job to do it. */ int zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag, boolean_t *unlinkedp) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; vnode_t *vp = ZTOV(zp); int zp_is_dir = (vp->v_type == VDIR); boolean_t unlinked = B_FALSE; sa_bulk_attr_t bulk[5]; uint64_t mtime[2], ctime[2]; int count = 0; int error; ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (!(flag & ZRENAMING)) { if (zp_is_dir && !zfs_dirempty(zp)) { #ifdef illumos return (SET_ERROR(EEXIST)); #else return (SET_ERROR(ENOTEMPTY)); #endif } /* * If we get here, we are going to try to remove the object. * First try removing the name from the directory; if that * fails, return the error. */ error = zfs_dropname(dzp, name, zp, tx, flag); if (error != 0) { return (error); } if (zp->z_links <= zp_is_dir) { zfs_panic_recover("zfs: link count on vnode %p is %u, " "should be at least %u", zp->z_vnode, (int)zp->z_links, zp_is_dir + 1); zp->z_links = zp_is_dir + 1; } if (--zp->z_links == zp_is_dir) { zp->z_unlinked = B_TRUE; zp->z_links = 0; unlinked = B_TRUE; } else { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); count = 0; ASSERT0(error); } else { ASSERT(zp->z_unlinked == 0); error = zfs_dropname(dzp, name, zp, tx, flag); if (error != 0) return (error); } dzp->z_size--; /* one dirent removed */ dzp->z_links -= zp_is_dir; /* ".." 
link from zp */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &dzp->z_links, sizeof (dzp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &dzp->z_size, sizeof (dzp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT0(error); if (unlinkedp != NULL) *unlinkedp = unlinked; else if (unlinked) zfs_unlinked_add(zp, tx); return (0); } /* * Indicate whether the directory is empty. */ boolean_t zfs_dirempty(znode_t *dzp) { return (dzp->z_size == 2); } int zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_t *xzp; dmu_tx_t *tx; int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t parent; *xvpp = NULL; /* * In FreeBSD, access checking for creating an EA is being done * in zfs_setextattr(). */ #ifndef __FreeBSD_kernel__ if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) return (error); #endif if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, &acl_ids)) != 0) return (error); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); return (error); } zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); #ifdef DEBUG error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); ASSERT(error == 0 && parent == zp->z_id); #endif VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, sizeof (xzp->z_id), tx)); (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); *xvpp = ZTOV(xzp); return (0); } /* * Return a znode for the extended attribute directory for zp. * ** If the directory does not already exist, it is created ** * * IN: zp - znode to obtain attribute directory from * cr - credentials of caller * flags - flags from the VOP_LOOKUP call * * OUT: xzpp - pointer to extended attribute znode * * RETURN: 0 on success * error number on failure */ int zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_t *xzp; vattr_t va; int error; top: error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR); if (error) return (error); if (xzp != NULL) { *xvpp = ZTOV(xzp); return (0); } if (!(flags & CREATE_XATTR_DIR)) { #ifdef illumos return (SET_ERROR(ENOENT)); #else return (SET_ERROR(ENOATTR)); #endif } if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { return (SET_ERROR(EROFS)); } /* * The ability to 'create' files in an attribute * directory comes from the write_xattr permission on the base file. * * The ability to 'search' an attribute directory requires * read_xattr permission on the base file.
* * Once in a directory the ability to read/write attributes * is controlled by the permissions on the attribute file. */ va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; va.va_type = VDIR; va.va_mode = S_IFDIR | S_ISVTX | 0777; zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); error = zfs_make_xattrdir(zp, &va, xvpp, cr); if (error == ERESTART) { /* NB: we already did dmu_tx_wait() if necessary */ goto top; } if (error == 0) VOP_UNLOCK(*xvpp, 0); return (error); } /* * Decide whether it is okay to remove within a sticky directory. * * In sticky directories, write access is not sufficient; * you can remove entries from a directory only if: * * you own the directory, * you own the entry, * the entry is a plain file and you have write access, * or you are privileged (checked in secpolicy...). * * The function returns 0 if remove access is granted. */ int zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) { uid_t uid; uid_t downer; uid_t fowner; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; if (zdp->z_zfsvfs->z_replay) return (0); if ((zdp->z_mode & S_ISVTX) == 0) return (0); downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); if ((uid = crgetuid(cr)) == downer || uid == fowner || (ZTOV(zp)->v_type == VREG && zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) return (0); else return (secpolicy_vnode_remove(ZTOV(zp), cr)); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 329627) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 329628) @@ -1,4151 +1,4151 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 
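/*
 * The sticky-directory policy implemented by
 * zfs_sticky_remove_access() above reduces to a small predicate; a
 * userland sketch under the stated rules (illustrative names, uids
 * passed in directly, the final privilege check omitted):
 */
#include <sys/types.h>

static int	/* returns 1 if removal is allowed without privilege */
demo_sticky_remove_ok(uid_t uid, uid_t dir_owner, uid_t file_owner,
    int is_regular, int has_write_access)
{
	if (uid == dir_owner || uid == file_owner)
		return (1);
	if (is_regular && has_write_access)
		return (1);
	return (0);	/* fall back to a privilege check */
}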
* Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); #if defined(__amd64__) static int zio_use_uma = 1; #else static int zio_use_uma = 0; #endif SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, "Use uma(9) for ZIO allocations"); static int zio_exclude_metadata = 0; SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well"); zio_trim_stats_t zio_trim_stats = { { "bytes", KSTAT_DATA_UINT64, "Number of bytes successfully TRIMmed" }, { "success", KSTAT_DATA_UINT64, "Number of successful TRIM requests" }, { "unsupported", KSTAT_DATA_UINT64, "Number of TRIM requests that failed because TRIM is not supported" }, { "failed", KSTAT_DATA_UINT64, "Number of TRIM requests that failed for reasons other than not supported" }, }; static kstat_t *zio_trim_ksp; /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *zio_type_name[ZIO_TYPES] = { "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", "zio_ioctl" }; boolean_t zio_dva_throttle_enabled = B_TRUE; SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN, &zio_dva_throttle_enabled, 0, ""); /* * ========================================================================== * I/O kmem caches * ========================================================================== */ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #ifdef _KERNEL extern vmem_t *zio_alloc_arena; #endif #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly affect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred.
*/ int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; #ifdef illumos #ifdef ZFS_DEBUG int zio_buf_debug_limit = 16384; #else int zio_buf_debug_limit = 0; #endif #endif static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); void zio_init(void) { size_t c; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); if (!zio_use_uma) goto out; /* * For small buffers, we want a cache for each multiple of * SPA_MINBLOCKSIZE. For larger buffers, we want a cache * for each quarter-power of 2. */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t p2 = size; size_t align = 0; int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0; while (!ISP2(p2)) p2 &= p2 - 1; #ifdef illumos #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; #endif #endif /* illumos */ if (size <= 4 * SPA_MINBLOCKSIZE) { align = SPA_MINBLOCKSIZE; } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = MIN(p2 >> 2, PAGESIZE); } if (align != 0) { char name[36]; (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); /* * Since zio_data bufs do not appear in crash dumps, we * pass KMC_NOTOUCH so that no allocator metadata is * stored with the buffers. 
*/ (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags | KMC_NOTOUCH | KMC_NODEBUG); } } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } out: zio_inject_init(); zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", KSTAT_TYPE_NAMED, sizeof(zio_trim_stats) / sizeof(kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zio_trim_ksp != NULL) { zio_trim_ksp->ks_data = &zio_trim_stats; kstat_install(zio_trim_ksp); } } void zio_fini(void) { size_t c; kmem_cache_t *last_cache = NULL; kmem_cache_t *last_data_cache = NULL; for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { if (zio_buf_cache[c] != last_cache) { last_cache = zio_buf_cache[c]; kmem_cache_destroy(zio_buf_cache[c]); } zio_buf_cache[c] = NULL; if (zio_data_buf_cache[c] != last_data_cache) { last_data_cache = zio_data_buf_cache[c]; kmem_cache_destroy(zio_data_buf_cache[c]); } zio_data_buf_cache[c] = NULL; } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); if (zio_trim_ksp != NULL) { kstat_delete(zio_trim_ksp); zio_trim_ksp = NULL; } } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; int flags = zio_exclude_metadata ? KM_NODEBUG : 0; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); else return (kmem_alloc(size, KM_SLEEP|flags)); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. 
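/*
 * A userland sketch of the size-class math used by the allocators
 * above, assuming the 512-byte SPA quantum (SPA_MINBLOCKSHIFT == 9):
 * the cache index is simply (size - 1) >> 9, and zio_init()'s ISP2
 * loop derives the alignment from the largest power of two <= size.
 * Names prefixed demo_ are illustrative.
 */
#include <stddef.h>

#define	DEMO_SPA_MINBLOCKSHIFT	9

static size_t
demo_cache_index(size_t size)	/* 1..512 -> 0, 513..1024 -> 1, ... */
{
	return ((size - 1) >> DEMO_SPA_MINBLOCKSHIFT);
}

static size_t
demo_floor_pow2(size_t size)	/* the p2 &= p2 - 1 loop above */
{
	size_t p2 = size;

	while (p2 & (p2 - 1))	/* clear low set bits until a power of 2 */
		p2 &= p2 - 1;
	return (p2);
}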
(Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); else return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) kmem_cache_free(zio_buf_cache[c], buf); else kmem_free(buf, size); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) kmem_cache_free(zio_data_buf_cache[c], buf); else kmem_free(buf, size); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ void zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); /* * Ensure that anyone expecting this zio to contain a linear ABD isn't * going to get a nasty surprise when they try to access the data. */ #ifdef illumos IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); #else IMPLY(zio->io_abd != NULL && abd_is_linear(zio->io_abd), abd_is_linear(data)); #endif zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_abd = data; zio->io_size = size; } void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) abd_free(zio->io_abd); zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks and decompression * ========================================================================== */ static void zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) abd_copy(data, zio->io_abd, size); } static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_abd, tmp, zio->io_size, size); abd_return_buf_copy(data, tmp, size); if (ret != 0) zio->io_error = SET_ERROR(EIO); } } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ zio_t * zio_walk_parents(zio_t *cio, zio_link_t **zl) { list_t *pl = &cio->io_parent_list; *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_child == cio); return ((*zl)->zl_parent); } zio_t * zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; *zl = (*zl == NULL) ? 
list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_parent == pio); return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(cio, &zl); VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); pio->io_child_count++; cio->io_parent_count++; mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); pio->io_child_count--; cio->io_parent_count--; mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) { boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); for (int c = 0; c < ZIO_CHILD_TYPES; c++) { if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) continue; uint64_t *countp = &zio->io_children[c][wait]; if (*countp != 0) { zio->io_stage >>= 1; ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; break; } } mutex_exit(&zio->io_lock); return (waiting); } static void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { zio_taskq_type_t type = pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); /* * Dispatch the parent zio in its own taskq so that * the child can continue to make progress. This also * prevents overflowing the stack when we have deeply nested * parent-child relationships. 
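/*
 * A single-threaded userland model of the countdown implemented by
 * zio_wait_for_children()/zio_notify_parent() above (illustrative
 * names; the real code protects these fields with io_lock): the
 * parent records how many children are still outstanding, each
 * completing child decrements the counter, and the child that drops
 * it to zero is the one that re-dispatches a stalled parent.
 */
#include <assert.h>

typedef struct demo_parent {
	unsigned	p_outstanding;	/* children not yet done */
	int		p_stalled;	/* parent waiting on them */
} demo_parent_t;

static int	/* returns 1 if the caller must re-dispatch the parent */
demo_child_done(demo_parent_t *p)
{
	assert(p->p_outstanding > 0);
	if (--p->p_outstanding == 0 && p->p_stalled) {
		p->p_stalled = 0;	/* parent may make progress again */
		return (1);
	}
	return (0);
}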
*/ zio_taskq_dispatch(pio, type, B_FALSE); } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } int zio_bookmark_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) return (-1); if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) return (1); if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) return (-1); if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) return (1); if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) return (-1); if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) return (1); if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) return (-1); if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; ASSERT3U(type == ZIO_TYPE_FREE || psize, <=, SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; 
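/*
 * zio_bookmark_compare() above establishes a total order: it compares
 * the four bookmark fields in significance order and, only when all
 * four tie, falls back to the zio addresses so that distinct zios
 * never compare equal.  The same pattern in miniature (illustrative
 * type; the pointer tie-break mirrors the z1 < z2 test above):
 */
typedef struct demo_key {
	unsigned long k_major;
	unsigned long k_minor;
} demo_key_t;

static int
demo_compare(const demo_key_t *a, const demo_key_t *b)
{
	if (a->k_major != b->k_major)
		return (a->k_major < b->k_major ? -1 : 1);
	if (a->k_minor != b->k_minor)
		return (a->k_minor < b->k_minor ? -1 : 1);
	/* tie-break on address to keep the order total */
	if (a != b)
		return (a < b ? -1 : 1);
	return (0);
}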
if (pio != NULL) { if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } return (zio); } static void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) { return (zio_null(NULL, spa, NULL, done, private, flags)); } void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) { if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { zfs_panic_recover("blkptr at %p has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } /* * Pool-specific checks. * * Note: it would be nice to verify that the blk_birth and * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() * allows the birth time of log blocks (and dmu_sync()-ed blocks * that are in the log) to be arbitrarily large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); if (vdevid >= spa->spa_root_vdev->vdev_children) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { zfs_panic_recover("blkptr at %p DVA %u has hole " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. 
*/ continue; } uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); if (BP_IS_GANG(bp)) asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); if (offset + asize > vd->vdev_asize) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "OFFSET %llu", bp, i, (longlong_t)offset); } } } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; zfs_blkptr_verify(spa, bp); zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_children_ready = children_ready; zio->io_physdone = physdone; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). */ if (data == NULL && zio->io_prop.zp_dedup_verify) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). 
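 *
 * (An embedded bp stores its payload inside the block pointer itself
 * and has no allocated DVAs, so there is nothing to free and nothing
 * to account for in the space maps.)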
*/ if (BP_IS_EMBEDDED(bp)) return; metaslab_check_free(spa, bp); /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. */ if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, BP_GET_PSIZE(bp), 0))); } } zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, uint64_t size, enum zio_flag flags) { zio_t *zio; enum zio_stage stage = ZIO_FREE_PIPELINE; ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); metaslab_check_free(spa, bp); arc_freed(spa, bp); if (zfs_trim_enabled) stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | ZIO_STAGE_VDEV_IO_ASSESS; /* * GANG and DEDUP blocks can induce a read (for the gang block header, * or the DDT), so issue them asynchronously so that this thread is * not tied up. */ else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) stage |= ZIO_STAGE_ISSUE_ASYNC; flags |= ZIO_FLAG_DONT_QUEUE; zio = zio_create(pio, spa, txg, bp, NULL, size, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); return (zio); } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; dprintf_bp(bp, "claiming in txg %llu", txg); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. 
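 *
 * A typical caller, simplified from the ZIL claim path:
 *
 *	(void) zio_wait(zio_claim(NULL, spa, first_txg, bp,
 *	    NULL, NULL, ZIO_FLAG_CANFAIL));
 *
 * where first_txg is spa_first_txg(spa) during a real claim, or 0 to
 * merely verify claimability (as zdb does).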
*/ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); ASSERT(txg == spa_first_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ASSERT0(zio->io_queued_timestamp); return (zio); } zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, offset, size, done, private, priority, flags)); } return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ abd_t *wbuf = abd_alloc_sametype(data, size); abd_copy(wbuf, data, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; ASSERT(vd->vdev_parent == (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. 
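 * For a mirror read, e.g., the leaf child verifies the copy it
 * returns, so the mirror parent can hand the data up without
 * re-checksumming it.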
*/ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* Not all IO types require vdev io done stage e.g. free */ if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; if (vd->vdev_children == 0) offset += VDEV_LABEL_START_SIZE; flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; /* * If we're creating a child I/O that is not associated with a * top-level vdev, then the child zio is not an allocating I/O. * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. */ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { metaslab_class_t *mc = spa_normal_class(pio->io_spa); ASSERT(mc->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); flags &= ~ZIO_FLAG_IO_ALLOCATING; } zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); zio->io_physdone = pio->io_physdone; if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) zio->io_logical->io_phys_children++; return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } void zio_flush(zio_t *zio, vdev_t *vd) { zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, NULL, NULL, ZIO_PRIORITY_NOW, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } zio_t * zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) { ASSERT(vd->vdev_ops->vdev_op_leaf); return (zio_create(zio, spa, 0, NULL, NULL, size, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT3P(zio->io_executor, ==, NULL); ASSERT3P(zio->io_orig_size, ==, zio->io_size); ASSERT3U(size, <=, zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. 
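 * (zio_shrink() is used by the ZIL to trim a log-block write down to
 * the bytes actually used; the check below simply leaves raidz
 * blocks at their original size.)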
*/ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) { /* we are not doing a raw write */ ASSERT3U(zio->io_size, ==, zio->io_lsize); zio->io_orig_size = zio->io_size = zio->io_lsize = size; } } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static int zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (ZIO_PIPELINE_CONTINUE); } static int zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; ASSERT(bp->blk_birth != zio->io_txg); ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (BP_IS_EMBEDDED(bp)) return (ZIO_PIPELINE_CONTINUE); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (ZIO_PIPELINE_CONTINUE); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (ZIO_PIPELINE_CONTINUE); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (ZIO_PIPELINE_CONTINUE); } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } return (ZIO_PIPELINE_CONTINUE); } static int zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; int pass = 1; EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. 
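 * zio_wait_for_children() returns B_TRUE while children are still
 * outstanding; it records this zio as the stalled parent, and the
 * last child to complete re-dispatches us so this stage runs again
 * from the top.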
*/ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { return (ZIO_PIPELINE_STOP); } if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); if (zio->io_children_ready != NULL) { /* * Now that all our children are ready, run the callback * associated with this zio in case it wants to modify the * data to be written. */ ASSERT3U(zp->zp_level, >, 0); zio->io_children_ready(zio); } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && psize == lsize) { void *cbuf = zio_buf_alloc(lsize); psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (ZIO_PIPELINE_CONTINUE); } else { /* * Round up compressed size up to the ashift * of the smallest-ashift device, and zero the tail. * This ensures that the compressed size of the BP * (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); size_t rounded = (size_t)P2ROUNDUP(psize, 1ULL << spa->spa_min_ashift); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); psize = lsize; } else { abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_take_ownership_of_buf(cdata, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cdata, psize, lsize, NULL); } } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } else { ASSERT3U(psize, !=, 0); } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. 
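 * (With the historical defaults, zfs_sync_pass_dont_compress == 5
 * and zfs_sync_pass_rewrite == 2, so compression stops at pass 5 and
 * same-size rewrites are forced from pass 2 on.)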
* There should only be a handful of blocks after pass 1 in any case. */ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { ASSERT(psize != 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (zio->io_bp_orig.blk_birth != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (ZIO_PIPELINE_CONTINUE); } static int zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available. */ if (zio->io_priority == ZIO_PRIORITY_NOW && spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. 
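 * The platform-specific ASSERTs below verify that the zio's embedded
 * task entry is idle before we dispatch it (tqent_next on illumos,
 * ta_pending on FreeBSD).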
*/ #if defined(illumos) || !defined(_KERNEL) ASSERT(zio->io_tqent.tqent_next == NULL); #else ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); #endif spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { kthread_t *executor = zio->io_executor; spa_t *spa = zio->io_spa; for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (taskq_member(tqs->stqs_taskq[i], executor)) return (B_TRUE); } } return (B_FALSE); } static int zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } void zio_interrupt(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } void zio_delay_interrupt(zio_t *zio) { /* * The timeout_generic() function isn't defined in userspace, so * rather than trying to implement the function, the zio delay * functionality has been disabled for userspace builds. */ #ifdef _KERNEL /* * If io_target_timestamp is zero, then no delay has been registered * for this IO, thus jump to the end of this function and "skip" the * delay; issuing it directly to the zio layer. */ if (zio->io_target_timestamp != 0) { hrtime_t now = gethrtime(); if (now >= zio->io_target_timestamp) { /* * This IO has already taken longer than the target * delay to complete, so we don't want to delay it * any longer; we "miss" the delay and issue it * directly to the zio layer. This is likely due to * the target latency being set to a value less than * the underlying hardware can satisfy (e.g. delay * set to 1ms, but the disks take 10ms to complete an * IO request). */ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, hrtime_t, now); zio_interrupt(zio); } else { hrtime_t diff = zio->io_target_timestamp - now; DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); (void) timeout_generic(CALLOUT_NORMAL, (void (*)(void *))zio_interrupt, zio, diff, 1, 0); } return; } #endif DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); zio_interrupt(zio); } /* * Execute the I/O pipeline until one of the following occurs: * * (1) the I/O completes * (2) the pipeline stalls waiting for dependent child I/Os * (3) the I/O issues, so we're waiting for an I/O completion interrupt * (4) the I/O is delegated by vdev-level caching or aggregation * (5) the I/O is deferred due to vdev-level queueing * (6) the I/O is handed off to another thread. * * In all cases, the pipeline stops whenever there's no CPU work; it never * burns a thread in cv_wait(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; void zio_execute(zio_t *zio) { zio->io_executor = curthread; ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; int rv; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. 
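 * (ZIO_BLOCKING_STAGES is the set { DVA_ALLOCATE, DVA_CLAIM,
 * VDEV_IO_START }; see zio_impl.h.)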
*/ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; rv = zio_pipeline[highbit64(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) return; ASSERT(rv == ZIO_PIPELINE_CONTINUE); } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { int error; ASSERT3P(zio->io_stage, ==, ZIO_STAGE_OPEN); ASSERT3P(zio->io_executor, ==, NULL); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) cv_wait(&zio->io_cv, &zio->io_lock); mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && zio_unique_parent(zio) == NULL) { /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); } ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); zio_execute(zio); } /* * ========================================================================== * Reexecute, cancel, or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(zio_t *pio) { zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ zio_link_t *zl = NULL; for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); mutex_enter(&pio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); } /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the - * responsibility of the caller to wait on him. + * responsibility of the caller to wait on it. 
*/ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { pio->io_queued_timestamp = gethrtime(); zio_execute(pio); } } void zio_suspend(spa_t *spa, zio_t *zio) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = B_TRUE; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = B_FALSE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 
* zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated. This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static void zio_gang_issue_func_done(zio_t *zio) { abd_put(zio->io_abd); } static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } static zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio; if (gn != NULL) { abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { abd_t *buf = abd_get_offset(data, offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); abd_put(buf); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. 
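 * (ZIO_FLAG_INDUCE_DAMAGE comes from the fault-injection machinery,
 * e.g. zinject/ztest, to simulate corruption.)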
*/ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } /* ARGSUSED */ static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(zio->io_child_count == 0); if (zio->io_error) return; /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); abd_put(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. 
*/ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, offset); offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree && gio->io_abd != NULL) ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); } static int zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (ZIO_PIPELINE_CONTINUE); } static int zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { return (ZIO_PIPELINE_STOP); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 0); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (ZIO_PIPELINE_CONTINUE); } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); zio_t *gio = zio->io_gang_leader; dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static void zio_write_gang_done(zio_t *zio) { abd_put(zio->io_abd); } static int zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); zio_prop_t zp; int error; int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); flags |= METASLAB_ASYNC_ALLOC; VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); /* * The logical zio has already placed a reservation for * 'copies' allocation slots but gang blocks may require * additional copies. These additional copies * (i.e. gbh_copies - copies) are guaranteed to succeed * since metaslab_class_throttle_reserve() always allows * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, flags, &pio->io_alloc_list, pio); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); /* * If we failed to allocate the gang block header then * we remove any additional allocation reservations that * we placed here. The original reservation will * be removed when the logical I/O goes to the ready * stage. */ metaslab_class_throttle_unreserve(mc, gbh_copies - copies, pio); } pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. */ for (int g = 0; resid != 0; resid -= lsize, g++) { lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], abd_get_offset(pio->io_abd, pio->io_size - resid), lsize, lsize, &zp, zio_write_gang_member_ready, NULL, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); /* * Gang children won't throttle but we should * account for their work, so reserve an allocation * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, zp.zp_copies, cio, flags)); } zio_nowait(cio); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio_nowait(zio); return (ZIO_PIPELINE_CONTINUE); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. 
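 *
 * Only checksums whose zio_checksum_table entry carries
 * ZCHECKSUM_FLAG_NOPWRITE (the dedup-strength ones, e.g. sha256,
 * sha512, skein) can arm nopwrite; zio_nop_write() below rechecks
 * that flag before trusting a checksum match.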
*/ static int zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (ZIO_PIPELINE_CONTINUE); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, sizeof (uint64_t)) == 0); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ if (zio->io_error == 0 && dde->dde_repair_abd == NULL) dde->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } static int zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (ddp_self == NULL) return (ZIO_PIPELINE_CONTINUE); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (ZIO_PIPELINE_CONTINUE); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (ZIO_PIPELINE_CONTINUE); } static int zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { return (ZIO_PIPELINE_STOP); } ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if 
(zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (ZIO_PIPELINE_CONTINUE); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } if (dde->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (ZIO_PIPELINE_CONTINUE); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW); /* We should never get a raw, override zio */ ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. */ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd, zio->io_orig_size) != 0); } } for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); ddt_exit(ddt); /* * Intuitively, it would make more sense to compare * io_abd than io_orig_abd in the raw case since you * don't want to look at any transformations that have * happened to the data. However, for raw I/Os the * data will actually be the same in io_abd and * io_orig_abd, so all we have to do is issue this as * a raw ARC read. 
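 * In other words, for a ZIO_FLAG_RAW write no transforms were ever
 * pushed, so io_abd and io_orig_abd hold identical bytes; the
 * ASSERTs below spell out exactly that invariant.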
*/ if (do_raw) { zio_flags |= ZIO_FLAG_RAW; ASSERT3U(zio->io_size, ==, zio->io_orig_size); ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd, zio->io_size)); ASSERT3P(zio->io_transform_stack, ==, NULL); } error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zio->io_bookmark); if (error == 0) { if (arc_buf_size(abuf) != zio->io_orig_size || abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_ready(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; if (zio->io_error) return; ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, zio->io_bp); zio_link_t *zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); } static void zio_ddt_child_write_done(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { zio_link_t *zl = NULL; while (zio_walk_parents(zio, &zl) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); } ddt_exit(ddt); } static void zio_ddt_ditto_write_done(zio_t *zio) { int p = DDT_PHYS_DITTO; zio_prop_t *zp = &zio->io_prop; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_key_t *ddk = &dde->dde_key; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); if (ddp->ddp_phys_birth != 0) ddt_phys_free(ddt, ddk, ddp, zio->io_txg); ddt_phys_fill(ddp, bp); } ddt_exit(ddt); } static int zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; int ditto_copies; zio_t *cio = NULL; zio_t *dio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) 
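 *
 * In practice this is the dedup=verify case riding on a weak
 * checksum such as fletcher4: we switch to spa_dedup_checksum()
 * (sha256 by default), rewind to ZIO_STAGE_OPEN, and let the
 * pipeline checksum the data again.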
*/ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); } ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); ASSERT(ditto_copies < SPA_DVAS_PER_BP); if (ditto_copies > ddt_ditto_copies_present(dde) && dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { zio_prop_t czp = *zp; czp.zp_copies = ditto_copies; /* * If we arrived here with an override bp, we won't have run * the transform stack, so we won't have the data we need to * generate a child i/o. So, toss the override bp and restart. * This is safe, because using the override bp is just an * optimization; and it's rare, so the cost doesn't matter. */ if (zio->io_bp_override) { zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; zio->io_pipeline = ZIO_WRITE_PIPELINE; zio->io_bp_override = NULL; BP_ZERO(bp); ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; } if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); if (dde->dde_lead_zio[p] != NULL) zio_add_child(zio, dde->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { ASSERT(bp->blk_birth == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } ddt_exit(ddt); if (cio) zio_nowait(cio); if (dio) zio_nowait(dio); return (ZIO_PIPELINE_CONTINUE); } ddt_entry_t *freedde; /* for debugging */ static int zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); ddp = ddt_phys_select(dde, bp); ddt_phys_decref(ddp); ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static zio_t * zio_io_to_allocate(spa_t *spa) { zio_t *zio; ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); zio = avl_first(&spa->spa_alloc_tree); if (zio == NULL) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. 
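 * Callers queue under spa_alloc_lock: zio_dva_throttle() inserts the
 * zio into spa_alloc_tree (ordered by zio_bookmark_compare()), and
 * as completing I/Os release their reservations,
 * zio_allocate_dispatch() pulls the next zio from here.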
ddt_entry_t *freedde;	/* for debugging */

static int
zio_ddt_free(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	ddt_enter(ddt);
	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = ddt_phys_select(dde, bp);
	ddt_phys_decref(ddp);
	ddt_exit(ddt);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */

static zio_t *
zio_io_to_allocate(spa_t *spa)
{
	zio_t *zio;

	ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));

	zio = avl_first(&spa->spa_alloc_tree);
	if (zio == NULL)
		return (NULL);

	ASSERT(IO_IS_ALLOCATING(zio));

	/*
	 * Try to place a reservation for this zio.  If we're unable to
	 * reserve then we throttle.
	 */
	if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
	    zio->io_prop.zp_copies, zio, 0)) {
		return (NULL);
	}

	avl_remove(&spa->spa_alloc_tree, zio);
	ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);

	return (zio);
}

static int
zio_dva_throttle(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_t *nio;

	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
	    !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
	    zio->io_child_type == ZIO_CHILD_GANG ||
	    zio->io_flags & ZIO_FLAG_NODATA) {
		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	ASSERT3U(zio->io_queued_timestamp, >, 0);
	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);

	mutex_enter(&spa->spa_alloc_lock);

	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	avl_add(&spa->spa_alloc_tree, zio);

	nio = zio_io_to_allocate(zio->io_spa);
	mutex_exit(&spa->spa_alloc_lock);

	if (nio == zio)
		return (ZIO_PIPELINE_CONTINUE);

	if (nio != NULL) {
		ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
		/*
		 * We are passing control to a new zio so make sure that
		 * it is processed by a different thread.  We do this to
		 * avoid stack overflows that can occur when parents are
		 * throttled and children are making progress.  We allow
		 * it to go to the head of the taskq since it's already
		 * been waiting.
		 */
		zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
	}
	return (ZIO_PIPELINE_STOP);
}

void
zio_allocate_dispatch(spa_t *spa)
{
	zio_t *zio;

	mutex_enter(&spa->spa_alloc_lock);
	zio = zio_io_to_allocate(spa);
	mutex_exit(&spa->spa_alloc_lock);
	if (zio == NULL)
		return;

	ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
	ASSERT0(zio->io_error);
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
}

static int
zio_dva_allocate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	metaslab_class_t *mc = spa_normal_class(spa);
	blkptr_t *bp = zio->io_bp;
	int error;
	int flags = 0;

	if (zio->io_gang_leader == NULL) {
		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
		zio->io_gang_leader = zio;
	}

	ASSERT(BP_IS_HOLE(bp));
	ASSERT0(BP_GET_NDVAS(bp));
	ASSERT3U(zio->io_prop.zp_copies, >, 0);
	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	if (zio->io_flags & ZIO_FLAG_NODATA) {
		flags |= METASLAB_DONT_THROTTLE;
	}
	if (zio->io_flags & ZIO_FLAG_GANG_CHILD) {
		flags |= METASLAB_GANG_CHILD;
	}
	if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) {
		flags |= METASLAB_ASYNC_ALLOC;
	}

	error = metaslab_alloc(spa, mc, zio->io_size, bp,
	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
	    &zio->io_alloc_list, zio);

	if (error != 0) {
		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
		    error);
		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
			return (zio_write_gang_block(zio));
		zio->io_error = error;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_free(zio_t *zio)
{
	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_claim(zio_t *zio)
{
	int error;

	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
	if (error)
		zio->io_error = error;

	return (ZIO_PIPELINE_CONTINUE);
}
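/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): the allocation throttle above follows a reserve-or-queue
 * pattern -- an I/O either reserves an allocation slot and proceeds,
 * or parks in a sorted tree until zio_allocate_dispatch() pulls it out
 * when a slot frees up.  The minimal stand-alone model below uses a
 * counter and a simple stack instead of a refcount and an AVL tree
 * (wakeup order is simplified); all _example names are hypothetical.
 */
#define	EXAMPLE_MAX_SLOTS	4
#define	EXAMPLE_QUEUE_DEPTH	16

static int example_slots_in_use;
static int example_queue[EXAMPLE_QUEUE_DEPTH];
static int example_queue_len;

/* Returns 1 if the I/O may allocate now, 0 if it was queued. */
static int
throttle_reserve_example(int io_id)
{
	if (example_slots_in_use < EXAMPLE_MAX_SLOTS) {
		example_slots_in_use++;		/* reservation placed */
		return (1);
	}
	example_queue[example_queue_len++] = io_id;	/* throttle */
	return (0);
}

/* Called when an allocating I/O completes; hands its slot to a waiter. */
static int
throttle_release_example(void)
{
	example_slots_in_use--;
	if (example_queue_len == 0)
		return (-1);		/* nothing waiting */
	example_slots_in_use++;		/* hand the slot over */
	return (example_queue[--example_queue_len]);
}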
/*
 * Undo an allocation.  This is used by zio_done() when an I/O fails
 * and we want to give back the block we just allocated.
 * This handles both normal blocks and gang blocks.
 */
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
	ASSERT(zio->io_bp_override == NULL);

	if (!BP_IS_HOLE(bp))
		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);

	if (gn != NULL) {
		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			zio_dva_unallocate(zio, gn->gn_child[g],
			    &gn->gn_gbh->zg_blkptr[g]);
		}
	}
}

/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 */
int
zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t size, boolean_t *slog)
{
	int error = 1;
	zio_alloc_list_t io_alloc_list;

	ASSERT(txg > spa_syncing_txg(spa));

	metaslab_trace_init(&io_alloc_list);
	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
	    txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL);
	if (error == 0) {
		*slog = TRUE;
	} else {
		error = metaslab_alloc(spa, spa_normal_class(spa), size,
		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
		    &io_alloc_list, NULL);
		if (error == 0)
			*slog = FALSE;
	}
	metaslab_trace_fini(&io_alloc_list);

	if (error == 0) {
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(new_bp,
		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
		BP_SET_DEDUP(new_bp, 0);
		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
	} else {
		zfs_dbgmsg("%s: zil block allocation failure: "
		    "size %llu, error %d", spa_name(spa), size, error);
	}

	return (error);
}

/*
 * Free an intent log block.
 */
void
zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
{
	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
	ASSERT(!BP_IS_GANG(bp));

	zio_free(spa, txg, bp);
}
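/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): zio_alloc_zil() above tries the dedicated log class first
 * and falls back to the normal class, reporting through *slog which
 * class satisfied the request.  The generic try-preferred-then-fall-
 * back shape looks like the stand-alone helper below;
 * alloc_from_class_example() is a hypothetical stand-in for
 * metaslab_alloc().
 */
static int
alloc_from_class_example(int class_id, unsigned long long size)
{
	(void) class_id;
	(void) size;
	return (0);	/* pretend the allocation always succeeds */
}

static int
alloc_with_fallback_example(unsigned long long size, int *used_preferred)
{
	int error;

	error = alloc_from_class_example(0 /* log class */, size);
	if (error == 0) {
		*used_preferred = 1;
		return (0);
	}
	error = alloc_from_class_example(1 /* normal class */, size);
	if (error == 0)
		*used_preferred = 0;
	return (error);
}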
/*
 * ==========================================================================
 * Read, write and delete to physical devices
 * ==========================================================================
 */

/*
 * Issue an I/O to the underlying vdev.  Typically the issue pipeline
 * stops after this stage and will resume upon I/O completion.
 * However, there are instances where the vdev layer may need to
 * continue the pipeline when an I/O was not issued.  Since the I/O
 * that was sent to the vdev layer might be different than the one
 * currently active in the pipeline (see vdev_queue_io()), we explicitly
 * force the underlying vdev layers to call either zio_execute() or
 * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
 */
static int
zio_vdev_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	uint64_t align;
	spa_t *spa = zio->io_spa;
	int ret;

	ASSERT(zio->io_error == 0);
	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);

	if (vd == NULL) {
		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);

		/*
		 * The mirror_ops handle multiple DVAs in a single BP.
		 */
		vdev_mirror_ops.vdev_op_io_start(zio);
		return (ZIO_PIPELINE_STOP);
	}

	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
	    zio->io_priority == ZIO_PRIORITY_NOW) {
		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT3P(zio->io_logical, !=, zio);

	/*
	 * We keep track of time-sensitive I/Os so that the scan thread
	 * can quickly react to certain workloads.  In particular, we care
	 * about non-scrubbing, top-level reads and writes with the following
	 * characteristics:
	 *	- synchronous writes of user data to non-slog devices
	 *	- any reads of user data
	 * When these conditions are met, adjust the timestamp of spa_last_io
	 * which allows the scan thread to adjust its workload accordingly.
	 */
	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
	    vd == vd->vdev_top && !vd->vdev_islog &&
	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
	    zio->io_txg != spa_syncing_txg(spa)) {
		uint64_t old = spa->spa_last_io;
		uint64_t new = ddi_get_lbolt64();
		if (old != new)
			(void) atomic_cas_64(&spa->spa_last_io, old, new);
	}

	align = 1ULL << vd->vdev_top->vdev_ashift;

	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
	    P2PHASE(zio->io_size, align) != 0) {
		/* Transform logical writes to be a full physical block size. */
		uint64_t asize = P2ROUNDUP(zio->io_size, align);
		abd_t *abuf = NULL;
		if (zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_WRITE)
			abuf = abd_alloc_sametype(zio->io_abd, asize);
		ASSERT(vd == vd->vdev_top);
		if (zio->io_type == ZIO_TYPE_WRITE) {
			abd_copy(abuf, zio->io_abd, zio->io_size);
			abd_zero_off(abuf, zio->io_size,
			    asize - zio->io_size);
		}
		zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
		    zio_subblock);
	}

	/*
	 * If this is not a physical io, make sure that it is properly aligned
	 * before proceeding.
	 */
	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
		ASSERT0(P2PHASE(zio->io_offset, align));
		ASSERT0(P2PHASE(zio->io_size, align));
	} else {
		/*
		 * For the physical io we allow alignment
		 * to a logical block size.
		 */
		uint64_t log_align =
		    1ULL << vd->vdev_top->vdev_logical_ashift;
		ASSERT0(P2PHASE(zio->io_offset, log_align));
		ASSERT0(P2PHASE(zio->io_size, log_align));
	}

	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));

	/*
	 * If this is a repair I/O, and there's no self-healing involved --
	 * that is, we're just resilvering what we expect to resilver --
	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
	 * This prevents spurious resilvering with nested replication.
	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
	 * A is out of date, we'll read from C+D, then use the data to
	 * resilver A+B -- but we don't actually want to resilver B, just A.
	 * The top-level mirror has no way to know this, so instead we just
	 * discard unnecessary repairs as we work our way down the vdev tree.
	 * The same logic applies to any form of nested replication:
	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
	 */
	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
	    zio->io_txg != 0 &&	/* not a delegated i/o */
	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		zio_vdev_io_bypass(zio);
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (vd->vdev_ops->vdev_op_leaf) {
		switch (zio->io_type) {
		case ZIO_TYPE_READ:
			if (vdev_cache_read(zio))
				return (ZIO_PIPELINE_CONTINUE);
			/* FALLTHROUGH */
		case ZIO_TYPE_WRITE:
		case ZIO_TYPE_FREE:
			if ((zio = vdev_queue_io(zio)) == NULL)
				return (ZIO_PIPELINE_STOP);

			if (!vdev_accessible(vd, zio)) {
				zio->io_error = SET_ERROR(ENXIO);
				zio_interrupt(zio);
				return (ZIO_PIPELINE_STOP);
			}
			break;
		}
		/*
		 * Note that we ignore repair writes for TRIM because they can
		 * conflict with normal writes.  This isn't an issue because,
		 * by definition, we only repair blocks that aren't freed.
		 */
		if (zio->io_type == ZIO_TYPE_WRITE &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
		    !trim_map_write_start(zio))
			return (ZIO_PIPELINE_STOP);
	}

	vd->vdev_ops->vdev_op_io_start(zio);
	return (ZIO_PIPELINE_STOP);
}

static int
zio_vdev_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
	boolean_t unexpected_error = B_FALSE;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
		return (ZIO_PIPELINE_STOP);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    (zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE)) {

		if (zio->io_type == ZIO_TYPE_WRITE &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
			trim_map_write_done(zio);

		vdev_queue_io_done(zio);

		if (zio->io_type == ZIO_TYPE_WRITE)
			vdev_cache_write(zio);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_device_injection(vd,
			    zio, EIO);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_label_injection(zio, EIO);

		if (zio->io_error) {
			if (zio->io_error == ENOTSUP &&
			    zio->io_type == ZIO_TYPE_FREE) {
				/* Not all devices support TRIM. */
			} else if (!vdev_accessible(vd, zio)) {
				zio->io_error = SET_ERROR(ENXIO);
			} else {
				unexpected_error = B_TRUE;
			}
		}
	}

	ops->vdev_op_io_done(zio);

	if (unexpected_error)
		VERIFY(vdev_probe(vd, zio) == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * For non-raidz ZIOs, we can just copy aside the bad data read from the
 * disk, and use that to finish the checksum ereport later.
 */
static void
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
    const void *good_buf)
{
	/* no processing needed */
	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
}

/*ARGSUSED*/
void
zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr,
    void *ignored)
{
	void *buf = zio_buf_alloc(zio->io_size);

	abd_copy_to_buf(buf, zio->io_abd, zio->io_size);

	zcr->zcr_cbinfo = zio->io_size;
	zcr->zcr_cbdata = buf;
	zcr->zcr_finish = zio_vsd_default_cksum_finish;
	zcr->zcr_free = zio_buf_free;
}
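/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): the padding transform in zio_vdev_io_start() above rounds
 * a logical I/O up to the vdev's physical block size with P2ROUNDUP()
 * and zero-fills the tail.  The same power-of-two arithmetic, spelled
 * out with plain operators:
 */
static unsigned long long
roundup_pow2_example(unsigned long long size, unsigned long long align)
{
	/* align must be a power of two, e.g. 1ULL << ashift */
	return ((size + align - 1) & ~(align - 1));
}

/*
 * Example: with ashift = 12 (4 KB sectors), a 6 KB write has
 * roundup_pow2_example(6144, 4096) == 8192, so 2 KB of zeros are
 * appended before the write is issued.
 */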
static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
		return (ZIO_PIPELINE_STOP);
	}

	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
		spa_config_exit(zio->io_spa, SCL_ZIO, zio);

	if (zio->io_vsd != NULL) {
		zio->io_vsd_ops->vsd_free(zio);
		zio->io_vsd = NULL;
	}

	if (zio_injection_enabled && zio->io_error == 0)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	if (zio->io_type == ZIO_TYPE_FREE &&
	    zio->io_priority != ZIO_PRIORITY_NOW) {
		switch (zio->io_error) {
		case 0:
			ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
			ZIO_TRIM_STAT_BUMP(success);
			break;
		case EOPNOTSUPP:
			ZIO_TRIM_STAT_BUMP(unsupported);
			break;
		default:
			ZIO_TRIM_STAT_BUMP(failed);
			break;
		}
	}

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 *
	 * On retry, we cut in line in the issue queue, since we don't want
	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
	 */
	if (zio->io_error && vd == NULL &&
	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
		zio->io_error = 0;
		zio->io_flags |= ZIO_FLAG_IO_RETRY |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
		    zio_requeue_io_start_cut_in_line);
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * If we got an error on a leaf device, convert it to ENXIO
	 * if the device is not accessible at all.
	 */
	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    !vdev_accessible(vd, zio))
		zio->io_error = SET_ERROR(ENXIO);

	/*
	 * If we can't write to an interior vdev (mirror or RAID-Z),
	 * set vdev_cant_write so that we stop trying to allocate from it.
	 */
	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
		vd->vdev_cant_write = B_TRUE;
	}

	/*
	 * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
	 * attempts will ever succeed.  In this case we set a persistent bit so
	 * that we don't bother with it in the future.
	 */
	if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
	    zio->io_type == ZIO_TYPE_IOCTL &&
	    zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
		vd->vdev_nowritecache = B_TRUE;

	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    zio->io_physdone != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
		zio->io_physdone(zio->io_logical);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage >>= 1;
}

void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage >>= 1;
}

void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}
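/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): pipeline stages are one-hot bits assigned in execution
 * order, so "zio->io_stage >>= 1" rewinds the zio to just before its
 * current stage and the executor then re-enters that stage.  A minimal
 * model of the advance logic (the _EXAMPLE stage values are
 * hypothetical):
 */
#define	STAGE_OPEN_EXAMPLE	(1 << 0)
#define	STAGE_ISSUE_EXAMPLE	(1 << 1)
#define	STAGE_DONE_EXAMPLE	(1 << 2)

/*
 * Returns the next stage bit above 'stage' that is present in
 * 'pipeline' (or STAGE_DONE_EXAMPLE if none remain).  Rewinding
 * stage to STAGE_ISSUE_EXAMPLE >> 1 makes this return
 * STAGE_ISSUE_EXAMPLE again.
 */
static unsigned
next_stage_example(unsigned stage, unsigned pipeline)
{
	do {
		stage <<= 1;
	} while ((stage & pipeline) == 0 && stage < STAGE_DONE_EXAMPLE);
	return (stage);
}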
/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */
static int
zio_checksum_generate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum;

	if (bp == NULL) {
		/*
		 * This is zio_write_phys().
		 * We're either generating a label checksum, or none at all.
		 */
		checksum = zio->io_prop.zp_checksum;

		if (checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
	} else {
		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
			ASSERT(!IO_IS_ALLOCATING(zio));
			checksum = ZIO_CHECKSUM_GANG_HEADER;
		} else {
			checksum = BP_GET_CHECKSUM(bp);
		}
	}

	zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t info;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(zio->io_vd != NULL);

	if (bp == NULL) {
		/*
		 * This is zio_read_phys().
		 * We're either verifying a label checksum, or nothing at all.
		 */
		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
	}

	if ((error = zio_checksum_error(zio, &info)) != 0) {
		zio->io_error = error;
		if (error == ECKSUM &&
		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			zfs_ereport_start_checksum(zio->io_spa,
			    zio->io_vd, zio, zio->io_offset,
			    zio->io_size, NULL, &info);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}

/*
 * ==========================================================================
 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */
int
zio_worst_error(int e1, int e2)
{
	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
	int r1, r2;

	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
		if (e1 == zio_error_rank[r1])
			break;

	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
		if (e2 == zio_error_rank[r2])
			break;

	return (r1 > r2 ? e1 : e2);
}
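/*
 * Illustrative usage sketch (editorial addition, not part of the
 * original source): zio_worst_error() returns whichever argument ranks
 * later in { 0, ENXIO, ECKSUM, EIO, anything-else }.  Unknown errnos
 * fall off the end of zio_error_rank[] and therefore rank worst; when
 * both arguments rank equally, the second wins.
 */
static void
zio_worst_error_example(void)
{
	/* Any real error outranks success. */
	ASSERT3S(zio_worst_error(0, ENXIO), ==, ENXIO);
	/* Per-I/O errors (ECKSUM, EIO) outrank whole-device ENXIO. */
	ASSERT3S(zio_worst_error(ENXIO, ECKSUM), ==, ECKSUM);
	/* Unexpected errnos rank worst of all. */
	ASSERT3S(zio_worst_error(EIO, EINVAL), ==, EINVAL);
}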
/*
 * ==========================================================================
 * I/O completion
 * ==========================================================================
 */
static int
zio_ready(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	zio_t *pio, *pio_next;
	zio_link_t *zl = NULL;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
	    ZIO_WAIT_READY)) {
		return (ZIO_PIPELINE_STOP);
	}

	if (zio->io_ready) {
		ASSERT(IO_IS_ALLOCATING(zio));
		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);

		zio->io_ready(zio);
	}

	if (bp != NULL && bp != &zio->io_bp_copy)
		zio->io_bp_copy = *bp;

	if (zio->io_error != 0) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
			ASSERT(IO_IS_ALLOCATING(zio));
			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
			/*
			 * We were unable to allocate anything, unreserve and
			 * issue the next I/O to allocate.
			 */
			metaslab_class_throttle_unreserve(
			    spa_normal_class(zio->io_spa),
			    zio->io_prop.zp_copies, zio);
			zio_allocate_dispatch(zio->io_spa);
		}
	}

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_READY] = 1;
	pio = zio_walk_parents(zio, &zl);
	mutex_exit(&zio->io_lock);

	/*
	 * As we notify zio's parents, new parents could be added.
	 * New parents go to the head of zio's io_parent_list, however,
	 * so we will (correctly) not notify them.  The remainder of zio's
	 * io_parent_list, from 'pio_next' onward, cannot change because
	 * all parents must wait for us to be done before they can be done.
	 */
	for (; pio != NULL; pio = pio_next) {
		pio_next = zio_walk_parents(zio, &zl);
		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
	}

	if (zio->io_flags & ZIO_FLAG_NODATA) {
		if (BP_IS_GANG(bp)) {
			zio->io_flags &= ~ZIO_FLAG_NODATA;
		} else {
			ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		}
	}

	if (zio_injection_enabled &&
	    zio->io_spa->spa_syncing_txg == zio->io_txg)
		zio_handle_ignored_writes(zio);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Update the allocation throttle accounting.
 */
static void
zio_dva_throttle_done(zio_t *zio)
{
	zio_t *lio = zio->io_logical;
	zio_t *pio = zio_unique_parent(zio);
	vdev_t *vd = zio->io_vd;
	int flags = METASLAB_ASYNC_ALLOC;

	ASSERT3P(zio->io_bp, !=, NULL);
	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
	ASSERT(vd != NULL);
	ASSERT3P(vd, ==, vd->vdev_top);
	ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
	ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
	ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
	ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));

	/*
	 * Parents of gang children can have two flavors -- ones that
	 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
	 * and ones that allocated the constituent blocks.  The allocation
	 * throttle needs to know the allocating parent zio so we must find
	 * it here.
	 */
	if (pio->io_child_type == ZIO_CHILD_GANG) {
		/*
		 * If our parent is a rewrite gang child then our grandparent
		 * would have been the one that performed the allocation.
		 */
		if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
			pio = zio_unique_parent(pio);
		flags |= METASLAB_GANG_CHILD;
	}

	ASSERT(IO_IS_ALLOCATING(pio));
	ASSERT3P(zio, !=, zio->io_logical);
	ASSERT(zio->io_logical != NULL);
	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
	ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);

	mutex_enter(&pio->io_lock);
	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
	mutex_exit(&pio->io_lock);

	metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
	    1, pio);

	/*
	 * Call into the pipeline to see if there is more work that
	 * needs to be done.  If there is work to be done it will be
	 * dispatched to another taskq thread.
	 */
	zio_allocate_dispatch(zio->io_spa);
}
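/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): the parent-notification loop in zio_ready() above snapshots
 * the current head of the parent list under the lock and then walks
 * forward; parents added concurrently are prepended, so the walk never
 * sees them.  A minimal model with a singly linked list (all _example
 * names are hypothetical):
 */
typedef struct parent_example {
	struct parent_example *next;
	int notified;
} parent_example_t;

static void
notify_existing_parents_example(parent_example_t *head)
{
	parent_example_t *p, *p_next;

	/*
	 * 'head' was sampled under the lock; new parents are inserted
	 * before it, so they are (correctly) skipped by this walk.
	 */
	for (p = head; p != NULL; p = p_next) {
		p_next = p->next;	/* fetch before notifying */
		p->notified = 1;
	}
}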
static int
zio_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_t *lio = zio->io_logical;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	uint64_t psize = zio->io_size;
	zio_t *pio, *pio_next;
	metaslab_class_t *mc = spa_normal_class(spa);
	zio_link_t *zl = NULL;

	/*
	 * If our children haven't all completed,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * If the allocation throttle is enabled, then update the accounting.
	 * We only track child I/Os that are part of an allocating async
	 * write.  We must do this since the allocation is performed
	 * by the logical I/O but the actual write is done by child I/Os.
	 */
	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
	    zio->io_child_type == ZIO_CHILD_VDEV) {
		ASSERT(mc->mc_alloc_throttle_enabled);
		zio_dva_throttle_done(zio);
	}

	/*
	 * If the allocation throttle is enabled, verify that
	 * we have decremented the refcounts for every I/O that was throttled.
	 */
	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
		ASSERT(bp != NULL);
		metaslab_group_alloc_verify(spa, zio->io_bp, zio);
		VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio));
	}

	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			ASSERT(zio->io_children[c][w] == 0);

	if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
		    (bp == zio_unique_parent(zio)->io_bp));
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    zio->io_bp_override == NULL &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT3U(zio->io_prop.zp_copies, <=,
			    BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
	}

	/*
	 * If there were child vdev/gang/ddt errors, they apply to us now.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);

	/*
	 * If the I/O on the transformed data was successful, generate any
	 * checksum reports now while we still have the transformed data.
	 */
	if (zio->io_error == 0) {
		while (zio->io_cksum_report != NULL) {
			zio_cksum_report_t *zcr = zio->io_cksum_report;
			uint64_t align = zcr->zcr_align;
			uint64_t asize = P2ROUNDUP(psize, align);
			char *abuf = NULL;
			abd_t *adata = zio->io_abd;

			if (asize != psize) {
				adata = abd_alloc_linear(asize, B_TRUE);
				abd_copy(adata, zio->io_abd, psize);
				abd_zero_off(adata, psize, asize - psize);
			}

			if (adata != NULL)
				abuf = abd_borrow_buf_copy(adata, asize);

			zio->io_cksum_report = zcr->zcr_next;
			zcr->zcr_next = NULL;
			zcr->zcr_finish(zcr, abuf);
			zfs_ereport_free_checksum(zcr);

			if (adata != NULL)
				abd_return_buf(adata, abuf, asize);

			if (asize != psize)
				abd_free(adata);
		}
	}

	zio_pop_transforms(zio);	/* note: may set zio->io_error */

	vdev_stat_update(zio, psize);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO || !(zio->io_flags &
		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
		    zio == lio) {
			/*
			 * For logical I/O requests, tell the SPA to log the
			 * error and generate a logical data ereport.
			 */
			spa_log_error(spa, zio);
			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
			    0, 0);
		}
	}

	if (zio->io_error && zio == lio) {
		/*
		 * Determine whether zio should be reexecuted.  This will
		 * propagate all the way to the root via zio_notify_parent().
		 */
		ASSERT(vd == NULL && bp != NULL);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		if (IO_IS_ALLOCATING(zio) &&
		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
			if (zio->io_error != ENOSPC)
				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
			else
				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
		}

		if ((zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_FREE) &&
		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
		    zio->io_error == ENXIO &&
		    spa_load_state(spa) == SPA_LOAD_NONE &&
		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/*
		 * Here is a possibly good place to attempt to do
		 * either combinatorial reconstruction or error correction
		 * based on checksums.  It also might be a good place
		 * to send out preliminary ereports before we suspend
		 * processing.
		 */
	}

	/*
	 * If there were logical child errors, they apply to us now.
	 * We defer this until now to avoid conflating logical child
	 * errors with errors that happened to the zio itself when
	 * updating vdev stats and reporting FMA events above.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

	if ((zio->io_error || zio->io_reexecute) &&
	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
		zio_dva_unallocate(zio, zio->io_gang_tree, bp);

	zio_gang_tree_free(&zio->io_gang_tree);

	/*
	 * Godfather I/Os should never suspend.
	 */
	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
		zio->io_reexecute = 0;

	if (zio->io_reexecute) {
		/*
		 * This is a logical I/O that wants to reexecute.
		 *
		 * Reexecute is top-down.  When an i/o fails, if it's not
		 * the root, it simply notifies its parent and sticks around.
		 * The parent, seeing that it still has children in zio_done(),
		 * does the same.  This percolates all the way up to the root.
		 * The root i/o will reexecute or suspend the entire tree.
		 *
		 * This approach ensures that zio_reexecute() honors
		 * all the original i/o dependency relationships, e.g.
		 * parents not executing until children are ready.
		 */
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		zio->io_gang_leader = NULL;

		mutex_enter(&zio->io_lock);
		zio->io_state[ZIO_WAIT_DONE] = 1;
		mutex_exit(&zio->io_lock);

		/*
		 * "The Godfather" I/O monitors its children but is
		 * not a true parent to them.  It will track them through
		 * the pipeline but severs its ties whenever they get into
		 * trouble (e.g. suspended).  This allows "The Godfather"
		 * I/O to return status without blocking.
		 */
		zl = NULL;
		for (pio = zio_walk_parents(zio, &zl); pio != NULL;
		    pio = pio_next) {
			zio_link_t *remove_zl = zl;

			pio_next = zio_walk_parents(zio, &zl);
			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
				zio_remove_child(pio, zio, remove_zl);
				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
			}
		}

		if ((pio = zio_unique_parent(zio)) != NULL) {
			/*
			 * We're not a root i/o, so there's nothing to do
			 * but notify our parent.  Don't propagate errors
			 * upward since we haven't permanently failed yet.
			 */
			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
			/*
			 * We'd fail again if we reexecuted now, so suspend
			 * until conditions improve (e.g. device comes online).
			 */
			zio_suspend(spa, zio);
		} else {
			/*
			 * Reexecution is potentially a huge amount of work.
			 * Hand it off to the otherwise-unused claim taskq.
			 */
#if defined(illumos) || !defined(_KERNEL)
			ASSERT(zio->io_tqent.tqent_next == NULL);
#else
			ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
			spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
			    ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute,
			    zio, 0, &zio->io_tqent);
		}
		return (ZIO_PIPELINE_STOP);
	}

	ASSERT(zio->io_child_count == 0);
	ASSERT(zio->io_reexecute == 0);
	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

	/*
	 * Report any checksum errors, since the I/O is complete.
	 */
	while (zio->io_cksum_report != NULL) {
		zio_cksum_report_t *zcr = zio->io_cksum_report;

		zio->io_cksum_report = zcr->zcr_next;
		zcr->zcr_next = NULL;
		zcr->zcr_finish(zcr, NULL);
		zfs_ereport_free_checksum(zcr);
	}

	/*
	 * It is the responsibility of the done callback to ensure that this
	 * particular zio is no longer discoverable for adoption, and as
	 * such, cannot acquire any new parents.
	 */
	if (zio->io_done)
		zio->io_done(zio);

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_DONE] = 1;
	mutex_exit(&zio->io_lock);

	zl = NULL;
	for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
		zio_link_t *remove_zl = zl;
		pio_next = zio_walk_parents(zio, &zl);
		zio_remove_child(pio, zio, remove_zl);
		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
	}

	if (zio->io_waiter != NULL) {
		mutex_enter(&zio->io_lock);
		zio->io_executor = NULL;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		zio_destroy(zio);
	}

	return (ZIO_PIPELINE_STOP);
}

/*
 * ==========================================================================
 * I/O pipeline definition
 * ==========================================================================
 */
static zio_pipe_stage_t *zio_pipeline[] = {
	NULL,
	zio_read_bp_init,
	zio_write_bp_init,
	zio_free_bp_init,
	zio_issue_async,
	zio_write_compress,
	zio_checksum_generate,
	zio_nop_write,
	zio_ddt_read_start,
	zio_ddt_read_done,
	zio_ddt_write,
	zio_ddt_free,
	zio_gang_assemble,
	zio_gang_issue,
	zio_dva_throttle,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_checksum_verify,
	zio_done
};
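/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): zio_pipeline[] maps each one-hot stage bit, in order, to
 * its handler, so the executor can turn "next stage bit" into an
 * array index from the bit's position.  A minimal model of that
 * dispatch (the _example names are hypothetical):
 */
typedef int (*stage_func_example_t)(void *);

static int
dispatch_stage_example(unsigned stage_bit, stage_func_example_t *table,
    void *arg)
{
	int idx = 0;

	/* idx becomes the 0-based position of the single set bit. */
	while (stage_bit >>= 1)
		idx++;

	return (table[idx](arg));
}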
/*
 * Compare two zbookmark_phys_t's to see which we would reach first in a
 * pre-order traversal of the object tree.
 *
 * This is simple in every case aside from the meta-dnode object.  For all
 * other objects, we traverse them in order (object 1 before object 2, and
 * so on).  However, all of these objects are traversed while traversing
 * object 0, since the data it points to is the list of objects.  Thus, we
 * need to convert to a canonical representation so we can compare
 * meta-dnode bookmarks to non-meta-dnode bookmarks.
 *
 * We do this by calculating "equivalents" for each field of the zbookmark.
 * zbookmarks outside of the meta-dnode use their own object and level, and
 * calculate the level 0 equivalent (the first L0 blkid that is contained
 * in the blocks this bookmark refers to) by multiplying their blkid by
 * their span (the number of L0 blocks contained within one block at their
 * level).  zbookmarks inside the meta-dnode calculate their object
 * equivalent (which is L0equiv * dnodes per data block), use 0 for their
 * L0equiv, and use level + 1<<31 (any value larger than a level could ever
 * be) for their level.  This causes them to always compare before a
 * bookmark in their object equivalent, compare appropriately to bookmarks
 * in other objects, and to compare appropriately to other bookmarks in the
 * meta-dnode.
 */
int
zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
    const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
{
	/*
	 * These variables represent the "equivalent" values for the
	 * zbookmark, after converting zbookmarks inside the meta dnode
	 * to their normal-object equivalents.
	 */
	uint64_t zb1obj, zb2obj;
	uint64_t zb1L0, zb2L0;
	uint64_t zb1level, zb2level;

	if (zb1->zb_object == zb2->zb_object &&
	    zb1->zb_level == zb2->zb_level &&
	    zb1->zb_blkid == zb2->zb_blkid)
		return (0);

	/*
	 * BP_SPANB calculates the span in blocks.
	 */
	zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
	zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);

	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
		zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
		zb1L0 = 0;
		zb1level = zb1->zb_level + COMPARE_META_LEVEL;
	} else {
		zb1obj = zb1->zb_object;
		zb1level = zb1->zb_level;
	}

	if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
		zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
		zb2L0 = 0;
		zb2level = zb2->zb_level + COMPARE_META_LEVEL;
	} else {
		zb2obj = zb2->zb_object;
		zb2level = zb2->zb_level;
	}

	/* Now that we have a canonical representation, do the comparison. */
	if (zb1obj != zb2obj)
		return (zb1obj < zb2obj ? -1 : 1);
	else if (zb1L0 != zb2L0)
		return (zb1L0 < zb2L0 ? -1 : 1);
	else if (zb1level != zb2level)
		return (zb1level > zb2level ? -1 : 1);

	/*
	 * This can (theoretically) happen if the bookmarks have the same
	 * object and level, but different blkids, if the block sizes are
	 * not the same.  There is presently no way to change the indirect
	 * block sizes.
	 */
	return (0);
}
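/*
 * Illustrative worked example (editorial addition, not part of the
 * original source): with 128K indirect blocks (ibs = 17), each
 * indirect level holds 1024 block pointers, so a level-1 bookmark at
 * blkid 3 in an ordinary object spans 1024 L0 blocks and its L0
 * equivalent is 3 * 1024 = 3072.  The sketch below spells out that
 * arithmetic; span_example() mirrors what BP_SPANB() computes for a
 * bookmark at the given level (SPA_BLKPTRSHIFT is the log2 of the
 * 128-byte blkptr size).
 */
static uint64_t
span_example(uint8_t ibs, int level)
{
	/* Each indirect level multiplies the span by blkptrs-per-block. */
	uint64_t blkptrs_per_block = 1ULL << (ibs - SPA_BLKPTRSHIFT);
	uint64_t span = 1;

	while (level-- > 0)
		span *= blkptrs_per_block;
	return (span);		/* span_example(17, 1) == 1024 */
}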
/*
 * This function answers the following question: given that last_block is
 * the place our traversal stopped last time, are we guaranteed to have
 * visited every node under subtree_root?  The raw output of
 * zbookmark_compare cannot answer this by itself.  Instead we pass in a
 * modified version of subtree_root: by incrementing its block id and
 * checking whether last_block is at or before that point, we can tell
 * whether having visited last_block implies that all of subtree_root's
 * children have been visited.
 */
boolean_t
zbookmark_subtree_completed(const dnode_phys_t *dnp,
    const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
{
	zbookmark_phys_t mod_zb = *subtree_root;
	mod_zb.zb_blkid++;
	ASSERT(last_block->zb_level == 0);

	/* The objset_phys_t isn't before anything. */
	if (dnp == NULL)
		return (B_FALSE);

	/*
	 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
	 * data block size in sectors, because that variable is only used if
	 * the bookmark refers to a block in the meta-dnode.  Since we don't
	 * know without examining it what object it refers to, and there's
	 * no harm in passing in this value in other cases, we always pass
	 * it in.
	 *
	 * We pass in 0 for the indirect block size shift because zb2 must
	 * be level 0.  The indirect block size is only used to calculate
	 * the span of the bookmark, but since the bookmark must be level 0,
	 * the span is always 1, so the math works out.
	 *
	 * If you make changes to how the zbookmark_compare code works, be
	 * sure to verify that this code still works afterwards.
	 */
	return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
	    last_block) <= 0);
}
Index: head/sys/cddl/contrib/opensolaris
===================================================================
--- head/sys/cddl/contrib/opensolaris	(revision 329627)
+++ head/sys/cddl/contrib/opensolaris	(revision 329628)

Property changes on: head/sys/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /vendor-sys/illumos/dist:r316910