diff --git a/include/os/freebsd/spl/sys/sdt.h b/include/os/freebsd/spl/sys/sdt.h index e2c4830cb964..aa3688718ae7 100644 --- a/include/os/freebsd/spl/sys/sdt.h +++ b/include/os/freebsd/spl/sys/sdt.h @@ -1,46 +1,46 @@ /* * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_SDT_H_ #define _OPENSOLARIS_SYS_SDT_H_ #include_next #ifdef KDTRACE_HOOKS -/* BEGIN CSTYLED */ SDT_PROBE_DECLARE(sdt, , , set__error); +/* BEGIN CSTYLED */ #define SET_ERROR(err) ({ \ SDT_PROBE1(sdt, , , set__error, (uintptr_t)err); \ err; \ }) /* END CSTYLED */ #else #define SET_ERROR(err) (err) #endif #endif /* _OPENSOLARIS_SYS_SDT_H_ */ diff --git a/lib/libspl/atomic.c b/lib/libspl/atomic.c index 8cc350710ba0..f61f5fcc47f5 100644 --- a/lib/libspl/atomic.c +++ b/lib/libspl/atomic.c @@ -1,400 +1,372 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2009 by Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include /* * These are the void returning variants */ #define ATOMIC_INC(name, type) \ void atomic_inc_##name(volatile type *target) \ { \ (void) __atomic_add_fetch(target, 1, __ATOMIC_SEQ_CST); \ } -/* BEGIN CSTYLED */ ATOMIC_INC(8, uint8_t) ATOMIC_INC(16, uint16_t) ATOMIC_INC(32, uint32_t) ATOMIC_INC(64, uint64_t) ATOMIC_INC(uchar, uchar_t) ATOMIC_INC(ushort, ushort_t) ATOMIC_INC(uint, uint_t) ATOMIC_INC(ulong, ulong_t) -/* END CSTYLED */ #define ATOMIC_DEC(name, type) \ void atomic_dec_##name(volatile type *target) \ { \ (void) __atomic_sub_fetch(target, 1, __ATOMIC_SEQ_CST); \ } -/* BEGIN CSTYLED */ ATOMIC_DEC(8, uint8_t) ATOMIC_DEC(16, uint16_t) ATOMIC_DEC(32, uint32_t) ATOMIC_DEC(64, uint64_t) ATOMIC_DEC(uchar, uchar_t) ATOMIC_DEC(ushort, ushort_t) ATOMIC_DEC(uint, uint_t) ATOMIC_DEC(ulong, ulong_t) -/* END CSTYLED */ #define ATOMIC_ADD(name, type1, type2) \ void atomic_add_##name(volatile type1 *target, type2 bits) \ { \ (void) __atomic_add_fetch(target, bits, __ATOMIC_SEQ_CST); \ } void atomic_add_ptr(volatile void *target, ssize_t bits) { (void) __atomic_add_fetch((void **)target, bits, __ATOMIC_SEQ_CST); } -/* BEGIN CSTYLED */ ATOMIC_ADD(8, uint8_t, int8_t) ATOMIC_ADD(16, uint16_t, int16_t) ATOMIC_ADD(32, uint32_t, int32_t) ATOMIC_ADD(64, uint64_t, int64_t) ATOMIC_ADD(char, uchar_t, signed char) ATOMIC_ADD(short, ushort_t, short) ATOMIC_ADD(int, uint_t, int) ATOMIC_ADD(long, ulong_t, long) -/* END CSTYLED */ #define ATOMIC_SUB(name, type1, type2) \ void atomic_sub_##name(volatile type1 *target, type2 bits) \ { \ (void) __atomic_sub_fetch(target, bits, __ATOMIC_SEQ_CST); \ } void atomic_sub_ptr(volatile void *target, ssize_t bits) { (void) __atomic_sub_fetch((void **)target, bits, __ATOMIC_SEQ_CST); } -/* BEGIN CSTYLED */ ATOMIC_SUB(8, uint8_t, int8_t) ATOMIC_SUB(16, uint16_t, int16_t) ATOMIC_SUB(32, uint32_t, int32_t) ATOMIC_SUB(64, uint64_t, int64_t) ATOMIC_SUB(char, uchar_t, signed char) ATOMIC_SUB(short, ushort_t, short) ATOMIC_SUB(int, uint_t, int) ATOMIC_SUB(long, ulong_t, long) -/* END CSTYLED */ #define ATOMIC_OR(name, type) \ void atomic_or_##name(volatile type *target, type bits) \ { \ (void) __atomic_or_fetch(target, bits, __ATOMIC_SEQ_CST); \ } -/* BEGIN CSTYLED */ ATOMIC_OR(8, uint8_t) ATOMIC_OR(16, uint16_t) ATOMIC_OR(32, uint32_t) ATOMIC_OR(64, uint64_t) ATOMIC_OR(uchar, uchar_t) ATOMIC_OR(ushort, ushort_t) ATOMIC_OR(uint, uint_t) ATOMIC_OR(ulong, ulong_t) -/* END CSTYLED */ #define ATOMIC_AND(name, type) \ void atomic_and_##name(volatile type *target, type bits) \ { \ (void) __atomic_and_fetch(target, bits, __ATOMIC_SEQ_CST); \ } -/* BEGIN CSTYLED */ ATOMIC_AND(8, uint8_t) ATOMIC_AND(16, uint16_t) ATOMIC_AND(32, uint32_t) ATOMIC_AND(64, uint64_t) ATOMIC_AND(uchar, uchar_t) ATOMIC_AND(ushort, ushort_t) ATOMIC_AND(uint, uint_t) ATOMIC_AND(ulong, ulong_t) -/* END CSTYLED */ /* * New value returning variants */ #define ATOMIC_INC_NV(name, type) \ type atomic_inc_##name##_nv(volatile type *target) \ { \ return (__atomic_add_fetch(target, 1, __ATOMIC_SEQ_CST)); \ } -/* BEGIN CSTYLED */ ATOMIC_INC_NV(8, uint8_t) ATOMIC_INC_NV(16, uint16_t) ATOMIC_INC_NV(32, uint32_t) ATOMIC_INC_NV(64, uint64_t) ATOMIC_INC_NV(uchar, uchar_t) ATOMIC_INC_NV(ushort, ushort_t) ATOMIC_INC_NV(uint, uint_t) ATOMIC_INC_NV(ulong, ulong_t) -/* END CSTYLED */ #define ATOMIC_DEC_NV(name, type) \ type atomic_dec_##name##_nv(volatile type *target) \ { \ return (__atomic_sub_fetch(target, 1, __ATOMIC_SEQ_CST)); \ } -/* BEGIN CSTYLED */ ATOMIC_DEC_NV(8, uint8_t) ATOMIC_DEC_NV(16, uint16_t) ATOMIC_DEC_NV(32, uint32_t) ATOMIC_DEC_NV(64, uint64_t) ATOMIC_DEC_NV(uchar, uchar_t) ATOMIC_DEC_NV(ushort, ushort_t) ATOMIC_DEC_NV(uint, uint_t) ATOMIC_DEC_NV(ulong, ulong_t) -/* END CSTYLED */ #define ATOMIC_ADD_NV(name, type1, type2) \ type1 atomic_add_##name##_nv(volatile type1 *target, type2 bits) \ { \ return (__atomic_add_fetch(target, bits, __ATOMIC_SEQ_CST)); \ } void * atomic_add_ptr_nv(volatile void *target, ssize_t bits) { return (__atomic_add_fetch((void **)target, bits, __ATOMIC_SEQ_CST)); } -/* BEGIN CSTYLED */ ATOMIC_ADD_NV(8, uint8_t, int8_t) ATOMIC_ADD_NV(16, uint16_t, int16_t) ATOMIC_ADD_NV(32, uint32_t, int32_t) ATOMIC_ADD_NV(64, uint64_t, int64_t) ATOMIC_ADD_NV(char, uchar_t, signed char) ATOMIC_ADD_NV(short, ushort_t, short) ATOMIC_ADD_NV(int, uint_t, int) ATOMIC_ADD_NV(long, ulong_t, long) -/* END CSTYLED */ #define ATOMIC_SUB_NV(name, type1, type2) \ type1 atomic_sub_##name##_nv(volatile type1 *target, type2 bits) \ { \ return (__atomic_sub_fetch(target, bits, __ATOMIC_SEQ_CST)); \ } void * atomic_sub_ptr_nv(volatile void *target, ssize_t bits) { return (__atomic_sub_fetch((void **)target, bits, __ATOMIC_SEQ_CST)); } -/* BEGIN CSTYLED */ ATOMIC_SUB_NV(8, uint8_t, int8_t) ATOMIC_SUB_NV(char, uchar_t, signed char) ATOMIC_SUB_NV(16, uint16_t, int16_t) ATOMIC_SUB_NV(short, ushort_t, short) ATOMIC_SUB_NV(32, uint32_t, int32_t) ATOMIC_SUB_NV(int, uint_t, int) ATOMIC_SUB_NV(long, ulong_t, long) ATOMIC_SUB_NV(64, uint64_t, int64_t) -/* END CSTYLED */ #define ATOMIC_OR_NV(name, type) \ type atomic_or_##name##_nv(volatile type *target, type bits) \ { \ return (__atomic_or_fetch(target, bits, __ATOMIC_SEQ_CST)); \ } -/* BEGIN CSTYLED */ ATOMIC_OR_NV(8, uint8_t) ATOMIC_OR_NV(16, uint16_t) ATOMIC_OR_NV(32, uint32_t) ATOMIC_OR_NV(64, uint64_t) ATOMIC_OR_NV(uchar, uchar_t) ATOMIC_OR_NV(ushort, ushort_t) ATOMIC_OR_NV(uint, uint_t) ATOMIC_OR_NV(ulong, ulong_t) -/* END CSTYLED */ #define ATOMIC_AND_NV(name, type) \ type atomic_and_##name##_nv(volatile type *target, type bits) \ { \ return (__atomic_and_fetch(target, bits, __ATOMIC_SEQ_CST)); \ } -/* BEGIN CSTYLED */ ATOMIC_AND_NV(8, uint8_t) ATOMIC_AND_NV(16, uint16_t) ATOMIC_AND_NV(32, uint32_t) ATOMIC_AND_NV(64, uint64_t) ATOMIC_AND_NV(uchar, uchar_t) ATOMIC_AND_NV(ushort, ushort_t) ATOMIC_AND_NV(uint, uint_t) ATOMIC_AND_NV(ulong, ulong_t) -/* END CSTYLED */ /* * If *tgt == exp, set *tgt = des; return old value * * This may not look right on the first pass (or the sixteenth), but, * from https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html: * > If they are not equal, the operation is a read * > and the current contents of *ptr are written into *expected. * And, in the converse case, exp is already *target by definition. */ #define ATOMIC_CAS(name, type) \ type atomic_cas_##name(volatile type *target, type exp, type des) \ { \ __atomic_compare_exchange_n(target, &exp, des, B_FALSE, \ __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); \ return (exp); \ } void * atomic_cas_ptr(volatile void *target, void *exp, void *des) { __atomic_compare_exchange_n((void **)target, &exp, des, B_FALSE, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); return (exp); } -/* BEGIN CSTYLED */ ATOMIC_CAS(8, uint8_t) ATOMIC_CAS(16, uint16_t) ATOMIC_CAS(32, uint32_t) ATOMIC_CAS(64, uint64_t) ATOMIC_CAS(uchar, uchar_t) ATOMIC_CAS(ushort, ushort_t) ATOMIC_CAS(uint, uint_t) ATOMIC_CAS(ulong, ulong_t) -/* END CSTYLED */ /* * Swap target and return old value */ #define ATOMIC_SWAP(name, type) \ type atomic_swap_##name(volatile type *target, type bits) \ { \ return (__atomic_exchange_n(target, bits, __ATOMIC_SEQ_CST)); \ } -/* BEGIN CSTYLED */ ATOMIC_SWAP(8, uint8_t) ATOMIC_SWAP(16, uint16_t) ATOMIC_SWAP(32, uint32_t) ATOMIC_SWAP(64, uint64_t) ATOMIC_SWAP(uchar, uchar_t) ATOMIC_SWAP(ushort, ushort_t) ATOMIC_SWAP(uint, uint_t) ATOMIC_SWAP(ulong, ulong_t) -/* END CSTYLED */ void * atomic_swap_ptr(volatile void *target, void *bits) { return (__atomic_exchange_n((void **)target, bits, __ATOMIC_SEQ_CST)); } #ifndef _LP64 uint64_t atomic_load_64(volatile uint64_t *target) { return (__atomic_load_n(target, __ATOMIC_RELAXED)); } void atomic_store_64(volatile uint64_t *target, uint64_t bits) { return (__atomic_store_n(target, bits, __ATOMIC_RELAXED)); } #endif int atomic_set_long_excl(volatile ulong_t *target, uint_t value) { ulong_t bit = 1UL << value; ulong_t old = __atomic_fetch_or(target, bit, __ATOMIC_SEQ_CST); return ((old & bit) ? -1 : 0); } int atomic_clear_long_excl(volatile ulong_t *target, uint_t value) { ulong_t bit = 1UL << value; ulong_t old = __atomic_fetch_and(target, ~bit, __ATOMIC_SEQ_CST); return ((old & bit) ? 0 : -1); } void membar_enter(void) { __atomic_thread_fence(__ATOMIC_SEQ_CST); } void membar_exit(void) { __atomic_thread_fence(__ATOMIC_SEQ_CST); } void membar_sync(void) { __atomic_thread_fence(__ATOMIC_SEQ_CST); } void membar_producer(void) { __atomic_thread_fence(__ATOMIC_RELEASE); } void membar_consumer(void) { __atomic_thread_fence(__ATOMIC_ACQUIRE); } diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c index 887f7d32df4a..9034873474fe 100644 --- a/module/nvpair/nvpair.c +++ b/module/nvpair/nvpair.c @@ -1,3797 +1,3795 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015, 2017 by Delphix. All rights reserved. * Copyright 2018 RackTop Systems. */ /* * Links to Illumos.org for more information on Interface Libraries: * [1] https://illumos.org/man/3lib/libnvpair * [2] https://illumos.org/man/3nvpair/nvlist_alloc * [3] https://illumos.org/man/9f/nvlist_alloc * [4] https://illumos.org/man/9f/nvlist_next_nvpair * [5] https://illumos.org/man/9f/nvpair_value_byte */ #include #include #include #include #include #include #include #include #include #include #if defined(_KERNEL) #include #include #else #include #include #include #endif #define skip_whitespace(p) while ((*(p) == ' ') || (*(p) == '\t')) (p)++ /* * nvpair.c - Provides kernel & userland interfaces for manipulating * name-value pairs. * * Overview Diagram * * +--------------+ * | nvlist_t | * |--------------| * | nvl_version | * | nvl_nvflag | * | nvl_priv -+-+ * | nvl_flag | | * | nvl_pad | | * +--------------+ | * V * +--------------+ last i_nvp in list * | nvpriv_t | +---------------------> * |--------------| | * +--+- nvp_list | | +------------+ * | | nvp_last -+--+ + nv_alloc_t | * | | nvp_curr | |------------| * | | nvp_nva -+----> | nva_ops | * | | nvp_stat | | nva_arg | * | +--------------+ +------------+ * | * +-------+ * V * +---------------------+ +-------------------+ * | i_nvp_t | +-->| i_nvp_t | +--> * |---------------------| | |-------------------| | * | nvi_next -+--+ | nvi_next -+--+ * | nvi_prev (NULL) | <----+ nvi_prev | * | . . . . . . . . . . | | . . . . . . . . . | * | nvp (nvpair_t) | | nvp (nvpair_t) | * | - nvp_size | | - nvp_size | * | - nvp_name_sz | | - nvp_name_sz | * | - nvp_value_elem | | - nvp_value_elem | * | - nvp_type | | - nvp_type | * | - data ... | | - data ... | * +---------------------+ +-------------------+ * * * * +---------------------+ +---------------------+ * | i_nvp_t | +--> +-->| i_nvp_t (last) | * |---------------------| | | |---------------------| * | nvi_next -+--+ ... --+ | nvi_next (NULL) | * <-+- nvi_prev |<-- ... <----+ nvi_prev | * | . . . . . . . . . | | . . . . . . . . . | * | nvp (nvpair_t) | | nvp (nvpair_t) | * | - nvp_size | | - nvp_size | * | - nvp_name_sz | | - nvp_name_sz | * | - nvp_value_elem | | - nvp_value_elem | * | - DATA_TYPE_NVLIST | | - nvp_type | * | - data (embedded) | | - data ... | * | nvlist name | +---------------------+ * | +--------------+ | * | | nvlist_t | | * | |--------------| | * | | nvl_version | | * | | nvl_nvflag | | * | | nvl_priv --+---+----> * | | nvl_flag | | * | | nvl_pad | | * | +--------------+ | * +---------------------+ * * * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will * allow value to be aligned on 8 byte boundary * * name_len is the length of the name string including the null terminator * so it must be >= 1 */ #define NVP_SIZE_CALC(name_len, data_len) \ (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len)) static int i_get_value_size(data_type_t type, const void *data, uint_t nelem); static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type, uint_t nelem, const void *data); #define NV_STAT_EMBEDDED 0x1 #define EMBEDDED_NVL(nvp) ((nvlist_t *)(void *)NVP_VALUE(nvp)) #define EMBEDDED_NVL_ARRAY(nvp) ((nvlist_t **)(void *)NVP_VALUE(nvp)) #define NVP_VALOFF(nvp) (NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz)) #define NVPAIR2I_NVP(nvp) \ ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp))) #ifdef _KERNEL static const int nvpair_max_recursion = 20; #else static const int nvpair_max_recursion = 100; #endif static const uint64_t nvlist_hashtable_init_size = (1 << 4); int nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...) { va_list valist; int err = 0; nva->nva_ops = nvo; nva->nva_arg = NULL; va_start(valist, nvo); if (nva->nva_ops->nv_ao_init != NULL) err = nva->nva_ops->nv_ao_init(nva, valist); va_end(valist); return (err); } void nv_alloc_reset(nv_alloc_t *nva) { if (nva->nva_ops->nv_ao_reset != NULL) nva->nva_ops->nv_ao_reset(nva); } void nv_alloc_fini(nv_alloc_t *nva) { if (nva->nva_ops->nv_ao_fini != NULL) nva->nva_ops->nv_ao_fini(nva); } nv_alloc_t * nvlist_lookup_nv_alloc(nvlist_t *nvl) { nvpriv_t *priv; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (NULL); return (priv->nvp_nva); } static void * nv_mem_zalloc(nvpriv_t *nvp, size_t size) { nv_alloc_t *nva = nvp->nvp_nva; void *buf; if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL) memset(buf, 0, size); return (buf); } static void nv_mem_free(nvpriv_t *nvp, void *buf, size_t size) { nv_alloc_t *nva = nvp->nvp_nva; nva->nva_ops->nv_ao_free(nva, buf, size); } static void nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat) { memset(priv, 0, sizeof (nvpriv_t)); priv->nvp_nva = nva; priv->nvp_stat = stat; } static nvpriv_t * nv_priv_alloc(nv_alloc_t *nva) { nvpriv_t *priv; /* * nv_mem_alloc() cannot called here because it needs the priv * argument. */ if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL) return (NULL); nv_priv_init(priv, nva, 0); return (priv); } /* * Embedded lists need their own nvpriv_t's. We create a new * nvpriv_t using the parameters and allocator from the parent * list's nvpriv_t. */ static nvpriv_t * nv_priv_alloc_embedded(nvpriv_t *priv) { nvpriv_t *emb_priv; if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL) return (NULL); nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED); return (emb_priv); } static int nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets) { ASSERT3P(priv->nvp_hashtable, ==, NULL); ASSERT0(priv->nvp_nbuckets); ASSERT0(priv->nvp_nentries); i_nvp_t **tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *)); if (tab == NULL) return (ENOMEM); priv->nvp_hashtable = tab; priv->nvp_nbuckets = buckets; return (0); } static void nvt_tab_free(nvpriv_t *priv) { i_nvp_t **tab = priv->nvp_hashtable; if (tab == NULL) { ASSERT0(priv->nvp_nbuckets); ASSERT0(priv->nvp_nentries); return; } nv_mem_free(priv, tab, priv->nvp_nbuckets * sizeof (i_nvp_t *)); priv->nvp_hashtable = NULL; priv->nvp_nbuckets = 0; priv->nvp_nentries = 0; } static uint32_t nvt_hash(const char *p) { uint32_t g, hval = 0; while (*p) { hval = (hval << 4) + *p++; if ((g = (hval & 0xf0000000)) != 0) hval ^= g >> 24; hval &= ~g; } return (hval); } static boolean_t nvt_nvpair_match(const nvpair_t *nvp1, const nvpair_t *nvp2, uint32_t nvflag) { boolean_t match = B_FALSE; if (nvflag & NV_UNIQUE_NAME_TYPE) { if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0 && NVP_TYPE(nvp1) == NVP_TYPE(nvp2)) match = B_TRUE; } else { ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME); if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0) match = B_TRUE; } return (match); } static nvpair_t * nvt_lookup_name_type(const nvlist_t *nvl, const char *name, data_type_t type) { const nvpriv_t *priv = (const nvpriv_t *)(uintptr_t)nvl->nvl_priv; ASSERT(priv != NULL); i_nvp_t **tab = priv->nvp_hashtable; if (tab == NULL) { ASSERT3P(priv->nvp_list, ==, NULL); ASSERT0(priv->nvp_nbuckets); ASSERT0(priv->nvp_nentries); return (NULL); } else { ASSERT(priv->nvp_nbuckets != 0); } uint64_t hash = nvt_hash(name); uint64_t index = hash & (priv->nvp_nbuckets - 1); ASSERT3U(index, <, priv->nvp_nbuckets); i_nvp_t *entry = tab[index]; for (i_nvp_t *e = entry; e != NULL; e = e->nvi_hashtable_next) { if (strcmp(NVP_NAME(&e->nvi_nvp), name) == 0 && (type == DATA_TYPE_DONTCARE || NVP_TYPE(&e->nvi_nvp) == type)) return (&e->nvi_nvp); } return (NULL); } static nvpair_t * nvt_lookup_name(const nvlist_t *nvl, const char *name) { return (nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE)); } static int nvt_resize(nvpriv_t *priv, uint32_t new_size) { i_nvp_t **tab = priv->nvp_hashtable; /* * Migrate all the entries from the current table * to a newly-allocated table with the new size by * re-adjusting the pointers of their entries. */ uint32_t size = priv->nvp_nbuckets; uint32_t new_mask = new_size - 1; ASSERT(ISP2(new_size)); i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *)); if (new_tab == NULL) return (ENOMEM); uint32_t nentries = 0; for (uint32_t i = 0; i < size; i++) { i_nvp_t *next, *e = tab[i]; while (e != NULL) { next = e->nvi_hashtable_next; uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp)); uint32_t index = hash & new_mask; e->nvi_hashtable_next = new_tab[index]; new_tab[index] = e; nentries++; e = next; } tab[i] = NULL; } ASSERT3U(nentries, ==, priv->nvp_nentries); nvt_tab_free(priv); priv->nvp_hashtable = new_tab; priv->nvp_nbuckets = new_size; priv->nvp_nentries = nentries; return (0); } static boolean_t nvt_needs_togrow(nvpriv_t *priv) { /* * Grow only when we have more elements than buckets * and the # of buckets doesn't overflow. */ return (priv->nvp_nentries > priv->nvp_nbuckets && (UINT32_MAX >> 1) >= priv->nvp_nbuckets); } /* * Allocate a new table that's twice the size of the old one, * and migrate all the entries from the old one to the new * one by re-adjusting their pointers. */ static int nvt_grow(nvpriv_t *priv) { uint32_t current_size = priv->nvp_nbuckets; /* ensure we won't overflow */ ASSERT3U(UINT32_MAX >> 1, >=, current_size); return (nvt_resize(priv, current_size << 1)); } static boolean_t nvt_needs_toshrink(nvpriv_t *priv) { /* * Shrink only when the # of elements is less than or * equal to 1/4 the # of buckets. Never shrink less than * nvlist_hashtable_init_size. */ ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size); if (priv->nvp_nbuckets == nvlist_hashtable_init_size) return (B_FALSE); return (priv->nvp_nentries <= (priv->nvp_nbuckets >> 2)); } /* * Allocate a new table that's half the size of the old one, * and migrate all the entries from the old one to the new * one by re-adjusting their pointers. */ static int nvt_shrink(nvpriv_t *priv) { uint32_t current_size = priv->nvp_nbuckets; /* ensure we won't overflow */ ASSERT3U(current_size, >=, nvlist_hashtable_init_size); return (nvt_resize(priv, current_size >> 1)); } static int nvt_remove_nvpair(nvlist_t *nvl, const nvpair_t *nvp) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; if (nvt_needs_toshrink(priv)) { int err = nvt_shrink(priv); if (err != 0) return (err); } i_nvp_t **tab = priv->nvp_hashtable; const char *name = NVP_NAME(nvp); uint64_t hash = nvt_hash(name); uint64_t index = hash & (priv->nvp_nbuckets - 1); ASSERT3U(index, <, priv->nvp_nbuckets); i_nvp_t *bucket = tab[index]; for (i_nvp_t *prev = NULL, *e = bucket; e != NULL; prev = e, e = e->nvi_hashtable_next) { if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_nvflag)) { if (prev != NULL) { prev->nvi_hashtable_next = e->nvi_hashtable_next; } else { ASSERT3P(e, ==, bucket); tab[index] = e->nvi_hashtable_next; } e->nvi_hashtable_next = NULL; priv->nvp_nentries--; break; } } return (0); } static int nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; /* initialize nvpair table now if it doesn't exist. */ if (priv->nvp_hashtable == NULL) { int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size); if (err != 0) return (err); } /* * if we don't allow duplicate entries, make sure to * unlink any existing entries from the table. */ if (nvl->nvl_nvflag != 0) { int err = nvt_remove_nvpair(nvl, nvp); if (err != 0) return (err); } if (nvt_needs_togrow(priv)) { int err = nvt_grow(priv); if (err != 0) return (err); } i_nvp_t **tab = priv->nvp_hashtable; const char *name = NVP_NAME(nvp); uint64_t hash = nvt_hash(name); uint64_t index = hash & (priv->nvp_nbuckets - 1); ASSERT3U(index, <, priv->nvp_nbuckets); // cppcheck-suppress nullPointerRedundantCheck i_nvp_t *bucket = tab[index]; /* insert link at the beginning of the bucket */ i_nvp_t *new_entry = NVPAIR2I_NVP(nvp); ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL); new_entry->nvi_hashtable_next = bucket; // cppcheck-suppress nullPointerRedundantCheck tab[index] = new_entry; priv->nvp_nentries++; return (0); } static void nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv) { nvl->nvl_version = NV_VERSION; nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE); nvl->nvl_priv = (uint64_t)(uintptr_t)priv; nvl->nvl_flag = 0; nvl->nvl_pad = 0; } uint_t nvlist_nvflag(nvlist_t *nvl) { return (nvl->nvl_nvflag); } static nv_alloc_t * nvlist_nv_alloc(int kmflag) { #if defined(_KERNEL) switch (kmflag) { case KM_SLEEP: return (nv_alloc_sleep); case KM_NOSLEEP: return (nv_alloc_nosleep); default: return (nv_alloc_pushpage); } #else (void) kmflag; return (nv_alloc_nosleep); #endif /* _KERNEL */ } /* * nvlist_alloc - Allocate nvlist. */ int nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag) { return (nvlist_xalloc(nvlp, nvflag, nvlist_nv_alloc(kmflag))); } int nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva) { nvpriv_t *priv; if (nvlp == NULL || nva == NULL) return (EINVAL); if ((priv = nv_priv_alloc(nva)) == NULL) return (ENOMEM); if ((*nvlp = nv_mem_zalloc(priv, NV_ALIGN(sizeof (nvlist_t)))) == NULL) { nv_mem_free(priv, priv, sizeof (nvpriv_t)); return (ENOMEM); } nvlist_init(*nvlp, nvflag, priv); return (0); } /* * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair. */ static nvpair_t * nvp_buf_alloc(nvlist_t *nvl, size_t len) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *buf; nvpair_t *nvp; size_t nvsize; /* * Allocate the buffer */ nvsize = len + offsetof(i_nvp_t, nvi_nvp); if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL) return (NULL); nvp = &buf->nvi_nvp; nvp->nvp_size = len; return (nvp); } /* * nvp_buf_free - de-Allocate an i_nvp_t. */ static void nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp); nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize); } /* * nvp_buf_link - link a new nv pair into the nvlist. */ static void nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *curr = NVPAIR2I_NVP(nvp); /* Put element at end of nvlist */ if (priv->nvp_list == NULL) { priv->nvp_list = priv->nvp_last = curr; } else { curr->nvi_prev = priv->nvp_last; priv->nvp_last->nvi_next = curr; priv->nvp_last = curr; } } /* * nvp_buf_unlink - unlink an removed nvpair out of the nvlist. */ static void nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *curr = NVPAIR2I_NVP(nvp); /* * protect nvlist_next_nvpair() against walking on freed memory. */ if (priv->nvp_curr == curr) priv->nvp_curr = curr->nvi_next; if (curr == priv->nvp_list) priv->nvp_list = curr->nvi_next; else curr->nvi_prev->nvi_next = curr->nvi_next; if (curr == priv->nvp_last) priv->nvp_last = curr->nvi_prev; else curr->nvi_next->nvi_prev = curr->nvi_prev; } /* * take a nvpair type and number of elements and make sure the are valid */ static int i_validate_type_nelem(data_type_t type, uint_t nelem) { switch (type) { case DATA_TYPE_BOOLEAN: if (nelem != 0) return (EINVAL); break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: case DATA_TYPE_INT16: case DATA_TYPE_UINT16: case DATA_TYPE_INT32: case DATA_TYPE_UINT32: case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_STRING: case DATA_TYPE_HRTIME: case DATA_TYPE_NVLIST: #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: #endif if (nelem != 1) return (EINVAL); break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: case DATA_TYPE_INT16_ARRAY: case DATA_TYPE_UINT16_ARRAY: case DATA_TYPE_INT32_ARRAY: case DATA_TYPE_UINT32_ARRAY: case DATA_TYPE_INT64_ARRAY: case DATA_TYPE_UINT64_ARRAY: case DATA_TYPE_STRING_ARRAY: case DATA_TYPE_NVLIST_ARRAY: /* we allow arrays with 0 elements */ break; default: return (EINVAL); } return (0); } /* * Verify nvp_name_sz and check the name string length. */ static int i_validate_nvpair_name(nvpair_t *nvp) { if ((nvp->nvp_name_sz <= 0) || (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0))) return (EFAULT); /* verify the name string, make sure its terminated */ if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0') return (EFAULT); return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 0 : EFAULT); } static int i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data) { switch (type) { case DATA_TYPE_BOOLEAN_VALUE: if (*(boolean_t *)data != B_TRUE && *(boolean_t *)data != B_FALSE) return (EINVAL); break; case DATA_TYPE_BOOLEAN_ARRAY: { int i; for (i = 0; i < nelem; i++) if (((boolean_t *)data)[i] != B_TRUE && ((boolean_t *)data)[i] != B_FALSE) return (EINVAL); break; } default: break; } return (0); } /* * This function takes a pointer to what should be a nvpair and it's size * and then verifies that all the nvpair fields make sense and can be * trusted. This function is used when decoding packed nvpairs. */ static int i_validate_nvpair(nvpair_t *nvp) { data_type_t type = NVP_TYPE(nvp); int size1, size2; /* verify nvp_name_sz, check the name string length */ if (i_validate_nvpair_name(nvp) != 0) return (EFAULT); if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0) return (EFAULT); /* * verify nvp_type, nvp_value_elem, and also possibly * verify string values and get the value size. */ size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp)); size1 = nvp->nvp_size - NVP_VALOFF(nvp); if (size2 < 0 || size1 != NV_ALIGN(size2)) return (EFAULT); return (0); } static int nvlist_copy_pairs(const nvlist_t *snvl, nvlist_t *dnvl) { const nvpriv_t *priv; const i_nvp_t *curr; if ((priv = (const nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL) return (EINVAL); for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { const nvpair_t *nvp = &curr->nvi_nvp; int err; if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp), NVP_TYPE(nvp), NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0) return (err); } return (0); } /* * Frees all memory allocated for an nvpair (like embedded lists) with * the exception of the nvpair buffer itself. */ static void nvpair_free(nvpair_t *nvp) { switch (NVP_TYPE(nvp)) { case DATA_TYPE_NVLIST: nvlist_free(EMBEDDED_NVL(nvp)); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); int i; for (i = 0; i < NVP_NELEM(nvp); i++) if (nvlp[i] != NULL) nvlist_free(nvlp[i]); break; } default: break; } } /* * nvlist_free - free an unpacked nvlist */ void nvlist_free(nvlist_t *nvl) { nvpriv_t *priv; i_nvp_t *curr; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return; /* * Unpacked nvlist are linked through i_nvp_t */ curr = priv->nvp_list; while (curr != NULL) { nvpair_t *nvp = &curr->nvi_nvp; curr = curr->nvi_next; nvpair_free(nvp); nvp_buf_free(nvl, nvp); } if (!(priv->nvp_stat & NV_STAT_EMBEDDED)) nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t))); else nvl->nvl_priv = 0; nvt_tab_free(priv); nv_mem_free(priv, priv, sizeof (nvpriv_t)); } static int nvlist_contains_nvp(const nvlist_t *nvl, const nvpair_t *nvp) { const nvpriv_t *priv = (const nvpriv_t *)(uintptr_t)nvl->nvl_priv; const i_nvp_t *curr; if (nvp == NULL) return (0); for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) if (&curr->nvi_nvp == nvp) return (1); return (0); } /* * Make a copy of nvlist */ int nvlist_dup(const nvlist_t *nvl, nvlist_t **nvlp, int kmflag) { return (nvlist_xdup(nvl, nvlp, nvlist_nv_alloc(kmflag))); } int nvlist_xdup(const nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva) { int err; nvlist_t *ret; if (nvl == NULL || nvlp == NULL) return (EINVAL); if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0) return (err); if ((err = nvlist_copy_pairs(nvl, ret)) != 0) nvlist_free(ret); else *nvlp = ret; return (err); } /* * Remove all with matching name */ int nvlist_remove_all(nvlist_t *nvl, const char *name) { int error = ENOENT; if (nvl == NULL || name == NULL || nvl->nvl_priv == 0) return (EINVAL); nvpair_t *nvp; while ((nvp = nvt_lookup_name(nvl, name)) != NULL) { VERIFY0(nvlist_remove_nvpair(nvl, nvp)); error = 0; } return (error); } /* * Remove first one with matching name and type */ int nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type) { if (nvl == NULL || name == NULL || nvl->nvl_priv == 0) return (EINVAL); nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type); if (nvp == NULL) return (ENOENT); return (nvlist_remove_nvpair(nvl, nvp)); } int nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) { if (nvl == NULL || nvp == NULL) return (EINVAL); int err = nvt_remove_nvpair(nvl, nvp); if (err != 0) return (err); nvp_buf_unlink(nvl, nvp); nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (0); } /* * This function calculates the size of an nvpair value. * * The data argument controls the behavior in case of the data types * DATA_TYPE_STRING and * DATA_TYPE_STRING_ARRAY * Is data == NULL then the size of the string(s) is excluded. */ static int i_get_value_size(data_type_t type, const void *data, uint_t nelem) { uint64_t value_sz; if (i_validate_type_nelem(type, nelem) != 0) return (-1); /* Calculate required size for holding value */ switch (type) { case DATA_TYPE_BOOLEAN: value_sz = 0; break; case DATA_TYPE_BOOLEAN_VALUE: value_sz = sizeof (boolean_t); break; case DATA_TYPE_BYTE: value_sz = sizeof (uchar_t); break; case DATA_TYPE_INT8: value_sz = sizeof (int8_t); break; case DATA_TYPE_UINT8: value_sz = sizeof (uint8_t); break; case DATA_TYPE_INT16: value_sz = sizeof (int16_t); break; case DATA_TYPE_UINT16: value_sz = sizeof (uint16_t); break; case DATA_TYPE_INT32: value_sz = sizeof (int32_t); break; case DATA_TYPE_UINT32: value_sz = sizeof (uint32_t); break; case DATA_TYPE_INT64: value_sz = sizeof (int64_t); break; case DATA_TYPE_UINT64: value_sz = sizeof (uint64_t); break; #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: value_sz = sizeof (double); break; #endif case DATA_TYPE_STRING: if (data == NULL) value_sz = 0; else value_sz = strlen(data) + 1; break; case DATA_TYPE_BOOLEAN_ARRAY: value_sz = (uint64_t)nelem * sizeof (boolean_t); break; case DATA_TYPE_BYTE_ARRAY: value_sz = (uint64_t)nelem * sizeof (uchar_t); break; case DATA_TYPE_INT8_ARRAY: value_sz = (uint64_t)nelem * sizeof (int8_t); break; case DATA_TYPE_UINT8_ARRAY: value_sz = (uint64_t)nelem * sizeof (uint8_t); break; case DATA_TYPE_INT16_ARRAY: value_sz = (uint64_t)nelem * sizeof (int16_t); break; case DATA_TYPE_UINT16_ARRAY: value_sz = (uint64_t)nelem * sizeof (uint16_t); break; case DATA_TYPE_INT32_ARRAY: value_sz = (uint64_t)nelem * sizeof (int32_t); break; case DATA_TYPE_UINT32_ARRAY: value_sz = (uint64_t)nelem * sizeof (uint32_t); break; case DATA_TYPE_INT64_ARRAY: value_sz = (uint64_t)nelem * sizeof (int64_t); break; case DATA_TYPE_UINT64_ARRAY: value_sz = (uint64_t)nelem * sizeof (uint64_t); break; case DATA_TYPE_STRING_ARRAY: value_sz = (uint64_t)nelem * sizeof (uint64_t); if (data != NULL) { char *const *strs = data; uint_t i; /* no alignment requirement for strings */ for (i = 0; i < nelem; i++) { if (strs[i] == NULL) return (-1); value_sz += strlen(strs[i]) + 1; } } break; case DATA_TYPE_HRTIME: value_sz = sizeof (hrtime_t); break; case DATA_TYPE_NVLIST: value_sz = NV_ALIGN(sizeof (nvlist_t)); break; case DATA_TYPE_NVLIST_ARRAY: value_sz = (uint64_t)nelem * sizeof (uint64_t) + (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t)); break; default: return (-1); } return (value_sz > INT32_MAX ? -1 : (int)value_sz); } static int nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl) { nvpriv_t *priv; int err; if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t) nvl->nvl_priv)) == NULL) return (ENOMEM); nvlist_init(emb_nvl, onvl->nvl_nvflag, priv); if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) { nvlist_free(emb_nvl); emb_nvl->nvl_priv = 0; } return (err); } /* * nvlist_add_common - Add new pair to nvlist */ static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type, uint_t nelem, const void *data) { nvpair_t *nvp; uint_t i; int nvp_sz, name_sz, value_sz; int err = 0; if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) return (EINVAL); if (nelem != 0 && data == NULL) return (EINVAL); /* * Verify type and nelem and get the value size. * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY * is the size of the string(s) included. */ if ((value_sz = i_get_value_size(type, data, nelem)) < 0) return (EINVAL); if (i_validate_nvpair_value(type, nelem, data) != 0) return (EINVAL); /* * If we're adding an nvlist or nvlist array, ensure that we are not * adding the input nvlist to itself, which would cause recursion, * and ensure that no NULL nvlist pointers are present. */ switch (type) { case DATA_TYPE_NVLIST: if (data == nvl || data == NULL) return (EINVAL); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **onvlp = (nvlist_t **)data; for (i = 0; i < nelem; i++) { if (onvlp[i] == nvl || onvlp[i] == NULL) return (EINVAL); } break; } default: break; } /* calculate sizes of the nvpair elements and the nvpair itself */ name_sz = strlen(name) + 1; if (name_sz >= 1ULL << (sizeof (nvp->nvp_name_sz) * NBBY - 1)) return (EINVAL); nvp_sz = NVP_SIZE_CALC(name_sz, value_sz); if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL) return (ENOMEM); ASSERT(nvp->nvp_size == nvp_sz); nvp->nvp_name_sz = name_sz; nvp->nvp_value_elem = nelem; nvp->nvp_type = type; memcpy(NVP_NAME(nvp), name, name_sz); switch (type) { case DATA_TYPE_BOOLEAN: break; case DATA_TYPE_STRING_ARRAY: { char *const *strs = data; char *buf = NVP_VALUE(nvp); char **cstrs = (void *)buf; /* skip pre-allocated space for pointer array */ buf += nelem * sizeof (uint64_t); for (i = 0; i < nelem; i++) { int slen = strlen(strs[i]) + 1; memcpy(buf, strs[i], slen); cstrs[i] = buf; buf += slen; } break; } case DATA_TYPE_NVLIST: { nvlist_t *nnvl = EMBEDDED_NVL(nvp); nvlist_t *onvl = (nvlist_t *)data; if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) { nvp_buf_free(nvl, nvp); return (err); } break; } case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **onvlp = (nvlist_t **)data; nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); nvlist_t *embedded = (nvlist_t *) ((uintptr_t)nvlp + nelem * sizeof (uint64_t)); for (i = 0; i < nelem; i++) { if ((err = nvlist_copy_embedded(nvl, onvlp[i], embedded)) != 0) { /* * Free any successfully created lists */ nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (err); } nvlp[i] = embedded++; } break; } default: memcpy(NVP_VALUE(nvp), data, value_sz); } /* if unique name, remove before add */ if (nvl->nvl_nvflag & NV_UNIQUE_NAME) (void) nvlist_remove_all(nvl, name); else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE) (void) nvlist_remove(nvl, name, type); err = nvt_add_nvpair(nvl, nvp); if (err != 0) { nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (err); } nvp_buf_link(nvl, nvp); return (0); } int nvlist_add_boolean(nvlist_t *nvl, const char *name) { return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL)); } int nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val)); } int nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val)); } int nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val)); } int nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val)); } int nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val)); } int nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val)); } int nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val)); } int nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val)); } int nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val)); } int nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val)); } #if !defined(_KERNEL) int nvlist_add_double(nvlist_t *nvl, const char *name, double val) { return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val)); } #endif int nvlist_add_string(nvlist_t *nvl, const char *name, const char *val) { return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val)); } int nvlist_add_boolean_array(nvlist_t *nvl, const char *name, const boolean_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); } int nvlist_add_byte_array(nvlist_t *nvl, const char *name, const uchar_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); } int nvlist_add_int8_array(nvlist_t *nvl, const char *name, const int8_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); } int nvlist_add_uint8_array(nvlist_t *nvl, const char *name, const uint8_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); } int nvlist_add_int16_array(nvlist_t *nvl, const char *name, const int16_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); } int nvlist_add_uint16_array(nvlist_t *nvl, const char *name, const uint16_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); } int nvlist_add_int32_array(nvlist_t *nvl, const char *name, const int32_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); } int nvlist_add_uint32_array(nvlist_t *nvl, const char *name, const uint32_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); } int nvlist_add_int64_array(nvlist_t *nvl, const char *name, const int64_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); } int nvlist_add_uint64_array(nvlist_t *nvl, const char *name, const uint64_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); } int nvlist_add_string_array(nvlist_t *nvl, const char *name, const char *const *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); } int nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val)); } int nvlist_add_nvlist(nvlist_t *nvl, const char *name, const nvlist_t *val) { return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val)); } int nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, const nvlist_t * const *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); } /* reading name-value pairs */ nvpair_t * nvlist_next_nvpair(nvlist_t *nvl, const nvpair_t *nvp) { nvpriv_t *priv; i_nvp_t *curr; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (NULL); curr = NVPAIR2I_NVP(nvp); /* * Ensure that nvp is a valid nvpair on this nvlist. * NB: nvp_curr is used only as a hint so that we don't always * have to walk the list to determine if nvp is still on the list. */ if (nvp == NULL) curr = priv->nvp_list; else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) curr = curr->nvi_next; else curr = NULL; priv->nvp_curr = curr; return (curr != NULL ? &curr->nvi_nvp : NULL); } nvpair_t * nvlist_prev_nvpair(nvlist_t *nvl, const nvpair_t *nvp) { nvpriv_t *priv; i_nvp_t *curr; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (NULL); curr = NVPAIR2I_NVP(nvp); if (nvp == NULL) curr = priv->nvp_last; else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) curr = curr->nvi_prev; else curr = NULL; priv->nvp_curr = curr; return (curr != NULL ? &curr->nvi_nvp : NULL); } boolean_t nvlist_empty(const nvlist_t *nvl) { const nvpriv_t *priv; if (nvl == NULL || (priv = (const nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (B_TRUE); return (priv->nvp_list == NULL); } const char * nvpair_name(const nvpair_t *nvp) { return (NVP_NAME(nvp)); } data_type_t nvpair_type(const nvpair_t *nvp) { return (NVP_TYPE(nvp)); } int nvpair_type_is_array(const nvpair_t *nvp) { data_type_t type = NVP_TYPE(nvp); if ((type == DATA_TYPE_BYTE_ARRAY) || (type == DATA_TYPE_INT8_ARRAY) || (type == DATA_TYPE_UINT8_ARRAY) || (type == DATA_TYPE_INT16_ARRAY) || (type == DATA_TYPE_UINT16_ARRAY) || (type == DATA_TYPE_INT32_ARRAY) || (type == DATA_TYPE_UINT32_ARRAY) || (type == DATA_TYPE_INT64_ARRAY) || (type == DATA_TYPE_UINT64_ARRAY) || (type == DATA_TYPE_BOOLEAN_ARRAY) || (type == DATA_TYPE_STRING_ARRAY) || (type == DATA_TYPE_NVLIST_ARRAY)) return (1); return (0); } static int nvpair_value_common(const nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data) { int value_sz; if (nvp == NULL || nvpair_type(nvp) != type) return (EINVAL); /* * For non-array types, we copy the data. * For array types (including string), we set a pointer. */ switch (type) { case DATA_TYPE_BOOLEAN: if (nelem != NULL) *nelem = 0; break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: case DATA_TYPE_INT16: case DATA_TYPE_UINT16: case DATA_TYPE_INT32: case DATA_TYPE_UINT32: case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_HRTIME: #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: #endif if (data == NULL) return (EINVAL); if ((value_sz = i_get_value_size(type, NULL, 1)) < 0) return (EINVAL); memcpy(data, NVP_VALUE(nvp), (size_t)value_sz); if (nelem != NULL) *nelem = 1; break; case DATA_TYPE_NVLIST: case DATA_TYPE_STRING: if (data == NULL) return (EINVAL); /* * This discards the const from nvp, so all callers for these * types must not accept const nvpairs. */ *(void **)data = (void *)NVP_VALUE(nvp); if (nelem != NULL) *nelem = 1; break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: case DATA_TYPE_INT16_ARRAY: case DATA_TYPE_UINT16_ARRAY: case DATA_TYPE_INT32_ARRAY: case DATA_TYPE_UINT32_ARRAY: case DATA_TYPE_INT64_ARRAY: case DATA_TYPE_UINT64_ARRAY: case DATA_TYPE_STRING_ARRAY: case DATA_TYPE_NVLIST_ARRAY: if (nelem == NULL || data == NULL) return (EINVAL); /* * This discards the const from nvp, so all callers for these * types must not accept const nvpairs. */ if ((*nelem = NVP_NELEM(nvp)) != 0) *(void **)data = (void *)NVP_VALUE(nvp); else *(void **)data = NULL; break; default: return (ENOTSUP); } return (0); } static int nvlist_lookup_common(const nvlist_t *nvl, const char *name, data_type_t type, uint_t *nelem, void *data) { if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) return (EINVAL); if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE))) return (ENOTSUP); nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type); if (nvp == NULL) return (ENOENT); return (nvpair_value_common(nvp, type, nelem, data)); } int nvlist_lookup_boolean(const nvlist_t *nvl, const char *name) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL)); } int nvlist_lookup_boolean_value(const nvlist_t *nvl, const char *name, boolean_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, NULL, val)); } int nvlist_lookup_byte(const nvlist_t *nvl, const char *name, uchar_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val)); } int nvlist_lookup_int8(const nvlist_t *nvl, const char *name, int8_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val)); } int nvlist_lookup_uint8(const nvlist_t *nvl, const char *name, uint8_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val)); } int nvlist_lookup_int16(const nvlist_t *nvl, const char *name, int16_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val)); } int nvlist_lookup_uint16(const nvlist_t *nvl, const char *name, uint16_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val)); } int nvlist_lookup_int32(const nvlist_t *nvl, const char *name, int32_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val)); } int nvlist_lookup_uint32(const nvlist_t *nvl, const char *name, uint32_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val)); } int nvlist_lookup_int64(const nvlist_t *nvl, const char *name, int64_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val)); } int nvlist_lookup_uint64(const nvlist_t *nvl, const char *name, uint64_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val)); } #if !defined(_KERNEL) int nvlist_lookup_double(const nvlist_t *nvl, const char *name, double *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val)); } #endif int nvlist_lookup_string(const nvlist_t *nvl, const char *name, const char **val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val)); } int nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val)); } int nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name, boolean_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); } int nvlist_lookup_byte_array(nvlist_t *nvl, const char *name, uchar_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); } int nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); } int nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name, uint8_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); } int nvlist_lookup_int16_array(nvlist_t *nvl, const char *name, int16_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); } int nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name, uint16_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); } int nvlist_lookup_int32_array(nvlist_t *nvl, const char *name, int32_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); } int nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name, uint32_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); } int nvlist_lookup_int64_array(nvlist_t *nvl, const char *name, int64_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); } int nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name, uint64_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); } int nvlist_lookup_string_array(nvlist_t *nvl, const char *name, char ***a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); } int nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t ***a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); } int nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val)); } int nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...) { va_list ap; char *name; int noentok = (flag & NV_FLAG_NOENTOK ? 1 : 0); int ret = 0; va_start(ap, flag); while (ret == 0 && (name = va_arg(ap, char *)) != NULL) { data_type_t type; void *val; uint_t *nelem; switch (type = va_arg(ap, data_type_t)) { case DATA_TYPE_BOOLEAN: ret = nvlist_lookup_common(nvl, name, type, NULL, NULL); break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: case DATA_TYPE_INT16: case DATA_TYPE_UINT16: case DATA_TYPE_INT32: case DATA_TYPE_UINT32: case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_HRTIME: case DATA_TYPE_STRING: case DATA_TYPE_NVLIST: #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: #endif val = va_arg(ap, void *); ret = nvlist_lookup_common(nvl, name, type, NULL, val); break; case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: case DATA_TYPE_INT16_ARRAY: case DATA_TYPE_UINT16_ARRAY: case DATA_TYPE_INT32_ARRAY: case DATA_TYPE_UINT32_ARRAY: case DATA_TYPE_INT64_ARRAY: case DATA_TYPE_UINT64_ARRAY: case DATA_TYPE_STRING_ARRAY: case DATA_TYPE_NVLIST_ARRAY: val = va_arg(ap, void *); nelem = va_arg(ap, uint_t *); ret = nvlist_lookup_common(nvl, name, type, nelem, val); break; default: ret = EINVAL; } if (ret == ENOENT && noentok) ret = 0; } va_end(ap); return (ret); } /* * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' found, the function * returns zero and a pointer to the matching nvpair is returned in '*ret' * (given 'ret' is non-NULL). If 'sep' is specified then 'name' will penitrate * multiple levels of embedded nvlists, with 'sep' as the separator. As an * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or * "a.d[3].e[1]". This matches the C syntax for array embed (for convenience, * code also supports "a.d[3]e[1]" syntax). * * If 'ip' is non-NULL and the last name component is an array, return the * value of the "...[index]" array index in *ip. For an array reference that * is not indexed, *ip will be returned as -1. If there is a syntax error in * 'name', and 'ep' is non-NULL then *ep will be set to point to the location * inside the 'name' string where the syntax error was detected. */ static int nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep, nvpair_t **ret, int *ip, const char **ep) { nvpair_t *nvp; const char *np; char *sepp = NULL; char *idxp, *idxep; nvlist_t **nva; long idx = 0; int n; if (ip) *ip = -1; /* not indexed */ if (ep) *ep = NULL; if ((nvl == NULL) || (name == NULL)) return (EINVAL); sepp = NULL; idx = 0; /* step through components of name */ for (np = name; np && *np; np = sepp) { /* ensure unique names */ if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME)) return (ENOTSUP); /* skip white space */ skip_whitespace(np); if (*np == 0) break; /* set 'sepp' to end of current component 'np' */ if (sep) sepp = strchr(np, sep); else sepp = NULL; /* find start of next "[ index ]..." */ idxp = strchr(np, '['); /* if sepp comes first, set idxp to NULL */ if (sepp && idxp && (sepp < idxp)) idxp = NULL; /* * At this point 'idxp' is set if there is an index * expected for the current component. */ if (idxp) { /* set 'n' to length of current 'np' name component */ n = idxp++ - np; /* keep sepp up to date for *ep use as we advance */ skip_whitespace(idxp); sepp = idxp; /* determine the index value */ #if defined(_KERNEL) if (ddi_strtol(idxp, &idxep, 0, &idx)) goto fail; #else idx = strtol(idxp, &idxep, 0); #endif if (idxep == idxp) goto fail; /* keep sepp up to date for *ep use as we advance */ sepp = idxep; /* skip white space index value and check for ']' */ skip_whitespace(sepp); if (*sepp++ != ']') goto fail; /* for embedded arrays, support C syntax: "a[1].b" */ skip_whitespace(sepp); if (sep && (*sepp == sep)) sepp++; } else if (sepp) { n = sepp++ - np; } else { n = strlen(np); } /* trim trailing whitespace by reducing length of 'np' */ if (n == 0) goto fail; for (n--; (np[n] == ' ') || (np[n] == '\t'); n--) ; n++; /* skip whitespace, and set sepp to NULL if complete */ if (sepp) { skip_whitespace(sepp); if (*sepp == 0) sepp = NULL; } /* * At this point: * o 'n' is the length of current 'np' component. * o 'idxp' is set if there was an index, and value 'idx'. * o 'sepp' is set to the beginning of the next component, * and set to NULL if we have no more components. * * Search for nvpair with matching component name. */ for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { /* continue if no match on name */ if (strncmp(np, nvpair_name(nvp), n) || (strlen(nvpair_name(nvp)) != n)) continue; /* if indexed, verify type is array oriented */ if (idxp && !nvpair_type_is_array(nvp)) goto fail; /* * Full match found, return nvp and idx if this * was the last component. */ if (sepp == NULL) { if (ret) *ret = nvp; if (ip && idxp) *ip = (int)idx; /* return index */ return (0); /* found */ } /* * More components: current match must be * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY * to support going deeper. */ if (nvpair_type(nvp) == DATA_TYPE_NVLIST) { nvl = EMBEDDED_NVL(nvp); break; } else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) { if (nvpair_value_nvlist_array(nvp, &nva, (uint_t *)&n) != 0) goto fail; if (nva == NULL) goto fail; if ((n < 0) || (idx >= n)) goto fail; nvl = nva[idx]; break; } /* type does not support more levels */ goto fail; } if (nvp == NULL) goto fail; /* 'name' not found */ /* search for match of next component in embedded 'nvl' list */ } fail: if (ep && sepp) *ep = sepp; return (EINVAL); } /* * Return pointer to nvpair with specified 'name'. */ int nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret) { return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL)); } /* * Determine if named nvpair exists in nvlist (use embedded separator of '.' * and return array index). See nvlist_lookup_nvpair_ei_sep for more detailed * description. */ int nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl, const char *name, nvpair_t **ret, int *ip, const char **ep) { return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep)); } boolean_t nvlist_exists(const nvlist_t *nvl, const char *name) { nvpriv_t *priv; nvpair_t *nvp; i_nvp_t *curr; if (name == NULL || nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (B_FALSE); for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { nvp = &curr->nvi_nvp; if (strcmp(name, NVP_NAME(nvp)) == 0) return (B_TRUE); } return (B_FALSE); } int nvpair_value_boolean_value(const nvpair_t *nvp, boolean_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val)); } int nvpair_value_byte(const nvpair_t *nvp, uchar_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val)); } int nvpair_value_int8(const nvpair_t *nvp, int8_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val)); } int nvpair_value_uint8(const nvpair_t *nvp, uint8_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val)); } int nvpair_value_int16(const nvpair_t *nvp, int16_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val)); } int nvpair_value_uint16(const nvpair_t *nvp, uint16_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val)); } int nvpair_value_int32(const nvpair_t *nvp, int32_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val)); } int nvpair_value_uint32(const nvpair_t *nvp, uint32_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val)); } int nvpair_value_int64(const nvpair_t *nvp, int64_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val)); } int nvpair_value_uint64(const nvpair_t *nvp, uint64_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val)); } #if !defined(_KERNEL) int nvpair_value_double(const nvpair_t *nvp, double *val) { return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val)); } #endif int nvpair_value_string(const nvpair_t *nvp, const char **val) { return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val)); } int nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val) { return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val)); } int nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val)); } int nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val)); } int nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val)); } int nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val)); } int nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val)); } int nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val)); } int nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val)); } int nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val)); } int nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val)); } int nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val)); } int nvpair_value_string_array(nvpair_t *nvp, const char ***val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val)); } int nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val)); } int nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val)); } /* * Add specified pair to the list. */ int nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) { if (nvl == NULL || nvp == NULL) return (EINVAL); return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp), NVP_NELEM(nvp), NVP_VALUE(nvp))); } /* * Merge the supplied nvlists and put the result in dst. * The merged list will contain all names specified in both lists, * the values are taken from nvl in the case of duplicates. * Return 0 on success. */ int nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag) { (void) flag; if (nvl == NULL || dst == NULL) return (EINVAL); if (dst != nvl) return (nvlist_copy_pairs(nvl, dst)); return (0); } /* * Encoding related routines */ #define NVS_OP_ENCODE 0 #define NVS_OP_DECODE 1 #define NVS_OP_GETSIZE 2 typedef struct nvs_ops nvs_ops_t; typedef struct { int nvs_op; const nvs_ops_t *nvs_ops; void *nvs_private; nvpriv_t *nvs_priv; int nvs_recursion; } nvstream_t; /* * nvs operations are: * - nvs_nvlist * encoding / decoding of an nvlist header (nvlist_t) * calculates the size used for header and end detection * * - nvs_nvpair * responsible for the first part of encoding / decoding of an nvpair * calculates the decoded size of an nvpair * * - nvs_nvp_op * second part of encoding / decoding of an nvpair * * - nvs_nvp_size * calculates the encoding size of an nvpair * * - nvs_nvl_fini * encodes the end detection mark (zeros). */ struct nvs_ops { int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *); int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *); int (*nvs_nvp_op)(nvstream_t *, nvpair_t *); int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *); int (*nvs_nvl_fini)(nvstream_t *); }; typedef struct { char nvh_encoding; /* nvs encoding method */ char nvh_endian; /* nvs endian */ char nvh_reserved1; /* reserved for future use */ char nvh_reserved2; /* reserved for future use */ } nvs_header_t; static int nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *curr; /* * Walk nvpair in list and encode each nvpair */ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0) return (EFAULT); return (nvs->nvs_ops->nvs_nvl_fini(nvs)); } static int nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl) { nvpair_t *nvp; size_t nvsize; int err; /* * Get decoded size of next pair in stream, alloc * memory for nvpair_t, then decode the nvpair */ while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) { if (nvsize == 0) /* end of list */ break; /* make sure len makes sense */ if (nvsize < NVP_SIZE_CALC(1, 0)) return (EFAULT); if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL) return (ENOMEM); if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) { nvp_buf_free(nvl, nvp); return (err); } if (i_validate_nvpair(nvp) != 0) { nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (EFAULT); } err = nvt_add_nvpair(nvl, nvp); if (err != 0) { nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (err); } nvp_buf_link(nvl, nvp); } return (err); } static int nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *curr; uint64_t nvsize = *buflen; size_t size; /* * Get encoded size of nvpairs in nvlist */ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0) return (EINVAL); if ((nvsize += size) > INT32_MAX) return (EINVAL); } *buflen = nvsize; return (0); } static int nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) { int err; if (nvl->nvl_priv == 0) return (EFAULT); /* * Perform the operation, starting with header, then each nvpair */ if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0) return (err); switch (nvs->nvs_op) { case NVS_OP_ENCODE: err = nvs_encode_pairs(nvs, nvl); break; case NVS_OP_DECODE: err = nvs_decode_pairs(nvs, nvl); break; case NVS_OP_GETSIZE: err = nvs_getsize_pairs(nvs, nvl, buflen); break; default: err = EINVAL; } return (err); } static int nvs_embedded(nvstream_t *nvs, nvlist_t *embedded) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: { int err; if (nvs->nvs_recursion >= nvpair_max_recursion) return (EINVAL); nvs->nvs_recursion++; err = nvs_operation(nvs, embedded, NULL); nvs->nvs_recursion--; return (err); } case NVS_OP_DECODE: { nvpriv_t *priv; int err; if (embedded->nvl_version != NV_VERSION) return (ENOTSUP); if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL) return (ENOMEM); nvlist_init(embedded, embedded->nvl_nvflag, priv); if (nvs->nvs_recursion >= nvpair_max_recursion) { nvlist_free(embedded); return (EINVAL); } nvs->nvs_recursion++; if ((err = nvs_operation(nvs, embedded, NULL)) != 0) nvlist_free(embedded); nvs->nvs_recursion--; return (err); } default: break; } return (EINVAL); } static int nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { size_t nelem = NVP_NELEM(nvp); nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); int i; switch (nvs->nvs_op) { case NVS_OP_ENCODE: for (i = 0; i < nelem; i++) if (nvs_embedded(nvs, nvlp[i]) != 0) return (EFAULT); break; case NVS_OP_DECODE: { size_t len = nelem * sizeof (uint64_t); nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len); memset(nvlp, 0, len); /* don't trust packed data */ for (i = 0; i < nelem; i++) { if (nvs_embedded(nvs, embedded) != 0) { nvpair_free(nvp); return (EFAULT); } nvlp[i] = embedded++; } break; } case NVS_OP_GETSIZE: { uint64_t nvsize = 0; for (i = 0; i < nelem; i++) { size_t nvp_sz = 0; if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0) return (EINVAL); if ((nvsize += nvp_sz) > INT32_MAX) return (EINVAL); } *size = nvsize; break; } default: return (EINVAL); } return (0); } static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *); static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *); /* * Common routine for nvlist operations: * encode, decode, getsize (encoded size). */ static int nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, int nvs_op) { int err = 0; nvstream_t nvs; int nvl_endian; #if defined(_ZFS_LITTLE_ENDIAN) int host_endian = 1; #elif defined(_ZFS_BIG_ENDIAN) int host_endian = 0; #else #error "No endian defined!" #endif /* _ZFS_LITTLE_ENDIAN */ nvs_header_t *nvh; if (buflen == NULL || nvl == NULL || (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (EINVAL); nvs.nvs_op = nvs_op; nvs.nvs_recursion = 0; /* * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and * a buffer is allocated. The first 4 bytes in the buffer are * used for encoding method and host endian. */ switch (nvs_op) { case NVS_OP_ENCODE: if (buf == NULL || *buflen < sizeof (nvs_header_t)) return (EINVAL); nvh = (void *)buf; nvh->nvh_encoding = encoding; nvh->nvh_endian = nvl_endian = host_endian; nvh->nvh_reserved1 = 0; nvh->nvh_reserved2 = 0; break; case NVS_OP_DECODE: if (buf == NULL || *buflen < sizeof (nvs_header_t)) return (EINVAL); /* get method of encoding from first byte */ nvh = (void *)buf; encoding = nvh->nvh_encoding; nvl_endian = nvh->nvh_endian; break; case NVS_OP_GETSIZE: nvl_endian = host_endian; /* * add the size for encoding */ *buflen = sizeof (nvs_header_t); break; default: return (ENOTSUP); } /* * Create an nvstream with proper encoding method */ switch (encoding) { case NV_ENCODE_NATIVE: /* * check endianness, in case we are unpacking * from a file */ if (nvl_endian != host_endian) return (ENOTSUP); err = nvs_native(&nvs, nvl, buf, buflen); break; case NV_ENCODE_XDR: err = nvs_xdr(&nvs, nvl, buf, buflen); break; default: err = ENOTSUP; break; } return (err); } int nvlist_size(nvlist_t *nvl, size_t *size, int encoding) { return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE)); } /* * Pack nvlist into contiguous memory */ int nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, int kmflag) { return (nvlist_xpack(nvl, bufp, buflen, encoding, nvlist_nv_alloc(kmflag))); } int nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, nv_alloc_t *nva) { nvpriv_t nvpriv; size_t alloc_size; char *buf; int err; if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL) return (EINVAL); if (*bufp != NULL) return (nvlist_common(nvl, *bufp, buflen, encoding, NVS_OP_ENCODE)); /* * Here is a difficult situation: * 1. The nvlist has fixed allocator properties. * All other nvlist routines (like nvlist_add_*, ...) use * these properties. * 2. When using nvlist_pack() the user can specify their own * allocator properties (e.g. by using KM_NOSLEEP). * * We use the user specified properties (2). A clearer solution * will be to remove the kmflag from nvlist_pack(), but we will * not change the interface. */ nv_priv_init(&nvpriv, nva, 0); if ((err = nvlist_size(nvl, &alloc_size, encoding))) return (err); if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL) return (ENOMEM); if ((err = nvlist_common(nvl, buf, &alloc_size, encoding, NVS_OP_ENCODE)) != 0) { nv_mem_free(&nvpriv, buf, alloc_size); } else { *buflen = alloc_size; *bufp = buf; } return (err); } /* * Unpack buf into an nvlist_t */ int nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag) { return (nvlist_xunpack(buf, buflen, nvlp, nvlist_nv_alloc(kmflag))); } int nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva) { nvlist_t *nvl; int err; if (nvlp == NULL) return (EINVAL); if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0) return (err); if ((err = nvlist_common(nvl, buf, &buflen, NV_ENCODE_NATIVE, NVS_OP_DECODE)) != 0) nvlist_free(nvl); else *nvlp = nvl; return (err); } /* * Native encoding functions */ typedef struct { /* * This structure is used when decoding a packed nvpair in * the native format. n_base points to a buffer containing the * packed nvpair. n_end is a pointer to the end of the buffer. * (n_end actually points to the first byte past the end of the * buffer.) n_curr is a pointer that lies between n_base and n_end. * It points to the current data that we are decoding. * The amount of data left in the buffer is equal to n_end - n_curr. * n_flag is used to recognize a packed embedded list. */ caddr_t n_base; caddr_t n_end; caddr_t n_curr; uint_t n_flag; } nvs_native_t; static int nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf, size_t buflen) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: nvs->nvs_private = native; native->n_curr = native->n_base = buf; native->n_end = buf + buflen; native->n_flag = 0; return (0); case NVS_OP_GETSIZE: nvs->nvs_private = native; native->n_curr = native->n_base = native->n_end = NULL; native->n_flag = 0; return (0); default: return (EINVAL); } } static void nvs_native_destroy(nvstream_t *nvs) { nvs->nvs_private = NULL; } static int native_cp(nvstream_t *nvs, void *buf, size_t size) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; if (native->n_curr + size > native->n_end) return (EFAULT); /* * The memcpy() below eliminates alignment requirement * on the buffer (stream) and is preferred over direct access. */ switch (nvs->nvs_op) { case NVS_OP_ENCODE: memcpy(native->n_curr, buf, size); break; case NVS_OP_DECODE: memcpy(buf, native->n_curr, size); break; default: return (EINVAL); } native->n_curr += size; return (0); } /* * operate on nvlist_t header */ static int nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) { nvs_native_t *native = nvs->nvs_private; switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: if (native->n_flag) return (0); /* packed embedded list */ native->n_flag = 1; /* copy version and nvflag of the nvlist_t */ if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 || native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0) return (EFAULT); return (0); case NVS_OP_GETSIZE: /* * if calculate for packed embedded list * 4 for end of the embedded list * else * 2 * sizeof (int32_t) for nvl_version and nvl_nvflag * and 4 for end of the entire list */ if (native->n_flag) { *size += 4; } else { native->n_flag = 1; *size += 2 * sizeof (int32_t) + 4; } return (0); default: return (EINVAL); } } static int nvs_native_nvl_fini(nvstream_t *nvs) { if (nvs->nvs_op == NVS_OP_ENCODE) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; /* * Add 4 zero bytes at end of nvlist. They are used * for end detection by the decode routine. */ if (native->n_curr + sizeof (int) > native->n_end) return (EFAULT); memset(native->n_curr, 0, sizeof (int)); native->n_curr += sizeof (int); } return (0); } static int nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp) { if (nvs->nvs_op == NVS_OP_ENCODE) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; nvlist_t *packed = (void *) (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); /* * Null out the pointer that is meaningless in the packed * structure. The address may not be aligned, so we have * to use memset. */ memset((char *)packed + offsetof(nvlist_t, nvl_priv), 0, sizeof (uint64_t)); } return (nvs_embedded(nvs, EMBEDDED_NVL(nvp))); } static int nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp) { if (nvs->nvs_op == NVS_OP_ENCODE) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp); size_t len = NVP_NELEM(nvp) * sizeof (uint64_t); nvlist_t *packed = (nvlist_t *)((uintptr_t)value + len); int i; /* * Null out pointers that are meaningless in the packed * structure. The addresses may not be aligned, so we have * to use memset. */ memset(value, 0, len); for (i = 0; i < NVP_NELEM(nvp); i++, packed++) /* * Null out the pointer that is meaningless in the * packed structure. The address may not be aligned, * so we have to use memset. */ memset((char *)packed + offsetof(nvlist_t, nvl_priv), 0, sizeof (uint64_t)); } return (nvs_embedded_nvl_array(nvs, nvp, NULL)); } static void nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; uint64_t *strp = (void *) (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); /* * Null out pointers that are meaningless in the packed * structure. The addresses may not be aligned, so we have * to use memset. */ memset(strp, 0, NVP_NELEM(nvp) * sizeof (uint64_t)); break; } case NVS_OP_DECODE: { char **strp = (void *)NVP_VALUE(nvp); char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t)); int i; for (i = 0; i < NVP_NELEM(nvp); i++) { strp[i] = buf; buf += strlen(buf) + 1; } break; } } } static int nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp) { data_type_t type; int value_sz; int ret = 0; /* * We do the initial memcpy of the data before we look at * the nvpair type, because when we're decoding, we won't * have the correct values for the pair until we do the memcpy. */ switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: if (native_cp(nvs, nvp, nvp->nvp_size) != 0) return (EFAULT); break; default: return (EINVAL); } /* verify nvp_name_sz, check the name string length */ if (i_validate_nvpair_name(nvp) != 0) return (EFAULT); type = NVP_TYPE(nvp); /* * Verify type and nelem and get the value size. * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY * is the size of the string(s) excluded. */ if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0) return (EFAULT); if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size) return (EFAULT); switch (type) { case DATA_TYPE_NVLIST: ret = nvpair_native_embedded(nvs, nvp); break; case DATA_TYPE_NVLIST_ARRAY: ret = nvpair_native_embedded_array(nvs, nvp); break; case DATA_TYPE_STRING_ARRAY: nvpair_native_string_array(nvs, nvp); break; default: break; } return (ret); } static int nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { uint64_t nvp_sz = nvp->nvp_size; switch (NVP_TYPE(nvp)) { case DATA_TYPE_NVLIST: { size_t nvsize = 0; if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0) return (EINVAL); nvp_sz += nvsize; break; } case DATA_TYPE_NVLIST_ARRAY: { size_t nvsize; if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0) return (EINVAL); nvp_sz += nvsize; break; } default: break; } if (nvp_sz > INT32_MAX) return (EINVAL); *size = nvp_sz; return (0); } static int nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: return (nvs_native_nvp_op(nvs, nvp)); case NVS_OP_DECODE: { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; int32_t decode_len; /* try to read the size value from the stream */ if (native->n_curr + sizeof (int32_t) > native->n_end) return (EFAULT); memcpy(&decode_len, native->n_curr, sizeof (int32_t)); /* sanity check the size value */ if (decode_len < 0 || decode_len > native->n_end - native->n_curr) return (EFAULT); *size = decode_len; /* * If at the end of the stream then move the cursor * forward, otherwise nvpair_native_op() will read * the entire nvpair at the same cursor position. */ if (*size == 0) native->n_curr += sizeof (int32_t); break; } default: return (EINVAL); } return (0); } static const nvs_ops_t nvs_native_ops = { .nvs_nvlist = nvs_native_nvlist, .nvs_nvpair = nvs_native_nvpair, .nvs_nvp_op = nvs_native_nvp_op, .nvs_nvp_size = nvs_native_nvp_size, .nvs_nvl_fini = nvs_native_nvl_fini }; static int nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) { nvs_native_t native; int err; nvs->nvs_ops = &nvs_native_ops; if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t), *buflen - sizeof (nvs_header_t))) != 0) return (err); err = nvs_operation(nvs, nvl, buflen); nvs_native_destroy(nvs); return (err); } /* * XDR encoding functions * * An xdr packed nvlist is encoded as: * * - encoding method and host endian (4 bytes) * - nvl_version (4 bytes) * - nvl_nvflag (4 bytes) * * - encoded nvpairs, the format of one xdr encoded nvpair is: * - encoded size of the nvpair (4 bytes) * - decoded size of the nvpair (4 bytes) * - name string, (4 + sizeof(NV_ALIGN4(string)) * a string is coded as size (4 bytes) and data * - data type (4 bytes) * - number of elements in the nvpair (4 bytes) * - data * * - 2 zero's for end of the entire list (8 bytes) */ static int nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen) { /* xdr data must be 4 byte aligned */ if ((ulong_t)buf % 4 != 0) return (EFAULT); switch (nvs->nvs_op) { case NVS_OP_ENCODE: xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE); nvs->nvs_private = xdr; return (0); case NVS_OP_DECODE: xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE); nvs->nvs_private = xdr; return (0); case NVS_OP_GETSIZE: nvs->nvs_private = NULL; return (0); default: return (EINVAL); } } static void nvs_xdr_destroy(nvstream_t *nvs) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: nvs->nvs_private = NULL; break; default: break; } } static int nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: { XDR *xdr = nvs->nvs_private; if (!xdr_int(xdr, &nvl->nvl_version) || !xdr_u_int(xdr, &nvl->nvl_nvflag)) return (EFAULT); break; } case NVS_OP_GETSIZE: { /* * 2 * 4 for nvl_version + nvl_nvflag * and 8 for end of the entire list */ *size += 2 * 4 + 8; break; } default: return (EINVAL); } return (0); } static int nvs_xdr_nvl_fini(nvstream_t *nvs) { if (nvs->nvs_op == NVS_OP_ENCODE) { XDR *xdr = nvs->nvs_private; int zero = 0; if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero)) return (EFAULT); } return (0); } /* * xdrproc_t-compatible callbacks for xdr_array() */ #if defined(_KERNEL) && defined(__linux__) /* Linux kernel */ #define NVS_BUILD_XDRPROC_T(type) \ static bool_t \ nvs_xdr_nvp_##type(XDR *xdrs, void *ptr) \ { \ return (xdr_##type(xdrs, ptr)); \ } #elif !defined(_KERNEL) && defined(XDR_CONTROL) /* tirpc */ #define NVS_BUILD_XDRPROC_T(type) \ static bool_t \ nvs_xdr_nvp_##type(XDR *xdrs, ...) \ { \ va_list args; \ void *ptr; \ \ va_start(args, xdrs); \ ptr = va_arg(args, void *); \ va_end(args); \ \ return (xdr_##type(xdrs, ptr)); \ } #else /* FreeBSD, sunrpc */ #define NVS_BUILD_XDRPROC_T(type) \ static bool_t \ nvs_xdr_nvp_##type(XDR *xdrs, void *ptr, ...) \ { \ return (xdr_##type(xdrs, ptr)); \ } #endif -/* BEGIN CSTYLED */ NVS_BUILD_XDRPROC_T(char); NVS_BUILD_XDRPROC_T(short); NVS_BUILD_XDRPROC_T(u_short); NVS_BUILD_XDRPROC_T(int); NVS_BUILD_XDRPROC_T(u_int); NVS_BUILD_XDRPROC_T(longlong_t); NVS_BUILD_XDRPROC_T(u_longlong_t); -/* END CSTYLED */ /* * The format of xdr encoded nvpair is: * encode_size, decode_size, name string, data type, nelem, data */ static int nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) { ASSERT(nvs != NULL && nvp != NULL); data_type_t type; char *buf; char *buf_end = (char *)nvp + nvp->nvp_size; int value_sz; uint_t nelem, buflen; bool_t ret = FALSE; XDR *xdr = nvs->nvs_private; ASSERT(xdr != NULL); /* name string */ if ((buf = NVP_NAME(nvp)) >= buf_end) return (EFAULT); buflen = buf_end - buf; if (!xdr_string(xdr, &buf, buflen - 1)) return (EFAULT); nvp->nvp_name_sz = strlen(buf) + 1; /* type and nelem */ if (!xdr_int(xdr, (int *)&nvp->nvp_type) || !xdr_int(xdr, &nvp->nvp_value_elem)) return (EFAULT); type = NVP_TYPE(nvp); nelem = nvp->nvp_value_elem; /* * Verify type and nelem and get the value size. * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY * is the size of the string(s) excluded. */ if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0) return (EFAULT); /* if there is no data to extract then return */ if (nelem == 0) return (0); /* value */ if ((buf = NVP_VALUE(nvp)) >= buf_end) return (EFAULT); buflen = buf_end - buf; if (buflen < value_sz) return (EFAULT); switch (type) { case DATA_TYPE_NVLIST: if (nvs_embedded(nvs, (void *)buf) == 0) return (0); break; case DATA_TYPE_NVLIST_ARRAY: if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0) return (0); break; case DATA_TYPE_BOOLEAN: ret = TRUE; break; case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: ret = xdr_char(xdr, buf); break; case DATA_TYPE_INT16: ret = xdr_short(xdr, (void *)buf); break; case DATA_TYPE_UINT16: ret = xdr_u_short(xdr, (void *)buf); break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_INT32: ret = xdr_int(xdr, (void *)buf); break; case DATA_TYPE_UINT32: ret = xdr_u_int(xdr, (void *)buf); break; case DATA_TYPE_INT64: ret = xdr_longlong_t(xdr, (void *)buf); break; case DATA_TYPE_UINT64: ret = xdr_u_longlong_t(xdr, (void *)buf); break; case DATA_TYPE_HRTIME: /* * NOTE: must expose the definition of hrtime_t here */ ret = xdr_longlong_t(xdr, (void *)buf); break; #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: ret = xdr_double(xdr, (void *)buf); break; #endif case DATA_TYPE_STRING: ret = xdr_string(xdr, &buf, buflen - 1); break; case DATA_TYPE_BYTE_ARRAY: ret = xdr_opaque(xdr, buf, nelem); break; case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t), nvs_xdr_nvp_char); break; case DATA_TYPE_INT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t), sizeof (int16_t), nvs_xdr_nvp_short); break; case DATA_TYPE_UINT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t), sizeof (uint16_t), nvs_xdr_nvp_u_short); break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_INT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t), sizeof (int32_t), nvs_xdr_nvp_int); break; case DATA_TYPE_UINT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t), sizeof (uint32_t), nvs_xdr_nvp_u_int); break; case DATA_TYPE_INT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t), sizeof (int64_t), nvs_xdr_nvp_longlong_t); break; case DATA_TYPE_UINT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t), sizeof (uint64_t), nvs_xdr_nvp_u_longlong_t); break; case DATA_TYPE_STRING_ARRAY: { size_t len = nelem * sizeof (uint64_t); char **strp = (void *)buf; int i; if (nvs->nvs_op == NVS_OP_DECODE) memset(buf, 0, len); /* don't trust packed data */ for (i = 0; i < nelem; i++) { if (buflen <= len) return (EFAULT); buf += len; buflen -= len; if (xdr_string(xdr, &buf, buflen - 1) != TRUE) return (EFAULT); if (nvs->nvs_op == NVS_OP_DECODE) strp[i] = buf; len = strlen(buf) + 1; } ret = TRUE; break; } default: break; } return (ret == TRUE ? 0 : EFAULT); } static int nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { data_type_t type = NVP_TYPE(nvp); /* * encode_size + decode_size + name string size + data type + nelem * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) */ uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4; switch (type) { case DATA_TYPE_BOOLEAN: break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: case DATA_TYPE_INT16: case DATA_TYPE_UINT16: case DATA_TYPE_INT32: case DATA_TYPE_UINT32: nvp_sz += 4; /* 4 is the minimum xdr unit */ break; case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_HRTIME: #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: #endif nvp_sz += 8; break; case DATA_TYPE_STRING: nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp))); break; case DATA_TYPE_BYTE_ARRAY: nvp_sz += NV_ALIGN4(NVP_NELEM(nvp)); break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: case DATA_TYPE_INT16_ARRAY: case DATA_TYPE_UINT16_ARRAY: case DATA_TYPE_INT32_ARRAY: case DATA_TYPE_UINT32_ARRAY: nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp); break; case DATA_TYPE_INT64_ARRAY: case DATA_TYPE_UINT64_ARRAY: nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp); break; case DATA_TYPE_STRING_ARRAY: { int i; char **strs = (void *)NVP_VALUE(nvp); for (i = 0; i < NVP_NELEM(nvp); i++) nvp_sz += 4 + NV_ALIGN4(strlen(strs[i])); break; } case DATA_TYPE_NVLIST: case DATA_TYPE_NVLIST_ARRAY: { size_t nvsize = 0; int old_nvs_op = nvs->nvs_op; int err; nvs->nvs_op = NVS_OP_GETSIZE; if (type == DATA_TYPE_NVLIST) err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize); else err = nvs_embedded_nvl_array(nvs, nvp, &nvsize); nvs->nvs_op = old_nvs_op; if (err != 0) return (EINVAL); nvp_sz += nvsize; break; } default: return (EINVAL); } if (nvp_sz > INT32_MAX) return (EINVAL); *size = nvp_sz; return (0); } /* * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates * the largest nvpair that could be encoded in the buffer. * * See comments above nvpair_xdr_op() for the format of xdr encoding. * The size of a xdr packed nvpair without any data is 5 words. * * Using the size of the data directly as an estimate would be ok * in all cases except one. If the data type is of DATA_TYPE_STRING_ARRAY * then the actual nvpair has space for an array of pointers to index * the strings. These pointers are not encoded into the packed xdr buffer. * * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are * of length 0, then each string is encoded in xdr format as a single word. * Therefore when expanded to an nvpair there will be 2.25 word used for * each string. (a int64_t allocated for pointer usage, and a single char * for the null termination.) * * This is the calculation performed by the NVS_XDR_MAX_LEN macro. */ #define NVS_XDR_HDR_LEN ((size_t)(5 * 4)) #define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ? \ 0 : ((size_t)(y) - NVS_XDR_HDR_LEN)) #define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \ (NVS_XDR_DATA_LEN(x) * 2) + \ NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4))) static int nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { XDR *xdr = nvs->nvs_private; int32_t encode_len, decode_len; switch (nvs->nvs_op) { case NVS_OP_ENCODE: { size_t nvsize; if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0) return (EFAULT); decode_len = nvp->nvp_size; encode_len = nvsize; if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) return (EFAULT); return (nvs_xdr_nvp_op(nvs, nvp)); } case NVS_OP_DECODE: { struct xdr_bytesrec bytesrec; /* get the encode and decode size */ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) return (EFAULT); *size = decode_len; /* are we at the end of the stream? */ if (*size == 0) return (0); /* sanity check the size parameter */ if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec)) return (EFAULT); if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail)) return (EFAULT); break; } default: return (EINVAL); } return (0); } static const struct nvs_ops nvs_xdr_ops = { .nvs_nvlist = nvs_xdr_nvlist, .nvs_nvpair = nvs_xdr_nvpair, .nvs_nvp_op = nvs_xdr_nvp_op, .nvs_nvp_size = nvs_xdr_nvp_size, .nvs_nvl_fini = nvs_xdr_nvl_fini }; static int nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) { XDR xdr; int err; nvs->nvs_ops = &nvs_xdr_ops; if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t), *buflen - sizeof (nvs_header_t))) != 0) return (err); err = nvs_operation(nvs, nvl, buflen); nvs_xdr_destroy(nvs); return (err); } EXPORT_SYMBOL(nv_alloc_init); EXPORT_SYMBOL(nv_alloc_reset); EXPORT_SYMBOL(nv_alloc_fini); /* list management */ EXPORT_SYMBOL(nvlist_alloc); EXPORT_SYMBOL(nvlist_free); EXPORT_SYMBOL(nvlist_size); EXPORT_SYMBOL(nvlist_pack); EXPORT_SYMBOL(nvlist_unpack); EXPORT_SYMBOL(nvlist_dup); EXPORT_SYMBOL(nvlist_merge); EXPORT_SYMBOL(nvlist_xalloc); EXPORT_SYMBOL(nvlist_xpack); EXPORT_SYMBOL(nvlist_xunpack); EXPORT_SYMBOL(nvlist_xdup); EXPORT_SYMBOL(nvlist_lookup_nv_alloc); EXPORT_SYMBOL(nvlist_add_nvpair); EXPORT_SYMBOL(nvlist_add_boolean); EXPORT_SYMBOL(nvlist_add_boolean_value); EXPORT_SYMBOL(nvlist_add_byte); EXPORT_SYMBOL(nvlist_add_int8); EXPORT_SYMBOL(nvlist_add_uint8); EXPORT_SYMBOL(nvlist_add_int16); EXPORT_SYMBOL(nvlist_add_uint16); EXPORT_SYMBOL(nvlist_add_int32); EXPORT_SYMBOL(nvlist_add_uint32); EXPORT_SYMBOL(nvlist_add_int64); EXPORT_SYMBOL(nvlist_add_uint64); EXPORT_SYMBOL(nvlist_add_string); EXPORT_SYMBOL(nvlist_add_nvlist); EXPORT_SYMBOL(nvlist_add_boolean_array); EXPORT_SYMBOL(nvlist_add_byte_array); EXPORT_SYMBOL(nvlist_add_int8_array); EXPORT_SYMBOL(nvlist_add_uint8_array); EXPORT_SYMBOL(nvlist_add_int16_array); EXPORT_SYMBOL(nvlist_add_uint16_array); EXPORT_SYMBOL(nvlist_add_int32_array); EXPORT_SYMBOL(nvlist_add_uint32_array); EXPORT_SYMBOL(nvlist_add_int64_array); EXPORT_SYMBOL(nvlist_add_uint64_array); EXPORT_SYMBOL(nvlist_add_string_array); EXPORT_SYMBOL(nvlist_add_nvlist_array); EXPORT_SYMBOL(nvlist_next_nvpair); EXPORT_SYMBOL(nvlist_prev_nvpair); EXPORT_SYMBOL(nvlist_empty); EXPORT_SYMBOL(nvlist_add_hrtime); EXPORT_SYMBOL(nvlist_remove); EXPORT_SYMBOL(nvlist_remove_nvpair); EXPORT_SYMBOL(nvlist_remove_all); EXPORT_SYMBOL(nvlist_lookup_boolean); EXPORT_SYMBOL(nvlist_lookup_boolean_value); EXPORT_SYMBOL(nvlist_lookup_byte); EXPORT_SYMBOL(nvlist_lookup_int8); EXPORT_SYMBOL(nvlist_lookup_uint8); EXPORT_SYMBOL(nvlist_lookup_int16); EXPORT_SYMBOL(nvlist_lookup_uint16); EXPORT_SYMBOL(nvlist_lookup_int32); EXPORT_SYMBOL(nvlist_lookup_uint32); EXPORT_SYMBOL(nvlist_lookup_int64); EXPORT_SYMBOL(nvlist_lookup_uint64); EXPORT_SYMBOL(nvlist_lookup_string); EXPORT_SYMBOL(nvlist_lookup_nvlist); EXPORT_SYMBOL(nvlist_lookup_boolean_array); EXPORT_SYMBOL(nvlist_lookup_byte_array); EXPORT_SYMBOL(nvlist_lookup_int8_array); EXPORT_SYMBOL(nvlist_lookup_uint8_array); EXPORT_SYMBOL(nvlist_lookup_int16_array); EXPORT_SYMBOL(nvlist_lookup_uint16_array); EXPORT_SYMBOL(nvlist_lookup_int32_array); EXPORT_SYMBOL(nvlist_lookup_uint32_array); EXPORT_SYMBOL(nvlist_lookup_int64_array); EXPORT_SYMBOL(nvlist_lookup_uint64_array); EXPORT_SYMBOL(nvlist_lookup_string_array); EXPORT_SYMBOL(nvlist_lookup_nvlist_array); EXPORT_SYMBOL(nvlist_lookup_hrtime); EXPORT_SYMBOL(nvlist_lookup_pairs); EXPORT_SYMBOL(nvlist_lookup_nvpair); EXPORT_SYMBOL(nvlist_exists); /* processing nvpair */ EXPORT_SYMBOL(nvpair_name); EXPORT_SYMBOL(nvpair_type); EXPORT_SYMBOL(nvpair_value_boolean_value); EXPORT_SYMBOL(nvpair_value_byte); EXPORT_SYMBOL(nvpair_value_int8); EXPORT_SYMBOL(nvpair_value_uint8); EXPORT_SYMBOL(nvpair_value_int16); EXPORT_SYMBOL(nvpair_value_uint16); EXPORT_SYMBOL(nvpair_value_int32); EXPORT_SYMBOL(nvpair_value_uint32); EXPORT_SYMBOL(nvpair_value_int64); EXPORT_SYMBOL(nvpair_value_uint64); EXPORT_SYMBOL(nvpair_value_string); EXPORT_SYMBOL(nvpair_value_nvlist); EXPORT_SYMBOL(nvpair_value_boolean_array); EXPORT_SYMBOL(nvpair_value_byte_array); EXPORT_SYMBOL(nvpair_value_int8_array); EXPORT_SYMBOL(nvpair_value_uint8_array); EXPORT_SYMBOL(nvpair_value_int16_array); EXPORT_SYMBOL(nvpair_value_uint16_array); EXPORT_SYMBOL(nvpair_value_int32_array); EXPORT_SYMBOL(nvpair_value_uint32_array); EXPORT_SYMBOL(nvpair_value_int64_array); EXPORT_SYMBOL(nvpair_value_uint64_array); EXPORT_SYMBOL(nvpair_value_string_array); EXPORT_SYMBOL(nvpair_value_nvlist_array); EXPORT_SYMBOL(nvpair_value_hrtime); diff --git a/module/os/freebsd/spl/spl_dtrace.c b/module/os/freebsd/spl/spl_dtrace.c index 4b9cc65d641e..0a2fcf110d7b 100644 --- a/module/os/freebsd/spl/spl_dtrace.c +++ b/module/os/freebsd/spl/spl_dtrace.c @@ -1,35 +1,34 @@ /* * Copyright 2014 The FreeBSD Project. * All rights reserved. * * This software was developed by Steven Hartland. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include -/* CSTYLED */ SDT_PROBE_DEFINE1(sdt, , , set__error, "int"); diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index c84cb7407a9c..7350b8a6d49f 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -1,886 +1,804 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS adaptive replacement cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, brt, CTLFLAG_RW, 0, "ZFS Block Reference Table"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, "ZFS disk buf cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS dedup"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "ZFS l2arc"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "ZFS livelist"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "ZFS lua"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "ZFS metaslab group"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0, "ZFS multihost protection"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS prefetch"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "ZFS reconstruct"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "ZFS receive"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "ZFS send"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vnops, CTLFLAG_RW, 0, "ZFS VNOPS"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS livelist condense"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, "ZFS VDEV mirror"); SYSCTL_DECL(_vfs_zfs_version); SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD, (ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version"); /* arc.c */ int param_set_arc_u64(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_64(oidp, arg1, 0, req); if (err != 0 || req->newptr == NULL) return (err); arc_tuning_update(B_TRUE); return (0); } int param_set_arc_int(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_int(oidp, arg1, 0, req); if (err != 0 || req->newptr == NULL) return (err); arc_tuning_update(B_TRUE); return (0); } int param_set_arc_max(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_arc_max; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val != 0 && (val < MIN_ARC_MAX || val <= arc_c_min || val >= arc_all_memory())) return (SET_ERROR(EINVAL)); zfs_arc_max = val; arc_tuning_update(B_TRUE); /* Update the sysctl to the tuned value */ if (val != 0) zfs_arc_max = arc_c_max; return (0); } -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_max, "LU", "Maximum ARC size in bytes (LEGACY)"); -/* END CSTYLED */ int param_set_arc_min(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_arc_min; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val != 0 && (val < 2ULL << SPA_MAXBLOCKSHIFT || val > arc_c_max)) return (SET_ERROR(EINVAL)); zfs_arc_min = val; arc_tuning_update(B_TRUE); /* Update the sysctl to the tuned value */ if (val != 0) zfs_arc_min = arc_c_min; return (0); } -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_min, "LU", "Minimum ARC size in bytes (LEGACY)"); -/* END CSTYLED */ extern uint_t zfs_arc_free_target; int param_set_arc_free_target(SYSCTL_HANDLER_ARGS) { uint_t val; int err; val = zfs_arc_free_target; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < minfree) return (EINVAL); if (val > vm_cnt.v_page_count) return (EINVAL); zfs_arc_free_target = val; return (0); } /* * NOTE: This sysctl is CTLFLAG_RW not CTLFLAG_RWTUN due to its dependency on * pagedaemon initialization. */ -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_free_target, "IU", "Desired number of free pages below which ARC triggers reclaim" " (LEGACY)"); -/* END CSTYLED */ int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { int err, val; val = arc_no_grow_shift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 0 || val >= arc_shrink_shift) return (EINVAL); arc_no_grow_shift = val; return (0); } -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, param_set_arc_no_grow_shift, "I", "log2(fraction of ARC which must be free to allow growing) (LEGACY)"); -/* END CSTYLED */ extern uint64_t l2arc_write_max; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RWTUN, &l2arc_write_max, 0, "Max write bytes per interval (LEGACY)"); -/* END CSTYLED */ extern uint64_t l2arc_write_boost; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RWTUN, &l2arc_write_boost, 0, "Extra write bytes during device warmup (LEGACY)"); -/* END CSTYLED */ extern uint64_t l2arc_headroom; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RWTUN, &l2arc_headroom, 0, "Number of max device writes to precache (LEGACY)"); -/* END CSTYLED */ extern uint64_t l2arc_headroom_boost; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom_boost, CTLFLAG_RWTUN, &l2arc_headroom_boost, 0, "Compressed l2arc_headroom multiplier (LEGACY)"); -/* END CSTYLED */ extern uint64_t l2arc_feed_secs; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RWTUN, &l2arc_feed_secs, 0, "Seconds between L2ARC writing (LEGACY)"); -/* END CSTYLED */ extern uint64_t l2arc_feed_min_ms; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RWTUN, &l2arc_feed_min_ms, 0, "Min feed interval in milliseconds (LEGACY)"); -/* END CSTYLED */ extern int l2arc_noprefetch; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RWTUN, &l2arc_noprefetch, 0, "Skip caching prefetched buffers (LEGACY)"); -/* END CSTYLED */ extern int l2arc_feed_again; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RWTUN, &l2arc_feed_again, 0, "Turbo L2ARC warmup (LEGACY)"); -/* END CSTYLED */ extern int l2arc_norw; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RWTUN, &l2arc_norw, 0, "No reads during writes (LEGACY)"); -/* END CSTYLED */ static int param_get_arc_state_size(SYSCTL_HANDLER_ARGS) { arc_state_t *state = (arc_state_t *)arg1; int64_t val; val = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]) + zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); return (sysctl_handle_64(oidp, &val, 0, req)); } extern arc_state_t ARC_anon; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_anon, 0, param_get_arc_state_size, "Q", "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in anonymous state"); -/* END CSTYLED */ extern arc_state_t ARC_mru; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mru, 0, param_get_arc_state_size, "Q", "size of mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mru state"); -/* END CSTYLED */ extern arc_state_t ARC_mru_ghost; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mru_ghost, 0, param_get_arc_state_size, "Q", "size of mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mru ghost state"); -/* END CSTYLED */ extern arc_state_t ARC_mfu; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mfu, 0, param_get_arc_state_size, "Q", "size of mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mfu state"); -/* END CSTYLED */ extern arc_state_t ARC_mfu_ghost; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_mfu_ghost, 0, param_get_arc_state_size, "Q", "size of mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in mfu ghost state"); -/* END CSTYLED */ extern arc_state_t ARC_uncached; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_uncached, 0, param_get_arc_state_size, "Q", "size of uncached state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD, &ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of evictable metadata in uncached state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD, &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of evictable data in uncached state"); -/* END CSTYLED */ extern arc_state_t ARC_l2c_only; -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size, CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &ARC_l2c_only, 0, param_get_arc_state_size, "Q", "size of l2c_only state"); -/* END CSTYLED */ /* dbuf.c */ /* dmu.c */ /* dmu_zfetch.c */ SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)"); extern uint32_t zfetch_max_distance; -/* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)"); -/* END CSTYLED */ extern uint32_t zfetch_max_idistance; -/* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream (LEGACY)"); -/* END CSTYLED */ /* dsl_pool.c */ /* dnode.c */ /* dsl_scan.c */ /* metaslab.c */ int param_set_active_allocator(SYSCTL_HANDLER_ARGS) { char buf[16]; int rc; if (req->newptr == NULL) strlcpy(buf, zfs_active_allocator, sizeof (buf)); rc = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (rc || req->newptr == NULL) return (rc); if (strcmp(buf, zfs_active_allocator) == 0) return (0); return (param_set_active_allocator_common(buf)); } /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered * around the disk. So a sane default for the space map block size * is 8~16K. */ extern int zfs_metaslab_sm_blksz_no_log; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_no_log, 0, "Block size for space map in pools with log space map disabled. " "Power of 2 greater than 4096."); -/* END CSTYLED */ /* * When the log space map feature is enabled, we accumulate a lot of * changes per metaslab that are flushed once in a while so we benefit * from a bigger block size like 128K for the metaslab space maps. */ extern int zfs_metaslab_sm_blksz_with_log; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_with_log, 0, "Block size for space map in pools with log space map enabled. " "Power of 2 greater than 4096."); -/* END CSTYLED */ /* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ extern uint_t zfs_condense_pct; -/* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, &zfs_condense_pct, 0, "Condense on-disk spacemap when it is more than this many percents" " of in-memory counterpart"); -/* END CSTYLED */ extern uint_t zfs_remove_max_segment; -/* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN, &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will attempt to allocate when removing" " a device"); -/* END CSTYLED */ extern int zfs_removal_suspend_progress; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0, "Ensures certain actions can happen while in the middle of a removal"); -/* END CSTYLED */ /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ extern uint64_t metaslab_df_alloc_threshold; -/* BEGIN CSTYLED */ SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, &metaslab_df_alloc_threshold, 0, "Minimum size which forces the dynamic allocator to change its" " allocation strategy"); -/* END CSTYLED */ /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ extern uint_t metaslab_df_free_pct; -/* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, &metaslab_df_free_pct, 0, "The minimum free space, in percent, which must be available in a" " space map to continue allocations in a first-fit fashion"); -/* END CSTYLED */ /* mmp.c */ int param_set_multihost_interval(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_64(oidp, &zfs_multihost_interval, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (spa_mode_global != SPA_MODE_UNINIT) mmp_signal_all_threads(); return (0); } /* spa.c */ extern int zfs_ccw_retry_interval; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN, &zfs_ccw_retry_interval, 0, "Configuration cache file write, retry after failure, interval" " (seconds)"); -/* END CSTYLED */ extern uint64_t zfs_max_missing_tvds_cachefile; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, &zfs_max_missing_tvds_cachefile, 0, "Allow importing pools with missing top-level vdevs in cache file"); -/* END CSTYLED */ extern uint64_t zfs_max_missing_tvds_scan; -/* BEGIN CSTYLED */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, &zfs_max_missing_tvds_scan, 0, "Allow importing pools with missing top-level vdevs during scan"); -/* END CSTYLED */ /* spa_misc.c */ extern int zfs_flags; static int sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) { int err, val; val = zfs_flags; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); /* * ZFS_DEBUG_MODIFY must be enabled prior to boot so all * arc buffers in the system have the necessary additional * checksum data. However, it is safe to disable at any * time. */ if (!(zfs_flags & ZFS_DEBUG_MODIFY)) val &= ~ZFS_DEBUG_MODIFY; zfs_flags = val; return (0); } -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0, sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); -/* END CSTYLED */ int param_set_deadman_synctime(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_deadman_synctime_ms; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); zfs_deadman_synctime_ms = val; spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms)); return (0); } int param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_deadman_ziotime_ms; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); zfs_deadman_ziotime_ms = val; spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_synctime_ms)); return (0); } int param_set_deadman_failmode(SYSCTL_HANDLER_ARGS) { char buf[16]; int rc; if (req->newptr == NULL) strlcpy(buf, zfs_deadman_failmode, sizeof (buf)); rc = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (rc || req->newptr == NULL) return (rc); if (strcmp(buf, zfs_deadman_failmode) == 0) return (0); if (strcmp(buf, "wait") == 0) zfs_deadman_failmode = "wait"; if (strcmp(buf, "continue") == 0) zfs_deadman_failmode = "continue"; if (strcmp(buf, "panic") == 0) zfs_deadman_failmode = "panic"; return (-param_set_deadman_failmode_common(buf)); } int param_set_slop_shift(SYSCTL_HANDLER_ARGS) { int val; int err; val = spa_slop_shift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 1 || val > 31) return (EINVAL); spa_slop_shift = val; return (0); } /* spacemap.c */ extern int space_map_ibs; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, &space_map_ibs, 0, "Space map indirect block shift"); -/* END CSTYLED */ /* vdev.c */ int param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) { int val; int err; val = zfs_vdev_min_auto_ashift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) return (SET_ERROR(EINVAL)); zfs_vdev_min_auto_ashift = val; return (0); } -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), param_set_min_auto_ashift, "IU", "Min ashift used when creating new top-level vdev. (LEGACY)"); -/* END CSTYLED */ int param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) { int val; int err; val = zfs_vdev_max_auto_ashift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) return (SET_ERROR(EINVAL)); zfs_vdev_max_auto_ashift = val; return (0); } -/* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift), param_set_max_auto_ashift, "IU", "Max ashift used when optimizing for logical -> physical sector size on" " new top-level vdevs. (LEGACY)"); -/* END CSTYLED */ /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. */ extern int zfs_vdev_dtl_sm_blksz; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_dtl_sm_blksz, 0, "Block size for DTL space map. Power of 2 greater than 4096."); -/* END CSTYLED */ /* * vdev-wide space maps that have lots of entries written to them at * the end of each transaction can benefit from a higher I/O bandwidth * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. */ extern int zfs_vdev_standard_sm_blksz; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0, "Block size for standard space map. Power of 2 greater than 4096."); -/* END CSTYLED */ extern int vdev_validate_skip; -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN, &vdev_validate_skip, 0, "Enable to bypass vdev_validate()."); -/* END CSTYLED */ /* vdev_mirror.c */ /* vdev_queue.c */ extern uint_t zfs_vdev_max_active; -/* BEGIN CSTYLED */ SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, "The maximum number of I/Os of all types active for each device." " (LEGACY)"); -/* END CSTYLED */ /* zio.c */ -/* BEGIN CSTYLED */ SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well"); -/* END CSTYLED */ diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c index feaca93fb933..195ac58f6f1a 100644 --- a/module/os/freebsd/zfs/zio_crypt.c +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -1,1830 +1,1829 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2017, Datto, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * This file is responsible for handling all of the details of generating * encryption parameters and performing encryption and authentication. * * BLOCK ENCRYPTION PARAMETERS: * Encryption /Authentication Algorithm Suite (crypt): * The encryption algorithm, mode, and key length we are going to use. We * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit * keys. All authentication is currently done with SHA512-HMAC. * * Plaintext: * The unencrypted data that we want to encrypt. * * Initialization Vector (IV): * An initialization vector for the encryption algorithms. This is used to * "tweak" the encryption algorithms so that two blocks of the same data are * encrypted into different ciphertext outputs, thus obfuscating block patterns. * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is * never reused with the same encryption key. This value is stored unencrypted * and must simply be provided to the decryption function. We use a 96 bit IV * (as recommended by NIST) for all block encryption. For non-dedup blocks we * derive the IV randomly. The first 64 bits of the IV are stored in the second * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of * level 0 blocks is the number of allocated dnodes in that block. The on-disk * format supports at most 2^15 slots per L0 dnode block, because the maximum * block size is 16MB (2^24). In either case, for level 0 blocks this number * will still be smaller than UINT32_MAX so it is safe to store the IV in the * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count * for the dnode code. * * Master key: * This is the most important secret data of an encrypted dataset. It is used * along with the salt to generate that actual encryption keys via HKDF. We * do not use the master key to directly encrypt any data because there are * theoretical limits on how much data can actually be safely encrypted with * any encryption mode. The master key is stored encrypted on disk with the * user's wrapping key. Its length is determined by the encryption algorithm. * For details on how this is stored see the block comment in dsl_crypt.c * * Salt: * Used as an input to the HKDF function, along with the master key. We use a * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt * can be used for encrypting many blocks, so we cache the current salt and the * associated derived key in zio_crypt_t so we do not need to derive it again * needlessly. * * Encryption Key: * A secret binary key, generated from an HKDF function used to encrypt and * decrypt data. * * Message Authentication Code (MAC) * The MAC is an output of authenticated encryption modes such as AES-GCM and * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted * data on disk and return garbage to the application. Effectively, it is a * checksum that can not be reproduced by an attacker. We store the MAC in the * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated * regular checksum of the ciphertext which can be used for scrubbing. * * OBJECT AUTHENTICATION: * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because * they contain some info that always needs to be readable. To prevent this * data from being altered, we authenticate this data using SHA512-HMAC. This * will produce a MAC (similar to the one produced via encryption) which can * be used to verify the object was not modified. HMACs do not require key * rotation or IVs, so we can keep up to the full 3 copies of authenticated * data. * * ZIL ENCRYPTION: * ZIL blocks have their bp written to disk ahead of the associated data, so we * cannot store the MAC there as we normally do. For these blocks the MAC is * stored in the embedded checksum within the zil_chain_t header. The salt and * IV are generated for the block on bp allocation instead of at encryption * time. In addition, ZIL blocks have some pieces that must be left in plaintext * for claiming even though all of the sensitive user data still needs to be * encrypted. The function zio_crypt_init_uios_zil() handles parsing which * pieces of the block need to be encrypted. All data that is not encrypted is * authenticated using the AAD mechanisms that the supported encryption modes * provide for. In order to preserve the semantics of the ZIL for encrypted * datasets, the ZIL is not protected at the objset level as described below. * * DNODE ENCRYPTION: * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left * in plaintext for scrubbing and claiming, but the bonus buffers might contain * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing * which pieces of the block need to be encrypted. For more details about * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). * * OBJECT SET AUTHENTICATION: * Up to this point, everything we have encrypted and authenticated has been * at level 0 (or -2 for the ZIL). If we did not do any further work the * on-disk format would be susceptible to attacks that deleted or rearranged * the order of level 0 blocks. Ideally, the cleanest solution would be to * maintain a tree of authentication MACs going up the bp tree. However, this * presents a problem for raw sends. Send files do not send information about * indirect blocks so there would be no convenient way to transfer the MACs and * they cannot be recalculated on the receive side without the master key which * would defeat one of the purposes of raw sends in the first place. Instead, * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs * from the level below. We also include some portable fields from blk_prop such * as the lsize and compression algorithm to prevent the data from being * misinterpreted. * * At the objset level, we maintain 2 separate 256 bit MACs in the * objset_phys_t. The first one is "portable" and is the logical root of the * MAC tree maintained in the metadnode's bps. The second, is "local" and is * used as the root MAC for the user accounting objects, which are also not * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload * of the send file. The useraccounting code ensures that the useraccounting * info is not present upon a receive, so the local MAC can simply be cleared * out at that time. For more info about objset_phys_t authentication, see * zio_crypt_do_objset_hmacs(). * * CONSIDERATIONS FOR DEDUP: * In order for dedup to work, blocks that we want to dedup with one another * need to use the same IV and encryption key, so that they will have the same * ciphertext. Normally, one should never reuse an IV with the same encryption * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both * blocks. In this case, however, since we are using the same plaintext as * well all that we end up with is a duplicate of the original ciphertext we * already had. As a result, an attacker with read access to the raw disk will * be able to tell which blocks are the same but this information is given away * by dedup anyway. In order to get the same IVs and encryption keys for * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC * here so that a reproducible checksum of the plaintext is never available to * the attacker. The HMAC key is kept alongside the master key, encrypted on * disk. The first 64 bits of the HMAC are used in place of the random salt, and * the next 96 bits are used as the IV. As a result of this mechanism, dedup * will only work within a clone family since encrypted dedup requires use of * the same master and HMAC keys. */ /* * After encrypting many blocks with the same key we may start to run up * against the theoretical limits of how much data can securely be encrypted * with a single key using the supported encryption modes. The most obvious * limitation is that our risk of generating 2 equivalent 96 bit IVs increases * the more IVs we generate (which both GCM and CCM modes strictly forbid). * This risk actually grows surprisingly quickly over time according to the * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have * generated n IVs with a cryptographically secure RNG, the approximate * probability p(n) of a collision is given as: * * p(n) ~= e^(-n*(n-1)/(2*(2^96))) * * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] * * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion * we must not write more than 398,065,730 blocks with the same encryption key. * Therefore, we rotate our keys after 400,000,000 blocks have been written by * generating a new random 64 bit salt for our HKDF encryption key generation * function. */ #define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 #define ZFS_CURRENT_MAX_SALT_USES \ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) static unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; typedef struct blkptr_auth_buf { uint64_t bab_prop; /* blk_prop - portable mask */ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ uint64_t bab_pad; /* reserved for future use */ } blkptr_auth_buf_t; const zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { {"", ZC_TYPE_NONE, 0, "inherit"}, {"", ZC_TYPE_NONE, 0, "on"}, {"", ZC_TYPE_NONE, 0, "off"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} }; static void zio_crypt_key_destroy_early(zio_crypt_key_t *key) { rw_destroy(&key->zk_salt_lock); /* free crypto templates */ memset(&key->zk_session, 0, sizeof (key->zk_session)); /* zero out sensitive data */ memset(key, 0, sizeof (zio_crypt_key_t)); } void zio_crypt_key_destroy(zio_crypt_key_t *key) { freebsd_crypt_freesession(&key->zk_session); zio_crypt_key_destroy_early(key); } int zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) { int ret; crypto_mechanism_t mech __unused; uint_t keydata_len; const zio_crypt_info_t *ci = NULL; ASSERT3P(key, !=, NULL); ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); ci = &zio_crypt_table[crypt]; if (ci->ci_crypt_type != ZC_TYPE_GCM && ci->ci_crypt_type != ZC_TYPE_CCM) return (ENOTSUP); keydata_len = zio_crypt_table[crypt].ci_keylen; memset(key, 0, sizeof (zio_crypt_key_t)); rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); /* fill keydata buffers and salt with random data */ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); if (ret != 0) goto error; ret = random_get_bytes(key->zk_master_keydata, keydata_len); if (ret != 0) goto error; ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); if (ret != 0) goto error; ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for the ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = &key->zk_hmac_key; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); ci = &zio_crypt_table[crypt]; if (ci->ci_crypt_type != ZC_TYPE_GCM && ci->ci_crypt_type != ZC_TYPE_CCM) return (ENOTSUP); ret = freebsd_crypt_newsession(&key->zk_session, ci, &key->zk_current_key); if (ret) goto error; key->zk_crypt = crypt; key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy_early(key); return (ret); } static int zio_crypt_key_change_salt(zio_crypt_key_t *key) { int ret = 0; uint8_t salt[ZIO_DATA_SALT_LEN]; crypto_mechanism_t mech __unused; uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; /* generate a new salt */ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; rw_enter(&key->zk_salt_lock, RW_WRITER); /* someone beat us to the salt rotation, just unlock and return */ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) goto out_unlock; /* derive the current key from the master key and the new salt */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto out_unlock; /* assign the salt and reset the usage count */ memcpy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); key->zk_salt_count = 0; freebsd_crypt_freesession(&key->zk_session); ret = freebsd_crypt_newsession(&key->zk_session, &zio_crypt_table[key->zk_crypt], &key->zk_current_key); if (ret != 0) goto out_unlock; rw_exit(&key->zk_salt_lock); return (0); out_unlock: rw_exit(&key->zk_salt_lock); error: return (ret); } /* See comment above zfs_key_max_salt_uses definition for details */ int zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) { int ret; boolean_t salt_change; rw_enter(&key->zk_salt_lock, RW_READER); memcpy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= ZFS_CURRENT_MAX_SALT_USES); rw_exit(&key->zk_salt_lock); if (salt_change) { ret = zio_crypt_key_change_salt(key); if (ret != 0) goto error; } return (0); error: return (ret); } void *failed_decrypt_buf; int failed_decrypt_size; /* * This function handles all encryption and decryption in zfs. When * encrypting it expects puio to reference the plaintext and cuio to * reference the ciphertext. cuio must have enough space for the * ciphertext + room for a MAC. datalen should be the length of the * plaintext / ciphertext alone. */ /* * The implementation for FreeBSD's OpenCrypto. * * The big difference between ICP and FOC is that FOC uses a single * buffer for input and output. This means that (for AES-GCM, the * only one supported right now) the source must be copied into the * destination, and the destination must have the AAD, and the tag/MAC, * already associated with it. (Both implementations can use a uio.) * * Since the auth data is part of the iovec array, all we need to know * is the length: 0 means there's no AAD. * */ static int zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess, uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen, zfs_uio_t *uio, uint_t auth_len) { const zio_crypt_info_t *ci = &zio_crypt_table[crypt]; if (ci->ci_crypt_type != ZC_TYPE_GCM && ci->ci_crypt_type != ZC_TYPE_CCM) return (ENOTSUP); int ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf, datalen, auth_len); if (ret != 0) { #ifdef FCRYPTO_DEBUG printf("%s(%d): Returning error %s\n", __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM"); #endif ret = SET_ERROR(encrypt ? EIO : ECKSUM); } return (ret); } int zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) { int ret; uint64_t aad[3]; /* * With OpenCrypto in FreeBSD, the same buffer is used for * input and output. Also, the AAD (for AES-GMC at least) * needs to logically go in front. */ zfs_uio_t cuio; struct uio cuio_s; iovec_t iovecs[4]; uint64_t crypt = key->zk_crypt; uint_t enc_len, keydata_len, aad_len; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); zfs_uio_init(&cuio, &cuio_s); keydata_len = zio_crypt_table[crypt].ci_keylen; /* generate iv for wrapping the master and hmac key */ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); if (ret != 0) goto error; /* * Since we only support one buffer, we need to copy * the plain text (source) to the cipher buffer (dest). * We set iovecs[0] -- the authentication data -- below. */ memcpy(keydata_out, key->zk_master_keydata, keydata_len); memcpy(hmac_keydata_out, key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); iovecs[1].iov_base = keydata_out; iovecs[1].iov_len = keydata_len; iovecs[2].iov_base = hmac_keydata_out; iovecs[2].iov_len = SHA512_HMAC_KEYLEN; iovecs[3].iov_base = mac; iovecs[3].iov_len = WRAPPING_MAC_LEN; /* * Although we don't support writing to the old format, we do * support rewrapping the key so that the user can move and * quarantine datasets on the old format. */ if (key->zk_version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(key->zk_guid); } else { ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(key->zk_guid); aad[1] = LE_64(crypt); aad[2] = LE_64(key->zk_version); } iovecs[0].iov_base = aad; iovecs[0].iov_len = aad_len; enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; GET_UIO_STRUCT(&cuio)->uio_iov = iovecs; zfs_uio_iovcnt(&cuio) = 4; zfs_uio_segflg(&cuio) = UIO_SYSSPACE; /* encrypt the keys and store the resulting ciphertext and mac */ ret = zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey, iv, enc_len, &cuio, aad_len); if (ret != 0) goto error; return (0); error: return (ret); } int zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, uint8_t *mac, zio_crypt_key_t *key) { int ret; uint64_t aad[3]; /* * With OpenCrypto in FreeBSD, the same buffer is used for * input and output. Also, the AAD (for AES-GMC at least) * needs to logically go in front. */ zfs_uio_t cuio; struct uio cuio_s; iovec_t iovecs[4]; void *src, *dst; uint_t enc_len, keydata_len, aad_len; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); keydata_len = zio_crypt_table[crypt].ci_keylen; rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); zfs_uio_init(&cuio, &cuio_s); /* * Since we only support one buffer, we need to copy * the encrypted buffer (source) to the plain buffer * (dest). We set iovecs[0] -- the authentication data -- * below. */ dst = key->zk_master_keydata; src = keydata; memcpy(dst, src, keydata_len); dst = key->zk_hmac_keydata; src = hmac_keydata; memcpy(dst, src, SHA512_HMAC_KEYLEN); iovecs[1].iov_base = key->zk_master_keydata; iovecs[1].iov_len = keydata_len; iovecs[2].iov_base = key->zk_hmac_keydata; iovecs[2].iov_len = SHA512_HMAC_KEYLEN; iovecs[3].iov_base = mac; iovecs[3].iov_len = WRAPPING_MAC_LEN; if (version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(guid); } else { ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(guid); aad[1] = LE_64(crypt); aad[2] = LE_64(version); } enc_len = keydata_len + SHA512_HMAC_KEYLEN; iovecs[0].iov_base = aad; iovecs[0].iov_len = aad_len; GET_UIO_STRUCT(&cuio)->uio_iov = iovecs; zfs_uio_iovcnt(&cuio) = 4; zfs_uio_segflg(&cuio) = UIO_SYSSPACE; /* decrypt the keys and store the result in the output buffers */ ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey, iv, enc_len, &cuio, aad_len); if (ret != 0) goto error; /* generate a fresh salt */ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = key->zk_hmac_keydata; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); ret = freebsd_crypt_newsession(&key->zk_session, &zio_crypt_table[crypt], &key->zk_current_key); if (ret != 0) goto error; key->zk_crypt = crypt; key->zk_version = version; key->zk_guid = guid; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy_early(key); return (ret); } int zio_crypt_generate_iv(uint8_t *ivbuf) { int ret; /* randomly generate the IV */ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); if (ret != 0) goto error; return (0); error: memset(ivbuf, 0, ZIO_DATA_IV_LEN); return (ret); } int zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *digestbuf, uint_t digestlen) { uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); crypto_mac(&key->zk_hmac_key, data, datalen, raw_digestbuf, SHA512_DIGEST_LENGTH); memcpy(digestbuf, raw_digestbuf, digestlen); return (0); } int zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *ivbuf, uint8_t *salt) { int ret; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; ret = zio_crypt_do_hmac(key, data, datalen, digestbuf, SHA512_DIGEST_LENGTH); if (ret != 0) return (ret); memcpy(salt, digestbuf, ZIO_DATA_SALT_LEN); memcpy(ivbuf, digestbuf + ZIO_DATA_SALT_LEN, ZIO_DATA_IV_LEN); return (0); } /* * The following functions are used to encode and decode encryption parameters * into blkptr_t and zil_header_t. The ICP wants to use these parameters as * byte strings, which normally means that these strings would not need to deal * with byteswapping at all. However, both blkptr_t and zil_header_t may be * byteswapped by lower layers and so we must "undo" that byteswap here upon * decoding and encoding in a non-native byteorder. These functions require * that the byteorder bit is correct before being called. */ void zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_ENCRYPTED(bp)); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); memcpy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, val32); } else { memcpy(&val64, salt, sizeof (uint64_t)); bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); memcpy(&val64, iv, sizeof (uint64_t)); bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, BSWAP_32(val32)); } } void zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_PROTECTED(bp)); /* for convenience, so callers don't need to check */ if (BP_IS_AUTHENTICATED(bp)) { memset(salt, 0, ZIO_DATA_SALT_LEN); memset(iv, 0, ZIO_DATA_IV_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); memcpy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); val32 = (uint32_t)BP_GET_IV2(bp); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } else { val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); memcpy(salt, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); memcpy(iv, &val64, sizeof (uint64_t)); val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } } void zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } else { memcpy(&val64, mac, sizeof (uint64_t)); bp->blk_cksum.zc_word[2] = BSWAP_64(val64); memcpy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); bp->blk_cksum.zc_word[3] = BSWAP_64(val64); } } void zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); /* for convenience, so callers don't need to check */ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { memset(mac, 0, ZIO_DATA_MAC_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], sizeof (uint64_t)); } else { val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); memcpy(mac, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); memcpy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); } } void zio_crypt_encode_mac_zil(void *data, uint8_t *mac) { zil_chain_t *zilc = data; memcpy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } void zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) { /* * The ZIL MAC is embedded in the block it protects, which will * not have been byteswapped by the time this function has been called. * As a result, we don't need to worry about byteswapping the MAC. */ const zil_chain_t *zilc = data; memcpy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], sizeof (uint64_t)); } /* * This routine takes a block of dnodes (src_abd) and copies only the bonus * buffers to the same offsets in the dst buffer. datalen should be the size * of both the src_abd and the dst buffer (not just the length of the bonus * buffers). */ void zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) { uint_t i, max_dnp = datalen >> DNODE_SHIFT; uint8_t *src; dnode_phys_t *dnp, *sdnp, *ddnp; src = abd_borrow_buf_copy(src_abd, datalen); sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), DN_MAX_BONUS_LEN(dnp)); } } abd_return_buf(src_abd, src, datalen); } /* * This function decides what fields from blk_prop are included in * the on-disk various MAC algorithms. */ static void zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) { int avoidlint = SPA_MINBLOCKSIZE; /* * Version 0 did not properly zero out all non-portable fields * as it should have done. We maintain this code so that we can * do read-only imports of pools on this version. */ if (version == 0) { BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); BP_SET_PSIZE(bp, avoidlint); return; } ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); /* * The hole_birth feature might set these fields even if this bp * is a hole. We zero them out here to guarantee that raw sends * will function with or without the feature. */ if (BP_IS_HOLE(bp)) { bp->blk_prop = 0ULL; return; } /* * At L0 we want to verify these fields to ensure that data blocks * can not be reinterpreted. For instance, we do not want an attacker * to trick us into returning raw lz4 compressed data to the user * by modifying the compression bits. At higher levels, we cannot * enforce this policy since raw sends do not convey any information * about indirect blocks, so these values might be different on the * receive side. Fortunately, this does not open any new attack * vectors, since any alterations that can be made to a higher level * bp must still verify the correct order of the layer below it. */ if (BP_GET_LEVEL(bp) != 0) { BP_SET_BYTEORDER(bp, 0); BP_SET_COMPRESS(bp, 0); /* * psize cannot be set to zero or it will trigger * asserts, but the value doesn't really matter as * long as it is constant. */ BP_SET_PSIZE(bp, avoidlint); } BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); } static void zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, blkptr_auth_buf_t *bab, uint_t *bab_len) { blkptr_t tmpbp = *bp; if (should_bswap) byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); ASSERT0(BP_IS_EMBEDDED(&tmpbp)); zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); /* * We always MAC blk_prop in LE to ensure portability. This * must be done after decoding the mac, since the endianness * will get zero'd out here. */ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); bab->bab_prop = LE_64(tmpbp.blk_prop); bab->bab_pad = 0ULL; /* version 0 did not include the padding */ *bab_len = sizeof (blkptr_auth_buf_t); if (version == 0) *bab_len -= sizeof (uint64_t); } static int zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); crypto_mac_update(ctx, &bab, bab_len); return (0); } static void zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); SHA2Update(ctx, &bab, bab_len); } static void zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); memcpy(*aadp, &bab, bab_len); *aadp += bab_len; *aad_len += bab_len; } static int zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, dnode_phys_t *dnp) { int ret, i; dnode_phys_t *adnp; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; /* authenticate the core dnode (masking out non-portable bits) */ memcpy(tmp_dncore, dnp, sizeof (tmp_dncore)); adnp = (dnode_phys_t *)tmp_dncore; if (le_bswap) { adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); adnp->dn_used = BSWAP_64(adnp->dn_used); } adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; crypto_mac_update(ctx, adnp, sizeof (tmp_dncore)); for (i = 0; i < dnp->dn_nblkptr; i++) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, &dnp->dn_blkptr[i]); if (ret != 0) goto error; } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, DN_SPILL_BLKPTR(dnp)); if (ret != 0) goto error; } return (0); error: return (ret); } /* * objset_phys_t blocks introduce a number of exceptions to the normal * authentication process. objset_phys_t's contain 2 separate HMACS for * protecting the integrity of their data. The portable_mac protects the * metadnode. This MAC can be sent with a raw send and protects against * reordering of data within the metadnode. The local_mac protects the user * accounting objects which are not sent from one system to another. * * In addition, objset blocks are the only blocks that can be modified and * written to disk without the key loaded under certain circumstances. During * zil_claim() we need to be able to update the zil_header_t to complete * claiming log blocks and during raw receives we need to write out the * portable_mac from the send file. Both of these actions are possible * because these fields are not protected by either MAC so neither one will * need to modify the MACs without the key. However, when the modified blocks * are written out they will be byteswapped into the host machine's native * endianness which will modify fields protected by the MAC. As a result, MAC * calculation for objset blocks works slightly differently from other block * types. Where other block types MAC the data in whatever endianness is * written to disk, objset blocks always MAC little endian version of their * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() * and le_bswap indicates whether a byteswap is needed to get this block * into little endian format. */ int zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) { int ret; struct hmac_ctx hash_ctx; struct hmac_ctx *ctx = &hash_ctx; objset_phys_t *osp = data; uint64_t intval; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; /* calculate the portable MAC from the portable fields and metadnode */ crypto_mac_init(ctx, &key->zk_hmac_key); /* add in the os_type */ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); crypto_mac_update(ctx, &intval, sizeof (uint64_t)); /* add in the portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); crypto_mac_update(ctx, &intval, sizeof (uint64_t)); /* add in fields from the metadnode */ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_meta_dnode); if (ret) goto error; crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH); memcpy(portable_mac, raw_portable_mac, ZIO_OBJSET_MAC_LEN); /* * This is necessary here as we check next whether * OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in order to * decide if the local_mac should be zeroed out. That flag will always * be set by dmu_objset_id_quota_upgrade_cb() and * dmu_objset_userspace_upgrade_cb() if useraccounting has been * completed. */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); boolean_t uacct_incomplete = !(intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE); /* * The local MAC protects the user, group and project accounting. * If these objects are not present, the local MAC is zeroed out. */ if (uacct_incomplete || (datalen >= OBJSET_PHYS_SIZE_V3 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE && osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || (datalen >= OBJSET_PHYS_SIZE_V2 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || (datalen <= OBJSET_PHYS_SIZE_V1)) { memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (0); } /* calculate the local MAC from the userused and groupused dnodes */ crypto_mac_init(ctx, &key->zk_hmac_key); /* add in the non-portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); crypto_mac_update(ctx, &intval, sizeof (uint64_t)); /* XXX check dnode type ... */ /* add in fields from the user accounting dnodes */ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_userused_dnode); if (ret) goto error; } if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_groupused_dnode); if (ret) goto error; } if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && datalen >= OBJSET_PHYS_SIZE_V3) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_projectused_dnode); if (ret) goto error; } crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH); memcpy(local_mac, raw_local_mac, ZIO_OBJSET_MAC_LEN); return (0); error: memset(portable_mac, 0, ZIO_OBJSET_MAC_LEN); memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (ret); } static void zio_crypt_destroy_uio(zfs_uio_t *uio) { if (GET_UIO_STRUCT(uio)->uio_iov) kmem_free(GET_UIO_STRUCT(uio)->uio_iov, zfs_uio_iovcnt(uio) * sizeof (iovec_t)); } /* * This function parses an uncompressed indirect block and returns a checksum * of all the portable fields from all of the contained bps. The portable * fields are the MAC and all of the fields from blk_prop except for the dedup, * checksum, and psize bits. For an explanation of the purpose of this, see * the comment block on object set authentication. */ static int zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) { blkptr_t *bp; int i, epb = datalen >> SPA_BLKPTRSHIFT; SHA2_CTX ctx; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; /* checksum all of the MACs from the layer below */ SHA2Init(SHA512, &ctx); for (i = 0, bp = buf; i < epb; i++, bp++) { zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, byteswap, bp); } SHA2Final(digestbuf, &ctx); if (generate) { memcpy(cksum, digestbuf, ZIO_DATA_MAC_LEN); return (0); } if (memcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) { #ifdef FCRYPTO_DEBUG printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__); #endif return (SET_ERROR(ECKSUM)); } return (0); } int zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; /* * Unfortunately, callers of this function will not always have * easy access to the on-disk format version. This info is * normally found in the DSL Crypto Key, but the checksum-of-MACs * is expected to be verifiable even when the key isn't loaded. * Here, instead of doing a ZAP lookup for the version for each * zio, we simply try both existing formats. */ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); if (ret == ECKSUM) { ASSERT(!generate); ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, 0, byteswap, cksum); } return (ret); } int zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; void *buf; buf = abd_borrow_buf_copy(abd, datalen); ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, byteswap, cksum); abd_return_buf(abd, buf, datalen); return (ret); } /* * Special case handling routine for encrypting / decrypting ZIL blocks. * We do not check for the older ZIL chain because the encryption feature * was not available before the newer ZIL chain was introduced. The goal * here is to encrypt everything except the blkptr_t of a lr_write_t and * the zil_chain_t header. Everything that is not encrypted is authenticated. */ /* * The OpenCrypto used in FreeBSD does not use separate source and * destination buffers; instead, the same buffer is used. Further, to * accommodate some of the drivers, the authbuf needs to be logically before * the data. This means that we need to copy the source to the destination, * and set up an extra iovec_t at the beginning to handle the authbuf. * It also means we'll only return one zfs_uio_t. */ static int zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { (void) puio; uint8_t *aadbuf = zio_buf_alloc(datalen); uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; iovec_t *dst_iovecs; zil_chain_t *zilc; lr_t *lr; uint64_t txtype, lr_len, nused; uint_t crypt_len, nr_iovecs, vec; uint_t aad_len = 0, total_len = 0; if (encrypt) { src = plainbuf; dst = cipherbuf; } else { src = cipherbuf; dst = plainbuf; } memcpy(dst, src, datalen); /* Find the start and end record of the log block. */ zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); ASSERT3U(nused, >=, sizeof (zil_chain_t)); ASSERT3U(nused, <=, datalen); blkend = src + nused; /* * Calculate the number of encrypted iovecs we will need. */ /* We need at least two iovecs -- one for the AAD, one for the MAC. */ nr_iovecs = 2; for (; slrp < blkend; slrp += lr_len) { lr = (lr_t *)slrp; if (byteswap) { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } else { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } ASSERT3U(lr_len, >=, sizeof (lr_t)); ASSERT3U(lr_len, <=, blkend - slrp); nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) nr_iovecs++; } dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP); /* * Copy the plain zil header over and authenticate everything except * the checksum that will store our MAC. If we are writing the data * the embedded checksum will not have been calculated yet, so we don't * authenticate that. */ memcpy(aadp, src, sizeof (zil_chain_t) - sizeof (zio_eck_t)); aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); slrp = src + sizeof (zil_chain_t); dlrp = dst + sizeof (zil_chain_t); /* * Loop over records again, filling in iovecs. */ /* The first iovec will contain the authbuf. */ vec = 1; for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } /* copy the common lr_t */ memcpy(dlrp, slrp, sizeof (lr_t)); memcpy(aadp, slrp, sizeof (lr_t)); aadp += sizeof (lr_t); aad_len += sizeof (lr_t); /* * If this is a TX_WRITE record we want to encrypt everything * except the bp if exists. If the bp does exist we want to * authenticate it. */ if (txtype == TX_WRITE) { const size_t o = offsetof(lr_write_t, lr_blkptr); crypt_len = o - sizeof (lr_t); dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); dst_iovecs[vec].iov_len = crypt_len; /* copy the bp now since it will not be encrypted */ memcpy(dlrp + o, slrp + o, sizeof (blkptr_t)); memcpy(aadp, slrp + o, sizeof (blkptr_t)); aadp += sizeof (blkptr_t); aad_len += sizeof (blkptr_t); vec++; total_len += crypt_len; if (lr_len != sizeof (lr_write_t)) { crypt_len = lr_len - sizeof (lr_write_t); dst_iovecs[vec].iov_base = (char *) dlrp + sizeof (lr_write_t); dst_iovecs[vec].iov_len = crypt_len; vec++; total_len += crypt_len; } } else if (txtype == TX_CLONE_RANGE) { const size_t o = offsetof(lr_clone_range_t, lr_nbps); crypt_len = o - sizeof (lr_t); dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); dst_iovecs[vec].iov_len = crypt_len; /* copy the bps now since they will not be encrypted */ memcpy(dlrp + o, slrp + o, lr_len - o); memcpy(aadp, slrp + o, lr_len - o); aadp += lr_len - o; aad_len += lr_len - o; vec++; total_len += crypt_len; } else { crypt_len = lr_len - sizeof (lr_t); dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); dst_iovecs[vec].iov_len = crypt_len; vec++; total_len += crypt_len; } } /* The last iovec will contain the MAC. */ ASSERT3U(vec, ==, nr_iovecs - 1); /* AAD */ dst_iovecs[0].iov_base = aadbuf; dst_iovecs[0].iov_len = aad_len; /* MAC */ dst_iovecs[vec].iov_base = 0; dst_iovecs[vec].iov_len = 0; *no_crypt = (vec == 1); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs; zfs_uio_iovcnt(out_uio) = nr_iovecs; return (0); } /* * Special case handling routine for encrypting / decrypting dnode blocks. */ static int zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { uint8_t *aadbuf = zio_buf_alloc(datalen); uint8_t *src, *dst, *aadp; dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; iovec_t *dst_iovecs; uint_t nr_iovecs, crypt_len, vec; uint_t aad_len = 0, total_len = 0; uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; if (encrypt) { src = plainbuf; dst = cipherbuf; } else { src = cipherbuf; dst = plainbuf; } memcpy(dst, src, datalen); sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; aadp = aadbuf; /* * Count the number of iovecs we will need to do the encryption by * counting the number of bonus buffers that need to be encrypted. */ /* We need at least two iovecs -- one for the AAD, one for the MAC. */ nr_iovecs = 2; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { /* * This block may still be byteswapped. However, all of the * values we use are either uint8_t's (for which byteswapping * is a noop) or a * != 0 check, which will work regardless * of whether or not we byteswap. */ if (sdnp[i].dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && sdnp[i].dn_bonuslen != 0) { nr_iovecs++; } } dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP); /* * Iterate through the dnodes again, this time filling in the uios * we allocated earlier. We also concatenate any data we want to * authenticate onto aadbuf. */ /* The first iovec will contain the authbuf. */ vec = 1; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; /* copy over the core fields and blkptrs (kept as plaintext) */ memcpy(&ddnp[i], dnp, (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { memcpy(DN_SPILL_BLKPTR(&ddnp[i]), DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); } /* * Handle authenticated data. We authenticate everything in * the dnode that can be brought over when we do a raw send. * This includes all of the core fields as well as the MACs * stored in the bp checksums and all of the portable bits * from blk_prop. We include the dnode padding here in case it * ever gets used in the future. Some dn_flags and dn_used are * not portable so we mask those out values out of the * authenticated data. */ crypt_len = offsetof(dnode_phys_t, dn_blkptr); memcpy(aadp, dnp, crypt_len); adnp = (dnode_phys_t *)aadp; adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; aadp += crypt_len; aad_len += crypt_len; for (j = 0; j < dnp->dn_nblkptr; j++) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, &dnp->dn_blkptr[j]); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, DN_SPILL_BLKPTR(dnp)); } /* * If this bonus buffer needs to be encrypted, we prepare an * iovec_t. The encryption / decryption functions will fill * this in for us with the encrypted or decrypted data. * Otherwise we add the bonus buffer to the authenticated * data buffer and copy it over to the destination. The * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that * we can guarantee alignment with the AES block size * (128 bits). */ crypt_len = DN_MAX_BONUS_LEN(dnp); if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { dst_iovecs[vec].iov_base = DN_BONUS(&ddnp[i]); dst_iovecs[vec].iov_len = crypt_len; vec++; total_len += crypt_len; } else { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), crypt_len); memcpy(aadp, DN_BONUS(dnp), crypt_len); aadp += crypt_len; aad_len += crypt_len; } } /* The last iovec will contain the MAC. */ ASSERT3U(vec, ==, nr_iovecs - 1); /* AAD */ dst_iovecs[0].iov_base = aadbuf; dst_iovecs[0].iov_len = aad_len; /* MAC */ dst_iovecs[vec].iov_base = 0; dst_iovecs[vec].iov_len = 0; *no_crypt = (vec == 1); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs; zfs_uio_iovcnt(out_uio) = nr_iovecs; return (0); } static int zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len) { (void) puio; int ret; uint_t nr_plain = 1, nr_cipher = 2; iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; void *src, *dst; cipher_iovecs = kmem_zalloc(nr_cipher * sizeof (iovec_t), KM_SLEEP); if (!cipher_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } if (encrypt) { src = plainbuf; dst = cipherbuf; } else { src = cipherbuf; dst = plainbuf; } memcpy(dst, src, datalen); cipher_iovecs[0].iov_base = dst; cipher_iovecs[0].iov_len = datalen; *enc_len = datalen; GET_UIO_STRUCT(out_uio)->uio_iov = cipher_iovecs; zfs_uio_iovcnt(out_uio) = nr_cipher; return (0); error: if (plain_iovecs != NULL) kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); if (cipher_iovecs != NULL) kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); *enc_len = 0; GET_UIO_STRUCT(out_uio)->uio_iov = NULL; zfs_uio_iovcnt(out_uio) = 0; return (ret); } /* * This function builds up the plaintext (puio) and ciphertext (cuio) uios so * that they can be used for encryption and decryption by zio_do_crypt_uio(). * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks * requiring special handling to parse out pieces that are to be encrypted. The * authbuf is used by these special cases to store additional authenticated * data (AAD) for the encryption modes. */ static int zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; iovec_t *mac_iov; ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); /* route to handler */ switch (ot) { case DMU_OT_INTENT_LOG: ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; case DMU_OT_DNODE: ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; default: ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, datalen, puio, cuio, enc_len); *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; break; } if (ret != 0) goto error; /* populate the uios */ zfs_uio_segflg(cuio) = UIO_SYSSPACE; mac_iov = ((iovec_t *)&(GET_UIO_STRUCT(cuio)-> uio_iov[zfs_uio_iovcnt(cuio) - 1])); mac_iov->iov_base = (void *)mac; mac_iov->iov_len = ZIO_DATA_MAC_LEN; return (0); error: return (ret); } void *failed_decrypt_buf; int faile_decrypt_size; /* * Primary encryption / decryption entrypoint for zio data. */ int zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, boolean_t *no_crypt) { int ret; boolean_t locked = B_FALSE; uint64_t crypt = key->zk_crypt; uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; uint_t enc_len, auth_len; zfs_uio_t puio, cuio; struct uio puio_s, cuio_s; uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; crypto_key_t tmp_ckey, *ckey = NULL; freebsd_crypt_session_t *tmpl = NULL; uint8_t *authbuf = NULL; memset(&puio_s, 0, sizeof (puio_s)); memset(&cuio_s, 0, sizeof (cuio_s)); zfs_uio_init(&puio, &puio_s); zfs_uio_init(&cuio, &cuio_s); #ifdef FCRYPTO_DEBUG printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n", __FUNCTION__, encrypt ? "encrypt" : "decrypt", key, salt, ot, iv, mac, datalen, byteswap ? "byteswap" : "native_endian", plainbuf, cipherbuf, no_crypt); printf("\tkey = {"); for (int i = 0; i < key->zk_current_key.ck_length/8; i++) printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]); printf("}\n"); #endif /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, &authbuf, &auth_len, no_crypt); if (ret != 0) return (ret); /* * If the needed key is the current one, just use it. Otherwise we * need to generate a temporary one from the given salt + master key. * If we are encrypting, we must return a copy of the current salt * so that it can be stored in the blkptr_t. */ rw_enter(&key->zk_salt_lock, RW_READER); locked = B_TRUE; if (memcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { ckey = &key->zk_current_key; tmpl = &key->zk_session; } else { rw_exit(&key->zk_salt_lock); locked = B_FALSE; ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); if (ret != 0) goto error; tmp_ckey.ck_data = enc_keydata; tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); ckey = &tmp_ckey; tmpl = NULL; } /* perform the encryption / decryption */ ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt, ckey, iv, enc_len, &cuio, auth_len); if (ret != 0) goto error; if (locked) { rw_exit(&key->zk_salt_lock); } if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (0); error: if (!encrypt) { if (failed_decrypt_buf != NULL) kmem_free(failed_decrypt_buf, failed_decrypt_size); failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP); failed_decrypt_size = datalen; memcpy(failed_decrypt_buf, cipherbuf, datalen); } if (locked) rw_exit(&key->zk_salt_lock); if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (SET_ERROR(ret)); } /* * Simple wrapper around zio_do_crypt_data() to work with abd's instead of * linear buffers. */ int zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) { int ret; void *ptmp, *ctmp; if (encrypt) { ptmp = abd_borrow_buf_copy(pabd, datalen); ctmp = abd_borrow_buf(cabd, datalen); } else { ptmp = abd_borrow_buf(pabd, datalen); ctmp = abd_borrow_buf_copy(cabd, datalen); } ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, datalen, ptmp, ctmp, no_crypt); if (ret != 0) goto error; if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (0); error: if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (SET_ERROR(ret)); } #if defined(_KERNEL) && defined(HAVE_SPL) -/* CSTYLED */ module_param(zfs_key_max_salt_uses, ulong, 0644); MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " "can be used for generating encryption keys before it is rotated"); #endif diff --git a/module/os/linux/spl/spl-err.c b/module/os/linux/spl/spl-err.c index 29781b9515b2..81e520547dd7 100644 --- a/module/os/linux/spl/spl-err.c +++ b/module/os/linux/spl/spl-err.c @@ -1,124 +1,123 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . * * Solaris Porting Layer (SPL) Error Implementation. */ #include #include /* * It is often useful to actually have the panic crash the node so you * can then get notified of the event, get the crashdump for later * analysis and other such goodies. * But we would still default to the current default of not to do that. */ static unsigned int spl_panic_halt; -/* CSTYLED */ module_param(spl_panic_halt, uint, 0644); MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures"); void spl_dumpstack(void) { printk("Showing stack for process %d\n", current->pid); dump_stack(); } EXPORT_SYMBOL(spl_dumpstack); void spl_panic(const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; char msg[MAXMSGLEN]; va_list ap; newfile = strrchr(file, '/'); if (newfile != NULL) newfile = newfile + 1; else newfile = file; va_start(ap, fmt); (void) vsnprintf(msg, sizeof (msg), fmt, ap); va_end(ap); printk(KERN_EMERG "%s", msg); printk(KERN_EMERG "PANIC at %s:%d:%s()\n", newfile, line, func); if (spl_panic_halt) panic("%s", msg); spl_dumpstack(); /* Halt the thread to facilitate further debugging */ set_current_state(TASK_UNINTERRUPTIBLE); while (1) schedule(); /* Unreachable */ } EXPORT_SYMBOL(spl_panic); void vcmn_err(int ce, const char *fmt, va_list ap) { char msg[MAXMSGLEN]; vsnprintf(msg, MAXMSGLEN, fmt, ap); switch (ce) { case CE_IGNORE: break; case CE_CONT: printk("%s", msg); break; case CE_NOTE: printk(KERN_NOTICE "NOTICE: %s\n", msg); break; case CE_WARN: printk(KERN_WARNING "WARNING: %s\n", msg); break; case CE_PANIC: printk(KERN_EMERG "PANIC: %s\n", msg); if (spl_panic_halt) panic("%s", msg); spl_dumpstack(); /* Halt the thread to facilitate further debugging */ set_current_state(TASK_UNINTERRUPTIBLE); while (1) schedule(); } } /* vcmn_err() */ EXPORT_SYMBOL(vcmn_err); void cmn_err(int ce, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vcmn_err(ce, fmt, ap); va_end(ap); } /* cmn_err() */ EXPORT_SYMBOL(cmn_err); diff --git a/module/os/linux/spl/spl-generic.c b/module/os/linux/spl/spl-generic.c index 6a95d77ac278..e13914221a6a 100644 --- a/module/os/linux/spl/spl-generic.c +++ b/module/os/linux/spl/spl-generic.c @@ -1,903 +1,902 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . * * Solaris Porting Layer (SPL) Generic Implementation. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include unsigned long spl_hostid = 0; EXPORT_SYMBOL(spl_hostid); -/* CSTYLED */ module_param(spl_hostid, ulong, 0644); MODULE_PARM_DESC(spl_hostid, "The system hostid."); proc_t p0; EXPORT_SYMBOL(p0); /* * xoshiro256++ 1.0 PRNG by David Blackman and Sebastiano Vigna * * "Scrambled Linear Pseudorandom Number Generators∗" * https://vigna.di.unimi.it/ftp/papers/ScrambledLinear.pdf * * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose * is to provide bytes containing random numbers. It is mapped to /dev/urandom * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so * we can implement it using a fast PRNG that we seed using Linux' actual * equivalent to random_get_pseudo_bytes(). We do this by providing each CPU * with an independent seed so that all calls to random_get_pseudo_bytes() are * free of atomic instructions. * * A consequence of using a fast PRNG is that using random_get_pseudo_bytes() * to generate words larger than 256 bits will paradoxically be limited to * `2^256 - 1` possibilities. This is because we have a sequence of `2^256 - 1` * 256-bit words and selecting the first will implicitly select the second. If * a caller finds this behavior undesirable, random_get_bytes() should be used * instead. * * XXX: Linux interrupt handlers that trigger within the critical section * formed by `s[3] = xp[3];` and `xp[0] = s[0];` and call this function will * see the same numbers. Nothing in the code currently calls this in an * interrupt handler, so this is considered to be okay. If that becomes a * problem, we could create a set of per-cpu variables for interrupt handlers * and use them when in_interrupt() from linux/preempt_mask.h evaluates to * true. */ static void __percpu *spl_pseudo_entropy; /* * rotl()/spl_rand_next()/spl_rand_jump() are copied from the following CC-0 * licensed file: * * https://prng.di.unimi.it/xoshiro256plusplus.c */ static inline uint64_t rotl(const uint64_t x, int k) { return ((x << k) | (x >> (64 - k))); } static inline uint64_t spl_rand_next(uint64_t *s) { const uint64_t result = rotl(s[0] + s[3], 23) + s[0]; const uint64_t t = s[1] << 17; s[2] ^= s[0]; s[3] ^= s[1]; s[1] ^= s[2]; s[0] ^= s[3]; s[2] ^= t; s[3] = rotl(s[3], 45); return (result); } static inline void spl_rand_jump(uint64_t *s) { static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c }; uint64_t s0 = 0; uint64_t s1 = 0; uint64_t s2 = 0; uint64_t s3 = 0; int i, b; for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++) for (b = 0; b < 64; b++) { if (JUMP[i] & 1ULL << b) { s0 ^= s[0]; s1 ^= s[1]; s2 ^= s[2]; s3 ^= s[3]; } (void) spl_rand_next(s); } s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; } int random_get_pseudo_bytes(uint8_t *ptr, size_t len) { uint64_t *xp, s[4]; ASSERT(ptr); xp = get_cpu_ptr(spl_pseudo_entropy); s[0] = xp[0]; s[1] = xp[1]; s[2] = xp[2]; s[3] = xp[3]; while (len) { union { uint64_t ui64; uint8_t byte[sizeof (uint64_t)]; }entropy; int i = MIN(len, sizeof (uint64_t)); len -= i; entropy.ui64 = spl_rand_next(s); /* * xoshiro256++ has low entropy lower bytes, so we copy the * higher order bytes first. */ while (i--) #ifdef _ZFS_BIG_ENDIAN *ptr++ = entropy.byte[i]; #else *ptr++ = entropy.byte[7 - i]; #endif } xp[0] = s[0]; xp[1] = s[1]; xp[2] = s[2]; xp[3] = s[3]; put_cpu_ptr(spl_pseudo_entropy); return (0); } EXPORT_SYMBOL(random_get_pseudo_bytes); #if BITS_PER_LONG == 32 /* * Support 64/64 => 64 division on a 32-bit platform. While the kernel * provides a div64_u64() function for this we do not use it because the * implementation is flawed. There are cases which return incorrect * results as late as linux-2.6.35. Until this is fixed upstream the * spl must provide its own implementation. * * This implementation is a slightly modified version of the algorithm * proposed by the book 'Hacker's Delight'. The original source can be * found here and is available for use without restriction. * * http://www.hackersdelight.org/HDcode/newCode/divDouble.c */ /* * Calculate number of leading of zeros for a 64-bit value. */ static int nlz64(uint64_t x) { register int n = 0; if (x == 0) return (64); if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; } if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; } if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; } if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; } if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; } if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; } return (n); } /* * Newer kernels have a div_u64() function but we define our own * to simplify portability between kernel versions. */ static inline uint64_t __div_u64(uint64_t u, uint32_t v) { (void) do_div(u, v); return (u); } /* * Turn off missing prototypes warning for these functions. They are * replacements for libgcc-provided functions and will never be called * directly. */ #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmissing-prototypes" #endif /* * Implementation of 64-bit unsigned division for 32-bit machines. * * First the procedure takes care of the case in which the divisor is a * 32-bit quantity. There are two subcases: (1) If the left half of the * dividend is less than the divisor, one execution of do_div() is all that * is required (overflow is not possible). (2) Otherwise it does two * divisions, using the grade school method. */ uint64_t __udivdi3(uint64_t u, uint64_t v) { uint64_t u0, u1, v1, q0, q1, k; int n; if (v >> 32 == 0) { // If v < 2**32: if (u >> 32 < v) { // If u/v cannot overflow, return (__div_u64(u, v)); // just do one division. } else { // If u/v would overflow: u1 = u >> 32; // Break u into two halves. u0 = u & 0xFFFFFFFF; q1 = __div_u64(u1, v); // First quotient digit. k = u1 - q1 * v; // First remainder, < v. u0 += (k << 32); q0 = __div_u64(u0, v); // Seconds quotient digit. return ((q1 << 32) + q0); } } else { // If v >= 2**32: n = nlz64(v); // 0 <= n <= 31. v1 = (v << n) >> 32; // Normalize divisor, MSB is 1. u1 = u >> 1; // To ensure no overflow. q1 = __div_u64(u1, v1); // Get quotient from q0 = (q1 << n) >> 31; // Undo normalization and // division of u by 2. if (q0 != 0) // Make q0 correct or q0 = q0 - 1; // too small by 1. if ((u - q0 * v) >= v) q0 = q0 + 1; // Now q0 is correct. return (q0); } } EXPORT_SYMBOL(__udivdi3); #ifndef abs64 /* CSTYLED */ #define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; }) #endif /* * Implementation of 64-bit signed division for 32-bit machines. */ int64_t __divdi3(int64_t u, int64_t v) { int64_t q, t; q = __udivdi3(abs64(u), abs64(v)); t = (u ^ v) >> 63; // If u, v have different return ((q ^ t) - t); // signs, negate q. } EXPORT_SYMBOL(__divdi3); /* * Implementation of 64-bit unsigned modulo for 32-bit machines. */ uint64_t __umoddi3(uint64_t dividend, uint64_t divisor) { return (dividend - (divisor * __udivdi3(dividend, divisor))); } EXPORT_SYMBOL(__umoddi3); /* 64-bit signed modulo for 32-bit machines. */ int64_t __moddi3(int64_t n, int64_t d) { int64_t q; boolean_t nn = B_FALSE; if (n < 0) { nn = B_TRUE; n = -n; } if (d < 0) d = -d; q = __umoddi3(n, d); return (nn ? -q : q); } EXPORT_SYMBOL(__moddi3); /* * Implementation of 64-bit unsigned division/modulo for 32-bit machines. */ uint64_t __udivmoddi4(uint64_t n, uint64_t d, uint64_t *r) { uint64_t q = __udivdi3(n, d); if (r) *r = n - d * q; return (q); } EXPORT_SYMBOL(__udivmoddi4); /* * Implementation of 64-bit signed division/modulo for 32-bit machines. */ int64_t __divmoddi4(int64_t n, int64_t d, int64_t *r) { int64_t q, rr; boolean_t nn = B_FALSE; boolean_t nd = B_FALSE; if (n < 0) { nn = B_TRUE; n = -n; } if (d < 0) { nd = B_TRUE; d = -d; } q = __udivmoddi4(n, d, (uint64_t *)&rr); if (nn != nd) q = -q; if (nn) rr = -rr; if (r) *r = rr; return (q); } EXPORT_SYMBOL(__divmoddi4); #if defined(__arm) || defined(__arm__) /* * Implementation of 64-bit (un)signed division for 32-bit arm machines. * * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned) * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1}, * and the remainder in {r2, r3}. The return type is specifically left * set to 'void' to ensure the compiler does not overwrite these registers * during the return. All results are in registers as per ABI */ void __aeabi_uldivmod(uint64_t u, uint64_t v) { uint64_t res; uint64_t mod; res = __udivdi3(u, v); mod = __umoddi3(u, v); { register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); register uint32_t r1 asm("r1") = (res >> 32); register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); register uint32_t r3 asm("r3") = (mod >> 32); asm volatile("" : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3) /* output */ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ return; /* r0; */ } } EXPORT_SYMBOL(__aeabi_uldivmod); void __aeabi_ldivmod(int64_t u, int64_t v) { int64_t res; uint64_t mod; res = __divdi3(u, v); mod = __umoddi3(u, v); { register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); register uint32_t r1 asm("r1") = (res >> 32); register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); register uint32_t r3 asm("r3") = (mod >> 32); asm volatile("" : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3) /* output */ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ return; /* r0; */ } } EXPORT_SYMBOL(__aeabi_ldivmod); #endif /* __arm || __arm__ */ #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic pop #endif #endif /* BITS_PER_LONG */ /* * NOTE: The strtoxx behavior is solely based on my reading of the Solaris * ddi_strtol(9F) man page. I have not verified the behavior of these * functions against their Solaris counterparts. It is possible that I * may have misinterpreted the man page or the man page is incorrect. */ int ddi_strtol(const char *, char **, int, long *); int ddi_strtoull(const char *, char **, int, unsigned long long *); int ddi_strtoll(const char *, char **, int, long long *); #define define_ddi_strtox(type, valtype) \ int ddi_strto##type(const char *str, char **endptr, \ int base, valtype *result) \ { \ valtype last_value, value = 0; \ char *ptr = (char *)str; \ int digit, minus = 0; \ \ while (strchr(" \t\n\r\f", *ptr)) \ ++ptr; \ \ if (strlen(ptr) == 0) \ return (EINVAL); \ \ switch (*ptr) { \ case '-': \ minus = 1; \ zfs_fallthrough; \ case '+': \ ++ptr; \ break; \ } \ \ /* Auto-detect base based on prefix */ \ if (!base) { \ if (str[0] == '0') { \ if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \ base = 16; /* hex */ \ ptr += 2; \ } else if (str[1] >= '0' && str[1] < '8') { \ base = 8; /* octal */ \ ptr += 1; \ } else { \ return (EINVAL); \ } \ } else { \ base = 10; /* decimal */ \ } \ } \ \ while (1) { \ if (isdigit(*ptr)) \ digit = *ptr - '0'; \ else if (isalpha(*ptr)) \ digit = tolower(*ptr) - 'a' + 10; \ else \ break; \ \ if (digit >= base) \ break; \ \ last_value = value; \ value = value * base + digit; \ if (last_value > value) /* Overflow */ \ return (ERANGE); \ \ ptr++; \ } \ \ *result = minus ? -value : value; \ \ if (endptr) \ *endptr = ptr; \ \ return (0); \ } \ define_ddi_strtox(l, long) define_ddi_strtox(ull, unsigned long long) define_ddi_strtox(ll, long long) EXPORT_SYMBOL(ddi_strtol); EXPORT_SYMBOL(ddi_strtoll); EXPORT_SYMBOL(ddi_strtoull); int ddi_copyin(const void *from, void *to, size_t len, int flags) { /* Fake ioctl() issued by kernel, 'from' is a kernel address */ if (flags & FKIOCTL) { memcpy(to, from, len); return (0); } return (copyin(from, to, len)); } EXPORT_SYMBOL(ddi_copyin); #define define_spl_param(type, fmt) \ int \ spl_param_get_##type(char *buf, zfs_kernel_param_t *kp) \ { \ return (scnprintf(buf, PAGE_SIZE, fmt "\n", \ *(type *)kp->arg)); \ } \ int \ spl_param_set_##type(const char *buf, zfs_kernel_param_t *kp) \ { \ return (kstrto##type(buf, 0, (type *)kp->arg)); \ } \ const struct kernel_param_ops spl_param_ops_##type = { \ .set = spl_param_set_##type, \ .get = spl_param_get_##type, \ }; \ EXPORT_SYMBOL(spl_param_get_##type); \ EXPORT_SYMBOL(spl_param_set_##type); \ EXPORT_SYMBOL(spl_param_ops_##type); define_spl_param(s64, "%lld") define_spl_param(u64, "%llu") /* * Post a uevent to userspace whenever a new vdev adds to the pool. It is * necessary to sync blkid information with udev, which zed daemon uses * during device hotplug to identify the vdev. */ void spl_signal_kobj_evt(struct block_device *bdev) { #if defined(HAVE_BDEV_KOBJ) || defined(HAVE_PART_TO_DEV) #ifdef HAVE_BDEV_KOBJ struct kobject *disk_kobj = bdev_kobj(bdev); #else struct kobject *disk_kobj = &part_to_dev(bdev->bd_part)->kobj; #endif if (disk_kobj) { int ret = kobject_uevent(disk_kobj, KOBJ_CHANGE); if (ret) { pr_warn("ZFS: Sending event '%d' to kobject: '%s'" " (%p): failed(ret:%d)\n", KOBJ_CHANGE, kobject_name(disk_kobj), disk_kobj, ret); } } #else /* * This is encountered if neither bdev_kobj() nor part_to_dev() is available * in the kernel - likely due to an API change that needs to be chased down. */ #error "Unsupported kernel: unable to get struct kobj from bdev" #endif } EXPORT_SYMBOL(spl_signal_kobj_evt); int ddi_copyout(const void *from, void *to, size_t len, int flags) { /* Fake ioctl() issued by kernel, 'from' is a kernel address */ if (flags & FKIOCTL) { memcpy(to, from, len); return (0); } return (copyout(from, to, len)); } EXPORT_SYMBOL(ddi_copyout); static int spl_getattr(struct file *filp, struct kstat *stat) { int rc; ASSERT(filp); ASSERT(stat); rc = vfs_getattr(&filp->f_path, stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); if (rc) return (-rc); return (0); } /* * Read the unique system identifier from the /etc/hostid file. * * The behavior of /usr/bin/hostid on Linux systems with the * regular eglibc and coreutils is: * * 1. Generate the value if the /etc/hostid file does not exist * or if the /etc/hostid file is less than four bytes in size. * * 2. If the /etc/hostid file is at least 4 bytes, then return * the first four bytes [0..3] in native endian order. * * 3. Always ignore bytes [4..] if they exist in the file. * * Only the first four bytes are significant, even on systems that * have a 64-bit word size. * * See: * * eglibc: sysdeps/unix/sysv/linux/gethostid.c * coreutils: src/hostid.c * * Notes: * * The /etc/hostid file on Solaris is a text file that often reads: * * # DO NOT EDIT * "0123456789" * * Directly copying this file to Linux results in a constant * hostid of 4f442023 because the default comment constitutes * the first four bytes of the file. * */ static char *spl_hostid_path = HW_HOSTID_PATH; module_param(spl_hostid_path, charp, 0444); MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)"); static int hostid_read(uint32_t *hostid) { uint64_t size; uint32_t value = 0; int error; loff_t off; struct file *filp; struct kstat stat; filp = filp_open(spl_hostid_path, 0, 0); if (IS_ERR(filp)) return (ENOENT); error = spl_getattr(filp, &stat); if (error) { filp_close(filp, 0); return (error); } size = stat.size; // cppcheck-suppress sizeofwithnumericparameter if (size < sizeof (HW_HOSTID_MASK)) { filp_close(filp, 0); return (EINVAL); } off = 0; /* * Read directly into the variable like eglibc does. * Short reads are okay; native behavior is preserved. */ error = kernel_read(filp, &value, sizeof (value), &off); if (error < 0) { filp_close(filp, 0); return (EIO); } /* Mask down to 32 bits like coreutils does. */ *hostid = (value & HW_HOSTID_MASK); filp_close(filp, 0); return (0); } /* * Return the system hostid. Preferentially use the spl_hostid module option * when set, otherwise use the value in the /etc/hostid file. */ uint32_t zone_get_hostid(void *zone) { uint32_t hostid; ASSERT3P(zone, ==, NULL); if (spl_hostid != 0) return ((uint32_t)(spl_hostid & HW_HOSTID_MASK)); if (hostid_read(&hostid) == 0) return (hostid); return (0); } EXPORT_SYMBOL(zone_get_hostid); static int spl_kvmem_init(void) { int rc = 0; rc = spl_kmem_init(); if (rc) return (rc); rc = spl_vmem_init(); if (rc) { spl_kmem_fini(); return (rc); } return (rc); } /* * We initialize the random number generator with 128 bits of entropy from the * system random number generator. In the improbable case that we have a zero * seed, we fallback to the system jiffies, unless it is also zero, in which * situation we use a preprogrammed seed. We step forward by 2^64 iterations to * initialize each of the per-cpu seeds so that the sequences generated on each * CPU are guaranteed to never overlap in practice. */ static int __init spl_random_init(void) { uint64_t s[4]; int i = 0; spl_pseudo_entropy = __alloc_percpu(4 * sizeof (uint64_t), sizeof (uint64_t)); if (!spl_pseudo_entropy) return (-ENOMEM); get_random_bytes(s, sizeof (s)); if (s[0] == 0 && s[1] == 0 && s[2] == 0 && s[3] == 0) { if (jiffies != 0) { s[0] = jiffies; s[1] = ~0 - jiffies; s[2] = ~jiffies; s[3] = jiffies - ~0; } else { (void) memcpy(s, "improbable seed", 16); } printk("SPL: get_random_bytes() returned 0 " "when generating random seed. Setting initial seed to " "0x%016llx%016llx%016llx%016llx.\n", cpu_to_be64(s[0]), cpu_to_be64(s[1]), cpu_to_be64(s[2]), cpu_to_be64(s[3])); } for_each_possible_cpu(i) { uint64_t *wordp = per_cpu_ptr(spl_pseudo_entropy, i); spl_rand_jump(s); wordp[0] = s[0]; wordp[1] = s[1]; wordp[2] = s[2]; wordp[3] = s[3]; } return (0); } static void spl_random_fini(void) { free_percpu(spl_pseudo_entropy); } static void spl_kvmem_fini(void) { spl_vmem_fini(); spl_kmem_fini(); } static int __init spl_init(void) { int rc = 0; if ((rc = spl_random_init())) goto out0; if ((rc = spl_kvmem_init())) goto out1; if ((rc = spl_tsd_init())) goto out2; if ((rc = spl_proc_init())) goto out3; if ((rc = spl_kstat_init())) goto out4; if ((rc = spl_taskq_init())) goto out5; if ((rc = spl_kmem_cache_init())) goto out6; if ((rc = spl_zlib_init())) goto out7; if ((rc = spl_zone_init())) goto out8; return (rc); out8: spl_zlib_fini(); out7: spl_kmem_cache_fini(); out6: spl_taskq_fini(); out5: spl_kstat_fini(); out4: spl_proc_fini(); out3: spl_tsd_fini(); out2: spl_kvmem_fini(); out1: spl_random_fini(); out0: return (rc); } static void __exit spl_fini(void) { spl_zone_fini(); spl_zlib_fini(); spl_kmem_cache_fini(); spl_taskq_fini(); spl_kstat_fini(); spl_proc_fini(); spl_tsd_fini(); spl_kvmem_fini(); spl_random_fini(); } module_init(spl_init); module_exit(spl_fini); MODULE_DESCRIPTION("Solaris Porting Layer"); MODULE_AUTHOR(ZFS_META_AUTHOR); MODULE_LICENSE("GPL"); MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 7e806bd5699c..33c7d0879741 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -1,1446 +1,1444 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #define SPL_KMEM_CACHE_IMPLEMENTING #include #include #include #include #include #include #include #include #include #include /* * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() * with smp_mb__{before,after}_atomic() because they were redundant. This is * only used inside our SLAB allocator, so we implement an internal wrapper * here to give us smp_mb__{before,after}_atomic() on older kernels. */ #ifndef smp_mb__before_atomic #define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x) #endif #ifndef smp_mb__after_atomic #define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) #endif -/* BEGIN CSTYLED */ /* * Cache magazines are an optimization designed to minimize the cost of * allocating memory. They do this by keeping a per-cpu cache of recently * freed objects, which can then be reallocated without taking a lock. This * can improve performance on highly contended caches. However, because * objects in magazines will prevent otherwise empty slabs from being * immediately released this may not be ideal for low memory machines. * * For this reason spl_kmem_cache_magazine_size can be used to set a maximum * magazine size. When this value is set to 0 the magazine size will be * automatically determined based on the object size. Otherwise magazines * will be limited to 2-256 objects per magazine (i.e per cpu). Magazines * may never be entirely disabled in this implementation. */ static unsigned int spl_kmem_cache_magazine_size = 0; module_param(spl_kmem_cache_magazine_size, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_magazine_size, "Default magazine size (2-256), set automatically (0)"); static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; module_param(spl_kmem_cache_obj_per_slab, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); static unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE; module_param(spl_kmem_cache_max_size, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); /* * For small objects the Linux slab allocator should be used to make the most * efficient use of the memory. However, large objects are not supported by * the Linux slab and therefore the SPL implementation is preferred. A cutoff * of 16K was determined to be optimal for architectures using 4K pages and * to also work well on architecutres using larger 64K page sizes. */ static unsigned int spl_kmem_cache_slab_limit = SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE; module_param(spl_kmem_cache_slab_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_slab_limit, "Objects less than N bytes use the Linux slab"); /* * The number of threads available to allocate new slabs for caches. This * should not need to be tuned but it is available for performance analysis. */ static unsigned int spl_kmem_cache_kmem_threads = 4; module_param(spl_kmem_cache_kmem_threads, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, "Number of spl_kmem_cache threads"); -/* END CSTYLED */ /* * Slab allocation interfaces * * While the Linux slab implementation was inspired by the Solaris * implementation I cannot use it to emulate the Solaris APIs. I * require two features which are not provided by the Linux slab. * * 1) Constructors AND destructors. Recent versions of the Linux * kernel have removed support for destructors. This is a deal * breaker for the SPL which contains particularly expensive * initializers for mutex's, condition variables, etc. We also * require a minimal level of cleanup for these data types unlike * many Linux data types which do need to be explicitly destroyed. * * 2) Virtual address space backed slab. Callers of the Solaris slab * expect it to work well for both small are very large allocations. * Because of memory fragmentation the Linux slab which is backed * by kmalloc'ed memory performs very badly when confronted with * large numbers of large allocations. Basing the slab on the * virtual address space removes the need for contiguous pages * and greatly improve performance for large allocations. * * For these reasons, the SPL has its own slab implementation with * the needed features. It is not as highly optimized as either the * Solaris or Linux slabs, but it should get me most of what is * needed until it can be optimized or obsoleted by another approach. * * One serious concern I do have about this method is the relatively * small virtual address space on 32bit arches. This will seriously * constrain the size of the slab caches and their performance. */ struct list_head spl_kmem_cache_list; /* List of caches */ struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ static taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); static void * kv_alloc(spl_kmem_cache_t *skc, int size, int flags) { gfp_t lflags = kmem_flags_convert(flags); void *ptr; if (skc->skc_flags & KMC_RECLAIMABLE) lflags |= __GFP_RECLAIMABLE; ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM); /* Resulting allocated memory will be page aligned */ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); return (ptr); } static void kv_free(spl_kmem_cache_t *skc, void *ptr, int size) { ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); /* * The Linux direct reclaim path uses this out of band value to * determine if forward progress is being made. Normally this is * incremented by kmem_freepages() which is part of the various * Linux slab implementations. However, since we are using none * of that infrastructure we are responsible for incrementing it. */ if (current->reclaim_state) #ifdef HAVE_RECLAIM_STATE_RECLAIMED current->reclaim_state->reclaimed += size >> PAGE_SHIFT; #else current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; #endif vfree(ptr); } /* * Required space for each aligned sks. */ static inline uint32_t spl_sks_size(spl_kmem_cache_t *skc) { return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t), skc->skc_obj_align, uint32_t)); } /* * Required space for each aligned object. */ static inline uint32_t spl_obj_size(spl_kmem_cache_t *skc) { uint32_t align = skc->skc_obj_align; return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t)); } uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache) { return (cache->skc_obj_total); } EXPORT_SYMBOL(spl_kmem_cache_inuse); uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache) { return (cache->skc_obj_size); } EXPORT_SYMBOL(spl_kmem_cache_entry_size); /* * Lookup the spl_kmem_object_t for an object given that object. */ static inline spl_kmem_obj_t * spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) { return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size, skc->skc_obj_align, uint32_t)); } /* * It's important that we pack the spl_kmem_obj_t structure and the * actual objects in to one large address space to minimize the number * of calls to the allocator. It is far better to do a few large * allocations and then subdivide it ourselves. Now which allocator * we use requires balancing a few trade offs. * * For small objects we use kmem_alloc() because as long as you are * only requesting a small number of pages (ideally just one) its cheap. * However, when you start requesting multiple pages with kmem_alloc() * it gets increasingly expensive since it requires contiguous pages. * For this reason we shift to vmem_alloc() for slabs of large objects * which removes the need for contiguous pages. We do not use * vmem_alloc() in all cases because there is significant locking * overhead in __get_vm_area_node(). This function takes a single * global lock when acquiring an available virtual address range which * serializes all vmem_alloc()'s for all slab caches. Using slightly * different allocation functions for small and large objects should * give us the best of both worlds. * * +------------------------+ * | spl_kmem_slab_t --+-+ | * | skc_obj_size <-+ | | * | spl_kmem_obj_t | | * | skc_obj_size <---+ | * | spl_kmem_obj_t | | * | ... v | * +------------------------+ */ static spl_kmem_slab_t * spl_slab_alloc(spl_kmem_cache_t *skc, int flags) { spl_kmem_slab_t *sks; void *base; uint32_t obj_size; base = kv_alloc(skc, skc->skc_slab_size, flags); if (base == NULL) return (NULL); sks = (spl_kmem_slab_t *)base; sks->sks_magic = SKS_MAGIC; sks->sks_objs = skc->skc_slab_objs; sks->sks_age = jiffies; sks->sks_cache = skc; INIT_LIST_HEAD(&sks->sks_list); INIT_LIST_HEAD(&sks->sks_free_list); sks->sks_ref = 0; obj_size = spl_obj_size(skc); for (int i = 0; i < sks->sks_objs; i++) { void *obj = base + spl_sks_size(skc) + (i * obj_size); ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); spl_kmem_obj_t *sko = spl_sko_from_obj(skc, obj); sko->sko_addr = obj; sko->sko_magic = SKO_MAGIC; sko->sko_slab = sks; INIT_LIST_HEAD(&sko->sko_list); list_add_tail(&sko->sko_list, &sks->sks_free_list); } return (sks); } /* * Remove a slab from complete or partial list, it must be called with * the 'skc->skc_lock' held but the actual free must be performed * outside the lock to prevent deadlocking on vmem addresses. */ static void spl_slab_free(spl_kmem_slab_t *sks, struct list_head *sks_list, struct list_head *sko_list) { spl_kmem_cache_t *skc; ASSERT(sks->sks_magic == SKS_MAGIC); ASSERT(sks->sks_ref == 0); skc = sks->sks_cache; ASSERT(skc->skc_magic == SKC_MAGIC); /* * Update slab/objects counters in the cache, then remove the * slab from the skc->skc_partial_list. Finally add the slab * and all its objects in to the private work lists where the * destructors will be called and the memory freed to the system. */ skc->skc_obj_total -= sks->sks_objs; skc->skc_slab_total--; list_del(&sks->sks_list); list_add(&sks->sks_list, sks_list); list_splice_init(&sks->sks_free_list, sko_list); } /* * Reclaim empty slabs at the end of the partial list. */ static void spl_slab_reclaim(spl_kmem_cache_t *skc) { spl_kmem_slab_t *sks = NULL, *m = NULL; spl_kmem_obj_t *sko = NULL, *n = NULL; LIST_HEAD(sks_list); LIST_HEAD(sko_list); /* * Empty slabs and objects must be moved to a private list so they * can be safely freed outside the spin lock. All empty slabs are * at the end of skc->skc_partial_list, therefore once a non-empty * slab is found we can stop scanning. */ spin_lock(&skc->skc_lock); list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list, sks_list) { if (sks->sks_ref > 0) break; spl_slab_free(sks, &sks_list, &sko_list); } spin_unlock(&skc->skc_lock); /* * The following two loops ensure all the object destructors are run, * and the slabs themselves are freed. This is all done outside the * skc->skc_lock since this allows the destructor to sleep, and * allows us to perform a conditional reschedule when a freeing a * large number of objects and slabs back to the system. */ list_for_each_entry_safe(sko, n, &sko_list, sko_list) { ASSERT(sko->sko_magic == SKO_MAGIC); } list_for_each_entry_safe(sks, m, &sks_list, sks_list) { ASSERT(sks->sks_magic == SKS_MAGIC); kv_free(skc, sks, skc->skc_slab_size); } } static spl_kmem_emergency_t * spl_emergency_search(struct rb_root *root, void *obj) { struct rb_node *node = root->rb_node; spl_kmem_emergency_t *ske; unsigned long address = (unsigned long)obj; while (node) { ske = container_of(node, spl_kmem_emergency_t, ske_node); if (address < ske->ske_obj) node = node->rb_left; else if (address > ske->ske_obj) node = node->rb_right; else return (ske); } return (NULL); } static int spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) { struct rb_node **new = &(root->rb_node), *parent = NULL; spl_kmem_emergency_t *ske_tmp; unsigned long address = ske->ske_obj; while (*new) { ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); parent = *new; if (address < ske_tmp->ske_obj) new = &((*new)->rb_left); else if (address > ske_tmp->ske_obj) new = &((*new)->rb_right); else return (0); } rb_link_node(&ske->ske_node, parent, new); rb_insert_color(&ske->ske_node, root); return (1); } /* * Allocate a single emergency object and track it in a red black tree. */ static int spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) { gfp_t lflags = kmem_flags_convert(flags); spl_kmem_emergency_t *ske; int order = get_order(skc->skc_obj_size); int empty; /* Last chance use a partial slab if one now exists */ spin_lock(&skc->skc_lock); empty = list_empty(&skc->skc_partial_list); spin_unlock(&skc->skc_lock); if (!empty) return (-EEXIST); if (skc->skc_flags & KMC_RECLAIMABLE) lflags |= __GFP_RECLAIMABLE; ske = kmalloc(sizeof (*ske), lflags); if (ske == NULL) return (-ENOMEM); ske->ske_obj = __get_free_pages(lflags, order); if (ske->ske_obj == 0) { kfree(ske); return (-ENOMEM); } spin_lock(&skc->skc_lock); empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); if (likely(empty)) { skc->skc_obj_total++; skc->skc_obj_emergency++; if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) skc->skc_obj_emergency_max = skc->skc_obj_emergency; } spin_unlock(&skc->skc_lock); if (unlikely(!empty)) { free_pages(ske->ske_obj, order); kfree(ske); return (-EINVAL); } *obj = (void *)ske->ske_obj; return (0); } /* * Locate the passed object in the red black tree and free it. */ static int spl_emergency_free(spl_kmem_cache_t *skc, void *obj) { spl_kmem_emergency_t *ske; int order = get_order(skc->skc_obj_size); spin_lock(&skc->skc_lock); ske = spl_emergency_search(&skc->skc_emergency_tree, obj); if (ske) { rb_erase(&ske->ske_node, &skc->skc_emergency_tree); skc->skc_obj_emergency--; skc->skc_obj_total--; } spin_unlock(&skc->skc_lock); if (ske == NULL) return (-ENOENT); free_pages(ske->ske_obj, order); kfree(ske); return (0); } /* * Release objects from the per-cpu magazine back to their slab. The flush * argument contains the max number of entries to remove from the magazine. */ static void spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) { spin_lock(&skc->skc_lock); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skm->skm_magic == SKM_MAGIC); int count = MIN(flush, skm->skm_avail); for (int i = 0; i < count; i++) spl_cache_shrink(skc, skm->skm_objs[i]); skm->skm_avail -= count; memmove(skm->skm_objs, &(skm->skm_objs[count]), sizeof (void *) * skm->skm_avail); spin_unlock(&skc->skc_lock); } /* * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, * for very small objects we may end up with more than this so as not * to waste space in the minimal allocation of a single page. */ static int spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) { uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs; sks_size = spl_sks_size(skc); obj_size = spl_obj_size(skc); max_size = (spl_kmem_cache_max_size * 1024 * 1024); tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); if (tgt_size <= max_size) { tgt_objs = (tgt_size - sks_size) / obj_size; } else { tgt_objs = (max_size - sks_size) / obj_size; tgt_size = (tgt_objs * obj_size) + sks_size; } if (tgt_objs == 0) return (-ENOSPC); *objs = tgt_objs; *size = tgt_size; return (0); } /* * Make a guess at reasonable per-cpu magazine size based on the size of * each object and the cost of caching N of them in each magazine. Long * term this should really adapt based on an observed usage heuristic. */ static int spl_magazine_size(spl_kmem_cache_t *skc) { uint32_t obj_size = spl_obj_size(skc); int size; if (spl_kmem_cache_magazine_size > 0) return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2)); /* Per-magazine sizes below assume a 4Kib page size */ if (obj_size > (PAGE_SIZE * 256)) size = 4; /* Minimum 4Mib per-magazine */ else if (obj_size > (PAGE_SIZE * 32)) size = 16; /* Minimum 2Mib per-magazine */ else if (obj_size > (PAGE_SIZE)) size = 64; /* Minimum 256Kib per-magazine */ else if (obj_size > (PAGE_SIZE / 4)) size = 128; /* Minimum 128Kib per-magazine */ else size = 256; return (size); } /* * Allocate a per-cpu magazine to associate with a specific core. */ static spl_kmem_magazine_t * spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) { spl_kmem_magazine_t *skm; int size = sizeof (spl_kmem_magazine_t) + sizeof (void *) * skc->skc_mag_size; skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (skm) { skm->skm_magic = SKM_MAGIC; skm->skm_avail = 0; skm->skm_size = skc->skc_mag_size; skm->skm_refill = skc->skc_mag_refill; skm->skm_cache = skc; skm->skm_cpu = cpu; } return (skm); } /* * Free a per-cpu magazine associated with a specific core. */ static void spl_magazine_free(spl_kmem_magazine_t *skm) { ASSERT(skm->skm_magic == SKM_MAGIC); ASSERT(skm->skm_avail == 0); kfree(skm); } /* * Create all pre-cpu magazines of reasonable sizes. */ static int spl_magazine_create(spl_kmem_cache_t *skc) { int i = 0; ASSERT((skc->skc_flags & KMC_SLAB) == 0); skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) * num_possible_cpus(), kmem_flags_convert(KM_SLEEP)); skc->skc_mag_size = spl_magazine_size(skc); skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; for_each_possible_cpu(i) { skc->skc_mag[i] = spl_magazine_alloc(skc, i); if (!skc->skc_mag[i]) { for (i--; i >= 0; i--) spl_magazine_free(skc->skc_mag[i]); kfree(skc->skc_mag); return (-ENOMEM); } } return (0); } /* * Destroy all pre-cpu magazines. */ static void spl_magazine_destroy(spl_kmem_cache_t *skc) { spl_kmem_magazine_t *skm; int i = 0; ASSERT((skc->skc_flags & KMC_SLAB) == 0); for_each_possible_cpu(i) { skm = skc->skc_mag[i]; spl_cache_flush(skc, skm, skm->skm_avail); spl_magazine_free(skm); } kfree(skc->skc_mag); } /* * Create a object cache based on the following arguments: * name cache name * size cache object size * align cache object alignment * ctor cache object constructor * dtor cache object destructor * reclaim cache object reclaim * priv cache private data for ctor/dtor/reclaim * vmp unused must be NULL * flags * KMC_KVMEM Force kvmem backed SPL cache * KMC_SLAB Force Linux slab backed cache * KMC_NODEBUG Disable debugging (unsupported) * KMC_RECLAIMABLE Memory can be freed under pressure */ spl_kmem_cache_t * spl_kmem_cache_create(const char *name, size_t size, size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim, void *priv, void *vmp, int flags) { gfp_t lflags = kmem_flags_convert(KM_SLEEP); spl_kmem_cache_t *skc; int rc; /* * Unsupported flags */ ASSERT(vmp == NULL); ASSERT(reclaim == NULL); might_sleep(); skc = kzalloc(sizeof (*skc), lflags); if (skc == NULL) return (NULL); skc->skc_magic = SKC_MAGIC; skc->skc_name_size = strlen(name) + 1; skc->skc_name = kmalloc(skc->skc_name_size, lflags); if (skc->skc_name == NULL) { kfree(skc); return (NULL); } strlcpy(skc->skc_name, name, skc->skc_name_size); skc->skc_ctor = ctor; skc->skc_dtor = dtor; skc->skc_private = priv; skc->skc_vmp = vmp; skc->skc_linux_cache = NULL; skc->skc_flags = flags; skc->skc_obj_size = size; skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; atomic_set(&skc->skc_ref, 0); INIT_LIST_HEAD(&skc->skc_list); INIT_LIST_HEAD(&skc->skc_complete_list); INIT_LIST_HEAD(&skc->skc_partial_list); skc->skc_emergency_tree = RB_ROOT; spin_lock_init(&skc->skc_lock); init_waitqueue_head(&skc->skc_waitq); skc->skc_slab_fail = 0; skc->skc_slab_create = 0; skc->skc_slab_destroy = 0; skc->skc_slab_total = 0; skc->skc_slab_alloc = 0; skc->skc_slab_max = 0; skc->skc_obj_total = 0; skc->skc_obj_alloc = 0; skc->skc_obj_max = 0; skc->skc_obj_deadlock = 0; skc->skc_obj_emergency = 0; skc->skc_obj_emergency_max = 0; rc = percpu_counter_init(&skc->skc_linux_alloc, 0, GFP_KERNEL); if (rc != 0) { kfree(skc); return (NULL); } /* * Verify the requested alignment restriction is sane. */ if (align) { VERIFY(ISP2(align)); VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); VERIFY3U(align, <=, PAGE_SIZE); skc->skc_obj_align = align; } /* * When no specific type of slab is requested (kmem, vmem, or * linuxslab) then select a cache type based on the object size * and default tunables. */ if (!(skc->skc_flags & (KMC_SLAB | KMC_KVMEM))) { if (spl_kmem_cache_slab_limit && size <= (size_t)spl_kmem_cache_slab_limit) { /* * Objects smaller than spl_kmem_cache_slab_limit can * use the Linux slab for better space-efficiency. */ skc->skc_flags |= KMC_SLAB; } else { /* * All other objects are considered large and are * placed on kvmem backed slabs. */ skc->skc_flags |= KMC_KVMEM; } } /* * Given the type of slab allocate the required resources. */ if (skc->skc_flags & KMC_KVMEM) { rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size); if (rc) goto out; rc = spl_magazine_create(skc); if (rc) goto out; } else { unsigned long slabflags = 0; if (size > spl_kmem_cache_slab_limit) goto out; if (skc->skc_flags & KMC_RECLAIMABLE) slabflags |= SLAB_RECLAIM_ACCOUNT; skc->skc_linux_cache = kmem_cache_create_usercopy( skc->skc_name, size, align, slabflags, 0, size, NULL); if (skc->skc_linux_cache == NULL) goto out; } down_write(&spl_kmem_cache_sem); list_add_tail(&skc->skc_list, &spl_kmem_cache_list); up_write(&spl_kmem_cache_sem); return (skc); out: kfree(skc->skc_name); percpu_counter_destroy(&skc->skc_linux_alloc); kfree(skc); return (NULL); } EXPORT_SYMBOL(spl_kmem_cache_create); /* * Register a move callback for cache defragmentation. * XXX: Unimplemented but harmless to stub out for now. */ void spl_kmem_cache_set_move(spl_kmem_cache_t *skc, kmem_cbrc_t (move)(void *, void *, size_t, void *)) { ASSERT(move != NULL); } EXPORT_SYMBOL(spl_kmem_cache_set_move); /* * Destroy a cache and all objects associated with the cache. */ void spl_kmem_cache_destroy(spl_kmem_cache_t *skc) { DECLARE_WAIT_QUEUE_HEAD(wq); taskqid_t id; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skc->skc_flags & (KMC_KVMEM | KMC_SLAB)); down_write(&spl_kmem_cache_sem); list_del_init(&skc->skc_list); up_write(&spl_kmem_cache_sem); /* Cancel any and wait for any pending delayed tasks */ VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); spin_lock(&skc->skc_lock); id = skc->skc_taskqid; spin_unlock(&skc->skc_lock); taskq_cancel_id(spl_kmem_cache_taskq, id); /* * Wait until all current callers complete, this is mainly * to catch the case where a low memory situation triggers a * cache reaping action which races with this destroy. */ wait_event(wq, atomic_read(&skc->skc_ref) == 0); if (skc->skc_flags & KMC_KVMEM) { spl_magazine_destroy(skc); spl_slab_reclaim(skc); } else { ASSERT(skc->skc_flags & KMC_SLAB); kmem_cache_destroy(skc->skc_linux_cache); } spin_lock(&skc->skc_lock); /* * Validate there are no objects in use and free all the * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */ ASSERT3U(skc->skc_slab_alloc, ==, 0); ASSERT3U(skc->skc_obj_alloc, ==, 0); ASSERT3U(skc->skc_slab_total, ==, 0); ASSERT3U(skc->skc_obj_total, ==, 0); ASSERT3U(skc->skc_obj_emergency, ==, 0); ASSERT(list_empty(&skc->skc_complete_list)); ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0); percpu_counter_destroy(&skc->skc_linux_alloc); spin_unlock(&skc->skc_lock); kfree(skc->skc_name); kfree(skc); } EXPORT_SYMBOL(spl_kmem_cache_destroy); /* * Allocate an object from a slab attached to the cache. This is used to * repopulate the per-cpu magazine caches in batches when they run low. */ static void * spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) { spl_kmem_obj_t *sko; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(sks->sks_magic == SKS_MAGIC); sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); ASSERT(sko->sko_magic == SKO_MAGIC); ASSERT(sko->sko_addr != NULL); /* Remove from sks_free_list */ list_del_init(&sko->sko_list); sks->sks_age = jiffies; sks->sks_ref++; skc->skc_obj_alloc++; /* Track max obj usage statistics */ if (skc->skc_obj_alloc > skc->skc_obj_max) skc->skc_obj_max = skc->skc_obj_alloc; /* Track max slab usage statistics */ if (sks->sks_ref == 1) { skc->skc_slab_alloc++; if (skc->skc_slab_alloc > skc->skc_slab_max) skc->skc_slab_max = skc->skc_slab_alloc; } return (sko->sko_addr); } /* * Generic slab allocation function to run by the global work queues. * It is responsible for allocating a new slab, linking it in to the list * of partial slabs, and then waking any waiters. */ static int __spl_cache_grow(spl_kmem_cache_t *skc, int flags) { spl_kmem_slab_t *sks; fstrans_cookie_t cookie = spl_fstrans_mark(); sks = spl_slab_alloc(skc, flags); spl_fstrans_unmark(cookie); spin_lock(&skc->skc_lock); if (sks) { skc->skc_slab_total++; skc->skc_obj_total += sks->sks_objs; list_add_tail(&sks->sks_list, &skc->skc_partial_list); smp_mb__before_atomic(); clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); smp_mb__after_atomic(); } spin_unlock(&skc->skc_lock); return (sks == NULL ? -ENOMEM : 0); } static void spl_cache_grow_work(void *data) { spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; spl_kmem_cache_t *skc = ska->ska_cache; int error = __spl_cache_grow(skc, ska->ska_flags); atomic_dec(&skc->skc_ref); smp_mb__before_atomic(); clear_bit(KMC_BIT_GROWING, &skc->skc_flags); smp_mb__after_atomic(); if (error == 0) wake_up_all(&skc->skc_waitq); kfree(ska); } /* * Returns non-zero when a new slab should be available. */ static int spl_cache_grow_wait(spl_kmem_cache_t *skc) { return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags)); } /* * No available objects on any slabs, create a new slab. Note that this * functionality is disabled for KMC_SLAB caches which are backed by the * Linux slab. */ static int spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) { int remaining, rc = 0; ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT((skc->skc_flags & KMC_SLAB) == 0); *obj = NULL; /* * Since we can't sleep attempt an emergency allocation to satisfy * the request. The only alterative is to fail the allocation but * it's preferable try. The use of KM_NOSLEEP is expected to be rare. */ if (flags & KM_NOSLEEP) return (spl_emergency_alloc(skc, flags, obj)); might_sleep(); /* * Before allocating a new slab wait for any reaping to complete and * then return so the local magazine can be rechecked for new objects. */ if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING, TASK_UNINTERRUPTIBLE); return (rc ? rc : -EAGAIN); } /* * Note: It would be nice to reduce the overhead of context switch * and improve NUMA locality, by trying to allocate a new slab in the * current process context with KM_NOSLEEP flag. * * However, this can't be applied to vmem/kvmem due to a bug that * spl_vmalloc() doesn't honor gfp flags in page table allocation. */ /* * This is handled by dispatching a work request to the global work * queue. This allows us to asynchronously allocate a new slab while * retaining the ability to safely fall back to a smaller synchronous * allocations to ensure forward progress is always maintained. */ if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { spl_kmem_alloc_t *ska; ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags)); if (ska == NULL) { clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags); smp_mb__after_atomic(); wake_up_all(&skc->skc_waitq); return (-ENOMEM); } atomic_inc(&skc->skc_ref); ska->ska_cache = skc; ska->ska_flags = flags; taskq_init_ent(&ska->ska_tqe); taskq_dispatch_ent(spl_kmem_cache_taskq, spl_cache_grow_work, ska, 0, &ska->ska_tqe); } /* * The goal here is to only detect the rare case where a virtual slab * allocation has deadlocked. We must be careful to minimize the use * of emergency objects which are more expensive to track. Therefore, * we set a very long timeout for the asynchronous allocation and if * the timeout is reached the cache is flagged as deadlocked. From * this point only new emergency objects will be allocated until the * asynchronous allocation completes and clears the deadlocked flag. */ if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) { rc = spl_emergency_alloc(skc, flags, obj); } else { remaining = wait_event_timeout(skc->skc_waitq, spl_cache_grow_wait(skc), HZ / 10); if (!remaining) { spin_lock(&skc->skc_lock); if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); skc->skc_obj_deadlock++; } spin_unlock(&skc->skc_lock); } rc = -ENOMEM; } return (rc); } /* * Refill a per-cpu magazine with objects from the slabs for this cache. * Ideally the magazine can be repopulated using existing objects which have * been released, however if we are unable to locate enough free objects new * slabs of objects will be created. On success NULL is returned, otherwise * the address of a single emergency object is returned for use by the caller. */ static void * spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) { spl_kmem_slab_t *sks; int count = 0, rc, refill; void *obj = NULL; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skm->skm_magic == SKM_MAGIC); refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); spin_lock(&skc->skc_lock); while (refill > 0) { /* No slabs available we may need to grow the cache */ if (list_empty(&skc->skc_partial_list)) { spin_unlock(&skc->skc_lock); local_irq_enable(); rc = spl_cache_grow(skc, flags, &obj); local_irq_disable(); /* Emergency object for immediate use by caller */ if (rc == 0 && obj != NULL) return (obj); if (rc) goto out; /* Rescheduled to different CPU skm is not local */ if (skm != skc->skc_mag[smp_processor_id()]) goto out; /* * Potentially rescheduled to the same CPU but * allocations may have occurred from this CPU while * we were sleeping so recalculate max refill. */ refill = MIN(refill, skm->skm_size - skm->skm_avail); spin_lock(&skc->skc_lock); continue; } /* Grab the next available slab */ sks = list_entry((&skc->skc_partial_list)->next, spl_kmem_slab_t, sks_list); ASSERT(sks->sks_magic == SKS_MAGIC); ASSERT(sks->sks_ref < sks->sks_objs); ASSERT(!list_empty(&sks->sks_free_list)); /* * Consume as many objects as needed to refill the requested * cache. We must also be careful not to overfill it. */ while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) { ASSERT(skm->skm_avail < skm->skm_size); ASSERT(count < skm->skm_size); skm->skm_objs[skm->skm_avail++] = spl_cache_obj(skc, sks); } /* Move slab to skc_complete_list when full */ if (sks->sks_ref == sks->sks_objs) { list_del(&sks->sks_list); list_add(&sks->sks_list, &skc->skc_complete_list); } } spin_unlock(&skc->skc_lock); out: return (NULL); } /* * Release an object back to the slab from which it came. */ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) { spl_kmem_slab_t *sks = NULL; spl_kmem_obj_t *sko = NULL; ASSERT(skc->skc_magic == SKC_MAGIC); sko = spl_sko_from_obj(skc, obj); ASSERT(sko->sko_magic == SKO_MAGIC); sks = sko->sko_slab; ASSERT(sks->sks_magic == SKS_MAGIC); ASSERT(sks->sks_cache == skc); list_add(&sko->sko_list, &sks->sks_free_list); sks->sks_age = jiffies; sks->sks_ref--; skc->skc_obj_alloc--; /* * Move slab to skc_partial_list when no longer full. Slabs * are added to the head to keep the partial list is quasi-full * sorted order. Fuller at the head, emptier at the tail. */ if (sks->sks_ref == (sks->sks_objs - 1)) { list_del(&sks->sks_list); list_add(&sks->sks_list, &skc->skc_partial_list); } /* * Move empty slabs to the end of the partial list so * they can be easily found and freed during reclamation. */ if (sks->sks_ref == 0) { list_del(&sks->sks_list); list_add_tail(&sks->sks_list, &skc->skc_partial_list); skc->skc_slab_alloc--; } } /* * Allocate an object from the per-cpu magazine, or if the magazine * is empty directly allocate from a slab and repopulate the magazine. */ void * spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) { spl_kmem_magazine_t *skm; void *obj = NULL; ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); /* * Allocate directly from a Linux slab. All optimizations are left * to the underlying cache we only need to guarantee that KM_SLEEP * callers will never fail. */ if (skc->skc_flags & KMC_SLAB) { struct kmem_cache *slc = skc->skc_linux_cache; do { obj = kmem_cache_alloc(slc, kmem_flags_convert(flags)); } while ((obj == NULL) && !(flags & KM_NOSLEEP)); if (obj != NULL) { /* * Even though we leave everything up to the * underlying cache we still keep track of * how many objects we've allocated in it for * better debuggability. */ percpu_counter_inc(&skc->skc_linux_alloc); } goto ret; } local_irq_disable(); restart: /* * Safe to update per-cpu structure without lock, but * in the restart case we must be careful to reacquire * the local magazine since this may have changed * when we need to grow the cache. */ skm = skc->skc_mag[smp_processor_id()]; ASSERT(skm->skm_magic == SKM_MAGIC); if (likely(skm->skm_avail)) { /* Object available in CPU cache, use it */ obj = skm->skm_objs[--skm->skm_avail]; } else { obj = spl_cache_refill(skc, skm, flags); if ((obj == NULL) && !(flags & KM_NOSLEEP)) goto restart; local_irq_enable(); goto ret; } local_irq_enable(); ASSERT(obj); ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); ret: /* Pre-emptively migrate object to CPU L1 cache */ if (obj) { if (obj && skc->skc_ctor) skc->skc_ctor(obj, skc->skc_private, flags); else prefetchw(obj); } return (obj); } EXPORT_SYMBOL(spl_kmem_cache_alloc); /* * Free an object back to the local per-cpu magazine, there is no * guarantee that this is the same magazine the object was originally * allocated from. We may need to flush entire from the magazine * back to the slabs to make space. */ void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) { spl_kmem_magazine_t *skm; unsigned long flags; int do_reclaim = 0; int do_emergency = 0; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); /* * Run the destructor */ if (skc->skc_dtor) skc->skc_dtor(obj, skc->skc_private); /* * Free the object from the Linux underlying Linux slab. */ if (skc->skc_flags & KMC_SLAB) { kmem_cache_free(skc->skc_linux_cache, obj); percpu_counter_dec(&skc->skc_linux_alloc); return; } /* * While a cache has outstanding emergency objects all freed objects * must be checked. However, since emergency objects will never use * a virtual address these objects can be safely excluded as an * optimization. */ if (!is_vmalloc_addr(obj)) { spin_lock(&skc->skc_lock); do_emergency = (skc->skc_obj_emergency > 0); spin_unlock(&skc->skc_lock); if (do_emergency && (spl_emergency_free(skc, obj) == 0)) return; } local_irq_save(flags); /* * Safe to update per-cpu structure without lock, but * no remote memory allocation tracking is being performed * it is entirely possible to allocate an object from one * CPU cache and return it to another. */ skm = skc->skc_mag[smp_processor_id()]; ASSERT(skm->skm_magic == SKM_MAGIC); /* * Per-CPU cache full, flush it to make space for this object, * this may result in an empty slab which can be reclaimed once * interrupts are re-enabled. */ if (unlikely(skm->skm_avail >= skm->skm_size)) { spl_cache_flush(skc, skm, skm->skm_refill); do_reclaim = 1; } /* Available space in cache, use it */ skm->skm_objs[skm->skm_avail++] = obj; local_irq_restore(flags); if (do_reclaim) spl_slab_reclaim(skc); } EXPORT_SYMBOL(spl_kmem_cache_free); /* * Depending on how many and which objects are released it may simply * repopulate the local magazine which will then need to age-out. Objects * which cannot fit in the magazine will be released back to their slabs * which will also need to age out before being released. This is all just * best effort and we do not want to thrash creating and destroying slabs. */ void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) { ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); if (skc->skc_flags & KMC_SLAB) return; atomic_inc(&skc->skc_ref); /* * Prevent concurrent cache reaping when contended. */ if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) goto out; /* Reclaim from the magazine and free all now empty slabs. */ unsigned long irq_flags; local_irq_save(irq_flags); spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; spl_cache_flush(skc, skm, skm->skm_avail); local_irq_restore(irq_flags); spl_slab_reclaim(skc); clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); smp_mb__after_atomic(); wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); out: atomic_dec(&skc->skc_ref); } EXPORT_SYMBOL(spl_kmem_cache_reap_now); /* * This is stubbed out for code consistency with other platforms. There * is existing logic to prevent concurrent reaping so while this is ugly * it should do no harm. */ int spl_kmem_cache_reap_active(void) { return (0); } EXPORT_SYMBOL(spl_kmem_cache_reap_active); /* * Reap all free slabs from all registered caches. */ void spl_kmem_reap(void) { spl_kmem_cache_t *skc = NULL; down_read(&spl_kmem_cache_sem); list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { spl_kmem_cache_reap_now(skc); } up_read(&spl_kmem_cache_sem); } EXPORT_SYMBOL(spl_kmem_reap); int spl_kmem_cache_init(void) { init_rwsem(&spl_kmem_cache_sem); INIT_LIST_HEAD(&spl_kmem_cache_list); spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", spl_kmem_cache_kmem_threads, maxclsyspri, spl_kmem_cache_kmem_threads * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (spl_kmem_cache_taskq == NULL) return (-ENOMEM); return (0); } void spl_kmem_cache_fini(void) { taskq_destroy(spl_kmem_cache_taskq); } diff --git a/module/os/linux/spl/spl-kmem.c b/module/os/linux/spl/spl-kmem.c index cae304d33bc3..3e8361184d57 100644 --- a/module/os/linux/spl/spl-kmem.c +++ b/module/os/linux/spl/spl-kmem.c @@ -1,629 +1,627 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #include #include #include #include -/* BEGIN CSTYLED */ /* * As a general rule kmem_alloc() allocations should be small, preferably * just a few pages since they must by physically contiguous. Therefore, a * rate limited warning will be printed to the console for any kmem_alloc() * which exceeds a reasonable threshold. * * The default warning threshold is set to sixteen pages but capped at 64K to * accommodate systems using large pages. This value was selected to be small * enough to ensure the largest allocations are quickly noticed and fixed. * But large enough to avoid logging any warnings when a allocation size is * larger than optimal but not a serious concern. Since this value is tunable, * developers are encouraged to set it lower when testing so any new largish * allocations are quickly caught. These warnings may be disabled by setting * the threshold to zero. */ unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024); module_param(spl_kmem_alloc_warn, uint, 0644); MODULE_PARM_DESC(spl_kmem_alloc_warn, "Warning threshold in bytes for a kmem_alloc()"); EXPORT_SYMBOL(spl_kmem_alloc_warn); /* * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. * Allocations which are marginally smaller than this limit may succeed but * should still be avoided due to the expense of locating a contiguous range * of free pages. Therefore, a maximum kmem size with reasonable safely * margin of 4x is set. Kmem_alloc() allocations larger than this maximum * will quickly fail. Vmem_alloc() allocations less than or equal to this * value will use kmalloc(), but shift to vmalloc() when exceeding this value. */ unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2); module_param(spl_kmem_alloc_max, uint, 0644); MODULE_PARM_DESC(spl_kmem_alloc_max, "Maximum size in bytes for a kmem_alloc()"); EXPORT_SYMBOL(spl_kmem_alloc_max); -/* END CSTYLED */ int kmem_debugging(void) { return (0); } EXPORT_SYMBOL(kmem_debugging); char * kmem_vasprintf(const char *fmt, va_list ap) { va_list aq; char *ptr; do { va_copy(aq, ap); ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq); va_end(aq); } while (ptr == NULL); return (ptr); } EXPORT_SYMBOL(kmem_vasprintf); char * kmem_asprintf(const char *fmt, ...) { va_list ap; char *ptr; do { va_start(ap, fmt); ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap); va_end(ap); } while (ptr == NULL); return (ptr); } EXPORT_SYMBOL(kmem_asprintf); static char * __strdup(const char *str, int flags) { char *ptr; int n; n = strlen(str); ptr = kmalloc(n + 1, kmem_flags_convert(flags)); if (ptr) memcpy(ptr, str, n + 1); return (ptr); } char * kmem_strdup(const char *str) { return (__strdup(str, KM_SLEEP)); } EXPORT_SYMBOL(kmem_strdup); void kmem_strfree(char *str) { kfree(str); } EXPORT_SYMBOL(kmem_strfree); void * spl_kvmalloc(size_t size, gfp_t lflags) { /* * GFP_KERNEL allocations can safely use kvmalloc which may * improve performance by avoiding a) high latency caused by * vmalloc's on-access allocation, b) performance loss due to * MMU memory address mapping and c) vmalloc locking overhead. * This has the side-effect that the slab statistics will * incorrectly report this as a vmem allocation, but that is * purely cosmetic. */ if ((lflags & GFP_KERNEL) == GFP_KERNEL) return (kvmalloc(size, lflags)); gfp_t kmalloc_lflags = lflags; if (size > PAGE_SIZE) { /* * We need to set __GFP_NOWARN here since spl_kvmalloc is not * only called by spl_kmem_alloc_impl but can be called * directly with custom lflags, too. In that case * kmem_flags_convert does not get called, which would * implicitly set __GFP_NOWARN. */ kmalloc_lflags |= __GFP_NOWARN; /* * N.B. __GFP_RETRY_MAYFAIL is supported only for large * e (>32kB) allocations. * * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY * for !costly requests because there is no other way to tell * the allocator that we want to fail rather than retry * endlessly. */ if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) || (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { kmalloc_lflags |= __GFP_NORETRY; } } /* * We first try kmalloc - even for big sizes - and fall back to * spl_vmalloc if that fails. * * For non-__GFP-RECLAIM allocations we always stick to * kmalloc_node, and fail when kmalloc is not successful (returns * NULL). * We cannot fall back to spl_vmalloc in this case because spl_vmalloc * internally uses GPF_KERNEL allocations. */ void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE); if (ptr || size <= PAGE_SIZE || (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) { return (ptr); } return (spl_vmalloc(size, lflags | __GFP_HIGHMEM)); } /* * General purpose unified implementation of kmem_alloc(). It is an * amalgamation of Linux and Illumos allocator design. It should never be * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains * relatively portable. Consumers may only access this function through * wrappers that enforce the common flags to ensure portability. */ inline void * spl_kmem_alloc_impl(size_t size, int flags, int node) { gfp_t lflags = kmem_flags_convert(flags); void *ptr; /* * Log abnormally large allocations and rate limit the console output. * Allocations larger than spl_kmem_alloc_warn should be performed * through the vmem_alloc()/vmem_zalloc() interfaces. */ if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) && !(flags & KM_VMEM)) { printk(KERN_WARNING "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n" "https://github.com/openzfs/zfs/issues/new\n", (unsigned long)size, flags); dump_stack(); } /* * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used * unlike kmem_alloc() with KM_SLEEP on Illumos. */ do { /* * Calling kmalloc_node() when the size >= spl_kmem_alloc_max * is unsafe. This must fail for all for kmem_alloc() and * kmem_zalloc() callers. * * For vmem_alloc() and vmem_zalloc() callers it is permissible * to use spl_vmalloc(). However, in general use of * spl_vmalloc() is strongly discouraged because a global lock * must be acquired. Contention on this lock can significantly * impact performance so frequently manipulating the virtual * address space is strongly discouraged. */ if (size > spl_kmem_alloc_max) { if (flags & KM_VMEM) { ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM); } else { return (NULL); } } else { /* * We use kmalloc when doing kmem_alloc(KM_NOSLEEP), * because kvmalloc/vmalloc may sleep. We also use * kmalloc on systems with limited kernel VA space (e.g. * 32-bit), which have HIGHMEM. Otherwise we use * kvmalloc, which tries to get contiguous physical * memory (fast, like kmalloc) and falls back on using * virtual memory to stitch together pages (slow, like * vmalloc). */ #ifdef CONFIG_HIGHMEM if (flags & KM_VMEM) { #else if ((flags & KM_VMEM) || !(flags & KM_NOSLEEP)) { #endif ptr = spl_kvmalloc(size, lflags); } else { ptr = kmalloc_node(size, lflags, node); } } if (likely(ptr) || (flags & KM_NOSLEEP)) return (ptr); /* * Try hard to satisfy the allocation. However, when progress * cannot be made, the allocation is allowed to fail. */ if ((lflags & GFP_KERNEL) == GFP_KERNEL) lflags |= __GFP_RETRY_MAYFAIL; /* * Use cond_resched() instead of congestion_wait() to avoid * deadlocking systems where there are no block devices. */ cond_resched(); } while (1); return (NULL); } inline void spl_kmem_free_impl(const void *buf, size_t size) { if (is_vmalloc_addr(buf)) vfree(buf); else kfree(buf); } /* * Memory allocation and accounting for kmem_* * style allocations. When * DEBUG_KMEM is enabled the total memory allocated will be tracked and * any memory leaked will be reported during module unload. * * ./configure --enable-debug-kmem */ #ifdef DEBUG_KMEM /* Shim layer memory accounting */ #ifdef HAVE_ATOMIC64_T atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); unsigned long long kmem_alloc_max = 0; #else /* HAVE_ATOMIC64_T */ atomic_t kmem_alloc_used = ATOMIC_INIT(0); unsigned long long kmem_alloc_max = 0; #endif /* HAVE_ATOMIC64_T */ EXPORT_SYMBOL(kmem_alloc_used); EXPORT_SYMBOL(kmem_alloc_max); inline void * spl_kmem_alloc_debug(size_t size, int flags, int node) { void *ptr; ptr = spl_kmem_alloc_impl(size, flags, node); if (ptr) { kmem_alloc_used_add(size); if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) kmem_alloc_max = kmem_alloc_used_read(); } return (ptr); } inline void spl_kmem_free_debug(const void *ptr, size_t size) { kmem_alloc_used_sub(size); spl_kmem_free_impl(ptr, size); } /* * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked * but also the location of every alloc and free. When the SPL module is * unloaded a list of all leaked addresses and where they were allocated * will be dumped to the console. Enabling this feature has a significant * impact on performance but it makes finding memory leaks straight forward. * * Not surprisingly with debugging enabled the xmem_locks are very highly * contended particularly on xfree(). If we want to run with this detailed * debugging enabled for anything other than debugging we need to minimize * the contention by moving to a lock per xmem_table entry model. * * ./configure --enable-debug-kmem-tracking */ #ifdef DEBUG_KMEM_TRACKING #include #include #define KMEM_HASH_BITS 10 #define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) typedef struct kmem_debug { struct hlist_node kd_hlist; /* Hash node linkage */ struct list_head kd_list; /* List of all allocations */ void *kd_addr; /* Allocation pointer */ size_t kd_size; /* Allocation size */ const char *kd_func; /* Allocation function */ int kd_line; /* Allocation line */ } kmem_debug_t; static spinlock_t kmem_lock; static struct hlist_head kmem_table[KMEM_TABLE_SIZE]; static struct list_head kmem_list; static kmem_debug_t * kmem_del_init(spinlock_t *lock, struct hlist_head *table, int bits, const void *addr) { struct hlist_head *head; struct hlist_node *node = NULL; struct kmem_debug *p; unsigned long flags; spin_lock_irqsave(lock, flags); head = &table[hash_ptr((void *)addr, bits)]; hlist_for_each(node, head) { p = list_entry(node, struct kmem_debug, kd_hlist); if (p->kd_addr == addr) { hlist_del_init(&p->kd_hlist); list_del_init(&p->kd_list); spin_unlock_irqrestore(lock, flags); return (p); } } spin_unlock_irqrestore(lock, flags); return (NULL); } inline void * spl_kmem_alloc_track(size_t size, int flags, const char *func, int line, int node) { void *ptr = NULL; kmem_debug_t *dptr; unsigned long irq_flags; dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags)); if (dptr == NULL) return (NULL); dptr->kd_func = __strdup(func, flags); if (dptr->kd_func == NULL) { kfree(dptr); return (NULL); } ptr = spl_kmem_alloc_debug(size, flags, node); if (ptr == NULL) { kfree(dptr->kd_func); kfree(dptr); return (NULL); } INIT_HLIST_NODE(&dptr->kd_hlist); INIT_LIST_HEAD(&dptr->kd_list); dptr->kd_addr = ptr; dptr->kd_size = size; dptr->kd_line = line; spin_lock_irqsave(&kmem_lock, irq_flags); hlist_add_head(&dptr->kd_hlist, &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); list_add_tail(&dptr->kd_list, &kmem_list); spin_unlock_irqrestore(&kmem_lock, irq_flags); return (ptr); } inline void spl_kmem_free_track(const void *ptr, size_t size) { kmem_debug_t *dptr; /* Ignore NULL pointer since we haven't tracked it at all */ if (ptr == NULL) return; /* Must exist in hash due to kmem_alloc() */ dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr); ASSERT3P(dptr, !=, NULL); ASSERT3S(dptr->kd_size, ==, size); kfree(dptr->kd_func); kfree(dptr); spl_kmem_free_debug(ptr, size); } #endif /* DEBUG_KMEM_TRACKING */ #endif /* DEBUG_KMEM */ /* * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces. */ void * spl_kmem_alloc(size_t size, int flags, const char *func, int line) { ASSERT0(flags & ~KM_PUBLIC_MASK); #if !defined(DEBUG_KMEM) return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); #elif !defined(DEBUG_KMEM_TRACKING) return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); #else return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); #endif } EXPORT_SYMBOL(spl_kmem_alloc); void * spl_kmem_zalloc(size_t size, int flags, const char *func, int line) { ASSERT0(flags & ~KM_PUBLIC_MASK); flags |= KM_ZERO; #if !defined(DEBUG_KMEM) return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); #elif !defined(DEBUG_KMEM_TRACKING) return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); #else return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); #endif } EXPORT_SYMBOL(spl_kmem_zalloc); void spl_kmem_free(const void *buf, size_t size) { #if !defined(DEBUG_KMEM) return (spl_kmem_free_impl(buf, size)); #elif !defined(DEBUG_KMEM_TRACKING) return (spl_kmem_free_debug(buf, size)); #else return (spl_kmem_free_track(buf, size)); #endif } EXPORT_SYMBOL(spl_kmem_free); #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) static char * spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) { int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; int i, flag = 1; ASSERT(str != NULL && len >= 17); memset(str, 0, len); /* * Check for a fully printable string, and while we are at * it place the printable characters in the passed buffer. */ for (i = 0; i < size; i++) { str[i] = ((char *)(kd->kd_addr))[i]; if (isprint(str[i])) { continue; } else { /* * Minimum number of printable characters found * to make it worthwhile to print this as ascii. */ if (i > min) break; flag = 0; break; } } if (!flag) { sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", *((uint8_t *)kd->kd_addr), *((uint8_t *)kd->kd_addr + 2), *((uint8_t *)kd->kd_addr + 4), *((uint8_t *)kd->kd_addr + 6), *((uint8_t *)kd->kd_addr + 8), *((uint8_t *)kd->kd_addr + 10), *((uint8_t *)kd->kd_addr + 12), *((uint8_t *)kd->kd_addr + 14)); } return (str); } static int spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) { int i; spin_lock_init(lock); INIT_LIST_HEAD(list); for (i = 0; i < size; i++) INIT_HLIST_HEAD(&kmem_table[i]); return (0); } static void spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) { unsigned long flags; kmem_debug_t *kd = NULL; char str[17]; spin_lock_irqsave(lock, flags); if (!list_empty(list)) printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", "size", "data", "func", "line"); list_for_each_entry(kd, list, kd_list) { printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), kd->kd_func, kd->kd_line); } spin_unlock_irqrestore(lock, flags); } #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ int spl_kmem_init(void) { #ifdef DEBUG_KMEM kmem_alloc_used_set(0); #ifdef DEBUG_KMEM_TRACKING spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); #endif /* DEBUG_KMEM_TRACKING */ #endif /* DEBUG_KMEM */ return (0); } void spl_kmem_fini(void) { #ifdef DEBUG_KMEM /* * Display all unreclaimed memory addresses, including the * allocation size and the first few bytes of what's located * at that address to aid in debugging. Performance is not * a serious concern here since it is module unload time. */ if (kmem_alloc_used_read() != 0) printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", (unsigned long)kmem_alloc_used_read(), kmem_alloc_max); #ifdef DEBUG_KMEM_TRACKING spl_kmem_fini_tracking(&kmem_list, &kmem_lock); #endif /* DEBUG_KMEM_TRACKING */ #endif /* DEBUG_KMEM */ } diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index 7f4cab5da114..77dd472ea8b1 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -1,1845 +1,1841 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . * * Solaris Porting Layer (SPL) Task Queue Implementation. */ /* * Copyright (c) 2024, Klara Inc. * Copyright (c) 2024, Syneto */ #include #include #include #include #include #include #include #include #include typedef struct taskq_kstats { /* static values, for completeness */ kstat_named_t tqks_threads_max; kstat_named_t tqks_entry_pool_min; kstat_named_t tqks_entry_pool_max; /* gauges (inc/dec counters, current value) */ kstat_named_t tqks_threads_active; kstat_named_t tqks_threads_idle; kstat_named_t tqks_threads_total; kstat_named_t tqks_tasks_pending; kstat_named_t tqks_tasks_priority; kstat_named_t tqks_tasks_total; kstat_named_t tqks_tasks_delayed; kstat_named_t tqks_entries_free; /* counters (inc only, since taskq creation) */ kstat_named_t tqks_threads_created; kstat_named_t tqks_threads_destroyed; kstat_named_t tqks_tasks_dispatched; kstat_named_t tqks_tasks_dispatched_delayed; kstat_named_t tqks_tasks_executed_normal; kstat_named_t tqks_tasks_executed_priority; kstat_named_t tqks_tasks_executed; kstat_named_t tqks_tasks_delayed_requeued; kstat_named_t tqks_tasks_cancelled; kstat_named_t tqks_thread_wakeups; kstat_named_t tqks_thread_wakeups_nowork; kstat_named_t tqks_thread_sleeps; } taskq_kstats_t; static taskq_kstats_t taskq_kstats_template = { { "threads_max", KSTAT_DATA_UINT64 }, { "entry_pool_min", KSTAT_DATA_UINT64 }, { "entry_pool_max", KSTAT_DATA_UINT64 }, { "threads_active", KSTAT_DATA_UINT64 }, { "threads_idle", KSTAT_DATA_UINT64 }, { "threads_total", KSTAT_DATA_UINT64 }, { "tasks_pending", KSTAT_DATA_UINT64 }, { "tasks_priority", KSTAT_DATA_UINT64 }, { "tasks_total", KSTAT_DATA_UINT64 }, { "tasks_delayed", KSTAT_DATA_UINT64 }, { "entries_free", KSTAT_DATA_UINT64 }, { "threads_created", KSTAT_DATA_UINT64 }, { "threads_destroyed", KSTAT_DATA_UINT64 }, { "tasks_dispatched", KSTAT_DATA_UINT64 }, { "tasks_dispatched_delayed", KSTAT_DATA_UINT64 }, { "tasks_executed_normal", KSTAT_DATA_UINT64 }, { "tasks_executed_priority", KSTAT_DATA_UINT64 }, { "tasks_executed", KSTAT_DATA_UINT64 }, { "tasks_delayed_requeued", KSTAT_DATA_UINT64 }, { "tasks_cancelled", KSTAT_DATA_UINT64 }, { "thread_wakeups", KSTAT_DATA_UINT64 }, { "thread_wakeups_nowork", KSTAT_DATA_UINT64 }, { "thread_sleeps", KSTAT_DATA_UINT64 }, }; #define TQSTAT_INC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, 1) #define TQSTAT_DEC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, -1) #define _TQSTAT_MOD_LIST(mod, tq, t) do { \ switch (t->tqent_flags & TQENT_LIST_MASK) { \ case TQENT_LIST_NONE: ASSERT(list_empty(&t->tqent_list)); break;\ case TQENT_LIST_PENDING: mod(tq, tasks_pending); break; \ case TQENT_LIST_PRIORITY: mod(tq, tasks_priority); break; \ case TQENT_LIST_DELAY: mod(tq, tasks_delayed); break; \ } \ } while (0) #define TQSTAT_INC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_INC, tq, t) #define TQSTAT_DEC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_DEC, tq, t) #define TQENT_SET_LIST(t, l) \ t->tqent_flags = (t->tqent_flags & ~TQENT_LIST_MASK) | l; static int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); static uint_t spl_taskq_thread_timeout_ms = 5000; -/* BEGIN CSTYLED */ module_param(spl_taskq_thread_timeout_ms, uint, 0644); -/* END CSTYLED */ MODULE_PARM_DESC(spl_taskq_thread_timeout_ms, "Minimum idle threads exit interval for dynamic taskqs"); static int spl_taskq_thread_dynamic = 1; module_param(spl_taskq_thread_dynamic, int, 0444); MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads"); static int spl_taskq_thread_priority = 1; module_param(spl_taskq_thread_priority, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_priority, "Allow non-default priority for taskq threads"); static uint_t spl_taskq_thread_sequential = 4; -/* BEGIN CSTYLED */ module_param(spl_taskq_thread_sequential, uint, 0644); -/* END CSTYLED */ MODULE_PARM_DESC(spl_taskq_thread_sequential, "Create new taskq threads after N sequential tasks"); /* * Global system-wide dynamic task queue available for all consumers. This * taskq is not intended for long-running tasks; instead, a dedicated taskq * should be created. */ taskq_t *system_taskq; EXPORT_SYMBOL(system_taskq); /* Global dynamic task queue for long delay */ taskq_t *system_delay_taskq; EXPORT_SYMBOL(system_delay_taskq); /* Private dedicated taskq for creating new taskq threads on demand. */ static taskq_t *dynamic_taskq; static taskq_thread_t *taskq_thread_create(taskq_t *); /* Multi-callback id for cpu hotplugging. */ static int spl_taskq_cpuhp_state; /* List of all taskqs */ LIST_HEAD(tq_list); struct rw_semaphore tq_list_sem; static uint_t taskq_tsd; static int task_km_flags(uint_t flags) { if (flags & TQ_NOSLEEP) return (KM_NOSLEEP); if (flags & TQ_PUSHPAGE) return (KM_PUSHPAGE); return (KM_SLEEP); } /* * taskq_find_by_name - Find the largest instance number of a named taskq. */ static int taskq_find_by_name(const char *name) { struct list_head *tql = NULL; taskq_t *tq; list_for_each_prev(tql, &tq_list) { tq = list_entry(tql, taskq_t, tq_taskqs); if (strcmp(name, tq->tq_name) == 0) return (tq->tq_instance); } return (-1); } /* * NOTE: Must be called with tq->tq_lock held, returns a list_t which * is not attached to the free, work, or pending taskq lists. */ static taskq_ent_t * task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags) { taskq_ent_t *t; int count = 0; ASSERT(tq); retry: /* Acquire taskq_ent_t's from free list if available */ if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) { t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL)); ASSERT(!timer_pending(&t->tqent_timer)); list_del_init(&t->tqent_list); TQSTAT_DEC(tq, entries_free); return (t); } /* Free list is empty and memory allocations are prohibited */ if (flags & TQ_NOALLOC) return (NULL); /* Hit maximum taskq_ent_t pool size */ if (tq->tq_nalloc >= tq->tq_maxalloc) { if (flags & TQ_NOSLEEP) return (NULL); /* * Sleep periodically polling the free list for an available * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed * but we cannot block forever waiting for an taskq_ent_t to * show up in the free list, otherwise a deadlock can happen. * * Therefore, we need to allocate a new task even if the number * of allocated tasks is above tq->tq_maxalloc, but we still * end up delaying the task allocation by one second, thereby * throttling the task dispatch rate. */ spin_unlock_irqrestore(&tq->tq_lock, *irqflags); schedule_timeout_interruptible(HZ / 100); spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); if (count < 100) { count++; goto retry; } } spin_unlock_irqrestore(&tq->tq_lock, *irqflags); t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags)); spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); if (t) { taskq_init_ent(t); tq->tq_nalloc++; } return (t); } /* * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t * to already be removed from the free, work, or pending taskq lists. */ static void task_free(taskq_t *tq, taskq_ent_t *t) { ASSERT(tq); ASSERT(t); ASSERT(list_empty(&t->tqent_list)); ASSERT(!timer_pending(&t->tqent_timer)); kmem_free(t, sizeof (taskq_ent_t)); tq->tq_nalloc--; } /* * NOTE: Must be called with tq->tq_lock held, either destroys the * taskq_ent_t if too many exist or moves it to the free list for later use. */ static void task_done(taskq_t *tq, taskq_ent_t *t) { ASSERT(tq); ASSERT(t); ASSERT(list_empty(&t->tqent_list)); /* Wake tasks blocked in taskq_wait_id() */ wake_up_all(&t->tqent_waitq); if (tq->tq_nalloc <= tq->tq_minalloc) { t->tqent_id = TASKQID_INVALID; t->tqent_func = NULL; t->tqent_arg = NULL; t->tqent_flags = 0; list_add_tail(&t->tqent_list, &tq->tq_free_list); TQSTAT_INC(tq, entries_free); } else { task_free(tq, t); } } /* * When a delayed task timer expires remove it from the delay list and * add it to the priority list in order for immediate processing. */ static void task_expire_impl(taskq_ent_t *t) { taskq_ent_t *w; taskq_t *tq = t->tqent_taskq; struct list_head *l = NULL; unsigned long flags; spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); if (t->tqent_flags & TQENT_FLAG_CANCEL) { ASSERT(list_empty(&t->tqent_list)); spin_unlock_irqrestore(&tq->tq_lock, flags); return; } t->tqent_birth = jiffies; DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); /* * The priority list must be maintained in strict task id order * from lowest to highest for lowest_id to be easily calculable. */ list_del(&t->tqent_list); list_for_each_prev(l, &tq->tq_prio_list) { w = list_entry(l, taskq_ent_t, tqent_list); if (w->tqent_id < t->tqent_id) { list_add(&t->tqent_list, l); break; } } if (l == &tq->tq_prio_list) list_add(&t->tqent_list, &tq->tq_prio_list); spin_unlock_irqrestore(&tq->tq_lock, flags); wake_up(&tq->tq_work_waitq); TQSTAT_INC(tq, tasks_delayed_requeued); } static void task_expire(struct timer_list *tl) { struct timer_list *tmr = (struct timer_list *)tl; taskq_ent_t *t = from_timer(t, tmr, tqent_timer); task_expire_impl(t); } /* * Returns the lowest incomplete taskqid_t. The taskqid_t may * be queued on the pending list, on the priority list, on the * delay list, or on the work list currently being handled, but * it is not 100% complete yet. */ static taskqid_t taskq_lowest_id(taskq_t *tq) { taskqid_t lowest_id = tq->tq_next_id; taskq_ent_t *t; taskq_thread_t *tqt; if (!list_empty(&tq->tq_pend_list)) { t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list); lowest_id = MIN(lowest_id, t->tqent_id); } if (!list_empty(&tq->tq_prio_list)) { t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list); lowest_id = MIN(lowest_id, t->tqent_id); } if (!list_empty(&tq->tq_delay_list)) { t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list); lowest_id = MIN(lowest_id, t->tqent_id); } if (!list_empty(&tq->tq_active_list)) { tqt = list_entry(tq->tq_active_list.next, taskq_thread_t, tqt_active_list); ASSERT(tqt->tqt_id != TASKQID_INVALID); lowest_id = MIN(lowest_id, tqt->tqt_id); } return (lowest_id); } /* * Insert a task into a list keeping the list sorted by increasing taskqid. */ static void taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt) { taskq_thread_t *w; struct list_head *l = NULL; ASSERT(tq); ASSERT(tqt); list_for_each_prev(l, &tq->tq_active_list) { w = list_entry(l, taskq_thread_t, tqt_active_list); if (w->tqt_id < tqt->tqt_id) { list_add(&tqt->tqt_active_list, l); break; } } if (l == &tq->tq_active_list) list_add(&tqt->tqt_active_list, &tq->tq_active_list); } /* * Find and return a task from the given list if it exists. The list * must be in lowest to highest task id order. */ static taskq_ent_t * taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id) { struct list_head *l = NULL; taskq_ent_t *t; list_for_each(l, lh) { t = list_entry(l, taskq_ent_t, tqent_list); if (t->tqent_id == id) return (t); if (t->tqent_id > id) break; } return (NULL); } /* * Find an already dispatched task given the task id regardless of what * state it is in. If a task is still pending it will be returned. * If a task is executing, then -EBUSY will be returned instead. * If the task has already been run then NULL is returned. */ static taskq_ent_t * taskq_find(taskq_t *tq, taskqid_t id) { taskq_thread_t *tqt; struct list_head *l = NULL; taskq_ent_t *t; t = taskq_find_list(tq, &tq->tq_delay_list, id); if (t) return (t); t = taskq_find_list(tq, &tq->tq_prio_list, id); if (t) return (t); t = taskq_find_list(tq, &tq->tq_pend_list, id); if (t) return (t); list_for_each(l, &tq->tq_active_list) { tqt = list_entry(l, taskq_thread_t, tqt_active_list); if (tqt->tqt_id == id) { /* * Instead of returning tqt_task, we just return a non * NULL value to prevent misuse, since tqt_task only * has two valid fields. */ return (ERR_PTR(-EBUSY)); } } return (NULL); } /* * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and * taskq_wait() functions below. * * Taskq waiting is accomplished by tracking the lowest outstanding task * id and the next available task id. As tasks are dispatched they are * added to the tail of the pending, priority, or delay lists. As worker * threads become available the tasks are removed from the heads of these * lists and linked to the worker threads. This ensures the lists are * kept sorted by lowest to highest task id. * * Therefore the lowest outstanding task id can be quickly determined by * checking the head item from all of these lists. This value is stored * with the taskq as the lowest id. It only needs to be recalculated when * either the task with the current lowest id completes or is canceled. * * By blocking until the lowest task id exceeds the passed task id the * taskq_wait_outstanding() function can be easily implemented. Similarly, * by blocking until the lowest task id matches the next task id taskq_wait() * can be implemented. * * Callers should be aware that when there are multiple worked threads it * is possible for larger task ids to complete before smaller ones. Also * when the taskq contains delay tasks with small task ids callers may * block for a considerable length of time waiting for them to expire and * execute. */ static int taskq_wait_id_check(taskq_t *tq, taskqid_t id) { int rc; unsigned long flags; spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); rc = (taskq_find(tq, id) == NULL); spin_unlock_irqrestore(&tq->tq_lock, flags); return (rc); } /* * The taskq_wait_id() function blocks until the passed task id completes. * This does not guarantee that all lower task ids have completed. */ void taskq_wait_id(taskq_t *tq, taskqid_t id) { wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id)); } EXPORT_SYMBOL(taskq_wait_id); static int taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id) { int rc; unsigned long flags; spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); rc = (id < tq->tq_lowest_id); spin_unlock_irqrestore(&tq->tq_lock, flags); return (rc); } /* * The taskq_wait_outstanding() function will block until all tasks with a * lower taskqid than the passed 'id' have been completed. Note that all * task id's are assigned monotonically at dispatch time. Zero may be * passed for the id to indicate all tasks dispatch up to this point, * but not after, should be waited for. */ void taskq_wait_outstanding(taskq_t *tq, taskqid_t id) { id = id ? id : tq->tq_next_id - 1; wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id)); } EXPORT_SYMBOL(taskq_wait_outstanding); static int taskq_wait_check(taskq_t *tq) { int rc; unsigned long flags; spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); rc = (tq->tq_lowest_id == tq->tq_next_id); spin_unlock_irqrestore(&tq->tq_lock, flags); return (rc); } /* * The taskq_wait() function will block until the taskq is empty. * This means that if a taskq re-dispatches work to itself taskq_wait() * callers will block indefinitely. */ void taskq_wait(taskq_t *tq) { wait_event(tq->tq_wait_waitq, taskq_wait_check(tq)); } EXPORT_SYMBOL(taskq_wait); int taskq_member(taskq_t *tq, kthread_t *t) { return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t)); } EXPORT_SYMBOL(taskq_member); taskq_t * taskq_of_curthread(void) { return (tsd_get(taskq_tsd)); } EXPORT_SYMBOL(taskq_of_curthread); /* * Cancel an already dispatched task given the task id. Still pending tasks * will be immediately canceled, and if the task is active the function will * block until it completes. Preallocated tasks which are canceled must be * freed by the caller. */ int taskq_cancel_id(taskq_t *tq, taskqid_t id) { taskq_ent_t *t; int rc = ENOENT; unsigned long flags; ASSERT(tq); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); t = taskq_find(tq, id); if (t && t != ERR_PTR(-EBUSY)) { list_del_init(&t->tqent_list); TQSTAT_DEC_LIST(tq, t); TQSTAT_DEC(tq, tasks_total); t->tqent_flags |= TQENT_FLAG_CANCEL; TQSTAT_INC(tq, tasks_cancelled); /* * When canceling the lowest outstanding task id we * must recalculate the new lowest outstanding id. */ if (tq->tq_lowest_id == t->tqent_id) { tq->tq_lowest_id = taskq_lowest_id(tq); ASSERT3S(tq->tq_lowest_id, >, t->tqent_id); } /* * The task_expire() function takes the tq->tq_lock so drop * drop the lock before synchronously cancelling the timer. */ if (timer_pending(&t->tqent_timer)) { spin_unlock_irqrestore(&tq->tq_lock, flags); del_timer_sync(&t->tqent_timer); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); } if (!(t->tqent_flags & TQENT_FLAG_PREALLOC)) task_done(tq, t); rc = 0; } spin_unlock_irqrestore(&tq->tq_lock, flags); if (t == ERR_PTR(-EBUSY)) { taskq_wait_id(tq, id); rc = EBUSY; } return (rc); } EXPORT_SYMBOL(taskq_cancel_id); static int taskq_thread_spawn(taskq_t *tq); taskqid_t taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) { taskq_ent_t *t; taskqid_t rc = TASKQID_INVALID; unsigned long irqflags; ASSERT(tq); ASSERT(func); spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); /* Taskq being destroyed and all tasks drained */ if (!(tq->tq_flags & TASKQ_ACTIVE)) goto out; /* Do not queue the task unless there is idle thread for it */ ASSERT(tq->tq_nactive <= tq->tq_nthreads); if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { /* Dynamic taskq may be able to spawn another thread */ if (taskq_thread_spawn(tq) == 0) goto out; } if ((t = task_alloc(tq, flags, &irqflags)) == NULL) goto out; spin_lock(&t->tqent_lock); /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */ if (flags & TQ_NOQUEUE) { TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); list_add(&t->tqent_list, &tq->tq_prio_list); /* Queue to the priority list instead of the pending list */ } else if (flags & TQ_FRONT) { TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); list_add_tail(&t->tqent_list, &tq->tq_prio_list); } else { TQENT_SET_LIST(t, TQENT_LIST_PENDING); list_add_tail(&t->tqent_list, &tq->tq_pend_list); } TQSTAT_INC_LIST(tq, t); TQSTAT_INC(tq, tasks_total); t->tqent_id = rc = tq->tq_next_id; tq->tq_next_id++; t->tqent_func = func; t->tqent_arg = arg; t->tqent_taskq = tq; t->tqent_timer.function = NULL; t->tqent_timer.expires = 0; t->tqent_birth = jiffies; DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); spin_unlock(&t->tqent_lock); wake_up(&tq->tq_work_waitq); TQSTAT_INC(tq, tasks_dispatched); /* Spawn additional taskq threads if required. */ if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } EXPORT_SYMBOL(taskq_dispatch); taskqid_t taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, uint_t flags, clock_t expire_time) { taskqid_t rc = TASKQID_INVALID; taskq_ent_t *t; unsigned long irqflags; ASSERT(tq); ASSERT(func); spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); /* Taskq being destroyed and all tasks drained */ if (!(tq->tq_flags & TASKQ_ACTIVE)) goto out; if ((t = task_alloc(tq, flags, &irqflags)) == NULL) goto out; spin_lock(&t->tqent_lock); /* Queue to the delay list for subsequent execution */ list_add_tail(&t->tqent_list, &tq->tq_delay_list); TQENT_SET_LIST(t, TQENT_LIST_DELAY); TQSTAT_INC_LIST(tq, t); TQSTAT_INC(tq, tasks_total); t->tqent_id = rc = tq->tq_next_id; tq->tq_next_id++; t->tqent_func = func; t->tqent_arg = arg; t->tqent_taskq = tq; t->tqent_timer.function = task_expire; t->tqent_timer.expires = (unsigned long)expire_time; add_timer(&t->tqent_timer); ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); spin_unlock(&t->tqent_lock); TQSTAT_INC(tq, tasks_dispatched_delayed); /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } EXPORT_SYMBOL(taskq_dispatch_delay); void taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, taskq_ent_t *t) { unsigned long irqflags; ASSERT(tq); ASSERT(func); spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); /* Taskq being destroyed and all tasks drained */ if (!(tq->tq_flags & TASKQ_ACTIVE)) { t->tqent_id = TASKQID_INVALID; goto out; } if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { /* Dynamic taskq may be able to spawn another thread */ if (taskq_thread_spawn(tq) == 0) goto out; flags |= TQ_FRONT; } spin_lock(&t->tqent_lock); /* * Make sure the entry is not on some other taskq; it is important to * ASSERT() under lock */ ASSERT(taskq_empty_ent(t)); /* * Mark it as a prealloc'd task. This is important * to ensure that we don't free it later. */ t->tqent_flags |= TQENT_FLAG_PREALLOC; /* Queue to the priority list instead of the pending list */ if (flags & TQ_FRONT) { TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); list_add_tail(&t->tqent_list, &tq->tq_prio_list); } else { TQENT_SET_LIST(t, TQENT_LIST_PENDING); list_add_tail(&t->tqent_list, &tq->tq_pend_list); } TQSTAT_INC_LIST(tq, t); TQSTAT_INC(tq, tasks_total); t->tqent_id = tq->tq_next_id; tq->tq_next_id++; t->tqent_func = func; t->tqent_arg = arg; t->tqent_taskq = tq; t->tqent_birth = jiffies; DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); spin_unlock(&t->tqent_lock); wake_up(&tq->tq_work_waitq); TQSTAT_INC(tq, tasks_dispatched); /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); } EXPORT_SYMBOL(taskq_dispatch_ent); int taskq_empty_ent(taskq_ent_t *t) { return (list_empty(&t->tqent_list)); } EXPORT_SYMBOL(taskq_empty_ent); void taskq_init_ent(taskq_ent_t *t) { spin_lock_init(&t->tqent_lock); init_waitqueue_head(&t->tqent_waitq); timer_setup(&t->tqent_timer, NULL, 0); INIT_LIST_HEAD(&t->tqent_list); t->tqent_id = 0; t->tqent_func = NULL; t->tqent_arg = NULL; t->tqent_flags = 0; t->tqent_taskq = NULL; } EXPORT_SYMBOL(taskq_init_ent); /* * Return the next pending task, preference is given to tasks on the * priority list which were dispatched with TQ_FRONT. */ static taskq_ent_t * taskq_next_ent(taskq_t *tq) { struct list_head *list; if (!list_empty(&tq->tq_prio_list)) list = &tq->tq_prio_list; else if (!list_empty(&tq->tq_pend_list)) list = &tq->tq_pend_list; else return (NULL); return (list_entry(list->next, taskq_ent_t, tqent_list)); } /* * Spawns a new thread for the specified taskq. */ static void taskq_thread_spawn_task(void *arg) { taskq_t *tq = (taskq_t *)arg; unsigned long flags; if (taskq_thread_create(tq) == NULL) { /* restore spawning count if failed */ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); tq->tq_nspawn--; spin_unlock_irqrestore(&tq->tq_lock, flags); } } /* * Spawn addition threads for dynamic taskqs (TASKQ_DYNAMIC) the current * number of threads is insufficient to handle the pending tasks. These * new threads must be created by the dedicated dynamic_taskq to avoid * deadlocks between thread creation and memory reclaim. The system_taskq * which is also a dynamic taskq cannot be safely used for this. */ static int taskq_thread_spawn(taskq_t *tq) { int spawning = 0; if (!(tq->tq_flags & TASKQ_DYNAMIC)) return (0); tq->lastspawnstop = jiffies; if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) && (tq->tq_flags & TASKQ_ACTIVE)) { spawning = (++tq->tq_nspawn); taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task, tq, TQ_NOSLEEP); } return (spawning); } /* * Threads in a dynamic taskq may exit once there is no more work to do. * To prevent threads from being created and destroyed too often limit * the exit rate to one per spl_taskq_thread_timeout_ms. * * The first thread is the thread list is treated as the primary thread. * There is nothing special about the primary thread but in order to avoid * all the taskq pids from changing we opt to make it long running. */ static int taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) { ASSERT(!taskq_next_ent(tq)); if (!(tq->tq_flags & TASKQ_DYNAMIC) || !spl_taskq_thread_dynamic) return (0); if (!(tq->tq_flags & TASKQ_ACTIVE)) return (1); if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t, tqt_thread_list) == tqt) return (0); ASSERT3U(tq->tq_nthreads, >, 1); if (tq->tq_nspawn != 0) return (0); if (time_before(jiffies, tq->lastspawnstop + msecs_to_jiffies(spl_taskq_thread_timeout_ms))) return (0); tq->lastspawnstop = jiffies; return (1); } static int taskq_thread(void *args) { DECLARE_WAITQUEUE(wait, current); sigset_t blocked; taskq_thread_t *tqt = args; taskq_t *tq; taskq_ent_t *t; int seq_tasks = 0; unsigned long flags; taskq_ent_t dup_task = {}; ASSERT(tqt); ASSERT(tqt->tqt_tq); tq = tqt->tqt_tq; current->flags |= PF_NOFREEZE; (void) spl_fstrans_mark(); sigfillset(&blocked); sigprocmask(SIG_BLOCK, &blocked, NULL); flush_signals(current); tsd_set(taskq_tsd, tq); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); /* * If we are dynamically spawned, decrease spawning count. Note that * we could be created during taskq_create, in which case we shouldn't * do the decrement. But it's fine because taskq_create will reset * tq_nspawn later. */ if (tq->tq_flags & TASKQ_DYNAMIC) tq->tq_nspawn--; /* Immediately exit if more threads than allowed were created. */ if (tq->tq_nthreads >= tq->tq_maxthreads) goto error; tq->tq_nthreads++; list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list); wake_up(&tq->tq_wait_waitq); set_current_state(TASK_INTERRUPTIBLE); TQSTAT_INC(tq, threads_total); while (!kthread_should_stop()) { if (list_empty(&tq->tq_pend_list) && list_empty(&tq->tq_prio_list)) { if (taskq_thread_should_stop(tq, tqt)) break; add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); spin_unlock_irqrestore(&tq->tq_lock, flags); TQSTAT_INC(tq, thread_sleeps); TQSTAT_INC(tq, threads_idle); schedule(); seq_tasks = 0; TQSTAT_DEC(tq, threads_idle); TQSTAT_INC(tq, thread_wakeups); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); remove_wait_queue(&tq->tq_work_waitq, &wait); } else { __set_current_state(TASK_RUNNING); } if ((t = taskq_next_ent(tq)) != NULL) { list_del_init(&t->tqent_list); TQSTAT_DEC_LIST(tq, t); TQSTAT_DEC(tq, tasks_total); /* * A TQENT_FLAG_PREALLOC task may be reused or freed * during the task function call. Store tqent_id and * tqent_flags here. * * Also use an on stack taskq_ent_t for tqt_task * assignment in this case; we want to make sure * to duplicate all fields, so the values are * correct when it's accessed via DTRACE_PROBE*. */ tqt->tqt_id = t->tqent_id; tqt->tqt_flags = t->tqent_flags; if (t->tqent_flags & TQENT_FLAG_PREALLOC) { dup_task = *t; t = &dup_task; } tqt->tqt_task = t; taskq_insert_in_order(tq, tqt); tq->tq_nactive++; spin_unlock_irqrestore(&tq->tq_lock, flags); TQSTAT_INC(tq, threads_active); DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t); /* Perform the requested task */ t->tqent_func(t->tqent_arg); DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t); TQSTAT_DEC(tq, threads_active); if ((t->tqent_flags & TQENT_LIST_MASK) == TQENT_LIST_PENDING) TQSTAT_INC(tq, tasks_executed_normal); else TQSTAT_INC(tq, tasks_executed_priority); TQSTAT_INC(tq, tasks_executed); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); tq->tq_nactive--; list_del_init(&tqt->tqt_active_list); tqt->tqt_task = NULL; /* For prealloc'd tasks, we don't free anything. */ if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC)) task_done(tq, t); /* * When the current lowest outstanding taskqid is * done calculate the new lowest outstanding id */ if (tq->tq_lowest_id == tqt->tqt_id) { tq->tq_lowest_id = taskq_lowest_id(tq); ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id); } /* Spawn additional taskq threads if required. */ if ((++seq_tasks) > spl_taskq_thread_sequential && taskq_thread_spawn(tq)) seq_tasks = 0; tqt->tqt_id = TASKQID_INVALID; tqt->tqt_flags = 0; wake_up_all(&tq->tq_wait_waitq); } else TQSTAT_INC(tq, thread_wakeups_nowork); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); tq->tq_nthreads--; list_del_init(&tqt->tqt_thread_list); TQSTAT_DEC(tq, threads_total); TQSTAT_INC(tq, threads_destroyed); error: kmem_free(tqt, sizeof (taskq_thread_t)); spin_unlock_irqrestore(&tq->tq_lock, flags); tsd_set(taskq_tsd, NULL); thread_exit(); return (0); } static taskq_thread_t * taskq_thread_create(taskq_t *tq) { static int last_used_cpu = 0; taskq_thread_t *tqt; tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE); INIT_LIST_HEAD(&tqt->tqt_thread_list); INIT_LIST_HEAD(&tqt->tqt_active_list); tqt->tqt_tq = tq; tqt->tqt_id = TASKQID_INVALID; tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt, "%s", tq->tq_name); if (tqt->tqt_thread == NULL) { kmem_free(tqt, sizeof (taskq_thread_t)); return (NULL); } if (spl_taskq_thread_bind) { last_used_cpu = (last_used_cpu + 1) % num_online_cpus(); kthread_bind(tqt->tqt_thread, last_used_cpu); } if (spl_taskq_thread_priority) set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri)); wake_up_process(tqt->tqt_thread); TQSTAT_INC(tq, threads_created); return (tqt); } static void taskq_stats_init(taskq_t *tq) { taskq_sums_t *tqs = &tq->tq_sums; wmsum_init(&tqs->tqs_threads_active, 0); wmsum_init(&tqs->tqs_threads_idle, 0); wmsum_init(&tqs->tqs_threads_total, 0); wmsum_init(&tqs->tqs_tasks_pending, 0); wmsum_init(&tqs->tqs_tasks_priority, 0); wmsum_init(&tqs->tqs_tasks_total, 0); wmsum_init(&tqs->tqs_tasks_delayed, 0); wmsum_init(&tqs->tqs_entries_free, 0); wmsum_init(&tqs->tqs_threads_created, 0); wmsum_init(&tqs->tqs_threads_destroyed, 0); wmsum_init(&tqs->tqs_tasks_dispatched, 0); wmsum_init(&tqs->tqs_tasks_dispatched_delayed, 0); wmsum_init(&tqs->tqs_tasks_executed_normal, 0); wmsum_init(&tqs->tqs_tasks_executed_priority, 0); wmsum_init(&tqs->tqs_tasks_executed, 0); wmsum_init(&tqs->tqs_tasks_delayed_requeued, 0); wmsum_init(&tqs->tqs_tasks_cancelled, 0); wmsum_init(&tqs->tqs_thread_wakeups, 0); wmsum_init(&tqs->tqs_thread_wakeups_nowork, 0); wmsum_init(&tqs->tqs_thread_sleeps, 0); } static void taskq_stats_fini(taskq_t *tq) { taskq_sums_t *tqs = &tq->tq_sums; wmsum_fini(&tqs->tqs_threads_active); wmsum_fini(&tqs->tqs_threads_idle); wmsum_fini(&tqs->tqs_threads_total); wmsum_fini(&tqs->tqs_tasks_pending); wmsum_fini(&tqs->tqs_tasks_priority); wmsum_fini(&tqs->tqs_tasks_total); wmsum_fini(&tqs->tqs_tasks_delayed); wmsum_fini(&tqs->tqs_entries_free); wmsum_fini(&tqs->tqs_threads_created); wmsum_fini(&tqs->tqs_threads_destroyed); wmsum_fini(&tqs->tqs_tasks_dispatched); wmsum_fini(&tqs->tqs_tasks_dispatched_delayed); wmsum_fini(&tqs->tqs_tasks_executed_normal); wmsum_fini(&tqs->tqs_tasks_executed_priority); wmsum_fini(&tqs->tqs_tasks_executed); wmsum_fini(&tqs->tqs_tasks_delayed_requeued); wmsum_fini(&tqs->tqs_tasks_cancelled); wmsum_fini(&tqs->tqs_thread_wakeups); wmsum_fini(&tqs->tqs_thread_wakeups_nowork); wmsum_fini(&tqs->tqs_thread_sleeps); } static int taskq_kstats_update(kstat_t *ksp, int rw) { if (rw == KSTAT_WRITE) return (EACCES); taskq_t *tq = ksp->ks_private; taskq_kstats_t *tqks = ksp->ks_data; tqks->tqks_threads_max.value.ui64 = tq->tq_maxthreads; tqks->tqks_entry_pool_min.value.ui64 = tq->tq_minalloc; tqks->tqks_entry_pool_max.value.ui64 = tq->tq_maxalloc; taskq_sums_t *tqs = &tq->tq_sums; tqks->tqks_threads_active.value.ui64 = wmsum_value(&tqs->tqs_threads_active); tqks->tqks_threads_idle.value.ui64 = wmsum_value(&tqs->tqs_threads_idle); tqks->tqks_threads_total.value.ui64 = wmsum_value(&tqs->tqs_threads_total); tqks->tqks_tasks_pending.value.ui64 = wmsum_value(&tqs->tqs_tasks_pending); tqks->tqks_tasks_priority.value.ui64 = wmsum_value(&tqs->tqs_tasks_priority); tqks->tqks_tasks_total.value.ui64 = wmsum_value(&tqs->tqs_tasks_total); tqks->tqks_tasks_delayed.value.ui64 = wmsum_value(&tqs->tqs_tasks_delayed); tqks->tqks_entries_free.value.ui64 = wmsum_value(&tqs->tqs_entries_free); tqks->tqks_threads_created.value.ui64 = wmsum_value(&tqs->tqs_threads_created); tqks->tqks_threads_destroyed.value.ui64 = wmsum_value(&tqs->tqs_threads_destroyed); tqks->tqks_tasks_dispatched.value.ui64 = wmsum_value(&tqs->tqs_tasks_dispatched); tqks->tqks_tasks_dispatched_delayed.value.ui64 = wmsum_value(&tqs->tqs_tasks_dispatched_delayed); tqks->tqks_tasks_executed_normal.value.ui64 = wmsum_value(&tqs->tqs_tasks_executed_normal); tqks->tqks_tasks_executed_priority.value.ui64 = wmsum_value(&tqs->tqs_tasks_executed_priority); tqks->tqks_tasks_executed.value.ui64 = wmsum_value(&tqs->tqs_tasks_executed); tqks->tqks_tasks_delayed_requeued.value.ui64 = wmsum_value(&tqs->tqs_tasks_delayed_requeued); tqks->tqks_tasks_cancelled.value.ui64 = wmsum_value(&tqs->tqs_tasks_cancelled); tqks->tqks_thread_wakeups.value.ui64 = wmsum_value(&tqs->tqs_thread_wakeups); tqks->tqks_thread_wakeups_nowork.value.ui64 = wmsum_value(&tqs->tqs_thread_wakeups_nowork); tqks->tqks_thread_sleeps.value.ui64 = wmsum_value(&tqs->tqs_thread_sleeps); return (0); } static void taskq_kstats_init(taskq_t *tq) { char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */ snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance); kstat_t *ksp = kstat_create("taskq", 0, name, "misc", KSTAT_TYPE_NAMED, sizeof (taskq_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (ksp == NULL) return; ksp->ks_private = tq; ksp->ks_update = taskq_kstats_update; ksp->ks_data = kmem_alloc(sizeof (taskq_kstats_t), KM_SLEEP); memcpy(ksp->ks_data, &taskq_kstats_template, sizeof (taskq_kstats_t)); kstat_install(ksp); tq->tq_ksp = ksp; } static void taskq_kstats_fini(taskq_t *tq) { if (tq->tq_ksp == NULL) return; kmem_free(tq->tq_ksp->ks_data, sizeof (taskq_kstats_t)); kstat_delete(tq->tq_ksp); tq->tq_ksp = NULL; } taskq_t * taskq_create(const char *name, int threads_arg, pri_t pri, int minalloc, int maxalloc, uint_t flags) { taskq_t *tq; taskq_thread_t *tqt; int count = 0, rc = 0, i; unsigned long irqflags; int nthreads = threads_arg; ASSERT(name != NULL); ASSERT(minalloc >= 0); ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */ /* Scale the number of threads using nthreads as a percentage */ if (flags & TASKQ_THREADS_CPU_PCT) { ASSERT(nthreads <= 100); ASSERT(nthreads >= 0); nthreads = MIN(threads_arg, 100); nthreads = MAX(nthreads, 0); nthreads = MAX((num_online_cpus() * nthreads) /100, 1); } tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE); if (tq == NULL) return (NULL); tq->tq_hp_support = B_FALSE; if (flags & TASKQ_THREADS_CPU_PCT) { tq->tq_hp_support = B_TRUE; if (cpuhp_state_add_instance_nocalls(spl_taskq_cpuhp_state, &tq->tq_hp_cb_node) != 0) { kmem_free(tq, sizeof (*tq)); return (NULL); } } spin_lock_init(&tq->tq_lock); INIT_LIST_HEAD(&tq->tq_thread_list); INIT_LIST_HEAD(&tq->tq_active_list); tq->tq_name = kmem_strdup(name); tq->tq_nactive = 0; tq->tq_nthreads = 0; tq->tq_nspawn = 0; tq->tq_maxthreads = nthreads; tq->tq_cpu_pct = threads_arg; tq->tq_pri = pri; tq->tq_minalloc = minalloc; tq->tq_maxalloc = maxalloc; tq->tq_nalloc = 0; tq->tq_flags = (flags | TASKQ_ACTIVE); tq->tq_next_id = TASKQID_INITIAL; tq->tq_lowest_id = TASKQID_INITIAL; tq->lastspawnstop = jiffies; INIT_LIST_HEAD(&tq->tq_free_list); INIT_LIST_HEAD(&tq->tq_pend_list); INIT_LIST_HEAD(&tq->tq_prio_list); INIT_LIST_HEAD(&tq->tq_delay_list); init_waitqueue_head(&tq->tq_work_waitq); init_waitqueue_head(&tq->tq_wait_waitq); tq->tq_lock_class = TQ_LOCK_GENERAL; INIT_LIST_HEAD(&tq->tq_taskqs); taskq_stats_init(tq); if (flags & TASKQ_PREPOPULATE) { spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); for (i = 0; i < minalloc; i++) task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW, &irqflags)); spin_unlock_irqrestore(&tq->tq_lock, irqflags); } if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) nthreads = 1; for (i = 0; i < nthreads; i++) { tqt = taskq_thread_create(tq); if (tqt == NULL) rc = 1; else count++; } /* Wait for all threads to be started before potential destroy */ wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count); /* * taskq_thread might have touched nspawn, but we don't want them to * because they're not dynamically spawned. So we reset it to 0 */ tq->tq_nspawn = 0; if (rc) { taskq_destroy(tq); return (NULL); } down_write(&tq_list_sem); tq->tq_instance = taskq_find_by_name(name) + 1; list_add_tail(&tq->tq_taskqs, &tq_list); up_write(&tq_list_sem); /* Install kstats late, because the name includes tq_instance */ taskq_kstats_init(tq); return (tq); } EXPORT_SYMBOL(taskq_create); void taskq_destroy(taskq_t *tq) { struct task_struct *thread; taskq_thread_t *tqt; taskq_ent_t *t; unsigned long flags; ASSERT(tq); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); tq->tq_flags &= ~TASKQ_ACTIVE; spin_unlock_irqrestore(&tq->tq_lock, flags); if (tq->tq_hp_support) { VERIFY0(cpuhp_state_remove_instance_nocalls( spl_taskq_cpuhp_state, &tq->tq_hp_cb_node)); } /* * When TASKQ_ACTIVE is clear new tasks may not be added nor may * new worker threads be spawned for dynamic taskq. */ if (dynamic_taskq != NULL) taskq_wait_outstanding(dynamic_taskq, 0); taskq_wait(tq); taskq_kstats_fini(tq); /* remove taskq from global list used by the kstats */ down_write(&tq_list_sem); list_del(&tq->tq_taskqs); up_write(&tq_list_sem); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); /* wait for spawning threads to insert themselves to the list */ while (tq->tq_nspawn) { spin_unlock_irqrestore(&tq->tq_lock, flags); schedule_timeout_interruptible(1); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); } /* * Signal each thread to exit and block until it does. Each thread * is responsible for removing itself from the list and freeing its * taskq_thread_t. This allows for idle threads to opt to remove * themselves from the taskq. They can be recreated as needed. */ while (!list_empty(&tq->tq_thread_list)) { tqt = list_entry(tq->tq_thread_list.next, taskq_thread_t, tqt_thread_list); thread = tqt->tqt_thread; spin_unlock_irqrestore(&tq->tq_lock, flags); kthread_stop(thread); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); } while (!list_empty(&tq->tq_free_list)) { t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); list_del_init(&t->tqent_list); task_free(tq, t); } ASSERT0(tq->tq_nthreads); ASSERT0(tq->tq_nalloc); ASSERT0(tq->tq_nspawn); ASSERT(list_empty(&tq->tq_thread_list)); ASSERT(list_empty(&tq->tq_active_list)); ASSERT(list_empty(&tq->tq_free_list)); ASSERT(list_empty(&tq->tq_pend_list)); ASSERT(list_empty(&tq->tq_prio_list)); ASSERT(list_empty(&tq->tq_delay_list)); spin_unlock_irqrestore(&tq->tq_lock, flags); taskq_stats_fini(tq); kmem_strfree(tq->tq_name); kmem_free(tq, sizeof (taskq_t)); } EXPORT_SYMBOL(taskq_destroy); /* * Create a taskq with a specified number of pool threads. Allocate * and return an array of nthreads kthread_t pointers, one for each * thread in the pool. The array is not ordered and must be freed * by the caller. */ taskq_t * taskq_create_synced(const char *name, int nthreads, pri_t pri, int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp) { taskq_t *tq; taskq_thread_t *tqt; int i = 0; kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, KM_SLEEP); flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); /* taskq_create spawns all the threads before returning */ tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, flags | TASKQ_PREPOPULATE); VERIFY(tq != NULL); VERIFY(tq->tq_nthreads == nthreads); list_for_each_entry(tqt, &tq->tq_thread_list, tqt_thread_list) { kthreads[i] = tqt->tqt_thread; i++; } ASSERT3S(i, ==, nthreads); *ktpp = kthreads; return (tq); } EXPORT_SYMBOL(taskq_create_synced); static kstat_t *taskq_summary_ksp = NULL; static int spl_taskq_kstat_headers(char *buf, size_t size) { size_t n = snprintf(buf, size, "%-20s | %-17s | %-23s\n" "%-20s | %-17s | %-23s\n" "%-20s | %-17s | %-23s\n", "", "threads", "tasks on queue", "taskq name", "tot [act idl] max", " pend [ norm high] dly", "--------------------", "-----------------", "-----------------------"); return (n >= size ? ENOMEM : 0); } static int spl_taskq_kstat_data(char *buf, size_t size, void *data) { struct list_head *tql = NULL; taskq_t *tq; char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */ char threads[25]; char tasks[30]; size_t n; int err = 0; down_read(&tq_list_sem); list_for_each_prev(tql, &tq_list) { tq = list_entry(tql, taskq_t, tq_taskqs); mutex_enter(tq->tq_ksp->ks_lock); taskq_kstats_update(tq->tq_ksp, KSTAT_READ); taskq_kstats_t *tqks = tq->tq_ksp->ks_data; snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance); snprintf(threads, sizeof (threads), "%3llu [%3llu %3llu] %3llu", tqks->tqks_threads_total.value.ui64, tqks->tqks_threads_active.value.ui64, tqks->tqks_threads_idle.value.ui64, tqks->tqks_threads_max.value.ui64); snprintf(tasks, sizeof (tasks), "%5llu [%5llu %5llu] %3llu", tqks->tqks_tasks_total.value.ui64, tqks->tqks_tasks_pending.value.ui64, tqks->tqks_tasks_priority.value.ui64, tqks->tqks_tasks_delayed.value.ui64); mutex_exit(tq->tq_ksp->ks_lock); n = snprintf(buf, size, "%-20s | %-17s | %-23s\n", name, threads, tasks); if (n >= size) { err = ENOMEM; break; } buf = &buf[n]; size -= n; } up_read(&tq_list_sem); return (err); } static void spl_taskq_kstat_init(void) { kstat_t *ksp = kstat_create("taskq", 0, "summary", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); if (ksp == NULL) return; ksp->ks_data = (void *)(uintptr_t)1; ksp->ks_ndata = 1; kstat_set_raw_ops(ksp, spl_taskq_kstat_headers, spl_taskq_kstat_data, NULL); kstat_install(ksp); taskq_summary_ksp = ksp; } static void spl_taskq_kstat_fini(void) { if (taskq_summary_ksp == NULL) return; kstat_delete(taskq_summary_ksp); taskq_summary_ksp = NULL; } static unsigned int spl_taskq_kick = 0; /* * 2.6.36 API Change * module_param_cb is introduced to take kernel_param_ops and * module_param_call is marked as obsolete. Also set and get operations * were changed to take a 'const struct kernel_param *'. */ static int #ifdef module_param_cb param_set_taskq_kick(const char *val, const struct kernel_param *kp) #else param_set_taskq_kick(const char *val, struct kernel_param *kp) #endif { int ret; taskq_t *tq = NULL; taskq_ent_t *t; unsigned long flags; ret = param_set_uint(val, kp); if (ret < 0 || !spl_taskq_kick) return (ret); /* reset value */ spl_taskq_kick = 0; down_read(&tq_list_sem); list_for_each_entry(tq, &tq_list, tq_taskqs) { spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); /* Check if the first pending is older than 5 seconds */ t = taskq_next_ent(tq); if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) { (void) taskq_thread_spawn(tq); printk(KERN_INFO "spl: Kicked taskq %s/%d\n", tq->tq_name, tq->tq_instance); } spin_unlock_irqrestore(&tq->tq_lock, flags); } up_read(&tq_list_sem); return (ret); } #ifdef module_param_cb static const struct kernel_param_ops param_ops_taskq_kick = { .set = param_set_taskq_kick, .get = param_get_uint, }; module_param_cb(spl_taskq_kick, ¶m_ops_taskq_kick, &spl_taskq_kick, 0644); #else module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint, &spl_taskq_kick, 0644); #endif MODULE_PARM_DESC(spl_taskq_kick, "Write nonzero to kick stuck taskqs to spawn more threads"); /* * This callback will be called exactly once for each core that comes online, * for each dynamic taskq. We attempt to expand taskqs that have * TASKQ_THREADS_CPU_PCT set. We need to redo the percentage calculation every * time, to correctly determine whether or not to add a thread. */ static int spl_taskq_expand(unsigned int cpu, struct hlist_node *node) { taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); unsigned long flags; int err = 0; ASSERT(tq); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); if (!(tq->tq_flags & TASKQ_ACTIVE)) { spin_unlock_irqrestore(&tq->tq_lock, flags); return (err); } ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); int nthreads = MIN(tq->tq_cpu_pct, 100); nthreads = MAX(((num_online_cpus() + 1) * nthreads) / 100, 1); tq->tq_maxthreads = nthreads; if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && tq->tq_maxthreads > tq->tq_nthreads) { spin_unlock_irqrestore(&tq->tq_lock, flags); taskq_thread_t *tqt = taskq_thread_create(tq); if (tqt == NULL) err = -1; return (err); } spin_unlock_irqrestore(&tq->tq_lock, flags); return (err); } /* * While we don't support offlining CPUs, it is possible that CPUs will fail * to online successfully. We do need to be able to handle this case * gracefully. */ static int spl_taskq_prepare_down(unsigned int cpu, struct hlist_node *node) { taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); unsigned long flags; ASSERT(tq); spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); if (!(tq->tq_flags & TASKQ_ACTIVE)) goto out; ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); int nthreads = MIN(tq->tq_cpu_pct, 100); nthreads = MAX(((num_online_cpus()) * nthreads) / 100, 1); tq->tq_maxthreads = nthreads; if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && tq->tq_maxthreads < tq->tq_nthreads) { ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads - 1); taskq_thread_t *tqt = list_entry(tq->tq_thread_list.next, taskq_thread_t, tqt_thread_list); struct task_struct *thread = tqt->tqt_thread; spin_unlock_irqrestore(&tq->tq_lock, flags); kthread_stop(thread); return (0); } out: spin_unlock_irqrestore(&tq->tq_lock, flags); return (0); } int spl_taskq_init(void) { init_rwsem(&tq_list_sem); tsd_create(&taskq_tsd, NULL); spl_taskq_cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "fs/spl_taskq:online", spl_taskq_expand, spl_taskq_prepare_down); system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_taskq == NULL) return (-ENOMEM); system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_delay_taskq == NULL) { cpuhp_remove_multi_state(spl_taskq_cpuhp_state); taskq_destroy(system_taskq); return (-ENOMEM); } dynamic_taskq = taskq_create("spl_dynamic_taskq", 1, maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); if (dynamic_taskq == NULL) { cpuhp_remove_multi_state(spl_taskq_cpuhp_state); taskq_destroy(system_taskq); taskq_destroy(system_delay_taskq); return (-ENOMEM); } /* * This is used to annotate tq_lock, so * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch * does not trigger a lockdep warning re: possible recursive locking */ dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC; spl_taskq_kstat_init(); return (0); } void spl_taskq_fini(void) { spl_taskq_kstat_fini(); taskq_destroy(dynamic_taskq); dynamic_taskq = NULL; taskq_destroy(system_delay_taskq); system_delay_taskq = NULL; taskq_destroy(system_taskq); system_taskq = NULL; tsd_destroy(&taskq_tsd); cpuhp_remove_multi_state(spl_taskq_cpuhp_state); spl_taskq_cpuhp_state = 0; } diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 04ab8bbca352..39ea3e62dba0 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -1,1352 +1,1351 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. * Copyright (c) 2023, 2024, Klara Inc. */ /* * See abd.c for a general overview of the arc buffered data (ABD). * * Linear buffers act exactly like normal buffers and are always mapped into the * kernel's virtual memory space, while scattered ABD data chunks are allocated * as physical pages and then mapped in only while they are actually being * accessed through one of the abd_* library functions. Using scattered ABDs * provides several benefits: * * (1) They avoid use of kmem_*, preventing performance problems where running * kmem_reap on very large memory systems never finishes and causes * constant TLB shootdowns. * * (2) Fragmentation is less of an issue since when we are at the limit of * allocatable space, we won't have to search around for a long free * hole in the VA space for large ARC allocations. Each chunk is mapped in * individually, so even if we are using HIGHMEM (see next point) we * wouldn't need to worry about finding a contiguous address range. * * (3) If we are not using HIGHMEM, then all physical memory is always * mapped into the kernel's address space, so we also avoid the map / * unmap costs on each ABD access. * * If we are not using HIGHMEM, scattered buffers which have only one chunk * can be treated as linear buffers, because they are contiguous in the * kernel's virtual address space. See abd_alloc_chunks() for details. */ #include #include #include #include #include #include #include #include #include #include #if defined(MAX_ORDER) #define ABD_MAX_ORDER (MAX_ORDER) #elif defined(MAX_PAGE_ORDER) #define ABD_MAX_ORDER (MAX_PAGE_ORDER) #endif typedef struct abd_stats { kstat_named_t abdstat_struct_size; kstat_named_t abdstat_linear_cnt; kstat_named_t abdstat_linear_data_size; kstat_named_t abdstat_scatter_cnt; kstat_named_t abdstat_scatter_data_size; kstat_named_t abdstat_scatter_chunk_waste; kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER]; kstat_named_t abdstat_scatter_page_multi_chunk; kstat_named_t abdstat_scatter_page_multi_zone; kstat_named_t abdstat_scatter_page_alloc_retry; kstat_named_t abdstat_scatter_sg_table_retry; } abd_stats_t; static abd_stats_t abd_stats = { /* Amount of memory occupied by all of the abd_t struct allocations */ { "struct_size", KSTAT_DATA_UINT64 }, /* * The number of linear ABDs which are currently allocated, excluding * ABDs which don't own their data (for instance the ones which were * allocated through abd_get_offset() and abd_get_from_buf()). If an * ABD takes ownership of its buf then it will become tracked. */ { "linear_cnt", KSTAT_DATA_UINT64 }, /* Amount of data stored in all linear ABDs tracked by linear_cnt */ { "linear_data_size", KSTAT_DATA_UINT64 }, /* * The number of scatter ABDs which are currently allocated, excluding * ABDs which don't own their data (for instance the ones which were * allocated through abd_get_offset()). */ { "scatter_cnt", KSTAT_DATA_UINT64 }, /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ { "scatter_data_size", KSTAT_DATA_UINT64 }, /* * The amount of space wasted at the end of the last chunk across all * scatter ABDs tracked by scatter_cnt. */ { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, /* * The number of compound allocations of a given order. These * allocations are spread over all currently allocated ABDs, and * act as a measure of memory fragmentation. */ { { "scatter_order_N", KSTAT_DATA_UINT64 } }, /* * The number of scatter ABDs which contain multiple chunks. * ABDs are preferentially allocated from the minimum number of * contiguous multi-page chunks, a single chunk is optimal. */ { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 }, /* * The number of scatter ABDs which are split across memory zones. * ABDs are preferentially allocated using pages from a single zone. */ { "scatter_page_multi_zone", KSTAT_DATA_UINT64 }, /* * The total number of retries encountered when attempting to * allocate the pages to populate the scatter ABD. */ { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 }, /* * The total number of retries encountered when attempting to * allocate the sg table for an ABD. */ { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, }; static struct { wmsum_t abdstat_struct_size; wmsum_t abdstat_linear_cnt; wmsum_t abdstat_linear_data_size; wmsum_t abdstat_scatter_cnt; wmsum_t abdstat_scatter_data_size; wmsum_t abdstat_scatter_chunk_waste; wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER]; wmsum_t abdstat_scatter_page_multi_chunk; wmsum_t abdstat_scatter_page_multi_zone; wmsum_t abdstat_scatter_page_alloc_retry; wmsum_t abdstat_scatter_sg_table_retry; } abd_sums; #define abd_for_each_sg(abd, sg, n, i) \ for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) /* * zfs_abd_scatter_min_size is the minimum allocation size to use scatter * ABD's. Smaller allocations will use linear ABD's which uses * zio_[data_]buf_alloc(). * * Scatter ABD's use at least one page each, so sub-page allocations waste * some space when allocated as scatter (e.g. 2KB scatter allocation wastes * half of each page). Using linear ABD's for small allocations means that * they will be put on slabs which contain many allocations. This can * improve memory efficiency, but it also makes it much harder for ARC * evictions to actually free pages, because all the buffers on one slab need * to be freed in order for the slab (and underlying pages) to be freed. * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's * possible for them to actually waste more memory than scatter (one page per * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th). * * Spill blocks are typically 512B and are heavily used on systems running * selinux with the default dnode size and the `xattr=sa` property set. * * By default we use linear allocations for 512B and 1KB, and scatter * allocations for larger (1.5KB and up). */ static int zfs_abd_scatter_min_size = 512 * 3; /* * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are * just a single zero'd page. This allows us to conserve memory by * only using a single zero page for the scatterlist. */ abd_t *abd_zero_scatter = NULL; struct page; /* * abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will * point to ZERO_PAGE if it is available or it will be an allocated zero'd * PAGESIZE buffer. */ static struct page *abd_zero_page = NULL; static kmem_cache_t *abd_cache = NULL; static kstat_t *abd_ksp; static uint_t abd_chunkcnt_for_bytes(size_t size) { return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); } abd_t * abd_alloc_struct_impl(size_t size) { /* * In Linux we do not use the size passed in during ABD * allocation, so we just ignore it. */ (void) size; abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); ASSERT3P(abd, !=, NULL); ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); return (abd); } void abd_free_struct_impl(abd_t *abd) { kmem_cache_free(abd_cache, abd); ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); } static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1; /* * Mark zfs data pages so they can be excluded from kernel crash dumps */ #ifdef _LP64 #define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E static inline void abd_mark_zfs_page(struct page *page) { get_page(page); SetPagePrivate(page); set_page_private(page, ABD_FILE_CACHE_PAGE); } static inline void abd_unmark_zfs_page(struct page *page) { set_page_private(page, 0UL); ClearPagePrivate(page); put_page(page); } #else #define abd_mark_zfs_page(page) #define abd_unmark_zfs_page(page) #endif /* _LP64 */ #ifndef CONFIG_HIGHMEM #ifndef __GFP_RECLAIM #define __GFP_RECLAIM __GFP_WAIT #endif /* * The goal is to minimize fragmentation by preferentially populating ABDs * with higher order compound pages from a single zone. Allocation size is * progressively decreased until it can be satisfied without performing * reclaim or compaction. When necessary this function will degenerate to * allocating individual pages and allowing reclaim to satisfy allocations. */ void abd_alloc_chunks(abd_t *abd, size_t size) { struct list_head pages; struct sg_table table; struct scatterlist *sg; struct page *page, *tmp_page = NULL; gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO; gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; unsigned int max_order = MIN(zfs_abd_scatter_max_order, ABD_MAX_ORDER - 1); unsigned int nr_pages = abd_chunkcnt_for_bytes(size); unsigned int chunks = 0, zones = 0; size_t remaining_size; int nid = NUMA_NO_NODE; unsigned int alloc_pages = 0; INIT_LIST_HEAD(&pages); ASSERT3U(alloc_pages, <, nr_pages); while (alloc_pages < nr_pages) { unsigned int chunk_pages; unsigned int order; order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); chunk_pages = (1U << order); page = alloc_pages_node(nid, order ? gfp_comp : gfp, order); if (page == NULL) { if (order == 0) { ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); schedule_timeout_interruptible(1); } else { max_order = MAX(0, order - 1); } continue; } list_add_tail(&page->lru, &pages); if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) zones++; nid = page_to_nid(page); ABDSTAT_BUMP(abdstat_scatter_orders[order]); chunks++; alloc_pages += chunk_pages; } ASSERT3S(alloc_pages, ==, nr_pages); while (sg_alloc_table(&table, chunks, gfp)) { ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); schedule_timeout_interruptible(1); } sg = table.sgl; remaining_size = size; list_for_each_entry_safe(page, tmp_page, &pages, lru) { size_t sg_size = MIN(PAGESIZE << compound_order(page), remaining_size); sg_set_page(sg, page, sg_size, 0); abd_mark_zfs_page(page); remaining_size -= sg_size; sg = sg_next(sg); list_del(&page->lru); } /* * These conditions ensure that a possible transformation to a linear * ABD would be valid. */ ASSERT(!PageHighMem(sg_page(table.sgl))); ASSERT0(ABD_SCATTER(abd).abd_offset); if (table.nents == 1) { /* * Since there is only one entry, this ABD can be represented * as a linear buffer. All single-page (4K) ABD's can be * represented this way. Some multi-page ABD's can also be * represented this way, if we were able to allocate a single * "chunk" (higher-order "page" which represents a power-of-2 * series of physically-contiguous pages). This is often the * case for 2-page (8K) ABD's. * * Representing a single-entry scatter ABD as a linear ABD * has the performance advantage of avoiding the copy (and * allocation) in abd_borrow_buf_copy / abd_return_buf_copy. * A performance increase of around 5% has been observed for * ARC-cached reads (of small blocks which can take advantage * of this). * * Note that this optimization is only possible because the * pages are always mapped into the kernel's address space. * This is not the case for highmem pages, so the * optimization can not be made there. */ abd->abd_flags |= ABD_FLAG_LINEAR; abd->abd_flags |= ABD_FLAG_LINEAR_PAGE; abd->abd_u.abd_linear.abd_sgl = table.sgl; ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl)); } else if (table.nents > 1) { ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; if (zones) { ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); abd->abd_flags |= ABD_FLAG_MULTI_ZONE; } ABD_SCATTER(abd).abd_sgl = table.sgl; ABD_SCATTER(abd).abd_nents = table.nents; } } #else /* * Allocate N individual pages to construct a scatter ABD. This function * makes no attempt to request contiguous pages and requires the minimal * number of kernel interfaces. It's designed for maximum compatibility. */ void abd_alloc_chunks(abd_t *abd, size_t size) { struct scatterlist *sg = NULL; struct sg_table table; struct page *page; gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO; int nr_pages = abd_chunkcnt_for_bytes(size); int i = 0; while (sg_alloc_table(&table, nr_pages, gfp)) { ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); schedule_timeout_interruptible(1); } ASSERT3U(table.nents, ==, nr_pages); ABD_SCATTER(abd).abd_sgl = table.sgl; ABD_SCATTER(abd).abd_nents = nr_pages; abd_for_each_sg(abd, sg, nr_pages, i) { while ((page = __page_cache_alloc(gfp)) == NULL) { ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); schedule_timeout_interruptible(1); } ABDSTAT_BUMP(abdstat_scatter_orders[0]); sg_set_page(sg, page, PAGESIZE, 0); abd_mark_zfs_page(page); } if (nr_pages > 1) { ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; } } #endif /* !CONFIG_HIGHMEM */ /* * This must be called if any of the sg_table allocation functions * are called. */ static void abd_free_sg_table(abd_t *abd) { struct sg_table table; table.sgl = ABD_SCATTER(abd).abd_sgl; table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents; sg_free_table(&table); } void abd_free_chunks(abd_t *abd) { struct scatterlist *sg = NULL; struct page *page; int nr_pages = ABD_SCATTER(abd).abd_nents; int order, i = 0; if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); /* * Scatter ABDs may be constructed by abd_alloc_from_pages() from * an array of pages. In which case they should not be freed. */ if (!abd_is_from_pages(abd)) { abd_for_each_sg(abd, sg, nr_pages, i) { page = sg_page(sg); abd_unmark_zfs_page(page); order = compound_order(page); __free_pages(page, order); ASSERT3U(sg->length, <=, PAGE_SIZE << order); ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); } } abd_free_sg_table(abd); } /* * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in * the scatterlist will be set to the zero'd out buffer abd_zero_page. */ static void abd_alloc_zero_scatter(void) { struct scatterlist *sg = NULL; struct sg_table table; gfp_t gfp = __GFP_NOWARN | GFP_NOIO; int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); int i = 0; #if defined(HAVE_ZERO_PAGE_GPL_ONLY) gfp_t gfp_zero_page = gfp | __GFP_ZERO; while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) { ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); schedule_timeout_interruptible(1); } abd_mark_zfs_page(abd_zero_page); #else abd_zero_page = ZERO_PAGE(0); #endif /* HAVE_ZERO_PAGE_GPL_ONLY */ while (sg_alloc_table(&table, nr_pages, gfp)) { ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); schedule_timeout_interruptible(1); } ASSERT3U(table.nents, ==, nr_pages); abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; ABD_SCATTER(abd_zero_scatter).abd_offset = 0; ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl; ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK; abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { sg_set_page(sg, abd_zero_page, PAGESIZE, 0); } ABDSTAT_BUMP(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE); ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); } boolean_t abd_size_alloc_linear(size_t size) { return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size); } void abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) { ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size; if (op == ABDSTAT_INCR) { ABDSTAT_BUMP(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste); arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE); } else { ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste); arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE); } } void abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) { ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); if (op == ABDSTAT_INCR) { ABDSTAT_BUMP(abdstat_linear_cnt); ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); } else { ABDSTAT_BUMPDOWN(abdstat_linear_cnt); ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); } } void abd_verify_scatter(abd_t *abd) { ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); ASSERT3U(ABD_SCATTER(abd).abd_offset, <, ABD_SCATTER(abd).abd_sgl->length); #ifdef ZFS_DEBUG struct scatterlist *sg = NULL; size_t n = ABD_SCATTER(abd).abd_nents; int i = 0; abd_for_each_sg(abd, sg, n, i) { ASSERT3P(sg_page(sg), !=, NULL); } #endif } static void abd_free_zero_scatter(void) { ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE); ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); abd_free_sg_table(abd_zero_scatter); abd_free_struct(abd_zero_scatter); abd_zero_scatter = NULL; ASSERT3P(abd_zero_page, !=, NULL); #if defined(HAVE_ZERO_PAGE_GPL_ONLY) abd_unmark_zfs_page(abd_zero_page); __free_page(abd_zero_page); #endif /* HAVE_ZERO_PAGE_GPL_ONLY */ } static int abd_kstats_update(kstat_t *ksp, int rw) { abd_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) return (EACCES); as->abdstat_struct_size.value.ui64 = wmsum_value(&abd_sums.abdstat_struct_size); as->abdstat_linear_cnt.value.ui64 = wmsum_value(&abd_sums.abdstat_linear_cnt); as->abdstat_linear_data_size.value.ui64 = wmsum_value(&abd_sums.abdstat_linear_data_size); as->abdstat_scatter_cnt.value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_cnt); as->abdstat_scatter_data_size.value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_data_size); as->abdstat_scatter_chunk_waste.value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_chunk_waste); for (int i = 0; i < ABD_MAX_ORDER; i++) { as->abdstat_scatter_orders[i].value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_orders[i]); } as->abdstat_scatter_page_multi_chunk.value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_page_multi_chunk); as->abdstat_scatter_page_multi_zone.value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_page_multi_zone); as->abdstat_scatter_page_alloc_retry.value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_page_alloc_retry); as->abdstat_scatter_sg_table_retry.value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_sg_table_retry); return (0); } void abd_init(void) { int i; abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), 0, NULL, NULL, NULL, NULL, NULL, KMC_RECLAIMABLE); wmsum_init(&abd_sums.abdstat_struct_size, 0); wmsum_init(&abd_sums.abdstat_linear_cnt, 0); wmsum_init(&abd_sums.abdstat_linear_data_size, 0); wmsum_init(&abd_sums.abdstat_scatter_cnt, 0); wmsum_init(&abd_sums.abdstat_scatter_data_size, 0); wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0); for (i = 0; i < ABD_MAX_ORDER; i++) wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0); wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0); wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0); wmsum_init(&abd_sums.abdstat_scatter_page_alloc_retry, 0); wmsum_init(&abd_sums.abdstat_scatter_sg_table_retry, 0); abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (abd_ksp != NULL) { for (i = 0; i < ABD_MAX_ORDER; i++) { snprintf(abd_stats.abdstat_scatter_orders[i].name, KSTAT_STRLEN, "scatter_order_%d", i); abd_stats.abdstat_scatter_orders[i].data_type = KSTAT_DATA_UINT64; } abd_ksp->ks_data = &abd_stats; abd_ksp->ks_update = abd_kstats_update; kstat_install(abd_ksp); } abd_alloc_zero_scatter(); } void abd_fini(void) { abd_free_zero_scatter(); if (abd_ksp != NULL) { kstat_delete(abd_ksp); abd_ksp = NULL; } wmsum_fini(&abd_sums.abdstat_struct_size); wmsum_fini(&abd_sums.abdstat_linear_cnt); wmsum_fini(&abd_sums.abdstat_linear_data_size); wmsum_fini(&abd_sums.abdstat_scatter_cnt); wmsum_fini(&abd_sums.abdstat_scatter_data_size); wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste); for (int i = 0; i < ABD_MAX_ORDER; i++) wmsum_fini(&abd_sums.abdstat_scatter_orders[i]); wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk); wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone); wmsum_fini(&abd_sums.abdstat_scatter_page_alloc_retry); wmsum_fini(&abd_sums.abdstat_scatter_sg_table_retry); if (abd_cache) { kmem_cache_destroy(abd_cache); abd_cache = NULL; } } void abd_free_linear_page(abd_t *abd) { /* Transform it back into a scatter ABD for freeing */ struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; /* When backed by user page unmap it */ if (abd_is_from_pages(abd)) zfs_kunmap(sg_page(sg)); else abd_update_scatter_stats(abd, ABDSTAT_DECR); abd->abd_flags &= ~ABD_FLAG_LINEAR; abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; ABD_SCATTER(abd).abd_nents = 1; ABD_SCATTER(abd).abd_offset = 0; ABD_SCATTER(abd).abd_sgl = sg; abd_free_chunks(abd); } /* * Allocate a scatter ABD structure from user pages. The pages must be * pinned with get_user_pages, or similiar, but need not be mapped via * the kmap interfaces. */ abd_t * abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size) { uint_t npages = DIV_ROUND_UP(size, PAGE_SIZE); struct sg_table table; VERIFY3U(size, <=, DMU_MAX_ACCESS); ASSERT3U(offset, <, PAGE_SIZE); ASSERT3P(pages, !=, NULL); /* * Even if this buf is filesystem metadata, we only track that we * own the underlying data buffer, which is not true in this case. * Therefore, we don't ever use ABD_FLAG_META here. */ abd_t *abd = abd_alloc_struct(0); abd->abd_flags |= ABD_FLAG_FROM_PAGES | ABD_FLAG_OWNER; abd->abd_size = size; while (sg_alloc_table_from_pages(&table, pages, npages, offset, size, __GFP_NOWARN | GFP_NOIO) != 0) { ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); schedule_timeout_interruptible(1); } if ((offset + size) <= PAGE_SIZE) { /* * Since there is only one entry, this ABD can be represented * as a linear buffer. All single-page (4K) ABD's constructed * from a user page can be represented this way as long as the * page is mapped to a virtual address. This allows us to * apply an offset in to the mapped page. * * Note that kmap() must be used, not kmap_atomic(), because * the mapping needs to bet set up on all CPUs. Using kmap() * also enables the user of highmem pages when required. */ abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE; abd->abd_u.abd_linear.abd_sgl = table.sgl; zfs_kmap(sg_page(table.sgl)); ABD_LINEAR_BUF(abd) = sg_virt(table.sgl); } else { ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; ABD_SCATTER(abd).abd_offset = offset; ABD_SCATTER(abd).abd_sgl = table.sgl; ABD_SCATTER(abd).abd_nents = table.nents; ASSERT0(ABD_SCATTER(abd).abd_offset); } return (abd); } /* * If we're going to use this ABD for doing I/O using the block layer, the * consumer of the ABD data doesn't care if it's scattered or not, and we don't * plan to store this ABD in memory for a long period of time, we should * allocate the ABD type that requires the least data copying to do the I/O. * * On Linux the optimal thing to do would be to use abd_get_offset() and * construct a new ABD which shares the original pages thereby eliminating * the copy. But for the moment a new linear ABD is allocated until this * performance optimization can be implemented. */ abd_t * abd_alloc_for_io(size_t size, boolean_t is_metadata) { return (abd_alloc(size, is_metadata)); } abd_t * abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, size_t size) { (void) size; int i = 0; struct scatterlist *sg = NULL; abd_verify(sabd); ASSERT3U(off, <=, sabd->abd_size); size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; if (abd == NULL) abd = abd_alloc_struct(0); /* * Even if this buf is filesystem metadata, we only track that * if we own the underlying data buffer, which is not true in * this case. Therefore, we don't ever use ABD_FLAG_META here. */ abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { if (new_offset < sg->length) break; new_offset -= sg->length; } ABD_SCATTER(abd).abd_sgl = sg; ABD_SCATTER(abd).abd_offset = new_offset; ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; if (abd_is_from_pages(sabd)) abd->abd_flags |= ABD_FLAG_FROM_PAGES; return (abd); } /* * Initialize the abd_iter. */ void abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; if (!abd_is_linear(abd)) { aiter->iter_offset = ABD_SCATTER(abd).abd_offset; aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; } } /* * This is just a helper function to see if we have exhausted the * abd_iter and reached the end. */ boolean_t abd_iter_at_end(struct abd_iter *aiter) { ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); return (aiter->iter_pos == aiter->iter_abd->abd_size); } /* * Advance the iterator by a certain amount. Cannot be called when a chunk is * in use. This can be safely called when the aiter has already exhausted, in * which case this does nothing. */ void abd_iter_advance(struct abd_iter *aiter, size_t amount) { /* * Ensure that last chunk is not in use. abd_iterate_*() must clear * this state (directly or abd_iter_unmap()) before advancing. */ ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); ASSERT3P(aiter->iter_page, ==, NULL); ASSERT0(aiter->iter_page_doff); ASSERT0(aiter->iter_page_dsize); /* There's nothing left to advance to, so do nothing */ if (abd_iter_at_end(aiter)) return; aiter->iter_pos += amount; aiter->iter_offset += amount; if (!abd_is_linear(aiter->iter_abd)) { while (aiter->iter_offset >= aiter->iter_sg->length) { aiter->iter_offset -= aiter->iter_sg->length; aiter->iter_sg = sg_next(aiter->iter_sg); if (aiter->iter_sg == NULL) { ASSERT0(aiter->iter_offset); break; } } } } /* * Map the current chunk into aiter. This can be safely called when the aiter * has already exhausted, in which case this does nothing. */ void abd_iter_map(struct abd_iter *aiter) { void *paddr; size_t offset = 0; ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); /* There's nothing left to iterate over, so do nothing */ if (abd_iter_at_end(aiter)) return; if (abd_is_linear(aiter->iter_abd)) { ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); offset = aiter->iter_offset; aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; paddr = ABD_LINEAR_BUF(aiter->iter_abd); } else { offset = aiter->iter_offset; aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, aiter->iter_abd->abd_size - aiter->iter_pos); paddr = zfs_kmap_local(sg_page(aiter->iter_sg)); } aiter->iter_mapaddr = (char *)paddr + offset; } /* * Unmap the current chunk from aiter. This can be safely called when the aiter * has already exhausted, in which case this does nothing. */ void abd_iter_unmap(struct abd_iter *aiter) { /* There's nothing left to unmap, so do nothing */ if (abd_iter_at_end(aiter)) return; if (!abd_is_linear(aiter->iter_abd)) { /* LINTED E_FUNC_SET_NOT_USED */ zfs_kunmap_local(aiter->iter_mapaddr - aiter->iter_offset); } ASSERT3P(aiter->iter_mapaddr, !=, NULL); ASSERT3U(aiter->iter_mapsize, >, 0); aiter->iter_mapaddr = NULL; aiter->iter_mapsize = 0; } void abd_cache_reap_now(void) { } /* * Borrow a raw buffer from an ABD without copying the contents of the ABD * into the buffer. If the ABD is scattered, this will allocate a raw buffer * whose contents are undefined. To copy over the existing data in the ABD, use * abd_borrow_buf_copy() instead. */ void * abd_borrow_buf(abd_t *abd, size_t n) { void *buf; abd_verify(abd); ASSERT3U(abd->abd_size, >=, 0); /* * In the event the ABD is composed of a single user page from Direct * I/O we can not direclty return the raw buffer. This is a consequence * of not being able to write protect the page and the contents of the * page can be changed at any time by the user. */ if (abd_is_from_pages(abd)) { buf = zio_buf_alloc(n); } else if (abd_is_linear(abd)) { buf = abd_to_buf(abd); } else { buf = zio_buf_alloc(n); } #ifdef ZFS_DEBUG (void) zfs_refcount_add_many(&abd->abd_children, n, buf); #endif return (buf); } void * abd_borrow_buf_copy(abd_t *abd, size_t n) { void *buf = abd_borrow_buf(abd, n); /* * In the event the ABD is composed of a single user page from Direct * I/O we must make sure copy the data over into the newly allocated * buffer. This is a consequence of the fact that we can not write * protect the user page and there is a risk the contents of the page * could be changed by the user at any moment. */ if (!abd_is_linear(abd) || abd_is_from_pages(abd)) { abd_copy_to_buf(buf, abd, n); } return (buf); } /* * Return a borrowed raw buffer to an ABD. If the ABD is scatterd, this will * not change the contents of the ABD. If you want any changes you made to * buf to be copied back to abd, use abd_return_buf_copy() instead. If the * ABD is not constructed from user pages for Direct I/O then an ASSERT * checks to make sure the contents of buffer have not changed since it was * borrowed. We can not ASSERT that the contents of the buffer have not changed * if it is composed of user pages because the pages can not be placed under * write protection and the user could have possibly changed the contents in * the pages at any time. This is also an issue for Direct I/O reads. Checksum * verifications in the ZIO pipeline check for this issue and handle it by * returning an error on checksum verification failure. */ void abd_return_buf(abd_t *abd, void *buf, size_t n) { abd_verify(abd); ASSERT3U(abd->abd_size, >=, n); #ifdef ZFS_DEBUG (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); #endif if (abd_is_from_pages(abd)) { zio_buf_free(buf, n); } else if (abd_is_linear(abd)) { ASSERT3P(buf, ==, abd_to_buf(abd)); } else if (abd_is_gang(abd)) { #ifdef ZFS_DEBUG /* * We have to be careful with gang ABD's that we do not ASSERT0 * for any ABD's that contain user pages from Direct I/O. In * order to handle this, we just iterate through the gang ABD * and only verify ABDs that are not from user pages. */ void *cmp_buf = buf; for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL; cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { if (!abd_is_from_pages(cabd)) { ASSERT0(abd_cmp_buf(cabd, cmp_buf, cabd->abd_size)); } cmp_buf = (char *)cmp_buf + cabd->abd_size; } #endif zio_buf_free(buf, n); } else { ASSERT0(abd_cmp_buf(abd, buf, n)); zio_buf_free(buf, n); } } void abd_return_buf_copy(abd_t *abd, void *buf, size_t n) { if (!abd_is_linear(abd) || abd_is_from_pages(abd)) { abd_copy_from_buf(abd, buf, n); } abd_return_buf(abd, buf, n); } /* * This is abd_iter_page(), the function underneath abd_iterate_page_func(). * It yields the next page struct and data offset and size within it, without * mapping it into the address space. */ /* * "Compound pages" are a group of pages that can be referenced from a single * struct page *. Its organised as a "head" page, followed by a series of * "tail" pages. * * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a * great many of the IO buffers we get are going to be of this type. * * The tail pages are just regular PAGESIZE pages, and can be safely used * as-is. However, the head page has length covering itself and all the tail * pages. If the ABD chunk spans multiple pages, then we can use the head page * and a >PAGESIZE length, which is far more efficient. * * Before kernel 4.5 however, compound page heads were refcounted separately * from tail pages, such that moving back to the head page would require us to * take a reference to it and releasing it once we're completely finished with * it. In practice, that meant when our caller is done with the ABD, which we * have no insight into from here. Rather than contort this API to track head * page references on such ancient kernels, we disabled this special compound * page handling on kernels before 4.5, instead just using treating each page * within it as a regular PAGESIZE page (which it is). This is slightly less * efficient, but makes everything far simpler. * * We no longer support kernels before 4.5, so in theory none of this is * necessary. However, this code is still relatively new in the grand scheme of * things, so I'm leaving the ability to compile this out for the moment. * * Setting/clearing ABD_ITER_COMPOUND_PAGES below enables/disables the special * handling, by defining the ABD_ITER_PAGE_SIZE(page) macro to understand * compound pages, or not, and compiling in/out the support to detect compound * tail pages and move back to the start. */ /* On by default */ #define ABD_ITER_COMPOUND_PAGES #ifdef ABD_ITER_COMPOUND_PAGES #define ABD_ITER_PAGE_SIZE(page) \ (PageCompound(page) ? page_size(page) : PAGESIZE) #else #define ABD_ITER_PAGE_SIZE(page) (PAGESIZE) #endif void abd_iter_page(struct abd_iter *aiter) { if (abd_iter_at_end(aiter)) { aiter->iter_page = NULL; aiter->iter_page_doff = 0; aiter->iter_page_dsize = 0; return; } struct page *page; size_t doff, dsize; /* * Find the page, and the start of the data within it. This is computed * differently for linear and scatter ABDs; linear is referenced by * virtual memory location, while scatter is referenced by page * pointer. */ if (abd_is_linear(aiter->iter_abd)) { ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); /* memory address at iter_pos */ void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; /* struct page for address */ page = is_vmalloc_addr(paddr) ? vmalloc_to_page(paddr) : virt_to_page(paddr); /* offset of address within the page */ doff = offset_in_page(paddr); } else { ASSERT(!abd_is_gang(aiter->iter_abd)); /* current scatter page */ page = nth_page(sg_page(aiter->iter_sg), aiter->iter_offset >> PAGE_SHIFT); /* position within page */ doff = aiter->iter_offset & (PAGESIZE - 1); } #ifdef ABD_ITER_COMPOUND_PAGES if (PageTail(page)) { /* * If this is a compound tail page, move back to the head, and * adjust the offset to match. This may let us yield a much * larger amount of data from a single logical page, and so * leave our caller with fewer pages to process. */ struct page *head = compound_head(page); doff += ((page - head) * PAGESIZE); page = head; } #endif ASSERT(page); /* * Compute the maximum amount of data we can take from this page. This * is the smaller of: * - the remaining space in the page * - the remaining space in this scatterlist entry (which may not cover * the entire page) * - the remaining space in the abd (which may not cover the entire * scatterlist entry) */ dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff, aiter->iter_abd->abd_size - aiter->iter_pos); if (!abd_is_linear(aiter->iter_abd)) dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset); ASSERT3U(dsize, >, 0); /* final iterator outputs */ aiter->iter_page = page; aiter->iter_page_doff = doff; aiter->iter_page_dsize = dsize; } /* * Note: ABD BIO functions only needed to support vdev_classic. See comments in * vdev_disk.c. */ /* * bio_nr_pages for ABD. * @off is the offset in @abd */ unsigned long abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) { unsigned long pos; if (abd_is_gang(abd)) { unsigned long count = 0; for (abd_t *cabd = abd_gang_get_offset(abd, &off); cabd != NULL && size != 0; cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { ASSERT3U(off, <, cabd->abd_size); int mysize = MIN(size, cabd->abd_size - off); count += abd_nr_pages_off(cabd, mysize, off); size -= mysize; off = 0; } return (count); } if (abd_is_linear(abd)) pos = (unsigned long)abd_to_buf(abd) + off; else pos = ABD_SCATTER(abd).abd_offset + off; return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - (pos >> PAGE_SHIFT)); } static unsigned int bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size) { unsigned int offset, size, i; struct page *page; offset = offset_in_page(buf_ptr); for (i = 0; i < bio->bi_max_vecs; i++) { size = PAGE_SIZE - offset; if (bio_size <= 0) break; if (size > bio_size) size = bio_size; if (is_vmalloc_addr(buf_ptr)) page = vmalloc_to_page(buf_ptr); else page = virt_to_page(buf_ptr); /* * Some network related block device uses tcp_sendpage, which * doesn't behave well when using 0-count page, this is a * safety net to catch them. */ ASSERT3S(page_count(page), >, 0); if (bio_add_page(bio, page, size, offset) != size) break; buf_ptr += size; bio_size -= size; offset = 0; } return (bio_size); } /* * bio_map for gang ABD. */ static unsigned int abd_gang_bio_map_off(struct bio *bio, abd_t *abd, unsigned int io_size, size_t off) { ASSERT(abd_is_gang(abd)); for (abd_t *cabd = abd_gang_get_offset(abd, &off); cabd != NULL; cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { ASSERT3U(off, <, cabd->abd_size); int size = MIN(io_size, cabd->abd_size - off); int remainder = abd_bio_map_off(bio, cabd, size, off); io_size -= (size - remainder); if (io_size == 0 || remainder > 0) return (io_size); off = 0; } ASSERT0(io_size); return (io_size); } /* * bio_map for ABD. * @off is the offset in @abd * Remaining IO size is returned */ unsigned int abd_bio_map_off(struct bio *bio, abd_t *abd, unsigned int io_size, size_t off) { struct abd_iter aiter; ASSERT3U(io_size, <=, abd->abd_size - off); if (abd_is_linear(abd)) return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size)); ASSERT(!abd_is_linear(abd)); if (abd_is_gang(abd)) return (abd_gang_bio_map_off(bio, abd, io_size, off)); abd_iter_init(&aiter, abd); abd_iter_advance(&aiter, off); for (int i = 0; i < bio->bi_max_vecs; i++) { struct page *pg; size_t len, sgoff, pgoff; struct scatterlist *sg; if (io_size <= 0) break; sg = aiter.iter_sg; sgoff = aiter.iter_offset; pgoff = sgoff & (PAGESIZE - 1); len = MIN(io_size, PAGESIZE - pgoff); ASSERT(len > 0); pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); if (bio_add_page(bio, pg, len, pgoff) != len) break; io_size -= len; abd_iter_advance(&aiter, len); } return (io_size); } /* Tunable Parameters */ module_param(zfs_abd_scatter_enabled, int, 0644); MODULE_PARM_DESC(zfs_abd_scatter_enabled, "Toggle whether ABD allocations must be linear."); module_param(zfs_abd_scatter_min_size, int, 0644); MODULE_PARM_DESC(zfs_abd_scatter_min_size, "Minimum size of scatter allocations."); -/* CSTYLED */ module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); diff --git a/module/os/linux/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c index a017900d5538..7d01f8f373b2 100644 --- a/module/os/linux/zfs/zfs_debug.c +++ b/module/os/linux/zfs/zfs_debug.c @@ -1,220 +1,218 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include #include typedef struct zfs_dbgmsg { procfs_list_node_t zdm_node; uint64_t zdm_timestamp; uint_t zdm_size; char zdm_msg[]; /* variable length allocation */ } zfs_dbgmsg_t; static procfs_list_t zfs_dbgmsgs; static uint_t zfs_dbgmsg_size = 0; static uint_t zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ /* * Internal ZFS debug messages are enabled by default. * * # Print debug messages * cat /proc/spl/kstat/zfs/dbgmsg * * # Disable the kernel debug message log. * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable * * # Clear the kernel debug message log. * echo 0 >/proc/spl/kstat/zfs/dbgmsg */ int zfs_dbgmsg_enable = B_TRUE; static int zfs_dbgmsg_show_header(struct seq_file *f) { seq_printf(f, "%-12s %-8s\n", "timestamp", "message"); return (0); } static int zfs_dbgmsg_show(struct seq_file *f, void *p) { zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)p; seq_printf(f, "%-12llu %-s\n", (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg); return (0); } static void zfs_dbgmsg_purge(uint_t max_size) { while (zfs_dbgmsg_size > max_size) { zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs.pl_list); if (zdm == NULL) return; uint_t size = zdm->zdm_size; kmem_free(zdm, size); zfs_dbgmsg_size -= size; } } static int zfs_dbgmsg_clear(procfs_list_t *procfs_list) { (void) procfs_list; mutex_enter(&zfs_dbgmsgs.pl_lock); zfs_dbgmsg_purge(0); mutex_exit(&zfs_dbgmsgs.pl_lock); return (0); } void zfs_dbgmsg_init(void) { procfs_list_install("zfs", NULL, "dbgmsg", 0600, &zfs_dbgmsgs, zfs_dbgmsg_show, zfs_dbgmsg_show_header, zfs_dbgmsg_clear, offsetof(zfs_dbgmsg_t, zdm_node)); } void zfs_dbgmsg_fini(void) { procfs_list_uninstall(&zfs_dbgmsgs); zfs_dbgmsg_purge(0); procfs_list_destroy(&zfs_dbgmsgs); } void __set_error(const char *file, const char *func, int line, int err) { /* * To enable this: * * $ echo 512 >/sys/module/zfs/parameters/zfs_flags */ if (zfs_flags & ZFS_DEBUG_SET_ERROR) __dprintf(B_FALSE, file, func, line, "error %lu", (ulong_t)err); } void __zfs_dbgmsg(char *buf) { uint_t size = sizeof (zfs_dbgmsg_t) + strlen(buf) + 1; zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP); zdm->zdm_size = size; zdm->zdm_timestamp = gethrestime_sec(); strcpy(zdm->zdm_msg, buf); mutex_enter(&zfs_dbgmsgs.pl_lock); procfs_list_add(&zfs_dbgmsgs, zdm); zfs_dbgmsg_size += size; zfs_dbgmsg_purge(zfs_dbgmsg_maxsize); mutex_exit(&zfs_dbgmsgs.pl_lock); } void __dprintf(boolean_t dprint, const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; va_list adx; size_t size; char *buf; char *nl; int i; char *prefix = (dprint) ? "dprintf: " : ""; size = 1024; buf = kmem_alloc(size, KM_SLEEP); /* * Get rid of annoying prefix to filename. */ newfile = strrchr(file, '/'); if (newfile != NULL) { newfile = newfile + 1; /* Get rid of leading / */ } else { newfile = file; } i = snprintf(buf, size, "%px %s%s:%d:%s(): ", curthread, prefix, newfile, line, func); if (i < size) { va_start(adx, fmt); (void) vsnprintf(buf + i, size - i, fmt, adx); va_end(adx); } /* * Get rid of trailing newline for dprintf logs. */ if (dprint && buf[0] != '\0') { nl = &buf[strlen(buf) - 1]; if (*nl == '\n') *nl = '\0'; } /* * To get this data enable the zfs__dprintf trace point as shown: * * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable * $ echo 0 > /sys/kernel/debug/tracing/trace * * # Dump the ring buffer. * $ cat /sys/kernel/debug/tracing/trace */ DTRACE_PROBE1(zfs__dprintf, char *, buf); /* * To get this data: * * $ cat /proc/spl/kstat/zfs/dbgmsg * * To clear the buffer: * $ echo 0 > /proc/spl/kstat/zfs/dbgmsg */ __zfs_dbgmsg(buf); kmem_free(buf, size); } module_param(zfs_dbgmsg_enable, int, 0644); MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log"); -/* BEGIN CSTYLED */ module_param(zfs_dbgmsg_maxsize, uint, 0644); -/* END CSTYLED */ MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size"); diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index dd9fd760b9c2..a882c88a7a72 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1,4351 +1,4350 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using zfs_enter(zfsvfs). * A zfs_exit(zfsvfs) is needed before all returns. Any znodes * must be checked with zfs_verify_zp(zp). Both of these macros * can return EIO from the calling function. * * (2) zrele() should always be the last thing except for zil_commit() (if * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the * last reference, the vnode/znode can be freed, so the zp may point to * freed memory. Second, the last reference will call zfs_zinactive(), * which may induce a lot of work -- pushing cached pages (which acquires * range locks) and syncing out cached atime changes. Third, * zfs_zinactive() may require a new tx, which could deadlock the system * if you were already holding one. This deadlock occurs because the tx * currently being operated on prevents a txg from syncing, which * prevents the new tx from progressing, resulting in a deadlock. If you * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() * is a synonym for zrele(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * zfs_enter(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * zfs_exit(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * zil_commit(zilog, foid); // synchronous when necessary * zfs_exit(zfsvfs); // finished in zfs * return (error); // done, report error */ int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* Honor ZFS_APPENDONLY file attribute */ if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } /* * Keep a count of the synchronous opens in the znode. On first * synchronous open we must convert all previous async transactions * into sync to keep correct ordering. */ if (flag & O_SYNC) { if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1) zil_async_to_sync(zfsvfs->z_log, zp->z_id); } zfs_exit(zfsvfs, FTAG); return (0); } int zfs_close(struct inode *ip, int flag, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* Decrement the synchronous opens in the znode */ if (flag & O_SYNC) atomic_dec_32(&zp->z_sync_cnt); zfs_exit(zfsvfs, FTAG); return (0); } #if defined(_KERNEL) static int zfs_fillpage(struct inode *ip, struct page *pp); /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. Update all mapped * pages with the contents of the coresponding dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { struct address_space *mp = ZTOI(zp)->i_mapping; int64_t off = start & (PAGE_SIZE - 1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { uint64_t nbytes = MIN(PAGE_SIZE - off, len); struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); void *pb = kmap(pp); int error = dmu_read(os, zp->z_id, start + off, nbytes, pb + off, DMU_READ_PREFETCH); kunmap(pp); if (error) { SetPageError(pp); ClearPageUptodate(pp); } else { ClearPageError(pp); SetPageUptodate(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); } unlock_page(pp); put_page(pp); } len -= nbytes; off = 0; } } /* * When a file is memory mapped, we must keep the I/O data synchronized * between the DMU cache and the memory mapped pages. Preferentially read * from memory mapped pages, otherwise fallback to reading through the dmu. */ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; int64_t start = uio->uio_loffset; int64_t off = start & (PAGE_SIZE - 1); int len = nbytes; int error = 0; for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { uint64_t bytes = MIN(PAGE_SIZE - off, len); struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { /* * If filemap_fault() retries there exists a window * where the page will be unlocked and not up to date. * In this case we must try and fill the page. */ if (unlikely(!PageUptodate(pp))) { error = zfs_fillpage(ip, pp); if (error) { unlock_page(pp); put_page(pp); return (error); } } ASSERT(PageUptodate(pp) || PageDirty(pp)); unlock_page(pp); void *pb = kmap(pp); error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); put_page(pp); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); } len -= bytes; off = 0; if (error) break; } return (error); } #endif /* _KERNEL */ static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; /* * Write the bytes to a file. * * IN: zp - znode of file to be written to * data - bytes to write * len - number of bytes to write * pos - offset to start writing at * * OUT: resid - remaining bytes to write * * RETURN: 0 if success * positive error code if failure. EIO is returned * for a short write when residp isn't provided. * * Timestamps: * zp - ctime|mtime updated if byte count > 0 */ int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *residp) { fstrans_cookie_t cookie; int error; struct iovec iov; iov.iov_base = (void *)data; iov.iov_len = len; zfs_uio_t uio; zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); cookie = spl_fstrans_mark(); error = zfs_write(zp, &uio, 0, kcred); spl_fstrans_unmark(cookie); if (error == 0) { if (residp != NULL) *residp = zfs_uio_resid(&uio); else if (zfs_uio_resid(&uio) != 0) error = SET_ERROR(EIO); } return (error); } static void zfs_rele_async_task(void *arg) { iput(arg); } void zfs_zrele_async(znode_t *zp) { struct inode *ip = ZTOI(zp); objset_t *os = ITOZSB(ip)->z_os; ASSERT(atomic_read(&ip->i_count) > 0); ASSERT(os != NULL); /* * If decrementing the count would put us at 0, we can't do it inline * here, because that would be synchronous. Instead, dispatch an iput * to run later. * * For more information on the dangers of a synchronous iput, see the * header comment of this file. */ if (!atomic_add_unless(&ip->i_count, -1, 1)) { VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held inode reference for it. * * IN: zdp - znode of directory to search. * nm - name of entry to lookup. * flags - LOOKUP_XATTR set if looking for an attribute. * cr - credentials of caller. * direntflags - directory lookup flags * realpnp - returned pathname. * * OUT: zpp - znode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ int zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ZTOZSB(zdp); int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { if (!S_ISDIR(ZTOI(zdp)->i_mode)) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { error = zfs_fastaccesschk_execute(zdp, cr); if (!error) { *zpp = zdp; zhold(*zpp); return (0); } return (error); } } if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) return (error); *zpp = NULL; if (flags & LOOKUP_XATTR) { /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { zfs_exit(zfsvfs, FTAG); return (error); } /* * Do we have permission to get into attribute directory? */ if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, B_TRUE, cr, zfs_init_idmap))) { zrele(*zpp); *zpp = NULL; } zfs_exit(zfsvfs, FTAG); return (error); } if (!S_ISDIR(ZTOI(zdp)->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTDIR)); } /* * Check accessibility of directory. */ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); if ((error == 0) && (*zpp)) zfs_znode_update_vfs(*zpp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Perform a linear search in directory for the name of specific inode. * Note we don't pass in the buffer size of name because it's hardcoded to * NAME_MAX+1(256) in Linux. * * IN: dzp - znode of directory to search. * zp - znode of the target * * OUT: name - dentry name of the target * * RETURN: 0 on success, error code on failure. */ int zfs_get_name(znode_t *dzp, char *name, znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(dzp); int error = 0; if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); if ((error = zfs_verify_zp(zp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* ctldir should have got their name in zfs_vget */ if (dzp->z_is_ctldir || zp->z_is_ctldir) { zfs_exit(zfsvfs, FTAG); return (ENOENT); } /* buffer len is hardcoded to 256 in Linux kernel */ error = zap_value_search(zfsvfs->z_os, dzp->z_id, zp->z_id, ZFS_DIRENT_OBJ(-1ULL), name, ZAP_MAXNAMELEN); zfs_exit(zfsvfs, FTAG); return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the ip of the created or trunc'd file. * * IN: dzp - znode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - file flag. * vsecp - ACL to be set * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated if new entry created * zp - ctime|mtime always, atime if new */ int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; objset_t *os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } top: *zpp = NULL; if (*name == '\0') { /* * Null component name refers to the directory itself. */ zhold(dzp); zp = dzp; dl = NULL; error = 0; } else { /* possible igrab(zp) */ int zflg = 0; if (flag & FIGNORECASE) zflg |= ZCILOOK; error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { if (have_acl) zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = SET_ERROR(EISDIR); zfs_exit(zfsvfs, FTAG); return (error); } } if (zp == NULL) { uint64_t txtype; uint64_t projid = ZFS_DEFAULT_PROJID; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr, mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { if (have_acl) zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EINVAL); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { /* * Since, we failed to add the directory entry for it, * delete the newly created dnode. */ zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & O_APPEND) ? V_APPEND : 0; if (have_acl) zfs_acl_ids_free(&acl_ids); /* * A directory entry already exists for this name. */ /* * Can't truncate an existing file if in exclusive mode. */ if (excl) { error = SET_ERROR(EEXIST); goto out; } /* * Can't open a directory for writing. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EISDIR); goto out; } /* * Verify requested access to file. */ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr, mnt_ns))) { goto out; } mutex_enter(&dzp->z_lock); dzp->z_seq++; mutex_exit(&dzp->z_lock); /* * Truncate regular files if requested. */ if (S_ISREG(ZTOI(zp)->i_mode) && (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { /* we can't hold any locks when calling zfs_freesp() */ if (dl) { zfs_dirent_unlock(dl); dl = NULL; } error = zfs_freesp(zp, 0, 0, mode, TRUE); } } out: if (dl) zfs_dirent_unlock(dl); if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *zpp = zp; } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns) { (void) excl, (void) mode, (void) flag; znode_t *zp = NULL, *dzp = ITOZ(dip); zfsvfs_t *zfsvfs = ITOZSB(dip); objset_t *os; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; uint64_t projid = ZFS_DEFAULT_PROJID; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); os = zfsvfs->z_os; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } top: *ipp = NULL; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* Add to unlinked set */ zp->z_unlinked = B_TRUE; zfs_unlinked_add(zp, tx); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); out: if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *ipp = ZTOI(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove an entry from a directory. * * IN: dzp - znode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime * ip - ctime (if nlink > 0) */ static uint64_t null_xattr = 0; int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) { znode_t *zp; znode_t *xzp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; uint64_t links; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; pathname_t *realnmp = NULL; pathname_t realnm; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) { zflg |= ZCILOOK; pn_alloc(&realnm); realnmp = &realnm; } top: xattr_obj = 0; xzp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, realnmp))) { if (realnmp) pn_free(realnmp); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { goto out; } /* * Need to use rmdir for removing directories. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EPERM); goto out; } mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && !zn_has_cached_data(zp, 0, LLONG_MAX); mutex_exit(&zp->z_lock); /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the inode. So we dmu_tx_hold() the right things to * allow for either case. */ obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (may_delete_now) { toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; /* if the file is too big, only hold_free a token amount */ dmu_tx_hold_free(tx, zp->z_id, 0, (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); } /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } mutex_enter(&zp->z_lock); if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); mutex_exit(&zp->z_lock); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); goto top; } if (realnmp) pn_free(realnmp); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { /* * Hold z_lock so that we can make sure that the ACL obj * hasn't changed. Could have been deleted due to * zfs_sa_upgrade(). */ mutex_enter(&zp->z_lock); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && atomic_read(&ZTOI(zp)->i_count) == 1 && !zn_has_cached_data(zp, 0, LLONG_MAX) && xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; VERIFY_IMPLY(xattr_obj_unlinked, xzp); } if (delete_now) { if (xattr_obj_unlinked) { ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; clear_nlink(ZTOI(xzp)); links = 0; error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &links, sizeof (links), tx); ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); if (zp->z_is_sa) error = sa_remove(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), tx); else error = sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &null_xattr, sizeof (uint64_t), tx); ASSERT0(error); } /* * Add to the unlinked set because a new reference could be * taken concurrently resulting in a deferred destruction. */ zfs_unlinked_add(zp, tx); mutex_exit(&zp->z_lock); } else if (unlinked) { mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); } txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); dmu_tx_commit(tx); out: if (realnmp) pn_free(realnmp); zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); if (delete_now) zrele(zp); else zfs_zrele_async(zp); if (xzp) { zfs_znode_update_vfs(xzp); zfs_zrele_async(xzp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create a new directory and insert it into dzp using the name * provided. Return a pointer to the inserted directory. * * IN: dzp - znode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * flags - case flags. * vsecp - ACL to be set * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created directory. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime updated * zpp - ctime|mtime|atime updated */ int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; int zf = ZNEW; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t waited = B_FALSE; ASSERT(S_ISDIR(vap->va_mode)); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (dirname == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ top: *zpp = NULL; if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL))) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); /* * Now put new name in parent dir. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); *zpp = zp; txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, acl_ids.z_fuidp, vap); out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (error != 0) { zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dzp - znode of directory to remove from. * name - name of directory to be removed. * cwd - inode of current working directory. * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated */ int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) zflg |= ZCILOOK; top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL))) { zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { goto out; } if (!S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(ENOTDIR); goto out; } if (zp == cwd) { error = SET_ERROR(EINVAL); goto out; } /* * Grab a lock on the directory to make sure that no one is * trying to add (or lookup) entries while we are removing it. */ rw_enter(&zp->z_name_lock, RW_WRITER); /* * Grab a lock on the parent pointer to make sure we play well * with the treewalk and directory rename code. */ rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); goto top; } dmu_tx_abort(tx); zrele(zp); zfs_exit(zfsvfs, FTAG); return (error); } error = zfs_link_destroy(dl, zp, tx, zflg, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, B_FALSE); } dmu_tx_commit(tx); rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); out: zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); zrele(zp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } /* * Read directory entries from the given directory cursor position and emit * name and position for each entry. * * IN: ip - inode of directory to read. * ctx - directory entry context. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ int zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; zap_cursor_t zc; zap_attribute_t *zap; int error; uint8_t prefetch; uint8_t type; int done = 0; uint64_t parent; uint64_t offset; /* must be unsigned; checks for < 1 */ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) goto out; /* * Quit if directory has been removed (posix) */ if (zp->z_unlinked) goto out; error = 0; os = zfsvfs->z_os; offset = ctx->pos; prefetch = zp->z_zn_prefetch; zap = zap_attribute_long_alloc(); /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Transform to file-system independent format */ while (!done) { uint64_t objnum; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap->za_name, "."); zap->za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap->za_name, ".."); zap->za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap->za_name, ZFS_CTLDIR_NAME); zap->za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if ((error = zap_cursor_retrieve(&zc, zap))) { if (error == ENOENT) break; else goto update; } /* * Allow multiple entries provided the first entry is * the object id. Non-zpl consumers may safely make * use of the additional space. * * XXX: This should be a feature flag for compatibility */ if (zap->za_integer_length != 8 || zap->za_num_integers == 0) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld, " "length = %d, num = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset, zap->za_integer_length, (u_longlong_t)zap->za_num_integers); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap->za_first_integer); type = ZFS_DIRENT_TYPE(zap->za_first_integer); } done = !dir_emit(ctx, zap->za_name, strlen(zap->za_name), objnum, type); if (done) break; if (prefetch) dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } ctx->pos = offset; } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ update: zap_cursor_fini(&zc); zap_attribute_free(zap); if (error == ENOENT) error = 0; out: zfs_exit(zfsvfs, FTAG); return (error); } /* * Get the basic file attributes and place them in the provided kstat * structure. The inode is assumed to be the authoritative source * for most of the attributes. However, the znode currently has the * authoritative atime, blksize, and block count. * * IN: ip - inode of file. * * OUT: sp - kstat values. * * RETURN: 0 (always succeeds) */ int #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip, struct kstat *sp) #else zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) #endif { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t blksize; u_longlong_t nblocks; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); mutex_enter(&zp->z_lock); #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK zpl_generic_fillattr(user_ns, request_mask, ip, sp); #else zpl_generic_fillattr(user_ns, ip, sp); #endif /* * +1 link count for root inode with visible '.zfs' directory. */ if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) if (sp->nlink < ZFS_LINK_MAX) sp->nlink++; sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); sp->blksize = blksize; sp->blocks = nblocks; if (unlikely(zp->z_blksz == 0)) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ sp->blksize = zfsvfs->z_max_blksz; } mutex_exit(&zp->z_lock); /* * Required to prevent NFS client from detecting different inode * numbers of snapshot root dentry before and after snapshot mount. */ if (zfsvfs->z_issnap) { if (ip->i_sb->s_root->d_inode == ip) sp->ino = ZFSCTL_INO_SNAPDIRS - dmu_objset_id(zfsvfs->z_os); } zfs_exit(zfsvfs, FTAG); return (0); } /* * For the operation of changing file's user/group/project, we need to * handle not only the main object that is assigned to the file directly, * but also the ones that are used by the file via hidden xattr directory. * * Because the xattr directory may contains many EA entries, as to it may * be impossible to change all of them via the transaction of changing the * main object's user/group/project attributes. Then we have to change them * via other multiple independent transactions one by one. It may be not good * solution, but we have no better idea yet. */ static int zfs_setattr_dir(znode_t *dzp) { struct inode *dxip = ZTOI(dzp); struct inode *xip = NULL; zfsvfs_t *zfsvfs = ZTOZSB(dzp); objset_t *os = zfsvfs->z_os; zap_cursor_t zc; zap_attribute_t *zap; zfs_dirlock_t *dl; znode_t *zp = NULL; dmu_tx_t *tx = NULL; uint64_t uid, gid; sa_bulk_attr_t bulk[4]; int count; int err; zap = zap_attribute_alloc(); zap_cursor_init(&zc, os, dzp->z_id); while ((err = zap_cursor_retrieve(&zc, zap)) == 0) { count = 0; if (zap->za_integer_length != 8 || zap->za_num_integers != 1) { err = ENXIO; break; } err = zfs_dirent_lock(&dl, dzp, (char *)zap->za_name, &zp, ZEXISTS, NULL, NULL); if (err == ENOENT) goto next; if (err) break; xip = ZTOI(zp); if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && zp->z_projid == dzp->z_projid) goto next; tx = dmu_tx_create(os); if (!(zp->z_pflags & ZFS_PROJID)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); err = dmu_tx_assign(tx, TXG_WAIT); if (err) break; mutex_enter(&dzp->z_lock); if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { xip->i_uid = dxip->i_uid; uid = zfs_uid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, sizeof (uid)); } if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { xip->i_gid = dxip->i_gid; gid = zfs_gid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, sizeof (gid)); } uint64_t projid = dzp->z_projid; if (zp->z_projid != projid) { if (!(zp->z_pflags & ZFS_PROJID)) { err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) { err = 0; } else if (err != 0) { goto sa_add_projid_err; } else { projid = ZFS_INVALID_PROJID; } } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } } sa_add_projid_err: mutex_exit(&dzp->z_lock); if (likely(count > 0)) { err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } else if (projid == ZFS_INVALID_PROJID) { dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } tx = NULL; if (err != 0 && err != ENOENT) break; next: if (zp) { zrele(zp); zp = NULL; zfs_dirent_unlock(dl); } zap_cursor_advance(&zc); } if (tx) dmu_tx_abort(tx); if (zp) { zrele(zp); zfs_dirent_unlock(dl); } zap_cursor_fini(&zc); zap_attribute_free(zap); return (err == ENOENT ? 0 : err); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: zp - znode of file to be modified. * vap - new attribute values. * If ATTR_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * mnt_ns - user namespace of the mount * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime updated, mtime updated if size changed. */ int zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) { struct inode *ip; zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t *tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; int trim_mask = 0; uint64_t new_mode; uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2], atime[2]; uint64_t projid = ZFS_INVALID_PROJID; znode_t *attrzp; int need_policy = FALSE; int err, err2 = 0; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; boolean_t handle_eadir = B_FALSE; sa_bulk_attr_t *bulk, *xattr_bulk; int count = 0, xattr_count = 0, bulks = 8; if (mask == 0) return (0); if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); ip = ZTOI(zp); os = zfsvfs->z_os; /* * If this is a xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); if (xoap != NULL && (mask & ATTR_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) projid = ZFS_INVALID_PROJID; else need_policy = TRUE; } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } } zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & ATTR_XVATTR))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); xva_init(tmpxvattr); bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { err = SET_ERROR(EPERM); goto out3; } if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { err = SET_ERROR(EPERM); goto out3; } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (ATTR_ATIME | ATTR_MTIME)) { if (((mask & ATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & ATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { err = SET_ERROR(EOVERFLOW); goto out3; } } top: attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfs_is_readonly(zfsvfs)) { err = SET_ERROR(EROFS); goto out3; } /* * First validate permissions */ if (mask & ATTR_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, mnt_ns); if (err) goto out3; /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) goto out3; } if (mask & (ATTR_ATIME|ATTR_MTIME) || ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr, mnt_ns); } if (mask & (ATTR_UID|ATTR_GID)) { int idmask = (mask & (ATTR_UID|ATTR_GID)); int take_owner; int take_group; uid_t uid; gid_t gid; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & ATTR_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip), vap->va_uid); gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip), vap->va_gid); take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); take_group = (mask & ATTR_GID) && zfs_groupmember(zfsvfs, gid, cr); /* * If both ATTR_UID and ATTR_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (ATTR_UID|ATTR_GID)) && take_owner && take_group) || ((idmask == ATTR_UID) && take_owner) || ((idmask == ATTR_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr, mnt_ns) == 0) { /* * Remove setuid/setgid for non-privileged users */ (void) secpolicy_setid_clear(vap, cr); trim_mask = (mask & (ATTR_UID|ATTR_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & ATTR_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { if (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_PROJINHERIT); XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((!S_ISREG(ip->i_mode) && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { mutex_exit(&zp->z_lock); err = SET_ERROR(EPERM); goto out3; } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } mutex_exit(&zp->z_lock); if (mask & ATTR_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, mnt_ns) == 0) { err = secpolicy_setid_setsticky_clear(ip, vap, &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); if (err) goto out3; trim_mask |= ATTR_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, zfs_zaccess_unix, zp); if (err) goto out3; if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { handle_eadir = B_TRUE; err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); if (err) goto out2; } if (mask & ATTR_UID) { new_kuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_kuid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & ATTR_GID) { new_kgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_kgid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (projid != ZFS_INVALID_PROJID && zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { if (attrzp) zrele(attrzp); err = EDQUOT; goto out2; } } tx = dmu_tx_create(os); if (mask & ATTR_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = EPERM; goto out; } if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) goto out; mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if (((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID))) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { /* * For the existed object that is upgraded from old system, * its on-disk layout has no slot for the project ID attribute. * But quota accounting logic needs to access related slots by * offset directly. So we need to adjust old objects' layout * to make the project ID to some unified and fixed offset. */ if (attrzp) err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); if (err == 0) err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) err = 0; else if (err != 0) goto out; else projid = ZFS_INVALID_PROJID; } if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); if (projid != ZFS_INVALID_PROJID) { attrzp->z_projid = projid; SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, sizeof (attrzp->z_projid)); } } if (mask & (ATTR_UID|ATTR_GID)) { if (mask & ATTR_UID) { ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); new_uid = zfs_uid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); } } if (mask & ATTR_GID) { ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); new_gid = zfs_gid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); } } if (!(mask & ATTR_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & ATTR_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = ZTOI(zp)->i_mode = new_mode; ASSERT3P(aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; inode_timespec_t tmp_atime = zpl_inode_get_atime(ip); ZFS_TIME_ENCODE(&tmp_atime, atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } if (mask & (ATTR_MTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); zpl_inode_set_mtime_to_ts(ZTOI(zp), zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp))); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } if (mask & (ATTR_CTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_ctime, ctime); zpl_inode_set_ctime_to_ts(ZTOI(zp), zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp))); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } if (attrzp && mask) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & ATTR_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { XVA_SET_REQ(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(S_ISREG(ip->i_mode)); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&attrzp->z_acl_lock); mutex_exit(&attrzp->z_lock); } out: if (err == 0 && xattr_count > 0) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); if (attrzp) zrele(attrzp); if (err == ERESTART) goto top; } else { if (count > 0) err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); if (attrzp) { if (err2 == 0 && handle_eadir) err = zfs_setattr_dir(attrzp); zrele(attrzp); } zfs_znode_update_vfs(zp); } out2: if (os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); out3: kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(tmpxvattr, sizeof (xvattr_t)); zfs_exit(zfsvfs, FTAG); return (err); } typedef struct zfs_zlock { krwlock_t *zl_rwlock; /* lock we acquired */ znode_t *zl_znode; /* znode we held */ struct zfs_zlock *zl_next; /* next in list */ } zfs_zlock_t; /* * Drop locks and release vnodes that were held by zfs_rename_lock(). */ static void zfs_rename_unlock(zfs_zlock_t **zlpp) { zfs_zlock_t *zl; while ((zl = *zlpp) != NULL) { if (zl->zl_znode != NULL) zfs_zrele_async(zl->zl_znode); rw_exit(zl->zl_rwlock); *zlpp = zl->zl_next; kmem_free(zl, sizeof (*zl)); } } /* * Search back through the directory tree, using the ".." entries. * Lock each directory in the chain to prevent concurrent renames. * Fail any attempt to move a directory into one of its own descendants. * XXX - z_parent_lock can overlap with map or grow locks */ static int zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = ZTOZSB(zp)->z_root; uint64_t oidp = zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; /* * First pass write-locks szp and compares to zp->z_id. * Later passes read-lock zp and compare to zp->z_parent. */ do { if (!rw_tryenter(rwlp, rw)) { /* * Another thread is renaming in this path. * Note that if we are a WRITER, we don't have any * parent_locks held yet. */ if (rw == RW_READER && zp->z_id > szp->z_id) { /* * Drop our locks and restart */ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; oidp = zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; } else { /* * Wait for other thread to drop its locks */ rw_enter(rwlp, rw); } } zl = kmem_alloc(sizeof (*zl), KM_SLEEP); zl->zl_rwlock = rwlp; zl->zl_znode = NULL; zl->zl_next = *zlpp; *zlpp = zl; if (oidp == szp->z_id) /* We're a descendant of szp */ return (SET_ERROR(EINVAL)); if (oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ int error = zfs_zget(ZTOZSB(zp), oidp, &zp); if (error) return (error); zl->zl_znode = zp; } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), &oidp, sizeof (oidp)); rwlp = &zp->z_parent_lock; rw = RW_READER; } while (zp->z_id != sdzp->z_id); return (0); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdzp - Source directory containing the "old entry". * snm - Old entry name. * tdzp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * flags - case flags * rflags - RENAME_* flags * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). * mnt_ns - user namespace of the mount * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdzp,tdzp - ctime|mtime updated */ int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) { znode_t *szp, *tzp; zfsvfs_t *zfsvfs = ZTOZSB(sdzp); zilog_t *zilog; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; int cmp, serr, terr; int error = 0; int zflg = 0; boolean_t waited = B_FALSE; /* Needed for whiteout inode creation. */ boolean_t fuid_dirtied; zfs_acl_ids_t acl_ids; boolean_t have_acl = B_FALSE; znode_t *wzp = NULL; if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return (SET_ERROR(EINVAL)); /* Already checked by Linux VFS, but just to make sure. */ if (rflags & RENAME_EXCHANGE && (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) return (SET_ERROR(EINVAL)); /* * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the * right kind of vattr_t for the whiteout file. These are set * internally by ZFS so should never be incorrect. */ VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if ((error = zfs_verify_zp(tdzp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || zfsctl_is_node(ZTOI(tdzp))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; top: szp = NULL; tzp = NULL; zl = NULL; /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Lock source and target directory entries. To prevent deadlock, * a lock ordering must be defined. We lock the directory with * the smallest object id first, or if it's a tie, the one with * the lexically first name. */ if (sdzp->z_id < tdzp->z_id) { cmp = -1; } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { /* * First compare the two name arguments without * considering any case folding. */ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); ASSERT(error == 0 || !zfsvfs->z_utf8); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument * both refer to links to the same existing file, * the rename() function shall return successfully * and perform no other action." */ zfs_exit(zfsvfs, FTAG); return (0); } /* * If the file system is case-folding, then we may * have some more checking to do. A case-folding file * system is either supporting mixed case sensitivity * access or is completely case-insensitive. Note * that the file system is always case preserving. * * In mixed sensitivity mode case sensitive behavior * is the default. FIGNORECASE must be used to * explicitly request case insensitive behavior. * * If the source and target names provided differ only * by case (e.g., a request to rename 'tim' to 'Tim'), * we will treat this as a special case in the * case-insensitive mode: as long as the source name * is an exact match, we will allow this to proceed as * a name-change request. */ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || (zfsvfs->z_case == ZFS_CASE_MIXED && flags & FIGNORECASE)) && u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, &error) == 0) { /* * case preserving rename request, require exact * name matches */ zflg |= ZCIEXACT; zflg &= ~ZCILOOK; } } /* * If the source and destination directories are the same, we should * grab the z_name_lock of that directory only once. */ if (sdzp == tdzp) { zflg |= ZHAVELOCK; rw_enter(&sdzp->z_name_lock, RW_READER); } if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | zflg, NULL, NULL); terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); } else { terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, zflg, NULL, NULL); serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, NULL, NULL); } if (serr) { /* * Source entry invalid or not there. */ if (!terr) { zfs_dirent_unlock(tdl); if (tzp) zrele(tzp); } if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(snm, "..") == 0) serr = EINVAL; zfs_exit(zfsvfs, FTAG); return (serr); } if (terr) { zfs_dirent_unlock(sdl); zrele(szp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(tnm, "..") == 0) terr = EINVAL; zfs_exit(zfsvfs, FTAG); return (terr); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow renames into our tree when the project * IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { error = SET_ERROR(EXDEV); goto out; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) goto out; if (S_ISDIR(ZTOI(szp)->i_mode)) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) goto out; } /* * Does target exist? */ if (tzp) { if (rflags & RENAME_NOREPLACE) { error = SET_ERROR(EEXIST); goto out; } /* * Source and target must be the same type (unless exchanging). */ if (!(rflags & RENAME_EXCHANGE)) { boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; if (s_is_dir != t_is_dir) { error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); goto out; } } /* * POSIX dictates that when the source and target * entries refer to the same file object, rename * must do nothing and exit without error. */ if (szp->z_id == tzp->z_id) { error = 0; goto out; } } else if (rflags & RENAME_EXCHANGE) { /* Target must exist for RENAME_EXCHANGE. */ error = SET_ERROR(ENOENT); goto out; } /* Set up inode creation for RENAME_WHITEOUT. */ if (rflags & RENAME_WHITEOUT) { /* * Whiteout files are not regular files or directories, so to * match zfs_create() we do not inherit the project id. */ uint64_t wo_projid = ZFS_DEFAULT_PROJID; error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); if (error) goto out; if (!have_acl) { error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, &acl_ids, mnt_ns); if (error) goto out; have_acl = B_TRUE; } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { error = SET_ERROR(EDQUOT); goto out; } } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } if (rflags & RENAME_WHITEOUT) { dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); goto top; } dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Unlink the source. */ szp->z_pflags |= ZFS_AV_MODIFIED; if (tdzp->z_pflags & ZFS_PROJINHERIT) szp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); VERIFY0(error); error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); if (error) goto commit; /* * Unlink the target. */ if (tzp) { int tzflg = zflg; if (rflags & RENAME_EXCHANGE) { /* This inode will be re-linked soon. */ tzflg |= ZRENAMING; tzp->z_pflags |= ZFS_AV_MODIFIED; if (sdzp->z_pflags & ZFS_PROJINHERIT) tzp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&tzp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); } error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); if (error) goto commit_link_szp; } /* * Create the new target links: * * We always link the target. * * RENAME_EXCHANGE: Link the old target to the source. * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. */ error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error) { /* * If we have removed the existing target, a subsequent call to * zfs_link_create() to add back the same entry, but with a new * dnode (szp), should not fail. */ ASSERT3P(tzp, ==, NULL); goto commit_link_tzp; } switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { case RENAME_EXCHANGE: error = zfs_link_create(sdl, tzp, tx, ZRENAMING); /* * The same argument as zfs_link_create() failing for * szp applies here, since the source directory must * have had an entry we are replacing. */ ASSERT0(error); if (error) goto commit_unlink_td_szp; break; case RENAME_WHITEOUT: zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); error = zfs_link_create(sdl, wzp, tx, ZNEW); if (error) { zfs_znode_delete(wzp, tx); remove_inode_hash(ZTOI(wzp)); goto commit_unlink_td_szp; } break; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { case RENAME_EXCHANGE: zfs_log_rename_exchange(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); break; case RENAME_WHITEOUT: zfs_log_rename_whiteout(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp, wzp); break; default: ASSERT0(rflags & ~RENAME_NOREPLACE); zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); break; } commit: dmu_tx_commit(tx); out: if (have_acl) zfs_acl_ids_free(&acl_ids); zfs_znode_update_vfs(sdzp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (sdzp != tdzp) zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); zrele(szp); if (wzp) { zfs_znode_update_vfs(wzp); zrele(wzp); } if (tzp) { zfs_znode_update_vfs(tzp); zrele(tzp); } if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); /* * Clean-up path for broken link state. * * At this point we are in a (very) bad state, so we need to do our * best to correct the state. In particular, all of the nlinks are * wrong because we were destroying and creating links with ZRENAMING. * * In some form, all of these operations have to resolve the state: * * * link_destroy() *must* succeed. Fortunately, this is very likely * since we only just created it. * * * link_create()s are allowed to fail (though they shouldn't because * we only just unlinked them and are putting the entries back * during clean-up). But if they fail, we can just forcefully drop * the nlink value to (at the very least) avoid broken nlink values * -- though in the case of non-empty directories we will have to * panic (otherwise we'd have a leaked directory with a broken ..). */ commit_unlink_td_szp: VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); commit_link_tzp: if (tzp) { if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); } commit_link_szp: if (zfs_link_create(sdl, szp, tx, ZRENAMING)) VERIFY0(zfs_drop_nlink(szp, tx, NULL)); goto commit; } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dzp - Directory to contain new symbolic link. * name - Name of directory entry in dip. * vap - Attributes of new entry. * link - Name for new symlink entry. * cr - credentials of caller. * flags - case flags * mnt_ns - user namespace of the mount * * OUT: zpp - Znode for new symbolic link. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dip - ctime|mtime updated */ int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) { znode_t *zp; zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t len = strlen(link); int error; int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; boolean_t waited = B_FALSE; ASSERT(S_ISLNK(vap->va_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; if (len > MAXPATHLEN) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } top: *zpp = NULL; /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datasets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (error == 0) { *zpp = zp; if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); } else { zrele(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by ip. * * IN: ip - inode of symbolic link * uio - structure to contain the link path. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated */ int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); mutex_exit(&zp->z_lock); zfs_exit(zfsvfs, FTAG); return (error); } /* * Insert a new entry into directory tdzp referencing szp. * * IN: tdzp - Directory to contain new entry. * szp - znode of new entry. * name - name of new entry. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * tdzp - ctime|mtime updated * szp - ctime updated */ int zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, int flags) { struct inode *sip = ZTOI(szp); znode_t *tzp; zfsvfs_t *zfsvfs = ZTOZSB(tdzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zf = ZNEW; uint64_t parent; uid_t owner; boolean_t waited = B_FALSE; boolean_t is_tmpfile = 0; uint64_t txg; is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (S_ISDIR(sip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_verify_zp(szp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow hard link creation in our tree when the * project IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } if (parent == zfsvfs->z_shares_dir) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); if (error) { zfs_exit(zfsvfs, FTAG); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); if (is_tmpfile) dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, tdzp); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* unmark z_unlinked so zfs_link_create will not reject */ if (is_tmpfile) szp->z_unlinked = B_FALSE; error = zfs_link_create(dl, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; /* * tmpfile is created to be in z_unlinkedobj, so remove it. * Also, we don't log in ZIL, because all previous file * operation on the tmpfile are ignored by ZIL. Instead we * always wait for txg to sync to make sure all previous * operation are sync safe. */ if (is_tmpfile) { VERIFY(zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_link(zilog, tx, txtype, tdzp, szp, name); } } else if (is_tmpfile) { /* restore z_unlinked since when linking failed */ szp->z_unlinked = B_TRUE; } txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); zfs_exit(zfsvfs, FTAG); return (error); } static void zfs_putpage_sync_commit_cb(void *arg) { struct page *pp = arg; ClearPageError(pp); end_page_writeback(pp); } static void zfs_putpage_async_commit_cb(void *arg) { struct page *pp = arg; znode_t *zp = ITOZ(pp->mapping->host); ClearPageError(pp); end_page_writeback(pp); atomic_dec_32(&zp->z_async_writes_cnt); } /* * Push a page out to disk, once the page is on stable storage the * registered commit callback will be run as notification of completion. * * IN: ip - page mapped for inode. * pp - page to push (page is locked) * wbc - writeback control data * for_sync - does the caller intend to wait synchronously for the * page writeback to complete? * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated */ int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, boolean_t for_sync) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t offset; loff_t pgoff; unsigned int pglen; dmu_tx_t *tx; caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; inode_timespec_t tmp_ts; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); ASSERT(PageLocked(pp)); pgoff = page_offset(pp); /* Page byte-offset in file */ offset = i_size_read(ip); /* File length in bytes */ pglen = MIN(PAGE_SIZE, /* Page length in bytes */ P2ROUNDUP(offset, PAGE_SIZE)-pgoff); /* Page is beyond end of file */ if (pgoff >= offset) { unlock_page(pp); zfs_exit(zfsvfs, FTAG); return (0); } /* Truncate page length to end of file */ if (pgoff + pglen > offset) pglen = offset - pgoff; #if 0 /* * FIXME: Allow mmap writes past its quota. The correct fix * is to register a page_mkwrite() handler to count the page * against its quota when it is about to be dirtied. */ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, KUID_TO_SUID(ip->i_uid)) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, KGID_TO_SGID(ip->i_gid)) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { err = EDQUOT; } #endif /* * The ordering here is critical and must adhere to the following * rules in order to avoid deadlocking in either zfs_read() or * zfs_free_range() due to a lock inversion. * * 1) The page must be unlocked prior to acquiring the range lock. * This is critical because zfs_read() calls find_lock_page() * which may block on the page lock while holding the range lock. * * 2) Before setting or clearing write back on a page the range lock * must be held in order to prevent a lock inversion with the * zfs_free_range() function. * * This presents a problem because upon entering this function the * page lock is already held. To safely acquire the range lock the * page lock must be dropped. This creates a window where another * process could truncate, invalidate, dirty, or write out the page. * * Therefore, after successfully reacquiring the range and page locks * the current page state is checked. In the common case everything * will be as is expected and it can be written out. However, if * the page state has changed it must be handled accordingly. */ mapping = pp->mapping; redirty_page_for_writepage(wbc, pp); unlock_page(pp); zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, pgoff, pglen, RL_WRITER); lock_page(pp); /* Page mapping changed or it was no longer dirty, we're done */ if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { unlock_page(pp); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (0); } /* Another process started write block if required */ if (PageWriteback(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { /* * Speed up any non-sync page writebacks since * they may take several seconds to complete. * Refer to the comment in zpl_fsync() for details. */ if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { zil_commit(zfsvfs->z_log, zp->z_id); } if (PageWriteback(pp)) #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT folio_wait_bit(page_folio(pp), PG_writeback); #else wait_on_page_bit(pp, PG_writeback); #endif } zfs_exit(zfsvfs, FTAG); return (0); } /* Clear the dirty flag the required locks are held */ if (!clear_page_dirty_for_io(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (0); } /* * Counterpart for redirty_page_for_writepage() above. This page * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; if (!for_sync) atomic_inc_32(&zp->z_async_writes_cnt); set_page_writeback(pp); unlock_page(pp); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO filemap_dirty_folio(page_mapping(pp), page_folio(pp)); #else __set_page_dirty_nobuffers(pp); #endif ClearPageError(pp); end_page_writeback(pp); if (!for_sync) atomic_dec_32(&zp->z_async_writes_cnt); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (err); } va = kmap(pp); ASSERT3U(pglen, <=, PAGE_SIZE); dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); kunmap(pp); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* Preserve the mtime and ctime provided by the inode */ tmp_ts = zpl_inode_get_mtime(ip); ZFS_TIME_ENCODE(&tmp_ts, mtime); tmp_ts = zpl_inode_get_ctime(ip); ZFS_TIME_ENCODE(&tmp_ts, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); boolean_t commit = B_FALSE; if (wbc->sync_mode != WB_SYNC_NONE) { /* * Note that this is rarely called under writepages(), because * writepages() normally handles the entire commit for * performance reasons. */ commit = B_TRUE; } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { /* * If the caller does not intend to wait synchronously * for this page writeback to complete and there are active * synchronous calls on this file, do a commit so that * the latter don't accidentally end up waiting for * our writeback to complete. Refer to the comment in * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. */ commit = B_TRUE; } zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit, B_FALSE, for_sync ? zfs_putpage_sync_commit_cb : zfs_putpage_async_commit_cb, pp); dmu_tx_commit(tx); zfs_rangelock_exit(lr); if (commit) zil_commit(zfsvfs->z_log, zp->z_id); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); zfs_exit(zfsvfs, FTAG); return (err); } /* * Update the system attributes when the inode has been dirtied. For the * moment we only update the mode, atime, mtime, and ctime. */ int zfs_dirty_inode(struct inode *ip, int flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; inode_timespec_t tmp_ts; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return (0); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); #ifdef I_DIRTY_TIME /* * This is the lazytime semantic introduced in Linux 4.0 * This flag will only be called from update_time when lazytime is set. * (Note, I_DIRTY_SYNC will also set if not lazytime) * Fortunately mtime and ctime are managed within ZFS itself, so we * only need to dirty atime. */ if (flags == I_DIRTY_TIME) { zp->z_atime_dirty = B_TRUE; goto out; } #endif tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } mutex_enter(&zp->z_lock); zp->z_atime_dirty = B_FALSE; SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); /* Preserve the mode, mtime and ctime provided by the inode */ tmp_ts = zpl_inode_get_atime(ip); ZFS_TIME_ENCODE(&tmp_ts, atime); tmp_ts = zpl_inode_get_mtime(ip); ZFS_TIME_ENCODE(&tmp_ts, mtime); tmp_ts = zpl_inode_get_ctime(ip); ZFS_TIME_ENCODE(&tmp_ts, ctime); mode = ip->i_mode; zp->z_mode = mode; error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); mutex_exit(&zp->z_lock); dmu_tx_commit(tx); out: zfs_exit(zfsvfs, FTAG); return (error); } void zfs_inactive(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t atime[2]; int error; int need_unlock = 0; /* Only read lock if we haven't already write locked, e.g. rollback */ if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { need_unlock = 1; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); } if (zp->z_sa_hdl == NULL) { if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); return; } if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { inode_timespec_t tmp_atime; tmp_atime = zpl_inode_get_atime(ip); ZFS_TIME_ENCODE(&tmp_atime, atime); mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&atime, sizeof (atime), tx); zp->z_atime_dirty = B_FALSE; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } zfs_zinactive(zp); if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * Fill pages with data from the disk. */ static int zfs_fillpage(struct inode *ip, struct page *pp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t i_size = i_size_read(ip); u_offset_t io_off = page_offset(pp); size_t io_len = PAGE_SIZE; ASSERT3U(io_off, <, i_size); if (io_off + io_len > i_size) io_len = i_size - io_off; void *va = kmap(pp); int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off, io_len, va, DMU_READ_PREFETCH); if (io_len != PAGE_SIZE) memset((char *)va + io_len, 0, PAGE_SIZE - io_len); kunmap(pp); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); SetPageError(pp); ClearPageUptodate(pp); } else { ClearPageError(pp); SetPageUptodate(pp); } return (error); } /* * Uses zfs_fillpage to read data from the file and fill the page. * * IN: ip - inode of file to get data from. * pp - page to read * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ int zfs_getpage(struct inode *ip, struct page *pp) { zfsvfs_t *zfsvfs = ITOZSB(ip); znode_t *zp = ITOZ(ip); int error; loff_t i_size = i_size_read(ip); u_offset_t io_off = page_offset(pp); size_t io_len = PAGE_SIZE; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); ASSERT3U(io_off, <, i_size); if (io_off + io_len > i_size) io_len = i_size - io_off; /* * It is important to hold the rangelock here because it is possible * a Direct I/O write or block clone might be taking place at the same * time that a page is being faulted in through filemap_fault(). With * Direct I/O writes and block cloning db->db_data will be set to NULL * with dbuf_clear_data() in dmu_buif_will_clone_or_dio(). If the * rangelock is not held, then there is a race between faulting in a * page and writing out a Direct I/O write or block cloning. Without * the rangelock a NULL pointer dereference can occur in * dmu_read_impl() for db->db_data during the mempcy operation when * zfs_fillpage() calls dmu_read(). */ zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock, io_off, io_len, RL_READER); if (lr == NULL) { /* * It is important to drop the page lock before grabbing the * rangelock to avoid another deadlock between here and * zfs_write() -> update_pages(). update_pages() holds both the * rangelock and the page lock. */ get_page(pp); unlock_page(pp); lr = zfs_rangelock_enter(&zp->z_rangelock, io_off, io_len, RL_READER); lock_page(pp); put_page(pp); } error = zfs_fillpage(ip, pp); zfs_rangelock_exit(lr); if (error == 0) dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); zfs_exit(zfsvfs, FTAG); return (error); } /* * Check ZFS specific permissions to memory map a section of a file. * * IN: ip - inode of the file to mmap * off - file offset * addrp - start address in memory region * len - length of memory region * vm_flags- address flags * * RETURN: 0 if success * error code if failure */ int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, unsigned long vm_flags) { (void) addrp; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) && (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((vm_flags & (VM_READ | VM_EXEC)) && (zp->z_pflags & ZFS_AV_QUARANTINED)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EACCES)); } if (off < 0 || len > MAXOFFSET_T - off) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENXIO)); } zfs_exit(zfsvfs, FTAG); return (0); } /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. However, this command is somewhat * misnamed, as its functionality includes the ability to allocate as * well as free space. * * IN: zp - znode of file to free data in. * cmd - action to take (only F_FREESP supported). * bfp - section of file to free/alloc. * flag - current file open mode flags. * offset - current file offset. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * zp - ctime|mtime updated */ int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr) { (void) offset; zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t off, len; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if (cmd != F_FREESP) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Permissions aren't checked on Solaris because on this OS * zfs_space() can only be called with an opened file handle. * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. */ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ error = zfs_freesp(zp, off, len, flag, TRUE); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_fid(struct inode *ip, fid_t *fidp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOSPC)); } if ((error = zfs_verify_zp(zp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } gen = (uint32_t)gen64; size = SHORT_FID_LEN; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); zfs_exit(zfsvfs, FTAG); return (0); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_open); EXPORT_SYMBOL(zfs_close); EXPORT_SYMBOL(zfs_lookup); EXPORT_SYMBOL(zfs_create); EXPORT_SYMBOL(zfs_tmpfile); EXPORT_SYMBOL(zfs_remove); EXPORT_SYMBOL(zfs_mkdir); EXPORT_SYMBOL(zfs_rmdir); EXPORT_SYMBOL(zfs_readdir); EXPORT_SYMBOL(zfs_getattr_fast); EXPORT_SYMBOL(zfs_setattr); EXPORT_SYMBOL(zfs_rename); EXPORT_SYMBOL(zfs_symlink); EXPORT_SYMBOL(zfs_readlink); EXPORT_SYMBOL(zfs_link); EXPORT_SYMBOL(zfs_inactive); EXPORT_SYMBOL(zfs_space); EXPORT_SYMBOL(zfs_fid); EXPORT_SYMBOL(zfs_getpage); EXPORT_SYMBOL(zfs_putpage); EXPORT_SYMBOL(zfs_dirty_inode); EXPORT_SYMBOL(zfs_map); -/* CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); #endif diff --git a/module/os/linux/zfs/zfs_znode_os.c b/module/os/linux/zfs/zfs_znode_os.c index bbaca2f58394..aff7b1f4dac1 100644 --- a/module/os/linux/zfs/zfs_znode_os.c +++ b/module/os/linux/zfs/zfs_znode_os.c @@ -1,1975 +1,1974 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* Portions Copyright 2007 Jeremy Teo */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_comutil.h" static kmem_cache_t *znode_cache = NULL; static kmem_cache_t *znode_hold_cache = NULL; unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; /* * This is used by the test suite so that it can delay znodes from being * freed in order to inspect the unlinked set. */ static int zfs_unlink_suspend_progress = 0; /* * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on * z_rangelock. It will modify the offset and length of the lock to reflect * znode-specific information, and convert RL_APPEND to RL_WRITER. This is * called with the rangelock_t's rl_lock held, which avoids races. */ static void zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) { znode_t *zp = arg; /* * If in append mode, convert to writer and lock starting at the * current end of file. */ if (new->lr_type == RL_APPEND) { new->lr_offset = zp->z_size; new->lr_type = RL_WRITER; } /* * If we need to grow the block size then lock the whole file range. */ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { new->lr_offset = 0; new->lr_length = UINT64_MAX; } } static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { (void) arg, (void) kmflags; znode_t *zp = buf; inode_init_once(ZTOI(zp)); list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; return (0); } static void zfs_znode_cache_destructor(void *buf, void *arg) { (void) arg; znode_t *zp = buf; ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); rw_destroy(&zp->z_xattr_lock); zfs_rangelock_fini(&zp->z_rangelock); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } static int zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags) { (void) arg, (void) kmflags; znode_hold_t *zh = buf; mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL); zh->zh_refcount = 0; return (0); } static void zfs_znode_hold_cache_destructor(void *buf, void *arg) { (void) arg; znode_hold_t *zh = buf; mutex_destroy(&zh->zh_lock); } void zfs_znode_init(void) { /* * Initialize zcache. The KMC_SLAB hint is used in order that it be * backed by kmalloc() when on the Linux slab in order that any * wait_on_bit() operations on the related inode operate properly. */ ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB | KMC_RECLAIMABLE); ASSERT(znode_hold_cache == NULL); znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); } void zfs_znode_fini(void) { /* * Cleanup zcache */ if (znode_cache) kmem_cache_destroy(znode_cache); znode_cache = NULL; if (znode_hold_cache) kmem_cache_destroy(znode_hold_cache); znode_hold_cache = NULL; } /* * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to * serialize access to a znode and its SA buffer while the object is being * created or destroyed. This kind of locking would normally reside in the * znode itself but in this case that's impossible because the znode and SA * buffer may not yet exist. Therefore the locking is handled externally * with an array of mutexes and AVLs trees which contain per-object locks. * * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted * in to the correct AVL tree and finally the per-object lock is held. In * zfs_znode_hold_exit() the process is reversed. The per-object lock is * released, removed from the AVL tree and destroyed if there are no waiters. * * This scheme has two important properties: * * 1) No memory allocations are performed while holding one of the z_hold_locks. * This ensures evict(), which can be called from direct memory reclaim, will * never block waiting on a z_hold_locks which just happens to have hashed * to the same index. * * 2) All locks used to serialize access to an object are per-object and never * shared. This minimizes lock contention without creating a large number * of dedicated locks. * * On the downside it does require znode_lock_t structures to be frequently * allocated and freed. However, because these are backed by a kmem cache * and very short lived this cost is minimal. */ int zfs_znode_hold_compare(const void *a, const void *b) { const znode_hold_t *zh_a = (const znode_hold_t *)a; const znode_hold_t *zh_b = (const znode_hold_t *)b; return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj)); } static boolean_t __maybe_unused zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, search; int i = ZFS_OBJ_HASH(zfsvfs, obj); boolean_t held; search.zh_obj = obj; mutex_enter(&zfsvfs->z_hold_locks[i]); zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE; mutex_exit(&zfsvfs->z_hold_locks[i]); return (held); } znode_hold_t * zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, *zh_new, search; int i = ZFS_OBJ_HASH(zfsvfs, obj); boolean_t found = B_FALSE; zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP); search.zh_obj = obj; mutex_enter(&zfsvfs->z_hold_locks[i]); zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); if (likely(zh == NULL)) { zh = zh_new; zh->zh_obj = obj; avl_add(&zfsvfs->z_hold_trees[i], zh); } else { ASSERT3U(zh->zh_obj, ==, obj); found = B_TRUE; } zh->zh_refcount++; ASSERT3S(zh->zh_refcount, >, 0); mutex_exit(&zfsvfs->z_hold_locks[i]); if (found == B_TRUE) kmem_cache_free(znode_hold_cache, zh_new); ASSERT(MUTEX_NOT_HELD(&zh->zh_lock)); mutex_enter(&zh->zh_lock); return (zh); } void zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) { int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj); boolean_t remove = B_FALSE; ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj)); mutex_exit(&zh->zh_lock); mutex_enter(&zfsvfs->z_hold_locks[i]); ASSERT3S(zh->zh_refcount, >, 0); if (--zh->zh_refcount == 0) { avl_remove(&zfsvfs->z_hold_trees[i], zh); remove = B_TRUE; } mutex_exit(&zfsvfs->z_hold_locks[i]); if (remove == B_TRUE) kmem_cache_free(znode_hold_cache, zh); } dev_t zfs_cmpldev(uint64_t dev) { return (dev); } static void zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) { ASSERT(zfs_znode_held(zfsvfs, zp->z_id)); mutex_enter(&zp->z_lock); ASSERT(zp->z_sa_hdl == NULL); ASSERT(zp->z_acl_cached == NULL); if (sa_hdl == NULL) { VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; sa_set_userp(sa_hdl, zp); } zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; mutex_exit(&zp->z_lock); } void zfs_znode_dmu_fini(znode_t *zp) { ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; } /* * Called by new_inode() to allocate a new inode. */ int zfs_inode_alloc(struct super_block *sb, struct inode **ip) { znode_t *zp; zp = kmem_cache_alloc(znode_cache, KM_SLEEP); *ip = ZTOI(zp); return (0); } /* * Called in multiple places when an inode should be destroyed. */ void zfs_inode_destroy(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); mutex_enter(&zfsvfs->z_znodes_lock); if (list_link_active(&zp->z_link_node)) { list_remove(&zfsvfs->z_all_znodes, zp); } mutex_exit(&zfsvfs->z_znodes_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } kmem_cache_free(znode_cache, zp); } static void zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) { uint64_t rdev = 0; switch (ip->i_mode & S_IFMT) { case S_IFREG: ip->i_op = &zpl_inode_operations; ip->i_fop = &zpl_file_operations; ip->i_mapping->a_ops = &zpl_address_space_operations; break; case S_IFDIR: ip->i_op = &zpl_dir_inode_operations; ip->i_fop = &zpl_dir_file_operations; ITOZ(ip)->z_zn_prefetch = B_TRUE; break; case S_IFLNK: ip->i_op = &zpl_symlink_inode_operations; break; /* * rdev is only stored in a SA only for device files. */ case S_IFCHR: case S_IFBLK: (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, sizeof (rdev)); zfs_fallthrough; case S_IFIFO: case S_IFSOCK: init_special_inode(ip, ip->i_mode, rdev); ip->i_op = &zpl_special_inode_operations; break; default: zfs_panic_recover("inode %llu has invalid mode: 0x%x\n", (u_longlong_t)ip->i_ino, ip->i_mode); /* Assume the inode is a file and attempt to continue */ ip->i_mode = S_IFREG | 0644; ip->i_op = &zpl_inode_operations; ip->i_fop = &zpl_file_operations; ip->i_mapping->a_ops = &zpl_address_space_operations; break; } } static void zfs_set_inode_flags(znode_t *zp, struct inode *ip) { /* * Linux and Solaris have different sets of file attributes, so we * restrict this conversion to the intersection of the two. */ unsigned int flags = 0; if (zp->z_pflags & ZFS_IMMUTABLE) flags |= S_IMMUTABLE; if (zp->z_pflags & ZFS_APPENDONLY) flags |= S_APPEND; inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND); } /* * Update the embedded inode given the znode. */ void zfs_znode_update_vfs(znode_t *zp) { struct inode *ip; uint32_t blksize; u_longlong_t i_blocks; ASSERT(zp != NULL); ip = ZTOI(zp); /* Skip .zfs control nodes which do not exist on disk. */ if (zfsctl_is_node(ip)) return; dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); spin_lock(&ip->i_lock); ip->i_mode = zp->z_mode; ip->i_blocks = i_blocks; i_size_write(ip, zp->z_size); spin_unlock(&ip->i_lock); } /* * Construct a znode+inode and initialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; struct inode *ip; uint64_t mode; uint64_t parent; uint64_t tmp_gen; uint64_t links; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; inode_timespec_t tmp_ts; uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[12]; int count = 0; ASSERT(zfsvfs != NULL); ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); zp = ITOZ(ip); ASSERT(zp->z_dirlocks == NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; zp->z_is_ctldir = B_FALSE; zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; zp->z_mapcnt = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 || (dmu_objset_projectquota_enabled(zfsvfs->z_os) && (zp->z_pflags & ZFS_PROJID) && sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { if (hdl == NULL) sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; goto error; } zp->z_projid = projid; zp->z_mode = ip->i_mode = mode; ip->i_generation = (uint32_t)tmp_gen; ip->i_blkbits = SPA_MINBLOCKSHIFT; set_nlink(ip, (uint32_t)links); zfs_uid_write(ip, z_uid); zfs_gid_write(ip, z_gid); zfs_set_inode_flags(zp, ip); /* Cache the xattr parent id */ if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; ZFS_TIME_DECODE(&tmp_ts, atime); zpl_inode_set_atime_to_ts(ip, tmp_ts); ZFS_TIME_DECODE(&tmp_ts, mtime); zpl_inode_set_mtime_to_ts(ip, tmp_ts); ZFS_TIME_DECODE(&tmp_ts, ctime); zpl_inode_set_ctime_to_ts(ip, tmp_ts); ZFS_TIME_DECODE(&zp->z_btime, btime); ip->i_ino = zp->z_id; zfs_znode_update_vfs(zp); zfs_inode_set_ops(zfsvfs, ip); /* * The only way insert_inode_locked() can fail is if the ip->i_ino * number is already hashed for this super block. This can never * happen because the inode numbers map 1:1 with the object numbers. * * Exceptions include rolling back a mounted file system, either * from the zfs rollback or zfs recv command. * * Active inodes are unhashed during the rollback, but since zrele * can happen asynchronously, we can't guarantee they've been * unhashed. This can cause hash collisions in unlinked drain * processing so do not hash unlinked znodes. */ if (links > 0) VERIFY3S(insert_inode_locked(ip), ==, 0); mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); if (links > 0) unlock_new_inode(ip); return (zp); error: iput(ip); return (NULL); } /* * Safely mark an inode dirty. Inodes which are part of a read-only * file system or snapshot may not be dirtied. */ void zfs_mark_inode_dirty(struct inode *ip) { zfsvfs_t *zfsvfs = ITOZSB(ip); if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return; mark_inode_dirty(ip); } static uint64_t empty_xattr; static uint64_t pad[4]; static zfs_acl_phys_t acl_phys; /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_TMPFILE - new object is of O_TMPFILE * IS_XATTR - new object is an attribute * acl_ids - ACL related attributes * * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE) * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; uint64_t projid = ZFS_DEFAULT_PROJID; uint64_t rdev = 0; zfsvfs_t *zfsvfs = ZTOZSB(dzp); dmu_buf_t *db; inode_timespec_t now; uint64_t gen, obj; int bonuslen; int dnodesize; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; sa_bulk_attr_t *sa_attrs; int cnt = 0; zfs_acl_locator_cb_t locate = { 0 }; znode_hold_t *zh; if (zfsvfs->z_replay) { obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ dnodesize = vap->va_fsid; /* ditto */ } else { obj = 0; gethrestime(&now); gen = dmu_tx_get_txg(tx); dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); } if (dnodesize == 0) dnodesize = DNODE_MIN_SIZE; obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; bonuslen = (obj_type == DMU_OT_SA) ? DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (S_ISDIR(vap->va_mode)) { if (zfsvfs->z_replay) { VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx)); } else { obj = zap_create_norm_dnsize(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx); } } else { if (zfsvfs->z_replay) { VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx)); } else { obj = dmu_object_alloc_dnsize(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx); } } zh = zfs_znode_hold_enter(zfsvfs, obj); VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_id = obj; } /* * If parent is an xattr, so am I. */ if (dzp->z_pflags & ZFS_XATTR) { flag |= IS_XATTR; } if (zfsvfs->z_use_fuids) pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; else pflags = 0; if (S_ISDIR(vap->va_mode)) { size = 2; /* contents ("." and "..") */ links = 2; } else { size = 0; links = (flag & IS_TMPFILE) ? 0 : 1; } if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) rdev = vap->va_rdev; parent = dzp->z_id; mode = acl_ids->z_mode; if (flag & IS_XATTR) pflags |= ZFS_XATTR; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { /* * With ZFS_PROJID flag, we can easily know whether there is * project ID stored on disk or not. See zfs_space_delta_cb(). */ if (obj_type != DMU_OT_ZNODE && dmu_objset_projectquota_enabled(zfsvfs->z_os)) pflags |= ZFS_PROJID; /* * Inherit project ID from parent if required. */ projid = zfs_inherit_projid(dzp); if (dzp->z_pflags & ZFS_PROJINHERIT) pflags |= ZFS_PROJINHERIT; } /* * No execs denied will be determined when zfs_mode_compute() is called. */ pflags |= acl_ids->z_aclp->z_hints & (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); ZFS_TIME_ENCODE(&now, crtime); ZFS_TIME_ENCODE(&now, ctime); if (vap->va_mask & ATTR_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, atime); } else { ZFS_TIME_ENCODE(&now, atime); } if (vap->va_mask & ATTR_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); } else { ZFS_TIME_ENCODE(&now, mtime); } /* Now add in all of the "SA" attributes */ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* * Setup the array of attributes to be replaced/set on the new file * * order for DMU_OT_ZNODE is critical since it needs to be constructed * in the old znode_phys_t format. Don't change this ordering */ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); } else { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); } SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, &empty_xattr, 8); } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && pflags & ZFS_PROJID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8); } if (obj_type == DMU_OT_ZNODE || (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, sizeof (uint64_t) * 4); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (zfs_acl_phys_t)); } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &acl_ids->z_aclp->z_acl_count, 8); locate.cb_aclp = acl_ids->z_aclp; SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, acl_ids->z_aclp->z_acl_bytes); mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, acl_ids->z_fuid, acl_ids->z_fgid); } VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); if (!(flag & IS_ROOT_NODE)) { /* * The call to zfs_znode_alloc() may fail if memory is low * via the call path: alloc_inode() -> inode_init_always() -> * security_inode_alloc() -> inode_alloc_security(). Since * the existing code is written such that zfs_mknode() can * not fail retry until sufficient memory has been reclaimed. */ do { *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); } while (*zpp == NULL); VERIFY(*zpp != NULL); VERIFY(dzp != NULL); } else { /* * If we are creating the root node, the "parent" we * passed in is the znode for the root. */ *zpp = dzp; (*zpp)->z_sa_hdl = sa_hdl; } (*zpp)->z_pflags = pflags; (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode; (*zpp)->z_dnodesize = dnodesize; (*zpp)->z_projid = projid; if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); zfs_znode_hold_exit(zfsvfs, zh); } /* * Update in-core attributes. It is assumed the caller will be doing an * sa_bulk_update to push the changes out. */ void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; boolean_t update_inode = B_FALSE; xoap = xva_getxoptattr(xvap); ASSERT(xoap); if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), ×, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); update_inode = B_TRUE; } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); update_inode = B_TRUE; } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, xoap->xoa_av_quarantined, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { zfs_sa_set_scanstamp(zp, xvap, tx); XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_PROJINHERIT); } if (update_inode) zfs_set_inode_flags(zp, ZTOI(zp)); } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; znode_hold_t *zh; int err; sa_handle_t *hdl; *zpp = NULL; again: zh = zfs_znode_hold_enter(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); /* * If zp->z_unlinked is set, the znode is already marked * for deletion and should not be discovered. Check this * after checking igrab() due to fsetxattr() & O_TMPFILE. * * If igrab() returns NULL the VFS has independently * determined the inode should be evicted and has * called iput_final() to start the eviction process. * The SA handle is still valid but because the VFS * requires that the eviction succeed we must drop * our locks and references to allow the eviction to * complete. The zfs_zget() may then be retried. * * This unlikely case could be optimized by registering * a sops->drop_inode() callback. The callback would * need to detect the active SA hold thereby informing * the VFS that this inode should not be evicted. */ if (igrab(ZTOI(zp)) == NULL) { if (zp->z_unlinked) err = SET_ERROR(ENOENT); else err = SET_ERROR(EAGAIN); } else { *zpp = zp; err = 0; } mutex_exit(&zp->z_lock); sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); if (err == EAGAIN) { /* inode might need this to finish evict */ cond_resched(); goto again; } return (err); } /* * Not found create new znode/vnode but only if file exists. * * There is a small window where zfs_vget() could * find this object while a file create is still in * progress. This is checked for in zfs_znode_alloc() * * if zfs_znode_alloc() fails it will drop the hold on the * bonus buffer. */ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { *zpp = zp; } zfs_znode_hold_exit(zfsvfs, zh); return (err); } int zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_object_info_t doi; dmu_buf_t *db; uint64_t obj_num = zp->z_id; uint64_t mode; uint64_t links; sa_bulk_attr_t bulk[11]; int err; int count = 0; uint64_t gen; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; inode_timespec_t tmp_ts; uint64_t projid = ZFS_DEFAULT_PROJID; znode_hold_t *zh; /* * skip ctldir, otherwise they will always get invalidated. This will * cause funny behaviour for the mounted snapdirs. Especially for * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent * anyone automount it again as long as someone is still using the * detached mount. */ if (zp->z_is_ctldir) return (0); zh = zfs_znode_hold_enter(zfsvfs, obj_num); mutex_enter(&zp->z_acl_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } mutex_exit(&zp->z_acl_lock); rw_enter(&zp->z_xattr_lock, RW_WRITER); if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } rw_exit(&zp->z_xattr_lock); ASSERT(zp->z_sa_hdl == NULL); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EINVAL)); } zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); /* reload cached values */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, sizeof (gen)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, sizeof (z_uid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, sizeof (z_gid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EIO)); } if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8); if (err != 0 && err != ENOENT) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(err)); } } zp->z_projid = projid; zp->z_mode = ZTOI(zp)->i_mode = mode; zfs_uid_write(ZTOI(zp), z_uid); zfs_gid_write(ZTOI(zp), z_gid); ZFS_TIME_DECODE(&tmp_ts, atime); zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts); ZFS_TIME_DECODE(&tmp_ts, mtime); zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts); ZFS_TIME_DECODE(&tmp_ts, ctime); zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts); ZFS_TIME_DECODE(&zp->z_btime, btime); if ((uint32_t)gen != ZTOI(zp)->i_generation) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EIO)); } set_nlink(ZTOI(zp), (uint32_t)links); zfs_set_inode_flags(zp, ZTOI(zp)); zp->z_blksz = doi.doi_data_block_size; zp->z_atime_dirty = B_FALSE; zfs_znode_update_vfs(zp); /* * If the file has zero links, then it has been unlinked on the send * side and it must be in the received unlinked set. * We call zfs_znode_dmu_fini() now to prevent any accesses to the * stale data and to prevent automatic removal of the file in * zfs_zinactive(). The file will be removed either when it is removed * on the send side and the next incremental stream is received or * when the unlinked set gets processed. */ zp->z_unlinked = (ZTOI(zp)->i_nlink == 0); if (zp->z_unlinked) zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; uint64_t acl_obj = zfs_external_acl(zp); znode_hold_t *zh; zh = zfs_znode_hold_enter(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); VERIFY(0 == dmu_object_free(os, acl_obj, tx)); } VERIFY(0 == dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } void zfs_zinactive(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t z_id = zp->z_id; znode_hold_t *zh; ASSERT(zp->z_sa_hdl); /* * Don't allow a zfs_zget() while were trying to release this znode. */ zh = zfs_znode_hold_enter(zfsvfs, z_id); mutex_enter(&zp->z_lock); /* * If this was the last reference to a file with no links, remove * the file from the file system unless the file system is mounted * read-only. That can happen, for example, if the file system was * originally read-write, the file was opened, then unlinked and * the file system was made read-only before the file was finally * closed. The file will remain in the unlinked set. */ if (zp->z_unlinked) { ASSERT(!zfsvfs->z_issnap); if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) { mutex_exit(&zp->z_lock); zfs_znode_hold_exit(zfsvfs, zh); zfs_rmnode(zp); return; } } mutex_exit(&zp->z_lock); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } /* * Determine whether the znode's atime must be updated. The logic mostly * duplicates the Linux kernel's relatime_need_update() functionality. * This function is only called if the underlying filesystem actually has * atime updates enabled. */ boolean_t zfs_relatime_need_update(const struct inode *ip) { inode_timespec_t now, tmp_atime, tmp_ts; gethrestime(&now); tmp_atime = zpl_inode_get_atime(ip); /* * In relatime mode, only update the atime if the previous atime * is earlier than either the ctime or mtime or if at least a day * has passed since the last update of atime. */ tmp_ts = zpl_inode_get_mtime(ip); if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0) return (B_TRUE); tmp_ts = zpl_inode_get_ctime(ip); if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0) return (B_TRUE); if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60) return (B_TRUE); return (B_FALSE); } /* * Prepare to update znode time stamps. * * IN: zp - znode requiring timestamp update * flag - ATTR_MTIME, ATTR_CTIME flags * * OUT: zp - z_seq * mtime - new mtime * ctime - new ctime * * Note: We don't update atime here, because we rely on Linux VFS to do * atime updating. */ void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { inode_timespec_t now, tmp_ts; gethrestime(&now); zp->z_seq++; if (flag & ATTR_MTIME) { ZFS_TIME_ENCODE(&now, mtime); ZFS_TIME_DECODE(&tmp_ts, mtime); zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts); if (ZTOZSB(zp)->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); } } if (flag & ATTR_CTIME) { ZFS_TIME_ENCODE(&now, ctime); ZFS_TIME_DECODE(&tmp_ts, ctime); zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts); if (ZTOZSB(zp)->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. */ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_size > zp->z_blksz) return; error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT0(error); /* What blocksize did we actually get? */ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); } /* * Increase the file length * * IN: zp - znode of file to free data in. * end - new end-of-file * * RETURN: 0 on success, error code on failure */ static int zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { zfs_rangelock_exit(lr); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); } else { newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); } else { newblksz = 0; } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } if (newblksz) zfs_grow_blocksize(zp, newblksz, tx); zp->z_size = end; VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), &zp->z_size, sizeof (zp->z_size), tx)); zfs_rangelock_exit(lr); dmu_tx_commit(tx); return (0); } /* * zfs_zero_partial_page - Modeled after update_pages() but * with different arguments and semantics for use by zfs_freesp(). * * Zeroes a piece of a single page cache entry for zp at offset * start and length len. * * Caller must acquire a range lock on the file for the region * being zeroed in order that the ARC and page cache stay in sync. */ static void zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) { struct address_space *mp = ZTOI(zp)->i_mapping; struct page *pp; int64_t off; void *pb; ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK)); off = start & (PAGE_SIZE - 1); start &= PAGE_MASK; pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); pb = kmap(pp); memset(pb + off, 0, len); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); SetPageUptodate(pp); ClearPageError(pp); unlock_page(pp); put_page(pp); } } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free. * * RETURN: 0 on success, error code on failure */ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zfs_locked_range_t *lr; int error; /* * Lock the range being freed. */ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } if (off + len > zp->z_size) len = zp->z_size - off; error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); /* * Zero partial page cache entries. This must be done under a * range lock in order to keep the ARC and page cache in sync. */ if (zn_has_cached_data(zp, off, off + len - 1)) { loff_t first_page, last_page, page_len; loff_t first_page_offset, last_page_offset; /* first possible full page in hole */ first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT; /* last page of hole */ last_page = (off + len) >> PAGE_SHIFT; /* offset of first_page */ first_page_offset = first_page << PAGE_SHIFT; /* offset of last_page */ last_page_offset = last_page << PAGE_SHIFT; /* truncate whole pages */ if (last_page_offset > first_page_offset) { truncate_inode_pages_range(ZTOI(zp)->i_mapping, first_page_offset, last_page_offset - 1); } /* truncate sub-page ranges */ if (first_page > last_page) { /* entire punched area within a single page */ zfs_zero_partial_page(zp, off, len); } else { /* beginning of punched area at the end of a page */ page_len = first_page_offset - off; if (page_len > 0) zfs_zero_partial_page(zp, off, page_len); /* end of punched area at the beginning of a page */ page_len = off + len - last_page_offset; if (page_len > 0) zfs_zero_partial_page(zp, last_page_offset, page_len); } } zfs_rangelock_exit(lr); return (error); } /* * Truncate a file * * IN: zp - znode of file to free data in. * end - new end-of-file. * * RETURN: 0 on success, error code on failure */ static int zfs_trunc(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; int error; sa_bulk_attr_t bulk[2]; int count = 0; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { zfs_rangelock_exit(lr); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } zp->z_size = end; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); if (end == 0) { zp->z_pflags &= ~ZFS_SPARSE; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); dmu_tx_commit(tx); zfs_rangelock_exit(lr); return (0); } /* * Free space in a file * * IN: zp - znode of file to free data in. * off - start of range * len - end of range (0 => EOF) * flag - current file open mode flags. * log - TRUE if this action should be logged * * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(zp); zilog_t *zilog = zfsvfs->z_log; uint64_t mode; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; int error; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, sizeof (mode))) != 0) return (error); if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; goto out; } if (len == 0) { error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) goto out; log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); zfs_znode_update_vfs(zp); error = 0; out: /* * Truncate the page cache - for file truncate operations, use * the purpose-built API for truncations. For punching operations, * the truncation is handled under a range lock in zfs_free_range. */ if (len == 0) truncate_setsize(ZTOI(zp), off); return (error); } void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { struct super_block *sb; zfsvfs_t *zfsvfs; uint64_t moid, obj, sa_obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int size; int error; int i; znode_t *rootzp = NULL; vattr_t vattr; znode_t *zp; zfs_acl_ids_t acl_ids; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT(error == 0); /* * Set starting attributes. */ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ uint64_t val; const char *name; ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); VERIFY(nvpair_value_uint64(elem, &val) == 0); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } ASSERT(error == 0); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) sense = val; } ASSERT(version != 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); ASSERT(error == 0); /* * Create zap object used for SA attribute registration */ if (version >= ZPL_VERSION_SA) { sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT(error == 0); } else { sa_obj = 0; } /* * Create a delete queue. */ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT(error == 0); /* * Create root znode. Create minimal znode/inode/zfsvfs/sb * to allow zfs_mknode to work. */ vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = crgetuid(cr); vattr.va_gid = crgetgid(cr); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); rootzp->z_unlinked = B_FALSE; rootzp->z_atime_dirty = B_FALSE; rootzp->z_is_sa = USE_SA(version, os); rootzp->z_pflags = 0; zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs->z_os = os; zfsvfs->z_parent = zfsvfs; zfsvfs->z_version = version; zfsvfs->z_use_fuids = USE_FUIDS(version, os); zfsvfs->z_use_sa = USE_SA(version, os); zfsvfs->z_norm = norm; sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP); sb->s_fs_info = zfsvfs; ZTOI(rootzp)->i_sb = sb; error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); ASSERT(error == 0); /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); zfsvfs->z_hold_size = size; zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, KM_SLEEP); zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); for (i = 0; i != size; i++) { avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); } VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids, zfs_init_idmap)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); zfs_acl_ids_free(&acl_ids); atomic_set(&ZTOI(rootzp)->i_count, 0); sa_handle_destroy(rootzp->z_sa_hdl); kmem_cache_free(znode_cache, rootzp); for (i = 0; i != size; i++) { avl_destroy(&zfsvfs->z_hold_trees[i]); mutex_destroy(&zfsvfs->z_hold_locks[i]); } mutex_destroy(&zfsvfs->z_znodes_lock); vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); kmem_free(sb, sizeof (struct super_block)); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } EXPORT_SYMBOL(zfs_create_fs); EXPORT_SYMBOL(zfs_obj_to_path); -/* CSTYLED */ module_param(zfs_object_mutex_size, uint, 0644); MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array"); module_param(zfs_unlink_suspend_progress, int, 0644); MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks " "(debug - leaks space into the unlinked set)"); diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c index 21f3740f6fe6..22eeef7f0743 100644 --- a/module/os/linux/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -1,2080 +1,2079 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2017, Datto, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include /* * This file is responsible for handling all of the details of generating * encryption parameters and performing encryption and authentication. * * BLOCK ENCRYPTION PARAMETERS: * Encryption /Authentication Algorithm Suite (crypt): * The encryption algorithm, mode, and key length we are going to use. We * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit * keys. All authentication is currently done with SHA512-HMAC. * * Plaintext: * The unencrypted data that we want to encrypt. * * Initialization Vector (IV): * An initialization vector for the encryption algorithms. This is used to * "tweak" the encryption algorithms so that two blocks of the same data are * encrypted into different ciphertext outputs, thus obfuscating block patterns. * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is * never reused with the same encryption key. This value is stored unencrypted * and must simply be provided to the decryption function. We use a 96 bit IV * (as recommended by NIST) for all block encryption. For non-dedup blocks we * derive the IV randomly. The first 64 bits of the IV are stored in the second * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of * level 0 blocks is the number of allocated dnodes in that block. The on-disk * format supports at most 2^15 slots per L0 dnode block, because the maximum * block size is 16MB (2^24). In either case, for level 0 blocks this number * will still be smaller than UINT32_MAX so it is safe to store the IV in the * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count * for the dnode code. * * Master key: * This is the most important secret data of an encrypted dataset. It is used * along with the salt to generate that actual encryption keys via HKDF. We * do not use the master key to directly encrypt any data because there are * theoretical limits on how much data can actually be safely encrypted with * any encryption mode. The master key is stored encrypted on disk with the * user's wrapping key. Its length is determined by the encryption algorithm. * For details on how this is stored see the block comment in dsl_crypt.c * * Salt: * Used as an input to the HKDF function, along with the master key. We use a * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt * can be used for encrypting many blocks, so we cache the current salt and the * associated derived key in zio_crypt_t so we do not need to derive it again * needlessly. * * Encryption Key: * A secret binary key, generated from an HKDF function used to encrypt and * decrypt data. * * Message Authentication Code (MAC) * The MAC is an output of authenticated encryption modes such as AES-GCM and * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted * data on disk and return garbage to the application. Effectively, it is a * checksum that can not be reproduced by an attacker. We store the MAC in the * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated * regular checksum of the ciphertext which can be used for scrubbing. * * OBJECT AUTHENTICATION: * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because * they contain some info that always needs to be readable. To prevent this * data from being altered, we authenticate this data using SHA512-HMAC. This * will produce a MAC (similar to the one produced via encryption) which can * be used to verify the object was not modified. HMACs do not require key * rotation or IVs, so we can keep up to the full 3 copies of authenticated * data. * * ZIL ENCRYPTION: * ZIL blocks have their bp written to disk ahead of the associated data, so we * cannot store the MAC there as we normally do. For these blocks the MAC is * stored in the embedded checksum within the zil_chain_t header. The salt and * IV are generated for the block on bp allocation instead of at encryption * time. In addition, ZIL blocks have some pieces that must be left in plaintext * for claiming even though all of the sensitive user data still needs to be * encrypted. The function zio_crypt_init_uios_zil() handles parsing which * pieces of the block need to be encrypted. All data that is not encrypted is * authenticated using the AAD mechanisms that the supported encryption modes * provide for. In order to preserve the semantics of the ZIL for encrypted * datasets, the ZIL is not protected at the objset level as described below. * * DNODE ENCRYPTION: * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left * in plaintext for scrubbing and claiming, but the bonus buffers might contain * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing * which pieces of the block need to be encrypted. For more details about * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). * * OBJECT SET AUTHENTICATION: * Up to this point, everything we have encrypted and authenticated has been * at level 0 (or -2 for the ZIL). If we did not do any further work the * on-disk format would be susceptible to attacks that deleted or rearranged * the order of level 0 blocks. Ideally, the cleanest solution would be to * maintain a tree of authentication MACs going up the bp tree. However, this * presents a problem for raw sends. Send files do not send information about * indirect blocks so there would be no convenient way to transfer the MACs and * they cannot be recalculated on the receive side without the master key which * would defeat one of the purposes of raw sends in the first place. Instead, * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs * from the level below. We also include some portable fields from blk_prop such * as the lsize and compression algorithm to prevent the data from being * misinterpreted. * * At the objset level, we maintain 2 separate 256 bit MACs in the * objset_phys_t. The first one is "portable" and is the logical root of the * MAC tree maintained in the metadnode's bps. The second, is "local" and is * used as the root MAC for the user accounting objects, which are also not * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload * of the send file. The useraccounting code ensures that the useraccounting * info is not present upon a receive, so the local MAC can simply be cleared * out at that time. For more info about objset_phys_t authentication, see * zio_crypt_do_objset_hmacs(). * * CONSIDERATIONS FOR DEDUP: * In order for dedup to work, blocks that we want to dedup with one another * need to use the same IV and encryption key, so that they will have the same * ciphertext. Normally, one should never reuse an IV with the same encryption * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both * blocks. In this case, however, since we are using the same plaintext as * well all that we end up with is a duplicate of the original ciphertext we * already had. As a result, an attacker with read access to the raw disk will * be able to tell which blocks are the same but this information is given away * by dedup anyway. In order to get the same IVs and encryption keys for * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC * here so that a reproducible checksum of the plaintext is never available to * the attacker. The HMAC key is kept alongside the master key, encrypted on * disk. The first 64 bits of the HMAC are used in place of the random salt, and * the next 96 bits are used as the IV. As a result of this mechanism, dedup * will only work within a clone family since encrypted dedup requires use of * the same master and HMAC keys. */ /* * After encrypting many blocks with the same key we may start to run up * against the theoretical limits of how much data can securely be encrypted * with a single key using the supported encryption modes. The most obvious * limitation is that our risk of generating 2 equivalent 96 bit IVs increases * the more IVs we generate (which both GCM and CCM modes strictly forbid). * This risk actually grows surprisingly quickly over time according to the * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have * generated n IVs with a cryptographically secure RNG, the approximate * probability p(n) of a collision is given as: * * p(n) ~= e^(-n*(n-1)/(2*(2^96))) * * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] * * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion * we must not write more than 398,065,730 blocks with the same encryption key. * Therefore, we rotate our keys after 400,000,000 blocks have been written by * generating a new random 64 bit salt for our HKDF encryption key generation * function. */ #define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 #define ZFS_CURRENT_MAX_SALT_USES \ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) static unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; typedef struct blkptr_auth_buf { uint64_t bab_prop; /* blk_prop - portable mask */ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ uint64_t bab_pad; /* reserved for future use */ } blkptr_auth_buf_t; const zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { {"", ZC_TYPE_NONE, 0, "inherit"}, {"", ZC_TYPE_NONE, 0, "on"}, {"", ZC_TYPE_NONE, 0, "off"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} }; void zio_crypt_key_destroy(zio_crypt_key_t *key) { rw_destroy(&key->zk_salt_lock); /* free crypto templates */ crypto_destroy_ctx_template(key->zk_current_tmpl); crypto_destroy_ctx_template(key->zk_hmac_tmpl); /* zero out sensitive data */ memset(key, 0, sizeof (zio_crypt_key_t)); } int zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) { int ret; crypto_mechanism_t mech = {0}; uint_t keydata_len; ASSERT(key != NULL); ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); /* * Workaround for GCC 12+ with UBSan enabled deficencies. * * GCC 12+ invoked with -fsanitize=undefined incorrectly reports the code * below as violating -Warray-bounds */ #if defined(__GNUC__) && !defined(__clang__) && \ ((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \ defined(CONFIG_UBSAN)) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" #endif keydata_len = zio_crypt_table[crypt].ci_keylen; #if defined(__GNUC__) && !defined(__clang__) && \ ((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \ defined(CONFIG_UBSAN)) #pragma GCC diagnostic pop #endif memset(key, 0, sizeof (zio_crypt_key_t)); rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); /* fill keydata buffers and salt with random data */ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); if (ret != 0) goto error; ret = random_get_bytes(key->zk_master_keydata, keydata_len); if (ret != 0) goto error; ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); if (ret != 0) goto error; ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for the ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = &key->zk_hmac_key; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); /* * Initialize the crypto templates. It's ok if this fails because * this is just an optimization. */ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, &key->zk_hmac_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_hmac_tmpl = NULL; key->zk_crypt = crypt; key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy(key); return (ret); } static int zio_crypt_key_change_salt(zio_crypt_key_t *key) { int ret = 0; uint8_t salt[ZIO_DATA_SALT_LEN]; crypto_mechanism_t mech; uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; /* generate a new salt */ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; rw_enter(&key->zk_salt_lock, RW_WRITER); /* someone beat us to the salt rotation, just unlock and return */ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) goto out_unlock; /* derive the current key from the master key and the new salt */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto out_unlock; /* assign the salt and reset the usage count */ memcpy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); key->zk_salt_count = 0; /* destroy the old context template and create the new one */ crypto_destroy_ctx_template(key->zk_current_tmpl); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; rw_exit(&key->zk_salt_lock); return (0); out_unlock: rw_exit(&key->zk_salt_lock); error: return (ret); } /* See comment above zfs_key_max_salt_uses definition for details */ int zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) { int ret; boolean_t salt_change; rw_enter(&key->zk_salt_lock, RW_READER); memcpy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= ZFS_CURRENT_MAX_SALT_USES); rw_exit(&key->zk_salt_lock); if (salt_change) { ret = zio_crypt_key_change_salt(key); if (ret != 0) goto error; } return (0); error: return (ret); } /* * This function handles all encryption and decryption in zfs. When * encrypting it expects puio to reference the plaintext and cuio to * reference the ciphertext. cuio must have enough space for the * ciphertext + room for a MAC. datalen should be the length of the * plaintext / ciphertext alone. */ static int zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key, crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio, uint8_t *authbuf, uint_t auth_len) { int ret; crypto_data_t plaindata, cipherdata; CK_AES_CCM_PARAMS ccmp; CK_AES_GCM_PARAMS gcmp; crypto_mechanism_t mech; zio_crypt_info_t crypt_info; uint_t plain_full_len, maclen; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); /* lookup the encryption info */ crypt_info = zio_crypt_table[crypt]; /* the mac will always be the last iovec_t in the cipher uio */ maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len; ASSERT(maclen <= ZIO_DATA_MAC_LEN); /* setup encryption mechanism (same as crypt) */ mech.cm_type = crypto_mech2id(crypt_info.ci_mechname); /* * Strangely, the ICP requires that plain_full_len must include * the MAC length when decrypting, even though the UIO does not * need to have the extra space allocated. */ if (encrypt) { plain_full_len = datalen; } else { plain_full_len = datalen + maclen; } /* * setup encryption params (currently only AES CCM and AES GCM * are supported) */ if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) { ccmp.ulNonceSize = ZIO_DATA_IV_LEN; ccmp.ulAuthDataSize = auth_len; ccmp.authData = authbuf; ccmp.ulMACSize = maclen; ccmp.nonce = ivbuf; ccmp.ulDataSize = plain_full_len; mech.cm_param = (char *)(&ccmp); mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS); } else { gcmp.ulIvLen = ZIO_DATA_IV_LEN; gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN); gcmp.ulAADLen = auth_len; gcmp.pAAD = authbuf; gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen); gcmp.pIv = ivbuf; mech.cm_param = (char *)(&gcmp); mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); } /* populate the cipher and plain data structs. */ plaindata.cd_format = CRYPTO_DATA_UIO; plaindata.cd_offset = 0; plaindata.cd_uio = puio; plaindata.cd_length = plain_full_len; cipherdata.cd_format = CRYPTO_DATA_UIO; cipherdata.cd_offset = 0; cipherdata.cd_uio = cuio; cipherdata.cd_length = datalen + maclen; /* perform the actual encryption */ if (encrypt) { ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } } else { ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata); if (ret != CRYPTO_SUCCESS) { ASSERT3U(ret, ==, CRYPTO_INVALID_MAC); ret = SET_ERROR(ECKSUM); goto error; } } return (0); error: return (ret); } int zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) { int ret; zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint64_t crypt = key->zk_crypt; uint_t enc_len, keydata_len, aad_len; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); keydata_len = zio_crypt_table[crypt].ci_keylen; /* generate iv for wrapping the master and hmac key */ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); if (ret != 0) goto error; /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[0].iov_base = keydata_out; cipher_iovecs[0].iov_len = keydata_len; cipher_iovecs[1].iov_base = hmac_keydata_out; cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[2].iov_base = mac; cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; /* * Although we don't support writing to the old format, we do * support rewrapping the key so that the user can move and * quarantine datasets on the old format. */ if (key->zk_version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(key->zk_guid); } else { ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(key->zk_guid); aad[1] = LE_64(crypt); aad[2] = LE_64(key->zk_version); } enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; puio.uio_iov = plain_iovecs; puio.uio_iovcnt = 2; puio.uio_segflg = UIO_SYSSPACE; cuio.uio_iov = cipher_iovecs; cuio.uio_iovcnt = 3; cuio.uio_segflg = UIO_SYSSPACE; /* encrypt the keys and store the resulting ciphertext and mac */ ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len, &puio, &cuio, (uint8_t *)aad, aad_len); if (ret != 0) goto error; return (0); error: return (ret); } int zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, uint8_t *mac, zio_crypt_key_t *key) { crypto_mechanism_t mech; zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint_t enc_len, keydata_len, aad_len; int ret; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); keydata_len = zio_crypt_table[crypt].ci_keylen; /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[0].iov_base = keydata; cipher_iovecs[0].iov_len = keydata_len; cipher_iovecs[1].iov_base = hmac_keydata; cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[2].iov_base = mac; cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; if (version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(guid); } else { ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(guid); aad[1] = LE_64(crypt); aad[2] = LE_64(version); } enc_len = keydata_len + SHA512_HMAC_KEYLEN; puio.uio_iov = plain_iovecs; puio.uio_segflg = UIO_SYSSPACE; puio.uio_iovcnt = 2; cuio.uio_iov = cipher_iovecs; cuio.uio_iovcnt = 3; cuio.uio_segflg = UIO_SYSSPACE; /* decrypt the keys and store the result in the output buffers */ ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len, &puio, &cuio, (uint8_t *)aad, aad_len); if (ret != 0) goto error; /* generate a fresh salt */ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = key->zk_hmac_keydata; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); /* * Initialize the crypto templates. It's ok if this fails because * this is just an optimization. */ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, &key->zk_hmac_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_hmac_tmpl = NULL; key->zk_crypt = crypt; key->zk_version = version; key->zk_guid = guid; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy(key); return (ret); } int zio_crypt_generate_iv(uint8_t *ivbuf) { int ret; /* randomly generate the IV */ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); if (ret != 0) goto error; return (0); error: memset(ivbuf, 0, ZIO_DATA_IV_LEN); return (ret); } int zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *digestbuf, uint_t digestlen) { int ret; crypto_mechanism_t mech; crypto_data_t in_data, digest_data; uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); /* initialize sha512-hmac mechanism and crypto data */ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); mech.cm_param = NULL; mech.cm_param_len = 0; /* initialize the crypto data */ in_data.cd_format = CRYPTO_DATA_RAW; in_data.cd_offset = 0; in_data.cd_length = datalen; in_data.cd_raw.iov_base = (char *)data; in_data.cd_raw.iov_len = in_data.cd_length; digest_data.cd_format = CRYPTO_DATA_RAW; digest_data.cd_offset = 0; digest_data.cd_length = SHA512_DIGEST_LENGTH; digest_data.cd_raw.iov_base = (char *)raw_digestbuf; digest_data.cd_raw.iov_len = digest_data.cd_length; /* generate the hmac */ ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl, &digest_data); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } memcpy(digestbuf, raw_digestbuf, digestlen); return (0); error: memset(digestbuf, 0, digestlen); return (ret); } int zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *ivbuf, uint8_t *salt) { int ret; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; ret = zio_crypt_do_hmac(key, data, datalen, digestbuf, SHA512_DIGEST_LENGTH); if (ret != 0) return (ret); memcpy(salt, digestbuf, ZIO_DATA_SALT_LEN); memcpy(ivbuf, digestbuf + ZIO_DATA_SALT_LEN, ZIO_DATA_IV_LEN); return (0); } /* * The following functions are used to encode and decode encryption parameters * into blkptr_t and zil_header_t. The ICP wants to use these parameters as * byte strings, which normally means that these strings would not need to deal * with byteswapping at all. However, both blkptr_t and zil_header_t may be * byteswapped by lower layers and so we must "undo" that byteswap here upon * decoding and encoding in a non-native byteorder. These functions require * that the byteorder bit is correct before being called. */ void zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_ENCRYPTED(bp)); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); memcpy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, val32); } else { memcpy(&val64, salt, sizeof (uint64_t)); bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); memcpy(&val64, iv, sizeof (uint64_t)); bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, BSWAP_32(val32)); } } void zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_PROTECTED(bp)); /* for convenience, so callers don't need to check */ if (BP_IS_AUTHENTICATED(bp)) { memset(salt, 0, ZIO_DATA_SALT_LEN); memset(iv, 0, ZIO_DATA_IV_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); memcpy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); val32 = (uint32_t)BP_GET_IV2(bp); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } else { val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); memcpy(salt, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); memcpy(iv, &val64, sizeof (uint64_t)); val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } } void zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } else { memcpy(&val64, mac, sizeof (uint64_t)); bp->blk_cksum.zc_word[2] = BSWAP_64(val64); memcpy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); bp->blk_cksum.zc_word[3] = BSWAP_64(val64); } } void zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); /* for convenience, so callers don't need to check */ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { memset(mac, 0, ZIO_DATA_MAC_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], sizeof (uint64_t)); } else { val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); memcpy(mac, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); memcpy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); } } void zio_crypt_encode_mac_zil(void *data, uint8_t *mac) { zil_chain_t *zilc = data; memcpy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } void zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) { /* * The ZIL MAC is embedded in the block it protects, which will * not have been byteswapped by the time this function has been called. * As a result, we don't need to worry about byteswapping the MAC. */ const zil_chain_t *zilc = data; memcpy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], sizeof (uint64_t)); } /* * This routine takes a block of dnodes (src_abd) and copies only the bonus * buffers to the same offsets in the dst buffer. datalen should be the size * of both the src_abd and the dst buffer (not just the length of the bonus * buffers). */ void zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) { uint_t i, max_dnp = datalen >> DNODE_SHIFT; uint8_t *src; dnode_phys_t *dnp, *sdnp, *ddnp; src = abd_borrow_buf_copy(src_abd, datalen); sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), DN_MAX_BONUS_LEN(dnp)); } } abd_return_buf(src_abd, src, datalen); } /* * This function decides what fields from blk_prop are included in * the on-disk various MAC algorithms. */ static void zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) { /* * Version 0 did not properly zero out all non-portable fields * as it should have done. We maintain this code so that we can * do read-only imports of pools on this version. */ if (version == 0) { BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); return; } ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); /* * The hole_birth feature might set these fields even if this bp * is a hole. We zero them out here to guarantee that raw sends * will function with or without the feature. */ if (BP_IS_HOLE(bp)) { bp->blk_prop = 0ULL; return; } /* * At L0 we want to verify these fields to ensure that data blocks * can not be reinterpreted. For instance, we do not want an attacker * to trick us into returning raw lz4 compressed data to the user * by modifying the compression bits. At higher levels, we cannot * enforce this policy since raw sends do not convey any information * about indirect blocks, so these values might be different on the * receive side. Fortunately, this does not open any new attack * vectors, since any alterations that can be made to a higher level * bp must still verify the correct order of the layer below it. */ if (BP_GET_LEVEL(bp) != 0) { BP_SET_BYTEORDER(bp, 0); BP_SET_COMPRESS(bp, 0); /* * psize cannot be set to zero or it will trigger * asserts, but the value doesn't really matter as * long as it is constant. */ BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); } BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); } static void zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, blkptr_auth_buf_t *bab, uint_t *bab_len) { blkptr_t tmpbp = *bp; if (should_bswap) byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); ASSERT0(BP_IS_EMBEDDED(&tmpbp)); zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); /* * We always MAC blk_prop in LE to ensure portability. This * must be done after decoding the mac, since the endianness * will get zero'd out here. */ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); bab->bab_prop = LE_64(tmpbp.blk_prop); bab->bab_pad = 0ULL; /* version 0 did not include the padding */ *bab_len = sizeof (blkptr_auth_buf_t); if (version == 0) *bab_len -= sizeof (uint64_t); } static int zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { int ret; uint_t bab_len; blkptr_auth_buf_t bab; crypto_data_t cd; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; cd.cd_length = bab_len; cd.cd_raw.iov_base = (char *)&bab; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } return (0); error: return (ret); } static void zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); SHA2Update(ctx, &bab, bab_len); } static void zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); memcpy(*aadp, &bab, bab_len); *aadp += bab_len; *aad_len += bab_len; } static int zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, dnode_phys_t *dnp) { int ret, i; dnode_phys_t *adnp, tmp_dncore; size_t dn_core_size = offsetof(dnode_phys_t, dn_blkptr); boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); crypto_data_t cd; cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; /* * Authenticate the core dnode (masking out non-portable bits). * We only copy the first 64 bytes we operate on to avoid the overhead * of copying 512-64 unneeded bytes. The compiler seems to be fine * with that. */ memcpy(&tmp_dncore, dnp, dn_core_size); adnp = &tmp_dncore; if (le_bswap) { adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); adnp->dn_used = BSWAP_64(adnp->dn_used); } adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; cd.cd_length = dn_core_size; cd.cd_raw.iov_base = (char *)adnp; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } for (i = 0; i < dnp->dn_nblkptr; i++) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, &dnp->dn_blkptr[i]); if (ret != 0) goto error; } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, DN_SPILL_BLKPTR(dnp)); if (ret != 0) goto error; } return (0); error: return (ret); } /* * objset_phys_t blocks introduce a number of exceptions to the normal * authentication process. objset_phys_t's contain 2 separate HMACS for * protecting the integrity of their data. The portable_mac protects the * metadnode. This MAC can be sent with a raw send and protects against * reordering of data within the metadnode. The local_mac protects the user * accounting objects which are not sent from one system to another. * * In addition, objset blocks are the only blocks that can be modified and * written to disk without the key loaded under certain circumstances. During * zil_claim() we need to be able to update the zil_header_t to complete * claiming log blocks and during raw receives we need to write out the * portable_mac from the send file. Both of these actions are possible * because these fields are not protected by either MAC so neither one will * need to modify the MACs without the key. However, when the modified blocks * are written out they will be byteswapped into the host machine's native * endianness which will modify fields protected by the MAC. As a result, MAC * calculation for objset blocks works slightly differently from other block * types. Where other block types MAC the data in whatever endianness is * written to disk, objset blocks always MAC little endian version of their * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() * and le_bswap indicates whether a byteswap is needed to get this block * into little endian format. */ int zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) { int ret; crypto_mechanism_t mech; crypto_context_t ctx; crypto_data_t cd; objset_phys_t *osp = data; uint64_t intval; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; /* initialize HMAC mechanism */ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); mech.cm_param = NULL; mech.cm_param_len = 0; cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; /* calculate the portable MAC from the portable fields and metadnode */ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the os_type */ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in fields from the metadnode */ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_meta_dnode); if (ret) goto error; /* store the final digest in a temporary buffer and copy what we need */ cd.cd_length = SHA512_DIGEST_LENGTH; cd.cd_raw.iov_base = (char *)raw_portable_mac; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_final(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } memcpy(portable_mac, raw_portable_mac, ZIO_OBJSET_MAC_LEN); /* * This is necessary here as we check next whether * OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in order to * decide if the local_mac should be zeroed out. That flag will always * be set by dmu_objset_id_quota_upgrade_cb() and * dmu_objset_userspace_upgrade_cb() if useraccounting has been * completed. */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); boolean_t uacct_incomplete = !(intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE); /* * The local MAC protects the user, group and project accounting. * If these objects are not present, the local MAC is zeroed out. */ if (uacct_incomplete || (datalen >= OBJSET_PHYS_SIZE_V3 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE && osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || (datalen >= OBJSET_PHYS_SIZE_V2 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || (datalen <= OBJSET_PHYS_SIZE_V1)) { memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (0); } /* calculate the local MAC from the userused and groupused dnodes */ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the non-portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in fields from the user accounting dnodes */ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_userused_dnode); if (ret) goto error; } if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_groupused_dnode); if (ret) goto error; } if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && datalen >= OBJSET_PHYS_SIZE_V3) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_projectused_dnode); if (ret) goto error; } /* store the final digest in a temporary buffer and copy what we need */ cd.cd_length = SHA512_DIGEST_LENGTH; cd.cd_raw.iov_base = (char *)raw_local_mac; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_final(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } memcpy(local_mac, raw_local_mac, ZIO_OBJSET_MAC_LEN); return (0); error: memset(portable_mac, 0, ZIO_OBJSET_MAC_LEN); memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (ret); } static void zio_crypt_destroy_uio(zfs_uio_t *uio) { if (uio->uio_iov) kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); } /* * This function parses an uncompressed indirect block and returns a checksum * of all the portable fields from all of the contained bps. The portable * fields are the MAC and all of the fields from blk_prop except for the dedup, * checksum, and psize bits. For an explanation of the purpose of this, see * the comment block on object set authentication. */ static int zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) { blkptr_t *bp; int i, epb = datalen >> SPA_BLKPTRSHIFT; SHA2_CTX ctx; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; /* checksum all of the MACs from the layer below */ SHA2Init(SHA512, &ctx); for (i = 0, bp = buf; i < epb; i++, bp++) { zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, byteswap, bp); } SHA2Final(digestbuf, &ctx); if (generate) { memcpy(cksum, digestbuf, ZIO_DATA_MAC_LEN); return (0); } if (memcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) return (SET_ERROR(ECKSUM)); return (0); } int zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; /* * Unfortunately, callers of this function will not always have * easy access to the on-disk format version. This info is * normally found in the DSL Crypto Key, but the checksum-of-MACs * is expected to be verifiable even when the key isn't loaded. * Here, instead of doing a ZAP lookup for the version for each * zio, we simply try both existing formats. */ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); if (ret == ECKSUM) { ASSERT(!generate); ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, 0, byteswap, cksum); } return (ret); } int zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; void *buf; buf = abd_borrow_buf_copy(abd, datalen); ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, byteswap, cksum); abd_return_buf(abd, buf, datalen); return (ret); } /* * Special case handling routine for encrypting / decrypting ZIL blocks. * We do not check for the older ZIL chain because the encryption feature * was not available before the newer ZIL chain was introduced. The goal * here is to encrypt everything except the blkptr_t of a lr_write_t and * the zil_chain_t header. Everything that is not encrypted is authenticated. */ static int zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; uint64_t txtype, lr_len, nused; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; zil_chain_t *zilc; lr_t *lr; uint8_t *aadbuf = zio_buf_alloc(datalen); /* cipherbuf always needs an extra iovec for the MAC */ if (encrypt) { src = plainbuf; dst = cipherbuf; nr_src = 0; nr_dst = 1; } else { src = cipherbuf; dst = plainbuf; nr_src = 1; nr_dst = 0; } memset(dst, 0, datalen); /* find the start and end record of the log block */ zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); ASSERT3U(nused, >=, sizeof (zil_chain_t)); ASSERT3U(nused, <=, datalen); blkend = src + nused; /* calculate the number of encrypted iovecs we will need */ for (; slrp < blkend; slrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } ASSERT3U(lr_len, >=, sizeof (lr_t)); ASSERT3U(lr_len, <=, blkend - slrp); nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) nr_iovecs++; } nr_src += nr_iovecs; nr_dst += nr_iovecs; /* allocate the iovec arrays */ if (nr_src != 0) { src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); if (src_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } if (nr_dst != 0) { dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); if (dst_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } /* * Copy the plain zil header over and authenticate everything except * the checksum that will store our MAC. If we are writing the data * the embedded checksum will not have been calculated yet, so we don't * authenticate that. */ memcpy(dst, src, sizeof (zil_chain_t)); memcpy(aadp, src, sizeof (zil_chain_t) - sizeof (zio_eck_t)); aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); /* loop over records again, filling in iovecs */ nr_iovecs = 0; slrp = src + sizeof (zil_chain_t); dlrp = dst + sizeof (zil_chain_t); for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } /* copy the common lr_t */ memcpy(dlrp, slrp, sizeof (lr_t)); memcpy(aadp, slrp, sizeof (lr_t)); aadp += sizeof (lr_t); aad_len += sizeof (lr_t); ASSERT3P(src_iovecs, !=, NULL); ASSERT3P(dst_iovecs, !=, NULL); /* * If this is a TX_WRITE record we want to encrypt everything * except the bp if exists. If the bp does exist we want to * authenticate it. */ if (txtype == TX_WRITE) { const size_t o = offsetof(lr_write_t, lr_blkptr); crypt_len = o - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; /* copy the bp now since it will not be encrypted */ memcpy(dlrp + o, slrp + o, sizeof (blkptr_t)); memcpy(aadp, slrp + o, sizeof (blkptr_t)); aadp += sizeof (blkptr_t); aad_len += sizeof (blkptr_t); nr_iovecs++; total_len += crypt_len; if (lr_len != sizeof (lr_write_t)) { crypt_len = lr_len - sizeof (lr_write_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_write_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_write_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } } else if (txtype == TX_CLONE_RANGE) { const size_t o = offsetof(lr_clone_range_t, lr_nbps); crypt_len = o - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; /* copy the bps now since they will not be encrypted */ memcpy(dlrp + o, slrp + o, lr_len - o); memcpy(aadp, slrp + o, lr_len - o); aadp += lr_len - o; aad_len += lr_len - o; nr_iovecs++; total_len += crypt_len; } else { crypt_len = lr_len - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } } *no_crypt = (nr_iovecs == 0); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; if (encrypt) { puio->uio_iov = src_iovecs; puio->uio_iovcnt = nr_src; cuio->uio_iov = dst_iovecs; cuio->uio_iovcnt = nr_dst; } else { puio->uio_iov = dst_iovecs; puio->uio_iovcnt = nr_dst; cuio->uio_iov = src_iovecs; cuio->uio_iovcnt = nr_src; } return (0); error: zio_buf_free(aadbuf, datalen); if (src_iovecs != NULL) kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); if (dst_iovecs != NULL) kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); *enc_len = 0; *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } /* * Special case handling routine for encrypting / decrypting dnode blocks. */ static int zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; uint8_t *src, *dst, *aadp; dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; uint8_t *aadbuf = zio_buf_alloc(datalen); if (encrypt) { src = plainbuf; dst = cipherbuf; nr_src = 0; nr_dst = 1; } else { src = cipherbuf; dst = plainbuf; nr_src = 1; nr_dst = 0; } sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; aadp = aadbuf; /* * Count the number of iovecs we will need to do the encryption by * counting the number of bonus buffers that need to be encrypted. */ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { /* * This block may still be byteswapped. However, all of the * values we use are either uint8_t's (for which byteswapping * is a noop) or a * != 0 check, which will work regardless * of whether or not we byteswap. */ if (sdnp[i].dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && sdnp[i].dn_bonuslen != 0) { nr_iovecs++; } } nr_src += nr_iovecs; nr_dst += nr_iovecs; if (nr_src != 0) { src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); if (src_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } if (nr_dst != 0) { dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); if (dst_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } nr_iovecs = 0; /* * Iterate through the dnodes again, this time filling in the uios * we allocated earlier. We also concatenate any data we want to * authenticate onto aadbuf. */ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; /* copy over the core fields and blkptrs (kept as plaintext) */ memcpy(&ddnp[i], dnp, (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { memcpy(DN_SPILL_BLKPTR(&ddnp[i]), DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); } /* * Handle authenticated data. We authenticate everything in * the dnode that can be brought over when we do a raw send. * This includes all of the core fields as well as the MACs * stored in the bp checksums and all of the portable bits * from blk_prop. We include the dnode padding here in case it * ever gets used in the future. Some dn_flags and dn_used are * not portable so we mask those out values out of the * authenticated data. */ crypt_len = offsetof(dnode_phys_t, dn_blkptr); memcpy(aadp, dnp, crypt_len); adnp = (dnode_phys_t *)aadp; adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; aadp += crypt_len; aad_len += crypt_len; for (j = 0; j < dnp->dn_nblkptr; j++) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, &dnp->dn_blkptr[j]); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, DN_SPILL_BLKPTR(dnp)); } /* * If this bonus buffer needs to be encrypted, we prepare an * iovec_t. The encryption / decryption functions will fill * this in for us with the encrypted or decrypted data. * Otherwise we add the bonus buffer to the authenticated * data buffer and copy it over to the destination. The * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that * we can guarantee alignment with the AES block size * (128 bits). */ crypt_len = DN_MAX_BONUS_LEN(dnp); if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { ASSERT3U(nr_iovecs, <, nr_src); ASSERT3U(nr_iovecs, <, nr_dst); ASSERT3P(src_iovecs, !=, NULL); ASSERT3P(dst_iovecs, !=, NULL); src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } else { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), crypt_len); memcpy(aadp, DN_BONUS(dnp), crypt_len); aadp += crypt_len; aad_len += crypt_len; } } *no_crypt = (nr_iovecs == 0); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; if (encrypt) { puio->uio_iov = src_iovecs; puio->uio_iovcnt = nr_src; cuio->uio_iov = dst_iovecs; cuio->uio_iovcnt = nr_dst; } else { puio->uio_iov = dst_iovecs; puio->uio_iovcnt = nr_dst; cuio->uio_iov = src_iovecs; cuio->uio_iovcnt = nr_src; } return (0); error: zio_buf_free(aadbuf, datalen); if (src_iovecs != NULL) kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); if (dst_iovecs != NULL) kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); *enc_len = 0; *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } static int zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len) { (void) encrypt; int ret; uint_t nr_plain = 1, nr_cipher = 2; iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; /* allocate the iovecs for the plain and cipher data */ plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t), KM_SLEEP); if (!plain_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t), KM_SLEEP); if (!cipher_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } plain_iovecs[0].iov_base = plainbuf; plain_iovecs[0].iov_len = datalen; cipher_iovecs[0].iov_base = cipherbuf; cipher_iovecs[0].iov_len = datalen; *enc_len = datalen; puio->uio_iov = plain_iovecs; puio->uio_iovcnt = nr_plain; cuio->uio_iov = cipher_iovecs; cuio->uio_iovcnt = nr_cipher; return (0); error: if (plain_iovecs != NULL) kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); if (cipher_iovecs != NULL) kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); *enc_len = 0; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } /* * This function builds up the plaintext (puio) and ciphertext (cuio) uios so * that they can be used for encryption and decryption by zio_do_crypt_uio(). * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks * requiring special handling to parse out pieces that are to be encrypted. The * authbuf is used by these special cases to store additional authenticated * data (AAD) for the encryption modes. */ static int zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; iovec_t *mac_iov; ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); /* route to handler */ switch (ot) { case DMU_OT_INTENT_LOG: ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; case DMU_OT_DNODE: ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; default: ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, datalen, puio, cuio, enc_len); *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; break; } if (ret != 0) goto error; /* populate the uios */ puio->uio_segflg = UIO_SYSSPACE; cuio->uio_segflg = UIO_SYSSPACE; mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); mac_iov->iov_base = mac; mac_iov->iov_len = ZIO_DATA_MAC_LEN; return (0); error: return (ret); } /* * Primary encryption / decryption entrypoint for zio data. */ int zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, boolean_t *no_crypt) { int ret; boolean_t locked = B_FALSE; uint64_t crypt = key->zk_crypt; uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; uint_t enc_len, auth_len; zfs_uio_t puio, cuio; uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; crypto_key_t tmp_ckey, *ckey = NULL; crypto_ctx_template_t tmpl; uint8_t *authbuf = NULL; memset(&puio, 0, sizeof (puio)); memset(&cuio, 0, sizeof (cuio)); /* * If the needed key is the current one, just use it. Otherwise we * need to generate a temporary one from the given salt + master key. * If we are encrypting, we must return a copy of the current salt * so that it can be stored in the blkptr_t. */ rw_enter(&key->zk_salt_lock, RW_READER); locked = B_TRUE; if (memcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { ckey = &key->zk_current_key; tmpl = key->zk_current_tmpl; } else { rw_exit(&key->zk_salt_lock); locked = B_FALSE; ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); if (ret != 0) goto error; tmp_ckey.ck_data = enc_keydata; tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); ckey = &tmp_ckey; tmpl = NULL; } /* * Attempt to use QAT acceleration if we can. We currently don't * do this for metadnode and ZIL blocks, since they have a much * more involved buffer layout and the qat_crypt() function only * works in-place. */ if (qat_crypt_use_accel(datalen) && ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) { uint8_t *srcbuf, *dstbuf; if (encrypt) { srcbuf = plainbuf; dstbuf = cipherbuf; } else { srcbuf = cipherbuf; dstbuf = plainbuf; } ret = qat_crypt((encrypt) ? QAT_ENCRYPT : QAT_DECRYPT, srcbuf, dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen); if (ret == CPA_STATUS_SUCCESS) { if (locked) { rw_exit(&key->zk_salt_lock); locked = B_FALSE; } return (0); } /* If the hardware implementation fails fall back to software */ } /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, &authbuf, &auth_len, no_crypt); if (ret != 0) goto error; /* perform the encryption / decryption in software */ ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len, &puio, &cuio, authbuf, auth_len); if (ret != 0) goto error; if (locked) { rw_exit(&key->zk_salt_lock); } if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (0); error: if (locked) rw_exit(&key->zk_salt_lock); if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (ret); } /* * Simple wrapper around zio_do_crypt_data() to work with abd's instead of * linear buffers. */ int zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) { int ret; void *ptmp, *ctmp; if (encrypt) { ptmp = abd_borrow_buf_copy(pabd, datalen); ctmp = abd_borrow_buf(cabd, datalen); } else { ptmp = abd_borrow_buf(pabd, datalen); ctmp = abd_borrow_buf_copy(cabd, datalen); } ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, datalen, ptmp, ctmp, no_crypt); if (ret != 0) goto error; if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (0); error: if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (ret); } #if defined(_KERNEL) -/* CSTYLED */ module_param(zfs_key_max_salt_uses, ulong, 0644); MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " "can be used for generating encryption keys before it is rotated"); #endif diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index f6e014327717..ff1370c543dc 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1,1149 +1,1148 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. */ #ifdef CONFIG_COMPAT #include #endif #include #include #include #include #include #include #include #include #if defined(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS) || \ defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO) #include #endif #include #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO #include #endif /* * When using fallocate(2) to preallocate space, inflate the requested * capacity check by 10% to account for the required metadata blocks. */ static unsigned int zfs_fallocate_reserve_percent = 110; static int zpl_open(struct inode *ip, struct file *filp) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; error = generic_file_open(ip, filp); if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_release(struct inode *ip, struct file *filp) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); if (ITOZ(ip)->z_atime_dirty) zfs_mark_inode_dirty(ip); crhold(cr); error = -zfs_close(ip, filp->f_flags, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_iterate(struct file *filp, struct dir_context *ctx) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_readdir(file_inode(filp), ctx, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; znode_t *zp = ITOZ(inode); zfsvfs_t *zfsvfs = ITOZSB(inode); cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; /* * The variables z_sync_writes_cnt and z_async_writes_cnt work in * tandem so that sync writes can detect if there are any non-sync * writes going on and vice-versa. The "vice-versa" part to this logic * is located in zfs_putpage() where non-sync writes check if there are * any ongoing sync writes. If any sync and non-sync writes overlap, * we do a commit to complete the non-sync writes since the latter can * potentially take several seconds to complete and thus block sync * writes in the upcoming call to filemap_write_and_wait_range(). */ atomic_inc_32(&zp->z_sync_writes_cnt); /* * If the following check does not detect an overlapping non-sync write * (say because it's just about to start), then it is guaranteed that * the non-sync write will detect this sync write. This is because we * always increment z_sync_writes_cnt / z_async_writes_cnt before doing * the check on z_async_writes_cnt / z_sync_writes_cnt here and in * zfs_putpage() respectively. */ if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { if ((error = zpl_enter(zfsvfs, FTAG)) != 0) { atomic_dec_32(&zp->z_sync_writes_cnt); return (error); } zil_commit(zfsvfs->z_log, zp->z_id); zpl_exit(zfsvfs, FTAG); } error = filemap_write_and_wait_range(inode->i_mapping, start, end); /* * The sync write is not complete yet but we decrement * z_sync_writes_cnt since zfs_fsync() increments and decrements * it internally. If a non-sync write starts just after the decrement * operation but before we call zfs_fsync(), it may not detect this * overlapping sync write but it does not matter since we have already * gone past filemap_write_and_wait_range() and we won't block due to * the non-sync write. */ atomic_dec_32(&zp->z_sync_writes_cnt); if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(zp, datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static inline int zfs_io_flags(struct kiocb *kiocb) { int flags = 0; #if defined(IOCB_DSYNC) if (kiocb->ki_flags & IOCB_DSYNC) flags |= O_DSYNC; #endif #if defined(IOCB_SYNC) if (kiocb->ki_flags & IOCB_SYNC) flags |= O_SYNC; #endif #if defined(IOCB_APPEND) if (kiocb->ki_flags & IOCB_APPEND) flags |= O_APPEND; #endif #if defined(IOCB_DIRECT) if (kiocb->ki_flags & IOCB_DIRECT) flags |= O_DIRECT; #endif return (flags); } /* * If relatime is enabled, call file_accessed() if zfs_relatime_need_update() * is true. This is needed since datasets with inherited "relatime" property * aren't necessarily mounted with the MNT_RELATIME flag (e.g. after * `zfs set relatime=...`), which is what relatime test in VFS by * relatime_need_update() is based on. */ static inline void zpl_file_accessed(struct file *filp) { struct inode *ip = filp->f_mapping->host; if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) { if (zfs_relatime_need_update(ip)) file_accessed(filp); } else { file_accessed(filp); } } /* * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports * iovecs, kvevs, bvecs and pipes, plus all the required interfaces to * manipulate the iov_iter are available. In which case the full iov_iter * can be attached to the uio and correctly handled in the lower layers. * Otherwise, for older kernels extract the iovec and pass it instead. */ static void zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, loff_t pos, ssize_t count, size_t skip) { #if defined(HAVE_VFS_IOV_ITER) zfs_uio_iov_iter_init(uio, to, pos, count, skip); #else zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, zfs_uio_iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, count, skip); #endif } static ssize_t zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) { cred_t *cr = CRED(); fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; ssize_t count = iov_iter_count(to); zfs_uio_t uio; zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); crhold(cr); cookie = spl_fstrans_mark(); ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); if (ret < 0) return (ret); ssize_t read = count - uio.uio_resid; kiocb->ki_pos += read; zpl_file_accessed(filp); return (read); } static inline ssize_t zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, size_t *countp) { ssize_t ret = generic_write_checks(kiocb, from); if (ret <= 0) return (ret); *countp = ret; return (0); } static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { cred_t *cr = CRED(); fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; zfs_uio_t uio; size_t count = 0; ssize_t ret; ret = zpl_generic_write_checks(kiocb, from, &count); if (ret) return (ret); zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); crhold(cr); cookie = spl_fstrans_mark(); ret = -zfs_write(ITOZ(ip), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); if (ret < 0) return (ret); ssize_t wrote = count - uio.uio_resid; kiocb->ki_pos += wrote; return (wrote); } static ssize_t zpl_direct_IO_impl(void) { /* * All O_DIRECT requests should be handled by * zpl_{iter/aio}_{write/read}(). There is no way kernel generic code * should call the direct_IO address_space_operations function. We set * this code path to be fatal if it is executed. */ PANIC(0); return (0); } #if defined(HAVE_VFS_DIRECT_IO_ITER) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) { return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { return (zpl_direct_IO_impl()); } #else #error "Unknown Direct I/O interface" #endif static loff_t zpl_llseek(struct file *filp, loff_t offset, int whence) { #if defined(SEEK_HOLE) && defined(SEEK_DATA) fstrans_cookie_t cookie; if (whence == SEEK_DATA || whence == SEEK_HOLE) { struct inode *ip = filp->f_mapping->host; loff_t maxbytes = ip->i_sb->s_maxbytes; loff_t error; spl_inode_lock_shared(ip); cookie = spl_fstrans_mark(); error = -zfs_holey(ITOZ(ip), whence, &offset); spl_fstrans_unmark(cookie); if (error == 0) error = lseek_execute(filp, ip, offset, maxbytes); spl_inode_unlock_shared(ip); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ return (generic_file_llseek(filp, offset, whence)); } /* * It's worth taking a moment to describe how mmap is implemented * for zfs because it differs considerably from other Linux filesystems. * However, this issue is handled the same way under OpenSolaris. * * The issue is that by design zfs bypasses the Linux page cache and * leaves all caching up to the ARC. This has been shown to work * well for the common read(2)/write(2) case. However, mmap(2) * is problem because it relies on being tightly integrated with the * page cache. To handle this we cache mmap'ed files twice, once in * the ARC and a second time in the page cache. The code is careful * to keep both copies synchronized. * * When a file with an mmap'ed region is written to using write(2) * both the data in the ARC and existing pages in the page cache * are updated. For a read(2) data will be read first from the page * cache then the ARC if needed. Neither a write(2) or read(2) will * will ever result in new pages being added to the page cache. * * New pages are added to the page cache only via .readpage() which * is called when the vfs needs to read a page off disk to back the * virtual memory region. These pages may be modified without * notifying the ARC and will be written out periodically via * .writepage(). This will occur due to either a sync or the usual * page aging behavior. Note because a read(2) of a mmap'ed file * will always check the page cache first even when the ARC is out * of date correct data will still be returned. * * While this implementation ensures correct behavior it does have * have some drawbacks. The most obvious of which is that it * increases the required memory footprint when access mmap'ed * files. It also adds additional complexity to the code keeping * both caches synchronized. * * Longer term it may be possible to cleanly resolve this wart by * mapping page cache pages directly on to the ARC buffers. The * Linux address space operations are flexible enough to allow * selection of which pages back a particular index. The trick * would be working out the details of which subsystem is in * charge, the ARC, the page cache, or both. It may also prove * helpful to move the ARC buffers to a scatter-gather lists * rather than a vmalloc'ed region. */ static int zpl_mmap(struct file *filp, struct vm_area_struct *vma) { struct inode *ip = filp->f_mapping->host; int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start, (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags); spl_fstrans_unmark(cookie); if (error) return (error); error = generic_file_mmap(filp, vma); if (error) return (error); return (error); } /* * Populate a page with data for the Linux page cache. This function is * only used to support mmap(2). There will be an identical copy of the * data in the ARC which is kept up to date via .write() and .writepage(). */ static inline int zpl_readpage_common(struct page *pp) { fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); cookie = spl_fstrans_mark(); int error = -zfs_getpage(pp->mapping->host, pp); spl_fstrans_unmark(cookie); unlock_page(pp); return (error); } #ifdef HAVE_VFS_READ_FOLIO static int zpl_read_folio(struct file *filp, struct folio *folio) { return (zpl_readpage_common(&folio->page)); } #else static int zpl_readpage(struct file *filp, struct page *pp) { return (zpl_readpage_common(pp)); } #endif static int zpl_readpage_filler(void *data, struct page *pp) { return (zpl_readpage_common(pp)); } /* * Populate a set of pages with data for the Linux page cache. This * function will only be called for read ahead and never for demand * paging. For simplicity, the code relies on read_cache_pages() to * correctly lock each page for IO and call zpl_readpage(). */ #ifdef HAVE_VFS_READPAGES static int zpl_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL)); } #else static void zpl_readahead(struct readahead_control *ractl) { struct page *page; while ((page = readahead_page(ractl)) != NULL) { int ret; ret = zpl_readpage_filler(NULL, page); put_page(page); if (ret) break; } } #endif static int zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) { boolean_t *for_sync = data; fstrans_cookie_t cookie; int ret; ASSERT(PageLocked(pp)); ASSERT(!PageWriteback(pp)); cookie = spl_fstrans_mark(); ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); spl_fstrans_unmark(cookie); return (ret); } #ifdef HAVE_WRITEPAGE_T_FOLIO static int zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) { return (zpl_putpage(&pp->page, wbc, data)); } #endif static inline int zpl_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, void *data) { int result; #ifdef HAVE_WRITEPAGE_T_FOLIO result = write_cache_pages(mapping, wbc, zpl_putfolio, data); #else result = write_cache_pages(mapping, wbc, zpl_putpage, data); #endif return (result); } static int zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) { znode_t *zp = ITOZ(mapping->host); zfsvfs_t *zfsvfs = ITOZSB(mapping->host); enum writeback_sync_modes sync_mode; int result; if ((result = zpl_enter(zfsvfs, FTAG)) != 0) return (result); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; zpl_exit(zfsvfs, FTAG); sync_mode = wbc->sync_mode; /* * We don't want to run write_cache_pages() in SYNC mode here, because * that would make putpage() wait for a single page to be committed to * disk every single time, resulting in atrocious performance. Instead * we run it once in non-SYNC mode so that the ZIL gets all the data, * and then we commit it all in one go. */ boolean_t for_sync = (sync_mode == WB_SYNC_ALL); wbc->sync_mode = WB_SYNC_NONE; result = zpl_write_cache_pages(mapping, wbc, &for_sync); if (sync_mode != wbc->sync_mode) { if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (result); if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, zp->z_id); zpl_exit(zfsvfs, FTAG); /* * We need to call write_cache_pages() again (we can't just * return after the commit) because the previous call in * non-SYNC mode does not guarantee that we got all the dirty * pages (see the implementation of write_cache_pages() for * details). That being said, this is a no-op in most cases. */ wbc->sync_mode = sync_mode; result = zpl_write_cache_pages(mapping, wbc, &for_sync); } return (result); } /* * Write out dirty pages to the ARC, this function is only required to * support mmap(2). Mapped pages may be dirtied by memory operations * which never call .write(). These dirty pages are kept in sync with * the ARC buffers via this hook. */ static int zpl_writepage(struct page *pp, struct writeback_control *wbc) { if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL); return (zpl_putpage(pp, wbc, &for_sync)); } /* * The flag combination which matches the behavior of zfs_space() is * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE * flag was introduced in the 2.6.38 kernel. * * The original mode=0 (allocate space) behavior can be reasonably emulated * by checking if enough space exists and creating a sparse file, as real * persistent space reservation is not possible due to COW, snapshots, etc. */ static long zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len) { cred_t *cr = CRED(); loff_t olen; fstrans_cookie_t cookie; int error = 0; int test_mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE; if ((mode & ~(FALLOC_FL_KEEP_SIZE | test_mode)) != 0) return (-EOPNOTSUPP); if (offset < 0 || len <= 0) return (-EINVAL); spl_inode_lock(ip); olen = i_size_read(ip); crhold(cr); cookie = spl_fstrans_mark(); if (mode & (test_mode)) { flock64_t bf; if (mode & FALLOC_FL_KEEP_SIZE) { if (offset > olen) goto out_unmark; if (offset + len > olen) len = olen - offset; } bf.l_type = F_WRLCK; bf.l_whence = SEEK_SET; bf.l_start = offset; bf.l_len = len; bf.l_pid = 0; error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr); } else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) { unsigned int percent = zfs_fallocate_reserve_percent; struct kstatfs statfs; /* Legacy mode, disable fallocate compatibility. */ if (percent == 0) { error = -EOPNOTSUPP; goto out_unmark; } /* * Use zfs_statvfs() instead of dmu_objset_space() since it * also checks project quota limits, which are relevant here. */ error = zfs_statvfs(ip, &statfs); if (error) goto out_unmark; /* * Shrink available space a bit to account for overhead/races. * We know the product previously fit into availbytes from * dmu_objset_space(), so the smaller product will also fit. */ if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) { error = -ENOSPC; goto out_unmark; } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen) error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE); } out_unmark: spl_fstrans_unmark(cookie); spl_inode_unlock(ip); crfree(cr); return (error); } static long zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len) { return zpl_fallocate_common(file_inode(filp), mode, offset, len); } static int zpl_ioctl_getversion(struct file *filp, void __user *arg) { uint32_t generation = file_inode(filp)->i_generation; return (copy_to_user(arg, &generation, sizeof (generation))); } static int zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) { struct inode *ip = file_inode(filp); znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os = zfsvfs->z_os; int error = 0; if (S_ISFIFO(ip->i_mode)) return (-ESPIPE); if (offset < 0 || len < 0) return (-EINVAL); if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_WILLNEED: #ifdef HAVE_GENERIC_FADVISE if (zn_has_cached_data(zp, offset, offset + len - 1)) error = generic_fadvise(filp, offset, len, advice); #endif /* * Pass on the caller's size directly, but note that * dmu_prefetch_max will effectively cap it. If there * really is a larger sequential access pattern, perhaps * dmu_zfetch will detect it. */ if (len == 0) len = i_size_read(ip) - offset; dmu_prefetch(os, zp->z_id, 0, offset, len, ZIO_PRIORITY_ASYNC_READ); break; case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: case POSIX_FADV_DONTNEED: case POSIX_FADV_NOREUSE: /* ignored for now */ break; default: error = -EINVAL; break; } zfs_exit(zfsvfs, FTAG); return (error); } #define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) #define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) static uint32_t __zpl_ioctl_getflags(struct inode *ip) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; uint32_t ioctl_flags = 0; if (zfs_flags & ZFS_IMMUTABLE) ioctl_flags |= FS_IMMUTABLE_FL; if (zfs_flags & ZFS_APPENDONLY) ioctl_flags |= FS_APPEND_FL; if (zfs_flags & ZFS_NODUMP) ioctl_flags |= FS_NODUMP_FL; if (zfs_flags & ZFS_PROJINHERIT) ioctl_flags |= ZFS_PROJINHERIT_FL; return (ioctl_flags & ZFS_FL_USER_VISIBLE); } /* * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file * attributes common to both Linux and Solaris are mapped. */ static int zpl_ioctl_getflags(struct file *filp, void __user *arg) { uint32_t flags; int err; flags = __zpl_ioctl_getflags(file_inode(filp)); err = copy_to_user(arg, &flags, sizeof (flags)); return (err); } /* * fchange() is a helper macro to detect if we have been asked to change a * flag. This is ugly, but the requirement that we do this is a consequence of * how the Linux file attribute interface was designed. Another consequence is * that concurrent modification of files suffers from a TOCTOU race. Neither * are things we can fix without modifying the kernel-userland interface, which * is outside of our jurisdiction. */ #define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1))) static int __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; xoptattr_t *xoap; if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | ZFS_PROJINHERIT_FL)) return (-EOPNOTSUPP); if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE) return (-EACCES); if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) || fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) && !capable(CAP_LINUX_IMMUTABLE)) return (-EPERM); if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EACCES); xva_init(xva); xoap = xva_getxoptattr(xva); #define FLAG_CHANGE(iflag, zflag, xflag, xfield) do { \ if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) || \ ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) { \ XVA_SET_REQ(xva, (xflag)); \ (xfield) = ((ioctl_flags & (iflag)) != 0); \ } \ } while (0) FLAG_CHANGE(FS_IMMUTABLE_FL, ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable); FLAG_CHANGE(FS_APPEND_FL, ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly); FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump); FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT, xoap->xoa_projinherit); #undef FLAG_CHANGE return (0); } static int zpl_ioctl_setflags(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); uint32_t flags; cred_t *cr = CRED(); xvattr_t xva; int err; fstrans_cookie_t cookie; if (copy_from_user(&flags, arg, sizeof (flags))) return (-EFAULT); err = __zpl_ioctl_setflags(ip, flags, &xva); if (err) return (err); crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); return (err); } static int zpl_ioctl_getxattr(struct file *filp, void __user *arg) { zfsxattr_t fsx = { 0 }; struct inode *ip = file_inode(filp); int err; fsx.fsx_xflags = __zpl_ioctl_getflags(ip); fsx.fsx_projid = ITOZ(ip)->z_projid; err = copy_to_user(arg, &fsx, sizeof (fsx)); return (err); } static int zpl_ioctl_setxattr(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); zfsxattr_t fsx; cred_t *cr = CRED(); xvattr_t xva; xoptattr_t *xoap; int err; fstrans_cookie_t cookie; if (copy_from_user(&fsx, arg, sizeof (fsx))) return (-EFAULT); if (!zpl_is_valid_projid(fsx.fsx_projid)) return (-EINVAL); err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva); if (err) return (err); xoap = xva_getxoptattr(&xva); XVA_SET_REQ(&xva, XAT_PROJID); xoap->xoa_projid = fsx.fsx_projid; crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); return (err); } /* * Expose Additional File Level Attributes of ZFS. */ static int zpl_ioctl_getdosflags(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); uint64_t dosflags = ITOZ(ip)->z_pflags; dosflags &= ZFS_DOS_FL_USER_VISIBLE; int err = copy_to_user(arg, &dosflags, sizeof (dosflags)); return (err); } static int __zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; xoptattr_t *xoap; if (ioctl_flags & (~ZFS_DOS_FL_USER_VISIBLE)) return (-EOPNOTSUPP); if ((fchange(ioctl_flags, zfs_flags, ZFS_IMMUTABLE, ZFS_IMMUTABLE) || fchange(ioctl_flags, zfs_flags, ZFS_APPENDONLY, ZFS_APPENDONLY)) && !capable(CAP_LINUX_IMMUTABLE)) return (-EPERM); if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EACCES); xva_init(xva); xoap = xva_getxoptattr(xva); #define FLAG_CHANGE(iflag, xflag, xfield) do { \ if (((ioctl_flags & (iflag)) && !(zfs_flags & (iflag))) || \ ((zfs_flags & (iflag)) && !(ioctl_flags & (iflag)))) { \ XVA_SET_REQ(xva, (xflag)); \ (xfield) = ((ioctl_flags & (iflag)) != 0); \ } \ } while (0) FLAG_CHANGE(ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable); FLAG_CHANGE(ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly); FLAG_CHANGE(ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump); FLAG_CHANGE(ZFS_READONLY, XAT_READONLY, xoap->xoa_readonly); FLAG_CHANGE(ZFS_HIDDEN, XAT_HIDDEN, xoap->xoa_hidden); FLAG_CHANGE(ZFS_SYSTEM, XAT_SYSTEM, xoap->xoa_system); FLAG_CHANGE(ZFS_ARCHIVE, XAT_ARCHIVE, xoap->xoa_archive); FLAG_CHANGE(ZFS_NOUNLINK, XAT_NOUNLINK, xoap->xoa_nounlink); FLAG_CHANGE(ZFS_REPARSE, XAT_REPARSE, xoap->xoa_reparse); FLAG_CHANGE(ZFS_OFFLINE, XAT_OFFLINE, xoap->xoa_offline); FLAG_CHANGE(ZFS_SPARSE, XAT_SPARSE, xoap->xoa_sparse); #undef FLAG_CHANGE return (0); } /* * Set Additional File Level Attributes of ZFS. */ static int zpl_ioctl_setdosflags(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); uint64_t dosflags; cred_t *cr = CRED(); xvattr_t xva; int err; fstrans_cookie_t cookie; if (copy_from_user(&dosflags, arg, sizeof (dosflags))) return (-EFAULT); err = __zpl_ioctl_setdosflags(ip, dosflags, &xva); if (err) return (err); crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); return (err); } static long zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case FS_IOC_GETVERSION: return (zpl_ioctl_getversion(filp, (void *)arg)); case FS_IOC_GETFLAGS: return (zpl_ioctl_getflags(filp, (void *)arg)); case FS_IOC_SETFLAGS: return (zpl_ioctl_setflags(filp, (void *)arg)); case ZFS_IOC_FSGETXATTR: return (zpl_ioctl_getxattr(filp, (void *)arg)); case ZFS_IOC_FSSETXATTR: return (zpl_ioctl_setxattr(filp, (void *)arg)); case ZFS_IOC_GETDOSFLAGS: return (zpl_ioctl_getdosflags(filp, (void *)arg)); case ZFS_IOC_SETDOSFLAGS: return (zpl_ioctl_setdosflags(filp, (void *)arg)); case ZFS_IOC_COMPAT_FICLONE: return (zpl_ioctl_ficlone(filp, (void *)arg)); case ZFS_IOC_COMPAT_FICLONERANGE: return (zpl_ioctl_ficlonerange(filp, (void *)arg)); case ZFS_IOC_COMPAT_FIDEDUPERANGE: return (zpl_ioctl_fideduperange(filp, (void *)arg)); default: return (-ENOTTY); } } #ifdef CONFIG_COMPAT static long zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; break; case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; default: return (-ENOTTY); } return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg))); } #endif /* CONFIG_COMPAT */ const struct address_space_operations zpl_address_space_operations = { #ifdef HAVE_VFS_READPAGES .readpages = zpl_readpages, #else .readahead = zpl_readahead, #endif #ifdef HAVE_VFS_READ_FOLIO .read_folio = zpl_read_folio, #else .readpage = zpl_readpage, #endif .writepage = zpl_writepage, .writepages = zpl_writepages, .direct_IO = zpl_direct_IO, #ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS .set_page_dirty = __set_page_dirty_nobuffers, #endif #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO .dirty_folio = filemap_dirty_folio, #endif #ifdef HAVE_VFS_MIGRATE_FOLIO .migrate_folio = migrate_folio, #else .migratepage = migrate_page, #endif }; const struct file_operations zpl_file_operations = { .open = zpl_open, .release = zpl_release, .llseek = zpl_llseek, .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, #ifdef HAVE_VFS_IOV_ITER #ifdef HAVE_COPY_SPLICE_READ .splice_read = copy_splice_read, #else .splice_read = generic_file_splice_read, #endif .splice_write = iter_file_splice_write, #endif .mmap = zpl_mmap, .fsync = zpl_fsync, .fallocate = zpl_fallocate, .copy_file_range = zpl_copy_file_range, #ifdef HAVE_VFS_CLONE_FILE_RANGE .clone_file_range = zpl_clone_file_range, #endif #ifdef HAVE_VFS_REMAP_FILE_RANGE .remap_file_range = zpl_remap_file_range, #endif #ifdef HAVE_VFS_DEDUPE_FILE_RANGE .dedupe_file_range = zpl_dedupe_file_range, #endif .fadvise = zpl_fadvise, .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif }; const struct file_operations zpl_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = zpl_iterate, .fsync = zpl_fsync, .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif }; -/* CSTYLED */ module_param(zfs_fallocate_reserve_percent, uint, 0644); MODULE_PARM_DESC(zfs_fallocate_reserve_percent, "Percentage of length to use for the available capacity check"); diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 47aa6417068d..7c9aae6a66af 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1,1943 +1,1940 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2024, Rob Norris * Copyright (c) 2024, Klara, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, boolean_t force_sync); static unsigned int zvol_major = ZVOL_MAJOR; static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; /* * Switch taskq at multiple of 512 MB offset. This can be set to a lower value * to utilize more threads for small files but may affect prefetch hits. */ #define ZVOL_TASKQ_OFFSET_SHIFT 29 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS static unsigned int zvol_open_timeout_ms = 1000; #endif static unsigned int zvol_threads = 0; static unsigned int zvol_blk_mq_threads = 0; static unsigned int zvol_blk_mq_actual_threads; static boolean_t zvol_use_blk_mq = B_FALSE; /* * The maximum number of volblocksize blocks to process per thread. Typically, * write heavy workloads preform better with higher values here, and read * heavy workloads preform better with lower values, but that's not a hard * and fast rule. It's basically a knob to tune between "less overhead with * less parallelism" and "more overhead, but more parallelism". * * '8' was chosen as a reasonable, balanced, default based off of sequential * read and write tests to a zvol in an NVMe pool (with 16 CPUs). */ static unsigned int zvol_blk_mq_blocks_per_thread = 8; static unsigned int zvol_num_taskqs = 0; #ifndef BLKDEV_DEFAULT_RQ /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ #endif /* * Finalize our BIO or request. */ static inline void zvol_end_io(struct bio *bio, struct request *rq, int error) { if (bio) { bio->bi_status = errno_to_bi_status(-error); bio_endio(bio); } else { blk_mq_end_request(rq, errno_to_bi_status(error)); } } static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; static unsigned int zvol_actual_blk_mq_queue_depth; struct zvol_state_os { struct gendisk *zvo_disk; /* generic disk */ struct request_queue *zvo_queue; /* request queue */ dev_t zvo_dev; /* device id */ struct blk_mq_tag_set tag_set; /* Set from the global 'zvol_use_blk_mq' at zvol load */ boolean_t use_blk_mq; }; typedef struct zv_taskq { uint_t tqs_cnt; taskq_t **tqs_taskq; } zv_taskq_t; static zv_taskq_t zvol_taskqs; static struct ida zvol_ida; typedef struct zv_request_stack { zvol_state_t *zv; struct bio *bio; struct request *rq; } zv_request_t; typedef struct zv_work { struct request *rq; struct work_struct work; } zv_work_t; typedef struct zv_request_task { zv_request_t zvr; taskq_ent_t ent; } zv_request_task_t; static zv_request_task_t * zv_request_task_create(zv_request_t zvr) { zv_request_task_t *task; task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); taskq_init_ent(&task->ent); task->zvr = zvr; return (task); } static void zv_request_task_free(zv_request_task_t *task) { kmem_free(task, sizeof (*task)); } /* * This is called when a new block multiqueue request comes in. A request * contains one or more BIOs. */ static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; zvol_state_t *zv = rq->q->queuedata; /* Tell the kernel that we are starting to process this request */ blk_mq_start_request(rq); if (blk_rq_is_passthrough(rq)) { /* Skip non filesystem request */ blk_mq_end_request(rq, BLK_STS_IOERR); return (BLK_STS_IOERR); } zvol_request_impl(zv, NULL, rq, 0); /* Acknowledge to the kernel that we got this request */ return (BLK_STS_OK); } static struct blk_mq_ops zvol_blk_mq_queue_ops = { .queue_rq = zvol_mq_queue_rq, }; /* Initialize our blk-mq struct */ static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) { struct zvol_state_os *zso = zv->zv_zso; memset(&zso->tag_set, 0, sizeof (zso->tag_set)); /* Initialize tag set. */ zso->tag_set.ops = &zvol_blk_mq_queue_ops; zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; zso->tag_set.numa_node = NUMA_NO_NODE; zso->tag_set.cmd_size = 0; /* * We need BLK_MQ_F_BLOCKING here since we do blocking calls in * zvol_request_impl() */ zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; zso->tag_set.driver_data = zv; return (blk_mq_alloc_tag_set(&zso->tag_set)); } /* * Given a path, return TRUE if path is a ZVOL. */ boolean_t zvol_os_is_zvol(const char *path) { dev_t dev = 0; if (vdev_lookup_bdev(path, &dev) != 0) return (B_FALSE); if (MAJOR(dev) == zvol_major) return (B_TRUE); return (B_FALSE); } static void zvol_write(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; zvol_state_t *zv = zvr->zv; struct request_queue *q; struct gendisk *disk; unsigned long start_time = 0; boolean_t acct = B_FALSE; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); q = zv->zv_zso->zvo_queue; disk = zv->zv_zso->zvo_disk; /* bio marked as FLUSH need to flush before write */ if (io_is_flush(bio, rq)) zil_commit(zv->zv_zilog, ZVOL_OBJ); /* Some requests are just for flush and nothing else. */ if (io_size(bio, rq) == 0) { rw_exit(&zv->zv_suspend_lock); zvol_end_io(bio, rq, 0); return; } zfs_uio_bvec_init(&uio, bio, rq); ssize_t start_resid = uio.uio_resid; /* * With use_blk_mq, accounting is done by blk_mq_start_request() * and blk_mq_end_request(), so we can skip it here. */ if (bio) { acct = blk_queue_io_stat(q); if (acct) { start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); } } boolean_t sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_WRITER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio.uio_loffset; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); break; } error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); if (error == 0) { zvol_log_write(zv, tx, off, bytes, sync); } dmu_tx_commit(tx); if (error) break; } zfs_rangelock_exit(lr); int64_t nwritten = start_resid - uio.uio_resid; dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); task_io_account_write(nwritten); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); } zvol_end_io(bio, rq, -error); } static void zvol_write_task(void *arg) { zv_request_task_t *task = arg; zvol_write(&task->zvr); zv_request_task_free(task); } static void zvol_discard(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; zvol_state_t *zv = zvr->zv; uint64_t start = io_offset(bio, rq); uint64_t size = io_size(bio, rq); uint64_t end = start + size; boolean_t sync; int error = 0; dmu_tx_t *tx; struct request_queue *q = zv->zv_zso->zvo_queue; struct gendisk *disk = zv->zv_zso->zvo_disk; unsigned long start_time = 0; boolean_t acct = B_FALSE; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); if (bio) { acct = blk_queue_io_stat(q); if (acct) { start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); } } sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; if (end > zv->zv_volsize) { error = SET_ERROR(EIO); goto unlock; } /* * Align the request to volume block boundaries when a secure erase is * not required. This will prevent dnode_free_range() from zeroing out * the unaligned parts which is slow (read-modify-write) and useless * since we are not freeing any space by doing so. */ if (!io_is_secure_erase(bio, rq)) { start = P2ROUNDUP(start, zv->zv_volblocksize); end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); size = end - start; } if (start >= end) goto unlock; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, start, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, start, size); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size); } zfs_rangelock_exit(lr); if (error == 0 && sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); unlock: rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); } zvol_end_io(bio, rq, -error); } static void zvol_discard_task(void *arg) { zv_request_task_t *task = arg; zvol_discard(&task->zvr); zv_request_task_free(task); } static void zvol_read(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; boolean_t acct = B_FALSE; zvol_state_t *zv = zvr->zv; struct request_queue *q; struct gendisk *disk; unsigned long start_time = 0; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); zfs_uio_bvec_init(&uio, bio, rq); q = zv->zv_zso->zvo_queue; disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; /* * When blk-mq is being used, accounting is done by * blk_mq_start_request() and blk_mq_end_request(). */ if (bio) { acct = blk_queue_io_stat(q); if (acct) start_time = blk_generic_start_io_acct(q, disk, READ, bio); } zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_READER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); /* don't read past the end */ if (bytes > volsize - uio.uio_loffset) bytes = volsize - uio.uio_loffset; error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } } zfs_rangelock_exit(lr); int64_t nread = start_resid - uio.uio_resid; dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); task_io_account_read(nread); rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, READ, bio, start_time); } zvol_end_io(bio, rq, -error); } static void zvol_read_task(void *arg) { zv_request_task_t *task = arg; zvol_read(&task->zvr); zv_request_task_free(task); } /* * Process a BIO or request * * Either 'bio' or 'rq' should be set depending on if we are processing a * bio or a request (both should not be set). * * force_sync: Set to 0 to defer processing to a background taskq * Set to 1 to process data synchronously */ static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, boolean_t force_sync) { fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = io_offset(bio, rq); uint64_t size = io_size(bio, rq); int rw = io_data_dir(bio, rq); if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { zvol_end_io(bio, rq, -SET_ERROR(ENXIO)); goto out; } if (zvol_request_sync || zv->zv_threading == B_FALSE) force_sync = 1; zv_request_t zvr = { .zv = zv, .bio = bio, .rq = rq, }; if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", zv->zv_zso->zvo_disk->disk_name, (long long unsigned)offset, (long unsigned)size); zvol_end_io(bio, rq, -SET_ERROR(EIO)); goto out; } zv_request_task_t *task; zv_taskq_t *ztqs = &zvol_taskqs; uint_t blk_mq_hw_queue = 0; uint_t tq_idx; uint_t taskq_hash; if (rq) #ifdef HAVE_BLK_MQ_RQ_HCTX blk_mq_hw_queue = rq->mq_hctx->queue_num; #else blk_mq_hw_queue = rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; #endif taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, blk_mq_hw_queue); tq_idx = taskq_hash % ztqs->tqs_cnt; if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { zvol_end_io(bio, rq, -SET_ERROR(EROFS)); goto out; } /* * Prevents the zvol from being suspended, or the ZIL being * concurrently opened. Will be released after the i/o * completes. */ rw_enter(&zv->zv_suspend_lock, RW_READER); /* * Open a ZIL if this is the first time we have written to this * zvol. We protect zv->zv_zilog with zv_suspend_lock rather * than zv_state_lock so that we don't need to acquire an * additional lock in this path. */ if (zv->zv_zilog == NULL) { rw_exit(&zv->zv_suspend_lock); rw_enter(&zv->zv_suspend_lock, RW_WRITER); if (zv->zv_zilog == NULL) { zv->zv_zilog = zil_open(zv->zv_objset, zvol_get_data, &zv->zv_kstat.dk_zil_sums); zv->zv_flags |= ZVOL_WRITTEN_TO; /* replay / destroy done in zvol_create_minor */ VERIFY0((zv->zv_zilog->zl_header->zh_flags & ZIL_REPLAY_NEEDED)); } rw_downgrade(&zv->zv_suspend_lock); } /* * We don't want this thread to be blocked waiting for i/o to * complete, so we instead wait from a taskq callback. The * i/o may be a ZIL write (via zil_commit()), or a read of an * indirect block, or a read of a data block (if this is a * partial-block write). We will indicate that the i/o is * complete by calling END_IO() from the taskq callback. * * This design allows the calling thread to continue and * initiate more concurrent operations by calling * zvol_request() again. There are typically only a small * number of threads available to call zvol_request() (e.g. * one per iSCSI target), so keeping the latency of * zvol_request() low is important for performance. * * The zvol_request_sync module parameter allows this * behavior to be altered, for performance evaluation * purposes. If the callback blocks, setting * zvol_request_sync=1 will result in much worse performance. * * We can have up to zvol_threads concurrent i/o's being * processed for all zvols on the system. This is typically * a vast improvement over the zvol_request_sync=1 behavior * of one i/o at a time per zvol. However, an even better * design would be for zvol_request() to initiate the zio * directly, and then be notified by the zio_done callback, * which would call END_IO(). Unfortunately, the DMU/ZIL * interfaces lack this functionality (they block waiting for * the i/o to complete). */ if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { if (force_sync) { zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_discard_task, task, 0, &task->ent); } } else { if (force_sync) { zvol_write(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_write_task, task, 0, &task->ent); } } } else { /* * The SCST driver, and possibly others, may issue READ I/Os * with a length of zero bytes. These empty I/Os contain no * data and require no additional handling. */ if (size == 0) { zvol_end_io(bio, rq, 0); goto out; } rw_enter(&zv->zv_suspend_lock, RW_READER); /* See comment in WRITE case above. */ if (force_sync) { zvol_read(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_read_task, task, 0, &task->ent); } } out: spl_fstrans_unmark(cookie); } #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID static void zvol_submit_bio(struct bio *bio) #else static blk_qc_t zvol_submit_bio(struct bio *bio) #endif #else static MAKE_REQUEST_FN_RET zvol_request(struct request_queue *q, struct bio *bio) #endif { #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bio->bi_bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif #endif zvol_state_t *zv = q->queuedata; zvol_request_impl(zv, bio, NULL, 0); #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) return (BLK_QC_T_NONE); #endif } static int #ifdef HAVE_BLK_MODE_T zvol_open(struct gendisk *disk, blk_mode_t flag) #else zvol_open(struct block_device *bdev, fmode_t flag) #endif { zvol_state_t *zv; int error = 0; boolean_t drop_suspend = B_FALSE; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); hrtime_t start = gethrtime(); retry: #endif rw_enter(&zvol_state_lock, RW_READER); /* * Obtain a copy of private_data under the zvol_state_lock to make * sure that either the result of zvol free code path setting * disk->private_data to NULL is observed, or zvol_os_free() * is not called on this zv because of the positive zv_open_count. */ #ifdef HAVE_BLK_MODE_T zv = disk->private_data; #else zv = bdev->bd_disk->private_data; #endif if (zv == NULL) { rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { mutex_exit(&zv->zv_state_lock); rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } /* * Make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); } else { drop_suspend = B_TRUE; } } else { drop_suspend = B_TRUE; } } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { boolean_t drop_namespace = B_FALSE; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); /* * In all other call paths the spa_namespace_lock is taken * before the bdev->bd_mutex lock. However, on open(2) * the __blkdev_get() function calls fops->open() with the * bdev->bd_mutex lock held. This can result in a deadlock * when zvols from one pool are used as vdevs in another. * * To prevent a lock inversion deadlock we preemptively * take the spa_namespace_lock. Normally the lock will not * be contended and this is safe because spa_open_common() * handles the case where the caller already holds the * spa_namespace_lock. * * When the lock cannot be aquired after multiple retries * this must be the vdev on zvol deadlock case and we have * no choice but to return an error. For 5.12 and older * kernels returning -ERESTARTSYS will result in the * bdev->bd_mutex being dropped, then reacquired, and * fops->open() being called again. This process can be * repeated safely until both locks are acquired. For 5.13 * and newer the -ERESTARTSYS retry logic was removed from * the kernel so the only option is to return the error for * the caller to handle it. */ if (!mutex_owned(&spa_namespace_lock)) { if (!mutex_tryenter(&spa_namespace_lock)) { mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; #ifdef HAVE_BLKDEV_GET_ERESTARTSYS schedule(); return (-SET_ERROR(ERESTARTSYS)); #else if ((gethrtime() - start) > timeout) return (-SET_ERROR(ERESTARTSYS)); schedule_timeout_interruptible( MSEC_TO_TICK(10)); goto retry; #endif } else { drop_namespace = B_TRUE; } } error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); if (drop_namespace) mutex_exit(&spa_namespace_lock); } if (error == 0) { if ((blk_mode_is_open_write(flag)) && (zv->zv_flags & ZVOL_RDONLY)) { if (zv->zv_open_count == 0) zvol_last_close(zv); error = -SET_ERROR(EROFS); } else { zv->zv_open_count++; } } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); if (error == 0) #ifdef HAVE_BLK_MODE_T disk_check_media_change(disk); #else zfs_check_media_change(bdev); #endif return (error); } static void #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG zvol_release(struct gendisk *disk) #else zvol_release(struct gendisk *disk, fmode_t unused) #endif { #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) (void) unused; #endif zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, RW_READER); zv = disk->private_data; mutex_enter(&zv->zv_state_lock); ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 1) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); zv->zv_open_count--; if (zv->zv_open_count == 0) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); } static int zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; ASSERT3U(zv->zv_open_count, >, 0); switch (cmd) { case BLKFLSBUF: #ifdef HAVE_FSYNC_BDEV fsync_bdev(bdev); #elif defined(HAVE_SYNC_BLOCKDEV) sync_blockdev(bdev); #else #error "Neither fsync_bdev() nor sync_blockdev() found" #endif invalidate_bdev(bdev); rw_enter(&zv->zv_suspend_lock, RW_READER); if (!(zv->zv_flags & ZVOL_RDONLY)) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); rw_exit(&zv->zv_suspend_lock); break; case BLKZNAME: mutex_enter(&zv->zv_state_lock); error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); mutex_exit(&zv->zv_state_lock); break; default: error = -ENOTTY; break; } return (SET_ERROR(error)); } #ifdef CONFIG_COMPAT static int zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { return (zvol_ioctl(bdev, mode, cmd, arg)); } #else #define zvol_compat_ioctl NULL #endif static unsigned int zvol_check_events(struct gendisk *disk, unsigned int clearing) { unsigned int mask = 0; rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; zv->zv_changed = 0; mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (mask); } static int zvol_revalidate_disk(struct gendisk *disk) { rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> SECTOR_BITS); mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (0); } int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) { struct gendisk *disk = zv->zv_zso->zvo_disk; #if defined(HAVE_REVALIDATE_DISK_SIZE) revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); #elif defined(HAVE_REVALIDATE_DISK) revalidate_disk(disk); #else zvol_revalidate_disk(disk); #endif return (0); } void zvol_os_clear_private(zvol_state_t *zv) { /* * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. */ zv->zv_zso->zvo_disk->private_data = NULL; } /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very * tiny devices. For devices over 1 Mib a standard head and sector count * is used to keep the cylinders count reasonable. */ static int zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { zvol_state_t *zv = bdev->bd_disk->private_data; sector_t sectors; ASSERT3U(zv->zv_open_count, >, 0); sectors = get_capacity(zv->zv_zso->zvo_disk); if (sectors > 2048) { geo->heads = 16; geo->sectors = 63; } else { geo->heads = 2; geo->sectors = 4; } geo->start = 0; geo->cylinders = sectors / (geo->heads * geo->sectors); return (0); } /* * Why have two separate block_device_operations structs? * * Normally we'd just have one, and assign 'submit_bio' as needed. However, * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we * can't just change submit_bio dynamically at runtime. So just create two * separate structs to get around this. */ static const struct block_device_operations zvol_ops_blk_mq = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, .check_events = zvol_check_events, #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK .revalidate_disk = zvol_revalidate_disk, #endif .getgeo = zvol_getgeo, .owner = THIS_MODULE, }; static const struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, .check_events = zvol_check_events, #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK .revalidate_disk = zvol_revalidate_disk, #endif .getgeo = zvol_getgeo, .owner = THIS_MODULE, #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS .submit_bio = zvol_submit_bio, #endif }; /* * Since 6.9, Linux has been removing queue limit setters in favour of an * initial queue_limits struct applied when the device is open. Since 6.11, * queue_limits is being extended to allow more things to be applied when the * device is open. Setters are also being removed for this. * * For OpenZFS, this means that depending on kernel version, some options may * be set up before the device is open, and some applied to an open device * (queue) after the fact. * * We manage this complexity by having our own limits struct, * zvol_queue_limits_t, in which we carry any queue config that we're * interested in setting. This structure is the same on all kernels. * * These limits are then applied to the queue at device open time by the most * appropriate method for the kernel. * * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of * blk_alloc_disk() exists). This converts our limits struct to a proper Linux * struct queue_limits, and passes it in. Any fields added in later kernels are * (obviously) not set up here. * * zvol_queue_limits_apply() is called on all kernel versions after the queue * is created, and applies any remaining config. Before 6.9 that will be * everything, via setter methods. After 6.9 that will be whatever couldn't be * put into struct queue_limits. (This implies that zvol_queue_limits_apply() * will always be a no-op on the latest kernel we support). */ typedef struct zvol_queue_limits { unsigned int zql_max_hw_sectors; unsigned short zql_max_segments; unsigned int zql_max_segment_size; unsigned int zql_io_opt; unsigned int zql_physical_block_size; unsigned int zql_max_discard_sectors; unsigned int zql_discard_granularity; } zvol_queue_limits_t; static void zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, boolean_t use_blk_mq) { limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9; if (use_blk_mq) { /* * IO requests can be really big (1MB). When an IO request * comes in, it is passed off to zvol_read() or zvol_write() * in a new thread, where it is chunked up into 'volblocksize' * sized pieces and processed. So for example, if the request * is a 1MB write and your volblocksize is 128k, one zvol_write * thread will take that request and sequentially do ten 128k * IOs. This is due to the fact that the thread needs to lock * each volblocksize sized block. So you might be wondering: * "instead of passing the whole 1MB request to one thread, * why not pass ten individual 128k chunks to ten threads and * process the whole write in parallel?" The short answer is * that there's a sweet spot number of chunks that balances * the greater parallelism with the added overhead of more * threads. The sweet spot can be different depending on if you * have a read or write heavy workload. Writes typically want * high chunk counts while reads typically want lower ones. On * a test pool with 6 NVMe drives in a 3x 2-disk mirror * configuration, with volblocksize=8k, the sweet spot for good * sequential reads and writes was at 8 chunks. */ /* * Below we tell the kernel how big we want our requests * to be. You would think that blk_queue_io_opt() would be * used to do this since it is used to "set optimal request * size for the queue", but that doesn't seem to do * anything - the kernel still gives you huge requests * with tons of little PAGE_SIZE segments contained within it. * * Knowing that the kernel will just give you PAGE_SIZE segments * no matter what, you can say "ok, I want PAGE_SIZE byte * segments, and I want 'N' of them per request", where N is * the correct number of segments for the volblocksize and * number of chunks you want. */ if (zvol_blk_mq_blocks_per_thread != 0) { unsigned int chunks; chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); limits->zql_max_segment_size = PAGE_SIZE; limits->zql_max_segments = (zv->zv_volblocksize * chunks) / PAGE_SIZE; } else { /* * Special case: zvol_blk_mq_blocks_per_thread = 0 * Max everything out. */ limits->zql_max_segments = UINT16_MAX; limits->zql_max_segment_size = UINT_MAX; } } else { limits->zql_max_segments = UINT16_MAX; limits->zql_max_segment_size = UINT_MAX; } limits->zql_io_opt = DMU_MAX_ACCESS / 2; limits->zql_physical_block_size = zv->zv_volblocksize; limits->zql_max_discard_sectors = (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9; limits->zql_discard_granularity = zv->zv_volblocksize; } #ifdef HAVE_BLK_ALLOC_DISK_2ARG static void zvol_queue_limits_convert(zvol_queue_limits_t *limits, struct queue_limits *qlimits) { memset(qlimits, 0, sizeof (struct queue_limits)); qlimits->max_hw_sectors = limits->zql_max_hw_sectors; qlimits->max_segments = limits->zql_max_segments; qlimits->max_segment_size = limits->zql_max_segment_size; qlimits->io_opt = limits->zql_io_opt; qlimits->physical_block_size = limits->zql_physical_block_size; qlimits->max_discard_sectors = limits->zql_max_discard_sectors; qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors; qlimits->discard_granularity = limits->zql_discard_granularity; #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES qlimits->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; #endif } #endif static void zvol_queue_limits_apply(zvol_queue_limits_t *limits, struct request_queue *queue) { #ifndef HAVE_BLK_ALLOC_DISK_2ARG blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); blk_queue_max_segments(queue, limits->zql_max_segments); blk_queue_max_segment_size(queue, limits->zql_max_segment_size); blk_queue_io_opt(queue, limits->zql_io_opt); blk_queue_physical_block_size(queue, limits->zql_physical_block_size); blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors); blk_queue_discard_granularity(queue, limits->zql_discard_granularity); #endif #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES blk_queue_set_write_cache(queue, B_TRUE); blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); #endif } static int zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) { #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) #if defined(HAVE_BLK_ALLOC_DISK) zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); if (zso->zvo_disk == NULL) return (1); zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) struct queue_limits qlimits; zvol_queue_limits_convert(limits, &qlimits); struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE); if (IS_ERR(disk)) { zso->zvo_disk = NULL; return (1); } zso->zvo_disk = disk; zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; #else zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); if (zso->zvo_queue == NULL) return (1); zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); return (1); } zso->zvo_disk->queue = zso->zvo_queue; #endif /* HAVE_BLK_ALLOC_DISK */ #else zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); if (zso->zvo_queue == NULL) return (1); zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); return (1); } zso->zvo_disk->queue = zso->zvo_queue; #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ zvol_queue_limits_apply(limits, zso->zvo_queue); return (0); } static int zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) { struct zvol_state_os *zso = zv->zv_zso; /* Allocate our blk-mq tag_set */ if (zvol_blk_mq_alloc_tag_set(zv) != 0) return (1); #if defined(HAVE_BLK_ALLOC_DISK) zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); if (zso->zvo_disk == NULL) { blk_mq_free_tag_set(&zso->tag_set); return (1); } zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_disk->minors = ZVOL_MINORS; #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) struct queue_limits qlimits; zvol_queue_limits_convert(limits, &qlimits); struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv); if (IS_ERR(disk)) { zso->zvo_disk = NULL; blk_mq_free_tag_set(&zso->tag_set); return (1); } zso->zvo_disk = disk; zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_disk->minors = ZVOL_MINORS; #else zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); blk_mq_free_tag_set(&zso->tag_set); return (1); } /* Allocate queue */ zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); if (IS_ERR(zso->zvo_queue)) { blk_mq_free_tag_set(&zso->tag_set); return (1); } /* Our queue is now created, assign it to our disk */ zso->zvo_disk->queue = zso->zvo_queue; #endif zvol_queue_limits_apply(limits, zso->zvo_queue); return (0); } /* * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. */ static zvol_state_t * zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize) { zvol_state_t *zv; struct zvol_state_os *zso; uint64_t volmode; int ret; if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) return (NULL); if (volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; if (volmode == ZFS_VOLMODE_NONE) return (NULL); zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso = zso; zv->zv_volmode = volmode; zv->zv_volblocksize = volblocksize; list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); zv->zv_zso->use_blk_mq = zvol_use_blk_mq; zvol_queue_limits_t limits; zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq); /* * The block layer has 3 interfaces for getting BIOs: * * 1. blk-mq request queues (new) * 2. submit_bio() (oldest) * 3. regular request queues (old). * * Each of those interfaces has two permutations: * * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates * both the disk and its queue (5.14 kernel or newer) * * b) We don't have blk_*alloc_disk(), and have to allocate the * disk and the queue separately. (5.13 kernel or older) */ if (zv->zv_zso->use_blk_mq) { ret = zvol_alloc_blk_mq(zv, &limits); zso->zvo_disk->fops = &zvol_ops_blk_mq; } else { ret = zvol_alloc_non_blk_mq(zso, &limits); zso->zvo_disk->fops = &zvol_ops; } if (ret != 0) goto out_kmem; /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zso->zvo_queue, 1); if (!zv->zv_zso->use_blk_mq) { /* Disable write merging in favor of the ZIO pipeline. */ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); } zso->zvo_queue->queuedata = zv; zso->zvo_dev = dev; zv->zv_open_count = 0; strlcpy(zv->zv_name, name, sizeof (zv->zv_name)); zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zso->zvo_disk->major = zvol_major; zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; /* * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. * This is accomplished by limiting the number of minors for the * device to one and explicitly disabling partition scanning. */ if (volmode == ZFS_VOLMODE_DEV) { zso->zvo_disk->minors = 1; zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; zso->zvo_disk->flags |= GENHD_FL_NO_PART; } zso->zvo_disk->first_minor = (dev & MINORMASK); zso->zvo_disk->private_data = zv; snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", ZVOL_DEV_NAME, (dev & MINORMASK)); return (zv); out_kmem: kmem_free(zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); return (NULL); } /* * Cleanup then free a zvol_state_t which was created by zvol_alloc(). * At this time, the structure is not opened by anyone, is taken off * the zvol_state_list, and has its private data set to NULL. * The zvol_state_lock is dropped. * * This function may take many milliseconds to complete (e.g. we've seen * it take over 256ms), due to the calls to "blk_cleanup_queue" and * "del_gendisk". Thus, consumers need to be careful to account for this * latency when calling this function. */ void zvol_os_free(zvol_state_t *zv) { ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); del_gendisk(zv->zv_zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) #if defined(HAVE_BLK_CLEANUP_DISK) blk_cleanup_disk(zv->zv_zso->zvo_disk); #else put_disk(zv->zv_zso->zvo_disk); #endif #else blk_cleanup_queue(zv->zv_zso->zvo_queue); put_disk(zv->zv_zso->zvo_disk); #endif if (zv->zv_zso->use_blk_mq) blk_mq_free_tag_set(&zv->zv_zso->tag_set); ida_simple_remove(&zvol_ida, MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); cv_destroy(&zv->zv_removing_cv); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); } void zvol_wait_close(zvol_state_t *zv) { } struct add_disk_work { struct delayed_work work; struct gendisk *disk; int error; }; static int __zvol_os_add_disk(struct gendisk *disk) { int error = 0; #ifdef HAVE_ADD_DISK_RET error = add_disk(disk); #else add_disk(disk); #endif return (error); } #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) static void zvol_os_add_disk_work(struct work_struct *work) { struct add_disk_work *add_disk_work; add_disk_work = container_of(work, struct add_disk_work, work.work); add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk); } #endif /* * SPECIAL CASE: * * This function basically calls add_disk() from a workqueue. You may be * thinking: why not just call add_disk() directly? * * When you call add_disk(), the zvol appears to the world. When this happens, * the kernel calls disk_scan_partitions() on the zvol, which behaves * differently on the 6.9+ kernels: * * - 6.8 and older kernels - * disk_scan_partitions() * handle = bdev_open_by_dev( * zvol_open() * bdev_release(handle); * zvol_release() * * * - 6.9+ kernels - * disk_scan_partitions() * file = bdev_file_open_by_dev() * zvol_open() * fput(file) * < wait for return to userspace > * zvol_release() * * The difference is that the bdev_release() from the 6.8 kernel is synchronous * while the fput() from the 6.9 kernel is async. Or more specifically it's * async that has to wait until we return to userspace (since it adds the fput * into the caller's work queue with the TWA_RESUME flag set). This is not the * behavior we want, since we want do things like create+destroy a zvol within * a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the * reference to the zvol while we're in the IOCTL, which can't wait until we * return to userspace. * * We can get around this since fput() has a special codepath for when it's * running in a kernel thread or interrupt. In those cases, it just puts the * fput into the system workqueue, which we can force to run with * __flush_workqueue(). That is why we call add_disk() from a workqueue - so it * run from a kernel thread and "tricks" the fput() codepaths. * * Note that __flush_workqueue() is slowly getting deprecated. This may be ok * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via * fput) to happen, which it eventually, naturally, will from the system_wq * without us explicitly calling __flush_workqueue(). */ static int zvol_os_add_disk(struct gendisk *disk) { #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */ struct add_disk_work add_disk_work; INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work); add_disk_work.disk = disk; add_disk_work.error = 0; /* Use *_delayed_work functions since they're not GPL'd */ schedule_delayed_work(&add_disk_work.work, 0); flush_delayed_work(&add_disk_work.work); __flush_workqueue(system_wq); return (add_disk_work.error); #else /* <= 6.8 kernel */ return (__zvol_os_add_disk(disk)); #endif } /* * Create a block device minor node and setup the linkage between it * and the specified volume. Once this function returns the block * device is live and ready for use. */ int zvol_os_create_minor(const char *name) { zvol_state_t *zv; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; uint64_t len; unsigned minor = 0; int error = 0; int idx; uint64_t hash = zvol_name_hash(name); uint64_t volthreading; bool replayed_zil = B_FALSE; if (zvol_inhibit_dev) return (0); idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; if (MINOR(minor) != minor) { /* too many partitions can cause an overflow */ zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", name, minor, MINOR(minor)); ida_simple_remove(&zvol_ida, idx); return (SET_ERROR(EINVAL)); } zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); ida_simple_remove(&zvol_ida, idx); return (SET_ERROR(EEXIST)); } doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; zv = zvol_alloc(MKDEV(zvol_major, minor), name, doi->doi_data_block_size); if (zv == NULL) { error = SET_ERROR(EAGAIN); goto out_dmu_objset_disown; } zv->zv_hash = hash; if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; zv->zv_volsize = volsize; zv->zv_objset = os; /* Default */ zv->zv_threading = B_TRUE; if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL) == 0) zv->zv_threading = volthreading; set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); #ifdef QUEUE_FLAG_DISCARD blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); #endif #ifdef QUEUE_FLAG_NONROT blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); #endif #ifdef QUEUE_FLAG_ADD_RANDOM blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); #endif /* This flag was introduced in kernel version 4.12. */ #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); #endif ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); if (error) goto out_dmu_objset_disown; ASSERT3P(zv->zv_zilog, ==, NULL); zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); else replayed_zil = zil_replay(os, zv, zvol_replay_vector); } if (replayed_zil) zil_close(zv->zv_zilog); zv->zv_zilog = NULL; /* * When udev detects the addition of the device it will immediately * invoke blkid(8) to determine the type of content on the device. * Prefetching the blocks commonly scanned by blkid(8) will speed * up this process. */ len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); if (len > 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, ZIO_PRIORITY_SYNC_READ); } zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); /* * Keep in mind that once add_disk() is called, the zvol is * announced to the world, and zvol_open()/zvol_release() can * be called at any time. Incidentally, add_disk() itself calls * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() * directly as well. */ if (error == 0) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); rw_exit(&zvol_state_lock); error = zvol_os_add_disk(zv->zv_zso->zvo_disk); } else { ida_simple_remove(&zvol_ida, idx); } return (error); } void zvol_os_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_zso->zvo_disk); ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); /* move to new hashtable entry */ zv->zv_hash = zvol_name_hash(newname); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); /* * The block device's read-only state is briefly changed causing * a KOBJ_CHANGE uevent to be issued. This ensures udev detects * the name change and fixes the symlinks. This does not change * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never * changes. This would normally be done using kobject_uevent() but * that is a GPL-only symbol which is why we need this workaround. */ set_disk_ro(zv->zv_zso->zvo_disk, !readonly); set_disk_ro(zv->zv_zso->zvo_disk, readonly); dataset_kstats_rename(&zv->zv_kstat, newname); } void zvol_os_set_disk_ro(zvol_state_t *zv, int flags) { set_disk_ro(zv->zv_zso->zvo_disk, flags); } void zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) { set_capacity(zv->zv_zso->zvo_disk, capacity); } int zvol_init(void) { int error; /* * zvol_threads is the module param the user passes in. * * zvol_actual_threads is what we use internally, since the user can * pass zvol_thread = 0 to mean "use all the CPUs" (the default). */ static unsigned int zvol_actual_threads; if (zvol_threads == 0) { /* * See dde9380a1 for why 32 was chosen here. This should * probably be refined to be some multiple of the number * of CPUs. */ zvol_actual_threads = MAX(num_online_cpus(), 32); } else { zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); } /* * Use atleast 32 zvol_threads but for many core system, * prefer 6 threads per taskq, but no more taskqs * than threads in them on large systems. * * taskq total * cpus taskqs threads threads * ------- ------- ------- ------- * 1 1 32 32 * 2 1 32 32 * 4 1 32 32 * 8 2 16 32 * 16 3 11 33 * 32 5 7 35 * 64 8 8 64 * 128 11 12 132 * 256 16 16 256 */ zv_taskq_t *ztqs = &zvol_taskqs; uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs); if (num_tqs == 0) { num_tqs = 1 + num_online_cpus() / 6; while (num_tqs * num_tqs > zvol_actual_threads) num_tqs--; } uint_t per_tq_thread = zvol_actual_threads / num_tqs; if (per_tq_thread * num_tqs < zvol_actual_threads) per_tq_thread++; ztqs->tqs_cnt = num_tqs; ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); ztqs->tqs_taskq = NULL; printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } if (zvol_blk_mq_queue_depth == 0) { zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; } else { zvol_actual_blk_mq_queue_depth = MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); } if (zvol_blk_mq_threads == 0) { zvol_blk_mq_actual_threads = num_online_cpus(); } else { zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), 1024); } for (uint_t i = 0; i < num_tqs; i++) { char name[32]; (void) snprintf(name, sizeof (name), "%s_tq-%u", ZVOL_DRIVER, i); ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, maxclsyspri, per_tq_thread, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (ztqs->tqs_taskq[i] == NULL) { for (int j = i - 1; j >= 0; j--) taskq_destroy(ztqs->tqs_taskq[j]); unregister_blkdev(zvol_major, ZVOL_DRIVER); kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); ztqs->tqs_taskq = NULL; return (-ENOMEM); } } zvol_init_impl(); ida_init(&zvol_ida); return (0); } void zvol_fini(void) { zv_taskq_t *ztqs = &zvol_taskqs; zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); if (ztqs->tqs_taskq == NULL) { ASSERT3U(ztqs->tqs_cnt, ==, 0); } else { for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); taskq_destroy(ztqs->tqs_taskq[i]); } kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); ztqs->tqs_taskq = NULL; } ida_destroy(&zvol_ida); } -/* BEGIN CSTYLED */ module_param(zvol_inhibit_dev, uint, 0644); MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_threads, uint, 0444); MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set" - "to 0 to use all active CPUs"); + "to 0 to use all active CPUs"); module_param(zvol_request_sync, uint, 0644); MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); module_param(zvol_num_taskqs, uint, 0444); MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); module_param(zvol_volmode, uint, 0644); MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); module_param(zvol_blk_mq_queue_depth, uint, 0644); MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); module_param(zvol_use_blk_mq, uint, 0644); MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, - "Process volblocksize blocks per thread"); + "Process volblocksize blocks per thread"); #ifndef HAVE_BLKDEV_GET_ERESTARTSYS module_param(zvol_open_timeout_ms, uint, 0644); MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); #endif - -/* END CSTYLED */ diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c index 43bccea14a85..fde8ae28ef36 100644 --- a/module/zcommon/zfs_valstr.c +++ b/module/zcommon/zfs_valstr.c @@ -1,280 +1,274 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2024, Klara Inc. */ #include #include #include #include #include #include "zfs_valstr.h" /* * Each bit in a bitfield has three possible string representations: * - single char * - two-char pair * - full name */ typedef struct { const char vb_bit; const char vb_pair[2]; const char *vb_name; } valstr_bit_t; /* * Emits a character for each bit in `bits`, up to the number of elements * in the table. Set bits get the character in vb_bit, clear bits get a * space. This results in all strings having the same width, for easier * visual comparison. */ static size_t valstr_bitfield_bits(const valstr_bit_t *table, const size_t nelems, uint64_t bits, char *out, size_t outlen) { ASSERT(out); size_t n = 0; for (int b = 0; b < nelems; b++) { if (n == outlen) break; uint64_t mask = (1ULL << b); out[n++] = (bits & mask) ? table[b].vb_bit : ' '; } if (n < outlen) out[n++] = '\0'; return (n); } /* * Emits a two-char pair for each bit set in `bits`, taken from vb_pair, and * separated by a `|` character. This gives a concise representation of the * whole value. */ static size_t valstr_bitfield_pairs(const valstr_bit_t *table, const size_t nelems, uint64_t bits, char *out, size_t outlen) { ASSERT(out); size_t n = 0; for (int b = 0; b < nelems; b++) { ASSERT3U(n, <=, outlen); if (n == outlen) break; uint64_t mask = (1ULL << b); if (bits & mask) { size_t len = (n > 0) ? 3 : 2; if (n > outlen-len) break; if (n > 0) out[n++] = '|'; out[n++] = table[b].vb_pair[0]; out[n++] = table[b].vb_pair[1]; } } if (n < outlen) out[n++] = '\0'; return (n); } /* * Emits the full name for each bit set in `bits`, taken from vb_name, and * separated by a space. This unambiguously shows the entire set of bits, but * can get very long. */ static size_t valstr_bitfield_str(const valstr_bit_t *table, const size_t nelems, uint64_t bits, char *out, size_t outlen) { ASSERT(out); size_t n = 0; for (int b = 0; b < nelems; b++) { ASSERT3U(n, <=, outlen); if (n == outlen) break; uint64_t mask = (1ULL << b); if (bits & mask) { size_t len = strlen(table[b].vb_name); if (n > 0) len++; if (n > outlen-len) break; if (n > 0) { out[n++] = ' '; len--; } memcpy(&out[n], table[b].vb_name, len); n += len; } } if (n < outlen) out[n++] = '\0'; return (n); } /* * Emits the name of the given enum value in the table. */ static size_t valstr_enum_str(const char **table, const size_t nelems, int v, char *out, size_t outlen) { ASSERT(out); ASSERT3U(v, <, nelems); if (v >= nelems) return (0); return (MIN(strlcpy(out, table[v], outlen), outlen)); } /* * These macros create the string tables for the given name, and implement * the public functions described in zfs_valstr.h. */ #define _VALSTR_BITFIELD_IMPL(name, ...) \ static const valstr_bit_t valstr_ ## name ## _table[] = { __VA_ARGS__ };\ size_t \ zfs_valstr_ ## name ## _bits(uint64_t bits, char *out, size_t outlen) \ { \ return (valstr_bitfield_bits(valstr_ ## name ## _table, \ ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \ } \ \ size_t \ zfs_valstr_ ## name ## _pairs(uint64_t bits, char *out, size_t outlen) \ { \ return (valstr_bitfield_pairs(valstr_ ## name ## _table, \ ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \ } \ \ size_t \ zfs_valstr_ ## name(uint64_t bits, char *out, size_t outlen) \ { \ return (valstr_bitfield_str(valstr_ ## name ## _table, \ ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \ } \ #define _VALSTR_ENUM_IMPL(name, ...) \ static const char *valstr_ ## name ## _table[] = { __VA_ARGS__ }; \ size_t \ zfs_valstr_ ## name(int v, char *out, size_t outlen) \ { \ return (valstr_enum_str(valstr_ ## name ## _table, \ ARRAY_SIZE(valstr_ ## name ## _table), v, out, outlen)); \ } \ /* String tables */ /* ZIO flags: zio_flag_t, typically zio->io_flags */ -/* BEGIN CSTYLED */ _VALSTR_BITFIELD_IMPL(zio_flag, { '.', "DA", "DONT_AGGREGATE" }, { '.', "RP", "IO_REPAIR" }, { '.', "SH", "SELF_HEAL" }, { '.', "RS", "RESILVER" }, { '.', "SC", "SCRUB" }, { '.', "ST", "SCAN_THREAD" }, { '.', "PH", "PHYSICAL" }, { '.', "CF", "CANFAIL" }, { '.', "SP", "SPECULATIVE" }, { '.', "CW", "CONFIG_WRITER" }, { '.', "DR", "DONT_RETRY" }, { '?', "??", "[UNUSED 11]" }, { '.', "ND", "NODATA" }, { '.', "ID", "INDUCE_DAMAGE" }, { '.', "AL", "IO_ALLOCATING" }, { '.', "RE", "IO_RETRY" }, { '.', "PR", "PROBE" }, { '.', "TH", "TRYHARD" }, { '.', "OP", "OPTIONAL" }, { '.', "RD", "DIO_READ" }, { '.', "DQ", "DONT_QUEUE" }, { '.', "DP", "DONT_PROPAGATE" }, { '.', "BY", "IO_BYPASS" }, { '.', "RW", "IO_REWRITE" }, { '.', "CM", "RAW_COMPRESS" }, { '.', "EN", "RAW_ENCRYPT" }, { '.', "GG", "GANG_CHILD" }, { '.', "DD", "DDT_CHILD" }, { '.', "GF", "GODFATHER" }, { '.', "NP", "NOPWRITE" }, { '.', "EX", "REEXECUTED" }, { '.', "DG", "DELEGATED" }, { '.', "DC", "DIO_CHKSUM_ERR" }, ) -/* END CSTYLED */ /* * ZIO pipeline stage(s): enum zio_stage, typically zio->io_stage or * zio->io_pipeline. */ -/* BEGIN CSTYLED */ _VALSTR_BITFIELD_IMPL(zio_stage, { 'O', "O ", "OPEN" }, { 'I', "RI", "READ_BP_INIT" }, { 'I', "WI", "WRITE_BP_INIT" }, { 'I', "FI", "FREE_BP_INIT" }, { 'A', "IA", "ISSUE_ASYNC" }, { 'W', "WC", "WRITE_COMPRESS" }, { 'E', "EN", "ENCRYPT" }, { 'C', "CG", "CHECKSUM_GENERATE" }, { 'N', "NW", "NOP_WRITE" }, { 'B', "BF", "BRT_FREE" }, { 'd', "dS", "DDT_READ_START" }, { 'd', "dD", "DDT_READ_DONE" }, { 'd', "dW", "DDT_WRITE" }, { 'd', "dF", "DDT_FREE" }, { 'G', "GA", "GANG_ASSEMBLE" }, { 'G', "GI", "GANG_ISSUE" }, { 'D', "DT", "DVA_THROTTLE" }, { 'D', "DA", "DVA_ALLOCATE" }, { 'D', "DF", "DVA_FREE" }, { 'D', "DC", "DVA_CLAIM" }, { 'R', "R ", "READY" }, { 'V', "VS", "VDEV_IO_START" }, { 'V', "VD", "VDEV_IO_DONE" }, { 'V', "VA", "VDEV_IO_ASSESS" }, { 'C', "CV", "CHECKSUM_VERIFY" }, { 'C', "DC", "DIO_CHECKSUM_VERIFY" }, { 'X', "X ", "DONE" }, ) -/* END CSTYLED */ /* ZIO priority: zio_priority_t, typically zio->io_priority */ -/* BEGIN CSTYLED */ _VALSTR_ENUM_IMPL(zio_priority, "SYNC_READ", "SYNC_WRITE", "ASYNC_READ", "ASYNC_WRITE", "SCRUB", "REMOVAL", "INITIALIZING", "TRIM", "REBUILD", "[NUM_QUEUEABLE]", "NOW", ) -/* END CSTYLED */ #undef _VALSTR_BITFIELD_IMPL #undef _VALSTR_ENUM_IMPL diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 9afee4e208ec..7d94214143ea 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1,1483 +1,1481 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Block Cloning design. * * Block Cloning allows to manually clone a file (or a subset of its blocks) * into another (or the same) file by just creating additional references to * the data blocks without copying the data itself. Those references are kept * in the Block Reference Tables (BRTs). * * In many ways this is similar to the existing deduplication, but there are * some important differences: * * - Deduplication is automatic and Block Cloning is not - one has to use a * dedicated system call(s) to clone the given file/blocks. * - Deduplication keeps all data blocks in its table, even those referenced * just once. Block Cloning creates an entry in its tables only when there * are at least two references to the given data block. If the block was * never explicitly cloned or the second to last reference was dropped, * there will be neither space nor performance overhead. * - Deduplication needs data to work - one needs to pass real data to the * write(2) syscall, so hash can be calculated. Block Cloning doesn't require * data, just block pointers to the data, so it is extremely fast, as we pay * neither the cost of reading the data, nor the cost of writing the data - * we operate exclusively on metadata. * - If the D (dedup) bit is not set in the block pointer, it means that * the block is not in the dedup table (DDT) and we won't consult the DDT * when we need to free the block. Block Cloning must be consulted on every * free, because we cannot modify the source BP (eg. by setting something * similar to the D bit), thus we have no hint if the block is in the * Block Reference Table (BRT), so we need to look into the BRT. There is * an optimization in place that allows us to eliminate the majority of BRT * lookups which is described below in the "Minimizing free penalty" section. * - The BRT entry is much smaller than the DDT entry - for BRT we only store * 64bit offset and 64bit reference counter. * - Dedup keys are cryptographic hashes, so two blocks that are close to each * other on disk are most likely in totally different parts of the DDT. * The BRT entry keys are offsets into a single top-level VDEV, so data blocks * from one file should have BRT entries close to each other. * - Scrub will only do a single pass over a block that is referenced multiple * times in the DDT. Unfortunately it is not currently (if at all) possible * with Block Cloning and block referenced multiple times will be scrubbed * multiple times. The new, sorted scrub should be able to eliminate * duplicated reads given enough memory. * - Deduplication requires cryptographically strong hash as a checksum or * additional data verification. Block Cloning works with any checksum * algorithm or even with checksumming disabled. * * As mentioned above, the BRT entries are much smaller than the DDT entries. * To uniquely identify a block we just need its vdev id and offset. We also * need to maintain a reference counter. The vdev id will often repeat, as there * is a small number of top-level VDEVs and a large number of blocks stored in * each VDEV. We take advantage of that to reduce the BRT entry size further by * maintaining one BRT for each top-level VDEV, so we can then have only offset * and counter as the BRT entry. * * Minimizing free penalty. * * Block Cloning allows creating additional references to any existing block. * When we free a block there is no hint in the block pointer whether the block * was cloned or not, so on each free we have to check if there is a * corresponding entry in the BRT or not. If there is, we need to decrease * the reference counter. Doing BRT lookup on every free can potentially be * expensive by requiring additional I/Os if the BRT doesn't fit into memory. * This is the main problem with deduplication, so we've learned our lesson and * try not to repeat the same mistake here. How do we do that? We divide each * top-level VDEV into 16MB regions. For each region we maintain a counter that * is a sum of all the BRT entries that have offsets within the region. This * creates the entries count array of 16bit numbers for each top-level VDEV. * The entries count array is always kept in memory and updated on disk in the * same transaction group as the BRT updates to keep everything in-sync. We can * keep the array in memory, because it is very small. With 16MB regions and * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease * the region size even further in the future). Now, when we want to free * a block, we first consult the array. If the counter for the whole region is * zero, there is no need to look for the BRT entry, as there isn't one for * sure. If the counter for the region is greater than zero, only then we will * do a BRT lookup and if an entry is found we will decrease the reference * counter in the BRT entry and in the entry counters array. * * The entry counters array is small, but can potentially be larger for very * large VDEVs or smaller regions. In this case we don't want to rewrite entire * array on every change. We then divide the array into 32kB block and keep * a bitmap of dirty blocks within a transaction group. When we sync the * transaction group we can only update the parts of the entry counters array * that were modified. Note: Keeping track of the dirty parts of the entry * counters array is implemented, but updating only parts of the array on disk * is not yet implemented - for now we will update entire array if there was * any change. * * The implementation tries to be economic: if BRT is not used, or no longer * used, there will be no entries in the MOS and no additional memory used (eg. * the entry counters array is only allocated if needed). * * Interaction between Deduplication and Block Cloning. * * If both functionalities are in use, we could end up with a block that is * referenced multiple times in both DDT and BRT. When we free one of the * references we couldn't tell where it belongs, so we would have to decide * what table takes the precedence: do we first clear DDT references or BRT * references? To avoid this dilemma BRT cooperates with DDT - if a given block * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will * lookup DDT entry instead and increase the counter there. No BRT entry * will be created for a block which has the D (dedup) bit set. * BRT may be more efficient for manual deduplication, but if the block is * already in the DDT, then creating additional BRT entry would be less * efficient. This clever idea was proposed by Allan Jude. * * Block Cloning across datasets. * * Block Cloning is not limited to cloning blocks within the same dataset. * It is possible (and very useful) to clone blocks between different datasets. * One use case is recovering files from snapshots. By cloning the files into * dataset we need no additional storage. Without Block Cloning we would need * additional space for those files. * Another interesting use case is moving the files between datasets * (copying the file content to the new dataset and removing the source file). * In that case Block Cloning will only be used briefly, because the BRT entries * will be removed when the source is removed. * Block Cloning across encrypted datasets is supported as long as both * datasets share the same master key (e.g. snapshots and clones) * * Block Cloning flow through ZFS layers. * * Note: Block Cloning can be used both for cloning file system blocks and ZVOL * blocks. As of this writing no interface is implemented that allows for block * cloning within a ZVOL. * FreeBSD and Linux provides copy_file_range(2) system call and we will use it * for blocking cloning. * * ssize_t * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp, * size_t len, unsigned int flags); * * Even though offsets and length represent bytes, they have to be * block-aligned or we will return an error so the upper layer can * fallback to the generic mechanism that will just copy the data. * Using copy_file_range(2) will call OS-independent zfs_clone_range() function. * This function was implemented based on zfs_write(), but instead of writing * the given data we first read block pointers using the new dmu_read_l0_bps() * function from the source file. Once we have BPs from the source file we call * the dmu_brt_clone() function on the destination file. This function * allocates BPs for us. We iterate over all source BPs. If the given BP is * a hole or an embedded block, we just copy BP as-is. If it points to a real * data we place this BP on a BRT pending list using the brt_pending_add() * function. * * We use this pending list to keep track of all BPs that got new references * within this transaction group. * * Some special cases to consider and how we address them: * - The block we want to clone may have been created within the same * transaction group that we are trying to clone. Such block has no BP * allocated yet, so cannot be immediately cloned. We return EAGAIN. * - The block we want to clone may have been modified within the same * transaction group. We return EAGAIN. * - A block may be cloned multiple times during one transaction group (that's * why pending list is actually a tree and not an append-only list - this * way we can figure out faster if this block is cloned for the first time * in this txg or consecutive time). * - A block may be cloned and freed within the same transaction group * (see dbuf_undirty()). * - A block may be cloned and within the same transaction group the clone * can be cloned again (see dmu_read_l0_bps()). * - A file might have been deleted, but the caller still has a file descriptor * open to this file and clones it. * * When we free a block we have an additional step in the ZIO pipeline where we * call the zio_brt_free() function. We then call the brt_entry_decref() * that loads the corresponding BRT entry (if one exists) and decreases * reference counter. If this is not the last reference we will stop ZIO * pipeline here. If this is the last reference or the block is not in the * BRT, we continue the pipeline and free the block as usual. * * At the beginning of spa_sync() where there can be no more block cloning, * but before issuing frees we call brt_pending_apply(). This function applies * all the new clones to the BRT table - we load BRT entries and update * reference counters. To sync new BRT entries to disk, we use brt_sync() * function. This function will sync all dirty per-top-level-vdev BRTs, * the entry counters arrays, etc. * * Block Cloning and ZIL. * * Every clone operation is divided into chunks (similar to write) and each * chunk is cloned in a separate transaction. The chunk size is determined by * how many BPs we can fit into a single ZIL entry. * Replaying clone operation is different from the regular clone operation, * as when we log clone operations we cannot use the source object - it may * reside on a different dataset, so we log BPs we want to clone. * The ZIL is replayed when we mount the given dataset, not when the pool is * imported. Taking this into account it is possible that the pool is imported * without mounting datasets and the source dataset is destroyed before the * destination dataset is mounted and its ZIL replayed. * To address this situation we leverage zil_claim() mechanism where ZFS will * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE * entries, we will bump reference counters for their BPs in the BRT. Then * on mount and ZIL replay we bump the reference counters once more, while the * first references are dropped during ZIL destroy by zil_free_clone_range(). * It is possible that after zil_claim() we never mount the destination, so * we never replay its ZIL and just destroy it. In this case the only taken * references will be dropped by zil_free_clone_range(), since the cloning is * not going to ever take place. */ static kmem_cache_t *brt_entry_cache; /* * Enable/disable prefetching of BRT entries that we are going to modify. */ static int brt_zap_prefetch = 1; #ifdef ZFS_DEBUG #define BRT_DEBUG(...) do { \ if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \ __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \ } \ } while (0) #else #define BRT_DEBUG(...) do { } while (0) #endif static int brt_zap_default_bs = 12; static int brt_zap_default_ibs = 12; static kstat_t *brt_ksp; typedef struct brt_stats { kstat_named_t brt_addref_entry_not_on_disk; kstat_named_t brt_addref_entry_on_disk; kstat_named_t brt_decref_entry_in_memory; kstat_named_t brt_decref_entry_loaded_from_disk; kstat_named_t brt_decref_entry_not_in_memory; kstat_named_t brt_decref_entry_read_lost_race; kstat_named_t brt_decref_entry_still_referenced; kstat_named_t brt_decref_free_data_later; kstat_named_t brt_decref_free_data_now; kstat_named_t brt_decref_no_entry; } brt_stats_t; static brt_stats_t brt_stats = { { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, { "decref_free_data_later", KSTAT_DATA_UINT64 }, { "decref_free_data_now", KSTAT_DATA_UINT64 }, { "decref_no_entry", KSTAT_DATA_UINT64 } }; struct { wmsum_t brt_addref_entry_not_on_disk; wmsum_t brt_addref_entry_on_disk; wmsum_t brt_decref_entry_in_memory; wmsum_t brt_decref_entry_loaded_from_disk; wmsum_t brt_decref_entry_not_in_memory; wmsum_t brt_decref_entry_read_lost_race; wmsum_t brt_decref_entry_still_referenced; wmsum_t brt_decref_free_data_later; wmsum_t brt_decref_free_data_now; wmsum_t brt_decref_no_entry; } brt_sums; #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) static int brt_entry_compare(const void *x1, const void *x2); static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs); static void brt_rlock(spa_t *spa) { rw_enter(&spa->spa_brt_lock, RW_READER); } static void brt_wlock(spa_t *spa) { rw_enter(&spa->spa_brt_lock, RW_WRITER); } static void brt_unlock(spa_t *spa) { rw_exit(&spa->spa_brt_lock); } static uint16_t brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) { ASSERT3U(idx, <, brtvd->bv_size); if (unlikely(brtvd->bv_need_byteswap)) { return (BSWAP_16(brtvd->bv_entcount[idx])); } else { return (brtvd->bv_entcount[idx]); } } static void brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) { ASSERT3U(idx, <, brtvd->bv_size); if (unlikely(brtvd->bv_need_byteswap)) { brtvd->bv_entcount[idx] = BSWAP_16(entcnt); } else { brtvd->bv_entcount[idx] = entcnt; } } static void brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx) { uint16_t entcnt; ASSERT3U(idx, <, brtvd->bv_size); entcnt = brt_vdev_entcount_get(brtvd, idx); ASSERT(entcnt < UINT16_MAX); brt_vdev_entcount_set(brtvd, idx, entcnt + 1); } static void brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) { uint16_t entcnt; ASSERT3U(idx, <, brtvd->bv_size); entcnt = brt_vdev_entcount_get(brtvd, idx); ASSERT(entcnt > 0); brt_vdev_entcount_set(brtvd, idx, entcnt - 1); } #ifdef ZFS_DEBUG static void brt_vdev_dump(brt_vdev_t *brtvd) { uint64_t idx; uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu", (u_longlong_t)brtvd->bv_vdevid, brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, (u_longlong_t)brtvd->bv_size, (u_longlong_t)brtvd->bv_totalcount, (u_longlong_t)nblocks, (size_t)BT_SIZEOFMAP(nblocks)); if (brtvd->bv_totalcount > 0) { zfs_dbgmsg(" entcounts:"); for (idx = 0; idx < brtvd->bv_size; idx++) { uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx); if (entcnt > 0) { zfs_dbgmsg(" [%04llu] %hu", (u_longlong_t)idx, entcnt); } } } if (brtvd->bv_entcount_dirty) { char *bitmap; bitmap = kmem_alloc(nblocks + 1, KM_SLEEP); for (idx = 0; idx < nblocks; idx++) { bitmap[idx] = BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; } bitmap[idx] = '\0'; zfs_dbgmsg(" dirty: %s", bitmap); kmem_free(bitmap, nblocks + 1); } } #endif static brt_vdev_t * brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc) { brt_vdev_t *brtvd = NULL; brt_rlock(spa); if (vdevid < spa->spa_brt_nvdevs) { brtvd = spa->spa_brt_vdevs[vdevid]; } else if (alloc) { /* New VDEV was added. */ brt_unlock(spa); brt_wlock(spa); if (vdevid >= spa->spa_brt_nvdevs) brt_vdevs_expand(spa, vdevid + 1); brtvd = spa->spa_brt_vdevs[vdevid]; } brt_unlock(spa); return (brtvd); } static void brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; ASSERT(brtvd->bv_initiated); ASSERT0(brtvd->bv_mos_brtvdev); ASSERT0(brtvd->bv_mos_entries); uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); VERIFY(mos_entries != 0); VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd, &brtvd->bv_mos_entries_dnode)); rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = mos_entries; rw_exit(&brtvd->bv_mos_entries_lock); BRT_DEBUG("MOS entries created, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); /* * We allocate DMU buffer to store the bv_entcount[] array. * We will keep array size (bv_size) and cummulative count for all * bv_entcount[]s (bv_totalcount) in the bonus buffer. */ brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset, DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); VERIFY(brtvd->bv_mos_brtvdev != 0); BRT_DEBUG("MOS BRT VDEV created, object=%llu", (u_longlong_t)brtvd->bv_mos_brtvdev); snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); BRT_DEBUG("Pool directory object created, object=%s", name); spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx); } static void brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd) { vdev_t *vd; uint16_t *entcount; ulong_t *bitmap; uint64_t nblocks, onblocks, size; ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, brtvd->bv_vdevid); size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1; spa_config_exit(spa, SCL_VDEV, FTAG); entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); if (!brtvd->bv_initiated) { ASSERT0(brtvd->bv_size); ASSERT0P(brtvd->bv_entcount); ASSERT0P(brtvd->bv_bitmap); } else { ASSERT(brtvd->bv_size > 0); ASSERT(brtvd->bv_entcount != NULL); ASSERT(brtvd->bv_bitmap != NULL); /* * TODO: Allow vdev shrinking. We only need to implement * shrinking the on-disk BRT VDEV object. * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, * offset, size, tx); */ ASSERT3U(brtvd->bv_size, <=, size); memcpy(entcount, brtvd->bv_entcount, sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); vmem_free(brtvd->bv_entcount, sizeof (entcount[0]) * brtvd->bv_size); onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), BT_SIZEOFMAP(onblocks))); kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks)); } brtvd->bv_size = size; brtvd->bv_entcount = entcount; brtvd->bv_bitmap = bitmap; if (!brtvd->bv_initiated) { brtvd->bv_need_byteswap = FALSE; brtvd->bv_initiated = TRUE; BRT_DEBUG("BRT VDEV %llu initiated.", (u_longlong_t)brtvd->bv_vdevid); } } static int brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd) { dmu_buf_t *db; brt_vdev_phys_t *bvphys; int error; ASSERT(!brtvd->bv_initiated); ASSERT(brtvd->bv_mos_brtvdev != 0); error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, FTAG, &db); if (error != 0) return (error); bvphys = db->db_data; if (spa->spa_brt_rangesize == 0) { spa->spa_brt_rangesize = bvphys->bvp_rangesize; } else { ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize); } brt_vdev_realloc(spa, brtvd); /* TODO: We don't support VDEV shrinking. */ ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); /* * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. */ error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), brtvd->bv_entcount, DMU_READ_NO_PREFETCH); if (error != 0) return (error); ASSERT(bvphys->bvp_mos_entries != 0); VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd, &brtvd->bv_mos_entries_dnode)); rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = bvphys->bvp_mos_entries; rw_exit(&brtvd->bv_mos_entries_lock); brtvd->bv_need_byteswap = (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); brtvd->bv_totalcount = bvphys->bvp_totalcount; brtvd->bv_usedspace = bvphys->bvp_usedspace; brtvd->bv_savedspace = bvphys->bvp_savedspace; dmu_buf_rele(db, FTAG); BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu", (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)brtvd->bv_mos_brtvdev, (u_longlong_t)brtvd->bv_mos_entries); return (0); } static void brt_vdev_dealloc(brt_vdev_t *brtvd) { ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); ASSERT(brtvd->bv_initiated); ASSERT0(avl_numnodes(&brtvd->bv_tree)); vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); brtvd->bv_entcount = NULL; uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks)); brtvd->bv_bitmap = NULL; brtvd->bv_size = 0; brtvd->bv_initiated = FALSE; BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); } static void brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; uint64_t count; ASSERT(brtvd->bv_initiated); ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(brtvd->bv_mos_entries != 0); ASSERT0(brtvd->bv_totalcount); ASSERT0(brtvd->bv_usedspace); ASSERT0(brtvd->bv_savedspace); uint64_t mos_entries = brtvd->bv_mos_entries; rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = 0; rw_exit(&brtvd->bv_mos_entries_lock); dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); brtvd->bv_mos_entries_dnode = NULL; ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count)); ASSERT0(count); VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx)); BRT_DEBUG("MOS entries destroyed, object=%llu", (u_longlong_t)mos_entries); VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, tx)); BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", (u_longlong_t)brtvd->bv_mos_brtvdev); brtvd->bv_mos_brtvdev = 0; brtvd->bv_entcount_dirty = FALSE; snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, tx)); BRT_DEBUG("Pool directory object removed, object=%s", name); brtvd->bv_meta_dirty = FALSE; rw_enter(&brtvd->bv_lock, RW_WRITER); brt_vdev_dealloc(brtvd); rw_exit(&brtvd->bv_lock); spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx); } static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs) { brt_vdev_t **vdevs; ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock)); ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs); if (nvdevs == spa->spa_brt_nvdevs) return; vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP); if (spa->spa_brt_nvdevs > 0) { ASSERT(spa->spa_brt_vdevs != NULL); memcpy(vdevs, spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); } spa->spa_brt_vdevs = vdevs; for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) { brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP); rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL); brtvd->bv_vdevid = vdevid; brtvd->bv_initiated = FALSE; rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL); avl_create(&brtvd->bv_tree, brt_entry_compare, sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); for (int i = 0; i < TXG_SIZE; i++) { avl_create(&brtvd->bv_pending_tree[i], brt_entry_compare, sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); } mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL); spa->spa_brt_vdevs[vdevid] = brtvd; } BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs); spa->spa_brt_nvdevs = nvdevs; } static boolean_t brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset) { uint64_t idx = offset / spa->spa_brt_rangesize; if (idx < brtvd->bv_size) { /* VDEV wasn't expanded. */ return (brt_vdev_entcount_get(brtvd, idx) > 0); } return (FALSE); } static void brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, uint64_t dsize, uint64_t count) { uint64_t idx; ASSERT(brtvd->bv_initiated); brtvd->bv_savedspace += dsize * count; brtvd->bv_meta_dirty = TRUE; if (bre->bre_count > 0) return; brtvd->bv_usedspace += dsize; idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; if (idx >= brtvd->bv_size) { /* VDEV has been expanded. */ rw_enter(&brtvd->bv_lock, RW_WRITER); brt_vdev_realloc(spa, brtvd); rw_exit(&brtvd->bv_lock); } ASSERT3U(idx, <, brtvd->bv_size); brtvd->bv_totalcount++; brt_vdev_entcount_inc(brtvd, idx); brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); } static void brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, uint64_t dsize) { uint64_t idx; ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); ASSERT(brtvd->bv_initiated); brtvd->bv_savedspace -= dsize; brtvd->bv_meta_dirty = TRUE; if (bre->bre_count > 0) return; brtvd->bv_usedspace -= dsize; idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize; ASSERT3U(idx, <, brtvd->bv_size); ASSERT(brtvd->bv_totalcount > 0); brtvd->bv_totalcount--; brt_vdev_entcount_dec(brtvd, idx); brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); } static void brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { dmu_buf_t *db; brt_vdev_phys_t *bvphys; ASSERT(brtvd->bv_meta_dirty); ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(dmu_tx_is_syncing(tx)); VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, FTAG, &db)); if (brtvd->bv_entcount_dirty) { /* * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. */ dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), brtvd->bv_entcount, tx); uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks)); brtvd->bv_entcount_dirty = FALSE; } dmu_buf_will_dirty(db, tx); bvphys = db->db_data; bvphys->bvp_mos_entries = brtvd->bv_mos_entries; bvphys->bvp_size = brtvd->bv_size; if (brtvd->bv_need_byteswap) { bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; } else { bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; } bvphys->bvp_totalcount = brtvd->bv_totalcount; bvphys->bvp_rangesize = spa->spa_brt_rangesize; bvphys->bvp_usedspace = brtvd->bv_usedspace; bvphys->bvp_savedspace = brtvd->bv_savedspace; dmu_buf_rele(db, FTAG); brtvd->bv_meta_dirty = FALSE; } static void brt_vdevs_free(spa_t *spa) { if (spa->spa_brt_vdevs == 0) return; for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; rw_enter(&brtvd->bv_lock, RW_WRITER); if (brtvd->bv_initiated) brt_vdev_dealloc(brtvd); rw_exit(&brtvd->bv_lock); rw_destroy(&brtvd->bv_lock); if (brtvd->bv_mos_entries != 0) dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); rw_destroy(&brtvd->bv_mos_entries_lock); avl_destroy(&brtvd->bv_tree); for (int i = 0; i < TXG_SIZE; i++) avl_destroy(&brtvd->bv_pending_tree[i]); mutex_destroy(&brtvd->bv_pending_lock); kmem_free(brtvd, sizeof (*brtvd)); } kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); } static void brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) { bre->bre_bp = *bp; bre->bre_count = 0; bre->bre_pcount = 0; *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); } static int brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre) { uint64_t off = BRE_OFFSET(bre); return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count)); } /* * Return TRUE if we _can_ have BRT entry for this bp. It might be false * positive, but gives us quick answer if we should look into BRT, which * may require reads and thus will be more expensive. */ boolean_t brt_maybe_exists(spa_t *spa, const blkptr_t *bp) { if (spa->spa_brt_nvdevs == 0) return (B_FALSE); uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); if (brtvd == NULL || !brtvd->bv_initiated) return (FALSE); /* * We don't need locks here, since bv_entcount pointer must be * stable at this point, and we don't care about false positive * races here, while false negative should be impossible, since * all brt_vdev_addref() have already completed by this point. */ uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); return (brt_vdev_lookup(spa, brtvd, off)); } uint64_t brt_get_dspace(spa_t *spa) { if (spa->spa_brt_nvdevs == 0) return (0); brt_rlock(spa); uint64_t s = 0; for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) s += spa->spa_brt_vdevs[vdevid]->bv_savedspace; brt_unlock(spa); return (s); } uint64_t brt_get_used(spa_t *spa) { if (spa->spa_brt_nvdevs == 0) return (0); brt_rlock(spa); uint64_t s = 0; for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) s += spa->spa_brt_vdevs[vdevid]->bv_usedspace; brt_unlock(spa); return (s); } uint64_t brt_get_saved(spa_t *spa) { return (brt_get_dspace(spa)); } uint64_t brt_get_ratio(spa_t *spa) { uint64_t used = brt_get_used(spa); if (used == 0) return (100); return ((used + brt_get_saved(spa)) * 100 / used); } static int brt_kstats_update(kstat_t *ksp, int rw) { brt_stats_t *bs = ksp->ks_data; if (rw == KSTAT_WRITE) return (EACCES); bs->brt_addref_entry_not_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); bs->brt_addref_entry_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_on_disk); bs->brt_decref_entry_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_in_memory); bs->brt_decref_entry_loaded_from_disk.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); bs->brt_decref_entry_not_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); bs->brt_decref_entry_read_lost_race.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); bs->brt_decref_entry_still_referenced.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_still_referenced); bs->brt_decref_free_data_later.value.ui64 = wmsum_value(&brt_sums.brt_decref_free_data_later); bs->brt_decref_free_data_now.value.ui64 = wmsum_value(&brt_sums.brt_decref_free_data_now); bs->brt_decref_no_entry.value.ui64 = wmsum_value(&brt_sums.brt_decref_no_entry); return (0); } static void brt_stat_init(void) { wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); wmsum_init(&brt_sums.brt_decref_free_data_later, 0); wmsum_init(&brt_sums.brt_decref_free_data_now, 0); wmsum_init(&brt_sums.brt_decref_no_entry, 0); brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED, sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (brt_ksp != NULL) { brt_ksp->ks_data = &brt_stats; brt_ksp->ks_update = brt_kstats_update; kstat_install(brt_ksp); } } static void brt_stat_fini(void) { if (brt_ksp != NULL) { kstat_delete(brt_ksp); brt_ksp = NULL; } wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); wmsum_fini(&brt_sums.brt_addref_entry_on_disk); wmsum_fini(&brt_sums.brt_decref_entry_in_memory); wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); wmsum_fini(&brt_sums.brt_decref_free_data_later); wmsum_fini(&brt_sums.brt_decref_free_data_now); wmsum_fini(&brt_sums.brt_decref_no_entry); } void brt_init(void) { brt_entry_cache = kmem_cache_create("brt_entry_cache", sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); brt_stat_init(); } void brt_fini(void) { brt_stat_fini(); kmem_cache_destroy(brt_entry_cache); } /* Return TRUE if block should be freed immediately. */ boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp) { brt_entry_t *bre, *racebre; brt_entry_t bre_search; avl_index_t where; uint64_t vdevid; int error; brt_entry_fill(bp, &bre_search, &vdevid); brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); ASSERT(brtvd != NULL); rw_enter(&brtvd->bv_lock, RW_WRITER); ASSERT(brtvd->bv_initiated); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre != NULL) { BRTSTAT_BUMP(brt_decref_entry_in_memory); goto out; } else { BRTSTAT_BUMP(brt_decref_entry_not_in_memory); } rw_exit(&brtvd->bv_lock); error = brt_entry_lookup(brtvd, &bre_search); /* bre_search now contains correct bre_count */ if (error == ENOENT) { BRTSTAT_BUMP(brt_decref_no_entry); return (B_TRUE); } ASSERT0(error); rw_enter(&brtvd->bv_lock, RW_WRITER); racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); if (racebre != NULL) { /* The entry was added when the lock was dropped. */ BRTSTAT_BUMP(brt_decref_entry_read_lost_race); bre = racebre; goto out; } BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); bre->bre_bp = bre_search.bre_bp; bre->bre_count = bre_search.bre_count; bre->bre_pcount = 0; avl_insert(&brtvd->bv_tree, bre, where); out: if (bre->bre_count == 0) { rw_exit(&brtvd->bv_lock); BRTSTAT_BUMP(brt_decref_free_data_now); return (B_TRUE); } bre->bre_pcount--; ASSERT(bre->bre_count > 0); bre->bre_count--; if (bre->bre_count == 0) BRTSTAT_BUMP(brt_decref_free_data_later); else BRTSTAT_BUMP(brt_decref_entry_still_referenced); brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp)); rw_exit(&brtvd->bv_lock); return (B_FALSE); } uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) { brt_entry_t bre_search, *bre; uint64_t vdevid, refcnt; int error; brt_entry_fill(bp, &bre_search, &vdevid); brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); ASSERT(brtvd != NULL); rw_enter(&brtvd->bv_lock, RW_READER); ASSERT(brtvd->bv_initiated); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre == NULL) { rw_exit(&brtvd->bv_lock); error = brt_entry_lookup(brtvd, &bre_search); if (error == ENOENT) { refcnt = 0; } else { ASSERT0(error); refcnt = bre_search.bre_count; } } else { refcnt = bre->bre_count; rw_exit(&brtvd->bv_lock); } return (refcnt); } static void brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp) { if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0) return; uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[0]); rw_enter(&brtvd->bv_mos_entries_lock, RW_READER); if (brtvd->bv_mos_entries != 0) { (void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode, &off, BRT_KEY_WORDS); } rw_exit(&brtvd->bv_mos_entries_lock); } static int brt_entry_compare(const void *x1, const void *x2) { const brt_entry_t *bre1 = x1, *bre2 = x2; const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp; return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), DVA_GET_OFFSET(&bp2->blk_dva[0]))); } void brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { brt_entry_t *bre, *newbre; avl_index_t where; uint64_t txg; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE); avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); newbre->bre_bp = *bp; newbre->bre_count = 0; newbre->bre_pcount = 1; mutex_enter(&brtvd->bv_pending_lock); bre = avl_find(pending_tree, newbre, &where); if (bre == NULL) { avl_insert(pending_tree, newbre, where); newbre = NULL; } else { bre->bre_pcount++; } mutex_exit(&brtvd->bv_pending_lock); if (newbre != NULL) { ASSERT(bre != NULL); ASSERT(bre != newbre); kmem_cache_free(brt_entry_cache, newbre); } else { ASSERT0P(bre); /* Prefetch BRT entry for the syncing context. */ brt_prefetch(brtvd, bp); } } void brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { brt_entry_t *bre, bre_search; uint64_t txg; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); ASSERT(brtvd != NULL); avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; bre_search.bre_bp = *bp; mutex_enter(&brtvd->bv_pending_lock); bre = avl_find(pending_tree, &bre_search, NULL); ASSERT(bre != NULL); ASSERT(bre->bre_pcount > 0); bre->bre_pcount--; if (bre->bre_pcount == 0) avl_remove(pending_tree, bre); else bre = NULL; mutex_exit(&brtvd->bv_pending_lock); if (bre) kmem_cache_free(brt_entry_cache, bre); } static void brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg) { brt_entry_t *bre, *nbre; /* * We are in syncing context, so no other bv_pending_tree accesses * are possible for the TXG. So we don't need bv_pending_lock. */ ASSERT(avl_is_empty(&brtvd->bv_tree)); avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]); for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) { nbre = AVL_NEXT(&brtvd->bv_tree, bre); /* * If the block has DEDUP bit set, it means that it * already exists in the DEDUP table, so we can just * use that instead of creating new entry in the BRT. */ if (BP_GET_DEDUP(&bre->bre_bp)) { while (bre->bre_pcount > 0) { if (!ddt_addref(spa, &bre->bre_bp)) break; bre->bre_pcount--; } if (bre->bre_pcount == 0) { avl_remove(&brtvd->bv_tree, bre); kmem_cache_free(brt_entry_cache, bre); continue; } } /* * Unless we know that the block is definitely not in ZAP, * try to get its reference count from there. */ uint64_t off = BRE_OFFSET(bre); if (brtvd->bv_mos_entries != 0 && brt_vdev_lookup(spa, brtvd, off)) { int error = zap_lookup_uint64_by_dnode( brtvd->bv_mos_entries_dnode, &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count); if (error == 0) { BRTSTAT_BUMP(brt_addref_entry_on_disk); } else { ASSERT3U(error, ==, ENOENT); BRTSTAT_BUMP(brt_addref_entry_not_on_disk); } } } /* * If all the cloned blocks we had were handled by DDT, we don't need * to initiate the vdev. */ if (avl_is_empty(&brtvd->bv_tree)) return; if (!brtvd->bv_initiated) { rw_enter(&brtvd->bv_lock, RW_WRITER); brt_vdev_realloc(spa, brtvd); rw_exit(&brtvd->bv_lock); } /* * Convert pending references into proper ones. This has to be a * separate loop, since entcount modifications would cause false * positives for brt_vdev_lookup() on following iterations. */ for (bre = avl_first(&brtvd->bv_tree); bre; bre = AVL_NEXT(&brtvd->bv_tree, bre)) { brt_vdev_addref(spa, brtvd, bre, bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount); bre->bre_count += bre->bre_pcount; } } void brt_pending_apply(spa_t *spa, uint64_t txg) { brt_rlock(spa); for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; brt_unlock(spa); brt_pending_apply_vdev(spa, brtvd, txg); brt_rlock(spa); } brt_unlock(spa); } static void brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) { uint64_t off = BRE_OFFSET(bre); if (bre->bre_pcount == 0) { /* The net change is zero, nothing to do in ZAP. */ } else if (bre->bre_count == 0) { int error = zap_remove_uint64_by_dnode(dn, &off, BRT_KEY_WORDS, tx); VERIFY(error == 0 || error == ENOENT); } else { VERIFY0(zap_update_uint64_by_dnode(dn, &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count, tx)); } } static void brt_sync_table(spa_t *spa, dmu_tx_t *tx) { brt_entry_t *bre; brt_rlock(spa); for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; brt_unlock(spa); if (!brtvd->bv_meta_dirty) { ASSERT(!brtvd->bv_entcount_dirty); ASSERT0(avl_numnodes(&brtvd->bv_tree)); brt_rlock(spa); continue; } ASSERT(!brtvd->bv_entcount_dirty || avl_numnodes(&brtvd->bv_tree) != 0); if (brtvd->bv_mos_brtvdev == 0) brt_vdev_create(spa, brtvd, tx); void *c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx); kmem_cache_free(brt_entry_cache, bre); } #ifdef ZFS_DEBUG if (zfs_flags & ZFS_DEBUG_BRT) brt_vdev_dump(brtvd); #endif if (brtvd->bv_totalcount == 0) brt_vdev_destroy(spa, brtvd, tx); else brt_vdev_sync(spa, brtvd, tx); brt_rlock(spa); } brt_unlock(spa); } void brt_sync(spa_t *spa, uint64_t txg) { dmu_tx_t *tx; uint64_t vdevid; ASSERT3U(spa_syncing_txg(spa), ==, txg); brt_rlock(spa); for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty) break; } if (vdevid >= spa->spa_brt_nvdevs) { brt_unlock(spa); return; } brt_unlock(spa); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); brt_sync_table(spa, tx); dmu_tx_commit(tx); } static void brt_alloc(spa_t *spa) { rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL); spa->spa_brt_vdevs = NULL; spa->spa_brt_nvdevs = 0; spa->spa_brt_rangesize = 0; } void brt_create(spa_t *spa) { brt_alloc(spa); spa->spa_brt_rangesize = BRT_RANGESIZE; } int brt_load(spa_t *spa) { int error = 0; brt_alloc(spa); brt_wlock(spa); for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children; vdevid++) { char name[64]; uint64_t mos_brtvdev; /* Look if this vdev had active block cloning. */ snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)vdevid); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &mos_brtvdev); if (error == ENOENT) { error = 0; continue; } if (error != 0) break; /* If it did, then allocate them all and load this one. */ brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children); brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; rw_enter(&brtvd->bv_lock, RW_WRITER); brtvd->bv_mos_brtvdev = mos_brtvdev; error = brt_vdev_load(spa, brtvd); rw_exit(&brtvd->bv_lock); if (error != 0) break; } if (spa->spa_brt_rangesize == 0) spa->spa_brt_rangesize = BRT_RANGESIZE; brt_unlock(spa); return (error); } void brt_unload(spa_t *spa) { if (spa->spa_brt_rangesize == 0) return; brt_vdevs_free(spa); rw_destroy(&spa->spa_brt_lock); spa->spa_brt_rangesize = 0; } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, "Enable prefetching of BRT ZAP entries"); ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, "BRT ZAP leaf blockshift"); ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, "BRT ZAP indirect blockshift"); -/* END CSTYLED */ diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 9c52083603f1..bff2b6c21f44 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -1,2215 +1,2213 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2019 by Delphix. All rights reserved. */ #include #include #include kmem_cache_t *zfs_btree_leaf_cache; /* * Control the extent of the verification that occurs when zfs_btree_verify is * called. Primarily used for debugging when extending the btree logic and * functionality. As the intensity is increased, new verification steps are * added. These steps are cumulative; intensity = 3 includes the intensity = 1 * and intensity = 2 steps as well. * * Intensity 1: Verify that the tree's height is consistent throughout. * Intensity 2: Verify that a core node's children's parent pointers point * to the core node. * Intensity 3: Verify that the total number of elements in the tree matches the * sum of the number of elements in each node. Also verifies that each node's * count obeys the invariants (less than or equal to maximum value, greater than * or equal to half the maximum minus one). * Intensity 4: Verify that each element compares less than the element * immediately after it and greater than the one immediately before it using the * comparator function. For core nodes, also checks that each element is greater * than the last element in the first of the two nodes it separates, and less * than the first element in the second of the two nodes. * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside * of each node is poisoned appropriately. Note that poisoning always occurs if * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal * operation. * * Intensity 4 and 5 are particularly expensive to perform; the previous levels * are a few memory operations per node, while these levels require multiple * operations per element. In addition, when creating large btrees, these * operations are called at every step, resulting in extremely slow operation * (while the asymptotic complexity of the other steps is the same, the * importance of the constant factors cannot be denied). */ uint_t zfs_btree_verify_intensity = 0; /* * Convenience functions to silence warnings from memcpy/memmove's * return values and change argument order to src, dest. */ static void bcpy(const void *src, void *dest, size_t size) { (void) memcpy(dest, src, size); } static void bmov(const void *src, void *dest, size_t size) { (void) memmove(dest, src, size); } static boolean_t zfs_btree_is_core(struct zfs_btree_hdr *hdr) { return (hdr->bth_first == -1); } #ifdef _ILP32 #define BTREE_POISON 0xabadb10c #else #define BTREE_POISON 0xabadb10cdeadbeef #endif static void zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; if (zfs_btree_is_core(hdr)) { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { node->btc_children[i] = (zfs_btree_hdr_t *)BTREE_POISON; } (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, (BTREE_CORE_ELEMS - hdr->bth_count) * size); } else { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; (void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size); (void) memset(leaf->btl_elems + (hdr->bth_first + hdr->bth_count) * size, 0x0f, tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) - (hdr->bth_first + hdr->bth_count) * size); } #endif } static inline void zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, uint32_t idx, uint32_t count) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; if (zfs_btree_is_core(hdr)) { ASSERT3U(idx, >=, hdr->bth_count); ASSERT3U(idx, <=, BTREE_CORE_ELEMS); ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS); zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (uint32_t i = 1; i <= count; i++) { node->btc_children[idx + i] = (zfs_btree_hdr_t *)BTREE_POISON; } (void) memset(node->btc_elems + idx * size, 0x0f, count * size); } else { ASSERT3U(idx, <=, tree->bt_leaf_cap); ASSERT3U(idx + count, <=, tree->bt_leaf_cap); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; (void) memset(leaf->btl_elems + (hdr->bth_first + idx) * size, 0x0f, count * size); } #endif } static inline void zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, uint32_t idx) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; if (zfs_btree_is_core(hdr)) { ASSERT3U(idx, <, BTREE_CORE_ELEMS); zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON; VERIFY3P(node->btc_children[idx + 1], ==, cval); for (size_t i = 0; i < size; i++) VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f); } else { ASSERT3U(idx, <, tree->bt_leaf_cap); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; if (idx >= tree->bt_leaf_cap - hdr->bth_first) return; for (size_t i = 0; i < size; i++) { VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx) * size + i], ==, 0x0f); } } #endif } void zfs_btree_init(void) { zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache", BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); } void zfs_btree_fini(void) { kmem_cache_destroy(zfs_btree_leaf_cache); } static void * zfs_btree_leaf_alloc(zfs_btree_t *tree) { if (tree->bt_leaf_size == BTREE_LEAF_SIZE) return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP)); else return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP)); } static void zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr) { if (tree->bt_leaf_size == BTREE_LEAF_SIZE) return (kmem_cache_free(zfs_btree_leaf_cache, ptr)); else return (kmem_free(ptr, tree->bt_leaf_size)); } void zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), bt_find_in_buf_f bt_find_in_buf, size_t size) { zfs_btree_create_custom(tree, compar, bt_find_in_buf, size, BTREE_LEAF_SIZE); } static void * zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, const void *value, zfs_btree_index_t *where); void zfs_btree_create_custom(zfs_btree_t *tree, int (*compar) (const void *, const void *), bt_find_in_buf_f bt_find_in_buf, size_t size, size_t lsize) { size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems); ASSERT3U(size, <=, esize / 2); memset(tree, 0, sizeof (*tree)); tree->bt_compar = compar; tree->bt_find_in_buf = (bt_find_in_buf == NULL) ? zfs_btree_find_in_buf : bt_find_in_buf; tree->bt_elem_size = size; tree->bt_leaf_size = lsize; tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t); tree->bt_height = -1; tree->bt_bulk = NULL; } /* * Find value in the array of elements provided. Uses a simple binary search. */ static void * zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, const void *value, zfs_btree_index_t *where) { uint32_t max = nelems; uint32_t min = 0; while (max > min) { uint32_t idx = (min + max) / 2; uint8_t *cur = buf + idx * tree->bt_elem_size; int comp = tree->bt_compar(cur, value); if (comp < 0) { min = idx + 1; } else if (comp > 0) { max = idx; } else { where->bti_offset = idx; where->bti_before = B_FALSE; return (cur); } } where->bti_offset = max; where->bti_before = B_TRUE; return (NULL); } /* * Find the given value in the tree. where may be passed as null to use as a * membership test or if the btree is being used as a map. */ void * zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) { if (tree->bt_height == -1) { if (where != NULL) { where->bti_node = NULL; where->bti_offset = 0; } ASSERT0(tree->bt_num_elems); return (NULL); } /* * If we're in bulk-insert mode, we check the last spot in the tree * and the last leaf in the tree before doing the normal search, * because for most workloads the vast majority of finds in * bulk-insert mode are to insert new elements. */ zfs_btree_index_t idx; size_t size = tree->bt_elem_size; if (tree->bt_bulk != NULL) { zfs_btree_leaf_t *last_leaf = tree->bt_bulk; int comp = tree->bt_compar(last_leaf->btl_elems + (last_leaf->btl_hdr.bth_first + last_leaf->btl_hdr.bth_count - 1) * size, value); if (comp < 0) { /* * If what they're looking for is after the last * element, it's not in the tree. */ if (where != NULL) { where->bti_node = (zfs_btree_hdr_t *)last_leaf; where->bti_offset = last_leaf->btl_hdr.bth_count; where->bti_before = B_TRUE; } return (NULL); } else if (comp == 0) { if (where != NULL) { where->bti_node = (zfs_btree_hdr_t *)last_leaf; where->bti_offset = last_leaf->btl_hdr.bth_count - 1; where->bti_before = B_FALSE; } return (last_leaf->btl_elems + (last_leaf->btl_hdr.bth_first + last_leaf->btl_hdr.bth_count - 1) * size); } if (tree->bt_compar(last_leaf->btl_elems + last_leaf->btl_hdr.bth_first * size, value) <= 0) { /* * If what they're looking for is after the first * element in the last leaf, it's in the last leaf or * it's not in the tree. */ void *d = tree->bt_find_in_buf(tree, last_leaf->btl_elems + last_leaf->btl_hdr.bth_first * size, last_leaf->btl_hdr.bth_count, value, &idx); if (where != NULL) { idx.bti_node = (zfs_btree_hdr_t *)last_leaf; *where = idx; } return (d); } } zfs_btree_core_t *node = NULL; uint32_t child = 0; uint32_t depth = 0; /* * Iterate down the tree, finding which child the value should be in * by comparing with the separators. */ for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; node = (zfs_btree_core_t *)node->btc_children[child], depth++) { ASSERT3P(node, !=, NULL); void *d = tree->bt_find_in_buf(tree, node->btc_elems, node->btc_hdr.bth_count, value, &idx); EQUIV(d != NULL, !idx.bti_before); if (d != NULL) { if (where != NULL) { idx.bti_node = (zfs_btree_hdr_t *)node; *where = idx; } return (d); } ASSERT(idx.bti_before); child = idx.bti_offset; } /* * The value is in this leaf, or it would be if it were in the * tree. Find its proper location and return it. */ zfs_btree_leaf_t *leaf = (depth == 0 ? (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); void *d = tree->bt_find_in_buf(tree, leaf->btl_elems + leaf->btl_hdr.bth_first * size, leaf->btl_hdr.bth_count, value, &idx); if (where != NULL) { idx.bti_node = (zfs_btree_hdr_t *)leaf; *where = idx; } return (d); } /* * To explain the following functions, it is useful to understand the four * kinds of shifts used in btree operation. First, a shift is a movement of * elements within a node. It is used to create gaps for inserting new * elements and children, or cover gaps created when things are removed. A * shift has two fundamental properties, each of which can be one of two * values, making four types of shifts. There is the direction of the shift * (left or right) and the shape of the shift (parallelogram or isoceles * trapezoid (shortened to trapezoid hereafter)). The shape distinction only * applies to shifts of core nodes. * * The names derive from the following imagining of the layout of a node: * * Elements: * * * * * * * ... * * * * Children: * * * * * * * * ... * * * * * This layout follows from the fact that the elements act as separators * between pairs of children, and that children root subtrees "below" the * current node. A left and right shift are fairly self-explanatory; a left * shift moves things to the left, while a right shift moves things to the * right. A parallelogram shift is a shift with the same number of elements * and children being moved, while a trapezoid shift is a shift that moves one * more children than elements. An example follows: * * A parallelogram shift could contain the following: * _______________ * \* * * * \ * * * ... * * * * * \ * * * *\ * * * ... * * * * --------------- * A trapezoid shift could contain the following: * ___________ * * / * * * \ * * * ... * * * * * / * * * *\ * * * ... * * * * --------------- * * Note that a parallelogram shift is always shaped like a "left-leaning" * parallelogram, where the starting index of the children being moved is * always one higher than the starting index of the elements being moved. No * "right-leaning" parallelogram shifts are needed (shifts where the starting * element index and starting child index being moved are the same) to achieve * any btree operations, so we ignore them. */ enum bt_shift_shape { BSS_TRAPEZOID, BSS_PARALLELOGRAM }; enum bt_shift_direction { BSD_LEFT, BSD_RIGHT }; /* * Shift elements and children in the provided core node by off spots. The * first element moved is idx, and count elements are moved. The shape of the * shift is determined by shape. The direction is determined by dir. */ static inline void bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, uint32_t count, uint32_t off, enum bt_shift_shape shape, enum bt_shift_direction dir) { size_t size = tree->bt_elem_size; ASSERT(zfs_btree_is_core(&node->btc_hdr)); uint8_t *e_start = node->btc_elems + idx * size; uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size : e_start + off * size); bmov(e_start, e_out, count * size); zfs_btree_hdr_t **c_start = node->btc_children + idx + (shape == BSS_TRAPEZOID ? 0 : 1); zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : c_start + off); uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); bmov(c_start, c_out, c_count * sizeof (*c_start)); } /* * Shift elements and children in the provided core node left by one spot. * The first element moved is idx, and count elements are moved. The * shape of the shift is determined by trap; true if the shift is a trapezoid, * false if it is a parallelogram. */ static inline void bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, uint32_t count, enum bt_shift_shape shape) { bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT); } /* * Shift elements and children in the provided core node right by one spot. * Starts with elements[idx] and children[idx] and one more child than element. */ static inline void bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, uint32_t count, enum bt_shift_shape shape) { bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT); } /* * Shift elements and children in the provided leaf node by off spots. * The first element moved is idx, and count elements are moved. The direction * is determined by left. */ static inline void bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx, uint32_t count, uint32_t off, enum bt_shift_direction dir) { size_t size = tree->bt_elem_size; zfs_btree_hdr_t *hdr = &node->btl_hdr; ASSERT(!zfs_btree_is_core(hdr)); if (count == 0) return; uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size; uint8_t *out = (dir == BSD_LEFT ? start - off * size : start + off * size); bmov(start, out, count * size); } /* * Grow leaf for n new elements before idx. */ static void bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, uint32_t n) { zfs_btree_hdr_t *hdr = &leaf->btl_hdr; ASSERT(!zfs_btree_is_core(hdr)); ASSERT3U(idx, <=, hdr->bth_count); uint32_t capacity = tree->bt_leaf_cap; ASSERT3U(hdr->bth_count + n, <=, capacity); boolean_t cl = (hdr->bth_first >= n); boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity); if (cl && (!cr || idx <= hdr->bth_count / 2)) { /* Grow left. */ hdr->bth_first -= n; bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT); } else if (cr) { /* Grow right. */ bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n, BSD_RIGHT); } else { /* Grow both ways. */ uint32_t fn = hdr->bth_first - (capacity - (hdr->bth_count + n)) / 2; hdr->bth_first -= fn; bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT); bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx, n - fn, BSD_RIGHT); } hdr->bth_count += n; } /* * Shrink leaf for count elements starting from idx. */ static void bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, uint32_t n) { zfs_btree_hdr_t *hdr = &leaf->btl_hdr; ASSERT(!zfs_btree_is_core(hdr)); ASSERT3U(idx, <=, hdr->bth_count); ASSERT3U(idx + n, <=, hdr->bth_count); if (idx <= (hdr->bth_count - n) / 2) { bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT); zfs_btree_poison_node_at(tree, hdr, 0, n); hdr->bth_first += n; } else { bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n, BSD_LEFT); zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n); } hdr->bth_count -= n; } /* * Move children and elements from one core node to another. The shape * parameter behaves the same as it does in the shift logic. */ static inline void bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx, uint32_t count, zfs_btree_core_t *dest, uint32_t didx, enum bt_shift_shape shape) { size_t size = tree->bt_elem_size; ASSERT(zfs_btree_is_core(&source->btc_hdr)); ASSERT(zfs_btree_is_core(&dest->btc_hdr)); bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size, count * size); uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1), c_count * sizeof (*source->btc_children)); } static inline void bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx, uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx) { size_t size = tree->bt_elem_size; ASSERT(!zfs_btree_is_core(&source->btl_hdr)); ASSERT(!zfs_btree_is_core(&dest->btl_hdr)); bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size, dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size, count * size); } /* * Find the first element in the subtree rooted at hdr, return its value and * put its location in where if non-null. */ static void * zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) { zfs_btree_hdr_t *node; for (node = hdr; zfs_btree_is_core(node); node = ((zfs_btree_core_t *)node)->btc_children[0]) ; ASSERT(!zfs_btree_is_core(node)); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; if (where != NULL) { where->bti_node = node; where->bti_offset = 0; where->bti_before = B_FALSE; } return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]); } /* Insert an element and a child into a core node at the given offset. */ static void zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, uint32_t offset, zfs_btree_hdr_t *new_node, void *buf) { size_t size = tree->bt_elem_size; zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; ASSERT3P(par_hdr, ==, new_node->bth_parent); ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); if (zfs_btree_verify_intensity >= 5) { zfs_btree_verify_poison_at(tree, par_hdr, par_hdr->bth_count); } /* Shift existing elements and children */ uint32_t count = par_hdr->bth_count - offset; bt_shift_core_right(tree, parent, offset, count, BSS_PARALLELOGRAM); /* Insert new values */ parent->btc_children[offset + 1] = new_node; bcpy(buf, parent->btc_elems + offset * size, size); par_hdr->bth_count++; } /* * Insert new_node into the parent of old_node directly after old_node, with * buf as the dividing element between the two. */ static void zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, zfs_btree_hdr_t *new_node, void *buf) { ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); size_t size = tree->bt_elem_size; zfs_btree_core_t *parent = old_node->bth_parent; /* * If this is the root node we were splitting, we create a new root * and increase the height of the tree. */ if (parent == NULL) { ASSERT3P(old_node, ==, tree->bt_root); tree->bt_num_nodes++; zfs_btree_core_t *new_root = kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * size, KM_SLEEP); zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; new_root_hdr->bth_parent = NULL; new_root_hdr->bth_first = -1; new_root_hdr->bth_count = 1; old_node->bth_parent = new_node->bth_parent = new_root; new_root->btc_children[0] = old_node; new_root->btc_children[1] = new_node; bcpy(buf, new_root->btc_elems, size); tree->bt_height++; tree->bt_root = new_root_hdr; zfs_btree_poison_node(tree, new_root_hdr); return; } /* * Since we have the new separator, binary search for where to put * new_node. */ zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; zfs_btree_index_t idx; ASSERT(zfs_btree_is_core(par_hdr)); VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, par_hdr->bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); uint32_t offset = idx.bti_offset; ASSERT3U(offset, <=, par_hdr->bth_count); ASSERT3P(parent->btc_children[offset], ==, old_node); /* * If the parent isn't full, shift things to accommodate our insertions * and return. */ if (par_hdr->bth_count != BTREE_CORE_ELEMS) { zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf); return; } /* * We need to split this core node into two. Currently there are * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for * BTREE_CORE_ELEMS + 2. Some of the children will be part of the * current node, and the others will be moved to the new core node. * There are BTREE_CORE_ELEMS + 1 elements including the new one. One * will be used as the new separator in our parent, and the others * will be split among the two core nodes. * * Usually we will split the node in half evenly, with * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we * instead move only about a quarter of the elements (and children) to * the new node. Since the average state after a long time is a 3/4 * full node, shortcutting directly to that state improves efficiency. * * We do this in two stages: first we split into two nodes, and then we * reuse our existing logic to insert the new element and child. */ uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? 2 : 4)) - 1, 2); uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1; ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); tree->bt_num_nodes++; zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * size, KM_SLEEP); zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; new_par_hdr->bth_parent = par_hdr->bth_parent; new_par_hdr->bth_first = -1; new_par_hdr->bth_count = move_count; zfs_btree_poison_node(tree, new_par_hdr); par_hdr->bth_count = keep_count; bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent, 0, BSS_TRAPEZOID); /* Store the new separator in a buffer. */ uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); bcpy(parent->btc_elems + keep_count * size, tmp_buf, size); zfs_btree_poison_node(tree, par_hdr); if (offset < keep_count) { /* Insert the new node into the left half */ zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf); /* * Move the new separator to the existing buffer. */ bcpy(tmp_buf, buf, size); } else if (offset > keep_count) { /* Insert the new node into the right half */ new_node->bth_parent = new_parent; zfs_btree_insert_core_impl(tree, new_parent, offset - keep_count - 1, new_node, buf); /* * Move the new separator to the existing buffer. */ bcpy(tmp_buf, buf, size); } else { /* * Move the new separator into the right half, and replace it * with buf. We also need to shift back the elements in the * right half to accommodate new_node. */ bt_shift_core_right(tree, new_parent, 0, move_count, BSS_TRAPEZOID); new_parent->btc_children[0] = new_node; bcpy(tmp_buf, new_parent->btc_elems, size); new_par_hdr->bth_count++; } kmem_free(tmp_buf, size); zfs_btree_poison_node(tree, par_hdr); for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++) new_parent->btc_children[i]->bth_parent = new_parent; for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++) ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent); /* * Now that the node is split, we need to insert the new node into its * parent. This may cause further splitting. */ zfs_btree_insert_into_parent(tree, &parent->btc_hdr, &new_parent->btc_hdr, buf); } /* Insert an element into a leaf node at the given offset. */ static void zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, const void *value) { size_t size = tree->bt_elem_size; zfs_btree_hdr_t *hdr = &leaf->btl_hdr; ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap); if (zfs_btree_verify_intensity >= 5) { zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, leaf->btl_hdr.bth_count); } bt_grow_leaf(tree, leaf, idx, 1); uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size; bcpy(value, start, size); } static void zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr); /* Helper function for inserting a new value into leaf at the given index. */ static void zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, const void *value, uint32_t idx) { size_t size = tree->bt_elem_size; uint32_t capacity = tree->bt_leaf_cap; /* * If the leaf isn't full, shift the elements after idx and insert * value. */ if (leaf->btl_hdr.bth_count != capacity) { zfs_btree_insert_leaf_impl(tree, leaf, idx, value); return; } /* * Otherwise, we split the leaf node into two nodes. If we're not bulk * inserting, each is of size (capacity / 2). If we are bulk * inserting, we move a quarter of the elements to the new node so * inserts into the old node don't cause immediate splitting but the * tree stays relatively dense. Since the average state after a long * time is a 3/4 full node, shortcutting directly to that state * improves efficiency. At the end of the bulk insertion process * we'll need to go through and fix up any nodes (the last leaf and * its ancestors, potentially) that are below the minimum. * * In either case, we're left with one extra element. The leftover * element will become the new dividing element between the two nodes. */ uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1; uint32_t keep_count = capacity - move_count - 1; ASSERT3U(keep_count, >=, 1); /* If we insert on left. move one more to keep leaves balanced. */ if (idx < keep_count) { keep_count--; move_count++; } tree->bt_num_nodes++; zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree); zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; new_hdr->bth_parent = leaf->btl_hdr.bth_parent; new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) + (idx >= keep_count && idx <= keep_count + move_count / 2); new_hdr->bth_count = move_count; zfs_btree_poison_node(tree, new_hdr); if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) tree->bt_bulk = new_leaf; /* Copy the back part to the new leaf. */ bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0); /* We store the new separator in a buffer we control for simplicity. */ uint8_t *buf = kmem_alloc(size, KM_SLEEP); bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size, buf, size); bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count); if (idx < keep_count) { /* Insert into the existing leaf. */ zfs_btree_insert_leaf_impl(tree, leaf, idx, value); } else if (idx > keep_count) { /* Insert into the new leaf. */ zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count - 1, value); } else { /* * Insert planned separator into the new leaf, and use * the new value as the new separator. */ zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf); bcpy(value, buf, size); } /* * Now that the node is split, we need to insert the new node into its * parent. This may cause further splitting, bur only of core nodes. */ zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr, buf); kmem_free(buf, size); } static uint32_t zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { void *buf; if (zfs_btree_is_core(hdr)) { buf = ((zfs_btree_core_t *)hdr)->btc_elems; } else { buf = ((zfs_btree_leaf_t *)hdr)->btl_elems + hdr->bth_first * tree->bt_elem_size; } zfs_btree_index_t idx; zfs_btree_core_t *parent = hdr->bth_parent; VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, parent->btc_hdr.bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr); return (idx.bti_offset); } /* * Take the b-tree out of bulk insert mode. During bulk-insert mode, some * nodes may violate the invariant that non-root nodes must be at least half * full. All nodes violating this invariant should be the last node in their * particular level. To correct the invariant, we take values from their left * neighbor until they are half full. They must have a left neighbor at their * level because the last node at a level is not the first node unless it's * the root. */ static void zfs_btree_bulk_finish(zfs_btree_t *tree) { ASSERT3P(tree->bt_bulk, !=, NULL); ASSERT3P(tree->bt_root, !=, NULL); zfs_btree_leaf_t *leaf = tree->bt_bulk; zfs_btree_hdr_t *hdr = &leaf->btl_hdr; zfs_btree_core_t *parent = hdr->bth_parent; size_t size = tree->bt_elem_size; uint32_t capacity = tree->bt_leaf_cap; /* * The invariant doesn't apply to the root node, if that's the only * node in the tree we're done. */ if (parent == NULL) { tree->bt_bulk = NULL; return; } /* First, take elements to rebalance the leaf node. */ if (hdr->bth_count < capacity / 2) { /* * First, find the left neighbor. The simplest way to do this * is to call zfs_btree_prev twice; the first time finds some * ancestor of this node, and the second time finds the left * neighbor. The ancestor found is the lowest common ancestor * of leaf and the neighbor. */ zfs_btree_index_t idx = { .bti_node = hdr, .bti_offset = 0 }; VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); ASSERT(zfs_btree_is_core(idx.bti_node)); zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; uint32_t common_idx = idx.bti_offset; VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); ASSERT(!zfs_btree_is_core(idx.bti_node)); zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; zfs_btree_hdr_t *l_hdr = idx.bti_node; uint32_t move_count = (capacity / 2) - hdr->bth_count; ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, capacity / 2); if (zfs_btree_verify_intensity >= 5) { for (uint32_t i = 0; i < move_count; i++) { zfs_btree_verify_poison_at(tree, hdr, leaf->btl_hdr.bth_count + i); } } /* First, shift elements in leaf back. */ bt_grow_leaf(tree, leaf, 0, move_count); /* Next, move the separator from the common ancestor to leaf. */ uint8_t *separator = common->btc_elems + common_idx * size; uint8_t *out = leaf->btl_elems + (hdr->bth_first + move_count - 1) * size; bcpy(separator, out, size); /* * Now we move elements from the tail of the left neighbor to * fill the remaining spots in leaf. */ bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - (move_count - 1), move_count - 1, leaf, 0); /* * Finally, move the new last element in the left neighbor to * the separator. */ bcpy(l_neighbor->btl_elems + (l_hdr->bth_first + l_hdr->bth_count - move_count) * size, separator, size); /* Adjust the node's counts, and we're done. */ bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count, move_count); ASSERT3U(l_hdr->bth_count, >=, capacity / 2); ASSERT3U(hdr->bth_count, >=, capacity / 2); } /* * Now we have to rebalance any ancestors of leaf that may also * violate the invariant. */ capacity = BTREE_CORE_ELEMS; while (parent->btc_hdr.bth_parent != NULL) { zfs_btree_core_t *cur = parent; zfs_btree_hdr_t *hdr = &cur->btc_hdr; parent = hdr->bth_parent; /* * If the invariant isn't violated, move on to the next * ancestor. */ if (hdr->bth_count >= capacity / 2) continue; /* * Because the smallest number of nodes we can move when * splitting is 2, we never need to worry about not having a * left sibling (a sibling is a neighbor with the same parent). */ uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); ASSERT3U(parent_idx, >, 0); zfs_btree_core_t *l_neighbor = (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; uint32_t move_count = (capacity / 2) - hdr->bth_count; ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, capacity / 2); if (zfs_btree_verify_intensity >= 5) { for (uint32_t i = 0; i < move_count; i++) { zfs_btree_verify_poison_at(tree, hdr, hdr->bth_count + i); } } /* First, shift things in the right node back. */ bt_shift_core(tree, cur, 0, hdr->bth_count, move_count, BSS_TRAPEZOID, BSD_RIGHT); /* Next, move the separator to the right node. */ uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * size); uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); bcpy(separator, e_out, size); /* * Now, move elements and children from the left node to the * right. We move one more child than elements. */ move_count--; uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, BSS_TRAPEZOID); /* * Finally, move the last element in the left node to the * separator's position. */ move_idx--; bcpy(l_neighbor->btc_elems + move_idx * size, separator, size); l_neighbor->btc_hdr.bth_count -= move_count + 1; hdr->bth_count += move_count + 1; ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2); ASSERT3U(hdr->bth_count, >=, capacity / 2); zfs_btree_poison_node(tree, &l_neighbor->btc_hdr); for (uint32_t i = 0; i <= hdr->bth_count; i++) cur->btc_children[i]->bth_parent = cur; } tree->bt_bulk = NULL; zfs_btree_verify(tree); } /* * Insert value into tree at the location specified by where. */ void zfs_btree_add_idx(zfs_btree_t *tree, const void *value, const zfs_btree_index_t *where) { zfs_btree_index_t idx = {0}; /* If we're not inserting in the last leaf, end bulk insert mode. */ if (tree->bt_bulk != NULL) { if (where->bti_node != &tree->bt_bulk->btl_hdr) { zfs_btree_bulk_finish(tree); VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL); where = &idx; } } tree->bt_num_elems++; /* * If this is the first element in the tree, create a leaf root node * and add the value to it. */ if (where->bti_node == NULL) { ASSERT3U(tree->bt_num_elems, ==, 1); ASSERT3S(tree->bt_height, ==, -1); ASSERT3P(tree->bt_root, ==, NULL); ASSERT0(where->bti_offset); tree->bt_num_nodes++; zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree); tree->bt_root = &leaf->btl_hdr; tree->bt_height++; zfs_btree_hdr_t *hdr = &leaf->btl_hdr; hdr->bth_parent = NULL; hdr->bth_first = 0; hdr->bth_count = 0; zfs_btree_poison_node(tree, hdr); zfs_btree_insert_into_leaf(tree, leaf, value, 0); tree->bt_bulk = leaf; } else if (!zfs_btree_is_core(where->bti_node)) { /* * If we're inserting into a leaf, go directly to the helper * function. */ zfs_btree_insert_into_leaf(tree, (zfs_btree_leaf_t *)where->bti_node, value, where->bti_offset); } else { /* * If we're inserting into a core node, we can't just shift * the existing element in that slot in the same node without * breaking our ordering invariants. Instead we place the new * value in the node at that spot and then insert the old * separator into the first slot in the subtree to the right. */ zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node; /* * We can ignore bti_before, because either way the value * should end up in bti_offset. */ uint32_t off = where->bti_offset; zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; size_t size = tree->bt_elem_size; uint8_t *buf = kmem_alloc(size, KM_SLEEP); bcpy(node->btc_elems + off * size, buf, size); bcpy(value, node->btc_elems + off * size, size); /* * Find the first slot in the subtree to the right, insert * there. */ zfs_btree_index_t new_idx; VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=, NULL); ASSERT0(new_idx.bti_offset); ASSERT(!zfs_btree_is_core(new_idx.bti_node)); zfs_btree_insert_into_leaf(tree, (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); kmem_free(buf, size); } zfs_btree_verify(tree); } /* * Return the first element in the tree, and put its location in where if * non-null. */ void * zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where) { if (tree->bt_height == -1) { ASSERT0(tree->bt_num_elems); return (NULL); } return (zfs_btree_first_helper(tree, tree->bt_root, where)); } /* * Find the last element in the subtree rooted at hdr, return its value and * put its location in where if non-null. */ static void * zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) { zfs_btree_hdr_t *node; for (node = hdr; zfs_btree_is_core(node); node = ((zfs_btree_core_t *)node)->btc_children[node->bth_count]) ; zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; if (where != NULL) { where->bti_node = node; where->bti_offset = node->bth_count - 1; where->bti_before = B_FALSE; } return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) * btree->bt_elem_size); } /* * Return the last element in the tree, and put its location in where if * non-null. */ void * zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where) { if (tree->bt_height == -1) { ASSERT0(tree->bt_num_elems); return (NULL); } return (zfs_btree_last_helper(tree, tree->bt_root, where)); } /* * This function contains the logic to find the next node in the tree. A * helper function is used because there are multiple internal consumemrs of * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each * node after we've finished with it. */ static void * zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, zfs_btree_index_t *out_idx, void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *)) { if (idx->bti_node == NULL) { ASSERT3S(tree->bt_height, ==, -1); return (NULL); } uint32_t offset = idx->bti_offset; if (!zfs_btree_is_core(idx->bti_node)) { /* * When finding the next element of an element in a leaf, * there are two cases. If the element isn't the last one in * the leaf, in which case we just return the next element in * the leaf. Otherwise, we need to traverse up our parents * until we find one where our ancestor isn't the last child * of its parent. Once we do, the next element is the * separator after our ancestor in its parent. */ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; uint32_t new_off = offset + (idx->bti_before ? 0 : 1); if (leaf->btl_hdr.bth_count > new_off) { out_idx->bti_node = &leaf->btl_hdr; out_idx->bti_offset = new_off; out_idx->bti_before = B_FALSE; return (leaf->btl_elems + (leaf->btl_hdr.bth_first + new_off) * tree->bt_elem_size); } zfs_btree_hdr_t *prev = &leaf->btl_hdr; for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; node != NULL; node = node->btc_hdr.bth_parent) { zfs_btree_hdr_t *hdr = &node->btc_hdr; ASSERT(zfs_btree_is_core(hdr)); uint32_t i = zfs_btree_find_parent_idx(tree, prev); if (done_func != NULL) done_func(tree, prev); if (i == hdr->bth_count) { prev = hdr; continue; } out_idx->bti_node = hdr; out_idx->bti_offset = i; out_idx->bti_before = B_FALSE; return (node->btc_elems + i * tree->bt_elem_size); } if (done_func != NULL) done_func(tree, prev); /* * We've traversed all the way up and been at the end of the * node every time, so this was the last element in the tree. */ return (NULL); } /* If we were before an element in a core node, return that element. */ ASSERT(zfs_btree_is_core(idx->bti_node)); zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; if (idx->bti_before) { out_idx->bti_before = B_FALSE; return (node->btc_elems + offset * tree->bt_elem_size); } /* * The next element from one in a core node is the first element in * the subtree just to the right of the separator. */ zfs_btree_hdr_t *child = node->btc_children[offset + 1]; return (zfs_btree_first_helper(tree, child, out_idx)); } /* * Return the next valued node in the tree. The same address can be safely * passed for idx and out_idx. */ void * zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx, zfs_btree_index_t *out_idx) { return (zfs_btree_next_helper(tree, idx, out_idx, NULL)); } /* * Return the previous valued node in the tree. The same value can be safely * passed for idx and out_idx. */ void * zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, zfs_btree_index_t *out_idx) { if (idx->bti_node == NULL) { ASSERT3S(tree->bt_height, ==, -1); return (NULL); } uint32_t offset = idx->bti_offset; if (!zfs_btree_is_core(idx->bti_node)) { /* * When finding the previous element of an element in a leaf, * there are two cases. If the element isn't the first one in * the leaf, in which case we just return the previous element * in the leaf. Otherwise, we need to traverse up our parents * until we find one where our previous ancestor isn't the * first child. Once we do, the previous element is the * separator after our previous ancestor. */ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; if (offset != 0) { out_idx->bti_node = &leaf->btl_hdr; out_idx->bti_offset = offset - 1; out_idx->bti_before = B_FALSE; return (leaf->btl_elems + (leaf->btl_hdr.bth_first + offset - 1) * tree->bt_elem_size); } zfs_btree_hdr_t *prev = &leaf->btl_hdr; for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; node != NULL; node = node->btc_hdr.bth_parent) { zfs_btree_hdr_t *hdr = &node->btc_hdr; ASSERT(zfs_btree_is_core(hdr)); uint32_t i = zfs_btree_find_parent_idx(tree, prev); if (i == 0) { prev = hdr; continue; } out_idx->bti_node = hdr; out_idx->bti_offset = i - 1; out_idx->bti_before = B_FALSE; return (node->btc_elems + (i - 1) * tree->bt_elem_size); } /* * We've traversed all the way up and been at the start of the * node every time, so this was the first node in the tree. */ return (NULL); } /* * The previous element from one in a core node is the last element in * the subtree just to the left of the separator. */ ASSERT(zfs_btree_is_core(idx->bti_node)); zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; zfs_btree_hdr_t *child = node->btc_children[offset]; return (zfs_btree_last_helper(tree, child, out_idx)); } /* * Get the value at the provided index in the tree. * * Note that the value returned from this function can be mutated, but only * if it will not change the ordering of the element with respect to any other * elements that could be in the tree. */ void * zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx) { ASSERT(!idx->bti_before); size_t size = tree->bt_elem_size; if (!zfs_btree_is_core(idx->bti_node)) { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; return (leaf->btl_elems + (leaf->btl_hdr.bth_first + idx->bti_offset) * size); } zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; return (node->btc_elems + idx->bti_offset * size); } /* Add the given value to the tree. Must not already be in the tree. */ void zfs_btree_add(zfs_btree_t *tree, const void *node) { zfs_btree_index_t where = {0}; VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL); zfs_btree_add_idx(tree, node, &where); } /* Helper function to free a tree node. */ static void zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) { tree->bt_num_nodes--; if (!zfs_btree_is_core(node)) { zfs_btree_leaf_free(tree, node); } else { kmem_free(node, sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * tree->bt_elem_size); } } /* * Remove the rm_hdr and the separator to its left from the parent node. The * buffer that rm_hdr was stored in may already be freed, so its contents * cannot be accessed. */ static void zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, zfs_btree_hdr_t *rm_hdr) { size_t size = tree->bt_elem_size; uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1; zfs_btree_hdr_t *hdr = &node->btc_hdr; /* * If the node is the root node and rm_hdr is one of two children, * promote the other child to the root. */ if (hdr->bth_parent == NULL && hdr->bth_count <= 1) { ASSERT3U(hdr->bth_count, ==, 1); ASSERT3P(tree->bt_root, ==, node); ASSERT3P(node->btc_children[1], ==, rm_hdr); tree->bt_root = node->btc_children[0]; node->btc_children[0]->bth_parent = NULL; zfs_btree_node_destroy(tree, hdr); tree->bt_height--; return; } uint32_t idx; for (idx = 0; idx <= hdr->bth_count; idx++) { if (node->btc_children[idx] == rm_hdr) break; } ASSERT3U(idx, <=, hdr->bth_count); /* * If the node is the root or it has more than the minimum number of * children, just remove the child and separator, and return. */ if (hdr->bth_parent == NULL || hdr->bth_count > min_count) { /* * Shift the element and children to the right of rm_hdr to * the left by one spot. */ bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, BSS_PARALLELOGRAM); hdr->bth_count--; zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1); return; } ASSERT3U(hdr->bth_count, ==, min_count); /* * Now we try to take a node from a neighbor. We check left, then * right. If the neighbor exists and has more than the minimum number * of elements, we move the separator between us and them to our * node, move their closest element (last for left, first for right) * to the separator, and move their closest child to our node. Along * the way we need to collapse the gap made by idx, and (for our right * neighbor) the gap made by removing their first element and child. * * Note: this logic currently doesn't support taking from a neighbor * that isn't a sibling (i.e. a neighbor with a different * parent). This isn't critical functionality, but may be worth * implementing in the future for completeness' sake. */ zfs_btree_core_t *parent = hdr->bth_parent; uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : parent->btc_children[parent_idx - 1]); if (l_hdr != NULL && l_hdr->bth_count > min_count) { /* We can take a node from the left neighbor. */ ASSERT(zfs_btree_is_core(l_hdr)); zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr; /* * Start by shifting the elements and children in the current * node to the right by one spot. */ bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID); /* * Move the separator between node and neighbor to the first * element slot in the current node. */ uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; bcpy(separator, node->btc_elems, size); /* Move the last child of neighbor to our first child slot. */ node->btc_children[0] = neighbor->btc_children[l_hdr->bth_count]; node->btc_children[0]->bth_parent = node; /* Move the last element of neighbor to the separator spot. */ uint8_t *take_elem = neighbor->btc_elems + (l_hdr->bth_count - 1) * size; bcpy(take_elem, separator, size); l_hdr->bth_count--; zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1); return; } zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? NULL : parent->btc_children[parent_idx + 1]); if (r_hdr != NULL && r_hdr->bth_count > min_count) { /* We can take a node from the right neighbor. */ ASSERT(zfs_btree_is_core(r_hdr)); zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr; /* * Shift elements in node left by one spot to overwrite rm_hdr * and the separator before it. */ bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, BSS_PARALLELOGRAM); /* * Move the separator between node and neighbor to the last * element spot in node. */ uint8_t *separator = parent->btc_elems + parent_idx * size; bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size, size); /* * Move the first child of neighbor to the last child spot in * node. */ node->btc_children[hdr->bth_count] = neighbor->btc_children[0]; node->btc_children[hdr->bth_count]->bth_parent = node; /* Move the first element of neighbor to the separator spot. */ uint8_t *take_elem = neighbor->btc_elems; bcpy(take_elem, separator, size); r_hdr->bth_count--; /* * Shift the elements and children of neighbor to cover the * stolen elements. */ bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count, BSS_TRAPEZOID); zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1); return; } /* * In this case, neither of our neighbors can spare an element, so we * need to merge with one of them. We prefer the left one, * arbitrarily. Move the separator into the leftmost merging node * (which may be us or the left neighbor), and then move the right * merging node's elements. Once that's done, we go back and delete * the element we're removing. Finally, go into the parent and delete * the right merging node and the separator. This may cause further * merging. */ zfs_btree_hdr_t *new_rm_hdr, *keep_hdr; uint32_t new_idx = idx; if (l_hdr != NULL) { keep_hdr = l_hdr; new_rm_hdr = hdr; new_idx += keep_hdr->bth_count + 1; } else { ASSERT3P(r_hdr, !=, NULL); keep_hdr = hdr; new_rm_hdr = r_hdr; parent_idx++; } ASSERT(zfs_btree_is_core(keep_hdr)); ASSERT(zfs_btree_is_core(new_rm_hdr)); zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr; zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr; if (zfs_btree_verify_intensity >= 5) { for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) { zfs_btree_verify_poison_at(tree, keep_hdr, keep_hdr->bth_count + i); } } /* Move the separator into the left node. */ uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size; uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; bcpy(separator, e_out, size); keep_hdr->bth_count++; /* Move all our elements and children into the left node. */ bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep, keep_hdr->bth_count, BSS_TRAPEZOID); uint32_t old_count = keep_hdr->bth_count; /* Update bookkeeping */ keep_hdr->bth_count += new_rm_hdr->bth_count; ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1); /* * Shift the element and children to the right of rm_hdr to * the left by one spot. */ ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr); bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx, BSS_PARALLELOGRAM); keep_hdr->bth_count--; /* Reparent all our children to point to the left node. */ zfs_btree_hdr_t **new_start = keep->btc_children + old_count - 1; for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) new_start[i]->bth_parent = keep; for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) { ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep); ASSERT3P(keep->btc_children[i], !=, rm_hdr); } zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1); new_rm_hdr->bth_count = 0; zfs_btree_remove_from_node(tree, parent, new_rm_hdr); zfs_btree_node_destroy(tree, new_rm_hdr); } /* Remove the element at the specific location. */ void zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) { size_t size = tree->bt_elem_size; zfs_btree_hdr_t *hdr = where->bti_node; uint32_t idx = where->bti_offset; ASSERT(!where->bti_before); if (tree->bt_bulk != NULL) { /* * Leave bulk insert mode. Note that our index would be * invalid after we correct the tree, so we copy the value * we're planning to remove and find it again after * bulk_finish. */ uint8_t *value = zfs_btree_get(tree, where); uint8_t *tmp = kmem_alloc(size, KM_SLEEP); bcpy(value, tmp, size); zfs_btree_bulk_finish(tree); VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL); kmem_free(tmp, size); hdr = where->bti_node; idx = where->bti_offset; } tree->bt_num_elems--; /* * If the element happens to be in a core node, we move a leaf node's * element into its place and then remove the leaf node element. This * makes the rebalance logic not need to be recursive both upwards and * downwards. */ if (zfs_btree_is_core(hdr)) { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; zfs_btree_hdr_t *left_subtree = node->btc_children[idx]; void *new_value = zfs_btree_last_helper(tree, left_subtree, where); ASSERT3P(new_value, !=, NULL); bcpy(new_value, node->btc_elems + idx * size, size); hdr = where->bti_node; idx = where->bti_offset; ASSERT(!where->bti_before); } /* * First, we'll update the leaf's metadata. Then, we shift any * elements after the idx to the left. After that, we rebalance if * needed. */ ASSERT(!zfs_btree_is_core(hdr)); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; ASSERT3U(hdr->bth_count, >, 0); uint32_t min_count = (tree->bt_leaf_cap / 2) - 1; /* * If we're over the minimum size or this is the root, just overwrite * the value and return. */ if (hdr->bth_count > min_count || hdr->bth_parent == NULL) { bt_shrink_leaf(tree, leaf, idx, 1); if (hdr->bth_parent == NULL) { ASSERT0(tree->bt_height); if (hdr->bth_count == 0) { tree->bt_root = NULL; tree->bt_height--; zfs_btree_node_destroy(tree, &leaf->btl_hdr); } } zfs_btree_verify(tree); return; } ASSERT3U(hdr->bth_count, ==, min_count); /* * Now we try to take a node from a sibling. We check left, then * right. If they exist and have more than the minimum number of * elements, we move the separator between us and them to our node * and move their closest element (last for left, first for right) to * the separator. Along the way we need to collapse the gap made by * idx, and (for our right neighbor) the gap made by removing their * first element. * * Note: this logic currently doesn't support taking from a neighbor * that isn't a sibling. This isn't critical functionality, but may be * worth implementing in the future for completeness' sake. */ zfs_btree_core_t *parent = hdr->bth_parent; uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : parent->btc_children[parent_idx - 1]); if (l_hdr != NULL && l_hdr->bth_count > min_count) { /* We can take a node from the left neighbor. */ ASSERT(!zfs_btree_is_core(l_hdr)); zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr; /* * Move our elements back by one spot to make room for the * stolen element and overwrite the element being removed. */ bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT); /* Move the separator to our first spot. */ uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size); /* Move our neighbor's last element to the separator. */ uint8_t *take_elem = neighbor->btl_elems + (l_hdr->bth_first + l_hdr->bth_count - 1) * size; bcpy(take_elem, separator, size); /* Delete our neighbor's last element. */ bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1); zfs_btree_verify(tree); return; } zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? NULL : parent->btc_children[parent_idx + 1]); if (r_hdr != NULL && r_hdr->bth_count > min_count) { /* We can take a node from the right neighbor. */ ASSERT(!zfs_btree_is_core(r_hdr)); zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr; /* * Move our elements after the element being removed forwards * by one spot to make room for the stolen element and * overwrite the element being removed. */ bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1, 1, BSD_LEFT); /* Move the separator between us to our last spot. */ uint8_t *separator = parent->btc_elems + parent_idx * size; bcpy(separator, leaf->btl_elems + (hdr->bth_first + hdr->bth_count - 1) * size, size); /* Move our neighbor's first element to the separator. */ uint8_t *take_elem = neighbor->btl_elems + r_hdr->bth_first * size; bcpy(take_elem, separator, size); /* Delete our neighbor's first element. */ bt_shrink_leaf(tree, neighbor, 0, 1); zfs_btree_verify(tree); return; } /* * In this case, neither of our neighbors can spare an element, so we * need to merge with one of them. We prefer the left one, arbitrarily. * After remove we move the separator into the leftmost merging node * (which may be us or the left neighbor), and then move the right * merging node's elements. Once that's done, we go back and delete * the element we're removing. Finally, go into the parent and delete * the right merging node and the separator. This may cause further * merging. */ zfs_btree_hdr_t *rm_hdr, *k_hdr; if (l_hdr != NULL) { k_hdr = l_hdr; rm_hdr = hdr; } else { ASSERT3P(r_hdr, !=, NULL); k_hdr = hdr; rm_hdr = r_hdr; parent_idx++; } ASSERT(!zfs_btree_is_core(k_hdr)); ASSERT(!zfs_btree_is_core(rm_hdr)); ASSERT3U(k_hdr->bth_count, ==, min_count); ASSERT3U(rm_hdr->bth_count, ==, min_count); zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr; zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr; if (zfs_btree_verify_intensity >= 5) { for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) { zfs_btree_verify_poison_at(tree, k_hdr, k_hdr->bth_count + i); } } /* * Remove the value from the node. It will go below the minimum, * but we'll fix it in no time. */ bt_shrink_leaf(tree, leaf, idx, 1); /* Prepare space for elements to be moved from the right. */ uint32_t k_count = k_hdr->bth_count; bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count); ASSERT3U(k_hdr->bth_count, ==, min_count * 2); /* Move the separator into the first open spot. */ uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size; uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; bcpy(separator, out, size); /* Move our elements to the left neighbor. */ bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1); /* Remove the emptied node from the parent. */ zfs_btree_remove_from_node(tree, parent, rm_hdr); zfs_btree_node_destroy(tree, rm_hdr); zfs_btree_verify(tree); } /* Remove the given value from the tree. */ void zfs_btree_remove(zfs_btree_t *tree, const void *value) { zfs_btree_index_t where = {0}; VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL); zfs_btree_remove_idx(tree, &where); } /* Return the number of elements in the tree. */ ulong_t zfs_btree_numnodes(zfs_btree_t *tree) { return (tree->bt_num_elems); } /* * This function is used to visit all the elements in the tree before * destroying the tree. This allows the calling code to perform any cleanup it * needs to do. This is more efficient than just removing the first element * over and over, because it removes all rebalancing. Once the destroy_nodes() * function has been called, no other btree operations are valid until it * returns NULL, which point the only valid operation is zfs_btree_destroy(). * * example: * * zfs_btree_index_t *cookie = NULL; * my_data_t *node; * * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) * free(node->ptr); * zfs_btree_destroy(tree); * */ void * zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie) { if (*cookie == NULL) { if (tree->bt_height == -1) return (NULL); *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP); return (zfs_btree_first(tree, *cookie)); } void *rval = zfs_btree_next_helper(tree, *cookie, *cookie, zfs_btree_node_destroy); if (rval == NULL) { tree->bt_root = NULL; tree->bt_height = -1; tree->bt_num_elems = 0; kmem_free(*cookie, sizeof (**cookie)); tree->bt_bulk = NULL; } return (rval); } static void zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { if (zfs_btree_is_core(hdr)) { zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr; for (uint32_t i = 0; i <= hdr->bth_count; i++) zfs_btree_clear_helper(tree, btc->btc_children[i]); } zfs_btree_node_destroy(tree, hdr); } void zfs_btree_clear(zfs_btree_t *tree) { if (tree->bt_root == NULL) { ASSERT0(tree->bt_num_elems); return; } zfs_btree_clear_helper(tree, tree->bt_root); tree->bt_num_elems = 0; tree->bt_root = NULL; tree->bt_num_nodes = 0; tree->bt_height = -1; tree->bt_bulk = NULL; } void zfs_btree_destroy(zfs_btree_t *tree) { ASSERT0(tree->bt_num_elems); ASSERT3P(tree->bt_root, ==, NULL); } /* Verify that every child of this node has the correct parent pointer. */ static void zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { if (!zfs_btree_is_core(hdr)) return; zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (uint32_t i = 0; i <= hdr->bth_count; i++) { VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); } } /* Verify that every node has the correct parent pointer. */ static void zfs_btree_verify_pointers(zfs_btree_t *tree) { if (tree->bt_height == -1) { VERIFY3P(tree->bt_root, ==, NULL); return; } VERIFY3P(tree->bt_root->bth_parent, ==, NULL); zfs_btree_verify_pointers_helper(tree, tree->bt_root); } /* * Verify that all the current node and its children satisfy the count * invariants, and return the total count in the subtree rooted in this node. */ static uint64_t zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { if (!zfs_btree_is_core(hdr)) { if (tree->bt_root != hdr && tree->bt_bulk && hdr != &tree->bt_bulk->btl_hdr) { VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1); } return (hdr->bth_count); } else { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; uint64_t ret = hdr->bth_count; if (tree->bt_root != hdr && tree->bt_bulk == NULL) VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); for (uint32_t i = 0; i <= hdr->bth_count; i++) { ret += zfs_btree_verify_counts_helper(tree, node->btc_children[i]); } return (ret); } } /* * Verify that all nodes satisfy the invariants and that the total number of * elements is correct. */ static void zfs_btree_verify_counts(zfs_btree_t *tree) { EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1); if (tree->bt_height == -1) { return; } VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==, tree->bt_num_elems); } /* * Check that the subtree rooted at this node has a uniform height. Returns * the number of nodes under this node, to help verify bt_num_nodes. */ static uint64_t zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, int32_t height) { if (!zfs_btree_is_core(hdr)) { VERIFY0(height); return (1); } zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; uint64_t ret = 1; for (uint32_t i = 0; i <= hdr->bth_count; i++) { ret += zfs_btree_verify_height_helper(tree, node->btc_children[i], height - 1); } return (ret); } /* * Check that the tree rooted at this node has a uniform height, and that the * bt_height in the tree is correct. */ static void zfs_btree_verify_height(zfs_btree_t *tree) { EQUIV(tree->bt_height == -1, tree->bt_root == NULL); if (tree->bt_height == -1) { return; } VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root, tree->bt_height), ==, tree->bt_num_nodes); } /* * Check that the elements in this node are sorted, and that if this is a core * node, the separators are properly between the subtrees they separaate and * that the children also satisfy this requirement. */ static void zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { size_t size = tree->bt_elem_size; if (!zfs_btree_is_core(hdr)) { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; for (uint32_t i = 1; i < hdr->bth_count; i++) { VERIFY3S(tree->bt_compar(leaf->btl_elems + (hdr->bth_first + i - 1) * size, leaf->btl_elems + (hdr->bth_first + i) * size), ==, -1); } return; } zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (uint32_t i = 1; i < hdr->bth_count; i++) { VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size, node->btc_elems + i * size), ==, -1); } for (uint32_t i = 0; i < hdr->bth_count; i++) { uint8_t *left_child_last = NULL; zfs_btree_hdr_t *left_child_hdr = node->btc_children[i]; if (zfs_btree_is_core(left_child_hdr)) { zfs_btree_core_t *left_child = (zfs_btree_core_t *)left_child_hdr; left_child_last = left_child->btc_elems + (left_child_hdr->bth_count - 1) * size; } else { zfs_btree_leaf_t *left_child = (zfs_btree_leaf_t *)left_child_hdr; left_child_last = left_child->btl_elems + (left_child_hdr->bth_first + left_child_hdr->bth_count - 1) * size; } int comp = tree->bt_compar(node->btc_elems + i * size, left_child_last); if (comp <= 0) { panic("btree: compar returned %d (expected 1) at " "%px %d: compar(%px, %px)", comp, node, i, node->btc_elems + i * size, left_child_last); } uint8_t *right_child_first = NULL; zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; if (zfs_btree_is_core(right_child_hdr)) { zfs_btree_core_t *right_child = (zfs_btree_core_t *)right_child_hdr; right_child_first = right_child->btc_elems; } else { zfs_btree_leaf_t *right_child = (zfs_btree_leaf_t *)right_child_hdr; right_child_first = right_child->btl_elems + right_child_hdr->bth_first * size; } comp = tree->bt_compar(node->btc_elems + i * size, right_child_first); if (comp >= 0) { panic("btree: compar returned %d (expected -1) at " "%px %d: compar(%px, %px)", comp, node, i, node->btc_elems + i * size, right_child_first); } } for (uint32_t i = 0; i <= hdr->bth_count; i++) zfs_btree_verify_order_helper(tree, node->btc_children[i]); } /* Check that all elements in the tree are in sorted order. */ static void zfs_btree_verify_order(zfs_btree_t *tree) { EQUIV(tree->bt_height == -1, tree->bt_root == NULL); if (tree->bt_height == -1) { return; } zfs_btree_verify_order_helper(tree, tree->bt_root); } #ifdef ZFS_DEBUG /* Check that all unused memory is poisoned correctly. */ static void zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { size_t size = tree->bt_elem_size; if (!zfs_btree_is_core(hdr)) { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; for (size_t i = 0; i < hdr->bth_first * size; i++) VERIFY3U(leaf->btl_elems[i], ==, 0x0f); size_t esize = tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems); for (size_t i = (hdr->bth_first + hdr->bth_count) * size; i < esize; i++) VERIFY3U(leaf->btl_elems[i], ==, 0x0f); } else { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (size_t i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size; i++) VERIFY3U(node->btc_elems[i], ==, 0x0f); for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { VERIFY3P(node->btc_children[i], ==, (zfs_btree_hdr_t *)BTREE_POISON); } for (uint32_t i = 0; i <= hdr->bth_count; i++) { zfs_btree_verify_poison_helper(tree, node->btc_children[i]); } } } #endif /* Check that unused memory in the tree is still poisoned. */ static void zfs_btree_verify_poison(zfs_btree_t *tree) { #ifdef ZFS_DEBUG if (tree->bt_height == -1) return; zfs_btree_verify_poison_helper(tree, tree->bt_root); #endif } void zfs_btree_verify(zfs_btree_t *tree) { if (zfs_btree_verify_intensity == 0) return; zfs_btree_verify_height(tree); if (zfs_btree_verify_intensity == 1) return; zfs_btree_verify_pointers(tree); if (zfs_btree_verify_intensity == 2) return; zfs_btree_verify_counts(tree); if (zfs_btree_verify_intensity == 3) return; zfs_btree_verify_order(tree); if (zfs_btree_verify_intensity == 4) return; zfs_btree_verify_poison(tree); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW, "Enable btree verification. Levels above 4 require ZFS be built " "with debugging"); -/* END CSTYLED */ diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 137fe487a997..64924bc4fa61 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -1,266 +1,264 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018 by Delphix. All rights reserved. * Copyright (c) 2023, Klara Inc. */ #include #include #include #include #include #include #include #include static unsigned int ddt_zap_default_bs = 15; static unsigned int ddt_zap_default_ibs = 15; #define DDT_ZAP_COMPRESS_BYTEORDER_MASK 0x80 #define DDT_ZAP_COMPRESS_FUNCTION_MASK 0x7f #define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) static size_t ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len) { uchar_t *version = dst++; int cpfunc = ZIO_COMPRESS_ZLE; zio_compress_info_t *ci = &zio_compress_table[cpfunc]; size_t c_len; ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */ /* Call compress function directly to avoid hole detection. */ abd_t sabd, dabd; abd_get_from_buf_struct(&sabd, (void *)src, s_len); abd_get_from_buf_struct(&dabd, dst, d_len); c_len = ci->ci_compress(&sabd, &dabd, s_len, d_len - 1, ci->ci_level); abd_free(&dabd); abd_free(&sabd); if (c_len == s_len) { cpfunc = ZIO_COMPRESS_OFF; memcpy(dst, src, s_len); } *version = cpfunc; if (ZFS_HOST_BYTEORDER) *version |= DDT_ZAP_COMPRESS_BYTEORDER_MASK; return (c_len + 1); } static void ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) { uchar_t version = *src++; int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK; if (zio_compress_table[cpfunc].ci_decompress == NULL) { memcpy(dst, src, d_len); return; } abd_t sabd, dabd; abd_get_from_buf_struct(&sabd, src, s_len); abd_get_from_buf_struct(&dabd, dst, d_len); VERIFY0(zio_decompress_data(cpfunc, &sabd, &dabd, s_len, d_len, NULL)); abd_free(&dabd); abd_free(&sabd); if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) != (ZFS_HOST_BYTEORDER != 0)) byteswap_uint64_array(dst, d_len); } static int ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) { zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY; if (prehash) flags |= ZAP_FLAG_PRE_HASHED_KEY; *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, ddt_zap_default_bs, ddt_zap_default_ibs, DMU_OT_NONE, 0, tx); if (*objectp == 0) return (SET_ERROR(ENOTSUP)); return (0); } static int ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) { return (zap_destroy(os, object, tx)); } static int ddt_zap_lookup(objset_t *os, uint64_t object, const ddt_key_t *ddk, void *phys, size_t psize) { uchar_t *cbuf; uint64_t one, csize; int error; error = zap_length_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, &one, &csize); if (error) return (error); ASSERT3U(one, ==, 1); ASSERT3U(csize, <=, psize + 1); cbuf = kmem_alloc(csize, KM_SLEEP); error = zap_lookup_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, 1, csize, cbuf); if (error == 0) ddt_zap_decompress(cbuf, phys, csize, psize); kmem_free(cbuf, csize); return (error); } static int ddt_zap_contains(objset_t *os, uint64_t object, const ddt_key_t *ddk) { return (zap_length_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, NULL, NULL)); } static void ddt_zap_prefetch(objset_t *os, uint64_t object, const ddt_key_t *ddk) { (void) zap_prefetch_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS); } static void ddt_zap_prefetch_all(objset_t *os, uint64_t object) { (void) zap_prefetch_object(os, object); } static int ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk, const void *phys, size_t psize, dmu_tx_t *tx) { const size_t cbuf_size = psize + 1; uchar_t *cbuf = kmem_alloc(cbuf_size, KM_SLEEP); uint64_t csize = ddt_zap_compress(phys, cbuf, psize, cbuf_size); int error = zap_update_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, 1, csize, cbuf, tx); kmem_free(cbuf, cbuf_size); return (error); } static int ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk, dmu_tx_t *tx) { return (zap_remove_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, tx)); } static int ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk, void *phys, size_t psize) { zap_cursor_t zc; zap_attribute_t *za; int error; za = zap_attribute_alloc(); if (*walk == 0) { /* * We don't want to prefetch the entire ZAP object, because * it can be enormous. Also the primary use of DDT iteration * is for scrubbing, in which case we will be issuing many * scrub I/Os for each ZAP block that we read in, so * reading the ZAP is unlikely to be the bottleneck. */ zap_cursor_init_noprefetch(&zc, os, object); } else { zap_cursor_init_serialized(&zc, os, object, *walk); } if ((error = zap_cursor_retrieve(&zc, za)) == 0) { uint64_t csize = za->za_num_integers; ASSERT3U(za->za_integer_length, ==, 1); ASSERT3U(csize, <=, psize + 1); uchar_t *cbuf = kmem_alloc(csize, KM_SLEEP); error = zap_lookup_uint64(os, object, (uint64_t *)za->za_name, DDT_KEY_WORDS, 1, csize, cbuf); ASSERT0(error); if (error == 0) { ddt_zap_decompress(cbuf, phys, csize, psize); *ddk = *(ddt_key_t *)za->za_name; } kmem_free(cbuf, csize); zap_cursor_advance(&zc); *walk = zap_cursor_serialize(&zc); } zap_cursor_fini(&zc); zap_attribute_free(za); return (error); } static int ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count) { return (zap_count(os, object, count)); } const ddt_ops_t ddt_zap_ops = { "zap", ddt_zap_create, ddt_zap_destroy, ddt_zap_lookup, ddt_zap_contains, ddt_zap_prefetch, ddt_zap_prefetch_all, ddt_zap_update, ddt_zap_remove, ddt_zap_walk, ddt_zap_count, }; -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_bs, UINT, ZMOD_RW, "DDT ZAP leaf blockshift"); ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_ibs, UINT, ZMOD_RW, "DDT ZAP indirect blockshift"); -/* END CSTYLED */ diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 4830f4850a31..32609399b79e 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1,2951 +1,2949 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2023, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif /* * Enable/disable nopwrite feature. */ static int zfs_nopwrite_enabled = 1; /* * Tunable to control percentage of dirtied L1 blocks from frees allowed into * one TXG. After this threshold is crossed, additional dirty blocks from frees * will wait until the next TXG. * A value of zero will disable this throttle. */ static uint_t zfs_per_txg_dirty_frees_percent = 30; /* * Enable/disable forcing txg sync when dirty checking for holes with lseek(). * By default this is enabled to ensure accurate hole reporting, it can result * in a significant performance penalty for lseek(SEEK_HOLE) heavy workloads. * Disabling this option will result in holes never being reported in dirty * files which is always safe. */ static int zfs_dmu_offset_next_sync = 1; /* * Limit the amount we can prefetch with one call to this amount. This * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. */ #ifdef _ILP32 uint_t dmu_prefetch_max = 8 * 1024 * 1024; #else uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; #endif /* * Override copies= for dedup state objects. 0 means the traditional behaviour * (ie the default for the containing objset ie 3 for the MOS). */ uint_t dmu_ddt_copies = 0; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" }, {DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" }, {DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" }, {DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" }, {DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" }, {DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" }, {DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history offsets" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" }, {DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project used" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project quota"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" } }; dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" }, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (0); } int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (err); } int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_bonus_max(void) { return (DN_OLD_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; if (newsize < 0 || newsize > db_fake->db_size) return (SET_ERROR(EINVAL)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; if (!DMU_OT_IS_VALID(type)) return (SET_ERROR(EINVAL)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_object_type_t type; DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * Lookup and hold the bonus buffer for the provided dnode. If the dnode * has not yet been allocated a new bonus dbuf a will be allocated. * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp, uint32_t flags) { dmu_buf_impl_t *db; int error; uint32_t db_flags = DB_RF_MUST_SUCCEED; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); } if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (zfs_refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); atomic_inc_32(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf. */ rw_exit(&dn->dn_struct_rwlock); error = dbuf_read(db, NULL, db_flags); if (error) { dnode_evict_bonus(dn); dbuf_rele(db, tag); *dbp = NULL; return (error); } *dbp = &db->db; return (0); } int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); error = dmu_bonus_hold_by_dnode(dn, tag, dbp, DMU_READ_NO_PREFETCH); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * if you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. */ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; else { dbuf_rele(db, tag); *dbp = NULL; } return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = SET_ERROR(EINVAL); } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = SET_ERROR(ENOENT); } else { err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; int err; uint32_t db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; DB_DNODE_ENTER(db); err = dmu_spill_hold_by_dnode(DB_DNODE(db), db_flags, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; zstream_t *zs = NULL; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio = NULL; boolean_t missed = B_FALSE; ASSERT(!read || length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that * we can tell it about the multi-block read. dbuf_read() only knows * about the one block it is accessing. */ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; if ((flags & DMU_READ_NO_DECRYPT) != 0) dbuf_flags |= DB_RF_NO_DECRYPT; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - P2ALIGN_TYPED(offset, 1ULL << blkshift, uint64_t)) >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); if (read) zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); if ((flags & DMU_READ_NO_PREFETCH) == 0) { /* * Prepare the zfetch before initiating the demand reads, so * that if multiple threads block on same indirect block, we * base predictions on the original less racy request order. */ zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read, B_TRUE); } for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { if (zs) { dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE); } rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); if (read) zio_nowait(zio); return (SET_ERROR(EIO)); } /* * Initiate async demand data read. * We check the db_state after calling dbuf_read() because * (1) dbuf_read() may change the state to CACHED due to a * hit in the ARC, and (2) on a cache miss, a child will * have been added to "zio" but not yet completed, so the * state will not yet be CACHED. */ if (read) { if (i == nblks - 1 && blkid + i < dn->dn_maxblkid && offset + length < db->db.db_offset + db->db.db_size) { if (offset <= db->db.db_offset) dbuf_flags |= DB_RF_PARTIAL_FIRST; else dbuf_flags |= DB_RF_PARTIAL_MORE; } (void) dbuf_read(db, zio, dbuf_flags); if (db->db_state != DB_CACHED) missed = B_TRUE; } dbp[i] = &db->db; } /* * If we are doing O_DIRECT we still hold the dbufs, even for reads, * but we do not issue any reads here. We do not want to account for * writes in this case. * * O_DIRECT write/read accounting takes place in * dmu_{write/read}_abd(). */ if (!read && ((flags & DMU_DIRECTIO) == 0)) zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags); if (zs) dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); if (read) { /* wait for async read i/o */ err = zio_wait(zio); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; int err; DB_DNODE_ENTER(db); err = dmu_buf_hold_array_by_dnode(DB_DNODE(db), offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /* * Issue prefetch I/Os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. If the range * is too long, prefetch the first dmu_prefetch_max bytes as requested, while * for the rest only a higher level, also fitting within dmu_prefetch_max. It * should primarily help random reads, since for long sequential reads there is * a speculative prefetcher. * * Note that if the indirect blocks above the blocks being prefetched are not * in cache, they will be asynchronously read in. Dnode read by dnode_hold() * is currently synchronous. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; if (dmu_prefetch_max == 0 || len == 0) { dmu_prefetch_dnode(os, object, pri); return; } if (dnode_hold(os, object, FTAG, &dn) != 0) return; dmu_prefetch_by_dnode(dn, level, offset, len, pri); dnode_rele(dn, FTAG); } void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { int64_t level2 = level; uint64_t start, end, start2, end2; /* * Depending on len we may do two prefetches: blocks [start, end) at * level, and following blocks [start2, end2) at higher level2. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift != 0) { /* * The object has multiple blocks. Calculate the full range * of blocks [start, end2) and then split it into two parts, * so that the first [start, end) fits into dmu_prefetch_max. */ start = dbuf_whichblock(dn, level, offset); end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1; uint8_t ibs = dn->dn_indblkshift; uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs; uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs; start2 = end = MIN(end2, start + limit); /* * Find level2 where [start2, end2) fits into dmu_prefetch_max. */ uint8_t ibps = ibs - SPA_BLKPTRSHIFT; limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; do { level2++; start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps; } while (end2 - start2 > limit); } else { /* There is only one block. Prefetch it or nothing. */ start = start2 = end2 = 0; end = start + (level == 0 && offset < dn->dn_datablksz); } for (uint64_t i = start; i < end; i++) dbuf_prefetch(dn, level, i, pri, 0); for (uint64_t i = start2; i < end2; i++) dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); } typedef struct { kmutex_t dpa_lock; kcondvar_t dpa_cv; uint64_t dpa_pending_io; } dmu_prefetch_arg_t; static void dmu_prefetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t issued) { (void) level; (void) blkid; (void)issued; dmu_prefetch_arg_t *dpa = arg; ASSERT0(level); mutex_enter(&dpa->dpa_lock); ASSERT3U(dpa->dpa_pending_io, >, 0); if (--dpa->dpa_pending_io == 0) cv_broadcast(&dpa->dpa_cv); mutex_exit(&dpa->dpa_lock); } static void dmu_prefetch_wait_by_dnode(dnode_t *dn, uint64_t offset, uint64_t len) { dmu_prefetch_arg_t dpa; mutex_init(&dpa.dpa_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dpa.dpa_cv, NULL, CV_DEFAULT, NULL); rw_enter(&dn->dn_struct_rwlock, RW_READER); uint64_t start = dbuf_whichblock(dn, 0, offset); uint64_t end = dbuf_whichblock(dn, 0, offset + len - 1) + 1; dpa.dpa_pending_io = end - start; for (uint64_t blk = start; blk < end; blk++) { (void) dbuf_prefetch_impl(dn, 0, blk, ZIO_PRIORITY_ASYNC_READ, 0, dmu_prefetch_done, &dpa); } rw_exit(&dn->dn_struct_rwlock); /* wait for prefetch L0 reads to finish */ mutex_enter(&dpa.dpa_lock); while (dpa.dpa_pending_io > 0) { cv_wait(&dpa.dpa_cv, &dpa.dpa_lock); } mutex_exit(&dpa.dpa_lock); mutex_destroy(&dpa.dpa_lock); cv_destroy(&dpa.dpa_cv); } /* * Issue prefetch I/Os for the given L0 block range and wait for the I/O * to complete. This does not enforce dmu_prefetch_max and will prefetch * the entire range. The blocks are read from disk into the ARC but no * decompression occurs (i.e., the dbuf cache is not required). */ int dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset, uint64_t size) { dnode_t *dn; int err = 0; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); /* * Chunk the requests (16 indirects worth) so that we can be interrupted */ uint64_t chunksize; if (dn->dn_indblkshift) { uint64_t nbps = bp_span_in_blocks(dn->dn_indblkshift, 1); chunksize = (nbps * 16) << dn->dn_datablkshift; } else { chunksize = dn->dn_datablksz; } while (size > 0) { uint64_t mylen = MIN(size, chunksize); dmu_prefetch_wait_by_dnode(dn, offset, mylen); offset += mylen; size -= mylen; if (issig()) { err = SET_ERROR(EINTR); break; } } dnode_rele(dn, FTAG); return (err); } /* * Issue prefetch I/Os for the given object's dnode. */ void dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) { if (object == 0 || object >= DN_MAX_OBJECT) return; dnode_t *dn = DMU_META_DNODE(os); rw_enter(&dn->dn_struct_rwlock, RW_READER); uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, 0, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crash in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first * offset that should be freed and l1blks is set to the number of level 1 * indirect blocks found within the chunk. */ static int get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) { uint64_t blks; uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = (uint64_t)dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ASSERT3U(minimum, <=, *start); /* dn_nlevels == 1 means we don't have any L1 blocks */ if (dn->dn_nlevels <= 1) { *l1blks = 0; *start = minimum; return (0); } /* * Check if we can free the entire range assuming that all of the * L1 blocks in this range have data. If we can, we use this * worst case value as an estimate so we can avoid having to look * at the object's actual data. */ uint64_t total_l1blks = (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) / iblkrange; if (total_l1blks <= maxblks) { *l1blks = total_l1blks; *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); for (blks = 0; *start > minimum && blks < maxblks; blks++) { int err; /* * dnode_next_offset(BACKWARDS) will find an allocated L1 * indirect block at or before the input offset. We must * decrement *start so that it is at the end of the region * to search. */ (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { *start = minimum; break; } else if (err != 0) { *l1blks = blks; return (err); } /* set start to the beginning of this L1 indirect */ *start = P2ALIGN_TYPED(*start, iblkrange, uint64_t); } if (*start < minimum) *start = minimum; *l1blks = blks; return (0); } /* * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set, * otherwise return false. * Used below in dmu_free_long_range_impl() to enable abort when unmounting */ static boolean_t dmu_objset_zfs_unmounting(objset_t *os) { #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) return (zfs_get_vfs_flag_unmounted(os)); #else (void) os; #endif return (B_FALSE); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length) { uint64_t object_size; int err; uint64_t dirty_frees_threshold; dsl_pool_t *dp = dmu_objset_pool(os); if (dn == NULL) return (SET_ERROR(EINVAL)); object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; if (offset >= object_size) return (0); if (zfs_per_txg_dirty_frees_percent <= 100) dirty_frees_threshold = zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; else dirty_frees_threshold = zfs_dirty_data_max / 20; if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin, chunk_len; uint64_t l1blks; dmu_tx_t *tx; if (dmu_objset_zfs_unmounting(dn->dn_objset)) return (SET_ERROR(EINTR)); chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ err = get_next_chunk(dn, &chunk_begin, offset, &l1blks); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); chunk_len = chunk_end - chunk_begin; tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); /* * Mark this transaction as typically resulting in a net * reduction in space used. */ dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } uint64_t txg = dmu_tx_get_txg(tx); mutex_enter(&dp->dp_lock); uint64_t long_free_dirty = dp->dp_long_free_dirty_pertxg[txg & TXG_MASK]; mutex_exit(&dp->dp_lock); /* * To avoid filling up a TXG with just frees, wait for * the next TXG to open before freeing more chunks if * we have reached the threshold of frees. */ if (dirty_frees_threshold != 0 && long_free_dirty >= dirty_frees_threshold) { DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay); dmu_tx_commit(tx); txg_wait_open(dp, 0, B_TRUE); continue; } /* * In order to prevent unnecessary write throttling, for each * TXG, we track the cumulative size of L1 blocks being dirtied * in dnode_free_range() below. We compare this number to a * tunable threshold, past which we prevent new L1 dirty freeing * blocks from being added into the open TXG. See * dmu_free_long_range_impl() for details. The threshold * prevents write throttle activation due to dirty freeing L1 * blocks taking up a large percentage of zfs_dirty_data_max. */ mutex_enter(&dp->dp_lock); dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] += l1blks << dn->dn_indblkshift; mutex_exit(&dp->dp_lock); DTRACE_PROBE3(free__long__range, uint64_t, long_free_dirty, uint64_t, chunk_len, uint64_t, txg); dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx); length -= chunk_len; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length); /* * It is important to zero out the maxblkid when freeing the entire * file, so that (a) subsequent calls to dmu_free_long_range_impl() * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. */ if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); return (err); } int dmu_free_long_object(objset_t *os, uint64_t object) { dmu_tx_t *tx; int err; err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } static int dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dmu_buf_t **dbp; int numbufs, err = 0; /* * Deal with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { uint64_t newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); memset((char *)buf + newsz, 0, size - newsz); size = newsz; } if (size == 0) return (0); /* Allow Direct I/O when requested and properly aligned */ if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned(buf) && zfs_dio_aligned(offset, size, PAGESIZE)) { abd_t *data = abd_get_from_buf(buf, size); err = dmu_read_abd(dn, offset, size, data, flags); abd_free(data); return (err); } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(db->db_data != NULL); (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } return (err); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_read_impl(dn, offset, size, buf, flags); dnode_rele(dn, FTAG); return (err); } int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { return (dmu_read_impl(dn, offset, size, buf, flags)); } static void dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { int i; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx, B_FALSE); else dmu_buf_will_dirty(db, tx); ASSERT(db->db_data != NULL); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx, B_FALSE); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } /* * This interface is not used internally by ZFS but is provided for * use by Lustre which is built on the DMU interfaces. */ int dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx, uint32_t flags) { dmu_buf_t **dbp; int numbufs; int error; if (size == 0) return (0); /* Allow Direct I/O when requested and properly aligned */ if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) && zfs_dio_aligned(offset, size, dn->dn_datablksz)) { abd_t *data = abd_get_from_buf((void *)buf, size); error = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx); abd_free(data); return (error); } VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); return (0); } int dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { return (dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0)); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &db)); dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, uncompressed_size, compressed_size, byteorder, tx); dmu_buf_rele(db, FTAG); } void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { int numbufs, i; dmu_buf_t **dbp; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) dmu_buf_redact(dbp[i], tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } #ifdef _KERNEL int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; if (uio->uio_extflg & UIO_DIRECT) return (dmu_read_uio_direct(dn, uio, size)); /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(db->db_data != NULL); err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. * Starting at zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); err = dmu_read_uio_dnode(DB_DNODE(db), uio, size); DB_DNODE_EXIT(db); return (err); } /* * Read 'size' bytes into the uio buffer. * From the specified object * Starting at offset zfs_uio_offset(uio). */ int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_read_uio_dnode(dn, uio, size); dnode_rele(dn, FTAG); return (err); } int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; uint64_t write_size; top: write_size = size; /* * We only allow Direct I/O writes to happen if we are block * sized aligned. Otherwise, we pass the write off to the ARC. */ if ((uio->uio_extflg & UIO_DIRECT) && (write_size >= dn->dn_datablksz)) { if (zfs_dio_aligned(zfs_uio_offset(uio), write_size, dn->dn_datablksz)) { return (dmu_write_uio_direct(dn, uio, size, tx)); } else if (write_size > dn->dn_datablksz && zfs_dio_offset_aligned(zfs_uio_offset(uio), dn->dn_datablksz)) { write_size = dn->dn_datablksz * (write_size / dn->dn_datablksz); err = dmu_write_uio_direct(dn, uio, write_size, tx); if (err == 0) { size -= write_size; goto top; } else { return (err); } } else { write_size = P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz); } } err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (int i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(write_size > 0); offset_t off = zfs_uio_offset(uio); bufoff = off - db->db_offset; tocpy = MIN(db->db_size - bufoff, write_size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx, B_TRUE); else dmu_buf_will_dirty(db, tx); ASSERT(db->db_data != NULL); err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) { /* The fill was reverted. Undo any uio progress. */ zfs_uio_advance(uio, off - zfs_uio_offset(uio)); } if (err) break; write_size -= tocpy; size -= tocpy; } IMPLY(err == 0, write_size == 0); dmu_buf_rele_array(dbp, numbufs, FTAG); if ((uio->uio_extflg & UIO_DIRECT) && size > 0) { goto top; } return (err); } /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. * Starting at offset zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); err = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx); DB_DNODE_EXIT(db); return (err); } /* * Write 'size' bytes from the uio buffer. * To the specified object. * Starting at offset zfs_uio_offset(uio). */ int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); return (err); } #endif /* _KERNEL */ static void dmu_cached_bps(spa_t *spa, blkptr_t *bps, uint_t nbps, uint64_t *l1sz, uint64_t *l2sz) { int cached_flags; if (bps == NULL) return; for (size_t blk_off = 0; blk_off < nbps; blk_off++) { blkptr_t *bp = &bps[blk_off]; if (BP_IS_HOLE(bp)) continue; cached_flags = arc_cached(spa, bp); if (cached_flags == 0) continue; if ((cached_flags & (ARC_CACHED_IN_L1 | ARC_CACHED_IN_L2)) == ARC_CACHED_IN_L2) *l2sz += BP_GET_LSIZE(bp); else *l1sz += BP_GET_LSIZE(bp); } } /* * Estimate DMU object cached size. */ int dmu_object_cached_size(objset_t *os, uint64_t object, uint64_t *l1sz, uint64_t *l2sz) { dnode_t *dn; dmu_object_info_t doi; int err = 0; *l1sz = *l2sz = 0; if (dnode_hold(os, object, FTAG, &dn) != 0) return (0); if (dn->dn_nlevels < 2) { dnode_rele(dn, FTAG); return (0); } dmu_object_info_from_dnode(dn, &doi); for (uint64_t off = 0; off < doi.doi_max_offset; off += dmu_prefetch_max) { /* dbuf_read doesn't prefetch L1 blocks. */ dmu_prefetch_by_dnode(dn, 1, off, dmu_prefetch_max, ZIO_PRIORITY_SYNC_READ); } /* * Hold all valid L1 blocks, asking ARC the status of each BP * contained in each such L1 block. */ uint_t nbps = bp_span_in_blocks(dn->dn_indblkshift, 1); uint64_t l1blks = 1 + (dn->dn_maxblkid / nbps); rw_enter(&dn->dn_struct_rwlock, RW_READER); for (uint64_t blk = 0; blk < l1blks; blk++) { dmu_buf_impl_t *db = NULL; if (issig()) { /* * On interrupt, get out, and bubble up EINTR */ err = EINTR; break; } /* * If we get an i/o error here, the L1 can't be read, * and nothing under it could be cached, so we just * continue. Ignoring the error from dbuf_hold_impl * or from dbuf_read is then a reasonable choice. */ err = dbuf_hold_impl(dn, 1, blk, B_TRUE, B_FALSE, FTAG, &db); if (err != 0) { /* * ignore error and continue */ err = 0; continue; } err = dbuf_read(db, NULL, DB_RF_CANFAIL); if (err == 0) { dmu_cached_bps(dmu_objset_spa(os), db->db.db_data, nbps, l1sz, l2sz); } /* * error may be ignored, and we continue */ err = 0; dbuf_rele(db, FTAG); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (err); } /* * Allocate a loaned anonymous arc buffer. */ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); } /* * Free a loaned arc buffer. */ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); arc_buf_destroy(buf, FTAG); } /* * A "lightweight" write is faster than a regular write (e.g. * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t. However, the * data can not be read or overwritten until the transaction's txg has been * synced. This makes it appropriate for workloads that are known to be * (temporarily) write-only, like "zfs receive". * * A single block is written, starting at the specified offset in bytes. If * the call is successful, it returns 0 and the provided abd has been * consumed (the caller should not free it). */ int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, const zio_prop_t *zp, zio_flag_t flags, dmu_tx_t *tx) { dbuf_dirty_record_t *dr = dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx); if (dr == NULL) return (SET_ERROR(EIO)); dr->dt.dll.dr_abd = abd; dr->dt.dll.dr_props = *zp; dr->dt.dll.dr_flags = flags; return (0); } /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *db; objset_t *os = dn->dn_objset; uint64_t object = dn->dn_object; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, FTAG); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); /* * We can only assign if the offset is aligned and the arc buf is the * same size as the dbuf. */ if (offset == db->db.db_offset && blksz == db->db.db_size) { zfs_racct_write(os->os_spa, blksz, 1, 0); dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { /* compressed bufs must always be assignable to their dbuf */ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); } return (0); } int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { int err; dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; DB_DNODE_ENTER(db); err = dmu_assign_arcbuf_by_dnode(DB_DNODE(db), offset, buf, tx); DB_DNODE_EXIT(db); return (err); } void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; dmu_sync_arg_t *dsa = varg; if (zio->io_error == 0) { dbuf_dirty_record_t *dr = dsa->dsa_dr; blkptr_t *bp = zio->io_bp; if (BP_IS_HOLE(bp)) { dmu_buf_t *db = NULL; if (dr) db = &(dr->dr_dbuf->db); else db = dsa->dsa_zgd->zgd_db; /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); BP_SET_FILL(bp, 1); } } } static void dmu_sync_late_arrival_ready(zio_t *zio) { dmu_sync_ready(zio, NULL, zio->io_private); } void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; zgd_t *zgd = dsa->dsa_zgd; /* * Record the vdev(s) backing this blkptr so they can be flushed after * the writes for the lwb have completed. */ if (zgd && zio->io_error == 0) { zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); } mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { ASSERT0(dr->dt.dl.dr_has_raw_params); dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint8_t chksum = BP_GET_CHECKSUM(bp_orig); ASSERT(BP_EQUAL(bp, bp_orig)); VERIFY(BP_EQUAL(bp, db->db_blkptr)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); VERIFY(zio_checksum_table[chksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; /* * Old style holes are filled with all zeros, whereas * new-style holes maintain their lsize, type, level, * and birth time (see zio_write_compress). While we * need to reset the BP_SET_LSIZE() call that happened * in dmu_sync_ready for old style holes, we do *not* * want to wipe out the information contained in new * style holes. Thus, only zero out the block pointer if * it's an old style hole. */ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && BP_GET_LOGICAL_BIRTH(&dr->dt.dl.dr_overridden_by) == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); if (dsa->dsa_done) dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static void dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; zgd_t *zgd = dsa->dsa_zgd; if (zio->io_error == 0) { /* * Record the vdev(s) backing this blkptr so they can be * flushed after the writes for the lwb have completed. */ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); if (!BP_IS_HOLE(bp)) { blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } } dmu_tx_commit(dsa->dsa_tx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); abd_free(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; int error; error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL, DB_RF_CANFAIL | DB_RF_NOPREFETCH); if (error != 0) return (error); tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); /* * This transaction does not produce any dirty data or log blocks, so * it should not be throttled. All other cases wait for TXG sync, by * which time the log block we are writing will be obsolete, so we can * skip waiting and just return error here instead. */ if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); } /* * In order to prevent the zgd's lwb from being free'd prior to * dmu_sync_late_arrival_done() being called, we have to ensure * the lwb's "max txg" takes this tx's txg into account. */ zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; /* * Since we are currently syncing this txg, it's nontrivial to * determine what BP to nopwrite against, so we disable nopwrite. * * When syncing, the db_blkptr is initially the BP of the previous * txg. We can not nopwrite against it because it will be changed * (this is similar to the non-late-arrival case where the dbuf is * dirty in a future txg). * * Then dbuf_write_ready() sets bp_blkptr to the location we will write. * We can not nopwrite against it because although the BP will not * (typically) be changed, the data has not yet been persisted to this * location. * * Finally, when dbuf_write_done() is called, it is theoretically * possible to always nopwrite, because the data that was written in * this txg is the same data that we are trying to write. However we * would need to check that this dbuf is not dirty in any future * txg's (as we do in the normal dmu_sync() path). For simplicity, we * don't nopwrite in this case. */ zp->zp_nopwrite = B_FALSE; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } /* * Intent log support: sync the block associated with db to disk. * N.B. and XXX: the caller is responsible for making sure that the * data isn't changing while dmu_sync() is writing it. * * Return values: * * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. * The caller should not log the write. * * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * * EIO: could not do the I/O. * The caller should do a txg_wait_synced(). * * 0: the I/O has been initiated. * The caller should log this blkptr in the done callback. * It is possible that the I/O will fail, in which case * the error will be reported to the done callback and * propagated to pio from zio_done(). */ int dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr, *dr_next; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; ASSERT(pio != NULL); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); dmu_write_policy(os, DB_DNODE(db), db->db_level, WP_DMU_SYNC, &zp); DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. */ if (txg > spa_freeze_txg(os->os_spa)) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() * and us. If we determine that this txg is not yet syncing, * but it begins to sync a moment later, that's OK because the * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ mutex_enter(&db->db_mtx); if (txg <= spa_last_synced_txg(os->os_spa)) { /* * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EEXIST)); } if (txg <= spa_syncing_txg(os->os_spa)) { /* * This txg is currently syncing, so we can't mess with * the dirty record anymore; just write a new log block. */ mutex_exit(&db->db_mtx); return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) { /* * There's no dr for this dbuf, so it must have been freed. * There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } dr_next = list_next(&db->db_dirty_records, dr); ASSERT(dr_next == NULL || dr_next->dr_txg < txg); if (db->db_blkptr != NULL) { /* * We need to fill in zgd_bp with the current blkptr so that * the nopwrite code can check if we're writing the same * data that's already on disk. We can only nopwrite if we * are sure that after making the copy, db_blkptr will not * change until our i/o completes. We ensure this by * holding the db_mtx, and only allowing nopwrite if the * block is not already dirty (see below). This is verified * by dmu_sync_done(), which VERIFYs that the db_blkptr has * not changed. */ *zgd->zgd_bp = *db->db_blkptr; } /* * Assume the on-disk data is X, the current syncing data (in * txg - 1) is Y, and the current in-memory data is Z (currently * in dmu_sync). * * We usually want to perform a nopwrite if X and Z are the * same. However, if Y is different (i.e. the BP is going to * change before this write takes effect), then a nopwrite will * be incorrect - we would override with X, which could have * been freed when Y was written. * * (Note that this is not a concern when we are nop-writing from * syncing context, because X and Y must be identical, because * all previous txgs have been synced.) * * Therefore, we disable nopwrite if the current BP could change * before this TXG. There are two ways it could change: by * being dirty (dr_next is non-NULL), or by being freed * (dnode_block_freed()). This behavior is verified by * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. */ if (dr_next != NULL) { zp.zp_nopwrite = B_FALSE; } else { DB_DNODE_ENTER(db); if (dnode_block_freed(DB_DNODE(db), db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); } ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * We have already issued a sync write for this buffer, * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EALREADY)); } ASSERT0(dr->dt.dl.dr_has_raw_params); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db, NULL), &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); } int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_nlevels(dn, nlevels, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (0); } void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's checksum function. This * check ensures that the receiving system can understand the * checksum function transmitted. */ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's compression function. This * check ensures that the receiving system can understand the * compression function transmitted. */ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } /* * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. */ static const int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; uint8_t complevel = os->os_complevel; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; boolean_t encrypt = B_FALSE; int copies = os->os_copies; /* * We maintain different write policies for each of the following * types of data: * 1. metadata * 2. preallocated blocks (i.e. level-0 blocks of a dump device) * 3. all other level 0 blocks */ if (ismd) { /* * XXX -- we should design a compression algorithm * that specializes in arrays of bps. */ compress = zio_compress_select(os->os_spa, ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_METADATA) || (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; switch (os->os_redundant_metadata) { case ZFS_REDUNDANT_METADATA_ALL: copies++; break; case ZFS_REDUNDANT_METADATA_MOST: if (level >= zfs_redundant_metadata_most_ditto_level || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)) copies++; break; case ZFS_REDUNDANT_METADATA_SOME: if (DMU_OT_IS_CRITICAL(type)) copies++; break; case ZFS_REDUNDANT_METADATA_NONE: break; } if (dmu_ddt_copies > 0) { /* * If this tuneable is set, and this is a write for a * dedup entry store (zap or log), then we treat it * something like ZFS_REDUNDANT_METADATA_MOST on a * regular dataset: this many copies, and one more for * "higher" indirect blocks. This specific exception is * necessary because dedup objects are stored in the * MOS, which always has the highest possible copies. */ dmu_object_type_t stype = dn ? dn->dn_storage_type : DMU_OT_NONE; if (stype == DMU_OT_NONE) stype = type; if (stype == DMU_OT_DDT_ZAP) { copies = dmu_ddt_copies; if (level >= zfs_redundant_metadata_most_ditto_level) copies++; } } } else if (wp & WP_NOFILL) { ASSERT(level == 0); /* * If we're writing preallocated blocks, we aren't actually * writing them so don't set any policy properties. These * blocks are currently only used by an external subsystem * outside of zfs (i.e. dump) and not written by the zio * pipeline. */ compress = ZIO_COMPRESS_OFF; checksum = ZIO_CHECKSUM_OFF; } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); complevel = zio_complevel_select(os->os_spa, compress, complevel, complevel); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : dedup_checksum; /* * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the * dedup checksum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } /* * Enable nopwrite if we have secure enough checksum * algorithm (see comment in zio_nop_write) and * compression is enabled. We don't enable nopwrite if * dedup is enabled as the two features are mutually * exclusive. */ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } /* * All objects in an encrypted objset are protected from modification * via a MAC. Encrypted objects store their IV and salt in the last DVA * in the bp, so we cannot use all copies. Encrypted objects are also * not subject to nopwrite since writing the same data will still * result in a new ciphertext. Only encrypted blocks can be dedup'd * to avoid ambiguity in the dedup code since the DDT does not store * object types. */ if (os->os_encrypted && (wp & WP_NOFILL) == 0) { encrypt = B_TRUE; if (DMU_OT_IS_ENCRYPTED(type)) { copies = MIN(copies, SPA_DVAS_PER_BP - 1); nopwrite = B_FALSE; } else { dedup = B_FALSE; } if (level <= 0 && (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) { compress = ZIO_COMPRESS_EMPTY; } } zp->zp_compress = compress; zp->zp_complevel = complevel; zp->zp_checksum = checksum; zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE; memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN); zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? os->os_zpl_special_smallblock : 0; zp->zp_storage_type = dn ? dn->dn_storage_type : DMU_OT_NONE; ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); } /* * Reports the location of data and holes in an object. In order to * accurately report holes all dirty data must be synced to disk. This * causes extremely poor performance when seeking for holes in a dirty file. * As a compromise, only provide hole data when the dnode is clean. When * a dnode is dirty report the dnode as having no holes by returning EBUSY * which is always safe to do. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int restarted = 0, err; restart: err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dnode_is_dirty(dn)) { /* * If the zfs_dmu_offset_next_sync module option is enabled * then hole reporting has been requested. Dirty dnodes * must be synced to disk to accurately report holes. * * Provided a RL_READER rangelock spanning 0-UINT64_MAX is * held by the caller only a single restart will be required. * We tolerate callers which do not hold the rangelock by * returning EBUSY and not reporting holes after one restart. */ if (zfs_dmu_offset_next_sync) { rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (restarted) return (SET_ERROR(EBUSY)); txg_wait_synced(dmu_objset_pool(os), 0); restarted = 1; goto restart; } err = SET_ERROR(EBUSY); } else { err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK | (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (err); } int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, blkptr_t *bps, size_t *nbpsp) { dmu_buf_t **dbp, *dbuf; dmu_buf_impl_t *db; blkptr_t *bp; int error, numbufs; error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, &numbufs, &dbp); if (error != 0) { if (error == ESRCH) { error = SET_ERROR(ENXIO); } return (error); } ASSERT3U(numbufs, <=, *nbpsp); for (int i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; mutex_enter(&db->db_mtx); if (!list_is_empty(&db->db_dirty_records)) { dbuf_dirty_record_t *dr; dr = list_head(&db->db_dirty_records); if (dr->dt.dl.dr_brtwrite) { /* * This is very special case where we clone a * block and in the same transaction group we * read its BP (most likely to clone the clone). */ bp = &dr->dt.dl.dr_overridden_by; } else { /* * The block was modified in the same * transaction group. */ mutex_exit(&db->db_mtx); error = SET_ERROR(EAGAIN); goto out; } } else { bp = db->db_blkptr; } mutex_exit(&db->db_mtx); if (bp == NULL) { /* * The file size was increased, but the block was never * written, otherwise we would either have the block * pointer or the dirty record and would not get here. * It is effectively a hole, so report it as such. */ BP_ZERO(&bps[i]); continue; } /* * Make sure we clone only data blocks. */ if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) { error = SET_ERROR(EINVAL); goto out; } /* * If the block was allocated in transaction group that is not * yet synced, we could clone it, but we couldn't write this * operation into ZIL, or it may be impossible to replay, since * the block may appear not yet allocated at that point. */ if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) { error = SET_ERROR(EINVAL); goto out; } if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) { error = SET_ERROR(EAGAIN); goto out; } bps[i] = *bp; } *nbpsp = numbufs; out: dmu_buf_rele_array(dbp, numbufs, FTAG); return (error); } int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dmu_tx_t *tx, const blkptr_t *bps, size_t nbps) { spa_t *spa; dmu_buf_t **dbp, *dbuf; dmu_buf_impl_t *db; struct dirty_leaf *dl; dbuf_dirty_record_t *dr; const blkptr_t *bp; int error = 0, i, numbufs; spa = os->os_spa; VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, &numbufs, &dbp)); ASSERT3U(nbps, ==, numbufs); /* * Before we start cloning make sure that the dbufs sizes match new BPs * sizes. If they don't, that's a no-go, as we are not able to shrink * dbufs. */ for (i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT); ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_blkid != DMU_SPILL_BLKID); if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) { error = SET_ERROR(EXDEV); goto out; } } for (i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; dmu_buf_will_clone_or_dio(dbuf, tx); mutex_enter(&db->db_mtx); dr = list_head(&db->db_dirty_records); VERIFY(dr != NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; ASSERT0(dl->dr_has_raw_params); dl->dr_overridden_by = *bp; if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) { if (!BP_IS_EMBEDDED(bp)) { BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg, BP_GET_BIRTH(bp)); } else { BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg); } } dl->dr_brtwrite = B_TRUE; dl->dr_override_state = DR_OVERRIDDEN; mutex_exit(&db->db_mtx); /* * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. */ if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { brt_pending_add(spa, bp, tx); } } out: dmu_buf_rele_array(dbp, numbufs, FTAG); return (error); } void __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp = dn->dn_phys; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); __dmu_object_info_from_dnode(dn, doi); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); dmu_object_info_from_dnode(DB_DNODE(db), doi); DB_DNODE_EXIT(db); } /* * Faster still when you only care about the size. */ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, u_longlong_t *nblk512) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add in number of slots used for the dnode itself */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + dn->dn_num_slots; DB_DNODE_EXIT(db); } void dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); *dnsize = DB_DNODE(db)->dn_num_slots << DNODE_SHIFT; DB_DNODE_EXIT(db); } void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } void byteswap_uint32_array(void *vbuf, size_t size) { uint32_t *buf = vbuf; size_t count = size >> 2; int i; ASSERT((size & 3) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); } void byteswap_uint16_array(void *vbuf, size_t size) { uint16_t *buf = vbuf; size_t count = size >> 1; int i; ASSERT((size & 1) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); } void byteswap_uint8_array(void *vbuf, size_t size) { (void) vbuf, (void) size; } void dmu_init(void) { abd_init(); zfs_dbgmsg_init(); sa_cache_init(); dmu_objset_init(); dnode_init(); zfetch_init(); dmu_tx_init(); l2arc_init(); arc_init(); dbuf_init(); } void dmu_fini(void) { arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); dmu_tx_fini(); zfetch_fini(); dbuf_fini(); dnode_fini(); dmu_objset_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); abd_fini(); } EXPORT_SYMBOL(dmu_bonus_hold); EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); EXPORT_SYMBOL(dmu_prefetch_by_dnode); EXPORT_SYMBOL(dmu_prefetch_dnode); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_read_by_dnode); EXPORT_SYMBOL(dmu_read_uio); EXPORT_SYMBOL(dmu_read_uio_dbuf); EXPORT_SYMBOL(dmu_read_uio_dnode); EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_write_by_dnode); EXPORT_SYMBOL(dmu_write_by_dnode_flags); EXPORT_SYMBOL(dmu_write_uio); EXPORT_SYMBOL(dmu_write_uio_dbuf); EXPORT_SYMBOL(dmu_write_uio_dnode); EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_object_info); EXPORT_SYMBOL(dmu_object_info_from_dnode); EXPORT_SYMBOL(dmu_object_info_from_db); EXPORT_SYMBOL(dmu_object_size_from_db); EXPORT_SYMBOL(dmu_object_dnsize_from_db); EXPORT_SYMBOL(dmu_object_set_nlevels); EXPORT_SYMBOL(dmu_object_set_blocksize); EXPORT_SYMBOL(dmu_object_set_maxblkid); EXPORT_SYMBOL(dmu_object_set_checksum); EXPORT_SYMBOL(dmu_object_set_compress); EXPORT_SYMBOL(dmu_offset_next); EXPORT_SYMBOL(dmu_write_policy); EXPORT_SYMBOL(dmu_sync); EXPORT_SYMBOL(dmu_request_arcbuf); EXPORT_SYMBOL(dmu_return_arcbuf); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf); EXPORT_SYMBOL(dmu_buf_hold); EXPORT_SYMBOL(dmu_ot); ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW, "Enable NOP writes"); ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, ZMOD_RW, "Percentage of dirtied blocks from frees in one TXG"); ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, "Enable forcing txg sync to find holes"); -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW, "Limit one prefetch call to this size"); -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , dmu_ddt_copies, UINT, ZMOD_RW, "Override copies= for dedup objects"); diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c index 56986ea43446..344b0e3750e9 100644 --- a/module/zfs/dmu_object.c +++ b/module/zfs/dmu_object.c @@ -1,525 +1,523 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * Each of the concurrent object allocators will grab * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to * grab 128 slots, which is 4 blocks worth. This was experimentally * determined to be the lowest value that eliminates the measurable effect * of lock contention from this code path. */ uint_t dmu_object_alloc_chunk_shift = 7; static uint64_t dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { uint64_t object; uint64_t L1_dnode_count = DNODES_PER_BLOCK << (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; int dn_slots = dnodesize >> DNODE_SHIFT; boolean_t restarted = B_FALSE; uint64_t *cpuobj = NULL; uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; int error; cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE % os->os_obj_next_percpu_len]; if (dn_slots == 0) { dn_slots = DNODE_MIN_SLOTS; } else { ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); } /* * The "chunk" of dnodes that is assigned to a CPU-specific * allocator needs to be at least one block's worth, to avoid * lock contention on the dbuf. It can be at most one L1 block's * worth, so that the "rescan after polishing off a L1's worth" * logic below will be sure to kick in. */ if (dnodes_per_chunk < DNODES_PER_BLOCK) dnodes_per_chunk = DNODES_PER_BLOCK; if (dnodes_per_chunk > L1_dnode_count) dnodes_per_chunk = L1_dnode_count; /* * The caller requested the dnode be returned as a performance * optimization in order to avoid releasing the hold only to * immediately reacquire it. Since they caller is responsible * for releasing the hold they must provide the tag. */ if (allocated_dnode != NULL) { ASSERT3P(tag, !=, NULL); } else { ASSERT3P(tag, ==, NULL); tag = FTAG; } object = *cpuobj; for (;;) { /* * If we finished a chunk of dnodes, get a new one from * the global allocator. */ if ((P2PHASE(object, dnodes_per_chunk) == 0) || (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) < dn_slots)) { DNODE_STAT_BUMP(dnode_alloc_next_chunk); mutex_enter(&os->os_obj_lock); ASSERT0(P2PHASE(os->os_obj_next_chunk, dnodes_per_chunk)); object = os->os_obj_next_chunk; /* * Each time we polish off a L1 bp worth of dnodes * (2^12 objects), move to another L1 bp that's * still reasonably sparse (at most 1/4 full). Look * from the beginning at most once per txg. If we * still can't allocate from that L1 block, search * for an empty L0 block, which will quickly skip * to the end of the metadnode if no nearby L0 * blocks are empty. This fallback avoids a * pathology where full dnode blocks containing * large dnodes appear sparse because they have a * low blk_fill, leading to many failed allocation * attempts. In the long term a better mechanism to * search for sparse metadnode regions, such as * spacemaps, could be implemented. * * os_scan_dnodes is set during txg sync if enough * objects have been freed since the previous * rescan to justify backfilling again. * * Note that dmu_traverse depends on the behavior * that we use multiple blocks of the dnode object * before going back to reuse objects. Any change * to this algorithm should preserve that property * or find another solution to the issues described * in traverse_visitbp. */ if (P2PHASE(object, L1_dnode_count) == 0) { uint64_t offset; uint64_t blkfill; int minlvl; if (os->os_rescan_dnodes) { offset = 0; os->os_rescan_dnodes = B_FALSE; } else { offset = object << DNODE_SHIFT; } blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; minlvl = restarted ? 1 : 2; restarted = B_TRUE; error = dnode_next_offset(DMU_META_DNODE(os), DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0); if (error == 0) { object = offset >> DNODE_SHIFT; } } /* * Note: if "restarted", we may find a L0 that * is not suitably aligned. */ os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) + dnodes_per_chunk; (void) atomic_swap_64(cpuobj, object); mutex_exit(&os->os_obj_lock); } /* * The value of (*cpuobj) before adding dn_slots is the object * ID assigned to us. The value afterwards is the object ID * assigned to whoever wants to do an allocation next. */ object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots; /* * XXX We should check for an i/o error here and return * up to our caller. Actually we should pre-read it in * dmu_tx_assign(), but there is currently no mechanism * to do so. */ error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, tag, &dn); if (error == 0) { rw_enter(&dn->dn_struct_rwlock, RW_WRITER); /* * Another thread could have allocated it; check * again now that we have the struct lock. */ if (dn->dn_type == DMU_OT_NONE) { dnode_allocate(dn, ot, blocksize, indirect_blockshift, bonustype, bonuslen, dn_slots, tx); rw_exit(&dn->dn_struct_rwlock); dmu_tx_add_new_object(tx, dn); /* * Caller requested the allocated dnode be * returned and is responsible for the hold. */ if (allocated_dnode != NULL) *allocated_dnode = dn; else dnode_rele(dn, tag); return (object); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, tag); DNODE_STAT_BUMP(dnode_alloc_race); } /* * Skip to next known valid starting point on error. This * is the start of the next block of dnodes. */ if (dmu_object_next(os, &object, B_TRUE, 0) != 0) { object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK); DNODE_STAT_BUMP(dnode_alloc_next_block); } (void) atomic_swap_64(cpuobj, object); } } uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, bonuslen, 0, NULL, NULL, tx); } uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, bonustype, bonuslen, 0, NULL, NULL, tx); } uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, bonuslen, dnodesize, NULL, NULL, tx)); } /* * Allocate a new object and return a pointer to the newly allocated dnode * via the allocated_dnode argument. The returned dnode will be held and * the caller is responsible for releasing the hold by calling dnode_rele(). */ uint64_t dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx)); } int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype, bonuslen, 0, tx)); } int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { dnode_t *dn; int dn_slots = dnodesize >> DNODE_SHIFT; int err; if (dn_slots == 0) dn_slots = DNODE_MIN_SLOTS; ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) return (SET_ERROR(EBADF)); err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, FTAG, &dn); if (err) return (err); dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); dmu_tx_add_new_object(tx, dn); dnode_rele(dn, FTAG); return (0); } int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, bonuslen, DNODE_MIN_SIZE, B_FALSE, tx)); } int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, boolean_t keep_spill, dmu_tx_t *tx) { dnode_t *dn; int dn_slots = dnodesize >> DNODE_SHIFT; int err; if (dn_slots == 0) dn_slots = DNODE_MIN_SLOTS; if (object == DMU_META_DNODE_OBJECT) return (SET_ERROR(EBADF)); err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, FTAG, &dn); if (err) return (err); dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, keep_spill, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { dbuf_rm_spill(dn, tx); dnode_rm_spill(dn, tx); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (err); } int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int err; ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, FTAG, &dn); if (err) return (err); ASSERT(dn->dn_type != DMU_OT_NONE); /* * If we don't create this free range, we'll leak indirect blocks when * we get to freeing the dnode in syncing context. */ dnode_free_range(dn, 0, DMU_OBJECT_END, tx); dnode_free(dn, tx); dnode_rele(dn, FTAG); return (0); } /* * Return (in *objectp) the next object which is allocated (or a hole) * after *object, taking into account only objects that may have been modified * after the specified txg. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) { uint64_t offset; uint64_t start_obj; struct dsl_dataset *ds = os->os_dsl_dataset; int error; if (*objectp == 0) { start_obj = 1; } else if (ds && dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_DNODE)) { uint64_t i = *objectp + 1; uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1); dmu_object_info_t doi; /* * Scan through the remaining meta dnode block. The contents * of each slot in the block are known so it can be quickly * checked. If the block is exhausted without a match then * hand off to dnode_next_offset() for further scanning. */ while (i <= last_obj) { if (i == 0) return (SET_ERROR(ESRCH)); error = dmu_object_info(os, i, &doi); if (error == ENOENT) { if (hole) { *objectp = i; return (0); } else { i++; } } else if (error == EEXIST) { i++; } else if (error == 0) { if (hole) { i += doi.doi_dnodesize >> DNODE_SHIFT; } else { *objectp = i; return (0); } } else { return (error); } } start_obj = i; } else { start_obj = *objectp + 1; } offset = start_obj << DNODE_SHIFT; error = dnode_next_offset(DMU_META_DNODE(os), (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); *objectp = offset >> DNODE_SHIFT; return (error); } /* * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the * refcount on SPA_FEATURE_EXTENSIBLE_DATASET. * * Only for use from syncing context, on MOS objects. */ void dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type, dmu_tx_t *tx) { dnode_t *dn; ASSERT(dmu_tx_is_syncing(tx)); VERIFY0(dnode_hold(mos, object, FTAG, &dn)); if (dn->dn_type == DMU_OTN_ZAP_METADATA) { dnode_rele(dn, FTAG); return; } ASSERT3U(dn->dn_type, ==, old_type); ASSERT0(dn->dn_maxblkid); /* * We must initialize the ZAP data before changing the type, * so that concurrent calls to *_is_zapified() can determine if * the object has been completely zapified by checking the type. */ mzap_create_impl(dn, 0, 0, tx); dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type = DMU_OTN_ZAP_METADATA; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); spa_feature_incr(dmu_objset_spa(mos), SPA_FEATURE_EXTENSIBLE_DATASET, tx); } void dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; dmu_object_type_t t; ASSERT(dmu_tx_is_syncing(tx)); VERIFY0(dnode_hold(mos, object, FTAG, &dn)); t = dn->dn_type; dnode_rele(dn, FTAG); if (t == DMU_OTN_ZAP_METADATA) { spa_feature_decr(dmu_objset_spa(mos), SPA_FEATURE_EXTENSIBLE_DATASET, tx); } VERIFY0(dmu_object_free(mos, object, tx)); } EXPORT_SYMBOL(dmu_object_alloc); EXPORT_SYMBOL(dmu_object_alloc_ibs); EXPORT_SYMBOL(dmu_object_alloc_dnsize); EXPORT_SYMBOL(dmu_object_alloc_hold); EXPORT_SYMBOL(dmu_object_claim); EXPORT_SYMBOL(dmu_object_claim_dnsize); EXPORT_SYMBOL(dmu_object_reclaim); EXPORT_SYMBOL(dmu_object_reclaim_dnsize); EXPORT_SYMBOL(dmu_object_rm_spill); EXPORT_SYMBOL(dmu_object_free); EXPORT_SYMBOL(dmu_object_next); EXPORT_SYMBOL(dmu_object_zapify); EXPORT_SYMBOL(dmu_object_free_zapified); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, UINT, ZMOD_RW, "CPU-specific allocator grabs 2^N objects at once"); -/* END CSTYLED */ diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index b1cd981cec1d..a33216be6ecf 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1,3846 +1,3845 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright (c) 2019, 2024, Klara, Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019 Datto Inc. * Copyright (c) 2022 Axcient. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif #include static uint_t zfs_recv_queue_length = SPA_MAXBLOCKSIZE; static uint_t zfs_recv_queue_ff = 20; static uint_t zfs_recv_write_batch_size = 1024 * 1024; static int zfs_recv_best_effort_corrective = 0; static const void *const dmu_recv_tag = "dmu_recv_tag"; const char *const recv_clone_name = "%recv"; typedef enum { ORNS_NO, ORNS_YES, ORNS_MAYBE } or_need_sync_t; static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len, void *buf); struct receive_record_arg { dmu_replay_record_t header; void *payload; /* Pointer to a buffer containing the payload */ /* * If the record is a WRITE or SPILL, pointer to the abd containing the * payload. */ abd_t *abd; int payload_size; uint64_t bytes_read; /* bytes read from stream when record created */ boolean_t eos_marker; /* Marks the end of the stream */ bqueue_node_t node; }; struct receive_writer_arg { objset_t *os; boolean_t byteswap; bqueue_t q; /* * These three members are used to signal to the main thread when * we're done. */ kmutex_t mutex; kcondvar_t cv; boolean_t done; int err; const char *tofs; boolean_t heal; boolean_t resumable; boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ boolean_t full; /* this is a full send stream */ uint64_t last_object; uint64_t last_offset; uint64_t max_object; /* highest object ID referenced in stream */ uint64_t bytes_read; /* bytes read when current record created */ list_t write_batch; /* Encryption parameters for the last received DRR_OBJECT_RANGE */ boolean_t or_crypt_params_present; uint64_t or_firstobj; uint64_t or_numslots; uint8_t or_salt[ZIO_DATA_SALT_LEN]; uint8_t or_iv[ZIO_DATA_IV_LEN]; uint8_t or_mac[ZIO_DATA_MAC_LEN]; boolean_t or_byteorder; zio_t *heal_pio; /* Keep track of DRR_FREEOBJECTS right after DRR_OBJECT_RANGE */ or_need_sync_t or_need_sync; }; typedef struct dmu_recv_begin_arg { const char *drba_origin; dmu_recv_cookie_t *drba_cookie; cred_t *drba_cred; proc_t *drba_proc; dsl_crypto_params_t *drba_dcp; } dmu_recv_begin_arg_t; static void byteswap_record(dmu_replay_record_t *drr) { #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) drr->drr_type = BSWAP_32(drr->drr_type); drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); DO64(drr_begin.drr_versioninfo); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); DO32(drr_begin.drr_flags); DO64(drr_begin.drr_toguid); DO64(drr_begin.drr_fromguid); break; case DRR_OBJECT: DO64(drr_object.drr_object); DO32(drr_object.drr_type); DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); DO32(drr_object.drr_bonuslen); DO32(drr_object.drr_raw_bonuslen); DO64(drr_object.drr_toguid); DO64(drr_object.drr_maxblkid); break; case DRR_FREEOBJECTS: DO64(drr_freeobjects.drr_firstobj); DO64(drr_freeobjects.drr_numobjs); DO64(drr_freeobjects.drr_toguid); break; case DRR_WRITE: DO64(drr_write.drr_object); DO32(drr_write.drr_type); DO64(drr_write.drr_offset); DO64(drr_write.drr_logical_size); DO64(drr_write.drr_toguid); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); DO64(drr_write.drr_key.ddk_prop); DO64(drr_write.drr_compressed_size); break; case DRR_WRITE_EMBEDDED: DO64(drr_write_embedded.drr_object); DO64(drr_write_embedded.drr_offset); DO64(drr_write_embedded.drr_length); DO64(drr_write_embedded.drr_toguid); DO32(drr_write_embedded.drr_lsize); DO32(drr_write_embedded.drr_psize); break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); DO64(drr_free.drr_length); DO64(drr_free.drr_toguid); break; case DRR_SPILL: DO64(drr_spill.drr_object); DO64(drr_spill.drr_length); DO64(drr_spill.drr_toguid); DO64(drr_spill.drr_compressed_size); DO32(drr_spill.drr_type); break; case DRR_OBJECT_RANGE: DO64(drr_object_range.drr_firstobj); DO64(drr_object_range.drr_numslots); DO64(drr_object_range.drr_toguid); break; case DRR_REDACT: DO64(drr_redact.drr_object); DO64(drr_redact.drr_offset); DO64(drr_redact.drr_length); DO64(drr_redact.drr_toguid); break; case DRR_END: DO64(drr_end.drr_toguid); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); break; default: break; } if (drr->drr_type != DRR_BEGIN) { ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); } #undef DO64 #undef DO32 } static boolean_t redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid) { for (int i = 0; i < num_snaps; i++) { if (snaps[i] == guid) return (B_TRUE); } return (B_FALSE); } /* * Check that the new stream we're trying to receive is redacted with respect to * a subset of the snapshots that the origin was redacted with respect to. For * the reasons behind this, see the man page on redacted zfs sends and receives. */ static boolean_t compatible_redact_snaps(uint64_t *origin_snaps, uint64_t origin_num_snaps, uint64_t *redact_snaps, uint64_t num_redact_snaps) { /* * Short circuit the comparison; if we are redacted with respect to * more snapshots than the origin, we can't be redacted with respect * to a subset. */ if (num_redact_snaps > origin_num_snaps) { return (B_FALSE); } for (int i = 0; i < num_redact_snaps; i++) { if (!redact_snaps_contains(origin_snaps, origin_num_snaps, redact_snaps[i])) { return (B_FALSE); } } return (B_TRUE); } static boolean_t redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin) { uint64_t *origin_snaps; uint64_t origin_num_snaps; dmu_recv_cookie_t *drc = drba->drba_cookie; struct drr_begin *drrb = drc->drc_drrb; int featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); int err = 0; boolean_t ret = B_TRUE; uint64_t *redact_snaps; uint_t numredactsnaps; /* * If this is a full send stream, we're safe no matter what. */ if (drrb->drr_fromguid == 0) return (ret); VERIFY(dsl_dataset_get_uint64_array_feature(origin, SPA_FEATURE_REDACTED_DATASETS, &origin_num_snaps, &origin_snaps)); if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) == 0) { /* * If the send stream was sent from the redaction bookmark or * the redacted version of the dataset, then we're safe. Verify * that this is from the a compatible redaction bookmark or * redacted dataset. */ if (!compatible_redact_snaps(origin_snaps, origin_num_snaps, redact_snaps, numredactsnaps)) { err = EINVAL; } } else if (featureflags & DMU_BACKUP_FEATURE_REDACTED) { /* * If the stream is redacted, it must be redacted with respect * to a subset of what the origin is redacted with respect to. * See case number 2 in the zfs man page section on redacted zfs * send. */ err = nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps); if (err != 0 || !compatible_redact_snaps(origin_snaps, origin_num_snaps, redact_snaps, numredactsnaps)) { err = EINVAL; } } else if (!redact_snaps_contains(origin_snaps, origin_num_snaps, drrb->drr_toguid)) { /* * If the stream isn't redacted but the origin is, this must be * one of the snapshots the origin is redacted with respect to. * See case number 1 in the zfs man page section on redacted zfs * send. */ err = EINVAL; } if (err != 0) ret = B_FALSE; return (ret); } /* * If we previously received a stream with --large-block, we don't support * receiving an incremental on top of it without --large-block. This avoids * forcing a read-modify-write or trying to re-aggregate a string of WRITE * records. */ static int recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags) { if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_BLOCKS) && !(featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH)); return (0); } static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid, uint64_t featureflags) { uint64_t obj; uint64_t children; int error; dsl_dataset_t *snap; dsl_pool_t *dp = ds->ds_dir->dd_pool; boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0; boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0; boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0; /* Temporary clone name must not exist. */ error = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, 8, 1, &obj); if (error != ENOENT) return (error == 0 ? SET_ERROR(EBUSY) : error); /* Resume state must not be set. */ if (dsl_dataset_has_resume_receive_state(ds)) return (SET_ERROR(EBUSY)); /* New snapshot name must not exist if we're not healing it. */ error = zap_lookup(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, 8, 1, &obj); if (drba->drba_cookie->drc_heal) { if (error != 0) return (error); } else if (error != ENOENT) { return (error == 0 ? SET_ERROR(EEXIST) : error); } /* Must not have children if receiving a ZVOL. */ error = zap_count(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children); if (error != 0) return (error); if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS && children > 0) return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); /* * Check snapshot limit before receiving. We'll recheck again at the * end, but might as well abort before receiving if we're already over * the limit. * * Note that we do not check the file system limit with * dsl_dir_fscount_check because the temporary %clones don't count * against that limit. */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred, drba->drba_proc); if (error != 0) return (error); if (drba->drba_cookie->drc_heal) { /* Encryption is incompatible with embedded data. */ if (encrypted && embed) return (SET_ERROR(EINVAL)); /* Healing is not supported when in 'force' mode. */ if (drba->drba_cookie->drc_force) return (SET_ERROR(EINVAL)); /* Must have keys loaded if doing encrypted non-raw recv. */ if (encrypted && !raw) { if (spa_keystore_lookup_key(dp->dp_spa, ds->ds_object, NULL, NULL) != 0) return (SET_ERROR(EACCES)); } error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) return (error); /* * When not doing best effort corrective recv healing can only * be done if the send stream is for the same snapshot as the * one we are trying to heal. */ if (zfs_recv_best_effort_corrective == 0 && drba->drba_cookie->drc_drrb->drr_toguid != dsl_dataset_phys(snap)->ds_guid) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ENOTSUP)); } dsl_dataset_rele(snap, FTAG); } else if (fromguid != 0) { /* Sanity check the incremental recv */ uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; /* Can't perform a raw receive on top of a non-raw receive */ if (!encrypted && raw) return (SET_ERROR(EINVAL)); /* Encryption is incompatible with embedded data */ if (encrypted && embed) return (SET_ERROR(EINVAL)); /* Find snapshot in this dir that matches fromguid. */ while (obj != 0) { error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) return (SET_ERROR(ENODEV)); if (snap->ds_dir != ds->ds_dir) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ENODEV)); } if (dsl_dataset_phys(snap)->ds_guid == fromguid) break; obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); } if (obj == 0) return (SET_ERROR(ENODEV)); if (drba->drba_cookie->drc_force) { drba->drba_cookie->drc_fromsnapobj = obj; } else { /* * If we are not forcing, there must be no * changes since fromsnap. Raw sends have an * additional constraint that requires that * no "noop" snapshots exist between fromsnap * and tosnap for the IVset checking code to * work properly. */ if (dsl_dataset_modified_since_snap(ds, snap) || (raw && dsl_dataset_phys(ds)->ds_prev_snap_obj != snap->ds_object)) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ETXTBSY)); } drba->drba_cookie->drc_fromsnapobj = ds->ds_prev->ds_object; } if (dsl_dataset_feature_is_active(snap, SPA_FEATURE_REDACTED_DATASETS) && !redact_check(drba, snap)) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(EINVAL)); } error = recv_check_large_blocks(snap, featureflags); if (error != 0) { dsl_dataset_rele(snap, FTAG); return (error); } dsl_dataset_rele(snap, FTAG); } else { /* If full and not healing then must be forced. */ if (!drba->drba_cookie->drc_force) return (SET_ERROR(EEXIST)); /* * We don't support using zfs recv -F to blow away * encrypted filesystems. This would require the * dsl dir to point to the old encryption key and * the new one at the same time during the receive. */ if ((!encrypted && raw) || encrypted) return (SET_ERROR(EINVAL)); /* * Perform the same encryption checks we would if * we were creating a new dataset from scratch. */ if (!raw) { boolean_t will_encrypt; error = dmu_objset_create_crypt_check( ds->ds_dir->dd_parent, drba->drba_dcp, &will_encrypt); if (error != 0) return (error); if (will_encrypt && embed) return (SET_ERROR(EINVAL)); } } return (0); } /* * Check that any feature flags used in the data stream we're receiving are * supported by the pool we are receiving into. * * Note that some of the features we explicitly check here have additional * (implicit) features they depend on, but those dependencies are enforced * through the zfeature_register() calls declaring the features that we * explicitly check. */ static int recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa) { /* * Check if there are any unsupported feature flags. */ if (!DMU_STREAM_SUPPORTED(featureflags)) { return (SET_ERROR(ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE)); } /* Verify pool version supports SA if SA_SPILL feature set */ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && spa_version(spa) < SPA_VERSION_SA) return (SET_ERROR(ENOTSUP)); /* * LZ4 compressed, ZSTD compressed, embedded, mooched, large blocks, * and large_dnodes in the stream can only be used if those pool * features are enabled because we don't attempt to decompress / * un-embed / un-mooch / split up the blocks / dnodes during the * receive process. */ if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && !spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_ZSTD) && !spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_LARGE_MICROZAP) && !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP)) return (SET_ERROR(ENOTSUP)); /* * Receiving redacted streams requires that redacted datasets are * enabled. */ if ((featureflags & DMU_BACKUP_FEATURE_REDACTED) && !spa_feature_is_enabled(spa, SPA_FEATURE_REDACTED_DATASETS)) return (SET_ERROR(ENOTSUP)); /* * If the LONGNAME is not enabled on the target, fail that request. */ if ((featureflags & DMU_BACKUP_FEATURE_LONGNAME) && !spa_feature_is_enabled(spa, SPA_FEATURE_LONGNAME)) return (SET_ERROR(ENOTSUP)); return (0); } static int dmu_recv_begin_check(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drba->drba_cookie->drc_drrb; uint64_t fromguid = drrb->drr_fromguid; int flags = drrb->drr_flags; ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; int error; uint64_t featureflags = drba->drba_cookie->drc_featureflags; dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || drrb->drr_type >= DMU_OST_NUMTYPES || ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) return (SET_ERROR(EINVAL)); error = recv_begin_check_feature_flags_impl(featureflags, dp->dp_spa); if (error != 0) return (error); /* Resumable receives require extensible datasets */ if (drba->drba_cookie->drc_resumable && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) return (SET_ERROR(ENOTSUP)); if (featureflags & DMU_BACKUP_FEATURE_RAW) { /* raw receives require the encryption feature */ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) return (SET_ERROR(ENOTSUP)); /* embedded data is incompatible with encryption and raw recv */ if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) return (SET_ERROR(EINVAL)); /* raw receives require spill block allocation flag */ if (!(flags & DRR_FLAG_SPILL_BLOCK)) return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING)); } else { /* * We support unencrypted datasets below encrypted ones now, * so add the DS_HOLD_FLAG_DECRYPT flag only if we are dealing * with a dataset we may encrypt. */ if (drba->drba_dcp == NULL || drba->drba_dcp->cp_crypt != ZIO_CRYPT_OFF) { dsflags |= DS_HOLD_FLAG_DECRYPT; } } error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error == 0) { /* target fs already exists; recv into temp clone */ /* Can't recv a clone into an existing fs */ if (flags & DRR_FLAG_CLONE || drba->drba_origin) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } error = recv_begin_check_existing_impl(drba, ds, fromguid, featureflags); dsl_dataset_rele_flags(ds, dsflags, FTAG); } else if (error == ENOENT) { /* target fs does not exist; must be a full backup or clone */ char buf[ZFS_MAX_DATASET_NAME_LEN]; objset_t *os; /* healing recv must be done "into" an existing snapshot */ if (drba->drba_cookie->drc_heal == B_TRUE) return (SET_ERROR(ENOTSUP)); /* * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. */ if (fromguid != 0 && !((flags & DRR_FLAG_CLONE) || drba->drba_origin)) return (SET_ERROR(ENOENT)); /* * If we're receiving a full send as a clone, and it doesn't * contain all the necessary free records and freeobject * records, reject it. */ if (fromguid == 0 && drba->drba_origin != NULL && !(flags & DRR_FLAG_FREERECORDS)) return (SET_ERROR(EINVAL)); /* Open the parent of tofs */ ASSERT3U(strlen(tofs), <, sizeof (buf)); (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); error = dsl_dataset_hold(dp, buf, FTAG, &ds); if (error != 0) return (error); if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 && drba->drba_origin == NULL) { boolean_t will_encrypt; /* * Check that we aren't breaking any encryption rules * and that we have all the parameters we need to * create an encrypted dataset if necessary. If we are * making an encrypted dataset the stream can't have * embedded data. */ error = dmu_objset_create_crypt_check(ds->ds_dir, drba->drba_dcp, &will_encrypt); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (will_encrypt && (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } } /* * Check filesystem and snapshot limits before receiving. We'll * recheck snapshot limits again at the end (we create the * filesystems and increment those counts during begin_sync). */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred, drba->drba_proc); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred, drba->drba_proc); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } /* can't recv below anything but filesystems (eg. no ZVOLs) */ error = dmu_objset_from_ds(ds, &os); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (dmu_objset_type(os) != DMU_OST_ZFS) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); } if (drba->drba_origin != NULL) { dsl_dataset_t *origin; error = dsl_dataset_hold_flags(dp, drba->drba_origin, dsflags, FTAG, &origin); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (!origin->ds_is_snapshot) { dsl_dataset_rele_flags(origin, dsflags, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } if (dsl_dataset_phys(origin)->ds_guid != fromguid && fromguid != 0) { dsl_dataset_rele_flags(origin, dsflags, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENODEV)); } if (origin->ds_dir->dd_crypto_obj != 0 && (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { dsl_dataset_rele_flags(origin, dsflags, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } /* * If the origin is redacted we need to verify that this * send stream can safely be received on top of the * origin. */ if (dsl_dataset_feature_is_active(origin, SPA_FEATURE_REDACTED_DATASETS)) { if (!redact_check(drba, origin)) { dsl_dataset_rele_flags(origin, dsflags, FTAG); dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } } error = recv_check_large_blocks(ds, featureflags); if (error != 0) { dsl_dataset_rele_flags(origin, dsflags, FTAG); dsl_dataset_rele_flags(ds, dsflags, FTAG); return (error); } dsl_dataset_rele_flags(origin, dsflags, FTAG); } dsl_dataset_rele(ds, FTAG); error = 0; } return (error); } static void dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; dmu_recv_cookie_t *drc = drba->drba_cookie; struct drr_begin *drrb = drc->drc_drrb; const char *tofs = drc->drc_tofs; uint64_t featureflags = drc->drc_featureflags; dsl_dataset_t *ds, *newds; objset_t *os; uint64_t dsobj; ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; int error; uint64_t crflags = 0; dsl_crypto_params_t dummy_dcp = { 0 }; dsl_crypto_params_t *dcp = drba->drba_dcp; if (drrb->drr_flags & DRR_FLAG_CI_DATA) crflags |= DS_FLAG_CI_DATASET; if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0) dsflags |= DS_HOLD_FLAG_DECRYPT; /* * Raw, non-incremental recvs always use a dummy dcp with * the raw cmd set. Raw incremental recvs do not use a dcp * since the encryption parameters are already set in stone. */ if (dcp == NULL && drrb->drr_fromguid == 0 && drba->drba_origin == NULL) { ASSERT3P(dcp, ==, NULL); dcp = &dummy_dcp; if (featureflags & DMU_BACKUP_FEATURE_RAW) dcp->cp_cmd = DCP_CMD_RAW_RECV; } error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error == 0) { /* Create temporary clone unless we're doing corrective recv */ dsl_dataset_t *snap = NULL; if (drba->drba_cookie->drc_fromsnapobj != 0) { VERIFY0(dsl_dataset_hold_obj(dp, drba->drba_cookie->drc_fromsnapobj, FTAG, &snap)); ASSERT3P(dcp, ==, NULL); } if (drc->drc_heal) { /* When healing we want to use the provided snapshot */ VERIFY0(dsl_dataset_snap_lookup(ds, drc->drc_tosnap, &dsobj)); } else { dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, snap, crflags, drba->drba_cred, dcp, tx); } if (drba->drba_cookie->drc_fromsnapobj != 0) dsl_dataset_rele(snap, FTAG); dsl_dataset_rele_flags(ds, dsflags, FTAG); } else { dsl_dir_t *dd; const char *tail; dsl_dataset_t *origin = NULL; VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); if (drba->drba_origin != NULL) { VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, FTAG, &origin)); ASSERT3P(dcp, ==, NULL); } /* Create new dataset. */ dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1, origin, crflags, drba->drba_cred, dcp, tx); if (origin != NULL) dsl_dataset_rele(origin, FTAG); dsl_dir_rele(dd, FTAG); drc->drc_newfs = B_TRUE; } VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag, &newds)); if (dsl_dataset_feature_is_active(newds, SPA_FEATURE_REDACTED_DATASETS)) { /* * If the origin dataset is redacted, the child will be redacted * when we create it. We clear the new dataset's * redaction info; if it should be redacted, we'll fill * in its information later. */ dsl_dataset_deactivate_feature(newds, SPA_FEATURE_REDACTED_DATASETS, tx); } VERIFY0(dmu_objset_from_ds(newds, &os)); if (drc->drc_resumable) { dsl_dataset_zapify(newds, tx); if (drrb->drr_fromguid != 0) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, 8, 1, &drrb->drr_fromguid, tx)); } VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, 8, 1, &drrb->drr_toguid, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); uint64_t one = 1; uint64_t zero = 0; VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, 8, 1, &one, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, 8, 1, &zero, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, 8, 1, &zero, tx)); if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, 8, 1, &one, tx)); } if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, 8, 1, &one, tx)); } if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, 8, 1, &one, tx)); } if (featureflags & DMU_BACKUP_FEATURE_RAW) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK, 8, 1, &one, tx)); } uint64_t *redact_snaps; uint_t numredactsnaps; if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) == 0) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, sizeof (*redact_snaps), numredactsnaps, redact_snaps, tx)); } } /* * Usually the os->os_encrypted value is tied to the presence of a * DSL Crypto Key object in the dd. However, that will not be received * until dmu_recv_stream(), so we set the value manually for now. */ if (featureflags & DMU_BACKUP_FEATURE_RAW) { os->os_encrypted = B_TRUE; drba->drba_cookie->drc_raw = B_TRUE; } if (featureflags & DMU_BACKUP_FEATURE_REDACTED) { uint64_t *redact_snaps; uint_t numredactsnaps; VERIFY0(nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps)); dsl_dataset_activate_redaction(newds, redact_snaps, numredactsnaps, tx); } if (featureflags & DMU_BACKUP_FEATURE_LARGE_MICROZAP) { /* * The source has seen a large microzap at least once in its * life, so we activate the feature here to match. It's not * strictly necessary since a large microzap is usable without * the feature active, but if that object is sent on from here, * we need this info to know to add the stream feature. * * There may be no large microzap in the incoming stream, or * ever again, but this is a very niche feature and its very * difficult to spot a large microzap in the stream, so its * not worth the effort of trying harder to activate the * feature at first use. */ dsl_dataset_activate_feature(dsobj, SPA_FEATURE_LARGE_MICROZAP, (void *)B_TRUE, tx); } dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; /* * Activate longname feature if received */ if (featureflags & DMU_BACKUP_FEATURE_LONGNAME && !dsl_dataset_feature_is_active(newds, SPA_FEATURE_LONGNAME)) { dsl_dataset_activate_feature(newds->ds_object, SPA_FEATURE_LONGNAME, (void *)B_TRUE, tx); newds->ds_feature[SPA_FEATURE_LONGNAME] = (void *)B_TRUE; } /* * If we actually created a non-clone, we need to create the objset * in our new dataset. If this is a raw send we postpone this until * dmu_recv_stream() so that we can allocate the metadnode with the * properties from the DRR_BEGIN payload. */ rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) && (featureflags & DMU_BACKUP_FEATURE_RAW) == 0 && !drc->drc_heal) { (void) dmu_objset_create_impl(dp->dp_spa, newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } rrw_exit(&newds->ds_bp_rwlock, FTAG); drba->drba_cookie->drc_ds = newds; drba->drba_cookie->drc_os = os; spa_history_log_internal_ds(newds, "receive", tx, " "); } static int dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dmu_recv_cookie_t *drc = drba->drba_cookie; dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drc->drc_drrb; int error; ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; dsl_dataset_t *ds; const char *tofs = drc->drc_tofs; /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); ASSERT(drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || drrb->drr_type >= DMU_OST_NUMTYPES) return (SET_ERROR(EINVAL)); /* * This is mostly a sanity check since we should have already done these * checks during a previous attempt to receive the data. */ error = recv_begin_check_feature_flags_impl(drc->drc_featureflags, dp->dp_spa); if (error != 0) return (error); /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) { /* raw receives require spill block allocation flag */ if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)) return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING)); } else { dsflags |= DS_HOLD_FLAG_DECRYPT; } boolean_t recvexist = B_TRUE; if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) { /* %recv does not exist; continue in tofs */ recvexist = B_FALSE; error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error != 0) return (error); } /* * Resume of full/newfs recv on existing dataset should be done with * force flag */ if (recvexist && drrb->drr_fromguid == 0 && !drc->drc_force) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(ZFS_ERR_RESUME_EXISTS)); } /* check that ds is marked inconsistent */ if (!DS_IS_INCONSISTENT(ds)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } /* check that there is resuming data, and that the toguid matches */ if (!dsl_dataset_is_zapified(ds)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } uint64_t val; error = zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); if (error != 0 || drrb->drr_toguid != val) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } /* * Check if the receive is still running. If so, it will be owned. * Note that nothing else can own the dataset (e.g. after the receive * fails) because it will be marked inconsistent. */ if (dsl_dataset_has_owner(ds)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EBUSY)); } /* There should not be any snapshots of this fs yet. */ if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } /* * Note: resume point will be checked when we process the first WRITE * record. */ /* check that the origin matches */ val = 0; (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); if (drrb->drr_fromguid != val) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } if (ds->ds_prev != NULL && drrb->drr_fromguid != 0) drc->drc_fromsnapobj = ds->ds_prev->ds_object; /* * If we're resuming, and the send is redacted, then the original send * must have been redacted, and must have been redacted with respect to * the same snapshots. */ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_REDACTED) { uint64_t num_ds_redact_snaps; uint64_t *ds_redact_snaps; uint_t num_stream_redact_snaps; uint64_t *stream_redact_snaps; if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, BEGINNV_REDACT_SNAPS, &stream_redact_snaps, &num_stream_redact_snaps) != 0) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } if (!dsl_dataset_get_uint64_array_feature(ds, SPA_FEATURE_REDACTED_DATASETS, &num_ds_redact_snaps, &ds_redact_snaps)) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } for (int i = 0; i < num_ds_redact_snaps; i++) { if (!redact_snaps_contains(ds_redact_snaps, num_ds_redact_snaps, stream_redact_snaps[i])) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } } } error = recv_check_large_blocks(ds, drc->drc_featureflags); if (error != 0) { dsl_dataset_rele_flags(ds, dsflags, FTAG); return (error); } dsl_dataset_rele_flags(ds, dsflags, FTAG); return (0); } static void dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); const char *tofs = drba->drba_cookie->drc_tofs; uint64_t featureflags = drba->drba_cookie->drc_featureflags; dsl_dataset_t *ds; ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); if (featureflags & DMU_BACKUP_FEATURE_RAW) { drba->drba_cookie->drc_raw = B_TRUE; } else { dsflags |= DS_HOLD_FLAG_DECRYPT; } if (dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag, &ds) != 0) { /* %recv does not exist; continue in tofs */ VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, dmu_recv_tag, &ds)); drba->drba_cookie->drc_newfs = B_TRUE; } ASSERT(DS_IS_INCONSISTENT(ds)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) || drba->drba_cookie->drc_raw); rrw_exit(&ds->ds_bp_rwlock, FTAG); drba->drba_cookie->drc_ds = ds; VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os)); drba->drba_cookie->drc_should_save = B_TRUE; spa_history_log_internal_ds(ds, "resume receive", tx, " "); } /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. */ int dmu_recv_begin(const char *tofs, const char *tosnap, dmu_replay_record_t *drr_begin, boolean_t force, boolean_t heal, boolean_t resumable, nvlist_t *localprops, nvlist_t *hidden_args, const char *origin, dmu_recv_cookie_t *drc, zfs_file_t *fp, offset_t *voffp) { dmu_recv_begin_arg_t drba = { 0 }; int err = 0; memset(drc, 0, sizeof (dmu_recv_cookie_t)); drc->drc_drr_begin = drr_begin; drc->drc_drrb = &drr_begin->drr_u.drr_begin; drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; drc->drc_heal = heal; drc->drc_resumable = resumable; drc->drc_cred = CRED(); drc->drc_proc = curproc; drc->drc_clone = (origin != NULL); if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; (void) fletcher_4_incremental_byteswap(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); byteswap_record(drr_begin); } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { (void) fletcher_4_incremental_native(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); } else { return (SET_ERROR(EINVAL)); } drc->drc_fp = fp; drc->drc_voff = *voffp; drc->drc_featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen; /* * Since OpenZFS 2.0.0, we have enforced a 64MB limit in userspace * configurable via ZFS_SENDRECV_MAX_NVLIST. We enforce 256MB as a hard * upper limit. Systems with less than 1GB of RAM will see a lower * limit from `arc_all_memory() / 4`. */ if (payloadlen > (MIN((1U << 28), arc_all_memory() / 4))) return (E2BIG); if (payloadlen != 0) { void *payload = vmem_alloc(payloadlen, KM_SLEEP); /* * For compatibility with recursive send streams, we don't do * this here if the stream could be part of a package. Instead, * we'll do it in dmu_recv_stream. If we pull the next header * too early, and it's the END record, we break the `recv_skip` * logic. */ err = receive_read_payload_and_next_header(drc, payloadlen, payload); if (err != 0) { vmem_free(payload, payloadlen); return (err); } err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl, KM_SLEEP); vmem_free(payload, payloadlen); if (err != 0) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); return (err); } } if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK) drc->drc_spill = B_TRUE; drba.drba_origin = origin; drba.drba_cookie = drc; drba.drba_cred = CRED(); drba.drba_proc = curproc; if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) { err = dsl_sync_task(tofs, dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, &drba, 5, ZFS_SPACE_CHECK_NORMAL); } else { /* * For non-raw, non-incremental, non-resuming receives the * user can specify encryption parameters on the command line * with "zfs recv -o". For these receives we create a dcp and * pass it to the sync task. Creating the dcp will implicitly * remove the encryption params from the localprops nvlist, * which avoids errors when trying to set these normally * read-only properties. Any other kind of receive that * attempts to set these properties will fail as a result. */ if ((DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RAW) == 0 && origin == NULL && drc->drc_drrb->drr_fromguid == 0) { err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, localprops, hidden_args, &drba.drba_dcp); } if (err == 0) { err = dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, &drba, 5, ZFS_SPACE_CHECK_NORMAL); dsl_crypto_params_free(drba.drba_dcp, !!err); } } if (err != 0) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); nvlist_free(drc->drc_begin_nvl); } return (err); } /* * Holds data need for corrective recv callback */ typedef struct cr_cb_data { uint64_t size; zbookmark_phys_t zb; spa_t *spa; } cr_cb_data_t; static void corrective_read_done(zio_t *zio) { cr_cb_data_t *data = zio->io_private; /* Corruption corrected; update error log if needed */ if (zio->io_error == 0) { spa_remove_error(data->spa, &data->zb, BP_GET_LOGICAL_BIRTH(zio->io_bp)); } kmem_free(data, sizeof (cr_cb_data_t)); abd_free(zio->io_abd); } /* * zio_rewrite the data pointed to by bp with the data from the rrd's abd. */ static int do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, struct receive_record_arg *rrd, blkptr_t *bp) { int err; zio_t *io; zbookmark_phys_t zb; dnode_t *dn; abd_t *abd = rrd->abd; zio_cksum_t bp_cksum = bp->blk_cksum; zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL; if (rwa->raw) flags |= ZIO_FLAG_RAW; err = dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn); if (err != 0) return (err); SET_BOOKMARK(&zb, dmu_objset_id(rwa->os), drrw->drr_object, 0, dbuf_whichblock(dn, 0, drrw->drr_offset)); dnode_rele(dn, FTAG); if (!rwa->raw && DRR_WRITE_COMPRESSED(drrw)) { /* Decompress the stream data */ abd_t *dabd = abd_alloc_linear( drrw->drr_logical_size, B_FALSE); err = zio_decompress_data(drrw->drr_compressiontype, abd, dabd, abd_get_size(abd), abd_get_size(dabd), NULL); if (err != 0) { abd_free(dabd); return (err); } /* Swap in the newly decompressed data into the abd */ abd_free(abd); abd = dabd; } if (!rwa->raw && BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { /* Recompress the data */ abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE); uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp), abd, &cabd, abd_get_size(abd), BP_GET_PSIZE(bp), rwa->os->os_complevel); abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize); /* Swap in newly compressed data into the abd */ abd_free(abd); abd = cabd; flags |= ZIO_FLAG_RAW_COMPRESS; } /* * The stream is not encrypted but the data on-disk is. * We need to re-encrypt the buf using the same * encryption type, salt, iv, and mac that was used to encrypt * the block previosly. */ if (!rwa->raw && BP_USES_CRYPT(bp)) { dsl_dataset_t *ds; dsl_crypto_key_t *dck = NULL; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; dsl_pool_t *dp = dmu_objset_pool(rwa->os); abd_t *eabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); dsl_pool_config_enter(dp, FTAG); err = dsl_dataset_hold_flags(dp, rwa->tofs, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (err != 0) { dsl_pool_config_exit(dp, FTAG); abd_free(eabd); return (SET_ERROR(EACCES)); } /* Look up the key from the spa's keystore */ err = spa_keystore_lookup_key(rwa->os->os_spa, zb.zb_objset, FTAG, &dck); if (err != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dsl_pool_config_exit(dp, FTAG); abd_free(eabd); return (SET_ERROR(EACCES)); } err = zio_do_crypt_abd(B_TRUE, &dck->dck_key, BP_GET_TYPE(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, abd_get_size(abd), abd, eabd, &no_crypt); spa_keystore_dsl_key_rele(rwa->os->os_spa, dck, FTAG); dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dsl_pool_config_exit(dp, FTAG); ASSERT0(no_crypt); if (err != 0) { abd_free(eabd); return (err); } /* Swap in the newly encrypted data into the abd */ abd_free(abd); abd = eabd; /* * We want to prevent zio_rewrite() from trying to * encrypt the data again */ flags |= ZIO_FLAG_RAW_ENCRYPT; } rrd->abd = abd; io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp, abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, &zb); ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) || abd_get_size(abd) == BP_GET_PSIZE(bp)); /* compute new bp checksum value and make sure it matches the old one */ zio_checksum_compute(io, BP_GET_CHECKSUM(bp), abd, abd_get_size(abd)); if (!ZIO_CHECKSUM_EQUAL(bp_cksum, io->io_bp->blk_cksum)) { zio_destroy(io); if (zfs_recv_best_effort_corrective != 0) return (0); return (SET_ERROR(ECKSUM)); } /* Correct the corruption in place */ err = zio_wait(io); if (err == 0) { cr_cb_data_t *cb_data = kmem_alloc(sizeof (cr_cb_data_t), KM_SLEEP); cb_data->spa = rwa->os->os_spa; cb_data->size = drrw->drr_logical_size; cb_data->zb = zb; /* Test if healing worked by re-reading the bp */ err = zio_wait(zio_read(rwa->heal_pio, rwa->os->os_spa, bp, abd_alloc_for_io(drrw->drr_logical_size, B_FALSE), drrw->drr_logical_size, corrective_read_done, cb_data, ZIO_PRIORITY_ASYNC_READ, flags, NULL)); } if (err != 0 && zfs_recv_best_effort_corrective != 0) err = 0; return (err); } static int receive_read(dmu_recv_cookie_t *drc, int len, void *buf) { int done = 0; /* * The code doesn't rely on this (lengths being multiples of 8). See * comment in dump_bytes. */ ASSERT(len % 8 == 0 || (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0); while (done < len) { ssize_t resid = len - done; zfs_file_t *fp = drc->drc_fp; int err = zfs_file_read(fp, (char *)buf + done, len - done, &resid); if (err == 0 && resid == len - done) { /* * Note: ECKSUM or ZFS_ERR_STREAM_TRUNCATED indicates * that the receive was interrupted and can * potentially be resumed. */ err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED); } drc->drc_voff += len - done - resid; done = len - resid; if (err != 0) return (err); } drc->drc_bytes_read += len; ASSERT3U(done, ==, len); return (0); } static inline uint8_t deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) { if (bonus_type == DMU_OT_SA) { return (1); } else { return (1 + ((DN_OLD_MAX_BONUSLEN - MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT)); } } static void save_resume_state(struct receive_writer_arg *rwa, uint64_t object, uint64_t offset, dmu_tx_t *tx) { int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; if (!rwa->resumable) return; /* * We use ds_resume_bytes[] != 0 to indicate that we need to * update this on disk, so it must not be 0. */ ASSERT(rwa->bytes_read != 0); /* * We only resume from write records, which have a valid * (non-meta-dnode) object number. */ ASSERT(object != 0); /* * For resuming to work correctly, we must receive records in order, * sorted by object,offset. This is checked by the callers, but * assert it here for good measure. */ ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); ASSERT3U(rwa->bytes_read, >=, rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; } static int receive_object_is_same_generation(objset_t *os, uint64_t object, dmu_object_type_t old_bonus_type, dmu_object_type_t new_bonus_type, const void *new_bonus, boolean_t *samegenp) { zfs_file_info_t zoi; int err; dmu_buf_t *old_bonus_dbuf; err = dmu_bonus_hold(os, object, FTAG, &old_bonus_dbuf); if (err != 0) return (err); err = dmu_get_file_info(os, old_bonus_type, old_bonus_dbuf->db_data, &zoi); dmu_buf_rele(old_bonus_dbuf, FTAG); if (err != 0) return (err); uint64_t old_gen = zoi.zfi_generation; err = dmu_get_file_info(os, new_bonus_type, new_bonus, &zoi); if (err != 0) return (err); uint64_t new_gen = zoi.zfi_generation; *samegenp = (old_gen == new_gen); return (0); } static int receive_handle_existing_object(const struct receive_writer_arg *rwa, const struct drr_object *drro, const dmu_object_info_t *doi, const void *bonus_data, uint64_t *object_to_hold, uint32_t *new_blksz) { uint32_t indblksz = drro->drr_indblkshift ? 1ULL << drro->drr_indblkshift : 0; int nblkptr = deduce_nblkptr(drro->drr_bonustype, drro->drr_bonuslen); uint8_t dn_slots = drro->drr_dn_slots != 0 ? drro->drr_dn_slots : DNODE_MIN_SLOTS; boolean_t do_free_range = B_FALSE; int err; *object_to_hold = drro->drr_object; /* nblkptr should be bounded by the bonus size and type */ if (rwa->raw && nblkptr != drro->drr_nblkptr) return (SET_ERROR(EINVAL)); /* * After the previous send stream, the sending system may * have freed this object, and then happened to re-allocate * this object number in a later txg. In this case, we are * receiving a different logical file, and the block size may * appear to be different. i.e. we may have a different * block size for this object than what the send stream says. * In this case we need to remove the object's contents, * so that its structure can be changed and then its contents * entirely replaced by subsequent WRITE records. * * If this is a -L (--large-block) incremental stream, and * the previous stream was not -L, the block size may appear * to increase. i.e. we may have a smaller block size for * this object than what the send stream says. In this case * we need to keep the object's contents and block size * intact, so that we don't lose parts of the object's * contents that are not changed by this incremental send * stream. * * We can distinguish between the two above cases by using * the ZPL's generation number (see * receive_object_is_same_generation()). However, we only * want to rely on the generation number when absolutely * necessary, because with raw receives, the generation is * encrypted. We also want to minimize dependence on the * ZPL, so that other types of datasets can also be received * (e.g. ZVOLs, although note that ZVOLS currently do not * reallocate their objects or change their structure). * Therefore, we check a number of different cases where we * know it is safe to discard the object's contents, before * using the ZPL's generation number to make the above * distinction. */ if (drro->drr_blksz != doi->doi_data_block_size) { if (rwa->raw) { /* * RAW streams always have large blocks, so * we are sure that the data is not needed * due to changing --large-block to be on. * Which is fortunate since the bonus buffer * (which contains the ZPL generation) is * encrypted, and the key might not be * loaded. */ do_free_range = B_TRUE; } else if (rwa->full) { /* * This is a full send stream, so it always * replaces what we have. Even if the * generation numbers happen to match, this * can not actually be the same logical file. * This is relevant when receiving a full * send as a clone. */ do_free_range = B_TRUE; } else if (drro->drr_type != DMU_OT_PLAIN_FILE_CONTENTS || doi->doi_type != DMU_OT_PLAIN_FILE_CONTENTS) { /* * PLAIN_FILE_CONTENTS are the only type of * objects that have ever been stored with * large blocks, so we don't need the special * logic below. ZAP blocks can shrink (when * there's only one block), so we don't want * to hit the error below about block size * only increasing. */ do_free_range = B_TRUE; } else if (doi->doi_max_offset <= doi->doi_data_block_size) { /* * There is only one block. We can free it, * because its contents will be replaced by a * WRITE record. This can not be the no-L -> * -L case, because the no-L case would have * resulted in multiple blocks. If we * supported -L -> no-L, it would not be safe * to free the file's contents. Fortunately, * that is not allowed (see * recv_check_large_blocks()). */ do_free_range = B_TRUE; } else { boolean_t is_same_gen; err = receive_object_is_same_generation(rwa->os, drro->drr_object, doi->doi_bonus_type, drro->drr_bonustype, bonus_data, &is_same_gen); if (err != 0) return (SET_ERROR(EINVAL)); if (is_same_gen) { /* * This is the same logical file, and * the block size must be increasing. * It could only decrease if * --large-block was changed to be * off, which is checked in * recv_check_large_blocks(). */ if (drro->drr_blksz <= doi->doi_data_block_size) return (SET_ERROR(EINVAL)); /* * We keep the existing blocksize and * contents. */ *new_blksz = doi->doi_data_block_size; } else { do_free_range = B_TRUE; } } } /* nblkptr can only decrease if the object was reallocated */ if (nblkptr < doi->doi_nblkptr) do_free_range = B_TRUE; /* number of slots can only change on reallocation */ if (dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) do_free_range = B_TRUE; /* * For raw sends we also check a few other fields to * ensure we are preserving the objset structure exactly * as it was on the receive side: * - A changed indirect block size * - A smaller nlevels */ if (rwa->raw) { if (indblksz != doi->doi_metadata_block_size) do_free_range = B_TRUE; if (drro->drr_nlevels < doi->doi_indirection) do_free_range = B_TRUE; } if (do_free_range) { err = dmu_free_long_range(rwa->os, drro->drr_object, 0, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } /* * The dmu does not currently support decreasing nlevels or changing * indirect block size if there is already one, same as changing the * number of of dnode slots on an object. For non-raw sends this * does not matter and the new object can just use the previous one's * parameters. For raw sends, however, the structure of the received * dnode (including indirects and dnode slots) must match that of the * send side. Therefore, instead of using dmu_object_reclaim(), we * must free the object completely and call dmu_object_claim_dnsize() * instead. */ if ((rwa->raw && ((doi->doi_indirection > 1 && indblksz != doi->doi_metadata_block_size) || drro->drr_nlevels < doi->doi_indirection)) || dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) { err = dmu_free_long_object(rwa->os, drro->drr_object); if (err != 0) return (SET_ERROR(EINVAL)); txg_wait_synced(dmu_objset_pool(rwa->os), 0); *object_to_hold = DMU_NEW_OBJECT; } /* * For raw receives, free everything beyond the new incoming * maxblkid. Normally this would be done with a DRR_FREE * record that would come after this DRR_OBJECT record is * processed. However, for raw receives we manually set the * maxblkid from the drr_maxblkid and so we must first free * everything above that blkid to ensure the DMU is always * consistent with itself. We will never free the first block * of the object here because a maxblkid of 0 could indicate * an object with a single block or one with no blocks. This * free may be skipped when dmu_free_long_range() was called * above since it covers the entire object's contents. */ if (rwa->raw && *object_to_hold != DMU_NEW_OBJECT && !do_free_range) { err = dmu_free_long_range(rwa->os, drro->drr_object, (drro->drr_maxblkid + 1) * doi->doi_data_block_size, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } return (0); } noinline static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, void *data) { dmu_object_info_t doi; dmu_tx_t *tx; int err; uint32_t new_blksz = drro->drr_blksz; uint8_t dn_slots = drro->drr_dn_slots != 0 ? drro->drr_dn_slots : DNODE_MIN_SLOTS; if (drro->drr_type == DMU_OT_NONE || !DMU_OT_IS_VALID(drro->drr_type) || !DMU_OT_IS_VALID(drro->drr_bonustype) || drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || drro->drr_bonuslen > DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) || dn_slots > (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) { return (SET_ERROR(EINVAL)); } if (rwa->raw) { /* * We should have received a DRR_OBJECT_RANGE record * containing this block and stored it in rwa. */ if (drro->drr_object < rwa->or_firstobj || drro->drr_object >= rwa->or_firstobj + rwa->or_numslots || drro->drr_raw_bonuslen < drro->drr_bonuslen || drro->drr_indblkshift > SPA_MAXBLOCKSHIFT || drro->drr_nlevels > DN_MAX_LEVELS || drro->drr_nblkptr > DN_MAX_NBLKPTR || DN_SLOTS_TO_BONUSLEN(dn_slots) < drro->drr_raw_bonuslen) return (SET_ERROR(EINVAL)); } else { /* * The DRR_OBJECT_SPILL flag is valid when the DRR_BEGIN * record indicates this by setting DRR_FLAG_SPILL_BLOCK. */ if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) || (!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) { return (SET_ERROR(EINVAL)); } if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 || drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) { return (SET_ERROR(EINVAL)); } } err = dmu_object_info(rwa->os, drro->drr_object, &doi); if (err != 0 && err != ENOENT && err != EEXIST) return (SET_ERROR(EINVAL)); if (drro->drr_object > rwa->max_object) rwa->max_object = drro->drr_object; /* * If we are losing blkptrs or changing the block size this must * be a new file instance. We must clear out the previous file * contents before we can change this type of metadata in the dnode. * Raw receives will also check that the indirect structure of the * dnode hasn't changed. */ uint64_t object_to_hold; if (err == 0) { err = receive_handle_existing_object(rwa, drro, &doi, data, &object_to_hold, &new_blksz); if (err != 0) return (err); } else if (err == EEXIST) { /* * The object requested is currently an interior slot of a * multi-slot dnode. This will be resolved when the next txg * is synced out, since the send stream will have told us * to free this slot when we freed the associated dnode * earlier in the stream. */ txg_wait_synced(dmu_objset_pool(rwa->os), 0); if (dmu_object_info(rwa->os, drro->drr_object, NULL) != ENOENT) return (SET_ERROR(EINVAL)); /* object was freed and we are about to allocate a new one */ object_to_hold = DMU_NEW_OBJECT; } else { /* * If the only record in this range so far was DRR_FREEOBJECTS * with at least one actually freed object, it's possible that * the block will now be converted to a hole. We need to wait * for the txg to sync to prevent races. */ if (rwa->or_need_sync == ORNS_YES) txg_wait_synced(dmu_objset_pool(rwa->os), 0); /* object is free and we are about to allocate a new one */ object_to_hold = DMU_NEW_OBJECT; } /* Only relevant for the first object in the range */ rwa->or_need_sync = ORNS_NO; /* * If this is a multi-slot dnode there is a chance that this * object will expand into a slot that is already used by * another object from the previous snapshot. We must free * these objects before we attempt to allocate the new dnode. */ if (dn_slots > 1) { boolean_t need_sync = B_FALSE; for (uint64_t slot = drro->drr_object + 1; slot < drro->drr_object + dn_slots; slot++) { dmu_object_info_t slot_doi; err = dmu_object_info(rwa->os, slot, &slot_doi); if (err == ENOENT || err == EEXIST) continue; else if (err != 0) return (err); err = dmu_free_long_object(rwa->os, slot); if (err != 0) return (err); need_sync = B_TRUE; } if (need_sync) txg_wait_synced(dmu_objset_pool(rwa->os), 0); } tx = dmu_tx_create(rwa->os); dmu_tx_hold_bonus(tx, object_to_hold); dmu_tx_hold_write(tx, object_to_hold, 0, 0); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } if (object_to_hold == DMU_NEW_OBJECT) { /* Currently free, wants to be allocated */ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, tx); } else if (drro->drr_type != doi.doi_type || new_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* Currently allocated, but with different properties */ err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, rwa->spill ? DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx); } else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) { /* * Currently allocated, the existing version of this object * may reference a spill block that is no longer allocated * at the source and needs to be freed. */ err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx); } if (err != 0) { dmu_tx_commit(tx); return (SET_ERROR(EINVAL)); } if (rwa->or_crypt_params_present) { /* * Set the crypt params for the buffer associated with this * range of dnodes. This causes the blkptr_t to have the * same crypt params (byteorder, salt, iv, mac) as on the * sending side. * * Since we are committing this tx now, it is possible for * the dnode block to end up on-disk with the incorrect MAC, * if subsequent objects in this block are received in a * different txg. However, since the dataset is marked as * inconsistent, no code paths will do a non-raw read (or * decrypt the block / verify the MAC). The receive code and * scrub code can safely do raw reads and verify the * checksum. They don't need to verify the MAC. */ dmu_buf_t *db = NULL; uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE; err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os), offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT); if (err != 0) { dmu_tx_commit(tx); return (SET_ERROR(EINVAL)); } dmu_buf_set_crypt_params(db, rwa->or_byteorder, rwa->or_salt, rwa->or_iv, rwa->or_mac, tx); dmu_buf_rele(db, FTAG); rwa->or_crypt_params_present = B_FALSE; } dmu_object_set_checksum(rwa->os, drro->drr_object, drro->drr_checksumtype, tx); dmu_object_set_compress(rwa->os, drro->drr_object, drro->drr_compress, tx); /* handle more restrictive dnode structuring for raw recvs */ if (rwa->raw) { /* * Set the indirect block size, block shift, nlevels. * This will not fail because we ensured all of the * blocks were freed earlier if this is a new object. * For non-new objects block size and indirect block * shift cannot change and nlevels can only increase. */ ASSERT3U(new_blksz, ==, drro->drr_blksz); VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object, drro->drr_blksz, drro->drr_indblkshift, tx)); VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object, drro->drr_nlevels, tx)); /* * Set the maxblkid. This will always succeed because * we freed all blocks beyond the new maxblkid above. */ VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object, drro->drr_maxblkid, tx)); } if (data != NULL) { dmu_buf_t *db; dnode_t *dn; uint32_t flags = DMU_READ_NO_PREFETCH; if (rwa->raw) flags |= DMU_READ_NO_DECRYPT; VERIFY0(dnode_hold(rwa->os, drro->drr_object, FTAG, &dn)); VERIFY0(dmu_bonus_hold_by_dnode(dn, FTAG, &db, flags)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); memcpy(db->db_data, data, DRR_OBJECT_PAYLOAD_SIZE(drro)); /* * Raw bonus buffers have their byteorder determined by the * DRR_OBJECT_RANGE record. */ if (rwa->byteswap && !rwa->raw) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); dmu_ot_byteswap[byteswap].ob_func(db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro)); } dmu_buf_rele(db, FTAG); dnode_rele(dn, FTAG); } /* * If the receive fails, we want the resume stream to start with the * same record that we last successfully received. There is no way to * request resume from the object record, but we can benefit from the * fact that sender always sends object record before anything else, * after which it will "resend" data at offset 0 and resume normally. */ save_resume_state(rwa, drro->drr_object, 0, tx); dmu_tx_commit(tx); return (0); } noinline static int receive_freeobjects(struct receive_writer_arg *rwa, struct drr_freeobjects *drrfo) { uint64_t obj; int next_err = 0; if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) return (SET_ERROR(EINVAL)); for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs && obj < DN_MAX_OBJECT && next_err == 0; next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { dmu_object_info_t doi; int err; err = dmu_object_info(rwa->os, obj, &doi); if (err == ENOENT) continue; else if (err != 0) return (err); err = dmu_free_long_object(rwa->os, obj); if (err != 0) return (err); if (rwa->or_need_sync == ORNS_MAYBE) rwa->or_need_sync = ORNS_YES; } if (next_err != ESRCH) return (next_err); return (0); } /* * Note: if this fails, the caller will clean up any records left on the * rwa->write_batch list. */ static int flush_write_batch_impl(struct receive_writer_arg *rwa) { dnode_t *dn; int err; if (dnode_hold(rwa->os, rwa->last_object, FTAG, &dn) != 0) return (SET_ERROR(EINVAL)); struct receive_record_arg *last_rrd = list_tail(&rwa->write_batch); struct drr_write *last_drrw = &last_rrd->header.drr_u.drr_write; struct receive_record_arg *first_rrd = list_head(&rwa->write_batch); struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write; ASSERT3U(rwa->last_object, ==, last_drrw->drr_object); ASSERT3U(rwa->last_offset, ==, last_drrw->drr_offset); dmu_tx_t *tx = dmu_tx_create(rwa->os); dmu_tx_hold_write_by_dnode(tx, dn, first_drrw->drr_offset, last_drrw->drr_offset - first_drrw->drr_offset + last_drrw->drr_logical_size); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); dnode_rele(dn, FTAG); return (err); } struct receive_record_arg *rrd; while ((rrd = list_head(&rwa->write_batch)) != NULL) { struct drr_write *drrw = &rrd->header.drr_u.drr_write; abd_t *abd = rrd->abd; ASSERT3U(drrw->drr_object, ==, rwa->last_object); if (drrw->drr_logical_size != dn->dn_datablksz) { /* * The WRITE record is larger than the object's block * size. We must be receiving an incremental * large-block stream into a dataset that previously did * a non-large-block receive. Lightweight writes must * be exactly one block, so we need to decompress the * data (if compressed) and do a normal dmu_write(). */ ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz); if (DRR_WRITE_COMPRESSED(drrw)) { abd_t *decomp_abd = abd_alloc_linear(drrw->drr_logical_size, B_FALSE); err = zio_decompress_data( drrw->drr_compressiontype, abd, decomp_abd, abd_get_size(abd), abd_get_size(decomp_abd), NULL); if (err == 0) { dmu_write_by_dnode(dn, drrw->drr_offset, drrw->drr_logical_size, abd_to_buf(decomp_abd), tx); } abd_free(decomp_abd); } else { dmu_write_by_dnode(dn, drrw->drr_offset, drrw->drr_logical_size, abd_to_buf(abd), tx); } if (err == 0) abd_free(abd); } else { zio_prop_t zp = {0}; dmu_write_policy(rwa->os, dn, 0, 0, &zp); zio_flag_t zio_flags = 0; if (rwa->raw) { zp.zp_encrypt = B_TRUE; zp.zp_compress = drrw->drr_compressiontype; zp.zp_byteorder = ZFS_HOST_BYTEORDER ^ !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ rwa->byteswap; memcpy(zp.zp_salt, drrw->drr_salt, ZIO_DATA_SALT_LEN); memcpy(zp.zp_iv, drrw->drr_iv, ZIO_DATA_IV_LEN); memcpy(zp.zp_mac, drrw->drr_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) { zp.zp_nopwrite = B_FALSE; zp.zp_copies = MIN(zp.zp_copies, SPA_DVAS_PER_BP - 1); } zio_flags |= ZIO_FLAG_RAW; } else if (DRR_WRITE_COMPRESSED(drrw)) { ASSERT3U(drrw->drr_compressed_size, >, 0); ASSERT3U(drrw->drr_logical_size, >=, drrw->drr_compressed_size); zp.zp_compress = drrw->drr_compressiontype; zio_flags |= ZIO_FLAG_RAW_COMPRESS; } else if (rwa->byteswap) { /* * Note: compressed blocks never need to be * byteswapped, because WRITE records for * metadata blocks are never compressed. The * exception is raw streams, which are written * in the original byteorder, and the byteorder * bit is preserved in the BP by setting * zp_byteorder above. */ dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func( abd_to_buf(abd), DRR_WRITE_PAYLOAD_SIZE(drrw)); } /* * Since this data can't be read until the receive * completes, we can do a "lightweight" write for * improved performance. */ err = dmu_lightweight_write_by_dnode(dn, drrw->drr_offset, abd, &zp, zio_flags, tx); } if (err != 0) { /* * This rrd is left on the list, so the caller will * free it (and the abd). */ break; } /* * Note: If the receive fails, we want the resume stream to * start with the same record that we last successfully * received (as opposed to the next record), so that we can * verify that we are resuming from the correct location. */ save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); list_remove(&rwa->write_batch, rrd); kmem_free(rrd, sizeof (*rrd)); } dmu_tx_commit(tx); dnode_rele(dn, FTAG); return (err); } noinline static int flush_write_batch(struct receive_writer_arg *rwa) { if (list_is_empty(&rwa->write_batch)) return (0); int err = rwa->err; if (err == 0) err = flush_write_batch_impl(rwa); if (err != 0) { struct receive_record_arg *rrd; while ((rrd = list_remove_head(&rwa->write_batch)) != NULL) { abd_free(rrd->abd); kmem_free(rrd, sizeof (*rrd)); } } ASSERT(list_is_empty(&rwa->write_batch)); return (err); } noinline static int receive_process_write_record(struct receive_writer_arg *rwa, struct receive_record_arg *rrd) { int err = 0; ASSERT3U(rrd->header.drr_type, ==, DRR_WRITE); struct drr_write *drrw = &rrd->header.drr_u.drr_write; if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); if (rwa->heal) { blkptr_t *bp; dmu_buf_t *dbp; int flags = DB_RF_CANFAIL; if (rwa->raw) flags |= DB_RF_NO_DECRYPT; if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(rrd->abd), DRR_WRITE_PAYLOAD_SIZE(drrw)); } err = dmu_buf_hold_noread(rwa->os, drrw->drr_object, drrw->drr_offset, FTAG, &dbp); if (err != 0) return (err); /* Try to read the object to see if it needs healing */ err = dbuf_read((dmu_buf_impl_t *)dbp, NULL, flags); /* * We only try to heal when dbuf_read() returns a ECKSUMs. * Other errors (even EIO) get returned to caller. * EIO indicates that the device is not present/accessible, * so writing to it will likely fail. * If the block is healthy, we don't want to overwrite it * unnecessarily. */ if (err != ECKSUM) { dmu_buf_rele(dbp, FTAG); return (err); } /* Make sure the on-disk block and recv record sizes match */ if (drrw->drr_logical_size != dbp->db_size) { err = ENOTSUP; dmu_buf_rele(dbp, FTAG); return (err); } /* Get the block pointer for the corrupted block */ bp = dmu_buf_get_blkptr(dbp); err = do_corrective_recv(rwa, drrw, rrd, bp); dmu_buf_rele(dbp, FTAG); return (err); } /* * For resuming to work, records must be in increasing order * by (object, offset). */ if (drrw->drr_object < rwa->last_object || (drrw->drr_object == rwa->last_object && drrw->drr_offset < rwa->last_offset)) { return (SET_ERROR(EINVAL)); } struct receive_record_arg *first_rrd = list_head(&rwa->write_batch); struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write; uint64_t batch_size = MIN(zfs_recv_write_batch_size, DMU_MAX_ACCESS / 2); if (first_rrd != NULL && (drrw->drr_object != first_drrw->drr_object || drrw->drr_offset >= first_drrw->drr_offset + batch_size)) { err = flush_write_batch(rwa); if (err != 0) return (err); } rwa->last_object = drrw->drr_object; rwa->last_offset = drrw->drr_offset; if (rwa->last_object > rwa->max_object) rwa->max_object = rwa->last_object; list_insert_tail(&rwa->write_batch, rrd); /* * Return EAGAIN to indicate that we will use this rrd again, * so the caller should not free it */ return (EAGAIN); } static int receive_write_embedded(struct receive_writer_arg *rwa, struct drr_write_embedded *drrwe, void *data) { dmu_tx_t *tx; int err; if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) return (SET_ERROR(EINVAL)); if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) return (SET_ERROR(EINVAL)); if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) return (SET_ERROR(EINVAL)); if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (SET_ERROR(EINVAL)); if (rwa->raw) return (SET_ERROR(EINVAL)); if (drrwe->drr_object > rwa->max_object) rwa->max_object = drrwe->drr_object; tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } dmu_write_embedded(rwa->os, drrwe->drr_object, drrwe->drr_offset, data, drrwe->drr_etype, drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); /* See comment in restore_write. */ save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); dmu_tx_commit(tx); return (0); } static int receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, abd_t *abd) { dmu_buf_t *db, *db_spill; int err; if (drrs->drr_length < SPA_MINBLOCKSIZE || drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) return (SET_ERROR(EINVAL)); /* * This is an unmodified spill block which was added to the stream * to resolve an issue with incorrectly removing spill blocks. It * should be ignored by current versions of the code which support * the DRR_FLAG_SPILL_BLOCK flag. */ if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) { abd_free(abd); return (0); } if (rwa->raw) { if (!DMU_OT_IS_VALID(drrs->drr_type) || drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS || drrs->drr_compressed_size == 0) return (SET_ERROR(EINVAL)); } if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); if (drrs->drr_object > rwa->max_object) rwa->max_object = drrs->drr_object; VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); return (err); } dmu_tx_t *tx = dmu_tx_create(rwa->os); dmu_tx_hold_spill(tx, db->db_object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_abort(tx); return (err); } /* * Spill blocks may both grow and shrink. When a change in size * occurs any existing dbuf must be updated to match the logical * size of the provided arc_buf_t. */ if (db_spill->db_size != drrs->drr_length) { dmu_buf_will_fill(db_spill, tx, B_FALSE); VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); } arc_buf_t *abuf; if (rwa->raw) { boolean_t byteorder = ZFS_HOST_BYTEORDER ^ !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^ rwa->byteswap; abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os), drrs->drr_object, byteorder, drrs->drr_salt, drrs->drr_iv, drrs->drr_mac, drrs->drr_type, drrs->drr_compressed_size, drrs->drr_length, drrs->drr_compressiontype, 0); } else { abuf = arc_loan_buf(dmu_objset_spa(rwa->os), DMU_OT_IS_METADATA(drrs->drr_type), drrs->drr_length); if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrs->drr_type); dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs)); } } memcpy(abuf->b_data, abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs)); abd_free(abd); dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_commit(tx); return (0); } noinline static int receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) { int err; if (drrf->drr_length != -1ULL && drrf->drr_offset + drrf->drr_length < drrf->drr_offset) return (SET_ERROR(EINVAL)); if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); if (drrf->drr_object > rwa->max_object) rwa->max_object = drrf->drr_object; err = dmu_free_long_range(rwa->os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); return (err); } static int receive_object_range(struct receive_writer_arg *rwa, struct drr_object_range *drror) { /* * By default, we assume this block is in our native format * (ZFS_HOST_BYTEORDER). We then take into account whether * the send stream is byteswapped (rwa->byteswap). Finally, * we need to byteswap again if this particular block was * in non-native format on the send side. */ boolean_t byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^ !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags); /* * Since dnode block sizes are constant, we should not need to worry * about making sure that the dnode block size is the same on the * sending and receiving sides for the time being. For non-raw sends, * this does not matter (and in fact we do not send a DRR_OBJECT_RANGE * record at all). Raw sends require this record type because the * encryption parameters are used to protect an entire block of bonus * buffers. If the size of dnode blocks ever becomes variable, * handling will need to be added to ensure that dnode block sizes * match on the sending and receiving side. */ if (drror->drr_numslots != DNODES_PER_BLOCK || P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0 || !rwa->raw) return (SET_ERROR(EINVAL)); if (drror->drr_firstobj > rwa->max_object) rwa->max_object = drror->drr_firstobj; /* * The DRR_OBJECT_RANGE handling must be deferred to receive_object() * so that the block of dnodes is not written out when it's empty, * and converted to a HOLE BP. */ rwa->or_crypt_params_present = B_TRUE; rwa->or_firstobj = drror->drr_firstobj; rwa->or_numslots = drror->drr_numslots; memcpy(rwa->or_salt, drror->drr_salt, ZIO_DATA_SALT_LEN); memcpy(rwa->or_iv, drror->drr_iv, ZIO_DATA_IV_LEN); memcpy(rwa->or_mac, drror->drr_mac, ZIO_DATA_MAC_LEN); rwa->or_byteorder = byteorder; rwa->or_need_sync = ORNS_MAYBE; return (0); } /* * Until we have the ability to redact large ranges of data efficiently, we * process these records as frees. */ noinline static int receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr) { struct drr_free drrf = {0}; drrf.drr_length = drrr->drr_length; drrf.drr_object = drrr->drr_object; drrf.drr_offset = drrr->drr_offset; drrf.drr_toguid = drrr->drr_toguid; return (receive_free(rwa, &drrf)); } /* used to destroy the drc_ds on error */ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { dsl_dataset_t *ds = drc->drc_ds; ds_hold_flags_t dsflags; dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; /* * Wait for the txg sync before cleaning up the receive. For * resumable receives, this ensures that our resume state has * been written out to disk. For raw receives, this ensures * that the user accounting code will not attempt to do anything * after we stopped receiving the dataset. */ txg_wait_synced(ds->ds_dir->dd_pool, 0); ds->ds_objset->os_raw_receive = B_FALSE; rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); if (drc->drc_resumable && drc->drc_should_save && !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) { rrw_exit(&ds->ds_bp_rwlock, FTAG); dsl_dataset_disown(ds, dsflags, dmu_recv_tag); } else { char name[ZFS_MAX_DATASET_NAME_LEN]; rrw_exit(&ds->ds_bp_rwlock, FTAG); dsl_dataset_name(ds, name); dsl_dataset_disown(ds, dsflags, dmu_recv_tag); if (!drc->drc_heal) (void) dsl_destroy_head(name); } } static void receive_cksum(dmu_recv_cookie_t *drc, int len, void *buf) { if (drc->drc_byteswap) { (void) fletcher_4_incremental_byteswap(buf, len, &drc->drc_cksum); } else { (void) fletcher_4_incremental_native(buf, len, &drc->drc_cksum); } } /* * Read the payload into a buffer of size len, and update the current record's * payload field. * Allocate drc->drc_next_rrd and read the next record's header into * drc->drc_next_rrd->header. * Verify checksum of payload and next record. */ static int receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf) { int err; if (len != 0) { ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); err = receive_read(drc, len, buf); if (err != 0) return (err); receive_cksum(drc, len, buf); /* note: rrd is NULL when reading the begin record's payload */ if (drc->drc_rrd != NULL) { drc->drc_rrd->payload = buf; drc->drc_rrd->payload_size = len; drc->drc_rrd->bytes_read = drc->drc_bytes_read; } } else { ASSERT3P(buf, ==, NULL); } drc->drc_prev_cksum = drc->drc_cksum; drc->drc_next_rrd = kmem_zalloc(sizeof (*drc->drc_next_rrd), KM_SLEEP); err = receive_read(drc, sizeof (drc->drc_next_rrd->header), &drc->drc_next_rrd->header); drc->drc_next_rrd->bytes_read = drc->drc_bytes_read; if (err != 0) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); drc->drc_next_rrd = NULL; return (err); } if (drc->drc_next_rrd->header.drr_type == DRR_BEGIN) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); drc->drc_next_rrd = NULL; return (SET_ERROR(EINVAL)); } /* * Note: checksum is of everything up to but not including the * checksum itself. */ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); receive_cksum(drc, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &drc->drc_next_rrd->header); zio_cksum_t cksum_orig = drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum; zio_cksum_t *cksump = &drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum; if (drc->drc_byteswap) byteswap_record(&drc->drc_next_rrd->header); if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && !ZIO_CHECKSUM_EQUAL(drc->drc_cksum, *cksump)) { kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); drc->drc_next_rrd = NULL; return (SET_ERROR(ECKSUM)); } receive_cksum(drc, sizeof (cksum_orig), &cksum_orig); return (0); } /* * Issue the prefetch reads for any necessary indirect blocks. * * We use the object ignore list to tell us whether or not to issue prefetches * for a given object. We do this for both correctness (in case the blocksize * of an object has changed) and performance (if the object doesn't exist, don't * needlessly try to issue prefetches). We also trim the list as we go through * the stream to prevent it from growing to an unbounded size. * * The object numbers within will always be in sorted order, and any write * records we see will also be in sorted order, but they're not sorted with * respect to each other (i.e. we can get several object records before * receiving each object's write records). As a result, once we've reached a * given object number, we can safely remove any reference to lower object * numbers in the ignore list. In practice, we receive up to 32 object records * before receiving write records, so the list can have up to 32 nodes in it. */ static void receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset, uint64_t length) { if (!objlist_exists(drc->drc_ignore_objlist, object)) { dmu_prefetch(drc->drc_os, object, 1, offset, length, ZIO_PRIORITY_SYNC_READ); } } /* * Read records off the stream, issuing any necessary prefetches. */ static int receive_read_record(dmu_recv_cookie_t *drc) { int err; switch (drc->drc_rrd->header.drr_type) { case DRR_OBJECT: { struct drr_object *drro = &drc->drc_rrd->header.drr_u.drr_object; uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro); void *buf = NULL; dmu_object_info_t doi; if (size != 0) buf = kmem_zalloc(size, KM_SLEEP); err = receive_read_payload_and_next_header(drc, size, buf); if (err != 0) { kmem_free(buf, size); return (err); } err = dmu_object_info(drc->drc_os, drro->drr_object, &doi); /* * See receive_read_prefetch for an explanation why we're * storing this object in the ignore_obj_list. */ if (err == ENOENT || err == EEXIST || (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { objlist_insert(drc->drc_ignore_objlist, drro->drr_object); err = 0; } return (err); } case DRR_FREEOBJECTS: { err = receive_read_payload_and_next_header(drc, 0, NULL); return (err); } case DRR_WRITE: { struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write; int size = DRR_WRITE_PAYLOAD_SIZE(drrw); abd_t *abd = abd_alloc_linear(size, B_FALSE); err = receive_read_payload_and_next_header(drc, size, abd_to_buf(abd)); if (err != 0) { abd_free(abd); return (err); } drc->drc_rrd->abd = abd; receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); return (err); } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &drc->drc_rrd->header.drr_u.drr_write_embedded; uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); void *buf = kmem_zalloc(size, KM_SLEEP); err = receive_read_payload_and_next_header(drc, size, buf); if (err != 0) { kmem_free(buf, size); return (err); } receive_read_prefetch(drc, drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length); return (err); } case DRR_FREE: case DRR_REDACT: { /* * It might be beneficial to prefetch indirect blocks here, but * we don't really have the data to decide for sure. */ err = receive_read_payload_and_next_header(drc, 0, NULL); return (err); } case DRR_END: { struct drr_end *drre = &drc->drc_rrd->header.drr_u.drr_end; if (!ZIO_CHECKSUM_EQUAL(drc->drc_prev_cksum, drre->drr_checksum)) return (SET_ERROR(ECKSUM)); return (0); } case DRR_SPILL: { struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill; int size = DRR_SPILL_PAYLOAD_SIZE(drrs); abd_t *abd = abd_alloc_linear(size, B_FALSE); err = receive_read_payload_and_next_header(drc, size, abd_to_buf(abd)); if (err != 0) abd_free(abd); else drc->drc_rrd->abd = abd; return (err); } case DRR_OBJECT_RANGE: { err = receive_read_payload_and_next_header(drc, 0, NULL); return (err); } default: return (SET_ERROR(EINVAL)); } } static void dprintf_drr(struct receive_record_arg *rrd, int err) { #ifdef ZFS_DEBUG switch (rrd->header.drr_type) { case DRR_OBJECT: { struct drr_object *drro = &rrd->header.drr_u.drr_object; dprintf("drr_type = OBJECT obj = %llu type = %u " "bonustype = %u blksz = %u bonuslen = %u cksumtype = %u " "compress = %u dn_slots = %u err = %d\n", (u_longlong_t)drro->drr_object, drro->drr_type, drro->drr_bonustype, drro->drr_blksz, drro->drr_bonuslen, drro->drr_checksumtype, drro->drr_compress, drro->drr_dn_slots, err); break; } case DRR_FREEOBJECTS: { struct drr_freeobjects *drrfo = &rrd->header.drr_u.drr_freeobjects; dprintf("drr_type = FREEOBJECTS firstobj = %llu " "numobjs = %llu err = %d\n", (u_longlong_t)drrfo->drr_firstobj, (u_longlong_t)drrfo->drr_numobjs, err); break; } case DRR_WRITE: { struct drr_write *drrw = &rrd->header.drr_u.drr_write; dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu " "lsize = %llu cksumtype = %u flags = %u " "compress = %u psize = %llu err = %d\n", (u_longlong_t)drrw->drr_object, drrw->drr_type, (u_longlong_t)drrw->drr_offset, (u_longlong_t)drrw->drr_logical_size, drrw->drr_checksumtype, drrw->drr_flags, drrw->drr_compressiontype, (u_longlong_t)drrw->drr_compressed_size, err); break; } case DRR_WRITE_BYREF: { struct drr_write_byref *drrwbr = &rrd->header.drr_u.drr_write_byref; dprintf("drr_type = WRITE_BYREF obj = %llu offset = %llu " "length = %llu toguid = %llx refguid = %llx " "refobject = %llu refoffset = %llu cksumtype = %u " "flags = %u err = %d\n", (u_longlong_t)drrwbr->drr_object, (u_longlong_t)drrwbr->drr_offset, (u_longlong_t)drrwbr->drr_length, (u_longlong_t)drrwbr->drr_toguid, (u_longlong_t)drrwbr->drr_refguid, (u_longlong_t)drrwbr->drr_refobject, (u_longlong_t)drrwbr->drr_refoffset, drrwbr->drr_checksumtype, drrwbr->drr_flags, err); break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &rrd->header.drr_u.drr_write_embedded; dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu " "length = %llu compress = %u etype = %u lsize = %u " "psize = %u err = %d\n", (u_longlong_t)drrwe->drr_object, (u_longlong_t)drrwe->drr_offset, (u_longlong_t)drrwe->drr_length, drrwe->drr_compression, drrwe->drr_etype, drrwe->drr_lsize, drrwe->drr_psize, err); break; } case DRR_FREE: { struct drr_free *drrf = &rrd->header.drr_u.drr_free; dprintf("drr_type = FREE obj = %llu offset = %llu " "length = %lld err = %d\n", (u_longlong_t)drrf->drr_object, (u_longlong_t)drrf->drr_offset, (longlong_t)drrf->drr_length, err); break; } case DRR_SPILL: { struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; dprintf("drr_type = SPILL obj = %llu length = %llu " "err = %d\n", (u_longlong_t)drrs->drr_object, (u_longlong_t)drrs->drr_length, err); break; } case DRR_OBJECT_RANGE: { struct drr_object_range *drror = &rrd->header.drr_u.drr_object_range; dprintf("drr_type = OBJECT_RANGE firstobj = %llu " "numslots = %llu flags = %u err = %d\n", (u_longlong_t)drror->drr_firstobj, (u_longlong_t)drror->drr_numslots, drror->drr_flags, err); break; } default: return; } #endif } /* * Commit the records to the pool. */ static int receive_process_record(struct receive_writer_arg *rwa, struct receive_record_arg *rrd) { int err; /* Processing in order, therefore bytes_read should be increasing. */ ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); rwa->bytes_read = rrd->bytes_read; /* We can only heal write records; other ones get ignored */ if (rwa->heal && rrd->header.drr_type != DRR_WRITE) { if (rrd->abd != NULL) { abd_free(rrd->abd); rrd->abd = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } return (0); } if (!rwa->heal && rrd->header.drr_type != DRR_WRITE) { err = flush_write_batch(rwa); if (err != 0) { if (rrd->abd != NULL) { abd_free(rrd->abd); rrd->abd = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } return (err); } } switch (rrd->header.drr_type) { case DRR_OBJECT: { struct drr_object *drro = &rrd->header.drr_u.drr_object; err = receive_object(rwa, drro, rrd->payload); kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; break; } case DRR_FREEOBJECTS: { struct drr_freeobjects *drrfo = &rrd->header.drr_u.drr_freeobjects; err = receive_freeobjects(rwa, drrfo); break; } case DRR_WRITE: { err = receive_process_write_record(rwa, rrd); if (rwa->heal) { /* * If healing - always free the abd after processing */ abd_free(rrd->abd); rrd->abd = NULL; } else if (err != EAGAIN) { /* * On success, a non-healing * receive_process_write_record() returns * EAGAIN to indicate that we do not want to free * the rrd or arc_buf. */ ASSERT(err != 0); abd_free(rrd->abd); rrd->abd = NULL; } break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &rrd->header.drr_u.drr_write_embedded; err = receive_write_embedded(rwa, drrwe, rrd->payload); kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; break; } case DRR_FREE: { struct drr_free *drrf = &rrd->header.drr_u.drr_free; err = receive_free(rwa, drrf); break; } case DRR_SPILL: { struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; err = receive_spill(rwa, drrs, rrd->abd); if (err != 0) abd_free(rrd->abd); rrd->abd = NULL; rrd->payload = NULL; break; } case DRR_OBJECT_RANGE: { struct drr_object_range *drror = &rrd->header.drr_u.drr_object_range; err = receive_object_range(rwa, drror); break; } case DRR_REDACT: { struct drr_redact *drrr = &rrd->header.drr_u.drr_redact; err = receive_redact(rwa, drrr); break; } default: err = (SET_ERROR(EINVAL)); } if (err != 0) dprintf_drr(rrd, err); return (err); } /* * dmu_recv_stream's worker thread; pull records off the queue, and then call * receive_process_record When we're done, signal the main thread and exit. */ static __attribute__((noreturn)) void receive_writer_thread(void *arg) { struct receive_writer_arg *rwa = arg; struct receive_record_arg *rrd; fstrans_cookie_t cookie = spl_fstrans_mark(); for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; rrd = bqueue_dequeue(&rwa->q)) { /* * If there's an error, the main thread will stop putting things * on the queue, but we need to clear everything in it before we * can exit. */ int err = 0; if (rwa->err == 0) { err = receive_process_record(rwa, rrd); } else if (rrd->abd != NULL) { abd_free(rrd->abd); rrd->abd = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } /* * EAGAIN indicates that this record has been saved (on * raw->write_batch), and will be used again, so we don't * free it. * When healing data we always need to free the record. */ if (err != EAGAIN || rwa->heal) { if (rwa->err == 0) rwa->err = err; kmem_free(rrd, sizeof (*rrd)); } } kmem_free(rrd, sizeof (*rrd)); if (rwa->heal) { zio_wait(rwa->heal_pio); } else { int err = flush_write_batch(rwa); if (rwa->err == 0) rwa->err = err; } mutex_enter(&rwa->mutex); rwa->done = B_TRUE; cv_signal(&rwa->cv); mutex_exit(&rwa->mutex); spl_fstrans_unmark(cookie); thread_exit(); } static int resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl) { uint64_t val; objset_t *mos = dmu_objset_pool(drc->drc_os)->dp_meta_objset; uint64_t dsobj = dmu_objset_id(drc->drc_os); uint64_t resume_obj, resume_off; if (nvlist_lookup_uint64(begin_nvl, "resume_object", &resume_obj) != 0 || nvlist_lookup_uint64(begin_nvl, "resume_offset", &resume_off) != 0) { return (SET_ERROR(EINVAL)); } VERIFY0(zap_lookup(mos, dsobj, DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); if (resume_obj != val) return (SET_ERROR(EINVAL)); VERIFY0(zap_lookup(mos, dsobj, DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); if (resume_off != val) return (SET_ERROR(EINVAL)); return (0); } /* * Read in the stream's records, one by one, and apply them to the pool. There * are two threads involved; the thread that calls this function will spin up a * worker thread, read the records off the stream one by one, and issue * prefetches for any necessary indirect blocks. It will then push the records * onto an internal blocking queue. The worker thread will pull the records off * the queue, and actually write the data into the DMU. This way, the worker * thread doesn't have to wait for reads to complete, since everything it needs * (the indirect blocks) will be prefetched. * * NB: callers *must* call dmu_recv_end() if this succeeds. */ int dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) { int err = 0; struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP); if (dsl_dataset_has_resume_receive_state(drc->drc_ds)) { uint64_t bytes = 0; (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, sizeof (bytes), 1, &bytes); drc->drc_bytes_read += bytes; } drc->drc_ignore_objlist = objlist_create(); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, DMU_SUBSTREAM); ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); ASSERT0(drc->drc_os->os_encrypted && (drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)); /* handle DSL encryption key payload */ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) { nvlist_t *keynvl = NULL; ASSERT(drc->drc_os->os_encrypted); ASSERT(drc->drc_raw); err = nvlist_lookup_nvlist(drc->drc_begin_nvl, "crypt_keydata", &keynvl); if (err != 0) goto out; if (!drc->drc_heal) { /* * If this is a new dataset we set the key immediately. * Otherwise we don't want to change the key until we * are sure the rest of the receive succeeded so we * stash the keynvl away until then. */ err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), drc->drc_ds->ds_object, drc->drc_fromsnapobj, drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); if (err != 0) goto out; } /* see comment in dmu_recv_end_sync() */ drc->drc_ivset_guid = 0; (void) nvlist_lookup_uint64(keynvl, "to_ivset_guid", &drc->drc_ivset_guid); if (!drc->drc_newfs) drc->drc_keynvl = fnvlist_dup(keynvl); } if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) { err = resume_check(drc, drc->drc_begin_nvl); if (err != 0) goto out; } /* * For compatibility with recursive send streams, we do this here, * rather than in dmu_recv_begin. If we pull the next header too * early, and it's the END record, we break the `recv_skip` logic. */ if (drc->drc_drr_begin->drr_payloadlen == 0) { err = receive_read_payload_and_next_header(drc, 0, NULL); if (err != 0) goto out; } /* * If we failed before this point we will clean up any new resume * state that was created. Now that we've gotten past the initial * checks we are ok to retain that resume state. */ drc->drc_should_save = B_TRUE; (void) bqueue_init(&rwa->q, zfs_recv_queue_ff, MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize), offsetof(struct receive_record_arg, node)); cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL); mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL); rwa->os = drc->drc_os; rwa->byteswap = drc->drc_byteswap; rwa->heal = drc->drc_heal; rwa->tofs = drc->drc_tofs; rwa->resumable = drc->drc_resumable; rwa->raw = drc->drc_raw; rwa->spill = drc->drc_spill; rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0); rwa->os->os_raw_receive = drc->drc_raw; if (drc->drc_heal) { rwa->heal_pio = zio_root(drc->drc_os->os_spa, NULL, NULL, ZIO_FLAG_GODFATHER); } list_create(&rwa->write_batch, sizeof (struct receive_record_arg), offsetof(struct receive_record_arg, node.bqn_node)); (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc, TS_RUN, minclsyspri); /* * We're reading rwa->err without locks, which is safe since we are the * only reader, and the worker thread is the only writer. It's ok if we * miss a write for an iteration or two of the loop, since the writer * thread will keep freeing records we send it until we send it an eos * marker. * * We can leave this loop in 3 ways: First, if rwa->err is * non-zero. In that case, the writer thread will free the rrd we just * pushed. Second, if we're interrupted; in that case, either it's the * first loop and drc->drc_rrd was never allocated, or it's later, and * drc->drc_rrd has been handed off to the writer thread who will free * it. Finally, if receive_read_record fails or we're at the end of the * stream, then we free drc->drc_rrd and exit. */ while (rwa->err == 0) { if (issig()) { err = SET_ERROR(EINTR); break; } ASSERT3P(drc->drc_rrd, ==, NULL); drc->drc_rrd = drc->drc_next_rrd; drc->drc_next_rrd = NULL; /* Allocates and loads header into drc->drc_next_rrd */ err = receive_read_record(drc); if (drc->drc_rrd->header.drr_type == DRR_END || err != 0) { kmem_free(drc->drc_rrd, sizeof (*drc->drc_rrd)); drc->drc_rrd = NULL; break; } bqueue_enqueue(&rwa->q, drc->drc_rrd, sizeof (struct receive_record_arg) + drc->drc_rrd->payload_size); drc->drc_rrd = NULL; } ASSERT3P(drc->drc_rrd, ==, NULL); drc->drc_rrd = kmem_zalloc(sizeof (*drc->drc_rrd), KM_SLEEP); drc->drc_rrd->eos_marker = B_TRUE; bqueue_enqueue_flush(&rwa->q, drc->drc_rrd, 1); mutex_enter(&rwa->mutex); while (!rwa->done) { /* * We need to use cv_wait_sig() so that any process that may * be sleeping here can still fork. */ (void) cv_wait_sig(&rwa->cv, &rwa->mutex); } mutex_exit(&rwa->mutex); /* * If we are receiving a full stream as a clone, all object IDs which * are greater than the maximum ID referenced in the stream are * by definition unused and must be freed. */ if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) { uint64_t obj = rwa->max_object + 1; int free_err = 0; int next_err = 0; while (next_err == 0) { free_err = dmu_free_long_object(rwa->os, obj); if (free_err != 0 && free_err != ENOENT) break; next_err = dmu_object_next(rwa->os, &obj, FALSE, 0); } if (err == 0) { if (free_err != 0 && free_err != ENOENT) err = free_err; else if (next_err != ESRCH) err = next_err; } } cv_destroy(&rwa->cv); mutex_destroy(&rwa->mutex); bqueue_destroy(&rwa->q); list_destroy(&rwa->write_batch); if (err == 0) err = rwa->err; out: /* * If we hit an error before we started the receive_writer_thread * we need to clean up the next_rrd we create by processing the * DRR_BEGIN record. */ if (drc->drc_next_rrd != NULL) kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); /* * The objset will be invalidated by dmu_recv_end() when we do * dsl_dataset_clone_swap_sync_impl(). */ drc->drc_os = NULL; kmem_free(rwa, sizeof (*rwa)); nvlist_free(drc->drc_begin_nvl); if (err != 0) { /* * Clean up references. If receive is not resumable, * destroy what we created, so we don't leave it in * the inconsistent state. */ dmu_recv_cleanup_ds(drc); nvlist_free(drc->drc_keynvl); } objlist_destroy(drc->drc_ignore_objlist); drc->drc_ignore_objlist = NULL; *voffp = drc->drc_voff; return (err); } static int dmu_recv_end_check(void *arg, dmu_tx_t *tx) { dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); int error; ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); if (drc->drc_heal) { error = 0; } else if (!drc->drc_newfs) { dsl_dataset_t *origin_head; error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); if (error != 0) return (error); if (drc->drc_force) { /* * We will destroy any snapshots in tofs (i.e. before * origin_head) that are after the origin (which is * the snap before drc_ds, because drc_ds can not * have any snaps of its own). */ uint64_t obj; obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; while (obj != dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) break; if (snap->ds_dir != origin_head->ds_dir) error = SET_ERROR(EINVAL); if (error == 0) { error = dsl_destroy_snapshot_check_impl( snap, B_FALSE); } obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); if (error != 0) break; } if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); } } if (drc->drc_keynvl != NULL) { error = dsl_crypto_recv_raw_key_check(drc->drc_ds, drc->drc_keynvl, tx); if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); } } error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, origin_head, drc->drc_force, drc->drc_owner, tx); if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); } error = dsl_dataset_snapshot_check_impl(origin_head, drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred, drc->drc_proc); dsl_dataset_rele(origin_head, FTAG); if (error != 0) return (error); error = dsl_destroy_head_check_impl(drc->drc_ds, 1); } else { error = dsl_dataset_snapshot_check_impl(drc->drc_ds, drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred, drc->drc_proc); } return (error); } static void dmu_recv_end_sync(void *arg, dmu_tx_t *tx) { dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0; uint64_t newsnapobj = 0; spa_history_log_internal_ds(drc->drc_ds, "finish receiving", tx, "snap=%s", drc->drc_tosnap); drc->drc_ds->ds_objset->os_raw_receive = B_FALSE; if (drc->drc_heal) { if (drc->drc_keynvl != NULL) { nvlist_free(drc->drc_keynvl); drc->drc_keynvl = NULL; } } else if (!drc->drc_newfs) { dsl_dataset_t *origin_head; VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head)); if (drc->drc_force) { /* * Destroy any snapshots of drc_tofs (origin_head) * after the origin (the snap before drc_ds). */ uint64_t obj; obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; while (obj != dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &snap)); ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_destroy_snapshot_sync_impl(snap, B_FALSE, tx); dsl_dataset_rele(snap, FTAG); } } if (drc->drc_keynvl != NULL) { dsl_crypto_recv_raw_key_sync(drc->drc_ds, drc->drc_keynvl, tx); nvlist_free(drc->drc_keynvl); drc->drc_keynvl = NULL; } VERIFY3P(drc->drc_ds->ds_prev, ==, origin_head->ds_prev); dsl_dataset_clone_swap_sync_impl(drc->drc_ds, origin_head, tx); /* * The objset was evicted by dsl_dataset_clone_swap_sync_impl, * so drc_os is no longer valid. */ drc->drc_os = NULL; dsl_dataset_snapshot_sync_impl(origin_head, drc->drc_tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; dsl_dataset_phys(origin_head->ds_prev)->ds_guid = drc->drc_drrb->drr_toguid; dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(origin_head->ds_dbuf, tx); dsl_dataset_phys(origin_head)->ds_flags &= ~DS_FLAG_INCONSISTENT; newsnapobj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; dsl_dataset_rele(origin_head, FTAG); dsl_destroy_head_sync_impl(drc->drc_ds, tx); if (drc->drc_owner != NULL) VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); } else { dsl_dataset_t *ds = drc->drc_ds; dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); dsl_dataset_phys(ds->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; dsl_dataset_phys(ds->ds_prev)->ds_guid = drc->drc_drrb->drr_toguid; dsl_dataset_phys(ds->ds_prev)->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; if (dsl_dataset_has_resume_receive_state(ds)) { (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_FROMGUID, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OBJECT, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OFFSET, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_BYTES, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TONAME, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, tx); } newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; } /* * If this is a raw receive, the crypt_keydata nvlist will include * a to_ivset_guid for us to set on the new snapshot. This value * will override the value generated by the snapshot code. However, * this value may not be present, because older implementations of * the raw send code did not include this value, and we are still * allowed to receive them if the zfs_disable_ivset_guid_check * tunable is set, in which case we will leave the newly-generated * value. */ if (!drc->drc_heal && drc->drc_raw && drc->drc_ivset_guid != 0) { dmu_object_zapify(dp->dp_meta_objset, newsnapobj, DMU_OT_DSL_DATASET, tx); VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj, DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1, &drc->drc_ivset_guid, tx)); } /* * Release the hold from dmu_recv_begin. This must be done before * we return to open context, so that when we free the dataset's dnode * we can evict its bonus buffer. Since the dataset may be destroyed * at this point (and therefore won't have a valid pointer to the spa) * we release the key mapping manually here while we do have a valid * pointer, if it exists. */ if (!drc->drc_raw && encrypted) { (void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa, drc->drc_ds->ds_object, drc->drc_ds); } dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag); drc->drc_ds = NULL; } static int dmu_recv_end_modified_blocks = 3; static int dmu_recv_existing_end(dmu_recv_cookie_t *drc) { #ifdef _KERNEL /* * We will be destroying the ds; make sure its origin is unmounted if * necessary. */ char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(drc->drc_ds, name); zfs_destroy_unmount_origin(name); #endif return (dsl_sync_task(drc->drc_tofs, dmu_recv_end_check, dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); } static int dmu_recv_new_end(dmu_recv_cookie_t *drc) { return (dsl_sync_task(drc->drc_tofs, dmu_recv_end_check, dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); } int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) { int error; drc->drc_owner = owner; if (drc->drc_newfs) error = dmu_recv_new_end(drc); else error = dmu_recv_existing_end(drc); if (error != 0) { dmu_recv_cleanup_ds(drc); nvlist_free(drc->drc_keynvl); } else if (!drc->drc_heal) { if (drc->drc_newfs) { zvol_create_minor(drc->drc_tofs); } char *snapname = kmem_asprintf("%s@%s", drc->drc_tofs, drc->drc_tosnap); zvol_create_minor(snapname); kmem_strfree(snapname); } return (error); } /* * Return TRUE if this objset is currently being received into. */ boolean_t dmu_objset_is_receiving(objset_t *os) { return (os->os_dsl_dataset != NULL && os->os_dsl_dataset->ds_owner == dmu_recv_tag); } ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, UINT, ZMOD_RW, "Maximum receive queue length"); ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, UINT, ZMOD_RW, "Receive queue fill fraction"); ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, UINT, ZMOD_RW, "Maximum amount of writes to batch into one transaction"); ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, best_effort_corrective, INT, ZMOD_RW, "Ignore errors during corrective receive"); -/* END CSTYLED */ diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 15cc2885e805..aa0434f3c722 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -1,823 +1,822 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ static int32_t send_holes_without_birth_time = 1; static uint_t zfs_traverse_indirect_prefetch_limit = 32; typedef struct prefetch_data { kmutex_t pd_mtx; kcondvar_t pd_cv; int32_t pd_bytes_fetched; int pd_flags; boolean_t pd_cancel; boolean_t pd_exited; zbookmark_phys_t pd_resume; } prefetch_data_t; typedef struct traverse_data { spa_t *td_spa; uint64_t td_objset; blkptr_t *td_rootbp; uint64_t td_min_txg; zbookmark_phys_t *td_resume; int td_flags; prefetch_data_t *td_pfd; boolean_t td_paused; uint64_t td_hole_birth_enabled_txg; blkptr_cb_t *td_func; void *td_arg; boolean_t td_realloc_possible; } traverse_data_t; static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp, uint64_t objset, uint64_t object); static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, uint64_t objset, uint64_t object); static int traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { traverse_data_t *td = arg; zbookmark_phys_t zb; if (BP_IS_HOLE(bp)) return (0); if (claim_txg == 0 && BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa)) return (-1); SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); return (0); } static int traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, uint64_t claim_txg) { traverse_data_t *td = arg; if (lrc->lrc_txtype == TX_WRITE) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; if (BP_IS_HOLE(bp)) return (0); if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg) return (0); ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); } return (0); } static void traverse_zil(traverse_data_t *td, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; /* * We only want to visit blocks that have been claimed but not yet * replayed; plus blocks that are already stable in read-only mode. */ if (claim_txg == 0 && spa_writeable(td->td_spa)) return; zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT)); zil_free(zilog); } typedef enum resume_skip { RESUME_SKIP_ALL, RESUME_SKIP_NONE, RESUME_SKIP_CHILDREN } resume_skip_t; /* * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and * the block indicated by zb does not need to be visited at all. Returns * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the * resume point. This indicates that this block should be visited but not its * children (since they must have been visited in a previous traversal). * Otherwise returns RESUME_SKIP_NONE. */ static resume_skip_t resume_skip_check(const traverse_data_t *td, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) { if (td->td_resume != NULL) { /* * If we already visited this bp & everything below, * don't bother doing it again. */ if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) return (RESUME_SKIP_ALL); if (memcmp(zb, td->td_resume, sizeof (*zb)) == 0) { if (td->td_flags & TRAVERSE_POST) return (RESUME_SKIP_CHILDREN); } } return (RESUME_SKIP_NONE); } /* * Returns B_TRUE, if prefetch read is issued, otherwise B_FALSE. */ static boolean_t traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) return (B_FALSE); /* * If this bp is before the resume point, it may have already been * freed. */ if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE) return (B_FALSE); if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) return (B_FALSE); if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) return (B_FALSE); ASSERT(!BP_IS_REDACTED(bp)); if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) zio_flags |= ZIO_FLAG_RAW; (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); return (B_TRUE); } static boolean_t prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) { ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA); if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG || BP_IS_REDACTED(bp)) return (B_FALSE); return (B_TRUE); } static int traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { int err = 0; arc_buf_t *buf = NULL; prefetch_data_t *pd = td->td_pfd; switch (resume_skip_check(td, dnp, zb)) { case RESUME_SKIP_ALL: return (0); case RESUME_SKIP_CHILDREN: goto post; case RESUME_SKIP_NONE: break; default: ASSERT(0); } if (BP_GET_LOGICAL_BIRTH(bp) == 0) { /* * Since this block has a birth time of 0 it must be one of * two things: a hole created before the * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole * which has always been a hole in an object. * * If a file is written sparsely, then the unwritten parts of * the file were "always holes" -- that is, they have been * holes since this object was allocated. However, we (and * our callers) can not necessarily tell when an object was * allocated. Therefore, if it's possible that this object * was freed and then its object number reused, we need to * visit all the holes with birth==0. * * If it isn't possible that the object number was reused, * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote * all the blocks we will visit as part of this traversal, * then this hole must have always existed, so we can skip * it. We visit blocks born after (exclusive) td_min_txg. * * Note that the meta-dnode cannot be reallocated. */ if (!send_holes_without_birth_time && (!td->td_realloc_possible || zb->zb_object == DMU_META_DNODE_OBJECT) && td->td_hole_birth_enabled_txg <= td->td_min_txg) return (0); } else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) { return (0); } if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) { uint64_t size = BP_GET_LSIZE(bp); mutex_enter(&pd->pd_mtx); ASSERT(pd->pd_bytes_fetched >= 0); while (pd->pd_bytes_fetched < size && !pd->pd_exited) cv_wait_sig(&pd->pd_cv, &pd->pd_mtx); pd->pd_bytes_fetched -= size; cv_broadcast(&pd->pd_cv); mutex_exit(&pd->pd_mtx); } if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (err != 0) goto post; return (0); } if (td->td_flags & TRAVERSE_PRE) { err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); if (err != 0) goto post; } if (BP_GET_LEVEL(bp) > 0) { uint32_t flags = ARC_FLAG_WAIT; int32_t i, ptidx, pidx; uint32_t prefetchlimit; int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; zbookmark_phys_t *czb; ASSERT(!BP_IS_PROTECTED(bp)); err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP); /* * When performing a traversal it is beneficial to * asynchronously read-ahead the upcoming indirect * blocks since they will be needed shortly. However, * since a 128k indirect (non-L0) block may contain up * to 1024 128-byte block pointers, its preferable to not * prefetch them all at once. Issuing a large number of * async reads may effect performance, and the earlier * the indirect blocks are prefetched the less likely * they are to still be resident in the ARC when needed. * Therefore, prefetching indirect blocks is limited to * zfs_traverse_indirect_prefetch_limit=32 blocks by * default. * * pidx: Index for which next prefetch to be issued. * ptidx: Index at which next prefetch to be triggered. */ ptidx = 0; pidx = 1; prefetchlimit = zfs_traverse_indirect_prefetch_limit; for (i = 0; i < epb; i++) { if (prefetchlimit && i == ptidx) { ASSERT3S(ptidx, <=, pidx); for (uint32_t prefetched = 0; pidx < epb && prefetched < prefetchlimit; pidx++) { SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + pidx); if (traverse_prefetch_metadata(td, dnp, &((blkptr_t *)buf->b_data)[pidx], czb) == B_TRUE) { prefetched++; if (prefetched == MAX(prefetchlimit / 2, 1)) ptidx = pidx; } } } /* recursively visitbp() blocks below this */ SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); err = traverse_visitbp(td, dnp, &((blkptr_t *)buf->b_data)[i], czb); if (err != 0) break; } kmem_free(czb, sizeof (zbookmark_phys_t)); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { uint32_t flags = ARC_FLAG_WAIT; uint32_t zio_flags = ZIO_FLAG_CANFAIL; int32_t i; int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; dnode_phys_t *child_dnp; /* * dnode blocks might have their bonus buffers encrypted, so * we must be careful to honor TRAVERSE_NO_DECRYPT */ if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) zio_flags |= ZIO_FLAG_RAW; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err != 0) goto post; child_dnp = buf->b_data; for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { prefetch_dnode_metadata(td, &child_dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); } /* recursively visitbp() blocks below this */ for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { err = traverse_dnode(td, bp, &child_dnp[i], zb->zb_objset, zb->zb_blkid * epb + i); if (err != 0) break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { uint32_t zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) zio_flags |= ZIO_FLAG_RAW; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err != 0) goto post; osp = buf->b_data; prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); /* * See the block comment above for the goal of this variable. * If the maxblkid of the meta-dnode is 0, then we know that * we've never had more than DNODES_PER_BLOCK objects in the * dataset, which means we can't have reused any object ids. */ if (osp->os_meta_dnode.dn_maxblkid == 0) td->td_realloc_possible = B_FALSE; if (OBJSET_BUF_HAS_USERUSED(buf)) { if (OBJSET_BUF_HAS_PROJECTUSED(buf)) prefetch_dnode_metadata(td, &osp->os_projectused_dnode, zb->zb_objset, DMU_PROJECTUSED_OBJECT); prefetch_dnode_metadata(td, &osp->os_groupused_dnode, zb->zb_objset, DMU_GROUPUSED_OBJECT); prefetch_dnode_metadata(td, &osp->os_userused_dnode, zb->zb_objset, DMU_USERUSED_OBJECT); } err = traverse_dnode(td, bp, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); if (err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) { if (OBJSET_BUF_HAS_PROJECTUSED(buf)) err = traverse_dnode(td, bp, &osp->os_projectused_dnode, zb->zb_objset, DMU_PROJECTUSED_OBJECT); if (err == 0) err = traverse_dnode(td, bp, &osp->os_groupused_dnode, zb->zb_objset, DMU_GROUPUSED_OBJECT); if (err == 0) err = traverse_dnode(td, bp, &osp->os_userused_dnode, zb->zb_objset, DMU_USERUSED_OBJECT); } } if (buf) arc_buf_destroy(buf, &buf); post: if (err == 0 && (td->td_flags & TRAVERSE_POST)) err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) { /* * Ignore this disk error as requested by the HARD flag, * and continue traversal. */ err = 0; } /* * If we are stopping here, set td_resume. */ if (td->td_resume != NULL && err != 0 && !td->td_paused) { td->td_resume->zb_objset = zb->zb_objset; td->td_resume->zb_object = zb->zb_object; td->td_resume->zb_level = 0; /* * If we have stopped on an indirect block (e.g. due to * i/o error), we have not visited anything below it. * Set the bookmark to the first level-0 block that we need * to visit. This way, the resuming code does not need to * deal with resuming from indirect blocks. * * Note, if zb_level <= 0, dnp may be NULL, so we don't want * to dereference it. */ td->td_resume->zb_blkid = zb->zb_blkid; if (zb->zb_level > 0) { td->td_resume->zb_blkid <<= zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT); } td->td_paused = B_TRUE; } return (err); } static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int j; zbookmark_phys_t czb; for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); traverse_prefetch_metadata(td, dnp, &dnp->dn_blkptr[j], &czb); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); traverse_prefetch_metadata(td, dnp, DN_SPILL_BLKPTR(dnp), &czb); } } static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int j, err = 0; zbookmark_phys_t czb; if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL && object < td->td_resume->zb_object) return (0); if (td->td_flags & TRAVERSE_PRE) { SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, ZB_DNODE_BLKID); err = td->td_func(td->td_spa, NULL, bp, &czb, dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); if (err != 0) return (err); } for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); if (err != 0) break; } if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb); } if (err == 0 && (td->td_flags & TRAVERSE_POST)) { SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, ZB_DNODE_BLKID); err = td->td_func(td->td_spa, NULL, bp, &czb, dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); if (err != 0) return (err); } return (err); } static int traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { (void) zilog, (void) dnp; prefetch_data_t *pfd = arg; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH; ASSERT(pfd->pd_bytes_fetched >= 0); if (zb->zb_level == ZB_DNODE_LEVEL) return (0); if (pfd->pd_cancel) return (SET_ERROR(EINTR)); if (!prefetch_needed(pfd, bp)) return (0); mutex_enter(&pfd->pd_mtx); while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max) cv_wait_sig(&pfd->pd_cv, &pfd->pd_mtx); pfd->pd_bytes_fetched += BP_GET_LSIZE(bp); cv_broadcast(&pfd->pd_cv); mutex_exit(&pfd->pd_mtx); if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) zio_flags |= ZIO_FLAG_RAW; (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, zio_flags, &aflags, zb); return (0); } static void traverse_prefetch_thread(void *arg) { traverse_data_t *td_main = arg; traverse_data_t td = *td_main; zbookmark_phys_t czb; fstrans_cookie_t cookie = spl_fstrans_mark(); td.td_func = traverse_prefetcher; td.td_arg = td_main->td_pfd; td.td_pfd = NULL; td.td_resume = &td_main->td_pfd->pd_resume; SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb); mutex_enter(&td_main->td_pfd->pd_mtx); td_main->td_pfd->pd_exited = B_TRUE; cv_broadcast(&td_main->td_pfd->pd_cv); mutex_exit(&td_main->td_pfd->pd_mtx); spl_fstrans_unmark(cookie); } /* * NB: dataset must not be changing on-disk (eg, is a snapshot or we are * in syncing context). */ static int traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg) { traverse_data_t *td; prefetch_data_t *pd; zbookmark_phys_t *czb; int err; ASSERT(ds == NULL || objset == ds->ds_object); ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP); pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP); czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP); td->td_spa = spa; td->td_objset = objset; td->td_rootbp = rootbp; td->td_min_txg = txg_start; td->td_resume = resume; td->td_func = func; td->td_arg = arg; td->td_pfd = pd; td->td_flags = flags; td->td_paused = B_FALSE; td->td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE); if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { VERIFY(spa_feature_enabled_txg(spa, SPA_FEATURE_HOLE_BIRTH, &td->td_hole_birth_enabled_txg)); } else { td->td_hole_birth_enabled_txg = UINT64_MAX; } pd->pd_flags = flags; if (resume != NULL) pd->pd_resume = *resume; mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL); SET_BOOKMARK(czb, td->td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); /* See comment on ZIL traversal in dsl_scan_visitds. */ if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; uint32_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; ASSERT(!BP_IS_REDACTED(rootbp)); if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(rootbp)) zio_flags |= ZIO_FLAG_RAW; err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, czb); if (err != 0) { /* * If both TRAVERSE_HARD and TRAVERSE_PRE are set, * continue to visitbp so that td_func can be called * in pre stage, and err will reset to zero. */ if (!(td->td_flags & TRAVERSE_HARD) || !(td->td_flags & TRAVERSE_PRE)) goto out; } else { osp = buf->b_data; traverse_zil(td, &osp->os_zil_header); arc_buf_destroy(buf, &buf); } } if (!(flags & TRAVERSE_PREFETCH_DATA) || taskq_dispatch(spa->spa_prefetch_taskq, traverse_prefetch_thread, td, TQ_NOQUEUE) == TASKQID_INVALID) pd->pd_exited = B_TRUE; err = traverse_visitbp(td, NULL, rootbp, czb); mutex_enter(&pd->pd_mtx); pd->pd_cancel = B_TRUE; cv_broadcast(&pd->pd_cv); while (!pd->pd_exited) cv_wait_sig(&pd->pd_cv, &pd->pd_mtx); mutex_exit(&pd->pd_mtx); out: mutex_destroy(&pd->pd_mtx); cv_destroy(&pd->pd_cv); kmem_free(czb, sizeof (zbookmark_phys_t)); kmem_free(pd, sizeof (struct prefetch_data)); kmem_free(td, sizeof (struct traverse_data)); return (err); } /* * NB: dataset must not be changing on-disk (eg, is a snapshot or we are * in syncing context). */ int traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg) { return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg)); } int traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg)); } int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg) { return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, blkptr, txg_start, resume, flags, func, arg)); } /* * NB: pool must not be changing on-disk (eg, from zdb or sync context). */ int traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { int err; dsl_pool_t *dp = spa_get_dsl(spa); objset_t *mos = dp->dp_meta_objset; boolean_t hard = (flags & TRAVERSE_HARD); /* visit the MOS */ err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), txg_start, NULL, flags, func, arg); if (err != 0) return (err); /* visit each dataset */ for (uint64_t obj = 1; err == 0; err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) { dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); if (err != 0) { if (hard) continue; break; } if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) { dsl_dataset_t *ds; uint64_t txg = txg_start; dsl_pool_config_enter(dp, FTAG); err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); dsl_pool_config_exit(dp, FTAG); if (err != 0) { if (hard) continue; break; } if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg) txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; err = traverse_dataset(ds, txg, flags, func, arg); dsl_dataset_rele(ds, FTAG); if (err != 0) break; } } if (err == ESRCH) err = 0; return (err); } EXPORT_SYMBOL(traverse_dataset); EXPORT_SYMBOL(traverse_pool); ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW, "Max number of bytes to prefetch"); ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, UINT, ZMOD_RW, "Traverse prefetch number of blocks pointed by indirect block"); #if defined(_KERNEL) module_param_named(ignore_hole_birth, send_holes_without_birth_time, int, 0644); MODULE_PARM_DESC(ignore_hole_birth, "Alias for send_holes_without_birth_time"); #endif -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW, "Ignore hole_birth txg for zfs send"); diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 8788ba11aea9..71f151b14d9b 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -1,2499 +1,2498 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" /* * This controls if we verify the ZVOL quota or not. * Currently, quotas are not implemented for ZVOLs. * The quota size is the size of the ZVOL. * The size of the volume already implies the ZVOL size quota. * The quota mechanism can introduce a significant performance drop. */ static int zvol_enforce_quotas = B_TRUE; /* * Filesystem and Snapshot Limits * ------------------------------ * * These limits are used to restrict the number of filesystems and/or snapshots * that can be created at a given level in the tree or below. A typical * use-case is with a delegated dataset where the administrator wants to ensure * that a user within the zone is not creating too many additional filesystems * or snapshots, even though they're not exceeding their space quota. * * The filesystem and snapshot counts are stored as extensible properties. This * capability is controlled by a feature flag and must be enabled to be used. * Once enabled, the feature is not active until the first limit is set. At * that point, future operations to create/destroy filesystems or snapshots * will validate and update the counts. * * Because the count properties will not exist before the feature is active, * the counts are updated when a limit is first set on an uninitialized * dsl_dir node in the tree (The filesystem/snapshot count on a node includes * all of the nested filesystems/snapshots. Thus, a new leaf node has a * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and * snapshot count properties on a node indicate uninitialized counts on that * node.) When first setting a limit on an uninitialized node, the code starts * at the filesystem with the new limit and descends into all sub-filesystems * to add the count properties. * * In practice this is lightweight since a limit is typically set when the * filesystem is created and thus has no children. Once valid, changing the * limit value won't require a re-traversal since the counts are already valid. * When recursively fixing the counts, if a node with a limit is encountered * during the descent, the counts are known to be valid and there is no need to * descend into that filesystem's children. The counts on filesystems above the * one with the new limit will still be uninitialized, unless a limit is * eventually set on one of those filesystems. The counts are always recursively * updated when a limit is set on a dataset, unless there is already a limit. * When a new limit value is set on a filesystem with an existing limit, it is * possible for the new limit to be less than the current count at that level * since a user who can change the limit is also allowed to exceed the limit. * * Once the feature is active, then whenever a filesystem or snapshot is * created, the code recurses up the tree, validating the new count against the * limit at each initialized level. In practice, most levels will not have a * limit set. If there is a limit at any initialized level up the tree, the * check must pass or the creation will fail. Likewise, when a filesystem or * snapshot is destroyed, the counts are recursively adjusted all the way up * the initialized nodes in the tree. Renaming a filesystem into different point * in the tree will first validate, then update the counts on each branch up to * the common ancestor. A receive will also validate the counts and then update * them. * * An exception to the above behavior is that the limit is not enforced if the * user has permission to modify the limit. This is primarily so that * recursive snapshots in the global zone always work. We want to prevent a * denial-of-service in which a lower level delegated dataset could max out its * limit and thus block recursive snapshots from being taken in the global zone. * Because of this, it is possible for the snapshot count to be over the limit * and snapshots taken in the global zone could cause a lower level dataset to * hit or exceed its limit. The administrator taking the global zone recursive * snapshot should be aware of this side-effect and behave accordingly. * For consistency, the filesystem limit is also not enforced if the user can * modify the limit. * * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by * dsl_dir_init_fs_ss_count(). */ static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); typedef struct ddulrt_arg { dsl_dir_t *ddulrta_dd; uint64_t ddlrta_txg; } ddulrt_arg_t; static void dsl_dir_evict_async(void *dbu) { dsl_dir_t *dd = dbu; int t; dsl_pool_t *dp __maybe_unused = dd->dd_pool; dd->dd_dbuf = NULL; for (t = 0; t < TXG_SIZE; t++) { ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); ASSERT(dd->dd_tempreserved[t] == 0); ASSERT(dd->dd_space_towrite[t] == 0); } if (dd->dd_parent) dsl_dir_async_rele(dd->dd_parent, dd); spa_async_close(dd->dd_pool->dp_spa, dd); if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); } int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, const void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; dmu_object_info_t doi; int err; ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); if (err != 0) return (err); dd = dmu_buf_get_user(dbuf); dmu_object_info_from_db(dbuf, &doi); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); if (dd == NULL) { dsl_dir_t *winner; dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL); dsl_prop_init(dd); if (dsl_dir_is_zapified(dd)) { err = zap_lookup(dp->dp_meta_objset, ddobj, DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, &dd->dd_crypto_obj); if (err == 0) { /* check for on-disk format errata */ if (dsl_dir_incompatible_encryption_version( dd)) { dp->dp_spa->spa_errata = ZPOOL_ERRATA_ZOL_6845_ENCRYPTION; } } else if (err != ENOENT) { goto errout; } } if (dsl_dir_phys(dd)->dd_parent_obj) { err = dsl_dir_hold_obj(dp, dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, &dd->dd_parent); if (err != 0) goto errout; if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dd->dd_parent)-> dd_child_dir_zapobj, tail, sizeof (foundobj), 1, &foundobj); ASSERT(err || foundobj == ddobj); #endif (void) strlcpy(dd->dd_myname, tail, sizeof (dd->dd_myname)); } else { err = zap_value_search(dp->dp_meta_objset, dsl_dir_phys(dd->dd_parent)-> dd_child_dir_zapobj, ddobj, 0, dd->dd_myname, sizeof (dd->dd_myname)); } if (err != 0) goto errout; } else { (void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa), sizeof (dd->dd_myname)); } if (dsl_dir_is_clone(dd)) { dmu_buf_t *origin_bonus; dsl_dataset_phys_t *origin_phys; /* * We can't open the origin dataset, because * that would require opening this dsl_dir. * Just look at its phys directly instead. */ err = dmu_bonus_hold(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_bonus); if (err != 0) goto errout; origin_phys = origin_bonus->db_data; dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); if (dsl_dir_is_zapified(dd)) { uint64_t obj; err = zap_lookup(dp->dp_meta_objset, dd->dd_object, DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj); if (err == 0) { err = dsl_dir_livelist_open(dd, obj); if (err != 0) goto errout; } else if (err != ENOENT) goto errout; } } if (dsl_dir_is_zapified(dd)) { inode_timespec_t t = {0}; (void) zap_lookup(dp->dp_meta_objset, ddobj, DD_FIELD_SNAPSHOTS_CHANGED, sizeof (uint64_t), sizeof (inode_timespec_t) / sizeof (uint64_t), &t); dd->dd_snap_cmtime = t; } dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, &dd->dd_dbuf); winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); if (winner != NULL) { if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dd = winner; } else { spa_open_ref(dp->dp_spa, dd); } } /* * The dsl_dir_t has both open-to-close and instantiate-to-evict * holds on the spa. We need the open-to-close holds because * otherwise the spa_refcnt wouldn't change when we open a * dir which the spa also has open, so we could incorrectly * think it was OK to unload/export/destroy the pool. We need * the instantiate-to-evict hold because the dsl_dir_t has a * pointer to the dd_pool, which has a pointer to the spa_t. */ spa_open_ref(dp->dp_spa, tag); ASSERT3P(dd->dd_pool, ==, dp); ASSERT3U(dd->dd_object, ==, ddobj); ASSERT3P(dd->dd_dbuf, ==, dbuf); *ddp = dd; return (0); errout: if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); return (err); } void dsl_dir_rele(dsl_dir_t *dd, const void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); dmu_buf_rele(dd->dd_dbuf, tag); } /* * Remove a reference to the given dsl dir that is being asynchronously * released. Async releases occur from a taskq performing eviction of * dsl datasets and dirs. This process is identical to a normal release * with the exception of using the async API for releasing the reference on * the spa. */ void dsl_dir_async_rele(dsl_dir_t *dd, const void *tag) { dprintf_dd(dd, "%s\n", ""); spa_async_close(dd->dd_pool->dp_spa, tag); dmu_buf_rele(dd->dd_dbuf, tag); } /* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ void dsl_dir_name(dsl_dir_t *dd, char *buf) { if (dd->dd_parent) { dsl_dir_name(dd->dd_parent, buf); VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); } else { buf[0] = '\0'; } if (!MUTEX_HELD(&dd->dd_lock)) { /* * recursive mutex so that we can use * dprintf_dd() with dd_lock held */ mutex_enter(&dd->dd_lock); VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); mutex_exit(&dd->dd_lock); } else { VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); } } /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ int dsl_dir_namelen(dsl_dir_t *dd) { int result = 0; if (dd->dd_parent) { /* parent's name + 1 for the "/" */ result = dsl_dir_namelen(dd->dd_parent) + 1; } if (!MUTEX_HELD(&dd->dd_lock)) { /* see dsl_dir_name */ mutex_enter(&dd->dd_lock); result += strlen(dd->dd_myname); mutex_exit(&dd->dd_lock); } else { result += strlen(dd->dd_myname); } return (result); } static int getcomponent(const char *path, char *component, const char **nextp) { char *p; if ((path == NULL) || (path[0] == '\0')) return (SET_ERROR(ENOENT)); /* This would be a good place to reserve some namespace... */ p = strpbrk(path, "/@"); if (p && (p[1] == '/' || p[1] == '@')) { /* two separators in a row */ return (SET_ERROR(EINVAL)); } if (p == NULL || p == path) { /* * if the first thing is an @ or /, it had better be an * @ and it had better not have any more ats or slashes, * and it had better have something after the @. */ if (p != NULL && (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) return (SET_ERROR(EINVAL)); if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN); p = NULL; } else if (p[0] == '/') { if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strlcpy(component, path, p - path + 1); p++; } else if (p[0] == '@') { /* * if the next separator is an @, there better not be * any more slashes. */ if (strchr(path, '/')) return (SET_ERROR(EINVAL)); if (p - path >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); (void) strlcpy(component, path, p - path + 1); } else { panic("invalid p=%p", (void *)p); } *nextp = p; return (0); } /* * Return the dsl_dir_t, and possibly the last component which couldn't * be found in *tail. The name must be in the specified dsl_pool_t. This * thread must hold the dp_config_rwlock for the pool. Returns NULL if the * path is bogus, or if tail==NULL and we couldn't parse the whole name. * (*tail)[0] == '@' means that the last component is a snapshot. */ int dsl_dir_hold(dsl_pool_t *dp, const char *name, const void *tag, dsl_dir_t **ddp, const char **tailp) { char *buf; const char *spaname, *next, *nextnext = NULL; int err; dsl_dir_t *dd; uint64_t ddobj; buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); err = getcomponent(name, buf, &next); if (err != 0) goto error; /* Make sure the name is in the specified pool. */ spaname = spa_name(dp->dp_spa); if (strcmp(buf, spaname) != 0) { err = SET_ERROR(EXDEV); goto error; } ASSERT(dsl_pool_config_held(dp)); err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); if (err != 0) { goto error; } while (next != NULL) { dsl_dir_t *child_dd; err = getcomponent(next, buf, &nextnext); if (err != 0) break; ASSERT(next[0] != '\0'); if (next[0] == '@') break; dprintf("looking up %s in obj%lld\n", buf, (longlong_t)dsl_dir_phys(dd)->dd_child_dir_zapobj); err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); if (err != 0) { if (err == ENOENT) err = 0; break; } err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); if (err != 0) break; dsl_dir_rele(dd, tag); dd = child_dd; next = nextnext; } if (err != 0) { dsl_dir_rele(dd, tag); goto error; } /* * It's an error if there's more than one component left, or * tailp==NULL and there's any component left. */ if (next != NULL && (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { /* bad path name */ dsl_dir_rele(dd, tag); dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); err = SET_ERROR(ENOENT); } if (tailp != NULL) *tailp = next; if (err == 0) *ddp = dd; error: kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN); return (err); } /* * If the counts are already initialized for this filesystem and its * descendants then do nothing, otherwise initialize the counts. * * The counts on this filesystem, and those below, may be uninitialized due to * either the use of a pre-existing pool which did not support the * filesystem/snapshot limit feature, or one in which the feature had not yet * been enabled. * * Recursively descend the filesystem tree and update the filesystem/snapshot * counts on each filesystem below, then update the cumulative count on the * current filesystem. If the filesystem already has a count set on it, * then we know that its counts, and the counts on the filesystems below it, * are already correct, so we don't have to update this filesystem. */ static void dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) { uint64_t my_fs_cnt = 0; uint64_t my_ss_cnt = 0; dsl_pool_t *dp = dd->dd_pool; objset_t *os = dp->dp_meta_objset; zap_cursor_t *zc; zap_attribute_t *za; dsl_dataset_t *ds; ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); ASSERT(dsl_pool_config_held(dp)); ASSERT(dmu_tx_is_syncing(tx)); dsl_dir_zapify(dd, tx); /* * If the filesystem count has already been initialized then we * don't need to recurse down any further. */ if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) return; zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); za = zap_attribute_alloc(); /* Iterate my child dirs */ for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dir_t *chld_dd; uint64_t count; VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, &chld_dd)); /* * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets. */ if (chld_dd->dd_myname[0] == '$') { dsl_dir_rele(chld_dd, FTAG); continue; } my_fs_cnt++; /* count this child */ dsl_dir_init_fs_ss_count(chld_dd, tx); VERIFY0(zap_lookup(os, chld_dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); my_fs_cnt += count; VERIFY0(zap_lookup(os, chld_dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); my_ss_cnt += count; dsl_dir_rele(chld_dd, FTAG); } zap_cursor_fini(zc); /* Count my snapshots (we counted children's snapshots above) */ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { /* Don't count temporary snapshots */ if (za->za_name[0] != '%') my_ss_cnt++; } zap_cursor_fini(zc); dsl_dataset_rele(ds, FTAG); kmem_free(zc, sizeof (zap_cursor_t)); zap_attribute_free(za); /* we're in a sync task, update counts */ dmu_buf_will_dirty(dd->dd_dbuf, tx); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); } static int dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) { char *ddname = (char *)arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; dsl_dir_t *dd; int error; error = dsl_dataset_hold(dp, ddname, FTAG, &ds); if (error != 0) return (error); if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } dd = ds->ds_dir; if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && dsl_dir_is_zapified(dd) && zap_contains(dp->dp_meta_objset, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EALREADY)); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) { char *ddname = (char *)arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; spa_t *spa; VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); spa = dsl_dataset_get_spa(ds); if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { /* * Since the feature was not active and we're now setting a * limit, increment the feature-active counter so that the * feature becomes active for the first time. * * We are already in a sync task so we can update the MOS. */ spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); } /* * Since we are now setting a non-UINT64_MAX limit on the filesystem, * we need to ensure the counts are correct. Descend down the tree from * this point and update all of the counts to be accurate. */ dsl_dir_init_fs_ss_count(ds->ds_dir, tx); dsl_dataset_rele(ds, FTAG); } /* * Make sure the feature is enabled and activate it if necessary. * Since we're setting a limit, ensure the on-disk counts are valid. * This is only called by the ioctl path when setting a limit value. * * We do not need to validate the new limit, since users who can change the * limit are also allowed to exceed the limit. */ int dsl_dir_activate_fs_ss_limit(const char *ddname) { int error; error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, ZFS_SPACE_CHECK_RESERVED); if (error == EALREADY) error = 0; return (error); } /* * Used to determine if the filesystem_limit or snapshot_limit should be * enforced. We allow the limit to be exceeded if the user has permission to * write the property value. We pass in the creds that we got in the open * context since we will always be the GZ root in syncing context. We also have * to handle the case where we are allowed to change the limit on the current * dataset, but there may be another limit in the tree above. * * We can never modify these two properties within a non-global zone. In * addition, the other checks are modeled on zfs_secpolicy_write_perms. We * can't use that function since we are already holding the dp_config_rwlock. * In addition, we already have the dd and dealing with snapshots is simplified * in this code. */ typedef enum { ENFORCE_ALWAYS, ENFORCE_NEVER, ENFORCE_ABOVE } enforce_res_t; static enforce_res_t dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr, proc_t *proc) { enforce_res_t enforce = ENFORCE_ALWAYS; uint64_t obj; dsl_dataset_t *ds; uint64_t zoned; const char *zonedstr; ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); #ifdef _KERNEL if (crgetzoneid(cr) != GLOBAL_ZONEID) return (ENFORCE_ALWAYS); /* * We are checking the saved credentials of the user process, which is * not the current process. Note that we can't use secpolicy_zfs(), * because it only works if the cred is that of the current process (on * Linux). */ if (secpolicy_zfs_proc(cr, proc) == 0) return (ENFORCE_NEVER); #else (void) proc; #endif if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) return (ENFORCE_ALWAYS); ASSERT(dsl_pool_config_held(dd->dd_pool)); if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) return (ENFORCE_ALWAYS); zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED); if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) { /* Only root can access zoned fs's from the GZ */ enforce = ENFORCE_ALWAYS; } else { if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) enforce = ENFORCE_ABOVE; } dsl_dataset_rele(ds, FTAG); return (enforce); } /* * Check if adding additional child filesystem(s) would exceed any filesystem * limits or adding additional snapshot(s) would exceed any snapshot limits. * The prop argument indicates which limit to check. * * Note that all filesystem limits up to the root (or the highest * initialized) filesystem or the given ancestor must be satisfied. */ int dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, dsl_dir_t *ancestor, cred_t *cr, proc_t *proc) { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t limit, count; const char *count_prop; enforce_res_t enforce; int err = 0; ASSERT(dsl_pool_config_held(dd->dd_pool)); ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { /* * We don't enforce the limit for temporary snapshots. This is * indicated by a NULL cred_t argument. */ if (cr == NULL) return (0); count_prop = DD_FIELD_SNAPSHOT_COUNT; } else { count_prop = DD_FIELD_FILESYSTEM_COUNT; } /* * If we're allowed to change the limit, don't enforce the limit * e.g. this can happen if a snapshot is taken by an administrative * user in the global zone (i.e. a recursive snapshot by root). * However, we must handle the case of delegated permissions where we * are allowed to change the limit on the current dataset, but there * is another limit in the tree above. */ enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc); if (enforce == ENFORCE_NEVER) return (0); /* * e.g. if renaming a dataset with no snapshots, count adjustment * is 0. */ if (delta == 0) return (0); /* * If an ancestor has been provided, stop checking the limit once we * hit that dir. We need this during rename so that we don't overcount * the check once we recurse up to the common ancestor. */ if (ancestor == dd) return (0); /* * If we hit an uninitialized node while recursing up the tree, we can * stop since we know there is no limit here (or above). The counts are * not valid on this node and we know we won't touch this node's counts. */ if (!dsl_dir_is_zapified(dd)) return (0); err = zap_lookup(os, dd->dd_object, count_prop, sizeof (count), 1, &count); if (err == ENOENT) return (0); if (err != 0) return (err); err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, B_FALSE); if (err != 0) return (err); /* Is there a limit which we've hit? */ if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) return (SET_ERROR(EDQUOT)); if (dd->dd_parent != NULL) err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, ancestor, cr, proc); return (err); } /* * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all * parents. When a new filesystem/snapshot is created, increment the count on * all parents, and when a filesystem/snapshot is destroyed, decrement the * count. */ void dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, dmu_tx_t *tx) { int err; objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t count; ASSERT(dsl_pool_config_held(dd->dd_pool)); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); /* * We don't do accounting for hidden ($FREE, $MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$' && strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) { return; } /* * e.g. if renaming a dataset with no snapshots, count adjustment is 0 */ if (delta == 0) return; /* * If we hit an uninitialized node while recursing up the tree, we can * stop since we know the counts are not valid on this node and we * know we shouldn't touch this node's counts. An uninitialized count * on the node indicates that either the feature has not yet been * activated or there are no limits on this part of the tree. */ if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, prop, sizeof (count), 1, &count)) == ENOENT) return; VERIFY0(err); count += delta; /* Use a signed verify to make sure we're not neg. */ VERIFY3S(count, >=, 0); VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, tx)); /* Roll up this additional count into our ancestors */ if (dd->dd_parent != NULL) dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); } uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; dsl_dir_phys_t *ddphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); if (pds) { VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, name, sizeof (uint64_t), 1, &ddobj, tx)); } else { /* it's the root dir */ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); } VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); ddphys = dbuf->db_data; ddphys->dd_creation_time = gethrestime_sec(); if (pds) { ddphys->dd_parent_obj = pds->dd_object; /* update the filesystem counts */ dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); } ddphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); ddphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); } boolean_t dsl_dir_is_clone(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_origin_obj && (dd->dd_pool->dp_origin_snap == NULL || dsl_dir_phys(dd)->dd_origin_obj != dd->dd_pool->dp_origin_snap->ds_object)); } uint64_t dsl_dir_get_used(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_bytes); } uint64_t dsl_dir_get_compressed(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_compressed_bytes); } uint64_t dsl_dir_get_quota(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_quota); } uint64_t dsl_dir_get_reservation(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_reserved); } uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd) { /* a fixed point number, 100x the ratio */ return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 : (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / dsl_dir_phys(dd)->dd_compressed_bytes)); } uint64_t dsl_dir_get_logicalused(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_uncompressed_bytes); } uint64_t dsl_dir_get_usedsnap(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); } uint64_t dsl_dir_get_usedds(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); } uint64_t dsl_dir_get_usedrefreserv(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); } uint64_t dsl_dir_get_usedchild(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); } void dsl_dir_get_origin(dsl_dir_t *dd, char *buf) { dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); } int dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count) { if (dsl_dir_is_zapified(dd)) { objset_t *os = dd->dd_pool->dp_meta_objset; return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (*count), 1, count)); } else { return (SET_ERROR(ENOENT)); } } int dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count) { if (dsl_dir_is_zapified(dd)) { objset_t *os = dd->dd_pool->dp_meta_objset; return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (*count), 1, count)); } else { return (SET_ERROR(ENOENT)); } } void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { mutex_enter(&dd->dd_lock); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dsl_dir_get_quota(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, dsl_dir_get_reservation(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, dsl_dir_get_logicalused(dd)); if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, dsl_dir_get_usedsnap(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, dsl_dir_get_usedds(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, dsl_dir_get_usedrefreserv(dd)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, dsl_dir_get_usedchild(dd)); } mutex_exit(&dd->dd_lock); uint64_t count; if (dsl_dir_get_filesystem_count(dd, &count) == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT, count); } if (dsl_dir_get_snapshot_count(dd, &count) == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, count); } if (dsl_dir_is_clone(dd)) { char buf[ZFS_MAX_DATASET_NAME_LEN]; dsl_dir_get_origin(dd, buf); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } } void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; ASSERT(dsl_dir_phys(dd)); if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(dd->dd_dbuf, dd); } } static int64_t parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) { uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved); uint64_t new_accounted = MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); return (new_accounted - old_accounted); } void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); mutex_enter(&dd->dd_lock); ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]); dprintf_dd(dd, "txg=%llu towrite=%lluK\n", (u_longlong_t)tx->tx_txg, (u_longlong_t)dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0; mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ dmu_buf_rele(dd->dd_dbuf, dd); } static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd) { uint64_t space = 0; ASSERT(MUTEX_HELD(&dd->dd_lock)); for (int i = 0; i < TXG_SIZE; i++) space += dd->dd_space_towrite[i & TXG_MASK]; return (space); } /* * How much space would dd have available if ancestor had delta applied * to it? If ondiskonly is set, we're only interested in what's * on-disk, not estimated pending changes. */ uint64_t dsl_dir_space_available(dsl_dir_t *dd, dsl_dir_t *ancestor, int64_t delta, int ondiskonly) { uint64_t parentspace, myspace, quota, used; /* * If there are no restrictions otherwise, assume we have * unlimited space available. */ quota = UINT64_MAX; parentspace = UINT64_MAX; if (dd->dd_parent != NULL) { parentspace = dsl_dir_space_available(dd->dd_parent, ancestor, delta, ondiskonly); } mutex_enter(&dd->dd_lock); if (dsl_dir_phys(dd)->dd_quota != 0) quota = dsl_dir_phys(dd)->dd_quota; used = dsl_dir_phys(dd)->dd_used_bytes; if (!ondiskonly) used += dsl_dir_space_towrite(dd); if (dd->dd_parent == NULL) { uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, ZFS_SPACE_CHECK_NORMAL); quota = MIN(quota, poolsize); } if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { /* * We have some space reserved, in addition to what our * parent gave us. */ parentspace += dsl_dir_phys(dd)->dd_reserved - used; } if (dd == ancestor) { ASSERT(delta <= 0); ASSERT(used >= -delta); used += delta; if (parentspace != UINT64_MAX) parentspace -= delta; } if (used > quota) { /* over quota */ myspace = 0; } else { /* * the lesser of the space provided by our parent and * the space left in our quota */ myspace = MIN(parentspace, quota - used); } mutex_exit(&dd->dd_lock); return (myspace); } struct tempreserve { list_node_t tr_node; dsl_dir_t *tr_ds; uint64_t tr_size; }; static int dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, boolean_t ignorequota, list_t *tr_list, dmu_tx_t *tx, boolean_t first) { uint64_t txg; uint64_t quota; struct tempreserve *tr; int retval; uint64_t ext_quota; uint64_t ref_rsrv; top_of_function: txg = tx->tx_txg; retval = EDQUOT; ref_rsrv = 0; ASSERT3U(txg, !=, 0); ASSERT3S(asize, >, 0); mutex_enter(&dd->dd_lock); /* * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. */ uint64_t est_inflight = dsl_dir_space_towrite(dd); for (int i = 0; i < TXG_SIZE; i++) est_inflight += dd->dd_tempreserved[i]; uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; /* * On the first iteration, fetch the dataset's used-on-disk and * refreservation values. Also, if checkrefquota is set, test if * allocating this space would exceed the dataset's refquota. */ if (first && tx->tx_objset) { int error; dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; error = dsl_dataset_check_quota(ds, !netfree, asize, est_inflight, &used_on_disk, &ref_rsrv); if (error != 0) { mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); return (error); } } /* * If this transaction will result in a net free of space, * we want to let it through. */ if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0 || (tx->tx_objset && dmu_objset_type(tx->tx_objset) == DMU_OST_ZVOL && zvol_enforce_quotas == B_FALSE)) quota = UINT64_MAX; else quota = dsl_dir_phys(dd)->dd_quota; /* * Adjust the quota against the actual pool size at the root * minus any outstanding deferred frees. * To ensure that it's possible to remove files from a full * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, * but still within the pool's allocation slop. In cases where * we're very close to full, this will allow a steady trickle of * removes to get through. */ if (dd->dd_parent == NULL) { uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool, (netfree) ? ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL); if (avail < quota) { quota = avail; retval = SET_ERROR(ENOSPC); } } /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes * or deferred frees (which may free up space for us). */ ext_quota = quota >> 5; if (quota == UINT64_MAX) ext_quota = 0; if (used_on_disk >= quota) { if (retval == ENOSPC && (used_on_disk - quota) < dsl_pool_deferred_space(dd->dd_pool)) { retval = SET_ERROR(ERESTART); } /* Quota exceeded */ mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); return (retval); } else if (used_on_disk + est_inflight >= quota + ext_quota) { dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK\n", (u_longlong_t)used_on_disk>>10, (u_longlong_t)est_inflight>>10, (u_longlong_t)quota>>10, (u_longlong_t)asize>>10); mutex_exit(&dd->dd_lock); DMU_TX_STAT_BUMP(dmu_tx_quota); return (SET_ERROR(ERESTART)); } /* We need to up our estimated delta before dropping dd_lock */ dd->dd_tempreserved[txg & TXG_MASK] += asize; uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, asize - ref_rsrv); mutex_exit(&dd->dd_lock); tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_ds = dd; tr->tr_size = asize; list_insert_tail(tr_list, tr); /* see if it's OK with our parent */ if (dd->dd_parent != NULL && parent_rsrv != 0) { /* * Recurse on our parent without recursion. This has been * observed to be potentially large stack usage even within * the test suite. Largest seen stack was 7632 bytes on linux. */ dd = dd->dd_parent; asize = parent_rsrv; ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); first = B_FALSE; goto top_of_function; } return (0); } /* * Reserve space in this dsl_dir, to be used in this tx's txg. * After the space has been dirtied (and dsl_dir_willuse_space() * has been called), the reservation should be canceled, using * dsl_dir_tempreserve_clear(). */ int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx) { int err; list_t *tr_list; if (asize == 0) { *tr_cookiep = NULL; return (0); } tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); ASSERT3S(asize, >, 0); err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg); if (err == 0) { struct tempreserve *tr; tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_size = lsize; list_insert_tail(tr_list, tr); } else { if (err == EAGAIN) { /* * If arc_memory_throttle() detected that pageout * is running and we are low on memory, we delay new * non-pageout transactions to give pageout an * advantage. * * It is unfortunate to be delaying while the caller's * locks are held. */ txg_delay(dd->dd_pool, tx->tx_txg, MSEC2NSEC(10), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } if (err == 0) { err = dsl_dir_tempreserve_impl(dd, asize, netfree, B_FALSE, tr_list, tx, B_TRUE); } if (err != 0) dsl_dir_tempreserve_clear(tr_list, tx); else *tr_cookiep = tr_list; return (err); } /* * Clear a temporary reservation that we previously made with * dsl_dir_tempreserve_space(). */ void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) { int txgidx = tx->tx_txg & TXG_MASK; list_t *tr_list = tr_cookie; struct tempreserve *tr; ASSERT3U(tx->tx_txg, !=, 0); if (tr_cookie == NULL) return; while ((tr = list_remove_head(tr_list)) != NULL) { if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, tr->tr_size); tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; mutex_exit(&tr->tr_ds->dd_lock); } else { arc_tempreserve_clear(tr->tr_size); } kmem_free(tr, sizeof (struct tempreserve)); } kmem_free(tr_list, sizeof (list_t)); } /* * This should be called from open context when we think we're going to write * or free space, for example when dirtying data. Be conservative; it's okay * to write less space or free more, but we don't want to write more or free * less than the amount specified. * * NOTE: The behavior of this function is identical to the Illumos / FreeBSD * version however it has been adjusted to use an iterative rather than * recursive algorithm to minimize stack usage. */ void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) { int64_t parent_space; uint64_t est_used; do { mutex_enter(&dd->dd_lock); if (space > 0) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); /* Make sure that we clean up dd_space_to* */ dsl_dir_dirty(dd, tx); dd = dd->dd_parent; space = parent_space; } while (space && dd); } /* call from syncing context when we actually write/free space for this dd */ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) { int64_t accounted_delta; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(type < DD_USED_NUM); dmu_buf_will_dirty(dd->dd_dbuf, tx); /* * dsl_dataset_set_refreservation_sync_impl() calls this with * dd_lock held, so that it can atomically update * ds->ds_reserved and the dsl_dir accounting, so that * dsl_dataset_check_quota() can see dataset and dir accounting * consistently. */ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); if (needlock) mutex_enter(&dd->dd_lock); dsl_dir_phys_t *ddp = dsl_dir_phys(dd); accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used); ASSERT(used >= 0 || ddp->dd_used_bytes >= -used); ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || ddp->dd_uncompressed_bytes >= -uncompressed); ddp->dd_used_bytes += used; ddp->dd_uncompressed_bytes += uncompressed; ddp->dd_compressed_bytes += compressed; if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) { ASSERT(used >= 0 || ddp->dd_used_breakdown[type] >= -used); ddp->dd_used_breakdown[type] += used; #ifdef ZFS_DEBUG { dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) u += ddp->dd_used_breakdown[t]; ASSERT3U(u, ==, ddp->dd_used_bytes); } #endif } if (needlock) mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { dsl_dir_diduse_transfer_space(dd->dd_parent, accounted_delta, compressed, uncompressed, used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); } } void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); dsl_dir_phys_t *ddp = dsl_dir_phys(dd); if (delta == 0 || !(ddp->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? ddp->dd_used_breakdown[oldtype] >= delta : ddp->dd_used_breakdown[newtype] >= -delta); ASSERT(ddp->dd_used_bytes >= ABS(delta)); ddp->dd_used_breakdown[oldtype] -= delta; ddp->dd_used_breakdown[newtype] += delta; mutex_exit(&dd->dd_lock); } void dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used, int64_t compressed, int64_t uncompressed, int64_t tonew, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) { int64_t accounted_delta; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); dsl_dir_phys_t *ddp = dsl_dir_phys(dd); accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used); ASSERT(used >= 0 || ddp->dd_used_bytes >= -used); ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || ddp->dd_uncompressed_bytes >= -uncompressed); ddp->dd_used_bytes += used; ddp->dd_uncompressed_bytes += uncompressed; ddp->dd_compressed_bytes += compressed; if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) { ASSERT(tonew - used <= 0 || ddp->dd_used_breakdown[oldtype] >= tonew - used); ASSERT(tonew >= 0 || ddp->dd_used_breakdown[newtype] >= -tonew); ddp->dd_used_breakdown[oldtype] -= tonew - used; ddp->dd_used_breakdown[newtype] += tonew; #ifdef ZFS_DEBUG { dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) u += ddp->dd_used_breakdown[t]; ASSERT3U(u, ==, ddp->dd_used_bytes); } #endif } mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { dsl_dir_diduse_transfer_space(dd->dd_parent, accounted_delta, compressed, uncompressed, used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); } } typedef struct dsl_dir_set_qr_arg { const char *ddsqra_name; zprop_source_t ddsqra_source; uint64_t ddsqra_value; } dsl_dir_set_qr_arg_t; static int dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; uint64_t towrite, newval; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); error = dsl_prop_predict(ds->ds_dir, "quota", ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (newval == 0) { dsl_dataset_rele(ds, FTAG); return (0); } mutex_enter(&ds->ds_dir->dd_lock); /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the * pending changes could under-estimate the amount of space to be * freed up. */ towrite = dsl_dir_space_towrite(ds->ds_dir); if ((dmu_tx_is_syncing(tx) || towrite == 0) && (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { error = SET_ERROR(ENOSPC); } mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); return (error); } static void dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); } dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); dsl_dir_phys(ds->ds_dir)->dd_quota = newval; mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = quota; return (dsl_sync_task(ddname, dsl_dir_set_quota_check, dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static int dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; dsl_dir_t *dd; uint64_t newval, used, avail; int error; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); dd = ds->ds_dir; /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. */ if (!dmu_tx_is_syncing(tx)) { dsl_dataset_rele(ds, FTAG); return (0); } error = dsl_prop_predict(ds->ds_dir, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } mutex_enter(&dd->dd_lock); used = dsl_dir_phys(dd)->dd_used_bytes; mutex_exit(&dd->dd_lock); if (dd->dd_parent) { avail = dsl_dir_space_available(dd->dd_parent, NULL, 0, FALSE); } else { avail = dsl_pool_adjustedsize(dd->dd_pool, ZFS_SPACE_CHECK_NORMAL) - used; } if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { uint64_t delta = MAX(used, newval) - MAX(used, dsl_dir_phys(dd)->dd_reserved); if (delta > avail || (dsl_dir_phys(dd)->dd_quota > 0 && newval > dsl_dir_phys(dd)->dd_quota)) error = SET_ERROR(ENOSPC); } dsl_dataset_rele(ds, FTAG); return (error); } void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) { uint64_t used; int64_t delta; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); used = dsl_dir_phys(dd)->dd_used_bytes; delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); dsl_dir_phys(dd)->dd_reserved = value; if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, delta, 0, 0, tx); } mutex_exit(&dd->dd_lock); } static void dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_RESERVATION), (longlong_t)newval); } dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = reservation; return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static dsl_dir_t * closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) { for (; ds1; ds1 = ds1->dd_parent) { dsl_dir_t *dd; for (dd = ds2; dd; dd = dd->dd_parent) { if (ds1 == dd) return (dd); } } return (NULL); } /* * If delta is applied to dd, how much of that delta would be applied to * ancestor? Syncing context only. */ static int64_t would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) { if (dd == ancestor) return (delta); mutex_enter(&dd->dd_lock); delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); mutex_exit(&dd->dd_lock); return (would_change(dd->dd_parent, delta, ancestor)); } typedef struct dsl_dir_rename_arg { const char *ddra_oldname; const char *ddra_newname; cred_t *ddra_cred; proc_t *ddra_proc; } dsl_dir_rename_arg_t; typedef struct dsl_valid_rename_arg { int char_delta; int nest_delta; } dsl_valid_rename_arg_t; static int dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { (void) dp; dsl_valid_rename_arg_t *dvra = arg; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, namebuf); ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); int namelen = strlen(namebuf) + dvra->char_delta; int depth = get_dataset_depth(namebuf) + dvra->nest_delta; if (namelen >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int dsl_dir_rename_check(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; dsl_valid_rename_arg_t dvra; dsl_dataset_t *parentds; objset_t *parentos; const char *mynewname; int error; /* target dir should exist */ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); if (error != 0) return (error); /* new parent should exist */ error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname); if (error != 0) { dsl_dir_rele(dd, FTAG); return (error); } /* can't rename to different pool */ if (dd->dd_pool != newparent->dd_pool) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EXDEV)); } /* new name should not already exist */ if (mynewname == NULL) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EEXIST)); } /* can't rename below anything but filesystems (eg. no ZVOLs) */ error = dsl_dataset_hold_obj(newparent->dd_pool, dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } error = dmu_objset_from_ds(parentds, &parentos); if (error != 0) { dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } if (dmu_objset_type(parentos) != DMU_OST_ZFS) { dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); } dsl_dataset_rele(parentds, FTAG); ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); dvra.char_delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); dvra.nest_delta = get_dataset_depth(ddra->ddra_newname) - get_dataset_depth(ddra->ddra_oldname); /* if the name length is growing, validate child name lengths */ if (dvra.char_delta > 0 || dvra.nest_delta > 0) { error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } if (dmu_tx_is_syncing(tx)) { if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { /* * Although this is the check function and we don't * normally make on-disk changes in check functions, * we need to do that here. * * Ensure this portion of the tree's counts have been * initialized in case the new parent has limits set. */ dsl_dir_init_fs_ss_count(dd, tx); } } if (newparent != dd->dd_parent) { /* is there enough space? */ uint64_t myspace = MAX(dsl_dir_phys(dd)->dd_used_bytes, dsl_dir_phys(dd)->dd_reserved); objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t fs_cnt = 0; uint64_t ss_cnt = 0; if (dsl_dir_is_zapified(dd)) { int err; err = zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, &fs_cnt); if (err != ENOENT && err != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (err); } /* * have to add 1 for the filesystem itself that we're * moving */ fs_cnt++; err = zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, &ss_cnt); if (err != ENOENT && err != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (err); } } /* check for encryption errors */ error = dsl_dir_rename_crypt_check(dd, newparent); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EACCES)); } /* no rename into our descendant */ if (closest_common_ancestor(dd, newparent) == dd) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dir_transfer_possible(dd->dd_parent, newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred, ddra->ddra_proc); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (0); } static void dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; const char *mynewname; objset_t *mos = dp->dp_meta_objset; VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname)); ASSERT3P(mynewname, !=, NULL); /* Log this before we change the name. */ spa_history_log_internal_dd(dd, "rename", tx, "-> %s", ddra->ddra_newname); if (newparent != dd->dd_parent) { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t fs_cnt = 0; uint64_t ss_cnt = 0; /* * We already made sure the dd counts were initialized in the * check function. */ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { VERIFY0(zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, &fs_cnt)); /* add 1 for the filesystem itself that we're moving */ fs_cnt++; VERIFY0(zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, &ss_cnt)); } dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, DD_FIELD_FILESYSTEM_COUNT, tx); dsl_fs_ss_count_adjust(newparent, fs_cnt, DD_FIELD_FILESYSTEM_COUNT, tx); dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, DD_FIELD_SNAPSHOT_COUNT, tx); dsl_fs_ss_count_adjust(newparent, ss_cnt, DD_FIELD_SNAPSHOT_COUNT, tx); dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, -dsl_dir_phys(dd)->dd_used_bytes, -dsl_dir_phys(dd)->dd_compressed_bytes, -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD, dsl_dir_phys(dd)->dd_used_bytes, dsl_dir_phys(dd)->dd_compressed_bytes, dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); if (dsl_dir_phys(dd)->dd_reserved > dsl_dir_phys(dd)->dd_used_bytes) { uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - dsl_dir_phys(dd)->dd_used_bytes; dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, -unused_rsrv, 0, 0, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, unused_rsrv, 0, 0, tx); } } dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ VERIFY0(zap_remove(mos, dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, dd->dd_myname, tx)); (void) strlcpy(dd->dd_myname, mynewname, sizeof (dd->dd_myname)); dsl_dir_rele(dd->dd_parent, dd); dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; VERIFY0(dsl_dir_hold_obj(dp, newparent->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, dd->dd_myname, 8, 1, &dd->dd_object, tx)); /* TODO: A rename callback to avoid these layering violations. */ zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname); zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname, ddra->ddra_newname, B_TRUE); dsl_prop_notify_all(dd); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); } int dsl_dir_rename(const char *oldname, const char *newname) { dsl_dir_rename_arg_t ddra; ddra.ddra_oldname = oldname; ddra.ddra_newname = newname; ddra.ddra_cred = CRED(); ddra.ddra_proc = curproc; return (dsl_sync_task(oldname, dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3, ZFS_SPACE_CHECK_RESERVED)); } int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr, proc_t *proc) { dsl_dir_t *ancestor; int64_t adelta; uint64_t avail; int err; ancestor = closest_common_ancestor(sdd, tdd); adelta = would_change(sdd, -space, ancestor); avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); if (avail < space) return (SET_ERROR(ENOSPC)); err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, ancestor, cr, proc); if (err != 0) return (err); err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, ancestor, cr, proc); if (err != 0) return (err); return (0); } inode_timespec_t dsl_dir_snap_cmtime(dsl_dir_t *dd) { inode_timespec_t t; mutex_enter(&dd->dd_lock); t = dd->dd_snap_cmtime; mutex_exit(&dd->dd_lock); return (t); } void dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_tx_pool(tx); inode_timespec_t t; gethrestime(&t); mutex_enter(&dd->dd_lock); dd->dd_snap_cmtime = t; if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) { objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t ddobj = dd->dd_object; dsl_dir_zapify(dd, tx); VERIFY0(zap_update(mos, ddobj, DD_FIELD_SNAPSHOTS_CHANGED, sizeof (uint64_t), sizeof (inode_timespec_t) / sizeof (uint64_t), &t, tx)); } mutex_exit(&dd->dd_lock); } void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); } boolean_t dsl_dir_is_zapified(dsl_dir_t *dd) { dmu_object_info_t doi; dmu_object_info_from_db(dd->dd_dbuf, &doi); return (doi.doi_type == DMU_OTN_ZAP_METADATA); } int dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj) { objset_t *mos = dd->dd_pool->dp_meta_objset; ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa, SPA_FEATURE_LIVELIST)); int err = dsl_deadlist_open(&dd->dd_livelist, mos, obj); if (err != 0) return (err); bplist_create(&dd->dd_pending_allocs); bplist_create(&dd->dd_pending_frees); return (0); } void dsl_dir_livelist_close(dsl_dir_t *dd) { dsl_deadlist_close(&dd->dd_livelist); bplist_destroy(&dd->dd_pending_allocs); bplist_destroy(&dd->dd_pending_frees); } void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total) { uint64_t obj; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; livelist_condense_entry_t to_condense = spa->spa_to_condense; if (!dsl_deadlist_is_open(&dd->dd_livelist)) return; /* * If the livelist being removed is set to be condensed, stop the * condense zthr and indicate the cancellation in the spa_to_condense * struct in case the condense no-wait synctask has already started */ zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL && (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) { /* * We use zthr_wait_cycle_done instead of zthr_cancel * because we don't want to destroy the zthr, just have * it skip its current task. */ spa->spa_to_condense.cancelled = B_TRUE; zthr_wait_cycle_done(ll_condense_thread); /* * If we've returned from zthr_wait_cycle_done without * clearing the to_condense data structure it's either * because the no-wait synctask has started (which is * indicated by 'syncing' field of to_condense) and we * can expect it to clear to_condense on its own. * Otherwise, we returned before the zthr ran. The * checkfunc will now fail as cancelled == B_TRUE so we * can safely NULL out ds, allowing a different dir's * livelist to be condensed. * * We can be sure that the to_condense struct will not * be repopulated at this stage because both this * function and dsl_livelist_try_condense execute in * syncing context. */ if ((spa->spa_to_condense.ds != NULL) && !spa->spa_to_condense.syncing) { dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; } } dsl_dir_livelist_close(dd); VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object, DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj)); VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object, DD_FIELD_LIVELIST, tx)); if (total) { dsl_deadlist_free(dp->dp_meta_objset, obj, tx); spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); } } static int dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, boolean_t *in_progress) { int error = 0; ASSERT(MUTEX_HELD(&dd->dd_activity_lock)); switch (activity) { case ZFS_WAIT_DELETEQ: { #ifdef _KERNEL objset_t *os; error = dmu_objset_from_ds(ds, &os); if (error != 0) break; mutex_enter(&os->os_user_ptr_lock); void *user = dmu_objset_get_user(os); mutex_exit(&os->os_user_ptr_lock); if (dmu_objset_type(os) != DMU_OST_ZFS || user == NULL || zfs_get_vfs_flag_unmounted(os)) { *in_progress = B_FALSE; return (0); } uint64_t readonly = B_FALSE; error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly, NULL); if (error != 0) break; if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) { *in_progress = B_FALSE; return (0); } uint64_t count, unlinked_obj; error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &unlinked_obj); if (error != 0) { dsl_dataset_rele(ds, FTAG); break; } error = zap_count(os, unlinked_obj, &count); if (error == 0) *in_progress = (count != 0); break; #else /* * The delete queue is ZPL specific, and libzpool doesn't have * it. It doesn't make sense to wait for it. */ (void) ds; *in_progress = B_FALSE; break; #endif } default: panic("unrecognized value for activity %d", activity); } return (error); } int dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, boolean_t *waited) { int error = 0; boolean_t in_progress; dsl_pool_t *dp = dd->dd_pool; for (;;) { dsl_pool_config_enter(dp, FTAG); error = dsl_dir_activity_in_progress(dd, ds, activity, &in_progress); dsl_pool_config_exit(dp, FTAG); if (error != 0 || !in_progress) break; *waited = B_TRUE; if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) == 0 || dd->dd_activity_cancelled) { error = SET_ERROR(EINTR); break; } } return (error); } void dsl_dir_cancel_waiters(dsl_dir_t *dd) { mutex_enter(&dd->dd_activity_lock); dd->dd_activity_cancelled = B_TRUE; cv_broadcast(&dd->dd_activity_cv); while (dd->dd_activity_waiters > 0) cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock); mutex_exit(&dd->dd_activity_lock); } #if defined(_KERNEL) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); #endif -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , zvol_enforce_quotas, INT, ZMOD_RW, "Enable strict ZVOL quota enforcment"); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 19fa76931b6e..3eba4cb35cc6 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -1,5348 +1,5347 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2016 Gary Mills * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright 2019 Joyent, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* * Grand theory statement on scan queue sorting * * Scanning is implemented by recursively traversing all indirection levels * in an object and reading all blocks referenced from said objects. This * results in us approximately traversing the object from lowest logical * offset to the highest. For best performance, we would want the logical * blocks to be physically contiguous. However, this is frequently not the * case with pools given the allocation patterns of copy-on-write filesystems. * So instead, we put the I/Os into a reordering queue and issue them in a * way that will most benefit physical disks (LBA-order). * * Queue management: * * Ideally, we would want to scan all metadata and queue up all block I/O * prior to starting to issue it, because that allows us to do an optimal * sorting job. This can however consume large amounts of memory. Therefore * we continuously monitor the size of the queues and constrain them to 5% * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this * limit, we clear out a few of the largest extents at the head of the queues * to make room for more scanning. Hopefully, these extents will be fairly * large and contiguous, allowing us to approach sequential I/O throughput * even without a fully sorted tree. * * Metadata scanning takes place in dsl_scan_visit(), which is called from * dsl_scan_sync() every spa_sync(). If we have either fully scanned all * metadata on the pool, or we need to make room in memory because our * queues are too large, dsl_scan_visit() is postponed and * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies * that metadata scanning and queued I/O issuing are mutually exclusive. This * allows us to provide maximum sequential I/O throughput for the majority of * I/O's issued since sequential I/O performance is significantly negatively * impacted if it is interleaved with random I/O. * * Implementation Notes * * One side effect of the queued scanning algorithm is that the scanning code * needs to be notified whenever a block is freed. This is needed to allow * the scanning code to remove these I/Os from the issuing queue. Additionally, * we do not attempt to queue gang blocks to be issued sequentially since this * is very hard to do and would have an extremely limited performance benefit. * Instead, we simply issue gang I/Os as soon as we find them using the legacy * algorithm. * * Backwards compatibility * * This new algorithm is backwards compatible with the legacy on-disk data * structures (and therefore does not require a new feature flag). * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan * will stop scanning metadata (in logical order) and wait for all outstanding * sorted I/O to complete. Once this is done, we write out a checkpoint * bookmark, indicating that we have scanned everything logically before it. * If the pool is imported on a machine without the new sorting algorithm, * the scan simply resumes from the last checkpoint using the legacy algorithm. */ typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_phys_t *); static scan_cb_t dsl_scan_scrub_cb; static int scan_ds_queue_compare(const void *a, const void *b); static int scan_prefetch_queue_compare(const void *a, const void *b); static void scan_ds_queue_clear(dsl_scan_t *scn); static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn); static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg); static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); static uint64_t dsl_scan_count_data_disks(spa_t *spa); static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb); extern uint_t zfs_vdev_async_write_active_min_dirty_percent; static int zfs_scan_blkstats = 0; /* * 'zpool status' uses bytes processed per pass to report throughput and * estimate time remaining. We define a pass to start when the scanning * phase completes for a sequential resilver. Optionally, this value * may be used to reset the pass statistics every N txgs to provide an * estimated completion time based on currently observed performance. */ static uint_t zfs_scan_report_txgs = 0; /* * By default zfs will check to ensure it is not over the hard memory * limit before each txg. If finer-grained control of this is needed * this value can be set to 1 to enable checking before scanning each * block. */ static int zfs_scan_strict_mem_lim = B_FALSE; /* * Maximum number of parallelly executed bytes per leaf vdev. We attempt * to strike a balance here between keeping the vdev queues full of I/Os * at all times and not overflowing the queues to cause long latency, * which would cause long txg sync times. No matter what, we will not * overload the drives with I/O, since that is protected by * zfs_vdev_scrub_max_active. */ static uint64_t zfs_scan_vdev_limit = 16 << 20; static uint_t zfs_scan_issue_strategy = 0; /* don't queue & sort zios, go direct */ static int zfs_scan_legacy = B_FALSE; static uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ /* * fill_weight is non-tunable at runtime, so we copy it at module init from * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would * break queue sorting. */ static uint_t zfs_scan_fill_weight = 3; static uint64_t fill_weight; /* See dsl_scan_should_clear() for details on the memory limit tunables */ static const uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ static const uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ /* fraction of physmem */ static uint_t zfs_scan_mem_lim_fact = 20; /* fraction of mem lim above */ static uint_t zfs_scan_mem_lim_soft_fact = 20; /* minimum milliseconds to scrub per txg */ static uint_t zfs_scrub_min_time_ms = 1000; /* minimum milliseconds to obsolete per txg */ static uint_t zfs_obsolete_min_time_ms = 500; /* minimum milliseconds to free per txg */ static uint_t zfs_free_min_time_ms = 1000; /* minimum milliseconds to resilver per txg */ static uint_t zfs_resilver_min_time_ms = 3000; static uint_t zfs_scan_checkpoint_intval = 7200; /* in seconds */ int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */ static int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ static int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ static const ddt_class_t zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ static uint64_t zfs_async_block_max_blocks = UINT64_MAX; /* max number of dedup blocks to free in a single TXG */ static uint64_t zfs_max_async_dedup_frees = 100000; /* set to disable resilver deferring */ static int zfs_resilver_disable_defer = B_FALSE; /* Don't defer a resilver if the one in progress only got this far: */ static uint_t zfs_resilver_defer_percent = 10; /* * We wait a few txgs after importing a pool to begin scanning so that * the import / mounting code isn't held up by scrub / resilver IO. * Unfortunately, it is a bit difficult to determine exactly how long * this will take since userspace will trigger fs mounts asynchronously * and the kernel will create zvol minors asynchronously. As a result, * the value provided here is a bit arbitrary, but represents a * reasonable estimate of how many txgs it will take to finish fully * importing a pool */ #define SCAN_IMPORT_WAIT_TXGS 5 #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) #define DSL_SCAN_IS_SCRUB(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB) /* * Enable/disable the processing of the free_bpobj object. */ static int zfs_free_bpobj_enabled = 1; /* Error blocks to be scrubbed in one txg. */ static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12; /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ }; /* In core node for the scn->scn_queue. Represents a dataset to be scanned */ typedef struct { uint64_t sds_dsobj; uint64_t sds_txg; avl_node_t sds_node; } scan_ds_t; /* * This controls what conditions are placed on dsl_scan_sync_state(): * SYNC_OPTIONAL) write out scn_phys iff scn_queues_pending == 0 * SYNC_MANDATORY) write out scn_phys always. scn_queues_pending must be 0. * SYNC_CACHED) if scn_queues_pending == 0, write out scn_phys. Otherwise * write out the scn_phys_cached version. * See dsl_scan_sync_state for details. */ typedef enum { SYNC_OPTIONAL, SYNC_MANDATORY, SYNC_CACHED } state_sync_type_t; /* * This struct represents the minimum information needed to reconstruct a * zio for sequential scanning. This is useful because many of these will * accumulate in the sequential IO queues before being issued, so saving * memory matters here. */ typedef struct scan_io { /* fields from blkptr_t */ uint64_t sio_blk_prop; uint64_t sio_phys_birth; uint64_t sio_birth; zio_cksum_t sio_cksum; uint32_t sio_nr_dvas; /* fields from zio_t */ uint32_t sio_flags; zbookmark_phys_t sio_zb; /* members for queue sorting */ union { avl_node_t sio_addr_node; /* link into issuing queue */ list_node_t sio_list_node; /* link for issuing to disk */ } sio_nodes; /* * There may be up to SPA_DVAS_PER_BP DVAs here from the bp, * depending on how many were in the original bp. Only the * first DVA is really used for sorting and issuing purposes. * The other DVAs (if provided) simply exist so that the zio * layer can find additional copies to repair from in the * event of an error. This array must go at the end of the * struct to allow this for the variable number of elements. */ dva_t sio_dva[]; } scan_io_t; #define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x) #define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x) #define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0]) #define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0]) #define SIO_GET_END_OFFSET(sio) \ (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio)) #define SIO_GET_MUSED(sio) \ (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t))) struct dsl_scan_io_queue { dsl_scan_t *q_scn; /* associated dsl_scan_t */ vdev_t *q_vd; /* top-level vdev that this queue represents */ zio_t *q_zio; /* scn_zio_root child for waiting on IO */ /* trees used for sorting I/Os and extents of I/Os */ range_tree_t *q_exts_by_addr; zfs_btree_t q_exts_by_size; avl_tree_t q_sios_by_addr; uint64_t q_sio_memused; uint64_t q_last_ext_addr; /* members for zio rate limiting */ uint64_t q_maxinflight_bytes; uint64_t q_inflight_bytes; kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */ /* per txg statistics */ uint64_t q_total_seg_size_this_txg; uint64_t q_segs_this_txg; uint64_t q_total_zio_size_this_txg; uint64_t q_zios_this_txg; }; /* private data for dsl_scan_prefetch_cb() */ typedef struct scan_prefetch_ctx { zfs_refcount_t spc_refcnt; /* refcount for memory management */ dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */ boolean_t spc_root; /* is this prefetch for an objset? */ uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */ uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */ } scan_prefetch_ctx_t; /* private data for dsl_scan_prefetch() */ typedef struct scan_prefetch_issue_ctx { avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */ scan_prefetch_ctx_t *spic_spc; /* spc for the callback */ blkptr_t spic_bp; /* bp to prefetch */ zbookmark_phys_t spic_zb; /* bookmark to prefetch */ } scan_prefetch_issue_ctx_t; static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue); static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio); static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); static void scan_io_queues_destroy(dsl_scan_t *scn); static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP]; /* sio->sio_nr_dvas must be set so we know which cache to free from */ static void sio_free(scan_io_t *sio) { ASSERT3U(sio->sio_nr_dvas, >, 0); ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio); } /* It is up to the caller to set sio->sio_nr_dvas for freeing */ static scan_io_t * sio_alloc(unsigned short nr_dvas) { ASSERT3U(nr_dvas, >, 0); ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP); return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP)); } void scan_init(void) { /* * This is used in ext_size_compare() to weight segments * based on how sparse they are. This cannot be changed * mid-scan and the tree comparison functions don't currently * have a mechanism for passing additional context to the * compare functions. Thus we store this value globally and * we only allow it to be set at module initialization time */ fill_weight = zfs_scan_fill_weight; for (int i = 0; i < SPA_DVAS_PER_BP; i++) { char name[36]; (void) snprintf(name, sizeof (name), "sio_cache_%d", i); sio_cache[i] = kmem_cache_create(name, (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))), 0, NULL, NULL, NULL, NULL, NULL, 0); } } void scan_fini(void) { for (int i = 0; i < SPA_DVAS_PER_BP; i++) { kmem_cache_destroy(sio_cache[i]); } } static inline boolean_t dsl_scan_is_running(const dsl_scan_t *scn) { return (scn->scn_phys.scn_state == DSS_SCANNING); } boolean_t dsl_scan_resilvering(dsl_pool_t *dp) { return (dsl_scan_is_running(dp->dp_scan) && dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); } static inline void sio2bp(const scan_io_t *sio, blkptr_t *bp) { memset(bp, 0, sizeof (*bp)); bp->blk_prop = sio->sio_blk_prop; BP_SET_PHYSICAL_BIRTH(bp, sio->sio_phys_birth); BP_SET_LOGICAL_BIRTH(bp, sio->sio_birth); bp->blk_fill = 1; /* we always only work with data pointers */ bp->blk_cksum = sio->sio_cksum; ASSERT3U(sio->sio_nr_dvas, >, 0); ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); memcpy(bp->blk_dva, sio->sio_dva, sio->sio_nr_dvas * sizeof (dva_t)); } static inline void bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) { sio->sio_blk_prop = bp->blk_prop; sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp); sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp); sio->sio_cksum = bp->blk_cksum; sio->sio_nr_dvas = BP_GET_NDVAS(bp); /* * Copy the DVAs to the sio. We need all copies of the block so * that the self healing code can use the alternate copies if the * first is corrupted. We want the DVA at index dva_i to be first * in the sio since this is the primary one that we want to issue. */ for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) { sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas]; } } int dsl_scan_init(dsl_pool_t *dp, uint64_t txg) { int err; dsl_scan_t *scn; spa_t *spa = dp->dp_spa; uint64_t f; scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); scn->scn_dp = dp; /* * It's possible that we're resuming a scan after a reboot so * make sure that the scan_async_destroying flag is initialized * appropriately. */ ASSERT(!scn->scn_async_destroying); scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY); /* * Calculate the max number of in-flight bytes for pool-wide * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). * Limits for the issuing phase are done per top-level vdev and * are handled separately. */ scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa))); avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), offsetof(scan_ds_t, sds_node)); mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, sizeof (scan_prefetch_issue_ctx_t), offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_func", sizeof (uint64_t), 1, &f); if (err == 0) { /* * There was an old-style scrub in progress. Restart a * new-style scrub from the beginning. */ scn->scn_restart_txg = txg; zfs_dbgmsg("old-style scrub was in progress for %s; " "restarting new-style scrub in txg %llu", spa->spa_name, (longlong_t)scn->scn_restart_txg); /* * Load the queue obj from the old location so that it * can be freed by dsl_scan_done(). */ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_queue", sizeof (uint64_t), 1, &scn->scn_phys.scn_queue_obj); } else { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys); if (err != 0 && err != ENOENT) return (err); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); /* * Detect if the pool contains the signature of #2094. If it * does properly update the scn->scn_phys structure and notify * the administrator by setting an errata for the pool. */ if (err == EOVERFLOW) { uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1]; VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24); VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==, (23 * sizeof (uint64_t))); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp); if (err == 0) { uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS]; if (overflow & ~DSL_SCAN_FLAGS_MASK || scn->scn_async_destroying) { spa->spa_errata = ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY; return (EOVERFLOW); } memcpy(&scn->scn_phys, zaptmp, SCAN_PHYS_NUMINTS * sizeof (uint64_t)); scn->scn_phys.scn_flags = overflow; /* Required scrub already in progress. */ if (scn->scn_phys.scn_state == DSS_FINISHED || scn->scn_phys.scn_state == DSS_CANCELED) spa->spa_errata = ZPOOL_ERRATA_ZOL_2094_SCRUB; } } if (err == ENOENT) return (0); else if (err) return (err); /* * We might be restarting after a reboot, so jump the issued * counter to how far we've scanned. We know we're consistent * up to here. */ scn->scn_issued_before_pass = scn->scn_phys.scn_examined - scn->scn_phys.scn_skipped; if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { /* * A new-type scrub was in progress on an old * pool, and the pool was accessed by old * software. Restart from the beginning, since * the old software may have changed the pool in * the meantime. */ scn->scn_restart_txg = txg; zfs_dbgmsg("new-style scrub for %s was modified " "by old software; restarting in txg %llu", spa->spa_name, (longlong_t)scn->scn_restart_txg); } else if (dsl_scan_resilvering(dp)) { /* * If a resilver is in progress and there are already * errors, restart it instead of finishing this scan and * then restarting it. If there haven't been any errors * then remember that the incore DTL is valid. */ if (scn->scn_phys.scn_errors > 0) { scn->scn_restart_txg = txg; zfs_dbgmsg("resilver can't excise DTL_MISSING " "when finished; restarting on %s in txg " "%llu", spa->spa_name, (u_longlong_t)scn->scn_restart_txg); } else { /* it's safe to excise DTL when finished */ spa->spa_scrub_started = B_TRUE; } } } memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); /* reload the queue into the in-core state */ if (scn->scn_phys.scn_queue_obj != 0) { zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, dp->dp_meta_objset, scn->scn_phys.scn_queue_obj); zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { scan_ds_queue_insert(scn, zfs_strtonum(za->za_name, NULL), za->za_first_integer); } zap_cursor_fini(&zc); zap_attribute_free(za); } ddt_walk_init(spa, scn->scn_phys.scn_max_txg); spa_scan_stat_init(spa); vdev_scan_stat_init(spa->spa_root_vdev); return (0); } void dsl_scan_fini(dsl_pool_t *dp) { if (dp->dp_scan != NULL) { dsl_scan_t *scn = dp->dp_scan; if (scn->scn_taskq != NULL) taskq_destroy(scn->scn_taskq); scan_ds_queue_clear(scn); avl_destroy(&scn->scn_queue); mutex_destroy(&scn->scn_queue_lock); scan_ds_prefetch_queue_clear(scn); avl_destroy(&scn->scn_prefetch_queue); kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); dp->dp_scan = NULL; } } static boolean_t dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) { return (scn->scn_restart_txg != 0 && scn->scn_restart_txg <= tx->tx_txg); } boolean_t dsl_scan_resilver_scheduled(dsl_pool_t *dp) { return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) || (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER)); } boolean_t dsl_scan_scrubbing(const dsl_pool_t *dp) { dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; return (scn_phys->scn_state == DSS_SCANNING && scn_phys->scn_func == POOL_SCAN_SCRUB); } boolean_t dsl_errorscrubbing(const dsl_pool_t *dp) { dsl_errorscrub_phys_t *errorscrub_phys = &dp->dp_scan->errorscrub_phys; return (errorscrub_phys->dep_state == DSS_ERRORSCRUBBING && errorscrub_phys->dep_func == POOL_SCAN_ERRORSCRUB); } boolean_t dsl_errorscrub_is_paused(const dsl_scan_t *scn) { return (dsl_errorscrubbing(scn->scn_dp) && scn->errorscrub_phys.dep_paused_flags); } boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn) { return (dsl_scan_scrubbing(scn->scn_dp) && scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); } static void dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) { scn->errorscrub_phys.dep_cursor = zap_cursor_serialize(&scn->errorscrub_cursor); VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys, tx)); } static void dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; pool_scan_func_t *funcp = arg; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; ASSERT(!dsl_scan_is_running(scn)); ASSERT(!dsl_errorscrubbing(scn->scn_dp)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); scn->errorscrub_phys.dep_func = *funcp; scn->errorscrub_phys.dep_state = DSS_ERRORSCRUBBING; scn->errorscrub_phys.dep_start_time = gethrestime_sec(); scn->errorscrub_phys.dep_to_examine = spa_get_last_errlog_size(spa); scn->errorscrub_phys.dep_examined = 0; scn->errorscrub_phys.dep_errors = 0; scn->errorscrub_phys.dep_cursor = 0; zap_cursor_init_serialized(&scn->errorscrub_cursor, spa->spa_meta_objset, spa->spa_errlog_last, scn->errorscrub_phys.dep_cursor); vdev_config_dirty(spa->spa_root_vdev); spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START); dsl_errorscrub_sync_state(scn, tx); spa_history_log_internal(spa, "error scrub setup", tx, "func=%u mintxg=%u maxtxg=%llu", *funcp, 0, (u_longlong_t)tx->tx_txg); } static int dsl_errorscrub_setup_check(void *arg, dmu_tx_t *tx) { (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (dsl_scan_is_running(scn) || (dsl_errorscrubbing(scn->scn_dp))) { return (SET_ERROR(EBUSY)); } if (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0) { return (ECANCELED); } return (0); } /* * Writes out a persistent dsl_scan_phys_t record to the pool directory. * Because we can be running in the block sorting algorithm, we do not always * want to write out the record, only when it is "safe" to do so. This safety * condition is achieved by making sure that the sorting queues are empty * (scn_queues_pending == 0). When this condition is not true, the sync'd state * is inconsistent with how much actual scanning progress has been made. The * kind of sync to be performed is specified by the sync_type argument. If the * sync is optional, we only sync if the queues are empty. If the sync is * mandatory, we do a hard ASSERT to make sure that the queues are empty. The * third possible state is a "cached" sync. This is done in response to: * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been * destroyed, so we wouldn't be able to restart scanning from it. * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been * superseded by a newer snapshot. * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been * swapped with its clone. * In all cases, a cached sync simply rewrites the last record we've written, * just slightly modified. For the modifications that are performed to the * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed, * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped. */ static void dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) { int i; spa_t *spa = scn->scn_dp->dp_spa; ASSERT(sync_type != SYNC_MANDATORY || scn->scn_queues_pending == 0); if (scn->scn_queues_pending == 0) { for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; if (q == NULL) continue; mutex_enter(&vd->vdev_scan_io_queue_lock); ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==, NULL); ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); mutex_exit(&vd->vdev_scan_io_queue_lock); } if (scn->scn_phys.scn_queue_obj != 0) scan_ds_queue_sync(scn, tx); VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys, tx)); memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); if (scn->scn_checkpointing) zfs_dbgmsg("finish scan checkpoint for %s", spa->spa_name); scn->scn_checkpointing = B_FALSE; scn->scn_last_checkpoint = ddi_get_lbolt(); } else if (sync_type == SYNC_CACHED) { VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys_cached, tx)); } } int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd) || dsl_errorscrubbing(scn->scn_dp)) return (SET_ERROR(EBUSY)); return (0); } void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { setup_sync_arg_t *setup_sync_arg = (setup_sync_arg_t *)arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dmu_object_type_t ot = 0; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; ASSERT(!dsl_scan_is_running(scn)); ASSERT3U(setup_sync_arg->func, >, POOL_SCAN_NONE); ASSERT3U(setup_sync_arg->func, <, POOL_SCAN_FUNCS); memset(&scn->scn_phys, 0, sizeof (scn->scn_phys)); /* * If we are starting a fresh scrub, we erase the error scrub * information from disk. */ memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); dsl_errorscrub_sync_state(scn, tx); scn->scn_phys.scn_func = setup_sync_arg->func; scn->scn_phys.scn_state = DSS_SCANNING; scn->scn_phys.scn_min_txg = setup_sync_arg->txgstart; if (setup_sync_arg->txgend == 0) { scn->scn_phys.scn_max_txg = tx->tx_txg; } else { scn->scn_phys.scn_max_txg = setup_sync_arg->txgend; } scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ scn->scn_phys.scn_start_time = gethrestime_sec(); scn->scn_phys.scn_errors = 0; scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; scn->scn_issued_before_pass = 0; scn->scn_restart_txg = 0; scn->scn_done_txg = 0; scn->scn_last_checkpoint = 0; scn->scn_checkpointing = B_FALSE; spa_scan_stat_init(spa); vdev_scan_stat_init(spa->spa_root_vdev); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; /* rewrite all disk labels */ vdev_config_dirty(spa->spa_root_vdev); if (vdev_resilver_needed(spa->spa_root_vdev, &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { nvlist_t *aux = fnvlist_alloc(); fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "healing"); spa_event_notify(spa, NULL, aux, ESC_ZFS_RESILVER_START); nvlist_free(aux); } else { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); } spa->spa_scrub_started = B_TRUE; /* * If this is an incremental scrub, limit the DDT scrub phase * to just the auto-ditto class (for correctness); the rest * of the scrub should go faster using top-down pruning. */ if (scn->scn_phys.scn_min_txg > TXG_INITIAL) scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; /* * When starting a resilver clear any existing rebuild state. * This is required to prevent stale rebuild status from * being reported when a rebuild is run, then a resilver and * finally a scrub. In which case only the scrub status * should be reported by 'zpool status'. */ if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; vdev_rebuild_clear_sync( (void *)(uintptr_t)vd->vdev_id, tx); } } } /* back to the generic stuff */ if (zfs_scan_blkstats) { if (dp->dp_blkstats == NULL) { dp->dp_blkstats = vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); } memset(&dp->dp_blkstats->zab_type, 0, sizeof (dp->dp_blkstats->zab_type)); } else { if (dp->dp_blkstats) { vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); dp->dp_blkstats = NULL; } } if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); ddt_walk_init(spa, scn->scn_phys.scn_max_txg); dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_history_log_internal(spa, "scan setup", tx, "func=%u mintxg=%llu maxtxg=%llu", setup_sync_arg->func, (u_longlong_t)scn->scn_phys.scn_min_txg, (u_longlong_t)scn->scn_phys.scn_max_txg); } /* * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub, * error scrub or resilver. Can also be called to resume a paused scrub or * error scrub. */ int dsl_scan(dsl_pool_t *dp, pool_scan_func_t func, uint64_t txgstart, uint64_t txgend) { spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; setup_sync_arg_t setup_sync_arg; if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0)) { return (EINVAL); } /* * Purge all vdev caches and probe all devices. We do this here * rather than in sync context because this requires a writer lock * on the spa_config lock, which we can't do from sync context. The * spa_scrub_reopen flag indicates that vdev_open() should not * attempt to start another scrub. */ spa_vdev_state_enter(spa, SCL_NONE); spa->spa_scrub_reopen = B_TRUE; vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); if (func == POOL_SCAN_RESILVER) { dsl_scan_restart_resilver(spa->spa_dsl_pool, 0); return (0); } if (func == POOL_SCAN_ERRORSCRUB) { if (dsl_errorscrub_is_paused(dp->dp_scan)) { /* * got error scrub start cmd, resume paused error scrub. */ int err = dsl_scrub_set_pause_resume(scn->scn_dp, POOL_SCRUB_NORMAL); if (err == 0) { spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_RESUME); return (ECANCELED); } return (SET_ERROR(err)); } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync, &func, 0, ZFS_SPACE_CHECK_RESERVED)); } if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { /* got scrub start cmd, resume paused scrub */ int err = dsl_scrub_set_pause_resume(scn->scn_dp, POOL_SCRUB_NORMAL); if (err == 0) { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); return (SET_ERROR(ECANCELED)); } return (SET_ERROR(err)); } setup_sync_arg.func = func; setup_sync_arg.txgstart = txgstart; setup_sync_arg.txgend = txgend; return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, dsl_scan_setup_sync, &setup_sync_arg, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } static void dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; if (complete) { spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH); spa_history_log_internal(spa, "error scrub done", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); } else { spa_history_log_internal(spa, "error scrub canceled", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); } scn->errorscrub_phys.dep_state = complete ? DSS_FINISHED : DSS_CANCELED; spa->spa_scrub_active = B_FALSE; spa_errlog_rotate(spa); scn->errorscrub_phys.dep_end_time = gethrestime_sec(); zap_cursor_fini(&scn->errorscrub_cursor); if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) spa->spa_errata = 0; ASSERT(!dsl_errorscrubbing(scn->scn_dp)); } static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { static const char *old_names[] = { "scrub_bookmark", "scrub_ddt_bookmark", "scrub_ddt_class_max", "scrub_queue", "scrub_min_txg", "scrub_max_txg", "scrub_func", "scrub_errors", NULL }; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; int i; /* Remove any remnants of an old-style scrub. */ for (i = 0; old_names[i]; i++) { (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); } if (scn->scn_phys.scn_queue_obj != 0) { VERIFY0(dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = 0; } scan_ds_queue_clear(scn); scan_ds_prefetch_queue_clear(scn); scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; /* * If we were "restarted" from a stopped state, don't bother * with anything else. */ if (!dsl_scan_is_running(scn)) { ASSERT(!scn->scn_is_sorted); return; } if (scn->scn_is_sorted) { scan_io_queues_destroy(scn); scn->scn_is_sorted = B_FALSE; if (scn->scn_taskq != NULL) { taskq_destroy(scn->scn_taskq); scn->scn_taskq = NULL; } } scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED; spa_notify_waiters(spa); if (dsl_scan_restarting(scn, tx)) { spa_history_log_internal(spa, "scan aborted, restarting", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); } else if (!complete) { spa_history_log_internal(spa, "scan cancelled", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); } else { spa_history_log_internal(spa, "scan done", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB(scn)) { VERIFY0(zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LAST_SCRUBBED_TXG, sizeof (uint64_t), 1, &scn->scn_phys.scn_max_txg, tx)); spa->spa_scrubbed_last_txg = scn->scn_phys.scn_max_txg; } } if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { spa->spa_scrub_active = B_FALSE; /* * If the scrub/resilver completed, update all DTLs to * reflect this. Whether it succeeded or not, vacate * all temporary scrub DTLs. * * As the scrub does not currently support traversing * data that have been freed but are part of a checkpoint, * we don't mark the scrub as done in the DTLs as faults * may still exist in those vdevs. */ if (complete && !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE); if (scn->scn_phys.scn_min_txg) { nvlist_t *aux = fnvlist_alloc(); fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "healing"); spa_event_notify(spa, NULL, aux, ESC_ZFS_RESILVER_FINISH); nvlist_free(aux); } else { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_FINISH); } } else { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, 0, B_TRUE, B_FALSE); } spa_errlog_rotate(spa); /* * Don't clear flag until after vdev_dtl_reassess to ensure that * DTL_MISSING will get updated when possible. */ spa->spa_scrub_started = B_FALSE; /* * We may have finished replacing a device. * Let the async thread assess this and handle the detach. */ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); /* * Clear any resilver_deferred flags in the config. * If there are drives that need resilvering, kick * off an asynchronous request to start resilver. * vdev_clear_resilver_deferred() may update the config * before the resilver can restart. In the event of * a crash during this period, the spa loading code * will find the drives that need to be resilvered * and start the resilver then. */ if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) && vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) { spa_history_log_internal(spa, "starting deferred resilver", tx, "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); spa_async_request(spa, SPA_ASYNC_RESILVER); } /* Clear recent error events (i.e. duplicate events tracking) */ if (complete) zfs_ereport_clear(spa, NULL); } scn->scn_phys.scn_end_time = gethrestime_sec(); if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) spa->spa_errata = 0; ASSERT(!dsl_scan_is_running(scn)); } static int dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { /* * can't pause a error scrub when there is no in-progress * error scrub. */ if (!dsl_errorscrubbing(dp)) return (SET_ERROR(ENOENT)); /* can't pause a paused error scrub */ if (dsl_errorscrub_is_paused(scn)) return (SET_ERROR(EBUSY)); } else if (*cmd != POOL_SCRUB_NORMAL) { return (SET_ERROR(ENOTSUP)); } return (0); } static void dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { spa->spa_scan_pass_errorscrub_pause = gethrestime_sec(); scn->errorscrub_phys.dep_paused_flags = B_TRUE; dsl_errorscrub_sync_state(scn, tx); spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED); } else { ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); if (dsl_errorscrub_is_paused(scn)) { /* * We need to keep track of how much time we spend * paused per pass so that we can adjust the error scrub * rate shown in the output of 'zpool status'. */ spa->spa_scan_pass_errorscrub_spent_paused += gethrestime_sec() - spa->spa_scan_pass_errorscrub_pause; spa->spa_scan_pass_errorscrub_pause = 0; scn->errorscrub_phys.dep_paused_flags = B_FALSE; zap_cursor_init_serialized( &scn->errorscrub_cursor, spa->spa_meta_objset, spa->spa_errlog_last, scn->errorscrub_phys.dep_cursor); dsl_errorscrub_sync_state(scn, tx); } } } static int dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx) { (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; /* can't cancel a error scrub when there is no one in-progress */ if (!dsl_errorscrubbing(scn->scn_dp)) return (SET_ERROR(ENOENT)); return (0); } static void dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx) { (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_errorscrub_done(scn, B_FALSE, tx); dsl_errorscrub_sync_state(scn, tx); spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_ABORT); } static int dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (!dsl_scan_is_running(scn)) return (SET_ERROR(ENOENT)); return (0); } static void dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) { (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT); } int dsl_scan_cancel(dsl_pool_t *dp) { if (dsl_errorscrubbing(dp)) { return (dsl_sync_task(spa_name(dp->dp_spa), dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } static int dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { /* can't pause a scrub when there is no in-progress scrub */ if (!dsl_scan_scrubbing(dp)) return (SET_ERROR(ENOENT)); /* can't pause a paused scrub */ if (dsl_scan_is_paused_scrub(scn)) return (SET_ERROR(EBUSY)); } else if (*cmd != POOL_SCRUB_NORMAL) { return (SET_ERROR(ENOTSUP)); } return (0); } static void dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { /* can't pause a scrub when there is no in-progress scrub */ spa->spa_scan_pass_scrub_pause = gethrestime_sec(); scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED; scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED; dsl_scan_sync_state(scn, tx, SYNC_CACHED); spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED); spa_notify_waiters(spa); } else { ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); if (dsl_scan_is_paused_scrub(scn)) { /* * We need to keep track of how much time we spend * paused per pass so that we can adjust the scrub rate * shown in the output of 'zpool status' */ spa->spa_scan_pass_scrub_spent_paused += gethrestime_sec() - spa->spa_scan_pass_scrub_pause; spa->spa_scan_pass_scrub_pause = 0; scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED; dsl_scan_sync_state(scn, tx, SYNC_CACHED); } } } /* * Set scrub pause/resume state if it makes sense to do so */ int dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) { if (dsl_errorscrubbing(dp)) { return (dsl_sync_task(spa_name(dp->dp_spa), dsl_errorscrub_pause_resume_check, dsl_errorscrub_pause_resume_sync, &cmd, 3, ZFS_SPACE_CHECK_RESERVED)); } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, ZFS_SPACE_CHECK_RESERVED)); } /* start a new scan, or restart an existing one. */ void dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg) { if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); txg = dmu_tx_get_txg(tx); dp->dp_scan->scn_restart_txg = txg; dmu_tx_commit(tx); } else { dp->dp_scan->scn_restart_txg = txg; } zfs_dbgmsg("restarting resilver for %s at txg=%llu", dp->dp_spa->spa_name, (longlong_t)txg); } void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) { zio_free(dp->dp_spa, txg, bp); } void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) { ASSERT(dsl_pool_sync_context(dp)); zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); } static int scan_ds_queue_compare(const void *a, const void *b) { const scan_ds_t *sds_a = a, *sds_b = b; if (sds_a->sds_dsobj < sds_b->sds_dsobj) return (-1); if (sds_a->sds_dsobj == sds_b->sds_dsobj) return (0); return (1); } static void scan_ds_queue_clear(dsl_scan_t *scn) { void *cookie = NULL; scan_ds_t *sds; while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) { kmem_free(sds, sizeof (*sds)); } } static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg) { scan_ds_t srch, *sds; srch.sds_dsobj = dsobj; sds = avl_find(&scn->scn_queue, &srch, NULL); if (sds != NULL && txg != NULL) *txg = sds->sds_txg; return (sds != NULL); } static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg) { scan_ds_t *sds; avl_index_t where; sds = kmem_zalloc(sizeof (*sds), KM_SLEEP); sds->sds_dsobj = dsobj; sds->sds_txg = txg; VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL); avl_insert(&scn->scn_queue, sds, where); } static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj) { scan_ds_t srch, *sds; srch.sds_dsobj = dsobj; sds = avl_find(&scn->scn_queue, &srch, NULL); VERIFY(sds != NULL); avl_remove(&scn->scn_queue, sds); kmem_free(sds, sizeof (*sds)); } static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ? DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; ASSERT0(scn->scn_queues_pending); ASSERT(scn->scn_phys.scn_queue_obj != 0); VERIFY0(dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot, DMU_OT_NONE, 0, tx); for (scan_ds_t *sds = avl_first(&scn->scn_queue); sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) { VERIFY0(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, sds->sds_dsobj, sds->sds_txg, tx)); } } /* * Computes the memory limit state that we're currently in. A sorted scan * needs quite a bit of memory to hold the sorting queue, so we need to * reasonably constrain the size so it doesn't impact overall system * performance. We compute two limits: * 1) Hard memory limit: if the amount of memory used by the sorting * queues on a pool gets above this value, we stop the metadata * scanning portion and start issuing the queued up and sorted * I/Os to reduce memory usage. * This limit is calculated as a fraction of physmem (by default 5%). * We constrain the lower bound of the hard limit to an absolute * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain * the upper bound to 5% of the total pool size - no chance we'll * ever need that much memory, but just to keep the value in check. * 2) Soft memory limit: once we hit the hard memory limit, we start * issuing I/O to reduce queue memory usage, but we don't want to * completely empty out the queues, since we might be able to find I/Os * that will fill in the gaps of our non-sequential IOs at some point * in the future. So we stop the issuing of I/Os once the amount of * memory used drops below the soft limit (at which point we stop issuing * I/O and start scanning metadata again). * * This limit is calculated by subtracting a fraction of the hard * limit from the hard limit. By default this fraction is 5%, so * the soft limit is 95% of the hard limit. We cap the size of the * difference between the hard and soft limits at an absolute * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is * sufficient to not cause too frequent switching between the * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's * worth of queues is about 1.2 GiB of on-pool data, so scanning * that should take at least a decent fraction of a second). */ static boolean_t dsl_scan_should_clear(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; uint64_t alloc, mlim_hard, mlim_soft, mused; alloc = metaslab_class_get_alloc(spa_normal_class(spa)); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, zfs_scan_mem_lim_min); mlim_hard = MIN(mlim_hard, alloc / 20); mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact, zfs_scan_mem_lim_soft_max); mused = 0; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *tvd = rvd->vdev_child[i]; dsl_scan_io_queue_t *queue; mutex_enter(&tvd->vdev_scan_io_queue_lock); queue = tvd->vdev_scan_io_queue; if (queue != NULL) { /* * # of extents in exts_by_addr = # in exts_by_size. * B-tree efficiency is ~75%, but can be as low as 50%. */ mused += zfs_btree_numnodes(&queue->q_exts_by_size) * ((sizeof (range_seg_gap_t) + sizeof (uint64_t)) * 3 / 2) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); } dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); if (mused == 0) ASSERT0(scn->scn_queues_pending); /* * If we are above our hard limit, we need to clear out memory. * If we are below our soft limit, we need to accumulate sequential IOs. * Otherwise, we should keep doing whatever we are currently doing. */ if (mused >= mlim_hard) return (B_TRUE); else if (mused < mlim_soft) return (B_FALSE); else return (scn->scn_clearing); } static boolean_t dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) { /* we never skip user/group accounting objects */ if (zb && (int64_t)zb->zb_object < 0) return (B_FALSE); if (scn->scn_suspending) return (B_TRUE); /* we're already suspending */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 and objset blocks. */ if (zb && (zb->zb_level != 0 && zb->zb_level != ZB_ROOT_LEVEL)) return (B_FALSE); /* * We suspend if: * - we have scanned for at least the minimum time (default 1 sec * for scrub, 3 sec for resilver), and either we have sufficient * dirty data that we are starting to write more quickly * (default 30%), someone is explicitly waiting for this txg * to complete, or we have used up all of the time in the txg * timeout (default 5 sec). * or * - the spa is shutting down because this pool is being exported * or the machine is rebooting. * or * - the scan queue has reached its memory use limit */ uint64_t curr_time_ns = gethrtime(); uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; uint64_t sync_time_ns = curr_time_ns - scn->scn_dp->dp_spa->spa_sync_starttime; uint64_t dirty_min_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_min_dirty_percent / 100; uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; if ((NSEC2MSEC(scan_time_ns) > mintime && (scn->scn_dp->dp_dirty_total >= dirty_min_bytes || txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa) || (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) || !ddt_walk_ready(scn->scn_dp->dp_spa)) { if (zb && zb->zb_level == ZB_ROOT_LEVEL) { dprintf("suspending at first available bookmark " "%llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); SET_BOOKMARK(&scn->scn_phys.scn_bookmark, zb->zb_objset, 0, 0, 0); } else if (zb != NULL) { dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); scn->scn_phys.scn_bookmark = *zb; } else { #ifdef ZFS_DEBUG dsl_scan_phys_t *scnp = &scn->scn_phys; dprintf("suspending at at DDT bookmark " "%llx/%llx/%llx/%llx\n", (longlong_t)scnp->scn_ddt_bookmark.ddb_class, (longlong_t)scnp->scn_ddt_bookmark.ddb_type, (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); #endif } scn->scn_suspending = B_TRUE; return (B_TRUE); } return (B_FALSE); } static boolean_t dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) { /* * We suspend if: * - we have scrubbed for at least the minimum time (default 1 sec * for error scrub), someone is explicitly waiting for this txg * to complete, or we have used up all of the time in the txg * timeout (default 5 sec). * or * - the spa is shutting down because this pool is being exported * or the machine is rebooting. */ uint64_t curr_time_ns = gethrtime(); uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time; uint64_t sync_time_ns = curr_time_ns - scn->scn_dp->dp_spa->spa_sync_starttime; int mintime = zfs_scrub_min_time_ms; if ((NSEC2MSEC(error_scrub_time_ns) > mintime && (txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa)) { if (zb) { dprintf("error scrub suspending at bookmark " "%llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); } return (B_TRUE); } return (B_FALSE); } typedef struct zil_scan_arg { dsl_pool_t *zsa_dp; zil_header_t *zsa_zh; } zil_scan_arg_t; static int dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { (void) zilog; zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; zbookmark_phys_t zb; ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* * One block ("stubby") can be allocated a long time ago; we * want to visit that one because it has been allocated * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). */ if (claim_txg == 0 && BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); return (0); } static int dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, uint64_t claim_txg) { (void) zilog; if (lrc->lrc_txtype == TX_WRITE) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; const lr_write_t *lr = (const lr_write_t *)lrc; const blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* * birth can be < claim_txg if this record's txg is * already txg sync'ed (but this log block contains * other records that are not synced) */ if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg) return (0); ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); } return (0); } static void dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zil_scan_arg_t zsa = { dp, zh }; zilog_t *zilog; ASSERT(spa_writeable(dp->dp_spa)); /* * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). */ if (claim_txg == 0) return; zilog = zil_alloc(dp->dp_meta_objset, zh); (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, claim_txg, B_FALSE); zil_free(zilog); } /* * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea * here is to sort the AVL tree by the order each block will be needed. */ static int scan_prefetch_queue_compare(const void *a, const void *b) { const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b; const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc; const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc; return (zbookmark_compare(spc_a->spc_datablkszsec, spc_a->spc_indblkshift, spc_b->spc_datablkszsec, spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb)); } static void scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, const void *tag) { if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) { zfs_refcount_destroy(&spc->spc_refcnt); kmem_free(spc, sizeof (scan_prefetch_ctx_t)); } } static scan_prefetch_ctx_t * scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, const void *tag) { scan_prefetch_ctx_t *spc; spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP); zfs_refcount_create(&spc->spc_refcnt); zfs_refcount_add(&spc->spc_refcnt, tag); spc->spc_scn = scn; if (dnp != NULL) { spc->spc_datablkszsec = dnp->dn_datablkszsec; spc->spc_indblkshift = dnp->dn_indblkshift; spc->spc_root = B_FALSE; } else { spc->spc_datablkszsec = 0; spc->spc_indblkshift = 0; spc->spc_root = B_TRUE; } return (spc); } static void scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, const void *tag) { zfs_refcount_add(&spc->spc_refcnt, tag); } static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; void *cookie = NULL; scan_prefetch_issue_ctx_t *spic = NULL; mutex_enter(&spa->spa_scrub_lock); while ((spic = avl_destroy_nodes(&scn->scn_prefetch_queue, &cookie)) != NULL) { scan_prefetch_ctx_rele(spic->spic_spc, scn); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } mutex_exit(&spa->spa_scrub_lock); } static boolean_t dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc, const zbookmark_phys_t *zb) { zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark; dnode_phys_t tmp_dnp; dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp; if (zb->zb_objset != last_zb->zb_objset) return (B_TRUE); if ((int64_t)zb->zb_object < 0) return (B_FALSE); tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec; tmp_dnp.dn_indblkshift = spc->spc_indblkshift; if (zbookmark_subtree_completed(dnp, zb, last_zb)) return (B_TRUE); return (B_FALSE); } static void dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) { avl_index_t idx; dsl_scan_t *scn = spc->spc_scn; spa_t *spa = scn->scn_dp->dp_spa; scan_prefetch_issue_ctx_t *spic; if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp)) return; if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) return; if (dsl_scan_check_prefetch_resume(spc, zb)) return; scan_prefetch_ctx_add_ref(spc, scn); spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP); spic->spic_spc = spc; spic->spic_bp = *bp; spic->spic_zb = *zb; /* * Add the IO to the queue of blocks to prefetch. This allows us to * prioritize blocks that we will need first for the main traversal * thread. */ mutex_enter(&spa->spa_scrub_lock); if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) { /* this block is already queued for prefetch */ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); scan_prefetch_ctx_rele(spc, scn); mutex_exit(&spa->spa_scrub_lock); return; } avl_insert(&scn->scn_prefetch_queue, spic, idx); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } static void dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int i; zbookmark_phys_t zb; scan_prefetch_ctx_t *spc; if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) return; SET_BOOKMARK(&zb, objset, object, 0, 0); spc = scan_prefetch_ctx_create(scn, dnp, FTAG); for (i = 0; i < dnp->dn_nblkptr; i++) { zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]); zb.zb_blkid = i; dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zb.zb_level = 0; zb.zb_blkid = DMU_SPILL_BLKID; dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb); } scan_prefetch_ctx_rele(spc, FTAG); } static void dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *private) { (void) zio; scan_prefetch_ctx_t *spc = private; dsl_scan_t *scn = spc->spc_scn; spa_t *spa = scn->scn_dp->dp_spa; /* broadcast that the IO has completed for rate limiting purposes */ mutex_enter(&spa->spa_scrub_lock); ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); /* if there was an error or we are done prefetching, just cleanup */ if (buf == NULL || scn->scn_prefetch_stop) goto out; if (BP_GET_LEVEL(bp) > 0) { int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; zbookmark_phys_t czb; for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); dsl_scan_prefetch(spc, cbp, &czb); } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { dnode_phys_t *cdnp; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; for (i = 0, cdnp = buf->b_data; i < epb; i += cdnp->dn_extra_slots + 1, cdnp += cdnp->dn_extra_slots + 1) { dsl_scan_prefetch_dnode(scn, cdnp, zb->zb_objset, zb->zb_blkid * epb + i); } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { objset_phys_t *osp = buf->b_data; dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); if (OBJSET_BUF_HAS_USERUSED(buf)) { if (OBJSET_BUF_HAS_PROJECTUSED(buf)) { dsl_scan_prefetch_dnode(scn, &osp->os_projectused_dnode, zb->zb_objset, DMU_PROJECTUSED_OBJECT); } dsl_scan_prefetch_dnode(scn, &osp->os_groupused_dnode, zb->zb_objset, DMU_GROUPUSED_OBJECT); dsl_scan_prefetch_dnode(scn, &osp->os_userused_dnode, zb->zb_objset, DMU_USERUSED_OBJECT); } } out: if (buf != NULL) arc_buf_destroy(buf, private); scan_prefetch_ctx_rele(spc, scn); } static void dsl_scan_prefetch_thread(void *arg) { dsl_scan_t *scn = arg; spa_t *spa = scn->scn_dp->dp_spa; scan_prefetch_issue_ctx_t *spic; /* loop until we are told to stop */ while (!scn->scn_prefetch_stop) { arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; mutex_enter(&spa->spa_scrub_lock); /* * Wait until we have an IO to issue and are not above our * maximum in flight limit. */ while (!scn->scn_prefetch_stop && (avl_numnodes(&scn->scn_prefetch_queue) == 0 || spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) { cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } /* recheck if we should stop since we waited for the cv */ if (scn->scn_prefetch_stop) { mutex_exit(&spa->spa_scrub_lock); break; } /* remove the prefetch IO from the tree */ spic = avl_first(&scn->scn_prefetch_queue); spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp); avl_remove(&scn->scn_prefetch_queue, spic); mutex_exit(&spa->spa_scrub_lock); if (BP_IS_PROTECTED(&spic->spic_bp)) { ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE || BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET); ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0); zio_flags |= ZIO_FLAG_RAW; } /* We don't need data L1 buffer since we do not prefetch L0. */ blkptr_t *bp = &spic->spic_bp; if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE && BP_GET_TYPE(bp) != DMU_OT_OBJSET) flags |= ARC_FLAG_NO_BUF; /* issue the prefetch asynchronously */ (void) arc_read(scn->scn_zio_root, spa, bp, dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } ASSERT(scn->scn_prefetch_stop); /* free any prefetches we didn't get to complete */ mutex_enter(&spa->spa_scrub_lock); while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) { avl_remove(&scn->scn_prefetch_queue, spic); scan_prefetch_ctx_rele(spic->spic_spc, scn); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } ASSERT0(avl_numnodes(&scn->scn_prefetch_queue)); mutex_exit(&spa->spa_scrub_lock); } static boolean_t dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) { /* * We never skip over user/group accounting objects (obj<0) */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ if (zbookmark_subtree_completed(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* * If we found the block we're trying to resume from, or * we went past it, zero it out to indicate that it's OK * to start checking for suspending again. */ if (zbookmark_subtree_tbd(dnp, zb, &scn->scn_phys.scn_bookmark)) { dprintf("resuming at %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); memset(&scn->scn_phys.scn_bookmark, 0, sizeof (*zb)); } } return (B_FALSE); } static void dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx); inline __attribute__((always_inline)) static void dsl_scan_visitdnode( dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); /* * Return nonzero on i/o error. * Return new buf to write out in *bufp. */ inline __attribute__((always_inline)) static int dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; ASSERT(!BP_IS_REDACTED(bp)); /* * There is an unlikely case of encountering dnodes with contradicting * dn_bonuslen and DNODE_FLAG_SPILL_BLKPTR flag before in files created * or modified before commit 4254acb was merged. As it is not possible * to know which of the two is correct, report an error. */ if (dnp != NULL && dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) { scn->scn_phys.scn_errors++; spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); return (SET_ERROR(EINVAL)); } if (BP_GET_LEVEL(bp) > 0) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); dsl_scan_visitbp(cbp, &czb, dnp, ds, scn, ostype, tx); } arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_buf_t *buf; if (BP_IS_PROTECTED(bp)) { ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); zio_flags |= ZIO_FLAG_RAW; } err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cdnp = buf->b_data; i < epb; i += cdnp->dn_extra_slots + 1, cdnp += cdnp->dn_extra_slots + 1) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } osp = buf->b_data; dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); if (OBJSET_BUF_HAS_USERUSED(buf)) { /* * We also always visit user/group/project accounting * objects, and never skip them, even if we are * suspending. This is necessary so that the * space deltas from this txg get integrated. */ if (OBJSET_BUF_HAS_PROJECTUSED(buf)) dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_projectused_dnode, DMU_PROJECTUSED_OBJECT, tx); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_groupused_dnode, DMU_GROUPUSED_OBJECT, tx); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_userused_dnode, DMU_USERUSED_OBJECT, tx); } arc_buf_destroy(buf, &buf); } else if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { /* * Sanity check the block pointer contents, this is handled * by arc_read() for the cases above. */ scn->scn_phys.scn_errors++; spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); return (SET_ERROR(EINVAL)); } return (0); } inline __attribute__((always_inline)) static void dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx) { int j; for (j = 0; j < dnp->dn_nblkptr; j++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, dnp->dn_nlevels - 1, j); dsl_scan_visitbp(&dnp->dn_blkptr[j], &czb, dnp, ds, scn, ostype, tx); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 0, DMU_SPILL_BLKID); dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp), &czb, dnp, ds, scn, ostype, tx); } } /* * The arguments are in this order because mdb can only print the * first 5; we want them to be useful. */ static void dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; if (dsl_scan_check_suspend(scn, zb)) return; if (dsl_scan_check_resume(scn, dnp, zb)) return; scn->scn_visited_this_txg++; if (BP_IS_HOLE(bp)) { scn->scn_holes_this_txg++; return; } if (BP_IS_REDACTED(bp)) { ASSERT(dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)); return; } /* * Check if this block contradicts any filesystem flags. */ spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS; if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) ASSERT(dsl_dataset_feature_is_active(ds, f)); f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); if (f != SPA_FEATURE_NONE) ASSERT(dsl_dataset_feature_is_active(ds, f)); f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); if (f != SPA_FEATURE_NONE) ASSERT(dsl_dataset_feature_is_active(ds, f)); if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; } if (dsl_scan_recurse(scn, ds, ostype, dnp, bp, zb, tx) != 0) return; /* * If dsl_scan_ddt() has already visited this block, it will have * already done any translations or scrubbing, so don't call the * callback again. */ if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { scn->scn_ddt_contained_this_txg++; return; } /* * If this block is from the future (after cur_max_txg), then we * are doing this on behalf of a deleted snapshot, and we will * revisit the future block on the next pass of this dataset. * Don't scan it now unless we need to because something * under it was modified. */ if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { scn->scn_gt_max_this_txg++; return; } scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); } static void dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { zbookmark_phys_t zb; scan_prefetch_ctx_t *spc; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) { SET_BOOKMARK(&scn->scn_prefetch_bookmark, zb.zb_objset, 0, 0, 0); } else { scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark; } scn->scn_objsets_visited_this_txg++; spc = scan_prefetch_ctx_create(scn, NULL, FTAG); dsl_scan_prefetch(spc, bp, &zb); scan_prefetch_ctx_rele(spc, FTAG); dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); dprintf_ds(ds, "finished scan%s", ""); } static void ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys) { if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) { if (ds->ds_is_snapshot) { /* * Note: * - scn_cur_{min,max}_txg stays the same. * - Setting the flag is not really necessary if * scn_cur_max_txg == scn_max_txg, because there * is nothing after this snapshot that we care * about. However, we set it anyway and then * ignore it when we retraverse it in * dsl_scan_visitds(). */ scn_phys->scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj; zfs_dbgmsg("destroying ds %llu on %s; currently " "traversing; reset zb_objset to %llu", (u_longlong_t)ds->ds_object, ds->ds_dir->dd_pool->dp_spa->spa_name, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN; } else { SET_BOOKMARK(&scn_phys->scn_bookmark, ZB_DESTROYED_OBJSET, 0, 0, 0); zfs_dbgmsg("destroying ds %llu on %s; currently " "traversing; reset bookmark to -1,0,0,0", (u_longlong_t)ds->ds_object, ds->ds_dir->dd_pool->dp_spa->spa_name); } } } /* * Invoked when a dataset is destroyed. We need to make sure that: * * 1) If it is the dataset that was currently being scanned, we write * a new dsl_scan_phys_t and marking the objset reference in it * as destroyed. * 2) Remove it from the work queue, if it was present. * * If the dataset was actually a snapshot, instead of marking the dataset * as destroyed, we instead substitute the next snapshot in line. */ void dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (!dsl_scan_is_running(scn)) return; ds_destroyed_scn_phys(ds, &scn->scn_phys); ds_destroyed_scn_phys(ds, &scn->scn_phys_cached); if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { scan_ds_queue_remove(scn, ds->ds_object); if (ds->ds_is_snapshot) scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg); } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); if (ds->ds_is_snapshot) { /* * We keep the same mintxg; it could be > * ds_creation_txg if the previous snapshot was * deleted too. */ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("destroying ds %llu on %s; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, dp->dp_spa->spa_name, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); } else { zfs_dbgmsg("destroying ds %llu on %s; in queue; " "removing", (u_longlong_t)ds->ds_object, dp->dp_spa->spa_name); } } /* * dsl_scan_sync() should be called after this, and should sync * out our changed state, but just to be safe, do it here. */ dsl_scan_sync_state(scn, tx, SYNC_CACHED); } static void ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark) { if (scn_bookmark->zb_objset == ds->ds_object) { scn_bookmark->zb_objset = dsl_dataset_phys(ds)->ds_prev_snap_obj; zfs_dbgmsg("snapshotting ds %llu on %s; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, ds->ds_dir->dd_pool->dp_spa->spa_name, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } } /* * Called when a dataset is snapshotted. If we were currently traversing * this snapshot, we reset our bookmark to point at the newly created * snapshot. We also modify our work queue to remove the old snapshot and * replace with the new one. */ void dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (!dsl_scan_is_running(scn)) return; ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark); ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark); if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { scan_ds_queue_remove(scn, ds->ds_object); scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg); } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("snapshotting ds %llu on %s; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, dp->dp_spa->spa_name, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } dsl_scan_sync_state(scn, tx, SYNC_CACHED); } static void ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, zbookmark_phys_t *scn_bookmark) { if (scn_bookmark->zb_objset == ds1->ds_object) { scn_bookmark->zb_objset = ds2->ds_object; zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds1->ds_object, ds1->ds_dir->dd_pool->dp_spa->spa_name, (u_longlong_t)ds2->ds_object); } else if (scn_bookmark->zb_objset == ds2->ds_object) { scn_bookmark->zb_objset = ds1->ds_object; zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds2->ds_object, ds2->ds_dir->dd_pool->dp_spa->spa_name, (u_longlong_t)ds1->ds_object); } } /* * Called when an origin dataset and its clone are swapped. If we were * currently traversing the dataset, we need to switch to traversing the * newly promoted clone. */ void dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) { dsl_pool_t *dp = ds1->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg1, mintxg2; boolean_t ds1_queued, ds2_queued; if (!dsl_scan_is_running(scn)) return; ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); /* * Handle the in-memory scan queue. */ ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1); ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2); /* Sanity checking. */ if (ds1_queued) { ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } if (ds2_queued) { ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } if (ds1_queued && ds2_queued) { /* * If both are queued, we don't need to do anything. * The swapping code below would not handle this case correctly, * since we can't insert ds2 if it is already there. That's * because scan_ds_queue_insert() prohibits a duplicate insert * and panics. */ } else if (ds1_queued) { scan_ds_queue_remove(scn, ds1->ds_object); scan_ds_queue_insert(scn, ds2->ds_object, mintxg1); } else if (ds2_queued) { scan_ds_queue_remove(scn, ds2->ds_object); scan_ds_queue_insert(scn, ds1->ds_object, mintxg2); } /* * Handle the on-disk scan queue. * The on-disk state is an out-of-date version of the in-memory state, * so the in-memory and on-disk values for ds1_queued and ds2_queued may * be different. Therefore we need to apply the swap logic to the * on-disk state independently of the in-memory state. */ ds1_queued = zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0; ds2_queued = zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0; /* Sanity checking. */ if (ds1_queued) { ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } if (ds2_queued) { ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } if (ds1_queued && ds2_queued) { /* * If both are queued, we don't need to do anything. * Alternatively, we could check for EEXIST from * zap_add_int_key() and back out to the original state, but * that would be more work than checking for this case upfront. */ } else if (ds1_queued) { VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx)); zfs_dbgmsg("clone_swap ds %llu on %s; in queue; " "replacing with %llu", (u_longlong_t)ds1->ds_object, dp->dp_spa->spa_name, (u_longlong_t)ds2->ds_object); } else if (ds2_queued) { VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx)); zfs_dbgmsg("clone_swap ds %llu on %s; in queue; " "replacing with %llu", (u_longlong_t)ds2->ds_object, dp->dp_spa->spa_name, (u_longlong_t)ds1->ds_object); } dsl_scan_sync_state(scn, tx, SYNC_CACHED); } static int enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { uint64_t originobj = *(uint64_t *)arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj) return (0); err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); dsl_dataset_rele(ds, FTAG); if (err) return (err); ds = prev; } mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (scn->scn_phys.scn_cur_min_txg >= scn->scn_phys.scn_max_txg) { /* * This can happen if this snapshot was created after the * scan started, and we already completed a previous snapshot * that was created after the scan started. This snapshot * only references blocks with: * * birth < our ds_creation_txg * cur_min_txg is no less than ds_creation_txg. * We have already visited these blocks. * or * birth > scn_max_txg * The scan requested not to visit these blocks. * * Subsequent snapshots (and clones) can reference our * blocks, or blocks with even higher birth times. * Therefore we do not need to visit them either, * so we do not add them to the work queue. * * Note that checking for cur_min_txg >= cur_max_txg * is not sufficient, because in that case we may need to * visit subsequent snapshots. This happens when min_txg > 0, * which raises cur_min_txg. In this case we will visit * this dataset but skip all of its blocks, because the * rootbp's birth time is < cur_min_txg. Then we will * add the next snapshots/clones to the work queue. */ char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " "cur_min_txg (%llu) >= max_txg (%llu)", (longlong_t)dsobj, dsname, (longlong_t)scn->scn_phys.scn_cur_min_txg, (longlong_t)scn->scn_phys.scn_max_txg); kmem_free(dsname, MAXNAMELEN); goto out; } /* * Only the ZIL in the head (non-snapshot) is valid. Even though * snapshots can have ZIL block pointers (which may be the same * BP as in the head), they must be ignored. In addition, $ORIGIN * doesn't have a objset (i.e. its ds_bp is a hole) so we don't * need to look for a ZIL in it either. So we traverse the ZIL here, * rather than in scan_recurse(), because the regular snapshot * block-sharing rules don't apply to it. */ if (!dsl_dataset_is_snapshot(ds) && (dp->dp_origin_snap == NULL || ds->ds_dir != dp->dp_origin_snap->ds_dir)) { objset_t *os; if (dmu_objset_from_ds(ds, &os) != 0) { goto out; } dsl_scan_zil(dp, &os->os_zil_header); } /* * Iterate over the bps in this ds. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); rrw_exit(&ds->ds_bp_rwlock, FTAG); char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " "suspending=%u", (longlong_t)dsobj, dsname, (longlong_t)scn->scn_phys.scn_cur_min_txg, (longlong_t)scn->scn_phys.scn_cur_max_txg, (int)scn->scn_suspending); kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); if (scn->scn_suspending) goto out; /* * We've finished this pass over this dataset. */ /* * If we did not completely visit this dataset, do another pass. */ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { zfs_dbgmsg("incomplete pass on %s; visiting again", dp->dp_spa->spa_name); scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; scan_ds_queue_insert(scn, ds->ds_object, scn->scn_phys.scn_cur_max_txg); goto out; } /* * Add descendant datasets to work queue. */ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_next_snap_obj, dsl_dataset_phys(ds)->ds_creation_txg); } if (dsl_dataset_phys(ds)->ds_num_children > 1) { boolean_t usenext = B_FALSE; if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { uint64_t count; /* * A bug in a previous version of the code could * cause upgrade_clones_cb() to not set * ds_next_snap_obj when it should, leading to a * missing entry. Therefore we can only use the * next_clones_obj when its count is correct. */ int err = zap_count(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj, &count); if (err == 0 && count == dsl_dataset_phys(ds)->ds_num_children - 1) usenext = B_TRUE; } if (usenext) { zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj); zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { scan_ds_queue_insert(scn, zfs_strtonum(za->za_name, NULL), dsl_dataset_phys(ds)->ds_creation_txg); } zap_cursor_fini(&zc); zap_attribute_free(za); } else { VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, enqueue_clones_cb, &ds->ds_object, DS_FIND_CHILDREN)); } } out: dsl_dataset_rele(ds, FTAG); } static int enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { (void) arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } /* * If this is a clone, we don't need to worry about it for now. */ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(prev, FTAG); return (0); } dsl_dataset_rele(ds, FTAG); ds = prev; } mutex_enter(&scn->scn_queue_lock); scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); mutex_exit(&scn->scn_queue_lock); dsl_dataset_rele(ds, FTAG); return (0); } void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { (void) tx; const ddt_key_t *ddk = &ddlwe->ddlwe_key; blkptr_t bp; zbookmark_phys_t zb = { 0 }; if (!dsl_scan_is_running(scn)) return; /* * This function is special because it is the only thing * that can add scan_io_t's to the vdev scan queues from * outside dsl_scan_sync(). For the most part this is ok * as long as it is called from within syncing context. * However, dsl_scan_sync() expects that no new sio's will * be added between when all the work for a scan is done * and the next txg when the scan is actually marked as * completed. This check ensures we do not issue new sio's * during this period. */ if (scn->scn_done_txg != 0) return; for (int p = 0; p < DDT_NPHYS(ddt); p++) { ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v); if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg) continue; ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp); scn->scn_visited_this_txg++; scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); } } /* * Scrub/dedup interaction. * * If there are N references to a deduped block, we don't want to scrub it * N times -- ideally, we should scrub it exactly once. * * We leverage the fact that the dde's replication class (ddt_class_t) * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. * * To prevent excess scrubbing, the scrub begins by walking the DDT * to find all blocks with refcnt > 1, and scrubs each of these once. * Since there are two replication classes which contain blocks with * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. * * There would be nothing more to say if a block's refcnt couldn't change * during a scrub, but of course it can so we must account for changes * in a block's replication class. * * Here's an example of what can occur: * * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 * when visited during the top-down scrub phase, it will be scrubbed twice. * This negates our scrub optimization, but is otherwise harmless. * * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 * on each visit during the top-down scrub phase, it will never be scrubbed. * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 * while a scrub is in progress, it scrubs the block right then. */ static void dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) { ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; ddt_lightweight_entry_t ddlwe = {0}; int error; uint64_t n = 0; while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &ddlwe)) == 0) { ddt_t *ddt; if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) break; dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", (longlong_t)ddb->ddb_class, (longlong_t)ddb->ddb_type, (longlong_t)ddb->ddb_checksum, (longlong_t)ddb->ddb_cursor); /* There should be no pending changes to the dedup table */ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ASSERT(avl_first(&ddt->ddt_tree) == NULL); dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx); n++; if (dsl_scan_check_suspend(scn, NULL)) break; } if (error == EAGAIN) { dsl_scan_check_suspend(scn, NULL); error = 0; zfs_dbgmsg("waiting for ddt to become ready for scan " "on %s with class_max = %u; suspending=%u", scn->scn_dp->dp_spa->spa_name, (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); } else zfs_dbgmsg("scanned %llu ddt entries on %s with " "class_max = %u; suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name, (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); ASSERT(error == 0 || error == ENOENT); ASSERT(error != ENOENT || ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); } static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; if (ds->ds_is_snapshot) return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); return (smt); } static void dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) { scan_ds_t *sds; dsl_pool_t *dp = scn->scn_dp; if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_ddt(scn, tx); if (scn->scn_suspending) return; } if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { /* First do the MOS & ORIGIN */ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); if (scn->scn_suspending) return; if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, enqueue_cb, NULL, DS_FIND_CHILDREN)); } else { dsl_scan_visitds(scn, dp->dp_origin_snap->ds_object, tx); } ASSERT(!scn->scn_suspending); } else if (scn->scn_phys.scn_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset; /* * If we were suspended, continue from here. Note if the * ds we were suspended on was deleted, the zb_objset may * be -1, so we will skip this and find a new objset * below. */ dsl_scan_visitds(scn, dsobj, tx); if (scn->scn_suspending) return; } /* * In case we suspended right at the end of the ds, zero the * bookmark so we don't think that we're still trying to resume. */ memset(&scn->scn_phys.scn_bookmark, 0, sizeof (zbookmark_phys_t)); /* * Keep pulling things out of the dataset avl queue. Updates to the * persistent zap-object-as-queue happen only at checkpoints. */ while ((sds = avl_first(&scn->scn_queue)) != NULL) { dsl_dataset_t *ds; uint64_t dsobj = sds->sds_dsobj; uint64_t txg = sds->sds_txg; /* dequeue and free the ds from the queue */ scan_ds_queue_remove(scn, dsobj); sds = NULL; /* set up min / max txg */ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (txg != 0) { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, txg); } else { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, dsl_dataset_phys(ds)->ds_prev_snap_txg); } scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); dsl_dataset_rele(ds, FTAG); dsl_scan_visitds(scn, dsobj, tx); if (scn->scn_suspending) return; } /* No more objsets to fetch, we're done */ scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET; ASSERT0(scn->scn_suspending); } static uint64_t dsl_scan_count_data_disks(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t i, leaves = 0; for (i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; if (vd->vdev_islog || vd->vdev_isspare || vd->vdev_isl2cache) continue; leaves += vdev_get_ndisks(vd) - vdev_get_nparity(vd); } return (leaves); } static void scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp) { int i; uint64_t cur_size = 0; for (i = 0; i < BP_GET_NDVAS(bp); i++) { cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]); } q->q_total_zio_size_this_txg += cur_size; q->q_zios_this_txg++; } static void scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start, uint64_t end) { q->q_total_seg_size_this_txg += end - start; q->q_segs_this_txg++; } static boolean_t scan_io_queue_check_suspend(dsl_scan_t *scn) { /* See comment in dsl_scan_check_suspend() */ uint64_t curr_time_ns = gethrtime(); uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; uint64_t sync_time_ns = curr_time_ns - scn->scn_dp->dp_spa->spa_sync_starttime; uint64_t dirty_min_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_min_dirty_percent / 100; uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; return ((NSEC2MSEC(scan_time_ns) > mintime && (scn->scn_dp->dp_dirty_total >= dirty_min_bytes || txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa)); } /* * Given a list of scan_io_t's in io_list, this issues the I/Os out to * disk. This consumes the io_list and frees the scan_io_t's. This is * called when emptying queues, either when we're up against the memory * limit or when we have finished scanning. Returns B_TRUE if we stopped * processing the list before we finished. Any sios that were not issued * will remain in the io_list. */ static boolean_t scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) { dsl_scan_t *scn = queue->q_scn; scan_io_t *sio; boolean_t suspended = B_FALSE; while ((sio = list_head(io_list)) != NULL) { blkptr_t bp; if (scan_io_queue_check_suspend(scn)) { suspended = B_TRUE; break; } sio2bp(sio, &bp); scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, &sio->sio_zb, queue); (void) list_remove_head(io_list); scan_io_queues_update_zio_stats(queue, &bp); sio_free(sio); } return (suspended); } /* * This function removes sios from an IO queue which reside within a given * range_seg_t and inserts them (in offset order) into a list. Note that * we only ever return a maximum of 32 sios at once. If there are more sios * to process within this segment that did not make it onto the list we * return B_TRUE and otherwise B_FALSE. */ static boolean_t scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) { scan_io_t *srch_sio, *sio, *next_sio; avl_index_t idx; uint_t num_sios = 0; int64_t bytes_issued = 0; ASSERT(rs != NULL); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); srch_sio = sio_alloc(1); srch_sio->sio_nr_dvas = 1; SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr)); /* * The exact start of the extent might not contain any matching zios, * so if that's the case, examine the next one in the tree. */ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); sio_free(srch_sio); if (sio == NULL) sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, queue->q_exts_by_addr) && num_sios <= 32) { ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs, queue->q_exts_by_addr)); ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs, queue->q_exts_by_addr)); next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); if (avl_is_empty(&queue->q_sios_by_addr)) atomic_add_64(&queue->q_scn->scn_queues_pending, -1); queue->q_sio_memused -= SIO_GET_MUSED(sio); bytes_issued += SIO_GET_ASIZE(sio); num_sios++; list_insert_tail(list, sio); sio = next_sio; } /* * We limit the number of sios we process at once to 32 to avoid * biting off more than we can chew. If we didn't take everything * in the segment we update it to reflect the work we were able to * complete. Otherwise, we remove it from the range tree entirely. */ if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, queue->q_exts_by_addr)) { range_tree_adjust_fill(queue->q_exts_by_addr, rs, -bytes_issued); range_tree_resize_segment(queue->q_exts_by_addr, rs, SIO_GET_OFFSET(sio), rs_get_end(rs, queue->q_exts_by_addr) - SIO_GET_OFFSET(sio)); queue->q_last_ext_addr = SIO_GET_OFFSET(sio); return (B_TRUE); } else { uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr); uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr); range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart); queue->q_last_ext_addr = -1; return (B_FALSE); } } /* * This is called from the queue emptying thread and selects the next * extent from which we are to issue I/Os. The behavior of this function * depends on the state of the scan, the current memory consumption and * whether or not we are performing a scan shutdown. * 1) We select extents in an elevator algorithm (LBA-order) if the scan * needs to perform a checkpoint * 2) We select the largest available extent if we are up against the * memory limit. * 3) Otherwise we don't select any extents. */ static range_seg_t * scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) { dsl_scan_t *scn = queue->q_scn; range_tree_t *rt = queue->q_exts_by_addr; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); ASSERT(scn->scn_is_sorted); if (!scn->scn_checkpointing && !scn->scn_clearing) return (NULL); /* * During normal clearing, we want to issue our largest segments * first, keeping IO as sequential as possible, and leaving the * smaller extents for later with the hope that they might eventually * grow to larger sequential segments. However, when the scan is * checkpointing, no new extents will be added to the sorting queue, * so the way we are sorted now is as good as it will ever get. * In this case, we instead switch to issuing extents in LBA order. */ if ((zfs_scan_issue_strategy < 1 && scn->scn_checkpointing) || zfs_scan_issue_strategy == 1) return (range_tree_first(rt)); /* * Try to continue previous extent if it is not completed yet. After * shrink in scan_io_queue_gather() it may no longer be the best, but * otherwise we leave shorter remnant every txg. */ uint64_t start; uint64_t size = 1ULL << rt->rt_shift; range_seg_t *addr_rs; if (queue->q_last_ext_addr != -1) { start = queue->q_last_ext_addr; addr_rs = range_tree_find(rt, start, size); if (addr_rs != NULL) return (addr_rs); } /* * Nothing to continue, so find new best extent. */ uint64_t *v = zfs_btree_first(&queue->q_exts_by_size, NULL); if (v == NULL) return (NULL); queue->q_last_ext_addr = start = *v << rt->rt_shift; /* * We need to get the original entry in the by_addr tree so we can * modify it. */ addr_rs = range_tree_find(rt, start, size); ASSERT3P(addr_rs, !=, NULL); ASSERT3U(rs_get_start(addr_rs, rt), ==, start); ASSERT3U(rs_get_end(addr_rs, rt), >, start); return (addr_rs); } static void scan_io_queues_run_one(void *arg) { dsl_scan_io_queue_t *queue = arg; kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; boolean_t suspended = B_FALSE; range_seg_t *rs; scan_io_t *sio; zio_t *zio; list_t sio_list; ASSERT(queue->q_scn->scn_is_sorted); list_create(&sio_list, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_list_node)); zio = zio_null(queue->q_scn->scn_zio_root, queue->q_scn->scn_dp->dp_spa, NULL, NULL, NULL, ZIO_FLAG_CANFAIL); mutex_enter(q_lock); queue->q_zio = zio; /* Calculate maximum in-flight bytes for this vdev. */ queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit * (vdev_get_ndisks(queue->q_vd) - vdev_get_nparity(queue->q_vd))); /* reset per-queue scan statistics for this txg */ queue->q_total_seg_size_this_txg = 0; queue->q_segs_this_txg = 0; queue->q_total_zio_size_this_txg = 0; queue->q_zios_this_txg = 0; /* loop until we run out of time or sios */ while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) { uint64_t seg_start = 0, seg_end = 0; boolean_t more_left; ASSERT(list_is_empty(&sio_list)); /* loop while we still have sios left to process in this rs */ do { scan_io_t *first_sio, *last_sio; /* * We have selected which extent needs to be * processed next. Gather up the corresponding sios. */ more_left = scan_io_queue_gather(queue, rs, &sio_list); ASSERT(!list_is_empty(&sio_list)); first_sio = list_head(&sio_list); last_sio = list_tail(&sio_list); seg_end = SIO_GET_END_OFFSET(last_sio); if (seg_start == 0) seg_start = SIO_GET_OFFSET(first_sio); /* * Issuing sios can take a long time so drop the * queue lock. The sio queue won't be updated by * other threads since we're in syncing context so * we can be sure that our trees will remain exactly * as we left them. */ mutex_exit(q_lock); suspended = scan_io_queue_issue(queue, &sio_list); mutex_enter(q_lock); if (suspended) break; } while (more_left); /* update statistics for debugging purposes */ scan_io_queues_update_seg_stats(queue, seg_start, seg_end); if (suspended) break; } /* * If we were suspended in the middle of processing, * requeue any unfinished sios and exit. */ while ((sio = list_remove_head(&sio_list)) != NULL) scan_io_queue_insert_impl(queue, sio); queue->q_zio = NULL; mutex_exit(q_lock); zio_nowait(zio); list_destroy(&sio_list); } /* * Performs an emptying run on all scan queues in the pool. This just * punches out one thread per top-level vdev, each of which processes * only that vdev's scan queue. We can parallelize the I/O here because * we know that each queue's I/Os only affect its own top-level vdev. * * This function waits for the queue runs to complete, and must be * called from dsl_scan_sync (or in general, syncing context). */ static void scan_io_queues_run(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; ASSERT(scn->scn_is_sorted); ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (scn->scn_queues_pending == 0) return; if (scn->scn_taskq == NULL) { int nthreads = spa->spa_root_vdev->vdev_children; /* * We need to make this taskq *always* execute as many * threads in parallel as we have top-level vdevs and no * less, otherwise strange serialization of the calls to * scan_io_queues_run_one can occur during spa_sync runs * and that significantly impacts performance. */ scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads, minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE); } for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; mutex_enter(&vd->vdev_scan_io_queue_lock); if (vd->vdev_scan_io_queue != NULL) { VERIFY(taskq_dispatch(scn->scn_taskq, scan_io_queues_run_one, vd->vdev_scan_io_queue, TQ_SLEEP) != TASKQID_INVALID); } mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * Wait for the queues to finish issuing their IOs for this run * before we return. There may still be IOs in flight at this * point. */ taskq_wait(scn->scn_taskq); } static boolean_t dsl_scan_async_block_should_pause(dsl_scan_t *scn) { uint64_t elapsed_nanosecs; if (zfs_recover) return (B_FALSE); if (zfs_async_block_max_blocks != 0 && scn->scn_visited_this_txg >= zfs_async_block_max_blocks) { return (B_TRUE); } if (zfs_max_async_dedup_frees != 0 && scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees) { return (B_TRUE); } elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)); } static int dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_scan_t *scn = arg; if (!scn->scn_is_bptree || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { if (dsl_scan_async_block_should_pause(scn)) return (SET_ERROR(ERESTART)); } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, 0)); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); scn->scn_visited_this_txg++; if (BP_GET_DEDUP(bp)) scn->scn_dedup_frees_this_txg++; return (0); } static void dsl_scan_update_stats(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t i; uint64_t seg_size_total = 0, zio_size_total = 0; uint64_t seg_count_total = 0, zio_count_total = 0; for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue; if (queue == NULL) continue; seg_size_total += queue->q_total_seg_size_this_txg; zio_size_total += queue->q_total_zio_size_this_txg; seg_count_total += queue->q_segs_this_txg; zio_count_total += queue->q_zios_this_txg; } if (seg_count_total == 0 || zio_count_total == 0) { scn->scn_avg_seg_size_this_txg = 0; scn->scn_avg_zio_size_this_txg = 0; scn->scn_segs_this_txg = 0; scn->scn_zios_this_txg = 0; return; } scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total; scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total; scn->scn_segs_this_txg = seg_count_total; scn->scn_zios_this_txg = zio_count_total; } static int bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (dsl_scan_free_block_cb(arg, bp, tx)); } static int dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); dsl_scan_t *scn = arg; const dva_t *dva = &bp->blk_dva[0]; if (dsl_scan_async_block_should_pause(scn)) return (SET_ERROR(ERESTART)); spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), tx); scn->scn_visited_this_txg++; return (0); } boolean_t dsl_scan_active(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t used = 0, comp, uncomp; boolean_t clones_left; if (spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) || (scn->scn_async_destroying && !scn->scn_async_stalled)) return (B_TRUE); if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); } clones_left = spa_livelist_delete_check(spa); return ((used != 0) || (clones_left)); } boolean_t dsl_errorscrub_active(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; if (spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); if (dsl_errorscrubbing(scn->scn_dp)) return (B_TRUE); return (B_FALSE); } static boolean_t dsl_scan_check_deferred(vdev_t *vd) { boolean_t need_resilver = B_FALSE; for (int c = 0; c < vd->vdev_children; c++) { need_resilver |= dsl_scan_check_deferred(vd->vdev_child[c]); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (need_resilver); if (!vd->vdev_resilver_deferred) need_resilver = B_TRUE; return (need_resilver); } static boolean_t dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_t *vd; vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd->vdev_ops == &vdev_indirect_ops) { /* * The indirect vdev can point to multiple * vdevs. For simplicity, always create * the resilver zio_t. zio_vdev_io_start() * will bypass the child resilver i/o's if * they are on vdevs that don't have DTL's. */ return (B_TRUE); } if (DVA_GET_GANG(dva)) { /* * Gang members may be spread across multiple * vdevs, so the best estimate we have is the * scrub range, which has already been checked. * XXX -- it would be better to change our * allocation policy to ensure that all * gang members reside on the same vdev. */ return (B_TRUE); } /* * Check if the top-level vdev must resilver this offset. * When the offset does not intersect with a dirty leaf DTL * then it may be possible to skip the resilver IO. The psize * is provided instead of asize to simplify the check for RAIDZ. */ if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth)) return (B_FALSE); /* * Check that this top-level vdev has a device under it which * is resilvering and is not deferred. */ if (!dsl_scan_check_deferred(vd)) return (B_FALSE); return (B_TRUE); } static int dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; int err = 0; if (spa_suspend_async_destroy(spa)) return (0); if (zfs_free_bpobj_enabled && spa_version(spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; scn->scn_async_block_min_time_ms = zfs_free_min_time_ms; scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, bpobj_dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); scn->scn_zio_root = NULL; if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); } if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { ASSERT(scn->scn_async_destroying); scn->scn_is_bptree = B_TRUE; scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); scn->scn_zio_root = NULL; if (err == EIO || err == ECKSUM) { err = 0; } else if (err != 0 && err != ERESTART) { zfs_panic_recover("error %u from " "traverse_dataset_destroyed()", err); } if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { /* finished; deactivate async destroy feature */ spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); ASSERT(!spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)); VERIFY0(zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, tx)); VERIFY0(bptree_free(dp->dp_meta_objset, dp->dp_bptree_obj, tx)); dp->dp_bptree_obj = 0; scn->scn_async_destroying = B_FALSE; scn->scn_async_stalled = B_FALSE; } else { /* * If we didn't make progress, mark the async * destroy as stalled, so that we will not initiate * a spa_sync() on its behalf. Note that we only * check this if we are not finished, because if the * bptree had no blocks for us to visit, we can * finish without "making progress". */ scn->scn_async_stalled = (scn->scn_visited_this_txg == 0); } } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " "free_bpobj/bptree on %s in txg %llu; err=%u", (longlong_t)scn->scn_visited_this_txg, (longlong_t) NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), spa->spa_name, (longlong_t)tx->tx_txg, err); scn->scn_visited_this_txg = 0; scn->scn_dedup_frees_this_txg = 0; /* * Write out changes to the DDT and the BRT that may be required * as a result of the blocks freed. This ensures that the DDT * and the BRT are clean when a scrub/resilver runs. */ ddt_sync(spa, tx->tx_txg); brt_sync(spa, tx->tx_txg); } if (err != 0) return (err); if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && zfs_free_leak_on_eio && (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { /* * We have finished background destroying, but there is still * some space left in the dp_free_dir. Transfer this leaked * space to the dp_leak_dir. */ if (dp->dp_leak_dir == NULL) { rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); (void) dsl_dir_create_sync(dp, dp->dp_root_dir, LEAK_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, &dp->dp_leak_dir)); rrw_exit(&dp->dp_config_rwlock, FTAG); } dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); } if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && !spa_livelist_delete_check(spa)) { /* finished; verify that space accounting went to zero */ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); } spa_notify_waiters(spa); EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj), 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ)); if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)); scn->scn_is_bptree = B_FALSE; scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms; err = bpobj_iterate(&dp->dp_obsolete_bpobj, dsl_scan_obsolete_block_cb, scn, tx); if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) dsl_pool_destroy_obsolete_bpobj(dp, tx); } return (0); } static void name_to_bookmark(char *buf, zbookmark_phys_t *zb) { zb->zb_objset = zfs_strtonum(buf, &buf); ASSERT(*buf == ':'); zb->zb_object = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == ':'); zb->zb_level = (int)zfs_strtonum(buf + 1, &buf); ASSERT(*buf == ':'); zb->zb_blkid = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == '\0'); } static void name_to_object(char *buf, uint64_t *obj) { *obj = zfs_strtonum(buf, &buf); ASSERT(*buf == '\0'); } static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; objset_t *os; if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0) return; if (dmu_objset_from_ds(ds, &os) != 0) { dsl_dataset_rele(ds, FTAG); return; } /* * If the key is not loaded dbuf_dnode_findbp() will error out with * EACCES. However in that case dnode_hold() will eventually call * dbuf_read()->zio_wait() which may call spa_log_error(). This will * lead to a deadlock due to us holding the mutex spa_errlist_lock. * Avoid this by checking here if the keys are loaded, if not return. * If the keys are not loaded the head_errlog feature is meaningless * as we cannot figure out the birth txg of the block pointer. */ if (dsl_dataset_get_keystatus(ds->ds_dir) == ZFS_KEYSTATUS_UNAVAILABLE) { dsl_dataset_rele(ds, FTAG); return; } dnode_t *dn; blkptr_t bp; if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0) { dsl_dataset_rele(ds, FTAG); return; } rw_enter(&dn->dn_struct_rwlock, RW_READER); int error = dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL, NULL); if (error) { rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); dsl_dataset_rele(ds, FTAG); return; } if (!error && BP_IS_HOLE(&bp)) { rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); dsl_dataset_rele(ds, FTAG); return; } int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB; /* If it's an intent log block, failure is expected. */ if (zb.zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; ASSERT(!BP_IS_EMBEDDED(&bp)); scan_exec_io(dp, &bp, zio_flags, &zb, NULL); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); dsl_dataset_rele(ds, FTAG); } /* * We keep track of the scrubbed error blocks in "count". This will be used * when deciding whether we exceeded zfs_scrub_error_blocks_per_txg. This * function is modelled after check_filesystem(). */ static int scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep, int *count) { dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds); if (error != 0) return (error); uint64_t latest_txg; uint64_t txg_to_consider = spa->spa_syncing_txg; boolean_t check_snapshot = B_TRUE; error = find_birth_txg(ds, zep, &latest_txg); /* * If find_birth_txg() errors out, then err on the side of caution and * proceed. In worst case scenario scrub all objects. If zep->zb_birth * is 0 (e.g. in case of encryption with unloaded keys) also proceed to * scrub all objects. */ if (error == 0 && zep->zb_birth == latest_txg) { /* Block neither free nor re written. */ zbookmark_phys_t zb; zep_to_zb(fs, zep, &zb); scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* We have already acquired the config lock for spa */ read_by_block_level(scn, zb); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; scn->errorscrub_phys.dep_examined++; scn->errorscrub_phys.dep_to_examine--; (*count)++; if ((*count) == zfs_scrub_error_blocks_per_txg || dsl_error_scrub_check_suspend(scn, &zb)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EFAULT)); } check_snapshot = B_FALSE; } else if (error == 0) { txg_to_consider = latest_txg; } /* * Retrieve the number of snapshots if the dataset is not a snapshot. */ uint64_t snap_count = 0; if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { error = zap_count(spa->spa_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } } if (snap_count == 0) { /* Filesystem without snapshots. */ dsl_dataset_rele(ds, FTAG); return (0); } uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; dsl_dataset_rele(ds, FTAG); /* Check only snapshots created from this file system. */ while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && snap_obj_txg <= txg_to_consider) { error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); if (error != 0) return (error); if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) { snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; dsl_dataset_rele(ds, FTAG); continue; } boolean_t affected = B_TRUE; if (check_snapshot) { uint64_t blk_txg; error = find_birth_txg(ds, zep, &blk_txg); /* * Scrub the snapshot also when zb_birth == 0 or when * find_birth_txg() returns an error. */ affected = (error == 0 && zep->zb_birth == blk_txg) || (error != 0) || (zep->zb_birth == 0); } /* Scrub snapshots. */ if (affected) { zbookmark_phys_t zb; zep_to_zb(snap_obj, zep, &zb); scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* We have already acquired the config lock for spa */ read_by_block_level(scn, zb); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; scn->errorscrub_phys.dep_examined++; scn->errorscrub_phys.dep_to_examine--; (*count)++; if ((*count) == zfs_scrub_error_blocks_per_txg || dsl_error_scrub_check_suspend(scn, &zb)) { dsl_dataset_rele(ds, FTAG); return (EFAULT); } } snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; dsl_dataset_rele(ds, FTAG); } return (0); } void dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) { spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; /* * Only process scans in sync pass 1. */ if (spa_sync_pass(spa) > 1) return; /* * If the spa is shutting down, then stop scanning. This will * ensure that the scan does not dirty any new data during the * shutdown phase. */ if (spa_shutting_down(spa)) return; if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) { return; } if (dsl_scan_resilvering(scn->scn_dp)) { /* cancel the error scrub if resilver started */ dsl_scan_cancel(scn->scn_dp); return; } spa->spa_scrub_active = B_TRUE; scn->scn_sync_start_time = gethrtime(); /* * zfs_scan_suspend_progress can be set to disable scrub progress. * See more detailed comment in dsl_scan_sync(). */ if (zfs_scan_suspend_progress) { uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time; int mintime = zfs_scrub_min_time_ms; while (zfs_scan_suspend_progress && !txg_sync_waiting(scn->scn_dp) && !spa_shutting_down(scn->scn_dp->dp_spa) && NSEC2MSEC(scan_time_ns) < mintime) { delay(hz); scan_time_ns = gethrtime() - scn->scn_sync_start_time; } return; } int i = 0; zap_attribute_t *za; zbookmark_phys_t *zb; boolean_t limit_exceeded = B_FALSE; za = zap_attribute_alloc(); zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP); if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; zap_cursor_advance(&scn->errorscrub_cursor)) { name_to_bookmark(za->za_name, zb); scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); dsl_pool_config_enter(dp, FTAG); read_by_block_level(scn, *zb); dsl_pool_config_exit(dp, FTAG); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; scn->errorscrub_phys.dep_examined += 1; scn->errorscrub_phys.dep_to_examine -= 1; i++; if (i == zfs_scrub_error_blocks_per_txg || dsl_error_scrub_check_suspend(scn, zb)) { limit_exceeded = B_TRUE; break; } } if (!limit_exceeded) dsl_errorscrub_done(scn, B_TRUE, tx); dsl_errorscrub_sync_state(scn, tx); zap_attribute_free(za); kmem_free(zb, sizeof (*zb)); return; } int error = 0; for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; zap_cursor_advance(&scn->errorscrub_cursor)) { zap_cursor_t *head_ds_cursor; zap_attribute_t *head_ds_attr; zbookmark_err_phys_t head_ds_block; head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); head_ds_attr = zap_attribute_alloc(); uint64_t head_ds_err_obj = za->za_first_integer; uint64_t head_ds; name_to_object(za->za_name, &head_ds); boolean_t config_held = B_FALSE; uint64_t top_affected_fs; for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { name_to_errphys(head_ds_attr->za_name, &head_ds_block); /* * In case we are called from spa_sync the pool * config is already held. */ if (!dsl_pool_config_held(dp)) { dsl_pool_config_enter(dp, FTAG); config_held = B_TRUE; } error = find_top_affected_fs(spa, head_ds, &head_ds_block, &top_affected_fs); if (error) break; error = scrub_filesystem(spa, top_affected_fs, &head_ds_block, &i); if (error == SET_ERROR(EFAULT)) { limit_exceeded = B_TRUE; break; } } zap_cursor_fini(head_ds_cursor); kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); zap_attribute_free(head_ds_attr); if (config_held) dsl_pool_config_exit(dp, FTAG); } zap_attribute_free(za); kmem_free(zb, sizeof (*zb)); if (!limit_exceeded) dsl_errorscrub_done(scn, B_TRUE, tx); dsl_errorscrub_sync_state(scn, tx); } /* * This is the primary entry point for scans that is called from syncing * context. Scans must happen entirely during syncing context so that we * can guarantee that blocks we are currently scanning will not change out * from under us. While a scan is active, this function controls how quickly * transaction groups proceed, instead of the normal handling provided by * txg_sync_thread(). */ void dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) { int err = 0; dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; state_sync_type_t sync_type = SYNC_OPTIONAL; int restart_early = 0; if (spa->spa_resilver_deferred) { uint64_t to_issue, issued; if (!spa_feature_is_active(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)) spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx); /* * See print_scan_scrub_resilver_status() issued/total_i * @ cmd/zpool/zpool_main.c */ to_issue = scn->scn_phys.scn_to_examine - scn->scn_phys.scn_skipped; issued = scn->scn_issued_before_pass + spa->spa_scan_pass_issued; restart_early = zfs_resilver_disable_defer || (issued < (to_issue * zfs_resilver_defer_percent / 100)); } /* * Only process scans in sync pass 1. */ if (spa_sync_pass(spa) > 1) return; /* * Check for scn_restart_txg before checking spa_load_state, so * that we can restart an old-style scan while the pool is being * imported (see dsl_scan_init). We also restart scans if there * is a deferred resilver and the user has manually disabled * deferred resilvers via zfs_resilver_disable_defer, or if the * current scan progress is below zfs_resilver_defer_percent. */ if (dsl_scan_restarting(scn, tx) || restart_early) { setup_sync_arg_t setup_sync_arg = { .func = POOL_SCAN_SCRUB, .txgstart = 0, .txgend = 0, }; dsl_scan_done(scn, B_FALSE, tx); if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) setup_sync_arg.func = POOL_SCAN_RESILVER; zfs_dbgmsg("restarting scan func=%u on %s txg=%llu early=%d", setup_sync_arg.func, dp->dp_spa->spa_name, (longlong_t)tx->tx_txg, restart_early); dsl_scan_setup_sync(&setup_sync_arg, tx); } /* * If the spa is shutting down, then stop scanning. This will * ensure that the scan does not dirty any new data during the * shutdown phase. */ if (spa_shutting_down(spa)) return; /* * If the scan is inactive due to a stalled async destroy, try again. */ if (!scn->scn_async_stalled && !dsl_scan_active(scn)) return; /* reset scan statistics */ scn->scn_visited_this_txg = 0; scn->scn_dedup_frees_this_txg = 0; scn->scn_holes_this_txg = 0; scn->scn_lt_min_this_txg = 0; scn->scn_gt_max_this_txg = 0; scn->scn_ddt_contained_this_txg = 0; scn->scn_objsets_visited_this_txg = 0; scn->scn_avg_seg_size_this_txg = 0; scn->scn_segs_this_txg = 0; scn->scn_avg_zio_size_this_txg = 0; scn->scn_zios_this_txg = 0; scn->scn_suspending = B_FALSE; scn->scn_sync_start_time = gethrtime(); spa->spa_scrub_active = B_TRUE; /* * First process the async destroys. If we suspend, don't do * any scrubbing or resilvering. This ensures that there are no * async destroys while we are scanning, so the scan code doesn't * have to worry about traversing it. It is also faster to free the * blocks than to scrub them. */ err = dsl_process_async_destroys(dp, tx); if (err != 0) return; if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) return; /* * Wait a few txgs after importing to begin scanning so that * we can get the pool imported quickly. */ if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS) return; /* * zfs_scan_suspend_progress can be set to disable scan progress. * We don't want to spin the txg_sync thread, so we add a delay * here to simulate the time spent doing a scan. This is mostly * useful for testing and debugging. */ if (zfs_scan_suspend_progress) { uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time; uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; while (zfs_scan_suspend_progress && !txg_sync_waiting(scn->scn_dp) && !spa_shutting_down(scn->scn_dp->dp_spa) && NSEC2MSEC(scan_time_ns) < mintime) { delay(hz); scan_time_ns = gethrtime() - scn->scn_sync_start_time; } return; } /* * Disabled by default, set zfs_scan_report_txgs to report * average performance over the last zfs_scan_report_txgs TXGs. */ if (zfs_scan_report_txgs != 0 && tx->tx_txg % zfs_scan_report_txgs == 0) { scn->scn_issued_before_pass += spa->spa_scan_pass_issued; spa_scan_stat_init(spa); } /* * It is possible to switch from unsorted to sorted at any time, * but afterwards the scan will remain sorted unless reloaded from * a checkpoint after a reboot. */ if (!zfs_scan_legacy) { scn->scn_is_sorted = B_TRUE; if (scn->scn_last_checkpoint == 0) scn->scn_last_checkpoint = ddi_get_lbolt(); } /* * For sorted scans, determine what kind of work we will be doing * this txg based on our memory limitations and whether or not we * need to perform a checkpoint. */ if (scn->scn_is_sorted) { /* * If we are over our checkpoint interval, set scn_clearing * so that we can begin checkpointing immediately. The * checkpoint allows us to save a consistent bookmark * representing how much data we have scrubbed so far. * Otherwise, use the memory limit to determine if we should * scan for metadata or start issue scrub IOs. We accumulate * metadata until we hit our hard memory limit at which point * we issue scrub IOs until we are at our soft memory limit. */ if (scn->scn_checkpointing || ddi_get_lbolt() - scn->scn_last_checkpoint > SEC_TO_TICK(zfs_scan_checkpoint_intval)) { if (!scn->scn_checkpointing) zfs_dbgmsg("begin scan checkpoint for %s", spa->spa_name); scn->scn_checkpointing = B_TRUE; scn->scn_clearing = B_TRUE; } else { boolean_t should_clear = dsl_scan_should_clear(scn); if (should_clear && !scn->scn_clearing) { zfs_dbgmsg("begin scan clearing for %s", spa->spa_name); scn->scn_clearing = B_TRUE; } else if (!should_clear && scn->scn_clearing) { zfs_dbgmsg("finish scan clearing for %s", spa->spa_name); scn->scn_clearing = B_FALSE; } } } else { ASSERT0(scn->scn_checkpointing); ASSERT0(scn->scn_clearing); } if (!scn->scn_clearing && scn->scn_done_txg == 0) { /* Need to scan metadata for more blocks to scrub */ dsl_scan_phys_t *scnp = &scn->scn_phys; taskqid_t prefetch_tqid; /* * Calculate the max number of in-flight bytes for pool-wide * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). * Limits for the issuing phase are done per top-level vdev and * are handled separately. */ scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa))); if (scnp->scn_ddt_bookmark.ddb_class <= scnp->scn_ddt_class_max) { ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark)); zfs_dbgmsg("doing scan sync for %s txg %llu; " "ddt bm=%llu/%llu/%llu/%llx", spa->spa_name, (longlong_t)tx->tx_txg, (longlong_t)scnp->scn_ddt_bookmark.ddb_class, (longlong_t)scnp->scn_ddt_bookmark.ddb_type, (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); } else { zfs_dbgmsg("doing scan sync for %s txg %llu; " "bm=%llu/%llu/%llu/%llu", spa->spa_name, (longlong_t)tx->tx_txg, (longlong_t)scnp->scn_bookmark.zb_objset, (longlong_t)scnp->scn_bookmark.zb_object, (longlong_t)scnp->scn_bookmark.zb_level, (longlong_t)scnp->scn_bookmark.zb_blkid); } scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); scn->scn_prefetch_stop = B_FALSE; prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq, dsl_scan_prefetch_thread, scn, TQ_SLEEP); ASSERT(prefetch_tqid != TASKQID_INVALID); dsl_pool_config_enter(dp, FTAG); dsl_scan_visit(scn, tx); dsl_pool_config_exit(dp, FTAG); mutex_enter(&dp->dp_spa->spa_scrub_lock); scn->scn_prefetch_stop = B_TRUE; cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&dp->dp_spa->spa_scrub_lock); taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; zfs_dbgmsg("scan visited %llu blocks of %s in %llums " "(%llu os's, %llu holes, %llu < mintxg, " "%llu in ddt, %llu > maxtxg)", (longlong_t)scn->scn_visited_this_txg, spa->spa_name, (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), (longlong_t)scn->scn_objsets_visited_this_txg, (longlong_t)scn->scn_holes_this_txg, (longlong_t)scn->scn_lt_min_this_txg, (longlong_t)scn->scn_ddt_contained_this_txg, (longlong_t)scn->scn_gt_max_this_txg); if (!scn->scn_suspending) { ASSERT0(avl_numnodes(&scn->scn_queue)); scn->scn_done_txg = tx->tx_txg + 1; if (scn->scn_is_sorted) { scn->scn_checkpointing = B_TRUE; scn->scn_clearing = B_TRUE; scn->scn_issued_before_pass += spa->spa_scan_pass_issued; spa_scan_stat_init(spa); } zfs_dbgmsg("scan complete for %s txg %llu", spa->spa_name, (longlong_t)tx->tx_txg); } } else if (scn->scn_is_sorted && scn->scn_queues_pending != 0) { ASSERT(scn->scn_clearing); /* need to issue scrubbing IOs from per-vdev queues */ scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); scan_io_queues_run(scn); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; /* calculate and dprintf the current memory usage */ (void) dsl_scan_should_clear(scn); dsl_scan_update_stats(scn); zfs_dbgmsg("scan issued %llu blocks for %s (%llu segs) " "in %llums (avg_block_size = %llu, avg_seg_size = %llu)", (longlong_t)scn->scn_zios_this_txg, spa->spa_name, (longlong_t)scn->scn_segs_this_txg, (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), (longlong_t)scn->scn_avg_zio_size_this_txg, (longlong_t)scn->scn_avg_seg_size_this_txg); } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) { /* Finished with everything. Mark the scrub as complete */ zfs_dbgmsg("scan issuing complete txg %llu for %s", (longlong_t)tx->tx_txg, spa->spa_name); ASSERT3U(scn->scn_done_txg, !=, 0); ASSERT0(spa->spa_scrub_inflight); ASSERT0(scn->scn_queues_pending); dsl_scan_done(scn, B_TRUE, tx); sync_type = SYNC_MANDATORY; } dsl_scan_sync_state(scn, tx, sync_type); } static void count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all) { /* * Don't count embedded bp's, since we already did the work of * scanning these when we scanned the containing block. */ if (BP_IS_EMBEDDED(bp)) return; /* * Update the spa's stats on how many bytes we have issued. * Sequential scrubs create a zio for each DVA of the bp. Each * of these will include all DVAs for repair purposes, but the * zio code will only try the first one unless there is an issue. * Therefore, we should only count the first DVA for these IOs. */ atomic_add_64(&spa->spa_scan_pass_issued, all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); } static void count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all) { if (BP_IS_EMBEDDED(bp)) return; atomic_add_64(&scn->scn_phys.scn_skipped, all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); } static void count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) { /* * If we resume after a reboot, zab will be NULL; don't record * incomplete stats in that case. */ if (zab == NULL) return; for (int i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; if (t & DMU_OT_NEWTYPE) t = DMU_OT_OTHER; zfs_blkstat_t *zb = &zab->zab_type[l][t]; int equal; zb->zb_count++; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) zb->zb_ditto_2_of_2_samevdev++; break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal == 1) zb->zb_ditto_2_of_3_samevdev++; else if (equal == 3) zb->zb_ditto_3_of_3_samevdev++; break; } } } static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) { avl_index_t idx; dsl_scan_t *scn = queue->q_scn; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); if (unlikely(avl_is_empty(&queue->q_sios_by_addr))) atomic_add_64(&scn->scn_queues_pending, 1); if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { /* block is already scheduled for reading */ sio_free(sio); return; } avl_insert(&queue->q_sios_by_addr, sio, idx); queue->q_sio_memused += SIO_GET_MUSED(sio); range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)); } /* * Given all the info we got from our metadata scanning process, we * construct a scan_io_t and insert it into the scan sorting queue. The * I/O must already be suitable for us to process. This is controlled * by dsl_scan_enqueue(). */ static void scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, int zio_flags, const zbookmark_phys_t *zb) { scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp)); ASSERT0(BP_IS_GANG(bp)); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); bp2sio(bp, sio, dva_i); sio->sio_flags = zio_flags; sio->sio_zb = *zb; queue->q_last_ext_addr = -1; scan_io_queue_insert_impl(queue, sio); } /* * Given a set of I/O parameters as discovered by the metadata traversal * process, attempts to place the I/O into the sorted queues (if allowed), * or immediately executes the I/O. */ static void dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, const zbookmark_phys_t *zb) { spa_t *spa = dp->dp_spa; ASSERT(!BP_IS_EMBEDDED(bp)); /* * Gang blocks are hard to issue sequentially, so we just issue them * here immediately instead of queuing them. */ if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) { scan_exec_io(dp, bp, zio_flags, zb, NULL); return; } for (int i = 0; i < BP_GET_NDVAS(bp); i++) { dva_t dva; vdev_t *vdev; dva = bp->blk_dva[i]; vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); ASSERT(vdev != NULL); mutex_enter(&vdev->vdev_scan_io_queue_lock); if (vdev->vdev_scan_io_queue == NULL) vdev->vdev_scan_io_queue = scan_io_queue_create(vdev); ASSERT(dp->dp_scan != NULL); scan_io_queue_insert(vdev->vdev_scan_io_queue, bp, i, zio_flags, zb); mutex_exit(&vdev->vdev_scan_io_queue_lock); } } static int dsl_scan_scrub_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_phys_t *zb) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_GET_BIRTH(bp); size_t psize = BP_GET_PSIZE(bp); boolean_t needs_io = B_FALSE; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; count_block(dp->dp_blkstats, bp); if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) { count_block_skipped(scn, bp, B_TRUE); return (0); } /* Embedded BP's have phys_birth==0, so we reject them above. */ ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { zio_flags |= ZIO_FLAG_SCRUB; needs_io = B_TRUE; } else { ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); zio_flags |= ZIO_FLAG_RESILVER; needs_io = B_FALSE; } /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; for (int d = 0; d < BP_GET_NDVAS(bp); d++) { const dva_t *dva = &bp->blk_dva[d]; /* * Keep track of how much data we've examined so that * zpool(8) status can make useful progress reports. */ uint64_t asize = DVA_GET_ASIZE(dva); scn->scn_phys.scn_examined += asize; spa->spa_scan_pass_exam += asize; /* if it's a resilver, this may not be in the target range */ if (!needs_io) needs_io = dsl_scan_need_resilver(spa, dva, psize, phys_birth); } if (needs_io && !zfs_no_scrub_io) { dsl_scan_enqueue(dp, bp, zio_flags, zb); } else { count_block_skipped(scn, bp, B_TRUE); } /* do not relocate this block */ return (0); } static void dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; dsl_scan_io_queue_t *queue = zio->io_private; abd_free(zio->io_abd); if (queue == NULL) { mutex_enter(&spa->spa_scrub_lock); ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } else { mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock); ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp)); queue->q_inflight_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&queue->q_zio_cv); mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock); } if (zio->io_error && (zio->io_error != ECKSUM || !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { if (dsl_errorscrubbing(spa->spa_dsl_pool) && !dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) { atomic_inc_64(&spa->spa_dsl_pool->dp_scan ->errorscrub_phys.dep_errors); } else { atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys .scn_errors); } } } /* * Given a scanning zio's information, executes the zio. The zio need * not necessarily be only sortable, this function simply executes the * zio, no matter what it is. The optional queue argument allows the * caller to specify that they want per top level vdev IO rate limiting * instead of the legacy global limiting. */ static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue) { spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; size_t size = BP_GET_PSIZE(bp); abd_t *data = abd_alloc_for_io(size, B_FALSE); zio_t *pio; if (queue == NULL) { ASSERT3U(scn->scn_maxinflight_bytes, >, 0); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight += BP_GET_PSIZE(bp); mutex_exit(&spa->spa_scrub_lock); pio = scn->scn_zio_root; } else { kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; ASSERT3U(queue->q_maxinflight_bytes, >, 0); mutex_enter(q_lock); while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes) cv_wait(&queue->q_zio_cv, q_lock); queue->q_inflight_bytes += BP_GET_PSIZE(bp); pio = queue->q_zio; mutex_exit(q_lock); } ASSERT(pio != NULL); count_block_issued(spa, bp, queue == NULL); zio_nowait(zio_read(pio, spa, bp, data, size, dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* * This is the primary extent sorting algorithm. We balance two parameters: * 1) how many bytes of I/O are in an extent * 2) how well the extent is filled with I/O (as a fraction of its total size) * Since we allow extents to have gaps between their constituent I/Os, it's * possible to have a fairly large extent that contains the same amount of * I/O bytes than a much smaller extent, which just packs the I/O more tightly. * The algorithm sorts based on a score calculated from the extent's size, * the relative fill volume (in %) and a "fill weight" parameter that controls * the split between whether we prefer larger extents or more well populated * extents: * * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT) * * Example: * 1) assume extsz = 64 MiB * 2) assume fill = 32 MiB (extent is half full) * 3) assume fill_weight = 3 * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100 * SCORE = 32M + (50 * 3 * 32M) / 100 * SCORE = 32M + (4800M / 100) * SCORE = 32M + 48M * ^ ^ * | +--- final total relative fill-based score * +--------- final total fill-based score * SCORE = 80M * * As can be seen, at fill_ratio=3, the algorithm is slightly biased towards * extents that are more completely filled (in a 3:2 ratio) vs just larger. * Note that as an optimization, we replace multiplication and division by * 100 with bitshifting by 7 (which effectively multiplies and divides by 128). * * Since we do not care if one extent is only few percent better than another, * compress the score into 6 bits via binary logarithm AKA highbit64() and * put into otherwise unused due to ashift high bits of offset. This allows * to reduce q_exts_by_size B-tree elements to only 64 bits and compare them * with single operation. Plus it makes scrubs more sequential and reduces * chances that minor extent change move it within the B-tree. */ __attribute__((always_inline)) inline static int ext_size_compare(const void *x, const void *y) { const uint64_t *a = x, *b = y; return (TREE_CMP(*a, *b)); } ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t, ext_size_compare) static void ext_size_create(range_tree_t *rt, void *arg) { (void) rt; zfs_btree_t *size_tree = arg; zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf, sizeof (uint64_t)); } static void ext_size_destroy(range_tree_t *rt, void *arg) { (void) rt; zfs_btree_t *size_tree = arg; ASSERT0(zfs_btree_numnodes(size_tree)); zfs_btree_destroy(size_tree); } static uint64_t ext_size_value(range_tree_t *rt, range_seg_gap_t *rsg) { (void) rt; uint64_t size = rsg->rs_end - rsg->rs_start; uint64_t score = rsg->rs_fill + ((((rsg->rs_fill << 7) / size) * fill_weight * rsg->rs_fill) >> 7); ASSERT3U(rt->rt_shift, >=, 8); return (((uint64_t)(64 - highbit64(score)) << 56) | rsg->rs_start); } static void ext_size_add(range_tree_t *rt, range_seg_t *rs, void *arg) { zfs_btree_t *size_tree = arg; ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP); uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); zfs_btree_add(size_tree, &v); } static void ext_size_remove(range_tree_t *rt, range_seg_t *rs, void *arg) { zfs_btree_t *size_tree = arg; ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP); uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); zfs_btree_remove(size_tree, &v); } static void ext_size_vacate(range_tree_t *rt, void *arg) { zfs_btree_t *size_tree = arg; zfs_btree_clear(size_tree); zfs_btree_destroy(size_tree); ext_size_create(rt, arg); } static const range_tree_ops_t ext_size_ops = { .rtop_create = ext_size_create, .rtop_destroy = ext_size_destroy, .rtop_add = ext_size_add, .rtop_remove = ext_size_remove, .rtop_vacate = ext_size_vacate }; /* * Comparator for the q_sios_by_addr tree. Sorting is simply performed * based on LBA-order (from lowest to highest). */ static int sio_addr_compare(const void *x, const void *y) { const scan_io_t *a = x, *b = y; return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); } /* IO queues are created on demand when they are needed. */ static dsl_scan_io_queue_t * scan_io_queue_create(vdev_t *vd) { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP); q->q_scn = scn; q->q_vd = vd; q->q_sio_memused = 0; q->q_last_ext_addr = -1; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); q->q_exts_by_addr = range_tree_create_gap(&ext_size_ops, RANGE_SEG_GAP, &q->q_exts_by_size, 0, vd->vdev_ashift, zfs_scan_max_ext_gap); avl_create(&q->q_sios_by_addr, sio_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); return (q); } /* * Destroys a scan queue and all segments and scan_io_t's contained in it. * No further execution of I/O occurs, anything pending in the queue is * simply freed without being executed. */ void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) { dsl_scan_t *scn = queue->q_scn; scan_io_t *sio; void *cookie = NULL; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); if (!avl_is_empty(&queue->q_sios_by_addr)) atomic_add_64(&scn->scn_queues_pending, -1); while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != NULL) { ASSERT(range_tree_contains(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio))); queue->q_sio_memused -= SIO_GET_MUSED(sio); sio_free(sio); } ASSERT0(queue->q_sio_memused); range_tree_vacate(queue->q_exts_by_addr, NULL, queue); range_tree_destroy(queue->q_exts_by_addr); avl_destroy(&queue->q_sios_by_addr); cv_destroy(&queue->q_zio_cv); kmem_free(queue, sizeof (*queue)); } /* * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. This is * called on behalf of vdev_top_transfer when creating or destroying * a mirror vdev due to zpool attach/detach. */ void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd) { mutex_enter(&svd->vdev_scan_io_queue_lock); mutex_enter(&tvd->vdev_scan_io_queue_lock); VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL); tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue; svd->vdev_scan_io_queue = NULL; if (tvd->vdev_scan_io_queue != NULL) tvd->vdev_scan_io_queue->q_vd = tvd; mutex_exit(&tvd->vdev_scan_io_queue_lock); mutex_exit(&svd->vdev_scan_io_queue_lock); } static void scan_io_queues_destroy(dsl_scan_t *scn) { vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *tvd = rvd->vdev_child[i]; mutex_enter(&tvd->vdev_scan_io_queue_lock); if (tvd->vdev_scan_io_queue != NULL) dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue); tvd->vdev_scan_io_queue = NULL; mutex_exit(&tvd->vdev_scan_io_queue_lock); } } static void dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) { dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; vdev_t *vdev; kmutex_t *q_lock; dsl_scan_io_queue_t *queue; scan_io_t *srch_sio, *sio; avl_index_t idx; uint64_t start, size; vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i])); ASSERT(vdev != NULL); q_lock = &vdev->vdev_scan_io_queue_lock; queue = vdev->vdev_scan_io_queue; mutex_enter(q_lock); if (queue == NULL) { mutex_exit(q_lock); return; } srch_sio = sio_alloc(BP_GET_NDVAS(bp)); bp2sio(bp, srch_sio, dva_i); start = SIO_GET_OFFSET(srch_sio); size = SIO_GET_ASIZE(srch_sio); /* * We can find the zio in two states: * 1) Cold, just sitting in the queue of zio's to be issued at * some point in the future. In this case, all we do is * remove the zio from the q_sios_by_addr tree, decrement * its data volume from the containing range_seg_t and * resort the q_exts_by_size tree to reflect that the * range_seg_t has lost some of its 'fill'. We don't shorten * the range_seg_t - this is usually rare enough not to be * worth the extra hassle of trying keep track of precise * extent boundaries. * 2) Hot, where the zio is currently in-flight in * dsl_scan_issue_ios. In this case, we can't simply * reach in and stop the in-flight zio's, so we instead * block the caller. Eventually, dsl_scan_issue_ios will * be done with issuing the zio's it gathered and will * signal us. */ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); sio_free(srch_sio); if (sio != NULL) { blkptr_t tmpbp; /* Got it while it was cold in the queue */ ASSERT3U(start, ==, SIO_GET_OFFSET(sio)); ASSERT3U(size, ==, SIO_GET_ASIZE(sio)); avl_remove(&queue->q_sios_by_addr, sio); if (avl_is_empty(&queue->q_sios_by_addr)) atomic_add_64(&scn->scn_queues_pending, -1); queue->q_sio_memused -= SIO_GET_MUSED(sio); ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); range_tree_remove_fill(queue->q_exts_by_addr, start, size); /* count the block as though we skipped it */ sio2bp(sio, &tmpbp); count_block_skipped(scn, &tmpbp, B_FALSE); sio_free(sio); } mutex_exit(q_lock); } /* * Callback invoked when a zio_free() zio is executing. This needs to be * intercepted to prevent the zio from deallocating a particular portion * of disk space and it then getting reallocated and written to, while we * still have it queued up for processing. */ void dsl_scan_freed(spa_t *spa, const blkptr_t *bp) { dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(scn != NULL); if (!dsl_scan_is_running(scn)) return; for (int i = 0; i < BP_GET_NDVAS(bp); i++) dsl_scan_freed_dva(spa, bp, i); } /* * Check if a vdev needs resilvering (non-empty DTL), if so, and resilver has * not started, start it. Otherwise, only restart if max txg in DTL range is * greater than the max txg in the current scan. If the DTL max is less than * the scan max, then the vdev has not missed any new data since the resilver * started, so a restart is not needed. */ void dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd) { uint64_t min, max; if (!vdev_resilver_needed(vd, &min, &max)) return; if (!dsl_scan_resilvering(dp)) { spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); return; } if (max <= dp->dp_scan->scn_phys.scn_max_txg) return; /* restart is needed, check if it can be deferred */ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)) vdev_defer_resilver(vd); else spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); } ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, U64, ZMOD_RW, "Max bytes in flight per leaf vdev for scrubs and resilvers"); ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, UINT, ZMOD_RW, "Min millisecs to scrub per txg"); ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, UINT, ZMOD_RW, "Min millisecs to obsolete per txg"); ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, UINT, ZMOD_RW, "Min millisecs to free per txg"); ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, UINT, ZMOD_RW, "Min millisecs to resilver per txg"); ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW, "Set to prevent scans from progressing"); ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW, "Set to disable scrub I/O"); ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW, "Set to disable scrub prefetching"); ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW, "Max number of blocks freed in one txg"); ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW, "Max number of dedup blocks freed in one txg"); ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW, "Enable processing of the free_bpobj"); ZFS_MODULE_PARAM(zfs, zfs_, scan_blkstats, INT, ZMOD_RW, "Enable block statistics calculation during scrub"); ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, UINT, ZMOD_RW, "Fraction of RAM for scan hard limit"); ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, UINT, ZMOD_RW, "IO issuing strategy during scrubbing. 0 = default, 1 = LBA, 2 = size"); ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW, "Scrub using legacy non-sequential method"); ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, UINT, ZMOD_RW, "Scan progress on-disk checkpointing interval"); ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, U64, ZMOD_RW, "Max gap in bytes between sequential scrub / resilver I/Os"); ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, UINT, ZMOD_RW, "Fraction of hard limit used as soft limit"); ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW, "Tunable to attempt to reduce lock contention"); ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, UINT, ZMOD_RW, "Tunable to adjust bias towards more filled segments during scans"); ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW, "Tunable to report resilver performance over the last N txgs"); ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, "Process all resilvers immediately"); ZFS_MODULE_PARAM(zfs, zfs_, resilver_defer_percent, UINT, ZMOD_RW, "Issued IO percent complete after which resilvers are deferred"); ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, UINT, ZMOD_RW, "Error blocks to be scrubbed in one txg"); -/* END CSTYLED */ diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 3bd6e93e93a4..7affbfac9dc7 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1,6287 +1,6283 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #define GANG_ALLOCATION(flags) \ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) /* * Metaslab granularity, in bytes. This is roughly similar to what would be * referred to as the "stripe size" in traditional RAID arrays. In normal * operation, we will try to write this amount of data to each disk before * moving on to the next top-level vdev. */ static uint64_t metaslab_aliquot = 1024 * 1024; /* * For testing, make some blocks above a certain size be gang blocks. */ uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* * Of blocks of size >= metaslab_force_ganging, actually gang them this often. */ uint_t metaslab_force_ganging_pct = 3; /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered * around the disk. So a sane default for the space map block size * is 8~16K. */ int zfs_metaslab_sm_blksz_no_log = (1 << 14); /* * When the log space map feature is enabled, we accumulate a lot of * changes per metaslab that are flushed once in a while so we benefit * from a bigger block size like 128K for the metaslab space maps. */ int zfs_metaslab_sm_blksz_with_log = (1 << 17); /* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ uint_t zfs_condense_pct = 200; /* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksz), so a metaslab might use the * same number of blocks after condensing. Since the goal of condensing is to * reduce the number of IOPs required to read the space map, we only want to * condense when we can be sure we will reduce the number of blocks used by the * space map. Unfortunately, we cannot precisely compute whether or not this is * the case in metaslab_should_condense since we are holding ms_lock. Instead, * we apply the following heuristic: do not condense a spacemap unless the * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold * blocks. */ static const int zfs_metaslab_condense_block_threshold = 4; /* * The zfs_mg_noalloc_threshold defines which metaslab groups should * be eligible for allocation. The value is defined as a percentage of * free space. Metaslab groups that have more free space than * zfs_mg_noalloc_threshold are always eligible for allocations. Once * a metaslab group's free space is less than or equal to the * zfs_mg_noalloc_threshold the allocator will avoid allocating to that * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. * Once all groups in the pool reach zfs_mg_noalloc_threshold then all * groups are allowed to accept allocations. Gang blocks are always * eligible to allocate on any metaslab group. The default value of 0 means * no metaslab group will be excluded based on this criterion. */ static uint_t zfs_mg_noalloc_threshold = 0; /* * Metaslab groups are considered eligible for allocations if their * fragmentation metric (measured as a percentage) is less than or * equal to zfs_mg_fragmentation_threshold. If a metaslab group * exceeds this threshold then it will be skipped unless all metaslab * groups within the metaslab class have also crossed this threshold. * * This tunable was introduced to avoid edge cases where we continue * allocating from very fragmented disks in our pool while other, less * fragmented disks, exists. On the other hand, if all disks in the * pool are uniformly approaching the threshold, the threshold can * be a speed bump in performance, where we keep switching the disks * that we allocate from (e.g. we allocate some segments from disk A * making it bypassing the threshold while freeing segments from disk * B getting its fragmentation below the threshold). * * Empirically, we've seen that our vdev selection for allocations is * good enough that fragmentation increases uniformly across all vdevs * the majority of the time. Thus we set the threshold percentage high * enough to avoid hitting the speed bump on pools that are being pushed * to the edge. */ static uint_t zfs_mg_fragmentation_threshold = 95; /* * Allow metaslabs to keep their active state as long as their fragmentation * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An * active metaslab that exceeds this threshold will no longer keep its active * status allowing better metaslabs to be selected. */ static uint_t zfs_metaslab_fragmentation_threshold = 70; /* * When set will load all metaslabs when pool is first opened. */ int metaslab_debug_load = B_FALSE; /* * When set will prevent metaslabs from being unloaded. */ static int metaslab_debug_unload = B_FALSE; /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ uint_t metaslab_df_free_pct = 4; /* * Maximum distance to search forward from the last offset. Without this * limit, fragmented pools can see >100,000 iterations and * metaslab_block_picker() becomes the performance limiting factor on * high-performance storage. * * With the default setting of 16MB, we typically see less than 500 * iterations, even with very fragmented, ashift=9 pools. The maximum number * of iterations possible is: * metaslab_df_max_search / (2 * (1<60KB (but fewer segments in this * bucket, and therefore a lower weight). */ static uint_t zfs_metaslab_find_max_tries = 100; static uint64_t metaslab_weight(metaslab_t *, boolean_t); static void metaslab_set_fragmentation(metaslab_t *, boolean_t); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); static void metaslab_passivate(metaslab_t *msp, uint64_t weight); static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); static unsigned int metaslab_idx_func(multilist_t *, void *); static void metaslab_evict(metaslab_t *, uint64_t); static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); kmem_cache_t *metaslab_alloc_trace_cache; typedef struct metaslab_stats { kstat_named_t metaslabstat_trace_over_limit; kstat_named_t metaslabstat_reload_tree; kstat_named_t metaslabstat_too_many_tries; kstat_named_t metaslabstat_try_hard; } metaslab_stats_t; static metaslab_stats_t metaslab_stats = { { "trace_over_limit", KSTAT_DATA_UINT64 }, { "reload_tree", KSTAT_DATA_UINT64 }, { "too_many_tries", KSTAT_DATA_UINT64 }, { "try_hard", KSTAT_DATA_UINT64 }, }; #define METASLABSTAT_BUMP(stat) \ atomic_inc_64(&metaslab_stats.stat.value.ui64); static kstat_t *metaslab_ksp; void metaslab_stat_init(void) { ASSERT(metaslab_alloc_trace_cache == NULL); metaslab_alloc_trace_cache = kmem_cache_create( "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0); metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats", "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (metaslab_ksp != NULL) { metaslab_ksp->ks_data = &metaslab_stats; kstat_install(metaslab_ksp); } } void metaslab_stat_fini(void) { if (metaslab_ksp != NULL) { kstat_delete(metaslab_ksp); metaslab_ksp = NULL; } kmem_cache_destroy(metaslab_alloc_trace_cache); metaslab_alloc_trace_cache = NULL; } /* * ========================================================================== * Metaslab classes * ========================================================================== */ metaslab_class_t * metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops) { metaslab_class_t *mc; mc = kmem_zalloc(offsetof(metaslab_class_t, mc_allocator[spa->spa_alloc_count]), KM_SLEEP); mc->mc_spa = spa; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t), offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; mca->mca_rotor = NULL; zfs_refcount_create_tracked(&mca->mca_alloc_slots); } return (mc); } void metaslab_class_destroy(metaslab_class_t *mc) { spa_t *spa = mc->mc_spa; ASSERT(mc->mc_alloc == 0); ASSERT(mc->mc_deferred == 0); ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; ASSERT(mca->mca_rotor == NULL); zfs_refcount_destroy(&mca->mca_alloc_slots); } mutex_destroy(&mc->mc_lock); multilist_destroy(&mc->mc_metaslab_txg_list); kmem_free(mc, offsetof(metaslab_class_t, mc_allocator[spa->spa_alloc_count])); } int metaslab_class_validate(metaslab_class_t *mc) { metaslab_group_t *mg; vdev_t *vd; /* * Must hold one of the spa_config locks. */ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); if ((mg = mc->mc_allocator[0].mca_rotor) == NULL) return (0); do { vd = mg->mg_vd; ASSERT(vd->vdev_mg != NULL); ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(mg->mg_class, ==, mc); ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor); return (0); } static void metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { atomic_add_64(&mc->mc_alloc, alloc_delta); atomic_add_64(&mc->mc_deferred, defer_delta); atomic_add_64(&mc->mc_space, space_delta); atomic_add_64(&mc->mc_dspace, dspace_delta); } uint64_t metaslab_class_get_alloc(metaslab_class_t *mc) { return (mc->mc_alloc); } uint64_t metaslab_class_get_deferred(metaslab_class_t *mc) { return (mc->mc_deferred); } uint64_t metaslab_class_get_space(metaslab_class_t *mc) { return (mc->mc_space); } uint64_t metaslab_class_get_dspace(metaslab_class_t *mc) { return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); } void metaslab_class_histogram_verify(metaslab_class_t *mc) { spa_t *spa = mc->mc_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t *mc_hist; int i; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); mutex_enter(&mc->mc_lock); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = vdev_get_mg(tvd, mc); /* * Skip any holes, uninitialized top-levels, or * vdevs that are not in this metalab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; } for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); } mutex_exit(&mc->mc_lock); kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } /* * Calculate the metaslab class's fragmentation metric. The metric * is weighted based on the space contribution of each metaslab group. * The return value will be a number between 0 and 100 (inclusive), or * ZFS_FRAG_INVALID if the metric has not been set. See comment above the * zfs_frag_table for more information about the metric. */ uint64_t metaslab_class_fragmentation(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t fragmentation = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; /* * Skip any holes, uninitialized top-levels, * or vdevs that are not in this metalab class. */ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * If a metaslab group does not contain a fragmentation * metric then just bail out. */ if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (ZFS_FRAG_INVALID); } /* * Determine how much this metaslab_group is contributing * to the overall pool fragmentation metric. */ fragmentation += mg->mg_fragmentation * metaslab_group_get_space(mg); } fragmentation /= metaslab_class_get_space(mc); ASSERT3U(fragmentation, <=, 100); spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (fragmentation); } /* * Calculate the amount of expandable space that is available in * this metaslab class. If a device is expanded then its expandable * space will be the amount of allocatable space that is currently not * part of this metaslab class. */ uint64_t metaslab_class_expandable_space(metaslab_class_t *mc) { vdev_t *rvd = mc->mc_spa->spa_root_vdev; uint64_t space = 0; spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || mg->mg_class != mc) { continue; } /* * Calculate if we have enough space to add additional * metaslabs. We report the expandable space in terms * of the metaslab size since that's the unit of expansion. */ space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize, 1ULL << tvd->vdev_ms_shift, uint64_t); } spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); return (space); } void metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg) { multilist_t *ml = &mc->mc_metaslab_txg_list; hrtime_t now = gethrtime(); for (int i = 0; i < multilist_get_num_sublists(ml); i++) { multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL) { mutex_enter(&msp->ms_lock); /* * If the metaslab has been removed from the list * (which could happen if we were at the memory limit * and it was evicted during this loop), then we can't * proceed and we should restart the sublist. */ if (!multilist_link_active(&msp->ms_class_txg_node)) { mutex_exit(&msp->ms_lock); i--; break; } mls = multilist_sublist_lock_idx(ml, i); metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); if (txg > msp->ms_selected_txg + metaslab_unload_delay && now > msp->ms_selected_time + MSEC2NSEC(metaslab_unload_delay_ms) && (msp->ms_allocator == -1 || !metaslab_preload_enabled)) { metaslab_evict(msp, txg); } else { /* * Once we've hit a metaslab selected too * recently to evict, we're done evicting for * now. */ mutex_exit(&msp->ms_lock); break; } mutex_exit(&msp->ms_lock); msp = next_msp; } } } static int metaslab_compare(const void *x1, const void *x2) { const metaslab_t *m1 = (const metaslab_t *)x1; const metaslab_t *m2 = (const metaslab_t *)x2; int sort1 = 0; int sort2 = 0; if (m1->ms_allocator != -1 && m1->ms_primary) sort1 = 1; else if (m1->ms_allocator != -1 && !m1->ms_primary) sort1 = 2; if (m2->ms_allocator != -1 && m2->ms_primary) sort2 = 1; else if (m2->ms_allocator != -1 && !m2->ms_primary) sort2 = 2; /* * Sort inactive metaslabs first, then primaries, then secondaries. When * selecting a metaslab to allocate from, an allocator first tries its * primary, then secondary active metaslab. If it doesn't have active * metaslabs, or can't allocate from them, it searches for an inactive * metaslab to activate. If it can't find a suitable one, it will steal * a primary or secondary metaslab from another allocator. */ if (sort1 < sort2) return (-1); if (sort1 > sort2) return (1); int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); if (likely(cmp)) return (cmp); IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); return (TREE_CMP(m1->ms_start, m2->ms_start)); } /* * ========================================================================== * Metaslab groups * ========================================================================== */ /* * Update the allocatable flag and the metaslab group's capacity. * The allocatable flag is set to true if the capacity is below * the zfs_mg_noalloc_threshold or has a fragmentation value that is * greater than zfs_mg_fragmentation_threshold. If a metaslab group * transitions from allocatable to non-allocatable or vice versa then the * metaslab group's class is updated to reflect the transition. */ static void metaslab_group_alloc_update(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; metaslab_class_t *mc = mg->mg_class; vdev_stat_t *vs = &vd->vdev_stat; boolean_t was_allocatable; boolean_t was_initialized; ASSERT(vd == vd->vdev_top); ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, SCL_ALLOC); mutex_enter(&mg->mg_lock); was_allocatable = mg->mg_allocatable; was_initialized = mg->mg_initialized; mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / (vs->vs_space + 1); mutex_enter(&mc->mc_lock); /* * If the metaslab group was just added then it won't * have any space until we finish syncing out this txg. * At that point we will consider it initialized and available * for allocations. We also don't consider non-activated * metaslab groups (e.g. vdevs that are in the middle of being removed) * to be initialized, because they can't be used for allocation. */ mg->mg_initialized = metaslab_group_initialized(mg); if (!was_initialized && mg->mg_initialized) { mc->mc_groups++; } else if (was_initialized && !mg->mg_initialized) { ASSERT3U(mc->mc_groups, >, 0); mc->mc_groups--; } if (mg->mg_initialized) mg->mg_no_free_space = B_FALSE; /* * A metaslab group is considered allocatable if it has plenty * of free space or is not heavily fragmented. We only take * fragmentation into account if the metaslab group has a valid * fragmentation metric (i.e. a value between 0 and 100). */ mg->mg_allocatable = (mg->mg_activation_count > 0 && mg->mg_free_capacity > zfs_mg_noalloc_threshold && (mg->mg_fragmentation == ZFS_FRAG_INVALID || mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); /* * The mc_alloc_groups maintains a count of the number of * groups in this metaslab class that are still above the * zfs_mg_noalloc_threshold. This is used by the allocating * threads to determine if they should avoid allocations to * a given group. The allocator will avoid allocations to a group * if that group has reached or is below the zfs_mg_noalloc_threshold * and there are still other groups that are above the threshold. * When a group transitions from allocatable to non-allocatable or * vice versa we update the metaslab class to reflect that change. * When the mc_alloc_groups value drops to 0 that means that all * groups have reached the zfs_mg_noalloc_threshold making all groups * eligible for allocations. This effectively means that all devices * are balanced again. */ if (was_allocatable && !mg->mg_allocatable) mc->mc_alloc_groups--; else if (!was_allocatable && mg->mg_allocatable) mc->mc_alloc_groups++; mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } int metaslab_sort_by_flushed(const void *va, const void *vb) { const metaslab_t *a = va; const metaslab_t *b = vb; int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); if (likely(cmp)) return (cmp); uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; cmp = TREE_CMP(a_vdev_id, b_vdev_id); if (cmp) return (cmp); return (TREE_CMP(a->ms_id, b->ms_id)); } metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { metaslab_group_t *mg; mg = kmem_zalloc(offsetof(metaslab_group_t, mg_allocator[allocators]), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; mg->mg_initialized = B_FALSE; mg->mg_no_free_space = B_TRUE; mg->mg_allocators = allocators; for (int i = 0; i < allocators; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); } return (mg); } void metaslab_group_destroy(metaslab_group_t *mg) { ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); /* * We may have gone below zero with the activation count * either because we never activated in the first place or * because we're done, and possibly removing the vdev. */ ASSERT(mg->mg_activation_count <= 0); avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); mutex_destroy(&mg->mg_ms_disabled_lock); cv_destroy(&mg->mg_ms_disabled_cv); for (int i = 0; i < mg->mg_allocators; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; zfs_refcount_destroy(&mga->mga_alloc_queue_depth); } kmem_free(mg, offsetof(metaslab_group_t, mg_allocator[mg->mg_allocators])); } void metaslab_group_activate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count <= 0); if (++mg->mg_activation_count <= 0) return; mg->mg_aliquot = metaslab_aliquot * MAX(1, vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd)); metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { mg->mg_prev = mg; mg->mg_next = mg; } else { mgnext = mgprev->mg_next; mg->mg_prev = mgprev; mg->mg_next = mgnext; mgprev->mg_next = mg; mgnext->mg_prev = mg; } for (int i = 0; i < spa->spa_alloc_count; i++) { mc->mc_allocator[i].mca_rotor = mg; mg = mg->mg_next; } } /* * Passivate a metaslab group and remove it from the allocation rotor. * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating * a metaslab group. This function will momentarily drop spa_config_locks * that are lower than the SCL_ALLOC lock (see comment below). */ void metaslab_group_passivate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, (SCL_ALLOC | SCL_ZIO)); if (--mg->mg_activation_count != 0) { for (int i = 0; i < spa->spa_alloc_count; i++) ASSERT(mc->mc_allocator[i].mca_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count < 0); return; } /* * The spa_config_lock is an array of rwlocks, ordered as * follows (from highest to lowest): * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > * SCL_ZIO > SCL_FREE > SCL_VDEV * (For more information about the spa_config_lock see spa_misc.c) * The higher the lock, the broader its coverage. When we passivate * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO * config locks. However, the metaslab group's taskq might be trying * to preload metaslabs so we must drop the SCL_ZIO lock and any * lower locks to allow the I/O to complete. At a minimum, * we continue to hold the SCL_ALLOC lock, which prevents any future * allocations from taking place and any changes to the vdev tree. */ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); taskq_wait_outstanding(spa->spa_metaslab_taskq, 0); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); for (int i = 0; i < mg->mg_allocators; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; metaslab_t *msp = mga->mga_primary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } msp = mga->mga_secondary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, metaslab_weight_from_range_tree(msp)); mutex_exit(&msp->ms_lock); } } mgprev = mg->mg_prev; mgnext = mg->mg_next; if (mg == mgnext) { mgnext = NULL; } else { mgprev->mg_next = mgnext; mgnext->mg_prev = mgprev; } for (int i = 0; i < spa->spa_alloc_count; i++) { if (mc->mc_allocator[i].mca_rotor == mg) mc->mc_allocator[i].mca_rotor = mgnext; } mg->mg_prev = NULL; mg->mg_next = NULL; } boolean_t metaslab_group_initialized(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; vdev_stat_t *vs = &vd->vdev_stat; return (vs->vs_space != 0 && mg->mg_activation_count > 0); } uint64_t metaslab_group_get_space(metaslab_group_t *mg) { /* * Note that the number of nodes in mg_metaslab_tree may be one less * than vdev_ms_count, due to the embedded log metaslab. */ mutex_enter(&mg->mg_lock); uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree); mutex_exit(&mg->mg_lock); return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count); } void metaslab_group_histogram_verify(metaslab_group_t *mg) { uint64_t *mg_hist; avl_tree_t *t = &mg->mg_metaslab_tree; uint64_t ashift = mg->mg_vd->vdev_ashift; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, SPACE_MAP_HISTOGRAM_SIZE + ashift); mutex_enter(&mg->mg_lock); for (metaslab_t *msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { VERIFY3P(msp->ms_group, ==, mg); /* skip if not active */ if (msp->ms_sm == NULL) continue; for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { mg_hist[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } } for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); mutex_exit(&mg->mg_lock); kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } static void metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } void metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) { metaslab_class_t *mc = mg->mg_class; uint64_t ashift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_sm == NULL) return; mutex_enter(&mg->mg_lock); mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { ASSERT3U(mg->mg_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); ASSERT3U(mc->mc_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; } mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { ASSERT(msp->ms_group == NULL); mutex_enter(&mg->mg_lock); msp->ms_group = mg; msp->ms_weight = 0; avl_add(&mg->mg_metaslab_tree, msp); mutex_exit(&mg->mg_lock); mutex_enter(&msp->ms_lock); metaslab_group_histogram_add(mg, msp); mutex_exit(&msp->ms_lock); } static void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) { mutex_enter(&msp->ms_lock); metaslab_group_histogram_remove(mg, msp); mutex_exit(&msp->ms_lock); mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); multilist_sublist_unlock(mls); msp->ms_group = NULL; mutex_exit(&mg->mg_lock); } static void metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&mg->mg_lock)); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); msp->ms_weight = weight; avl_add(&mg->mg_metaslab_tree, msp); } static void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { /* * Although in principle the weight can be any value, in * practice we do not use values in the range [1, 511]. */ ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } /* * Calculate the fragmentation for a given metaslab group. We can use * a simple average here since all metaslabs within the group must have * the same size. The return value will be a value between 0 and 100 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this * group have a fragmentation metric. */ uint64_t metaslab_group_fragmentation(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; uint64_t fragmentation = 0; uint64_t valid_ms = 0; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp->ms_fragmentation == ZFS_FRAG_INVALID) continue; if (msp->ms_group != mg) continue; valid_ms++; fragmentation += msp->ms_fragmentation; } if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) return (ZFS_FRAG_INVALID); fragmentation /= valid_ms; ASSERT3U(fragmentation, <=, 100); return (fragmentation); } /* * Determine if a given metaslab group should skip allocations. A metaslab * group should avoid allocations if its free capacity is less than the * zfs_mg_noalloc_threshold or its fragmentation metric is greater than * zfs_mg_fragmentation_threshold and there is at least one metaslab group * that can still handle allocations. If the allocation throttle is enabled * then we skip allocations to devices that have reached their maximum * allocation queue depth unless the selected metaslab group is the only * eligible group remaining. */ static boolean_t metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, int flags, uint64_t psize, int allocator, int d) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; /* * We can only consider skipping this metaslab group if it's * in the normal metaslab class and there are other metaslab * groups to select from. Otherwise, we always consider it eligible * for allocations. */ if ((mc != spa_normal_class(spa) && mc != spa_special_class(spa) && mc != spa_dedup_class(spa)) || mc->mc_groups <= 1) return (B_TRUE); /* * If the metaslab group's mg_allocatable flag is set (see comments * in metaslab_group_alloc_update() for more information) and * the allocation throttle is disabled then allow allocations to this * device. However, if the allocation throttle is enabled then * check if we have reached our allocation limit (mga_alloc_queue_depth) * to determine if we should allow allocations to this metaslab group. * If all metaslab groups are no longer considered allocatable * (mc_alloc_groups == 0) or we're trying to allocate the smallest * gang block size then we allow allocations on this metaslab group * regardless of the mg_allocatable or throttle settings. */ if (mg->mg_allocatable) { metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; int64_t qdepth; uint64_t qmax = mga->mga_cur_max_alloc_queue_depth; if (!mc->mc_alloc_throttle_enabled) return (B_TRUE); /* * If this metaslab group does not have any free space, then * there is no point in looking further. */ if (mg->mg_no_free_space) return (B_FALSE); /* * Some allocations (e.g., those coming from device removal * where the * allocations are not even counted in the * metaslab * allocation queues) are allowed to bypass * the throttle. */ if (flags & METASLAB_DONT_THROTTLE) return (B_TRUE); /* * Relax allocation throttling for ditto blocks. Due to * random imbalances in allocation it tends to push copies * to one vdev, that looks a bit better at the moment. */ qmax = qmax * (4 + d) / 4; qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth); /* * If this metaslab group is below its qmax or it's * the only allocatable metaslab group, then attempt * to allocate from it. */ if (qdepth < qmax || mc->mc_alloc_groups == 1) return (B_TRUE); ASSERT3U(mc->mc_alloc_groups, >, 1); /* * Since this metaslab group is at or over its qmax, we * need to determine if there are metaslab groups after this * one that might be able to handle this allocation. This is * racy since we can't hold the locks for all metaslab * groups at the same time when we make this check. */ for (metaslab_group_t *mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { metaslab_group_allocator_t *mgap = &mgp->mg_allocator[allocator]; qmax = mgap->mga_cur_max_alloc_queue_depth; qmax = qmax * (4 + d) / 4; qdepth = zfs_refcount_count(&mgap->mga_alloc_queue_depth); /* * If there is another metaslab group that * might be able to handle the allocation, then * we return false so that we skip this group. */ if (qdepth < qmax && !mgp->mg_no_free_space) return (B_FALSE); } /* * We didn't find another group to handle the allocation * so we can't skip this metaslab group even though * we are at or over our qmax. */ return (B_TRUE); } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { return (B_TRUE); } return (B_FALSE); } /* * ========================================================================== * Range tree callbacks * ========================================================================== */ /* * Comparison function for the private size-ordered tree using 32-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ __attribute__((always_inline)) inline static int metaslab_rangesize32_compare(const void *x1, const void *x2) { const range_seg32_t *r1 = x1; const range_seg32_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } /* * Comparison function for the private size-ordered tree using 64-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ __attribute__((always_inline)) inline static int metaslab_rangesize64_compare(const void *x1, const void *x2) { const range_seg64_t *r1 = x1; const range_seg64_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } typedef struct metaslab_rt_arg { zfs_btree_t *mra_bt; uint32_t mra_floor_shift; } metaslab_rt_arg_t; struct mssa_arg { range_tree_t *rt; metaslab_rt_arg_t *mra; }; static void metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) { struct mssa_arg *mssap = arg; range_tree_t *rt = mssap->rt; metaslab_rt_arg_t *mrap = mssap->mra; range_seg_max_t seg = {0}; rs_set_start(&seg, rt, start); rs_set_end(&seg, rt, start + size); metaslab_rt_add(rt, &seg, mrap); } static void metaslab_size_tree_full_load(range_tree_t *rt) { metaslab_rt_arg_t *mrap = rt->rt_arg; METASLABSTAT_BUMP(metaslabstat_reload_tree); ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); mrap->mra_floor_shift = 0; struct mssa_arg arg = {0}; arg.rt = rt; arg.mra = mrap; range_tree_walk(rt, metaslab_size_sorted_add, &arg); } ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf, range_seg32_t, metaslab_rangesize32_compare) ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, range_seg64_t, metaslab_rangesize64_compare) /* * Create any block allocator specific components. The current allocators * rely on using both a size-ordered range_tree_t and an array of uint64_t's. */ static void metaslab_rt_create(range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; size_t size; int (*compare) (const void *, const void *); bt_find_in_buf_f bt_find; switch (rt->rt_type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = metaslab_rangesize32_compare; bt_find = metaslab_rt_find_rangesize32_in_buf; break; case RANGE_SEG64: size = sizeof (range_seg64_t); compare = metaslab_rangesize64_compare; bt_find = metaslab_rt_find_rangesize64_in_buf; break; default: panic("Invalid range seg type %d", rt->rt_type); } zfs_btree_create(size_tree, compare, bt_find, size); mrap->mra_floor_shift = metaslab_by_size_min_shift; } static void metaslab_rt_destroy(range_tree_t *rt, void *arg) { (void) rt; metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; zfs_btree_destroy(size_tree); kmem_free(mrap, sizeof (*mrap)); } static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; zfs_btree_add(size_tree, rs); } static void metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; zfs_btree_remove(size_tree, rs); } static void metaslab_rt_vacate(range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; zfs_btree_clear(size_tree); zfs_btree_destroy(size_tree); metaslab_rt_create(rt, arg); } static const range_tree_ops_t metaslab_rt_ops = { .rtop_create = metaslab_rt_create, .rtop_destroy = metaslab_rt_destroy, .rtop_add = metaslab_rt_add, .rtop_remove = metaslab_rt_remove, .rtop_vacate = metaslab_rt_vacate }; /* * ========================================================================== * Common allocator routines * ========================================================================== */ /* * Return the maximum contiguous segment within the metaslab. */ uint64_t metaslab_largest_allocatable(metaslab_t *msp) { zfs_btree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; if (t == NULL) return (0); if (zfs_btree_numnodes(t) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); rs = zfs_btree_last(t, NULL); if (rs == NULL) return (0); return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs, msp->ms_allocatable)); } /* * Return the maximum contiguous segment within the unflushed frees of this * metaslab. */ static uint64_t metaslab_largest_unflushed_free(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if (msp->ms_unflushed_frees == NULL) return (0); if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) metaslab_size_tree_full_load(msp->ms_unflushed_frees); range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, NULL); if (rs == NULL) return (0); /* * When a range is freed from the metaslab, that range is added to * both the unflushed frees and the deferred frees. While the block * will eventually be usable, if the metaslab were loaded the range * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE * txgs had passed. As a result, when attempting to estimate an upper * bound for the largest currently-usable free segment in the * metaslab, we need to not consider any ranges currently in the defer * trees. This algorithm approximates the largest available chunk in * the largest range in the unflushed_frees tree by taking the first * chunk. While this may be a poor estimate, it should only remain so * briefly and should eventually self-correct as frees are no longer * deferred. Similar logic applies to the ms_freed tree. See * metaslab_load() for more details. * * There are two primary sources of inaccuracy in this estimate. Both * are tolerated for performance reasons. The first source is that we * only check the largest segment for overlaps. Smaller segments may * have more favorable overlaps with the other trees, resulting in * larger usable chunks. Second, we only look at the first chunk in * the largest segment; there may be other usable chunks in the * largest segment, but we ignore them. */ uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees); uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart; for (int t = 0; t < TXG_DEFER_SIZE; t++) { uint64_t start = 0; uint64_t size = 0; boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, rsize, &start, &size); if (found) { if (rstart == start) return (0); rsize = start - rstart; } } uint64_t start = 0; uint64_t size = 0; boolean_t found = range_tree_find_in(msp->ms_freed, rstart, rsize, &start, &size); if (found) rsize = start - rstart; return (rsize); } static range_seg_t * metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, uint64_t size, zfs_btree_index_t *where) { range_seg_t *rs; range_seg_max_t rsearch; rs_set_start(&rsearch, rt, start); rs_set_end(&rsearch, rt, start + size); rs = zfs_btree_find(t, &rsearch, where); if (rs == NULL) { rs = zfs_btree_next(t, where, where); } return (rs); } /* * This is a helper function that can be used by the allocator to find a * suitable block to allocate. This will search the specified B-tree looking * for a block that matches the specified criteria. */ static uint64_t metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, uint64_t max_search) { if (*cursor == 0) *cursor = rt->rt_start; zfs_btree_t *bt = &rt->rt_root; zfs_btree_index_t where; range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where); uint64_t first_found; int count_searched = 0; if (rs != NULL) first_found = rs_get_start(rs, rt); while (rs != NULL && (rs_get_start(rs, rt) - first_found <= max_search || count_searched < metaslab_min_search_count)) { uint64_t offset = rs_get_start(rs, rt); if (offset + size <= rs_get_end(rs, rt)) { *cursor = offset + size; return (offset); } rs = zfs_btree_next(bt, &where, &where); count_searched++; } *cursor = 0; return (-1ULL); } static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size); static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size); static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size); metaslab_ops_t *metaslab_allocator(spa_t *spa); static metaslab_ops_t metaslab_allocators[] = { { "dynamic", metaslab_df_alloc }, { "cursor", metaslab_cf_alloc }, { "new-dynamic", metaslab_ndf_alloc }, }; static int spa_find_allocator_byname(const char *val) { int a = ARRAY_SIZE(metaslab_allocators) - 1; if (strcmp("new-dynamic", val) == 0) return (-1); /* remove when ndf is working */ for (; a >= 0; a--) { if (strcmp(val, metaslab_allocators[a].msop_name) == 0) return (a); } return (-1); } void spa_set_allocator(spa_t *spa, const char *allocator) { int a = spa_find_allocator_byname(allocator); if (a < 0) a = 0; spa->spa_active_allocator = a; zfs_dbgmsg("spa allocator: %s", metaslab_allocators[a].msop_name); } int spa_get_allocator(spa_t *spa) { return (spa->spa_active_allocator); } #if defined(_KERNEL) int param_set_active_allocator_common(const char *val) { char *p; if (val == NULL) return (SET_ERROR(EINVAL)); if ((p = strchr(val, '\n')) != NULL) *p = '\0'; int a = spa_find_allocator_byname(val); if (a < 0) return (SET_ERROR(EINVAL)); zfs_active_allocator = metaslab_allocators[a].msop_name; return (0); } #endif metaslab_ops_t * metaslab_allocator(spa_t *spa) { int allocator = spa_get_allocator(spa); return (&metaslab_allocators[allocator]); } /* * ========================================================================== * Dynamic Fit (df) block allocator * * Search for a free chunk of at least this size, starting from the last * offset (for this alignment of block) looking for up to * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not * found within 16MB, then return a free chunk of exactly the requested size (or * larger). * * If it seems like searching from the last offset will be unproductive, skip * that and just return a free chunk of exactly the requested size (or larger). * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This * mechanism is probably not very useful and may be removed in the future. * * The behavior when not searching can be changed to return the largest free * chunk, instead of a free chunk of exactly the requested size, by setting * metaslab_df_use_largest_segment. * ========================================================================== */ static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size) { /* * Find the largest power of 2 block size that evenly divides the * requested size. This is used to try to allocate blocks with similar * alignment from the same area of the metaslab (i.e. same cursor * bucket) but it does not guarantee that other allocations sizes * may exist in the same region. */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; range_tree_t *rt = msp->ms_allocatable; uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size; uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * If we're running low on space, find a segment based on size, * rather than iterating based on offset. */ if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { offset = -1; } else { offset = metaslab_block_picker(rt, cursor, size, metaslab_df_max_search); } if (offset == -1) { range_seg_t *rs; if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); if (metaslab_df_use_largest_segment) { /* use largest free segment */ rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); } else { zfs_btree_index_t where; /* use segment of this size, or next largest */ rs = metaslab_block_find(&msp->ms_allocatable_by_size, rt, msp->ms_start, size, &where); } if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs, rt)) { offset = rs_get_start(rs, rt); *cursor = offset + size; } } return (offset); } /* * ========================================================================== * Cursor fit block allocator - * Select the largest region in the metaslab, set the cursor to the beginning * of the range and the cursor_end to the end of the range. As allocations * are made advance the cursor. Continue allocating from the cursor until * the range is exhausted and then find a new range. * ========================================================================== */ static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { range_tree_t *rt = msp->ms_allocatable; zfs_btree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(*cursor_end, >=, *cursor); if ((*cursor + size) > *cursor_end) { range_seg_t *rs; if (zfs_btree_numnodes(t) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); rs = zfs_btree_last(t, NULL); if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) return (-1ULL); *cursor = rs_get_start(rs, rt); *cursor_end = rs_get_end(rs, rt); } offset = *cursor; *cursor += size; return (offset); } /* * ========================================================================== * New dynamic fit allocator - * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift * contiguous blocks. If no region is found then just use the largest segment * that remains. * ========================================================================== */ /* * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) * to request from the allocator. */ uint64_t metaslab_ndf_clump_shift = 4; static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { zfs_btree_t *t = &msp->ms_allocatable->rt_root; range_tree_t *rt = msp->ms_allocatable; zfs_btree_index_t where; range_seg_t *rs; range_seg_max_t rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; uint64_t max_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); if (max_size < size) return (-1ULL); rs_set_start(&rsearch, rt, *cursor); rs_set_end(&rsearch, rt, *cursor + size); rs = zfs_btree_find(t, &rsearch, &where); if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { t = &msp->ms_allocatable_by_size; rs_set_start(&rsearch, rt, 0); rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + metaslab_ndf_clump_shift))); rs = zfs_btree_find(t, &rsearch, &where); if (rs == NULL) rs = zfs_btree_next(t, &where, &where); ASSERT(rs != NULL); } if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) { *cursor = rs_get_start(rs, rt) + size; return (rs_get_start(rs, rt)); } return (-1ULL); } /* * ========================================================================== * Metaslabs * ========================================================================== */ /* * Wait for any in-progress metaslab loads to complete. */ static void metaslab_load_wait(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); while (msp->ms_loading) { ASSERT(!msp->ms_loaded); cv_wait(&msp->ms_load_cv, &msp->ms_lock); } } /* * Wait for any in-progress flushing to complete. */ static void metaslab_flush_wait(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); while (msp->ms_flushing) cv_wait(&msp->ms_flush_cv, &msp->ms_lock); } static unsigned int metaslab_idx_func(multilist_t *ml, void *arg) { metaslab_t *msp = arg; /* * ms_id values are allocated sequentially, so full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml)); } uint64_t metaslab_allocated_space(metaslab_t *msp) { return (msp->ms_allocated_space); } /* * Verify that the space accounting on disk matches the in-core range_trees. */ static void metaslab_verify_space(metaslab_t *msp, uint64_t txg) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t allocating = 0; uint64_t sm_free_space, msp_free_space; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(!msp->ms_condensing); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can only verify the metaslab space when we're called * from syncing context with a loaded metaslab that has an * allocated space map. Calling this in non-syncing context * does not provide a consistent view of the metaslab since * we're performing allocations in the future. */ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || !msp->ms_loaded) return; /* * Even though the smp_alloc field can get negative, * when it comes to a metaslab's space map, that should * never be the case. */ ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); ASSERT3U(space_map_allocated(msp->ms_sm), >=, range_tree_space(msp->ms_unflushed_frees)); ASSERT3U(metaslab_allocated_space(msp), ==, space_map_allocated(msp->ms_sm) + range_tree_space(msp->ms_unflushed_allocs) - range_tree_space(msp->ms_unflushed_frees)); sm_free_space = msp->ms_size - metaslab_allocated_space(msp); /* * Account for future allocations since we would have * already deducted that space from the ms_allocatable. */ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { allocating += range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, msp->ms_allocating_total); ASSERT3U(msp->ms_deferspace, ==, range_tree_space(msp->ms_defer[0]) + range_tree_space(msp->ms_defer[1])); msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + msp->ms_deferspace + range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); } static void metaslab_aux_histograms_clear(metaslab_t *msp) { /* * Auxiliary histograms are only cleared when resetting them, * which can only happen while the metaslab is loaded. */ ASSERT(msp->ms_loaded); memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); for (int t = 0; t < TXG_DEFER_SIZE; t++) memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t])); } static void metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, range_tree_t *rt) { /* * This is modeled after space_map_histogram_add(), so refer to that * function for implementation details. We want this to work like * the space map histogram, and not the range tree histogram, as we * are essentially constructing a delta that will be later subtracted * from the space map histogram. */ int idx = 0; for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT3U(i, >=, idx + shift); histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { ASSERT3U(idx + shift, ==, i); idx++; ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); } } } /* * Called at every sync pass that the metaslab gets synced. * * The reason is that we want our auxiliary histograms to be updated * wherever the metaslab's space map histogram is updated. This way * we stay consistent on which parts of the metaslab space map's * histogram are currently not available for allocations (e.g because * they are in the defer, freed, and freeing trees). */ static void metaslab_aux_histograms_update(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; ASSERT(sm != NULL); /* * This is similar to the metaslab's space map histogram updates * that take place in metaslab_sync(). The only difference is that * we only care about segments that haven't made it into the * ms_allocatable tree yet. */ if (msp->ms_loaded) { metaslab_aux_histograms_clear(msp); metaslab_aux_histogram_add(msp->ms_synchist, sm->sm_shift, msp->ms_freed); for (int t = 0; t < TXG_DEFER_SIZE; t++) { metaslab_aux_histogram_add(msp->ms_deferhist[t], sm->sm_shift, msp->ms_defer[t]); } } metaslab_aux_histogram_add(msp->ms_synchist, sm->sm_shift, msp->ms_freeing); } /* * Called every time we are done syncing (writing to) the metaslab, * i.e. at the end of each sync pass. * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] */ static void metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; space_map_t *sm = msp->ms_sm; if (sm == NULL) { /* * We came here from metaslab_init() when creating/opening a * pool, looking at a metaslab that hasn't had any allocations * yet. */ return; } /* * This is similar to the actions that we take for the ms_freed * and ms_defer trees in metaslab_sync_done(). */ uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; if (defer_allowed) { memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist, sizeof (msp->ms_synchist)); } else { memset(msp->ms_deferhist[hist_index], 0, sizeof (msp->ms_deferhist[hist_index])); } memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); } /* * Ensure that the metaslab's weight and fragmentation are consistent * with the contents of the histogram (either the range tree's histogram * or the space map's depending whether the metaslab is loaded). */ static void metaslab_verify_weight_and_frag(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can end up here from vdev_remove_complete(), in which case we * cannot do these assertions because we hold spa config locks and * thus we are not allowed to read from the DMU. * * We check if the metaslab group has been removed and if that's * the case we return immediately as that would mean that we are * here from the aforementioned code path. */ if (msp->ms_group == NULL) return; /* * Devices being removed always return a weight of 0 and leave * fragmentation and ms_max_size as is - there is nothing for * us to verify here. */ vdev_t *vd = msp->ms_group->mg_vd; if (vd->vdev_removing) return; /* * If the metaslab is dirty it probably means that we've done * some allocations or frees that have changed our histograms * and thus the weight. */ for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&vd->vdev_ms_list, msp, t)) return; } /* * This verification checks that our in-memory state is consistent * with what's on disk. If the pool is read-only then there aren't * any changes and we just have the initially-loaded state. */ if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) return; /* some extra verification for in-core tree if you can */ if (msp->ms_loaded) { range_tree_stat_verify(msp->ms_allocatable); VERIFY(space_map_histogram_verify(msp->ms_sm, msp->ms_allocatable)); } uint64_t weight = msp->ms_weight; uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); uint64_t frag = msp->ms_fragmentation; uint64_t max_segsize = msp->ms_max_size; msp->ms_weight = 0; msp->ms_fragmentation = 0; /* * This function is used for verification purposes and thus should * not introduce any side-effects/mutations on the system's state. * * Regardless of whether metaslab_weight() thinks this metaslab * should be active or not, we want to ensure that the actual weight * (and therefore the value of ms_weight) would be the same if it * was to be recalculated at this point. * * In addition we set the nodirty flag so metaslab_weight() does * not dirty the metaslab for future TXGs (e.g. when trying to * force condensing to upgrade the metaslab spacemaps). */ msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active; VERIFY3U(max_segsize, ==, msp->ms_max_size); /* * If the weight type changed then there is no point in doing * verification. Revert fields to their original values. */ if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { msp->ms_fragmentation = frag; msp->ms_weight = weight; return; } VERIFY3U(msp->ms_fragmentation, ==, frag); VERIFY3U(msp->ms_weight, ==, weight); } /* * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from * this class that was used longest ago, and attempt to unload it. We don't * want to spend too much time in this loop to prevent performance * degradation, and we expect that most of the time this operation will * succeed. Between that and the normal unloading processing during txg sync, * we expect this to keep the metaslab memory usage under control. */ static void metaslab_potentially_evict(metaslab_class_t *mc) { #ifdef _KERNEL uint64_t allmem = arc_all_memory(); uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache); uint_t tries = 0; for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2; tries++) { unsigned int idx = multilist_get_random_index( &mc->mc_metaslab_txg_list); multilist_sublist_t *mls = multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); metaslab_t *msp = multilist_sublist_head(mls); multilist_sublist_unlock(mls); while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < inuse * size) { VERIFY3P(mls, ==, multilist_sublist_lock_idx( &mc->mc_metaslab_txg_list, idx)); ASSERT3U(idx, ==, metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); if (!multilist_link_active(&msp->ms_class_txg_node)) { multilist_sublist_unlock(mls); break; } metaslab_t *next_msp = multilist_sublist_next(mls, msp); multilist_sublist_unlock(mls); /* * If the metaslab is currently loading there are two * cases. If it's the metaslab we're evicting, we * can't continue on or we'll panic when we attempt to * recursively lock the mutex. If it's another * metaslab that's loading, it can be safely skipped, * since we know it's very new and therefore not a * good eviction candidate. We check later once the * lock is held that the metaslab is fully loaded * before actually unloading it. */ if (msp->ms_loading) { msp = next_msp; inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); continue; } /* * We can't unload metaslabs with no spacemap because * they're not ready to be unloaded yet. We can't * unload metaslabs with outstanding allocations * because doing so could cause the metaslab's weight * to decrease while it's unloaded, which violates an * invariant that we use to prevent unnecessary * loading. We also don't unload metaslabs that are * currently active because they are high-weight * metaslabs that are likely to be used in the near * future. */ mutex_enter(&msp->ms_lock); if (msp->ms_allocator == -1 && msp->ms_sm != NULL && msp->ms_allocating_total == 0) { metaslab_unload(msp); } mutex_exit(&msp->ms_lock); msp = next_msp; inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); } } #else (void) mc, (void) zfs_metaslab_mem_limit; #endif } static int metaslab_load_impl(metaslab_t *msp) { int error = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loading); ASSERT(!msp->ms_condensing); /* * We temporarily drop the lock to unblock other operations while we * are reading the space map. Therefore, metaslab_sync() and * metaslab_sync_done() can run at the same time as we do. * * If we are using the log space maps, metaslab_sync() can't write to * the metaslab's space map while we are loading as we only write to * it when we are flushing the metaslab, and that can't happen while * we are loading it. * * If we are not using log space maps though, metaslab_sync() can * append to the space map while we are loading. Therefore we load * only entries that existed when we started the load. Additionally, * metaslab_sync_done() has to wait for the load to complete because * there are potential races like metaslab_load() loading parts of the * space map that are currently being appended by metaslab_sync(). If * we didn't, the ms_allocatable would have entries that * metaslab_sync_done() would try to re-add later. * * That's why before dropping the lock we remember the synced length * of the metaslab and read up to that point of the space map, * ignoring entries appended by metaslab_sync() that happen after we * drop the lock. */ uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); hrtime_t load_start = gethrtime(); metaslab_rt_arg_t *mrap; if (msp->ms_allocatable->rt_arg == NULL) { mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); } else { mrap = msp->ms_allocatable->rt_arg; msp->ms_allocatable->rt_ops = NULL; msp->ms_allocatable->rt_arg = NULL; } mrap->mra_bt = &msp->ms_allocatable_by_size; mrap->mra_floor_shift = metaslab_by_size_min_shift; if (msp->ms_sm != NULL) { error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, SM_FREE, length); /* Now, populate the size-sorted tree. */ metaslab_rt_create(msp->ms_allocatable, mrap); msp->ms_allocatable->rt_ops = &metaslab_rt_ops; msp->ms_allocatable->rt_arg = mrap; struct mssa_arg arg = {0}; arg.rt = msp->ms_allocatable; arg.mra = mrap; range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add, &arg); } else { /* * Add the size-sorted tree first, since we don't need to load * the metaslab from the spacemap. */ metaslab_rt_create(msp->ms_allocatable, mrap); msp->ms_allocatable->rt_ops = &metaslab_rt_ops; msp->ms_allocatable->rt_arg = mrap; /* * The space map has not been allocated yet, so treat * all the space in the metaslab as free and add it to the * ms_allocatable tree. */ range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); if (msp->ms_new) { /* * If the ms_sm doesn't exist, this means that this * metaslab hasn't gone through metaslab_sync() and * thus has never been dirtied. So we shouldn't * expect any unflushed allocs or frees from previous * TXGs. */ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); } } /* * We need to grab the ms_sync_lock to prevent metaslab_sync() from * changing the ms_sm (or log_sm) and the metaslab's range trees * while we are about to use them and populate the ms_allocatable. * The ms_lock is insufficient for this because metaslab_sync() doesn't * hold the ms_lock while writing the ms_checkpointing tree to disk. */ mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); ASSERT(!msp->ms_condensing); ASSERT(!msp->ms_flushing); if (error != 0) { mutex_exit(&msp->ms_sync_lock); return (error); } ASSERT3P(msp->ms_group, !=, NULL); msp->ms_loaded = B_TRUE; /* * Apply all the unflushed changes to ms_allocatable right * away so any manipulations we do below have a clear view * of what is allocated and what is free. */ range_tree_walk(msp->ms_unflushed_allocs, range_tree_remove, msp->ms_allocatable); range_tree_walk(msp->ms_unflushed_frees, range_tree_add, msp->ms_allocatable); ASSERT3P(msp->ms_group, !=, NULL); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (spa_syncing_log_sm(spa) != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); /* * If we use a log space map we add all the segments * that are in ms_unflushed_frees so they are available * for allocation. * * ms_allocatable needs to contain all free segments * that are ready for allocations (thus not segments * from ms_freeing, ms_freed, and the ms_defer trees). * But if we grab the lock in this code path at a sync * pass later that 1, then it also contains the * segments of ms_freed (they were added to it earlier * in this path through ms_unflushed_frees). So we * need to remove all the segments that exist in * ms_freed from ms_allocatable as they will be added * later in metaslab_sync_done(). * * When there's no log space map, the ms_allocatable * correctly doesn't contain any segments that exist * in ms_freed [see ms_synced_length]. */ range_tree_walk(msp->ms_freed, range_tree_remove, msp->ms_allocatable); } /* * If we are not using the log space map, ms_allocatable * contains the segments that exist in the ms_defer trees * [see ms_synced_length]. Thus we need to remove them * from ms_allocatable as they will be added again in * metaslab_sync_done(). * * If we are using the log space map, ms_allocatable still * contains the segments that exist in the ms_defer trees. * Not because it read them through the ms_sm though. But * because these segments are part of ms_unflushed_frees * whose segments we add to ms_allocatable earlier in this * code path. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_remove, msp->ms_allocatable); } /* * Call metaslab_recalculate_weight_and_sort() now that the * metaslab is loaded so we get the metaslab's real weight. * * Unless this metaslab was created with older software and * has not yet been converted to use segment-based weight, we * expect the new weight to be better or equal to the weight * that the metaslab had while it was not loaded. This is * because the old weight does not take into account the * consolidation of adjacent segments between TXGs. [see * comment for ms_synchist and ms_deferhist[] for more info] */ uint64_t weight = msp->ms_weight; uint64_t max_size = msp->ms_max_size; metaslab_recalculate_weight_and_sort(msp); if (!WEIGHT_IS_SPACEBASED(weight)) ASSERT3U(weight, <=, msp->ms_weight); msp->ms_max_size = metaslab_largest_allocatable(msp); ASSERT3U(max_size, <=, msp->ms_max_size); hrtime_t load_end = gethrtime(); msp->ms_load_time = load_end; zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, smp_length %llu, " "unflushed_allocs %llu, unflushed_frees %llu, " "freed %llu, defer %llu + %llu, unloaded time %llu ms, " "loading_time %lld ms, ms_max_size %llu, " "max size error %lld, " "old_weight %llx, new_weight %llx", (u_longlong_t)spa_syncing_txg(spa), spa_name(spa), (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)space_map_length(msp->ms_sm), (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs), (u_longlong_t)range_tree_space(msp->ms_unflushed_frees), (u_longlong_t)range_tree_space(msp->ms_freed), (u_longlong_t)range_tree_space(msp->ms_defer[0]), (u_longlong_t)range_tree_space(msp->ms_defer[1]), (longlong_t)((load_start - msp->ms_unload_time) / 1000000), (longlong_t)((load_end - load_start) / 1000000), (u_longlong_t)msp->ms_max_size, (u_longlong_t)msp->ms_max_size - max_size, (u_longlong_t)weight, (u_longlong_t)msp->ms_weight); metaslab_verify_space(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_sync_lock); return (0); } int metaslab_load(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * There may be another thread loading the same metaslab, if that's * the case just wait until the other thread is done and return. */ metaslab_load_wait(msp); if (msp->ms_loaded) return (0); VERIFY(!msp->ms_loading); ASSERT(!msp->ms_condensing); /* * We set the loading flag BEFORE potentially dropping the lock to * wait for an ongoing flush (see ms_flushing below). This way other * threads know that there is already a thread that is loading this * metaslab. */ msp->ms_loading = B_TRUE; /* * Wait for any in-progress flushing to finish as we drop the ms_lock * both here (during space_map_load()) and in metaslab_flush() (when * we flush our changes to the ms_sm). */ if (msp->ms_flushing) metaslab_flush_wait(msp); /* * In the possibility that we were waiting for the metaslab to be * flushed (where we temporarily dropped the ms_lock), ensure that * no one else loaded the metaslab somehow. */ ASSERT(!msp->ms_loaded); /* * If we're loading a metaslab in the normal class, consider evicting * another one to keep our memory usage under the limit defined by the * zfs_metaslab_mem_limit tunable. */ if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == msp->ms_group->mg_class) { metaslab_potentially_evict(msp->ms_group->mg_class); } int error = metaslab_load_impl(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); msp->ms_loading = B_FALSE; cv_broadcast(&msp->ms_load_cv); return (error); } void metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * This can happen if a metaslab is selected for eviction (in * metaslab_potentially_evict) and then unloaded during spa_sync (via * metaslab_class_evict_old). */ if (!msp->ms_loaded) return; range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; msp->ms_unload_time = gethrtime(); msp->ms_activation_weight = 0; msp->ms_weight &= ~METASLAB_ACTIVE_MASK; if (msp->ms_group != NULL) { metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); multilist_sublist_unlock(mls); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, weight %llx, " "selected txg %llu (%llu ms ago), alloc_txg %llu, " "loaded %llu ms ago, max_size %llu", (u_longlong_t)spa_syncing_txg(spa), spa_name(spa), (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_weight, (u_longlong_t)msp->ms_selected_txg, (u_longlong_t)(msp->ms_unload_time - msp->ms_selected_time) / 1000 / 1000, (u_longlong_t)msp->ms_alloc_txg, (u_longlong_t)(msp->ms_unload_time - msp->ms_load_time) / 1000 / 1000, (u_longlong_t)msp->ms_max_size); } /* * We explicitly recalculate the metaslab's weight based on its space * map (as it is now not loaded). We want unload metaslabs to always * have their weights calculated from the space map histograms, while * loaded ones have it calculated from their in-core range tree * [see metaslab_load()]. This way, the weight reflects the information * available in-core, whether it is loaded or not. * * If ms_group == NULL means that we came here from metaslab_fini(), * at which point it doesn't make sense for us to do the recalculation * and the sorting. */ if (msp->ms_group != NULL) metaslab_recalculate_weight_and_sort(msp); } /* * We want to optimize the memory use of the per-metaslab range * trees. To do this, we store the segments in the range trees in * units of sectors, zero-indexing from the start of the metaslab. If * the vdev_ms_shift - the vdev_ashift is less than 32, we can store * the ranges using two uint32_ts, rather than two uint64_ts. */ range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, uint64_t *start, uint64_t *shift) { if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && !zfs_metaslab_force_large_segs) { *shift = vdev->vdev_ashift; *start = msp->ms_start; return (RANGE_SEG32); } else { *shift = 0; *start = 0; return (RANGE_SEG64); } } void metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) { ASSERT(MUTEX_HELD(&msp->ms_lock)); metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (multilist_link_active(&msp->ms_class_txg_node)) multilist_sublist_remove(mls, msp); msp->ms_selected_txg = txg; msp->ms_selected_time = gethrtime(); multilist_sublist_insert_tail(mls, msp); multilist_sublist_unlock(mls); } void metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { vdev_space_update(vd, alloc_delta, defer_delta, space_delta); ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); ASSERT(vd->vdev_ms_count != 0); metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, vdev_deflated_space(vd, space_delta)); } int metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, metaslab_t **msp) { vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; metaslab_t *ms; int error; ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); multilist_link_init(&ms->ms_class_txg_node); ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; ms->ms_allocator = -1; ms->ms_new = B_TRUE; vdev_ops_t *ops = vd->vdev_ops; if (ops->vdev_op_metaslab_init != NULL) ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. For * readonly pools there is no need to open the space map object. * * Note: * When called from vdev_expand(), we can't call into the DMU as * we are holding the spa_config_lock as a writer and we would * deadlock [see relevant comment in vdev_metaslab_init()]. in * that case, the object parameter is zero though, so we won't * call into the DMU. */ if (object != 0 && !(spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps)) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, ms->ms_size, vd->vdev_ashift); if (error != 0) { kmem_free(ms, sizeof (metaslab_t)); return (error); } ASSERT(ms->ms_sm != NULL); ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } uint64_t shift, start; range_seg_type_t type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_SIZE; t++) { ms->ms_allocating[t] = range_tree_create(NULL, type, NULL, start, shift); } ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift); ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { ms->ms_defer[t] = range_tree_create(NULL, type, NULL, start, shift); } ms->ms_checkpointing = range_tree_create(NULL, type, NULL, start, shift); ms->ms_unflushed_allocs = range_tree_create(NULL, type, NULL, start, shift); metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); mrap->mra_bt = &ms->ms_unflushed_frees_by_size; mrap->mra_floor_shift = metaslab_by_size_min_shift; ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, type, mrap, start, shift); ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms, B_FALSE); /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. * If we're adding space to an existing pool, the new space * does not become available until after this txg has synced. * The metaslab's weight will also be initialized when we sync * out this txg. This ensures that we don't attempt to allocate * from it before we have initialized it completely. */ if (txg <= TXG_INITIAL) { metaslab_sync_done(ms, 0); metaslab_space_update(vd, mg->mg_class, metaslab_allocated_space(ms), 0, 0); } if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); vdev_dirty(vd, VDD_METASLAB, ms, txg); } *msp = ms; return (0); } static void metaslab_fini_flush_data(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (metaslab_unflushed_txg(msp) == 0) { ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, NULL); return; } ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp), metaslab_unflushed_dirty(msp)); } uint64_t metaslab_unflushed_changes_memused(metaslab_t *ms) { return ((range_tree_numsegs(ms->ms_unflushed_allocs) + range_tree_numsegs(ms->ms_unflushed_frees)) * ms->ms_unflushed_allocs->rt_root.bt_elem_size); } void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; metaslab_fini_flush_data(msp); metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); /* * If this metaslab hasn't been through metaslab_sync_done() yet its * space hasn't been accounted for in its vdev and doesn't need to be * subtracted. */ if (!msp->ms_new) { metaslab_space_update(vd, mg->mg_class, -metaslab_allocated_space(msp), 0, -msp->ms_size); } space_map_close(msp->ms_sm); msp->ms_sm = NULL; metaslab_unload(msp); range_tree_destroy(msp->ms_allocatable); range_tree_destroy(msp->ms_freeing); range_tree_destroy(msp->ms_freed); ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); range_tree_destroy(msp->ms_unflushed_allocs); range_tree_destroy(msp->ms_checkpointing); range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); range_tree_destroy(msp->ms_unflushed_frees); for (int t = 0; t < TXG_SIZE; t++) { range_tree_destroy(msp->ms_allocating[t]); } for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_destroy(msp->ms_defer[t]); } ASSERT0(msp->ms_deferspace); for (int t = 0; t < TXG_SIZE; t++) ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); range_tree_vacate(msp->ms_trim, NULL, NULL); range_tree_destroy(msp->ms_trim); mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); cv_destroy(&msp->ms_flush_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); ASSERT3U(msp->ms_allocator, ==, -1); kmem_free(msp, sizeof (metaslab_t)); } #define FRAGMENTATION_TABLE_SIZE 17 /* * This table defines a segment size based fragmentation metric that will * allow each metaslab to derive its own fragmentation value. This is done * by calculating the space in each bucket of the spacemap histogram and * multiplying that by the fragmentation metric in this table. Doing * this for all buckets and dividing it by the total amount of free * space in this metaslab (i.e. the total free space in all buckets) gives * us the fragmentation metric. This means that a high fragmentation metric * equates to most of the free space being comprised of small segments. * Conversely, if the metric is low, then most of the free space is in * large segments. A 10% change in fragmentation equates to approximately * double the number of segments. * * This table defines 0% fragmented space using 16MB segments. Testing has * shown that segments that are greater than or equal to 16MB do not suffer * from drastic performance problems. Using this value, we derive the rest * of the table. Since the fragmentation value is never stored on disk, it * is possible to change these calculations in the future. */ static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 100, /* 512B */ 100, /* 1K */ 98, /* 2K */ 95, /* 4K */ 90, /* 8K */ 80, /* 16K */ 70, /* 32K */ 60, /* 64K */ 50, /* 128K */ 40, /* 256K */ 30, /* 512K */ 20, /* 1M */ 15, /* 2M */ 10, /* 4M */ 5, /* 8M */ 0 /* 16M */ }; /* * Calculate the metaslab's fragmentation metric and set ms_fragmentation. * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not * been upgraded and does not support this metric. Otherwise, the return * value should be in the range [0, 100]. */ static void metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; uint64_t fragmentation = 0; uint64_t total = 0; boolean_t feature_enabled = spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM); if (!feature_enabled) { msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } /* * A null space map means that the entire metaslab is free * and thus is not fragmented. */ if (msp->ms_sm == NULL) { msp->ms_fragmentation = 0; return; } /* * If this metaslab's space map has not been upgraded, flag it * so that we upgrade next time we encounter it. */ if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { uint64_t txg = spa_syncing_txg(spa); vdev_t *vd = msp->ms_group->mg_vd; /* * If we've reached the final dirty txg, then we must * be shutting down the pool. We don't want to dirty * any data past this point so skip setting the condense * flag. We can retry this action the next time the pool * is imported. We also skip marking this metaslab for * condensing if the caller has explicitly set nodirty. */ if (!nodirty && spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { msp->ms_condense_wanted = B_TRUE; vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); zfs_dbgmsg("txg %llu, requesting force condense: " "ms_id %llu, vdev_id %llu", (u_longlong_t)txg, (u_longlong_t)msp->ms_id, (u_longlong_t)vd->vdev_id); } msp->ms_fragmentation = ZFS_FRAG_INVALID; return; } for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { uint64_t space = 0; uint8_t shift = msp->ms_sm->sm_shift; int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, FRAGMENTATION_TABLE_SIZE - 1); if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) continue; space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); total += space; ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); fragmentation += space * zfs_frag_table[idx]; } if (total > 0) fragmentation /= total; ASSERT3U(fragmentation, <=, 100); msp->ms_fragmentation = fragmentation; } /* * Compute a weight -- a selection preference value -- for the given metaslab. * This is based on the amount of free space, the level of fragmentation, * the LBA range, and whether the metaslab is loaded. */ static uint64_t metaslab_space_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The baseline weight is the metaslab's free space. */ space = msp->ms_size - metaslab_allocated_space(msp); if (metaslab_fragmentation_factor_enabled && msp->ms_fragmentation != ZFS_FRAG_INVALID) { /* * Use the fragmentation information to inversely scale * down the baseline weight. We need to ensure that we * don't exclude this metaslab completely when it's 100% * fragmented. To avoid this we reduce the fragmented value * by 1. */ space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; /* * If space < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. The fragmentation metric may have * decreased the space to something smaller than * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE * so that we can consume any remaining space. */ if (space > 0 && space < SPA_MINBLOCKSIZE) space = SPA_MINBLOCKSIZE; } weight = space; /* * Modern disks have uniform bit density and constant angular velocity. * Therefore, the outer recording zones are faster (higher bandwidth) * than the inner zones by the ratio of outer to inner track diameter, * which is typically around 2:1. We account for this by assigning * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. */ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; ASSERT(weight >= space && weight <= 2 * space); } /* * If this metaslab is one we're actively using, adjust its * weight to make it preferable to any inactive metaslab so * we'll polish it off. If the fragmentation on this metaslab * has exceed our threshold, then don't mark it active. */ if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); } WEIGHT_SET_SPACEBASED(weight); return (weight); } /* * Return the weight of the specified metaslab, according to the segment-based * weighting algorithm. The metaslab must be loaded. This function can * be called within a sync pass since it relies only on the metaslab's * range tree which is always accurate when the metaslab is loaded. */ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp) { uint64_t weight = 0; uint32_t segments = 0; ASSERT(msp->ms_loaded); for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; i--) { uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; segments <<= 1; segments += msp->ms_allocatable->rt_histogram[i]; /* * The range tree provides more precision than the space map * and must be downgraded so that all values fit within the * space map's histogram. This allows us to compare loaded * vs. unloaded metaslabs to determine which metaslab is * considered "best". */ if (i > max_idx) continue; if (segments != 0) { WEIGHT_SET_COUNT(weight, segments); WEIGHT_SET_INDEX(weight, i); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Calculate the weight based on the on-disk histogram. Should be applied * only to unloaded metaslabs (i.e no incoming allocations) in-order to * give results consistent with the on-disk state */ static uint64_t metaslab_weight_from_spacemap(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; ASSERT(!msp->ms_loaded); ASSERT(sm != NULL); ASSERT3U(space_map_object(sm), !=, 0); ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); /* * Create a joint histogram from all the segments that have made * it to the metaslab's space map histogram, that are not yet * available for allocation because they are still in the freeing * pipeline (e.g. freeing, freed, and defer trees). Then subtract * these segments from the space map's histogram to get a more * accurate weight. */ uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0}; for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) deferspace_histogram[i] += msp->ms_synchist[i]; for (int t = 0; t < TXG_DEFER_SIZE; t++) { for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { deferspace_histogram[i] += msp->ms_deferhist[t][i]; } } uint64_t weight = 0; for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { ASSERT3U(sm->sm_phys->smp_histogram[i], >=, deferspace_histogram[i]); uint64_t count = sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; if (count != 0) { WEIGHT_SET_COUNT(weight, count); WEIGHT_SET_INDEX(weight, i + sm->sm_shift); WEIGHT_SET_ACTIVE(weight, 0); break; } } return (weight); } /* * Compute a segment-based weight for the specified metaslab. The weight * is determined by highest bucket in the histogram. The information * for the highest bucket is encoded into the weight value. */ static uint64_t metaslab_segment_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; uint64_t weight = 0; uint8_t shift = mg->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The metaslab is completely free. */ if (metaslab_allocated_space(msp) == 0) { int idx = highbit64(msp->ms_size) - 1; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; if (idx < max_idx) { WEIGHT_SET_COUNT(weight, 1ULL); WEIGHT_SET_INDEX(weight, idx); } else { WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); WEIGHT_SET_INDEX(weight, max_idx); } WEIGHT_SET_ACTIVE(weight, 0); ASSERT(!WEIGHT_IS_SPACEBASED(weight)); return (weight); } ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); /* * If the metaslab is fully allocated then just make the weight 0. */ if (metaslab_allocated_space(msp) == msp->ms_size) return (0); /* * If the metaslab is already loaded, then use the range tree to * determine the weight. Otherwise, we rely on the space map information * to generate the weight. */ if (msp->ms_loaded) { weight = metaslab_weight_from_range_tree(msp); } else { weight = metaslab_weight_from_spacemap(msp); } /* * If the metaslab was active the last time we calculated its weight * then keep it active. We want to consume the entire region that * is associated with this weight. */ if (msp->ms_activation_weight != 0 && weight != 0) WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); return (weight); } /* * Determine if we should attempt to allocate from this metaslab. If the * metaslab is loaded, then we can determine if the desired allocation * can be satisfied by looking at the size of the maximum free segment * on that metaslab. Otherwise, we make our decision based on the metaslab's * weight. For segment-based weighting we can determine the maximum * allocation based on the index encoded in its value. For space-based * weights we rely on the entire weight (excluding the weight-type bit). */ static boolean_t metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) { /* * This case will usually but not always get caught by the checks below; * metaslabs can be loaded by various means, including the trim and * initialize code. Once that happens, without this check they are * allocatable even before they finish their first txg sync. */ if (unlikely(msp->ms_new)) return (B_FALSE); /* * If the metaslab is loaded, ms_max_size is definitive and we can use * the fast check. If it's not, the ms_max_size is a lower bound (once * set), and we should use the fast check as long as we're not in * try_hard and it's been less than zfs_metaslab_max_size_cache_sec * seconds since the metaslab was unloaded. */ if (msp->ms_loaded || (msp->ms_max_size != 0 && !try_hard && gethrtime() < msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) return (msp->ms_max_size >= asize); boolean_t should_allocate; if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { /* * The metaslab segment weight indicates segments in the * range [2^i, 2^(i+1)), where i is the index in the weight. * Since the asize might be in the middle of the range, we * should attempt the allocation if asize < 2^(i+1). */ should_allocate = (asize < 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); } else { should_allocate = (asize <= (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); } return (should_allocate); } static uint64_t metaslab_weight(metaslab_t *msp, boolean_t nodirty) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; uint64_t weight; ASSERT(MUTEX_HELD(&msp->ms_lock)); metaslab_set_fragmentation(msp, nodirty); /* * Update the maximum size. If the metaslab is loaded, this will * ensure that we get an accurate maximum size if newly freed space * has been added back into the free tree. If the metaslab is * unloaded, we check if there's a larger free segment in the * unflushed frees. This is a lower bound on the largest allocatable * segment size. Coalescing of adjacent entries may reveal larger * allocatable segments, but we aren't aware of those until loading * the space map into a range tree. */ if (msp->ms_loaded) { msp->ms_max_size = metaslab_largest_allocatable(msp); } else { msp->ms_max_size = MAX(msp->ms_max_size, metaslab_largest_unflushed_free(msp)); } /* * Segment-based weighting requires space map histogram support. */ if (zfs_metaslab_segment_weight_enabled && spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == sizeof (space_map_phys_t))) { weight = metaslab_segment_weight(msp); } else { weight = metaslab_space_weight(msp); } return (weight); } void metaslab_recalculate_weight_and_sort(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* note: we preserve the mask (e.g. indication of primary, etc..) */ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(msp->ms_group, msp, metaslab_weight(msp, B_FALSE) | was_active); } static int metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, int allocator, uint64_t activation_weight) { metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * If we're activating for the claim code, we don't want to actually * set the metaslab up for a specific allocator. */ if (activation_weight == METASLAB_WEIGHT_CLAIM) { ASSERT0(msp->ms_activation_weight); msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort(mg, msp, msp->ms_weight | activation_weight); return (0); } metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ? &mga->mga_primary : &mga->mga_secondary); mutex_enter(&mg->mg_lock); if (*mspp != NULL) { mutex_exit(&mg->mg_lock); return (EEXIST); } *mspp = msp; ASSERT3S(msp->ms_allocator, ==, -1); msp->ms_allocator = allocator; msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); ASSERT0(msp->ms_activation_weight); msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort_impl(mg, msp, msp->ms_weight | activation_weight); mutex_exit(&mg->mg_lock); return (0); } static int metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); /* * The current metaslab is already activated for us so there * is nothing to do. Already activated though, doesn't mean * that this metaslab is activated for our allocator nor our * requested activation weight. The metaslab could have started * as an active one for our allocator but changed allocators * while we were waiting to grab its ms_lock or we stole it * [see find_valid_metaslab()]. This means that there is a * possibility of passivating a metaslab of another allocator * or from a different activation mask, from this thread. */ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { ASSERT(msp->ms_loaded); return (0); } int error = metaslab_load(msp); if (error != 0) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } /* * When entering metaslab_load() we may have dropped the * ms_lock because we were loading this metaslab, or we * were waiting for another thread to load it for us. In * that scenario, we recheck the weight of the metaslab * to see if it was activated by another thread. * * If the metaslab was activated for another allocator or * it was activated with a different activation weight (e.g. * we wanted to make it a primary but it was activated as * secondary) we return error (EBUSY). * * If the metaslab was activated for the same allocator * and requested activation mask, skip activating it. */ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { if (msp->ms_allocator != allocator) return (EBUSY); if ((msp->ms_weight & activation_weight) == 0) return (SET_ERROR(EBUSY)); EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY), msp->ms_primary); return (0); } /* * If the metaslab has literally 0 space, it will have weight 0. In * that case, don't bother activating it. This can happen if the * metaslab had space during find_valid_metaslab, but another thread * loaded it and used all that space while we were waiting to grab the * lock. */ if (msp->ms_weight == 0) { ASSERT0(range_tree_space(msp->ms_allocatable)); return (SET_ERROR(ENOSPC)); } if ((error = metaslab_activate_allocator(msp->ms_group, msp, allocator, activation_weight)) != 0) { return (error); } ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); return (0); } static void metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { metaslab_group_sort(mg, msp, weight); return; } mutex_enter(&mg->mg_lock); ASSERT3P(msp->ms_group, ==, mg); ASSERT3S(0, <=, msp->ms_allocator); ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator]; if (msp->ms_primary) { ASSERT3P(mga->mga_primary, ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); mga->mga_primary = NULL; } else { ASSERT3P(mga->mga_secondary, ==, msp); ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); mga->mga_secondary = NULL; } msp->ms_allocator = -1; metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE; /* * If size < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. */ ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_allocatable) == 0); ASSERT0(weight & METASLAB_ACTIVE_MASK); ASSERT(msp->ms_activation_weight != 0); msp->ms_activation_weight = 0; metaslab_passivate_allocator(msp->ms_group, msp, weight); ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); } /* * Segment-based metaslabs are activated once and remain active until * we either fail an allocation attempt (similar to space-based metaslabs) * or have exhausted the free space in zfs_metaslab_switch_threshold * buckets since the metaslab was activated. This function checks to see * if we've exhausted the zfs_metaslab_switch_threshold buckets in the * metaslab and passivates it proactively. This will allow us to select a * metaslab with a larger contiguous region, if any, remaining within this * metaslab group. If we're in sync pass > 1, then we continue using this * metaslab so that we don't dirty more block and cause more sync passes. */ static void metaslab_segment_may_passivate(metaslab_t *msp) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) return; /* * Since we are in the middle of a sync pass, the most accurate * information that is accessible to us is the in-core range tree * histogram; calculate the new weight based on that information. */ uint64_t weight = metaslab_weight_from_range_tree(msp); int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); int current_idx = WEIGHT_GET_INDEX(weight); if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) metaslab_passivate(msp, weight); } static void metaslab_preload(void *arg) { metaslab_t *msp = arg; metaslab_class_t *mc = msp->ms_group->mg_class; spa_t *spa = mc->mc_spa; fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); mutex_enter(&msp->ms_lock); (void) metaslab_load(msp); metaslab_set_selected_txg(msp, spa_syncing_txg(spa)); mutex_exit(&msp->ms_lock); spl_fstrans_unmark(cookie); } static void metaslab_group_preload(metaslab_group_t *mg) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_t *msp; avl_tree_t *t = &mg->mg_metaslab_tree; int m = 0; if (spa_shutting_down(spa) || !metaslab_preload_enabled) return; mutex_enter(&mg->mg_lock); /* * Load the next potential metaslabs */ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { ASSERT3P(msp->ms_group, ==, mg); /* * We preload only the maximum number of metaslabs specified * by metaslab_preload_limit. If a metaslab is being forced * to condense then we preload it too. This will ensure * that force condensing happens in the next txg. */ if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { continue; } VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload, msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0)) != TASKQID_INVALID); } mutex_exit(&mg->mg_lock); } /* * Determine if the space map's on-disk footprint is past our tolerance for * inefficiency. We would like to use the following criteria to make our * decision: * * 1. Do not condense if the size of the space map object would dramatically * increase as a result of writing out the free space range tree. * * 2. Condense if the on on-disk space map representation is at least * zfs_condense_pct/100 times the size of the optimal representation * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB). * * 3. Do not condense if the on-disk size of the space map does not actually * decrease. * * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. * Instead, we apply the heuristic described in the block comment for * zfs_metaslab_condense_block_threshold - we only condense if the space used * is greater than a threshold number of blocks. */ static boolean_t metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); ASSERT(sm != NULL); ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); /* * We always condense metaslabs that are empty and metaslabs for * which a condense request has been made. */ if (range_tree_numsegs(msp->ms_allocatable) == 0 || msp->ms_condense_wanted) return (B_TRUE); uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); uint64_t object_size = space_map_length(sm); uint64_t optimal_size = space_map_estimate_optimal_size(sm, msp->ms_allocatable, SM_NO_VDEVID); return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } /* * Condense the on-disk space map representation to its minimized form. * The minimized form consists of a small number of allocations followed * by the entries of the free range tree (ms_allocatable). The condensed * spacemap contains all the entries of previous TXGs (including those in * the pool-wide log spacemaps; thus this is effectively a superset of * metaslab_flush()), but this TXG's entries still need to be written. */ static void metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) { range_tree_t *condense_tree; space_map_t *sm = msp->ms_sm; uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); ASSERT(msp->ms_sm != NULL); /* * In order to condense the space map, we need to change it so it * only describes which segments are currently allocated and free. * * All the current free space resides in the ms_allocatable, all * the ms_defer trees, and all the ms_allocating trees. We ignore * ms_freed because it is empty because we're in sync pass 1. We * ignore ms_freeing because these changes are not yet reflected * in the spacemap (they will be written later this txg). * * So to truncate the space map to represent all the entries of * previous TXGs we do the following: * * 1] We create a range tree (condense tree) that is 100% empty. * 2] We add to it all segments found in the ms_defer trees * as those segments are marked as free in the original space * map. We do the same with the ms_allocating trees for the same * reason. Adding these segments should be a relatively * inexpensive operation since we expect these trees to have a * small number of nodes. * 3] We vacate any unflushed allocs, since they are not frees we * need to add to the condense tree. Then we vacate any * unflushed frees as they should already be part of ms_allocatable. * 4] At this point, we would ideally like to add all segments * in the ms_allocatable tree from the condense tree. This way * we would write all the entries of the condense tree as the * condensed space map, which would only contain freed * segments with everything else assumed to be allocated. * * Doing so can be prohibitively expensive as ms_allocatable can * be large, and therefore computationally expensive to add to * the condense_tree. Instead we first sync out an entry marking * everything as allocated, then the condense_tree and then the * ms_allocatable, in the condensed space map. While this is not * optimal, it is typically close to optimal and more importantly * much cheaper to compute. * * 5] Finally, as both of the unflushed trees were written to our * new and condensed metaslab space map, we basically flushed * all the unflushed changes to disk, thus we call * metaslab_flush_update(). */ ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, " "spa %s, smp size %llu, segments %llu, forcing condense=%s", (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp, (u_longlong_t)msp->ms_group->mg_vd->vdev_id, spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm), (u_longlong_t)range_tree_numsegs(msp->ms_allocatable), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; range_seg_type_t type; uint64_t shift, start; type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, &start, &shift); condense_tree = range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], range_tree_add, condense_tree); } for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], range_tree_add, condense_tree); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); /* * We're about to drop the metaslab's lock thus allowing other * consumers to change it's content. Set the metaslab's ms_condensing * flag to ensure that allocations on this metaslab do not occur * while we're in the middle of committing it to disk. This is only * critical for ms_allocatable as all other range trees use per TXG * views of their content. */ msp->ms_condensing = B_TRUE; mutex_exit(&msp->ms_lock); uint64_t object = space_map_object(msp->ms_sm); space_map_truncate(sm, spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); /* * space_map_truncate() may have reallocated the spacemap object. * If so, update the vdev_ms_array. */ if (space_map_object(msp->ms_sm) != object) { object = space_map_object(msp->ms_sm); dmu_write(spa->spa_meta_objset, msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &object, tx); } /* * Note: * When the log space map feature is enabled, each space map will * always have ALLOCS followed by FREES for each sync pass. This is * typically true even when the log space map feature is disabled, * except from the case where a metaslab goes through metaslab_sync() * and gets condensed. In that case the metaslab's space map will have * ALLOCS followed by FREES (due to condensing) followed by ALLOCS * followed by FREES (due to space_map_write() in metaslab_sync()) for * sync pass 1. */ range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start, shift); range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx); range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); range_tree_vacate(tmp_tree, NULL, NULL); range_tree_destroy(tmp_tree); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; metaslab_flush_update(msp, tx); } static void metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); mutex_enter(&spa->spa_flushed_ms_lock); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); metaslab_set_unflushed_dirty(msp, B_TRUE); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_increment_current_mscount(spa); spa_log_summary_add_flushed_metaslab(spa, B_TRUE); } void metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); /* update metaslab's position in our flushing tree */ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); metaslab_set_unflushed_dirty(msp, dirty); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); /* update metaslab counts of spa_log_sm_t nodes */ spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); spa_log_sm_increment_current_mscount(spa); /* update log space map summary */ spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg, ms_prev_flushed_dirty); spa_log_summary_add_flushed_metaslab(spa, dirty); /* cleanup obsolete logs if any */ spa_cleanup_old_sm_logs(spa, tx); } /* * Called when the metaslab has been flushed (its own spacemap now reflects * all the contents of the pool-wide spacemap log). Updates the metaslab's * metadata and any pool-wide related log space map data (e.g. summary, * obsolete logs, etc..) to reflect that. */ static void metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) { metaslab_group_t *mg = msp->ms_group; spa_t *spa = mg->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(spa_sync_pass(spa), ==, 1); /* * Just because a metaslab got flushed, that doesn't mean that * it will pass through metaslab_sync_done(). Thus, make sure to * update ms_synced_length here in case it doesn't. */ msp->ms_synced_length = space_map_length(msp->ms_sm); /* * We may end up here from metaslab_condense() without the * feature being active. In that case this is a no-op. */ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) || metaslab_unflushed_txg(msp) == 0) return; metaslab_unflushed_bump(msp, tx, B_FALSE); } boolean_t metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); /* * There is nothing wrong with flushing the same metaslab twice, as * this codepath should work on that case. However, the current * flushing scheme makes sure to avoid this situation as we would be * making all these calls without having anything meaningful to write * to disk. We assert this behavior here. */ ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx)); /* * We can not flush while loading, because then we would * not load the ms_unflushed_{allocs,frees}. */ if (msp->ms_loading) return (B_FALSE); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); /* * Metaslab condensing is effectively flushing. Therefore if the * metaslab can be condensed we can just condense it instead of * flushing it. * * Note that metaslab_condense() does call metaslab_flush_update() * so we can just return immediately after condensing. We also * don't need to care about setting ms_flushing or broadcasting * ms_flush_cv, even if we temporarily drop the ms_lock in * metaslab_condense(), as the metaslab is already loaded. */ if (msp->ms_loaded && metaslab_should_condense(msp)) { metaslab_group_t *mg = msp->ms_group; /* * For all histogram operations below refer to the * comments of metaslab_sync() where we follow a * similar procedure. */ metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); metaslab_condense(msp, tx); space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); ASSERT(range_tree_is_empty(msp->ms_freed)); for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, msp->ms_defer[t], tx); } metaslab_aux_histograms_update(msp); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); /* * Since we recreated the histogram (and potentially * the ms_sm too while condensing) ensure that the * weight is updated too because we are not guaranteed * that this metaslab is dirty and will go through * metaslab_sync_done(). */ metaslab_recalculate_weight_and_sort(msp); return (B_TRUE); } msp->ms_flushing = B_TRUE; uint64_t sm_len_before = space_map_length(msp->ms_sm); mutex_exit(&msp->ms_lock); space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); uint64_t sm_len_after = space_map_length(msp->ms_sm); if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx), spa_name(spa), (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs), (u_longlong_t)range_tree_space(msp->ms_unflushed_frees), (u_longlong_t)(sm_len_after - sm_len_before)); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); metaslab_flush_update(msp, tx); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); msp->ms_flushing = B_FALSE; cv_broadcast(&msp->ms_flush_cv); return (B_TRUE); } /* * Write a metaslab to disk in the context of the specified transaction group. */ void metaslab_sync(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; dmu_tx_t *tx; ASSERT(!vd->vdev_ishole); /* * This metaslab has just been added so there's no work to do now. */ if (msp->ms_new) { ASSERT0(range_tree_space(alloctree)); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_freed)); ASSERT0(range_tree_space(msp->ms_checkpointing)); ASSERT0(range_tree_space(msp->ms_trim)); return; } /* * Normally, we don't want to process a metaslab if there are no * allocations or frees to perform. However, if the metaslab is being * forced to condense, it's loaded and we're not beyond the final * dirty txg, we need to let it through. Not condensing beyond the * final dirty txg prevents an issue where metaslabs that need to be * condensed but were loaded for other reasons could cause a panic * here. By only checking the txg in that branch of the conditional, * we preserve the utility of the VERIFY statements in all other * cases. */ if (range_tree_is_empty(alloctree) && range_tree_is_empty(msp->ms_freeing) && range_tree_is_empty(msp->ms_checkpointing) && !(msp->ms_loaded && msp->ms_condense_wanted && txg <= spa_final_dirty_txg(spa))) return; VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); /* * The only state that can actually be changing concurrently * with metaslab_sync() is the metaslab's ms_allocatable. No * other thread can be modifying this txg's alloc, freeing, * freed, or space_map_phys_t. We drop ms_lock whenever we * could call into the DMU, because the DMU can call down to * us (e.g. via zio_free()) at any time. * * The spa_vdev_remove_thread() can be reading metaslab state * concurrently, and it is locked out by the ms_sync_lock. * Note that the ms_lock is insufficient for this, because it * is dropped by space_map_write(). */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); /* * Generate a log space map if one doesn't exist already. */ spa_generate_syncing_log_sm(spa, tx); if (msp->ms_sm == NULL) { uint64_t new_object = space_map_alloc(mos, spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ? zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx); VERIFY3U(new_object, !=, 0); dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * msp->ms_id, sizeof (uint64_t), &new_object, tx); VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, msp->ms_start, msp->ms_size, vd->vdev_ashift)); ASSERT(msp->ms_sm != NULL); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); ASSERT0(metaslab_allocated_space(msp)); } if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); uint64_t new_object = space_map_alloc(mos, zfs_vdev_standard_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * We save the space map object as an entry in vdev_top_zap * so it can be retrieved when the pool is reopened after an * export or through zdb. */ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (new_object), 1, &new_object, tx)); } mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * Note: metaslab_condense() clears the space map's histogram. * Therefore we must verify and remove this histogram before * condensing. */ metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); metaslab_group_histogram_remove(mg, msp); if (spa->spa_sync_pass == 1 && msp->ms_loaded && metaslab_should_condense(msp)) metaslab_condense(msp, tx); /* * We'll be going to disk to sync our space accounting, thus we * drop the ms_lock during that time so allocations coming from * open-context (ZIL) for future TXGs do not block. */ mutex_exit(&msp->ms_lock); space_map_t *log_sm = spa_syncing_log_sm(spa); if (log_sm != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); if (metaslab_unflushed_txg(msp) == 0) metaslab_unflushed_add(msp, tx); else if (!metaslab_unflushed_dirty(msp)) metaslab_unflushed_bump(msp, tx, B_TRUE); space_map_write(log_sm, alloctree, SM_ALLOC, vd->vdev_id, tx); space_map_write(log_sm, msp->ms_freeing, SM_FREE, vd->vdev_id, tx); mutex_enter(&msp->ms_lock); ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); range_tree_remove_xor_add(alloctree, msp->ms_unflushed_frees, msp->ms_unflushed_allocs); range_tree_remove_xor_add(msp->ms_freeing, msp->ms_unflushed_allocs, msp->ms_unflushed_frees); spa->spa_unflushed_stats.sus_memused += metaslab_unflushed_changes_memused(msp); } else { ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); space_map_write(msp->ms_sm, alloctree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); } msp->ms_allocated_space += range_tree_space(alloctree); ASSERT3U(msp->ms_allocated_space, >=, range_tree_space(msp->ms_freeing)); msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); if (!range_tree_is_empty(msp->ms_checkpointing)) { ASSERT(spa_has_checkpoint(spa)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since we are doing writes to disk and the ms_checkpointing * tree won't be changing during that time, we drop the * ms_lock while writing to the checkpoint space map, for the * same reason mentioned above. */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); spa->spa_checkpoint_info.sci_dspace += range_tree_space(msp->ms_checkpointing); vd->vdev_stat.vs_checkpoint_space += range_tree_space(msp->ms_checkpointing); ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, -space_map_allocated(vd->vdev_checkpoint_sm)); range_tree_vacate(msp->ms_checkpointing, NULL, NULL); } if (msp->ms_loaded) { /* * When the space map is loaded, we have an accurate * histogram in the range tree. This gives us an opportunity * to bring the space map's histogram up-to-date so we clear * it first before updating it. */ space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); /* * Since we've cleared the histogram we need to add back * any free space that has already been processed, plus * any deferred space. This allows the on-disk histogram * to accurately reflect all free space even if some space * is not yet available for allocation (i.e. deferred). */ space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); /* * Add back any deferred free space that has not been * added back into the in-core free tree yet. This will * ensure that we don't end up with a space map histogram * that is completely empty unless the metaslab is fully * allocated. */ for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, msp->ms_defer[t], tx); } } /* * Always add the free space from this sync pass to the space * map histogram. We want to make sure that the on-disk histogram * accounts for all free space. If the space map is not loaded, * then we will lose some accuracy but will correct it the next * time we load the space map. */ space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); metaslab_aux_histograms_update(msp); metaslab_group_histogram_add(mg, msp); metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); /* * For sync pass 1, we avoid traversing this txg's free range tree * and instead will just swap the pointers for freeing and freed. * We can safely do this since the freed_tree is guaranteed to be * empty on the initial pass. * * Keep in mind that even if we are currently using a log spacemap * we want current frees to end up in the ms_allocatable (but not * get appended to the ms_sm) so their ranges can be reused as usual. */ if (spa_sync_pass(spa) == 1) { range_tree_swap(&msp->ms_freeing, &msp->ms_freed); ASSERT0(msp->ms_allocated_this_txg); } else { range_tree_vacate(msp->ms_freeing, range_tree_add, msp->ms_freed); } msp->ms_allocated_this_txg += range_tree_space(alloctree); range_tree_vacate(alloctree, NULL, NULL); ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); /* * Verify that the space map object ID has been recorded in the * vdev_ms_array. */ uint64_t object; VERIFY0(dmu_read(mos, vd->vdev_ms_array, msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); VERIFY3U(object, ==, space_map_object(msp->ms_sm)); mutex_exit(&msp->ms_sync_lock); dmu_tx_commit(tx); } static void metaslab_evict(metaslab_t *msp, uint64_t txg) { if (!msp->ms_loaded || msp->ms_disabled != 0) return; for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); } if (msp->ms_allocator != -1) metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); if (!metaslab_debug_unload) metaslab_unload(msp); } /* * Called after a transaction group has completely synced to mark * all of the metaslab's free space as usable. */ void metaslab_sync_done(metaslab_t *msp, uint64_t txg) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; range_tree_t **defer_tree; int64_t alloc_delta, defer_delta; boolean_t defer_allowed = B_TRUE; ASSERT(!vd->vdev_ishole); mutex_enter(&msp->ms_lock); if (msp->ms_new) { /* this is a new metaslab, add its capacity to the vdev */ metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); /* there should be no allocations nor frees at this point */ VERIFY0(msp->ms_allocated_this_txg); VERIFY0(range_tree_space(msp->ms_freed)); } ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_checkpointing)); defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || vd->vdev_rz_expanding) { defer_allowed = B_FALSE; } defer_delta = 0; alloc_delta = msp->ms_allocated_this_txg - range_tree_space(msp->ms_freed); if (defer_allowed) { defer_delta = range_tree_space(msp->ms_freed) - range_tree_space(*defer_tree); } else { defer_delta -= range_tree_space(*defer_tree); } metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, defer_delta, 0); if (spa_syncing_log_sm(spa) == NULL) { /* * If there's a metaslab_load() in progress and we don't have * a log space map, it means that we probably wrote to the * metaslab's space map. If this is the case, we need to * make sure that we wait for the load to complete so that we * have a consistent view at the in-core side of the metaslab. */ metaslab_load_wait(msp); } else { ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); } /* * When auto-trimming is enabled, free ranges which are added to * ms_allocatable are also be added to ms_trim. The ms_trim tree is * periodically consumed by the vdev_autotrim_thread() which issues * trims for all ranges and then vacates the tree. The ms_trim tree * can be discarded at any time with the sole consequence of recent * frees not being trimmed. */ if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) { range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim); if (!defer_allowed) { range_tree_walk(msp->ms_freed, range_tree_add, msp->ms_trim); } } else { range_tree_vacate(msp->ms_trim, NULL, NULL); } /* * Move the frees from the defer_tree back to the free * range tree (if it's loaded). Swap the freed_tree and * the defer_tree -- this is safe to do because we've * just emptied out the defer_tree. */ range_tree_vacate(*defer_tree, msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); if (defer_allowed) { range_tree_swap(&msp->ms_freed, defer_tree); } else { range_tree_vacate(msp->ms_freed, msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); } msp->ms_synced_length = space_map_length(msp->ms_sm); msp->ms_deferspace += defer_delta; ASSERT3S(msp->ms_deferspace, >=, 0); ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); if (msp->ms_deferspace != 0) { /* * Keep syncing this metaslab until all deferred frees * are back in circulation. */ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } metaslab_aux_histograms_update_done(msp, defer_allowed); if (msp->ms_new) { msp->ms_new = B_FALSE; mutex_enter(&mg->mg_lock); mg->mg_ms_ready++; mutex_exit(&mg->mg_lock); } /* * Re-sort metaslab within its group now that we've adjusted * its allocatable space. */ metaslab_recalculate_weight_and_sort(msp); ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freeing)); ASSERT0(range_tree_space(msp->ms_freed)); ASSERT0(range_tree_space(msp->ms_checkpointing)); msp->ms_allocating_total -= msp->ms_allocated_this_txg; msp->ms_allocated_this_txg = 0; mutex_exit(&msp->ms_lock); } void metaslab_sync_reassess(metaslab_group_t *mg) { spa_t *spa = mg->mg_class->mc_spa; spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); metaslab_group_alloc_update(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); /* * Preload the next potential metaslabs but only on active * metaslab groups. We can get into a state where the metaslab * is no longer active since we dirty metaslabs as we remove a * a device, thus potentially making the metaslab group eligible * for preloading. */ if (mg->mg_activation_count > 0) { metaslab_group_preload(mg); } spa_config_exit(spa, SCL_ALLOC, FTAG); } /* * When writing a ditto block (i.e. more than one DVA for a given BP) on * the same vdev as an existing DVA of this BP, then try to allocate it * on a different metaslab than existing DVAs (i.e. a unique metaslab). */ static boolean_t metaslab_is_unique(metaslab_t *msp, dva_t *dva) { uint64_t dva_ms_id; if (DVA_GET_ASIZE(dva) == 0) return (B_TRUE); if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) return (B_TRUE); dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; return (msp->ms_id != dva_ms_id); } /* * ========================================================================== * Metaslab allocation tracing facility * ========================================================================== */ /* * Add an allocation trace element to the allocation tracing list. */ static void metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, int allocator) { metaslab_alloc_trace_t *mat; if (!metaslab_trace_enabled) return; /* * When the tracing list reaches its maximum we remove * the second element in the list before adding a new one. * By removing the second element we preserve the original * entry as a clue to what allocations steps have already been * performed. */ if (zal->zal_size == metaslab_trace_max_entries) { metaslab_alloc_trace_t *mat_next; #ifdef ZFS_DEBUG panic("too many entries in allocation list"); #endif METASLABSTAT_BUMP(metaslabstat_trace_over_limit); zal->zal_size--; mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); list_remove(&zal->zal_list, mat_next); kmem_cache_free(metaslab_alloc_trace_cache, mat_next); } mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); list_link_init(&mat->mat_list_node); mat->mat_mg = mg; mat->mat_msp = msp; mat->mat_size = psize; mat->mat_dva_id = dva_id; mat->mat_offset = offset; mat->mat_weight = 0; mat->mat_allocator = allocator; if (msp != NULL) mat->mat_weight = msp->ms_weight; /* * The list is part of the zio so locking is not required. Only * a single thread will perform allocations for a given zio. */ list_insert_tail(&zal->zal_list, mat); zal->zal_size++; ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); } void metaslab_trace_init(zio_alloc_list_t *zal) { list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), offsetof(metaslab_alloc_trace_t, mat_list_node)); zal->zal_size = 0; } void metaslab_trace_fini(zio_alloc_list_t *zal) { metaslab_alloc_trace_t *mat; while ((mat = list_remove_head(&zal->zal_list)) != NULL) kmem_cache_free(metaslab_alloc_trace_cache, mat); list_destroy(&zal->zal_list); zal->zal_size = 0; } /* * ========================================================================== * Metaslab block operations * ========================================================================== */ static void metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag, int flags, int allocator) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag); } static void metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) { metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; metaslab_class_allocator_t *mca = &mg->mg_class->mc_allocator[allocator]; uint64_t max = mg->mg_max_alloc_queue_depth; uint64_t cur = mga->mga_cur_max_alloc_queue_depth; while (cur < max) { if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth, cur, cur + 1) == cur) { atomic_inc_64(&mca->mca_alloc_max_slots); return; } cur = mga->mga_cur_max_alloc_queue_depth; } } void metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag, int flags, int allocator, boolean_t io_complete) { if (!(flags & METASLAB_ASYNC_ALLOC) || (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag); if (io_complete) metaslab_group_increment_qdepth(mg, allocator); } void metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag, int allocator) { #ifdef ZFS_DEBUG const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); for (int d = 0; d < ndvas; d++) { uint64_t vdev = DVA_GET_VDEV(&dva[d]); metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag)); } #endif } static uint64_t metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) { uint64_t start; range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); VERIFY0(msp->ms_new); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); range_tree_remove(rt, start, size); range_tree_clear(msp->ms_trim, start, size); if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); msp->ms_allocating_total += size; /* Track the last successful allocation */ msp->ms_alloc_txg = txg; metaslab_verify_space(msp, txg); } /* * Now that we've attempted the allocation we need to update the * metaslab's maximum block size since it may have changed. */ msp->ms_max_size = metaslab_largest_allocatable(msp); return (start); } /* * Find the metaslab with the highest weight that is less than what we've * already tried. In the common case, this means that we will examine each * metaslab at most once. Note that concurrent callers could reorder metaslabs * by activation/passivation once we have dropped the mg_lock. If a metaslab is * activated by another thread, and we fail to allocate from the metaslab we * have selected, we may not try the newly-activated metaslab, and instead * activate another metaslab. This is not optimal, but generally does not cause * any problems (a possible exception being if every metaslab is completely full * except for the newly-activated metaslab which we fail to examine). */ static metaslab_t * find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active) { avl_index_t idx; avl_tree_t *t = &mg->mg_metaslab_tree; metaslab_t *msp = avl_find(t, search, &idx); if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); uint_t tries = 0; for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; if (!try_hard && tries > zfs_metaslab_find_max_tries) { METASLABSTAT_BUMP(metaslabstat_too_many_tries); return (NULL); } tries++; if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); continue; } /* * If the selected metaslab is condensing or disabled, or * hasn't gone through a metaslab_sync_done(), then skip it. */ if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new) continue; *was_active = msp->ms_allocator != -1; /* * If we're activating as primary, this is our first allocation * from this disk, so we don't need to check how close we are. * If the metaslab under consideration was already active, * we're getting desperate enough to steal another allocator's * metaslab, so we still don't care about distances. */ if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) break; for (i = 0; i < d; i++) { if (want_unique && !metaslab_is_unique(msp, &dva[i])) break; /* try another metaslab */ } if (i == d) break; } if (msp != NULL) { search->ms_weight = msp->ms_weight; search->ms_start = msp->ms_start + 1; search->ms_allocator = msp->ms_allocator; search->ms_primary = msp->ms_primary; } return (msp); } static void metaslab_active_mask_verify(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) return; if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); VERIFY3S(msp->ms_allocator, !=, -1); VERIFY(msp->ms_primary); return; } if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); VERIFY3S(msp->ms_allocator, !=, -1); VERIFY(!msp->ms_primary); return; } if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); VERIFY3S(msp->ms_allocator, ==, -1); return; } } static uint64_t metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, int allocator, boolean_t try_hard) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY; for (int i = 0; i < d; i++) { if (activation_weight == METASLAB_WEIGHT_PRIMARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_CLAIM; break; } } /* * If we don't have enough metaslabs active to fill the entire array, we * just use the 0th slot. */ if (mg->mg_ms_ready < mg->mg_allocators * 3) allocator = 0; metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); search->ms_weight = UINT64_MAX; search->ms_start = 0; /* * At the end of the metaslab tree are the already-active metaslabs, * first the primaries, then the secondaries. When we resume searching * through the tree, we need to consider ms_allocator and ms_primary so * we start in the location right after where we left off, and don't * accidentally loop forever considering the same metaslabs. */ search->ms_allocator = -1; search->ms_primary = B_TRUE; for (;;) { boolean_t was_active = B_FALSE; mutex_enter(&mg->mg_lock); if (activation_weight == METASLAB_WEIGHT_PRIMARY && mga->mga_primary != NULL) { msp = mga->mga_primary; /* * Even though we don't hold the ms_lock for the * primary metaslab, those fields should not * change while we hold the mg_lock. Thus it is * safe to make assertions on them. */ ASSERT(msp->ms_primary); ASSERT3S(msp->ms_allocator, ==, allocator); ASSERT(msp->ms_loaded); was_active = B_TRUE; ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && mga->mga_secondary != NULL) { msp = mga->mga_secondary; /* * See comment above about the similar assertions * for the primary metaslab. */ ASSERT(!msp->ms_primary); ASSERT3S(msp->ms_allocator, ==, allocator); ASSERT(msp->ms_loaded); was_active = B_TRUE; ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); } else { msp = find_valid_metaslab(mg, activation_weight, dva, d, want_unique, asize, allocator, try_hard, zal, search, &was_active); } mutex_exit(&mg->mg_lock); if (msp == NULL) { kmem_free(search, sizeof (*search)); return (-1ULL); } mutex_enter(&msp->ms_lock); metaslab_active_mask_verify(msp); /* * This code is disabled out because of issues with * tracepoints in non-gpl kernel modules. */ #if 0 DTRACE_PROBE3(ms__activation__attempt, metaslab_t *, msp, uint64_t, activation_weight, boolean_t, was_active); #endif /* * Ensure that the metaslab we have selected is still * capable of handling our request. It's possible that * another thread may have changed the weight while we * were blocked on the metaslab lock. We check the * active status first to see if we need to set_selected_txg * a new metaslab. */ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { ASSERT3S(msp->ms_allocator, ==, -1); mutex_exit(&msp->ms_lock); continue; } /* * If the metaslab was activated for another allocator * while we were waiting in the ms_lock above, or it's * a primary and we're seeking a secondary (or vice versa), * we go back and select a new metaslab. */ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && (msp->ms_allocator != -1) && (msp->ms_allocator != allocator || ((activation_weight == METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { ASSERT(msp->ms_loaded); ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || msp->ms_allocator != -1); mutex_exit(&msp->ms_lock); continue; } /* * This metaslab was used for claiming regions allocated * by the ZIL during pool import. Once these regions are * claimed we don't need to keep the CLAIM bit set * anymore. Passivate this metaslab to zero its activation * mask. */ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && activation_weight != METASLAB_WEIGHT_CLAIM) { ASSERT(msp->ms_loaded); ASSERT3S(msp->ms_allocator, ==, -1); metaslab_passivate(msp, msp->ms_weight & ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } metaslab_set_selected_txg(msp, txg); int activation_error = metaslab_activate(msp, allocator, activation_weight); metaslab_active_mask_verify(msp); /* * If the metaslab was activated by another thread for * another allocator or activation_weight (EBUSY), or it * failed because another metaslab was assigned as primary * for this allocator (EEXIST) we continue using this * metaslab for our allocation, rather than going on to a * worse metaslab (we waited for that metaslab to be loaded * after all). * * If the activation failed due to an I/O error or ENOSPC we * skip to the next metaslab. */ boolean_t activated; if (activation_error == 0) { activated = B_TRUE; } else if (activation_error == EBUSY || activation_error == EEXIST) { activated = B_FALSE; } else { mutex_exit(&msp->ms_lock); continue; } ASSERT(msp->ms_loaded); /* * Now that we have the lock, recheck to see if we should * continue to use this metaslab for this allocation. The * the metaslab is now loaded so metaslab_should_allocate() * can accurately determine if the allocation attempt should * proceed. */ if (!metaslab_should_allocate(msp, asize, try_hard)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); goto next; } /* * If this metaslab is currently condensing then pick again * as we can't manipulate this metaslab until it's committed * to disk. If this metaslab is being initialized, we shouldn't * allocate from it since the allocated region might be * overwritten after allocation. */ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_CONDENSING, allocator); if (activated) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } mutex_exit(&msp->ms_lock); continue; } else if (msp->ms_disabled > 0) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_DISABLED, allocator); if (activated) { metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); } mutex_exit(&msp->ms_lock); continue; } offset = metaslab_block_alloc(msp, asize, txg); metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ if (activated) metaslab_segment_may_passivate(msp); break; } next: ASSERT(msp->ms_loaded); /* * This code is disabled out because of issues with * tracepoints in non-gpl kernel modules. */ #if 0 DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp, uint64_t, asize); #endif /* * We were unable to allocate from this metaslab so determine * a new weight for this metaslab. Now that we have loaded * the metaslab we can provide a better hint to the metaslab * selector. * * For space-based metaslabs, we use the maximum block size. * This information is only available when the metaslab * is loaded and is more accurate than the generic free * space weight that was calculated by metaslab_weight(). * This information allows us to quickly compare the maximum * available allocation in the metaslab to the allocation * size being requested. * * For segment-based metaslabs, determine the new weight * based on the highest bucket in the range tree. We * explicitly use the loaded segment weight (i.e. the range * tree histogram) since it contains the space that is * currently available for allocation and is accurate * even within a sync pass. */ uint64_t weight; if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { weight = metaslab_largest_allocatable(msp); WEIGHT_SET_SPACEBASED(weight); } else { weight = metaslab_weight_from_range_tree(msp); } if (activated) { metaslab_passivate(msp, weight); } else { /* * For the case where we use the metaslab that is * active for another allocator we want to make * sure that we retain the activation mask. * * Note that we could attempt to use something like * metaslab_recalculate_weight_and_sort() that * retains the activation mask here. That function * uses metaslab_weight() to set the weight though * which is not as accurate as the calculations * above. */ weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(mg, msp, weight); } metaslab_active_mask_verify(msp); /* * We have just failed an allocation attempt, check * that metaslab_should_allocate() agrees. Otherwise, * we may end up in an infinite loop retrying the same * metaslab. */ ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); mutex_exit(&msp->ms_lock); } mutex_exit(&msp->ms_lock); kmem_free(search, sizeof (*search)); return (offset); } static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, int allocator, boolean_t try_hard) { uint64_t offset; offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, dva, d, allocator, try_hard); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { mg->mg_failed_allocations++; metaslab_trace_add(zal, mg, NULL, asize, d, TRACE_GROUP_FAILURE, allocator); if (asize == SPA_GANGBLOCKSIZE) { /* * This metaslab group was unable to allocate * the minimum gang block size so it must be out of * space. We must notify the allocation throttle * to start skipping allocation attempts to this * metaslab group until more space becomes available. * Note: this failure cannot be caused by the * allocation throttle since the allocation throttle * is only responsible for skipping devices and * not failing block allocations. */ mg->mg_no_free_space = B_TRUE; } } mg->mg_allocations++; mutex_exit(&mg->mg_lock); return (offset); } /* * Allocate a block for the specified i/o. */ int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, zio_alloc_list_t *zal, int allocator) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; metaslab_group_t *mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; ASSERT(!DVA_IS_VALID(&dva[d])); /* * For testing, make some blocks above a certain size be gang blocks. * This will result in more split blocks when using device removal, * and a large number of split blocks coupled with ztest-induced * damage can result in extremely long reconstruction times. This * will also test spilling from special to normal. */ if (psize >= metaslab_force_ganging && metaslab_force_ganging_pct > 0 && (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, allocator); return (SET_ERROR(ENOSPC)); } /* * Start at the rotor and loop through all mgs until we find something. * Note that there's no locking on mca_rotor or mca_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * * If we are doing ditto or log blocks, try to spread them across * consecutive vdevs. If we're forced to reuse a vdev before we've * allocated all of our ditto blocks, then try and spread them out on * that vdev as much as possible. If it turns out to not be possible, * gradually lower our standards until anything becomes acceptable. * Also, allocating on consecutive vdevs (as opposed to random vdevs) * gives us hope of containing our fault domains to something we're * able to reason about. Otherwise, any two top-level vdev failures * will guarantee the loss of data. With consecutive allocation, * only two adjacent top-level vdev failures will result in data loss. * * If we are doing gang blocks (hintdva is non-NULL), try to keep * ourselves on the same vdev as our gang block header. That * way, we can hope for locality in vdev_cache, plus it makes our * fault domains something tractable. */ if (hintdva) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); /* * It's possible the vdev we're using as the hint no * longer exists or its mg has been closed (e.g. by * device removal). Consult the rotor when * all else fails. */ if (vd != NULL && vd->vdev_mg != NULL) { mg = vdev_get_mg(vd, mc); if (flags & METASLAB_HINTBP_AVOID) mg = mg->mg_next; } else { mg = mca->mca_rotor; } } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; } else { ASSERT(mca->mca_rotor != NULL); mg = mca->mca_rotor; } /* * If the hint put us into the wrong metaslab class, or into a * metaslab group that has been passivated, just follow the rotor. */ if (mg->mg_class != mc || mg->mg_activation_count <= 0) mg = mca->mca_rotor; rotor = mg; top: do { boolean_t allocatable; ASSERT(mg->mg_activation_count == 1); vd = mg->mg_vd; /* * Don't allocate from faulted devices. */ if (try_hard) { spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); allocatable = vdev_allocatable(vd); spa_config_exit(spa, SCL_ZIO, FTAG); } else { allocatable = vdev_allocatable(vd); } /* * Determine if the selected metaslab group is eligible * for allocations. If we're ganging then don't allow * this metaslab group to skip allocations since that would * inadvertently return ENOSPC and suspend the pool * even though space is still available. */ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { allocatable = metaslab_group_allocatable(mg, rotor, flags, psize, allocator, d); } if (!allocatable) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_NOT_ALLOCATABLE, allocator); goto next; } /* * Avoid writing single-copy data to an unhealthy, * non-redundant vdev, unless we've already tried all * other vdevs. */ if (vd->vdev_state < VDEV_STATE_HEALTHY && d == 0 && !try_hard && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_VDEV_ERROR, allocator); goto next; } ASSERT(mg->mg_class == mc); uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); /* * If we don't need to try hard, then require that the * block be on a different metaslab from any other DVAs * in this BP (unique=true). If we are trying hard, then * allow any metaslab to be used (unique=false). */ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, !try_hard, dva, d, allocator, try_hard); if (offset != -1ULL) { /* * If we've just selected this metaslab group, * figure out whether the corresponding vdev is * over- or under-used relative to the pool, * and set an allocation bias to even it out. * * Bias is also used to compensate for unequally * sized vdevs so that space is allocated fairly. */ if (mca->mca_aliquot == 0 && metaslab_bias_enabled) { vdev_stat_t *vs = &vd->vdev_stat; int64_t vs_free = vs->vs_space - vs->vs_alloc; int64_t mc_free = mc->mc_space - mc->mc_alloc; int64_t ratio; /* * Calculate how much more or less we should * try to allocate from this device during * this iteration around the rotor. * * This basically introduces a zero-centered * bias towards the devices with the most * free space, while compensating for vdev * size differences. * * Examples: * vdev V1 = 16M/128M * vdev V2 = 16M/128M * ratio(V1) = 100% ratio(V2) = 100% * * vdev V1 = 16M/128M * vdev V2 = 64M/128M * ratio(V1) = 127% ratio(V2) = 72% * * vdev V1 = 16M/128M * vdev V2 = 64M/512M * ratio(V1) = 40% ratio(V2) = 160% */ ratio = (vs_free * mc->mc_alloc_groups * 100) / (mc_free + 1); mg->mg_bias = ((ratio - 100) * (int64_t)mg->mg_aliquot) / 100; } else if (!metaslab_bias_enabled) { mg->mg_bias = 0; } if ((flags & METASLAB_ZIL) || atomic_add_64_nv(&mca->mca_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mca->mca_rotor = mg->mg_next; mca->mca_aliquot = 0; } DVA_SET_VDEV(&dva[d], vd->vdev_id); DVA_SET_OFFSET(&dva[d], offset); DVA_SET_GANG(&dva[d], ((flags & METASLAB_GANG_HEADER) ? 1 : 0)); DVA_SET_ASIZE(&dva[d], asize); return (0); } next: mca->mca_rotor = mg->mg_next; mca->mca_aliquot = 0; } while ((mg = mg->mg_next) != rotor); /* * If we haven't tried hard, perhaps do so now. */ if (!try_hard && (zfs_metaslab_try_hard_before_gang || GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 || psize <= 1 << spa->spa_min_ashift)) { METASLABSTAT_BUMP(metaslabstat_try_hard); try_hard = B_TRUE; goto top; } memset(&dva[d], 0, sizeof (dva_t)); metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); return (SET_ERROR(ENOSPC)); } void metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, boolean_t checkpoint) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; ASSERT(vdev_is_concrete(vd)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); metaslab_check_free_impl(vd, offset, asize); mutex_enter(&msp->ms_lock); if (range_tree_is_empty(msp->ms_freeing) && range_tree_is_empty(msp->ms_checkpointing)) { vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); } if (checkpoint) { ASSERT(spa_has_checkpoint(spa)); range_tree_add(msp->ms_checkpointing, offset, asize); } else { range_tree_add(msp->ms_freeing, offset, asize); } mutex_exit(&msp->ms_lock); } void metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner_offset; boolean_t *checkpoint = arg; ASSERT3P(checkpoint, !=, NULL); if (vd->vdev_ops->vdev_op_remap != NULL) vdev_indirect_mark_obsolete(vd, offset, size); else metaslab_free_impl(vd, offset, size, *checkpoint); } static void metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, boolean_t checkpoint) { spa_t *spa = vd->vdev_spa; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) return; if (spa->spa_vdev_removal != NULL && spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && vdev_is_concrete(vd)) { /* * Note: we check if the vdev is concrete because when * we complete the removal, we first change the vdev to be * an indirect vdev (in open context), and then (in syncing * context) clear spa_vdev_removal. */ free_from_removing_vdev(vd, offset, size); } else if (vd->vdev_ops->vdev_op_remap != NULL) { vdev_indirect_mark_obsolete(vd, offset, size); vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_free_impl_cb, &checkpoint); } else { metaslab_free_concrete(vd, offset, size, checkpoint); } } typedef struct remap_blkptr_cb_arg { blkptr_t *rbca_bp; spa_remap_cb_t rbca_cb; vdev_t *rbca_remap_vd; uint64_t rbca_remap_offset; void *rbca_cb_arg; } remap_blkptr_cb_arg_t; static void remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { remap_blkptr_cb_arg_t *rbca = arg; blkptr_t *bp = rbca->rbca_bp; /* We can not remap split blocks. */ if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) return; ASSERT0(inner_offset); if (rbca->rbca_cb != NULL) { /* * At this point we know that we are not handling split * blocks and we invoke the callback on the previous * vdev which must be indirect. */ ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); /* set up remap_blkptr_cb_arg for the next call */ rbca->rbca_remap_vd = vd; rbca->rbca_remap_offset = offset; } /* * The phys birth time is that of dva[0]. This ensures that we know * when each dva was written, so that resilver can determine which * blocks need to be scrubbed (i.e. those written during the time * the vdev was offline). It also ensures that the key used in * the ARC hash table is unique (i.e. dva[0] + phys_birth). If * we didn't change the phys_birth, a lookup in the ARC for a * remapped BP could find the data that was previously stored at * this vdev + offset. */ vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, DVA_GET_VDEV(&bp->blk_dva[0])); vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; uint64_t physical_birth = vdev_indirect_births_physbirth(vib, DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); BP_SET_PHYSICAL_BIRTH(bp, physical_birth); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], offset); } /* * If the block pointer contains any indirect DVAs, modify them to refer to * concrete DVAs. Note that this will sometimes not be possible, leaving * the indirect DVA in place. This happens if the indirect DVA spans multiple * segments in the mapping (i.e. it is a "split block"). * * If the BP was remapped, calls the callback on the original dva (note the * callback can be called multiple times if the original indirect DVA refers * to another indirect DVA, etc). * * Returns TRUE if the BP was remapped. */ boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) { remap_blkptr_cb_arg_t rbca; if (!zfs_remap_blkptr_enable) return (B_FALSE); if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) return (B_FALSE); /* * Dedup BP's can not be remapped, because ddt_phys_select() depends * on DVA[0] being the same in the BP as in the DDT (dedup table). */ if (BP_GET_DEDUP(bp)) return (B_FALSE); /* * Gang blocks can not be remapped, because * zio_checksum_gang_verifier() depends on the DVA[0] that's in * the BP used to read the gang block header (GBH) being the same * as the DVA[0] that we allocated for the GBH. */ if (BP_IS_GANG(bp)) return (B_FALSE); /* * Embedded BP's have no DVA to remap. */ if (BP_GET_NDVAS(bp) < 1) return (B_FALSE); /* * Note: we only remap dva[0]. If we remapped other dvas, we * would no longer know what their phys birth txg is. */ dva_t *dva = &bp->blk_dva[0]; uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd->vdev_ops->vdev_op_remap == NULL) return (B_FALSE); rbca.rbca_bp = bp; rbca.rbca_cb = callback; rbca.rbca_remap_vd = vd; rbca.rbca_remap_offset = offset; rbca.rbca_cb_arg = arg; /* * remap_blkptr_cb() will be called in order for each level of * indirection, until a concrete vdev is reached or a split block is * encountered. old_vd and old_offset are updated within the callback * as we go from the one indirect vdev to the next one (either concrete * or indirect again) in that order. */ vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); /* Check if the DVA wasn't remapped because it is a split block */ if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) return (B_FALSE); return (B_TRUE); } /* * Undo the allocation of a DVA which happened in the given transaction group. */ void metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { metaslab_t *msp; vdev_t *vd; uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (txg > spa_freeze_txg(spa)) return; if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) || (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu", (u_longlong_t)vdev, (u_longlong_t)offset, (u_longlong_t)size); return; } ASSERT(!vd->vdev_removing); ASSERT(vdev_is_concrete(vd)); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); if (DVA_GET_GANG(dva)) size = vdev_gang_header_asize(vd); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); msp->ms_allocating_total -= size; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); range_tree_add(msp->ms_allocatable, offset, size); mutex_exit(&msp->ms_lock); } /* * Free the block represented by the given DVA. */ void metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd = vdev_lookup_top(spa, vdev); ASSERT(DVA_IS_VALID(dva)); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); if (DVA_GET_GANG(dva)) { size = vdev_gang_header_asize(vd); } metaslab_free_impl(vd, offset, size, checkpoint); } /* * Reserve some allocation slots. The reservation system must be called * before we call into the allocator. If there aren't any available slots * then the I/O will be throttled until an I/O completes and its slots are * freed up. The function returns true if it was successful in placing * the reservation. */ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio, int flags) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; uint64_t max = mca->mca_alloc_max_slots; ASSERT(mc->mc_alloc_throttle_enabled); if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) || zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) { /* * The potential race between _count() and _add() is covered * by the allocator lock in most cases, or irrelevant due to * GANG_ALLOCATION() or METASLAB_MUST_RESERVE set in others. * But even if we assume some other non-existing scenario, the * worst that can happen is few more I/Os get to allocation * earlier, that is not a problem. * * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. */ zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; return (B_TRUE); } return (B_FALSE); } void metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio) { metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio); } static int metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { metaslab_t *msp; spa_t *spa = vd->vdev_spa; int error = 0; if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) return (SET_ERROR(ENXIO)); ASSERT3P(vd->vdev_ms, !=, NULL); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) { error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); if (error == EBUSY) { ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); error = 0; } } if (error == 0 && !range_tree_contains(msp->ms_allocatable, offset, size)) error = SET_ERROR(ENOENT); if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); } VERIFY(!msp->ms_condensing); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, msp->ms_size); range_tree_remove(msp->ms_allocatable, offset, size); range_tree_clear(msp->ms_trim, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */ metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); if (!multilist_link_active(&msp->ms_class_txg_node)) { msp->ms_selected_txg = txg; multilist_sublist_insert_head(mls, msp); } multilist_sublist_unlock(mls); if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); range_tree_add(msp->ms_allocating[txg & TXG_MASK], offset, size); msp->ms_allocating_total += size; } mutex_exit(&msp->ms_lock); return (0); } typedef struct metaslab_claim_cb_arg_t { uint64_t mcca_txg; int mcca_error; } metaslab_claim_cb_arg_t; static void metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner_offset; metaslab_claim_cb_arg_t *mcca_arg = arg; if (mcca_arg->mcca_error == 0) { mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, size, mcca_arg->mcca_txg); } } int metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) { if (vd->vdev_ops->vdev_op_remap != NULL) { metaslab_claim_cb_arg_t arg; /* * Only zdb(8) can claim on indirect vdevs. This is used * to detect leaks of mapped space (that are not accounted * for in the obsolete counts, spacemap, or bpobj). */ ASSERT(!spa_writeable(vd->vdev_spa)); arg.mcca_error = 0; arg.mcca_txg = txg; vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_claim_impl_cb, &arg); if (arg.mcca_error == 0) { arg.mcca_error = metaslab_claim_concrete(vd, offset, size, txg); } return (arg.mcca_error); } else { return (metaslab_claim_concrete(vd, offset, size, txg)); } } /* * Intent log support: upon opening the pool after a crash, notify the SPA * of blocks that the intent log has allocated for immediate write, but * which are still considered free by the SPA because the last transaction * group didn't commit yet. */ static int metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) { uint64_t vdev = DVA_GET_VDEV(dva); uint64_t offset = DVA_GET_OFFSET(dva); uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd; if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { return (SET_ERROR(ENXIO)); } ASSERT(DVA_IS_VALID(dva)); if (DVA_GET_GANG(dva)) size = vdev_gang_header_asize(vd); return (metaslab_claim_impl(vd, offset, size, txg)); } int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, zio_t *zio, int allocator) { dva_t *dva = bp->blk_dva; dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; int error = 0; ASSERT0(BP_GET_LOGICAL_BIRTH(bp)); ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); if (mc->mc_allocator[allocator].mca_rotor == NULL) { /* no vdevs in this class */ spa_config_exit(spa, SCL_ALLOC, FTAG); return (SET_ERROR(ENOSPC)); } ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); ASSERT3P(zal, !=, NULL); for (int d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, txg, flags, zal, allocator); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator, B_FALSE); memset(&dva[d], 0, sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); } else { /* * Update the metaslab group's queue depth * based on the newly allocated dva. */ metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); spa_config_exit(spa, SCL_ALLOC, FTAG); BP_SET_BIRTH(bp, txg, 0); return (0); } void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa)); /* * If we have a checkpoint for the pool we need to make sure that * the blocks that we free that are part of the checkpoint won't be * reused until the checkpoint is discarded or we revert to it. * * The checkpoint flag is passed down the metaslab_free code path * and is set whenever we want to add a block to the checkpoint's * accounting. That is, we "checkpoint" blocks that existed at the * time the checkpoint was created and are therefore referenced by * the checkpointed uberblock. * * Note that, we don't checkpoint any blocks if the current * syncing txg <= spa_checkpoint_txg. We want these frees to sync * normally as they will be referenced by the checkpointed uberblock. */ boolean_t checkpoint = B_FALSE; if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg && spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { /* * At this point, if the block is part of the checkpoint * there is no way it was created in the current txg. */ ASSERT(!now); ASSERT3U(spa_syncing_txg(spa), ==, txg); checkpoint = B_TRUE; } spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { if (now) { metaslab_unalloc_dva(spa, &dva[d], txg); } else { ASSERT3U(txg, ==, spa_syncing_txg(spa)); metaslab_free_dva(spa, &dva[d], checkpoint); } } spa_config_exit(spa, SCL_FREE, FTAG); } int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) { const dva_t *dva = bp->blk_dva; int ndvas = BP_GET_NDVAS(bp); int error = 0; ASSERT(!BP_IS_HOLE(bp)); if (txg != 0) { /* * First do a dry run to make sure all DVAs are claimable, * so we don't have to unwind from partial failures below. */ if ((error = metaslab_claim(spa, bp, 0)) != 0) return (error); } spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); for (int d = 0; d < ndvas; d++) { error = metaslab_claim_dva(spa, &dva[d], txg); if (error != 0) break; } spa_config_exit(spa, SCL_ALLOC, FTAG); ASSERT(error == 0 || txg == 0); return (error); } static void metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner, (void) arg; if (vd->vdev_ops == &vdev_indirect_ops) return; metaslab_check_free_impl(vd, offset, size); } static void metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) { metaslab_t *msp; spa_t *spa __maybe_unused = vd->vdev_spa; if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; if (vd->vdev_ops->vdev_op_remap != NULL) { vd->vdev_ops->vdev_op_remap(vd, offset, size, metaslab_check_free_impl_cb, NULL); return; } ASSERT(vdev_is_concrete(vd)); ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); if (msp->ms_loaded) { range_tree_verify_not_present(msp->ms_allocatable, offset, size); } /* * Check all segments that currently exist in the freeing pipeline. * * It would intuitively make sense to also check the current allocating * tree since metaslab_unalloc_dva() exists for extents that are * allocated and freed in the same sync pass within the same txg. * Unfortunately there are places (e.g. the ZIL) where we allocate a * segment but then we free part of it within the same txg * [see zil_sync()]. Thus, we don't call range_tree_verify() in the * current allocating tree. */ range_tree_verify_not_present(msp->ms_freeing, offset, size); range_tree_verify_not_present(msp->ms_checkpointing, offset, size); range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) range_tree_verify_not_present(msp->ms_defer[j], offset, size); range_tree_verify_not_present(msp->ms_trim, offset, size); mutex_exit(&msp->ms_lock); } void metaslab_check_free(spa_t *spa, const blkptr_t *bp) { if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) return; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); if (DVA_GET_GANG(&bp->blk_dva[i])) size = vdev_gang_header_asize(vd); ASSERT3P(vd, !=, NULL); metaslab_check_free_impl(vd, offset, size); } spa_config_exit(spa, SCL_VDEV, FTAG); } static void metaslab_group_disable_wait(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); while (mg->mg_disabled_updating) { cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); } } static void metaslab_group_disabled_increment(metaslab_group_t *mg) { ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); ASSERT(mg->mg_disabled_updating); while (mg->mg_ms_disabled >= max_disabled_ms) { cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); } mg->mg_ms_disabled++; ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); } /* * Mark the metaslab as disabled to prevent any allocations on this metaslab. * We must also track how many metaslabs are currently disabled within a * metaslab group and limit them to prevent allocation failures from * occurring because all metaslabs are disabled. */ void metaslab_disable(metaslab_t *msp) { ASSERT(!MUTEX_HELD(&msp->ms_lock)); metaslab_group_t *mg = msp->ms_group; mutex_enter(&mg->mg_ms_disabled_lock); /* * To keep an accurate count of how many threads have disabled * a specific metaslab group, we only allow one thread to mark * the metaslab group at a time. This ensures that the value of * ms_disabled will be accurate when we decide to mark a metaslab * group as disabled. To do this we force all other threads * to wait till the metaslab's mg_disabled_updating flag is no * longer set. */ metaslab_group_disable_wait(mg); mg->mg_disabled_updating = B_TRUE; if (msp->ms_disabled == 0) { metaslab_group_disabled_increment(mg); } mutex_enter(&msp->ms_lock); msp->ms_disabled++; mutex_exit(&msp->ms_lock); mg->mg_disabled_updating = B_FALSE; cv_broadcast(&mg->mg_ms_disabled_cv); mutex_exit(&mg->mg_ms_disabled_lock); } void metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) { metaslab_group_t *mg = msp->ms_group; spa_t *spa = mg->mg_vd->vdev_spa; /* * Wait for the outstanding IO to be synced to prevent newly * allocated blocks from being overwritten. This used by * initialize and TRIM which are modifying unallocated space. */ if (sync) txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&mg->mg_ms_disabled_lock); mutex_enter(&msp->ms_lock); if (--msp->ms_disabled == 0) { mg->mg_ms_disabled--; cv_broadcast(&mg->mg_ms_disabled_cv); if (unload) metaslab_unload(msp); } mutex_exit(&msp->ms_lock); mutex_exit(&mg->mg_ms_disabled_lock); } void metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty) { ms->ms_unflushed_dirty = dirty; } static void metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) { vdev_t *vd = ms->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); metaslab_unflushed_phys_t entry = { .msp_unflushed_txg = metaslab_unflushed_txg(ms), }; uint64_t entry_size = sizeof (entry); uint64_t entry_offset = ms->ms_id * entry_size; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) { object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object, tx)); } else { VERIFY0(err); } dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, &entry, tx); } void metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) { ms->ms_unflushed_txg = txg; metaslab_update_ondisk_flush_data(ms, tx); } boolean_t metaslab_unflushed_dirty(metaslab_t *ms) { return (ms->ms_unflushed_dirty); } uint64_t metaslab_unflushed_txg(metaslab_t *ms) { return (ms->ms_unflushed_txg); } ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW, "Allocation granularity (a.k.a. stripe size)"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW, "Load all metaslabs when pool is first opened"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW, "Prevent metaslabs from being unloaded"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW, "Preload potential metaslabs during reassessment"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW, "Max number of metaslabs per group to preload"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW, "Delay in txgs after metaslab was last used before unloading"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW, "Delay in milliseconds after metaslab was last used before unloading"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be free to make it " "eligible for allocation"); ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW, "Percentage of metaslab group size that should be considered eligible " "for allocations unless all metaslab groups within the metaslab class " "have also crossed this threshold"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW, "Use the fragmentation metric to prefer less fragmented metaslabs"); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT, ZMOD_RW, "Fragmentation for metaslab to allow allocation"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW, "Prefer metaslabs with lower LBAs"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW, "Enable metaslab group biasing"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, ZMOD_RW, "Enable segment-based metaslab selection"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, "Segment-based metaslab selection maximum buckets before switching"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW, "Blocks larger than this size are sometimes forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW, "Percentage of large blocks that will be forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, "Max distance (bytes) to search forward before using size tree"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW, "When looking in size tree, use largest segment instead of exact fit"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64, ZMOD_RW, "How long to trust the cached max chunk size of a metaslab"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW, "Percentage of memory that can be used to store metaslab range trees"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT, ZMOD_RW, "Try hard to allocate before ganging"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW, "Normally only consider this many of the best metaslabs in each vdev"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator, param_set_active_allocator, param_get_charp, ZMOD_RW, "SPA active allocator"); -/* END CSTYLED */ diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 71122542758d..493884cf04c4 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -1,749 +1,747 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2017 by Lawrence Livermore National Security, LLC. */ #include #include #include #include #include #include #include #include #include /* * Multi-Modifier Protection (MMP) attempts to prevent a user from importing * or opening a pool on more than one host at a time. In particular, it * prevents "zpool import -f" on a host from succeeding while the pool is * already imported on another host. There are many other ways in which a * device could be used by two hosts for different purposes at the same time * resulting in pool damage. This implementation does not attempt to detect * those cases. * * MMP operates by ensuring there are frequent visible changes on disk (a * "heartbeat") at all times. And by altering the import process to check * for these changes and failing the import when they are detected. This * functionality is enabled by setting the 'multihost' pool property to on. * * Uberblocks written by the txg_sync thread always go into the first * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP. * They are used to hold uberblocks which are exactly the same as the last * synced uberblock except that the ub_timestamp and mmp_config are frequently * updated. Like all other uberblocks, the slot is written with an embedded * checksum, and slots with invalid checksums are ignored. This provides the * "heartbeat", with no risk of overwriting good uberblocks that must be * preserved, e.g. previous txgs and associated block pointers. * * Three optional fields are added to uberblock structure; ub_mmp_magic, * ub_mmp_config, and ub_mmp_delay. The ub_mmp_magic value allows zfs to tell * whether the other ub_mmp_* fields are valid. The ub_mmp_config field tells * the importing host the settings of zfs_multihost_interval and * zfs_multihost_fail_intervals on the host which last had (or currently has) * the pool imported. These determine how long a host must wait to detect * activity in the pool, before concluding the pool is not in use. The * mmp_delay field is a decaying average of the amount of time between * completion of successive MMP writes, in nanoseconds. It indicates whether * MMP is enabled. * * During import an activity test may now be performed to determine if * the pool is in use. The activity test is typically required if the * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is * POOL_STATE_ACTIVE, and the pool is not a root pool. * * The activity test finds the "best" uberblock (highest txg, timestamp, and, if * ub_mmp_magic is valid, sequence number from ub_mmp_config). It then waits * some time, and finds the "best" uberblock again. If any of the mentioned * fields have different values in the newly read uberblock, the pool is in use * by another host and the import fails. In order to assure the accuracy of the * activity test, the default values result in an activity test duration of 20x * the mmp write interval. * * The duration of the "zpool import" activity test depends on the information * available in the "best" uberblock: * * 1) If uberblock was written by zfs-0.8 or newer and fail_intervals > 0: * ub_mmp_config.fail_intervals * ub_mmp_config.multihost_interval * 2 * * In this case, a weak guarantee is provided. Since the host which last had * the pool imported will suspend the pool if no mmp writes land within * fail_intervals * multihost_interval ms, the absence of writes during that * time means either the pool is not imported, or it is imported but the pool * is suspended and no further writes will occur. * * Note that resuming the suspended pool on the remote host would invalidate * this guarantee, and so it is not allowed. * * The factor of 2 provides a conservative safety factor and derives from * MMP_IMPORT_SAFETY_FACTOR; * * 2) If uberblock was written by zfs-0.8 or newer and fail_intervals == 0: * (ub_mmp_config.multihost_interval + ub_mmp_delay) * * zfs_multihost_import_intervals * * In this case no guarantee can provided. However, as long as some devices * are healthy and connected, it is likely that at least one write will land * within (multihost_interval + mmp_delay) because multihost_interval is * enough time for a write to be attempted to each leaf vdev, and mmp_delay * is enough for one to land, based on past delays. Multiplying by * zfs_multihost_import_intervals provides a conservative safety factor. * * 3) If uberblock was written by zfs-0.7: * (zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals * * The same logic as case #2 applies, but we do not know remote tunables. * * We use the local value for zfs_multihost_interval because the original MMP * did not record this value in the uberblock. * * ub_mmp_delay >= (zfs_multihost_interval / leaves), so if the other host * has a much larger zfs_multihost_interval set, ub_mmp_delay will reflect * that. We will have waited enough time for zfs_multihost_import_intervals * writes to be issued and all but one to land. * * single device pool example delays * * import_delay = (1 + 1) * 20 = 40s #defaults, no I/O delay * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay * import_delay = (10 + 10) * 20 = 400s #10s multihost_interval, * no I/O delay * 100 device pool example delays * * import_delay = (1 + .01) * 20 = 20s #defaults, no I/O delay * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay * import_delay = (10 + .1) * 20 = 202s #10s multihost_interval, * no I/O delay * * 4) Otherwise, this uberblock was written by a pre-MMP zfs: * zfs_multihost_import_intervals * zfs_multihost_interval * * In this case local tunables are used. By default this product = 10s, long * enough for a pool with any activity at all to write at least one * uberblock. No guarantee can be provided. * * Additionally, the duration is then extended by a random 25% to attempt to to * detect simultaneous imports. For example, if both partner hosts are rebooted * at the same time and automatically attempt to import the pool. */ /* * Used to control the frequency of mmp writes which are performed when the * 'multihost' pool property is on. This is one factor used to determine the * length of the activity check during import. * * On average an mmp write will be issued for each leaf vdev every * zfs_multihost_interval milliseconds. In practice, the observed period can * vary with the I/O load and this observed value is the ub_mmp_delay which is * stored in the uberblock. The minimum allowed value is 100 ms. */ uint64_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL; /* * Used to control the duration of the activity test on import. Smaller values * of zfs_multihost_import_intervals will reduce the import time but increase * the risk of failing to detect an active pool. The total activity check time * is never allowed to drop below one second. A value of 0 is ignored and * treated as if it was set to 1. */ uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS; /* * Controls the behavior of the pool when mmp write failures or delays are * detected. * * When zfs_multihost_fail_intervals = 0, mmp write failures or delays are * ignored. The failures will still be reported to the ZED which depending on * its configuration may take action such as suspending the pool or taking a * device offline. * * When zfs_multihost_fail_intervals > 0, the pool will be suspended if * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds pass * without a successful mmp write. This guarantees the activity test will see * mmp writes if the pool is imported. A value of 1 is ignored and treated as * if it was set to 2, because a single leaf vdev pool will issue a write once * per multihost_interval and thus any variation in latency would cause the * pool to be suspended. */ uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS; static const void *const mmp_tag = "mmp_write_uberblock"; static __attribute__((noreturn)) void mmp_thread(void *arg); void mmp_init(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL); mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL); mmp->mmp_kstat_id = 1; } void mmp_fini(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_destroy(&mmp->mmp_thread_lock); cv_destroy(&mmp->mmp_thread_cv); mutex_destroy(&mmp->mmp_io_lock); } static void mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr) { CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG); mutex_enter(&mmp->mmp_thread_lock); } static void mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr) { ASSERT(*mpp != NULL); *mpp = NULL; cv_broadcast(&mmp->mmp_thread_cv); CALLB_CPR_EXIT(cpr); /* drops &mmp->mmp_thread_lock */ } void mmp_thread_start(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; if (spa_writeable(spa)) { mutex_enter(&mmp->mmp_thread_lock); if (!mmp->mmp_thread) { mmp->mmp_thread = thread_create(NULL, 0, mmp_thread, spa, 0, &p0, TS_RUN, defclsyspri); zfs_dbgmsg("MMP thread started pool '%s' " "gethrtime %llu", spa_name(spa), gethrtime()); } mutex_exit(&mmp->mmp_thread_lock); } } void mmp_thread_stop(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_enter(&mmp->mmp_thread_lock); mmp->mmp_thread_exiting = 1; cv_broadcast(&mmp->mmp_thread_cv); while (mmp->mmp_thread) { cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock); } mutex_exit(&mmp->mmp_thread_lock); zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu", spa_name(spa), gethrtime()); ASSERT(mmp->mmp_thread == NULL); mmp->mmp_thread_exiting = 0; } typedef enum mmp_vdev_state_flag { MMP_FAIL_NOT_WRITABLE = (1 << 0), MMP_FAIL_WRITE_PENDING = (1 << 1), } mmp_vdev_state_flag_t; /* * Find a leaf vdev to write an MMP block to. It must not have an outstanding * mmp write (if so a new write will also likely block). If there is no usable * leaf, a nonzero error value is returned. The error value returned is a bit * field. * * MMP_FAIL_WRITE_PENDING One or more leaf vdevs are writeable, but have an * outstanding MMP write. * MMP_FAIL_NOT_WRITABLE One or more leaf vdevs are not writeable. */ static int mmp_next_leaf(spa_t *spa) { vdev_t *leaf; vdev_t *starting_leaf; int fail_mask = 0; ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock)); ASSERT(spa_config_held(spa, SCL_STATE, RW_READER)); ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE); ASSERT(!list_is_empty(&spa->spa_leaf_list)); if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) { spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list); spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen; } leaf = spa->spa_mmp.mmp_last_leaf; if (leaf == NULL) leaf = list_head(&spa->spa_leaf_list); starting_leaf = leaf; do { leaf = list_next(&spa->spa_leaf_list, leaf); if (leaf == NULL) { leaf = list_head(&spa->spa_leaf_list); ASSERT3P(leaf, !=, NULL); } /* * We skip unwritable, offline, detached, and dRAID spare * devices as they are either not legal targets or the write * may fail or not be seen by other hosts. Skipped dRAID * spares can never be written so the fail mask is not set. */ if (!vdev_writeable(leaf) || leaf->vdev_offline || leaf->vdev_detached) { fail_mask |= MMP_FAIL_NOT_WRITABLE; } else if (leaf->vdev_ops == &vdev_draid_spare_ops) { continue; } else if (leaf->vdev_mmp_pending != 0) { fail_mask |= MMP_FAIL_WRITE_PENDING; } else { spa->spa_mmp.mmp_last_leaf = leaf; return (0); } } while (leaf != starting_leaf); ASSERT(fail_mask); return (fail_mask); } /* * MMP writes are issued on a fixed schedule, but may complete at variable, * much longer, intervals. The mmp_delay captures long periods between * successful writes for any reason, including disk latency, scheduling delays, * etc. * * The mmp_delay is usually calculated as a decaying average, but if the latest * delay is higher we do not average it, so that we do not hide sudden spikes * which the importing host must wait for. * * If writes are occurring frequently, such as due to a high rate of txg syncs, * the mmp_delay could become very small. Since those short delays depend on * activity we cannot count on, we never allow mmp_delay to get lower than rate * expected if only mmp_thread writes occur. * * If an mmp write was skipped or fails, and we have already waited longer than * mmp_delay, we need to update it so the next write reflects the longer delay. * * Do not set mmp_delay if the multihost property is not on, so as not to * trigger an activity check on import. */ static void mmp_delay_update(spa_t *spa, boolean_t write_completed) { mmp_thread_t *mts = &spa->spa_mmp; hrtime_t delay = gethrtime() - mts->mmp_last_write; ASSERT(MUTEX_HELD(&mts->mmp_io_lock)); if (spa_multihost(spa) == B_FALSE) { mts->mmp_delay = 0; return; } if (delay > mts->mmp_delay) mts->mmp_delay = delay; if (write_completed == B_FALSE) return; mts->mmp_last_write = gethrtime(); /* * strictly less than, in case delay was changed above. */ if (delay < mts->mmp_delay) { hrtime_t min_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)) / MAX(1, vdev_count_leaves(spa)); mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128), min_delay); } } static void mmp_write_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; mmp_thread_t *mts = zio->io_private; mutex_enter(&mts->mmp_io_lock); uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id; hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending; mmp_delay_update(spa, (zio->io_error == 0)); vd->vdev_mmp_pending = 0; vd->vdev_mmp_kstat_id = 0; mutex_exit(&mts->mmp_io_lock); spa_config_exit(spa, SCL_STATE, mmp_tag); spa_mmp_history_set(spa, mmp_kstat_id, zio->io_error, mmp_write_duration); abd_free(zio->io_abd); } /* * When the uberblock on-disk is updated by a spa_sync, * creating a new "best" uberblock, update the one stored * in the mmp thread state, used for mmp writes. */ void mmp_update_uberblock(spa_t *spa, uberblock_t *ub) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_enter(&mmp->mmp_io_lock); mmp->mmp_ub = *ub; mmp->mmp_seq = 1; mmp->mmp_ub.ub_timestamp = gethrestime_sec(); mmp_delay_update(spa, B_TRUE); mutex_exit(&mmp->mmp_io_lock); } /* * Choose a random vdev, label, and MMP block, and write over it * with a copy of the last-synced uberblock, whose timestamp * has been updated to reflect that the pool is in use. */ static void mmp_write_uberblock(spa_t *spa) { int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; mmp_thread_t *mmp = &spa->spa_mmp; uberblock_t *ub; vdev_t *vd = NULL; int label, error; uint64_t offset; hrtime_t lock_acquire_time = gethrtime(); spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER); lock_acquire_time = gethrtime() - lock_acquire_time; if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " "gethrtime %llu", spa_name(spa), lock_acquire_time, gethrtime()); mutex_enter(&mmp->mmp_io_lock); error = mmp_next_leaf(spa); /* * spa_mmp_history has two types of entries: * Issued MMP write: records time issued, error status, etc. * Skipped MMP write: an MMP write could not be issued because no * suitable leaf vdev was available. See comment above struct * spa_mmp_history for details. */ if (error) { mmp_delay_update(spa, B_FALSE); if (mmp->mmp_skip_error == error) { spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1); } else { mmp->mmp_skip_error = error; spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg, gethrestime_sec(), mmp->mmp_delay, NULL, 0, mmp->mmp_kstat_id++, error); zfs_dbgmsg("MMP error choosing leaf pool '%s' " "gethrtime %llu fail_mask %#x", spa_name(spa), gethrtime(), error); } mutex_exit(&mmp->mmp_io_lock); spa_config_exit(spa, SCL_STATE, mmp_tag); return; } vd = spa->spa_mmp.mmp_last_leaf; if (mmp->mmp_skip_error != 0) { mmp->mmp_skip_error = 0; zfs_dbgmsg("MMP write after skipping due to unavailable " "leaves, pool '%s' gethrtime %llu leaf %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)vd->vdev_guid); } if (mmp->mmp_zio_root == NULL) mmp->mmp_zio_root = zio_root(spa, NULL, NULL, flags | ZIO_FLAG_GODFATHER); if (mmp->mmp_ub.ub_timestamp != gethrestime_sec()) { /* * Want to reset mmp_seq when timestamp advances because after * an mmp_seq wrap new values will not be chosen by * uberblock_compare() as the "best". */ mmp->mmp_ub.ub_timestamp = gethrestime_sec(); mmp->mmp_seq = 1; } ub = &mmp->mmp_ub; ub->ub_mmp_magic = MMP_MAGIC; ub->ub_mmp_delay = mmp->mmp_delay; ub->ub_mmp_config = MMP_SEQ_SET(mmp->mmp_seq) | MMP_INTERVAL_SET(MMP_INTERVAL_OK(zfs_multihost_interval)) | MMP_FAIL_INT_SET(MMP_FAIL_INTVS_OK( zfs_multihost_fail_intervals)); vd->vdev_mmp_pending = gethrtime(); vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id; zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags); abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); abd_zero_off(ub_abd, sizeof (uberblock_t), VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t)); mmp->mmp_seq++; mmp->mmp_kstat_id++; mutex_exit(&mmp->mmp_io_lock); offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) - MMP_BLOCKS_PER_LABEL + random_in_range(MMP_BLOCKS_PER_LABEL)); label = random_in_range(VDEV_LABELS); vdev_label_write(zio, vd, label, ub_abd, offset, VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp, flags | ZIO_FLAG_DONT_PROPAGATE); (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp, ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0); zio_nowait(zio); } static __attribute__((noreturn)) void mmp_thread(void *arg) { spa_t *spa = (spa_t *)arg; mmp_thread_t *mmp = &spa->spa_mmp; boolean_t suspended = spa_suspended(spa); boolean_t multihost = spa_multihost(spa); uint64_t mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK( zfs_multihost_interval)); uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK( zfs_multihost_fail_intervals); hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval; boolean_t last_spa_suspended; boolean_t last_spa_multihost; uint64_t last_mmp_interval; uint32_t last_mmp_fail_intervals; hrtime_t last_mmp_fail_ns; callb_cpr_t cpr; int skip_wait = 0; mmp_thread_enter(mmp, &cpr); /* * There have been no MMP writes yet. Setting mmp_last_write here gives * us one mmp_fail_ns period, which is consistent with the activity * check duration, to try to land an MMP write before MMP suspends the * pool (if so configured). */ mutex_enter(&mmp->mmp_io_lock); mmp->mmp_last_write = gethrtime(); mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)); mutex_exit(&mmp->mmp_io_lock); while (!mmp->mmp_thread_exiting) { hrtime_t next_time = gethrtime() + MSEC2NSEC(MMP_DEFAULT_INTERVAL); int leaves = MAX(vdev_count_leaves(spa), 1); /* Detect changes in tunables or state */ last_spa_suspended = suspended; last_spa_multihost = multihost; suspended = spa_suspended(spa); multihost = spa_multihost(spa); last_mmp_interval = mmp_interval; last_mmp_fail_intervals = mmp_fail_intervals; last_mmp_fail_ns = mmp_fail_ns; mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK( zfs_multihost_interval)); mmp_fail_intervals = MMP_FAIL_INTVS_OK( zfs_multihost_fail_intervals); /* Smooth so pool is not suspended when reducing tunables */ if (mmp_fail_intervals * mmp_interval < mmp_fail_ns) { mmp_fail_ns = (mmp_fail_ns * 31 + mmp_fail_intervals * mmp_interval) / 32; } else { mmp_fail_ns = mmp_fail_intervals * mmp_interval; } if (mmp_interval != last_mmp_interval || mmp_fail_intervals != last_mmp_fail_intervals) { /* * We want other hosts to see new tunables as quickly as * possible. Write out at higher frequency than usual. */ skip_wait += leaves; } if (multihost) next_time = gethrtime() + mmp_interval / leaves; if (mmp_fail_ns != last_mmp_fail_ns) { zfs_dbgmsg("MMP interval change pool '%s' " "gethrtime %llu last_mmp_interval %llu " "mmp_interval %llu last_mmp_fail_intervals %u " "mmp_fail_intervals %u mmp_fail_ns %llu " "skip_wait %d leaves %d next_time %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)last_mmp_interval, (u_longlong_t)mmp_interval, last_mmp_fail_intervals, mmp_fail_intervals, (u_longlong_t)mmp_fail_ns, skip_wait, leaves, (u_longlong_t)next_time); } /* * MMP off => on, or suspended => !suspended: * No writes occurred recently. Update mmp_last_write to give * us some time to try. */ if ((!last_spa_multihost && multihost) || (last_spa_suspended && !suspended)) { zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu " "last_spa_multihost %u multihost %u " "last_spa_suspended %u suspended %u", spa_name(spa), (u_longlong_t)gethrtime(), last_spa_multihost, multihost, last_spa_suspended, suspended); mutex_enter(&mmp->mmp_io_lock); mmp->mmp_last_write = gethrtime(); mmp->mmp_delay = mmp_interval; mutex_exit(&mmp->mmp_io_lock); } /* * MMP on => off: * mmp_delay == 0 tells importing node to skip activity check. */ if (last_spa_multihost && !multihost) { mutex_enter(&mmp->mmp_io_lock); mmp->mmp_delay = 0; mutex_exit(&mmp->mmp_io_lock); } /* * Suspend the pool if no MMP write has succeeded in over * mmp_interval * mmp_fail_intervals nanoseconds. */ if (multihost && !suspended && mmp_fail_intervals && (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " "mmp_last_write %llu mmp_interval %llu " "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)mmp->mmp_last_write, (u_longlong_t)mmp_interval, (u_longlong_t)mmp_fail_intervals, (u_longlong_t)mmp_fail_ns, (u_longlong_t)spa->spa_uberblock.ub_txg); cmn_err(CE_WARN, "MMP writes to pool '%s' have not " "succeeded in over %llu ms; suspending pool. " "Hrtime %llu", spa_name(spa), NSEC2MSEC(gethrtime() - mmp->mmp_last_write), gethrtime()); zio_suspend(spa, NULL, ZIO_SUSPEND_MMP); } if (multihost && !suspended) mmp_write_uberblock(spa); if (skip_wait > 0) { next_time = gethrtime() + MSEC2NSEC(MMP_MIN_INTERVAL) / leaves; skip_wait--; } CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle_hires(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock, next_time, USEC2NSEC(100), CALLOUT_FLAG_ABSOLUTE); CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock); } /* Outstanding writes are allowed to complete. */ zio_wait(mmp->mmp_zio_root); mmp->mmp_zio_root = NULL; mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr); thread_exit(); } /* * Signal the MMP thread to wake it, when it is sleeping on * its cv. Used when some module parameter has changed and * we want the thread to know about it. * Only signal if the pool is active and mmp thread is * running, otherwise there is no thread to wake. */ static void mmp_signal_thread(spa_t *spa) { mmp_thread_t *mmp = &spa->spa_mmp; mutex_enter(&mmp->mmp_thread_lock); if (mmp->mmp_thread) cv_broadcast(&mmp->mmp_thread_cv); mutex_exit(&mmp->mmp_thread_lock); } void mmp_signal_all_threads(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa))) { if (spa->spa_state == POOL_STATE_ACTIVE) mmp_signal_thread(spa); } mutex_exit(&spa_namespace_lock); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval, param_set_multihost_interval, spl_param_get_u64, ZMOD_RW, "Milliseconds between mmp writes to each leaf"); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, fail_intervals, UINT, ZMOD_RW, "Max allowed period without a successful mmp write"); ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, import_intervals, UINT, ZMOD_RW, "Number of zfs_multihost_interval periods to wait for activity"); diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index 718bbb34a8d5..0dd7da1aa197 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -1,359 +1,357 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2021 by Delphix. All rights reserved. */ #include #include #ifdef ZFS_DEBUG /* * Reference count tracking is disabled by default. It's memory requirements * are reasonable, however as implemented it consumes a significant amount of * cpu time. Until its performance is improved it should be manually enabled. */ int reference_tracking_enable = B_FALSE; static uint_t reference_history = 3; /* tunable */ static kmem_cache_t *reference_cache; void zfs_refcount_init(void) { reference_cache = kmem_cache_create("reference_cache", sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); } void zfs_refcount_fini(void) { kmem_cache_destroy(reference_cache); } static int zfs_refcount_compare(const void *x1, const void *x2) { const reference_t *r1 = (const reference_t *)x1; const reference_t *r2 = (const reference_t *)x2; int cmp1 = TREE_CMP(r1->ref_holder, r2->ref_holder); int cmp2 = TREE_CMP(r1->ref_number, r2->ref_number); int cmp = cmp1 ? cmp1 : cmp2; return ((cmp || r1->ref_search) ? cmp : TREE_PCMP(r1, r2)); } void zfs_refcount_create(zfs_refcount_t *rc) { mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t), offsetof(reference_t, ref_link.a)); list_create(&rc->rc_removed, sizeof (reference_t), offsetof(reference_t, ref_link.l)); rc->rc_count = 0; rc->rc_removed_count = 0; rc->rc_tracked = reference_tracking_enable; } void zfs_refcount_create_tracked(zfs_refcount_t *rc) { zfs_refcount_create(rc); rc->rc_tracked = B_TRUE; } void zfs_refcount_create_untracked(zfs_refcount_t *rc) { zfs_refcount_create(rc); rc->rc_tracked = B_FALSE; } void zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) { reference_t *ref; void *cookie = NULL; ASSERT3U(rc->rc_count, ==, number); while ((ref = avl_destroy_nodes(&rc->rc_tree, &cookie)) != NULL) kmem_cache_free(reference_cache, ref); avl_destroy(&rc->rc_tree); while ((ref = list_remove_head(&rc->rc_removed))) kmem_cache_free(reference_cache, ref); list_destroy(&rc->rc_removed); mutex_destroy(&rc->rc_mtx); } void zfs_refcount_destroy(zfs_refcount_t *rc) { zfs_refcount_destroy_many(rc, 0); } int zfs_refcount_is_zero(zfs_refcount_t *rc) { return (zfs_refcount_count(rc) == 0); } int64_t zfs_refcount_count(zfs_refcount_t *rc) { return (atomic_load_64(&rc->rc_count)); } int64_t zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { reference_t *ref; int64_t count; if (likely(!rc->rc_tracked)) { count = atomic_add_64_nv(&(rc)->rc_count, number); ASSERT3U(count, >=, number); return (count); } ref = kmem_cache_alloc(reference_cache, KM_SLEEP); ref->ref_holder = holder; ref->ref_number = number; ref->ref_search = B_FALSE; mutex_enter(&rc->rc_mtx); avl_add(&rc->rc_tree, ref); rc->rc_count += number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); return (count); } int64_t zfs_refcount_add(zfs_refcount_t *rc, const void *holder) { return (zfs_refcount_add_many(rc, 1, holder)); } void zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder) { if (likely(!rc->rc_tracked)) (void) zfs_refcount_add_many(rc, number, holder); else for (; number > 0; number--) (void) zfs_refcount_add(rc, holder); } int64_t zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { reference_t *ref, s; int64_t count; if (likely(!rc->rc_tracked)) { count = atomic_add_64_nv(&(rc)->rc_count, -number); ASSERT3S(count, >=, 0); return (count); } s.ref_holder = holder; s.ref_number = number; s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); ASSERT3U(rc->rc_count, >=, number); ref = avl_find(&rc->rc_tree, &s, NULL); if (unlikely(ref == NULL)) { panic("No such hold %p on refcount %llx", holder, (u_longlong_t)(uintptr_t)rc); return (-1); } avl_remove(&rc->rc_tree, ref); if (reference_history > 0) { list_insert_head(&rc->rc_removed, ref); if (rc->rc_removed_count >= reference_history) { ref = list_remove_tail(&rc->rc_removed); kmem_cache_free(reference_cache, ref); } else { rc->rc_removed_count++; } } else { kmem_cache_free(reference_cache, ref); } rc->rc_count -= number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); return (count); } int64_t zfs_refcount_remove(zfs_refcount_t *rc, const void *holder) { return (zfs_refcount_remove_many(rc, 1, holder)); } void zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder) { if (likely(!rc->rc_tracked)) (void) zfs_refcount_remove_many(rc, number, holder); else for (; number > 0; number--) (void) zfs_refcount_remove(rc, holder); } void zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) { avl_tree_t tree; list_t removed; reference_t *ref; void *cookie = NULL; uint64_t count; uint_t removed_count; avl_create(&tree, zfs_refcount_compare, sizeof (reference_t), offsetof(reference_t, ref_link.a)); list_create(&removed, sizeof (reference_t), offsetof(reference_t, ref_link.l)); mutex_enter(&src->rc_mtx); count = src->rc_count; removed_count = src->rc_removed_count; src->rc_count = 0; src->rc_removed_count = 0; avl_swap(&tree, &src->rc_tree); list_move_tail(&removed, &src->rc_removed); mutex_exit(&src->rc_mtx); mutex_enter(&dst->rc_mtx); dst->rc_count += count; dst->rc_removed_count += removed_count; if (avl_is_empty(&dst->rc_tree)) avl_swap(&dst->rc_tree, &tree); else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL) avl_add(&dst->rc_tree, ref); list_move_tail(&dst->rc_removed, &removed); mutex_exit(&dst->rc_mtx); avl_destroy(&tree); list_destroy(&removed); } void zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number, const void *current_holder, const void *new_holder) { reference_t *ref, s; if (likely(!rc->rc_tracked)) return; s.ref_holder = current_holder; s.ref_number = number; s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); ref = avl_find(&rc->rc_tree, &s, NULL); ASSERT(ref); ref->ref_holder = new_holder; avl_update(&rc->rc_tree, ref); mutex_exit(&rc->rc_mtx); } void zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder, const void *new_holder) { return (zfs_refcount_transfer_ownership_many(rc, 1, current_holder, new_holder)); } /* * If tracking is enabled, return true if a reference exists that matches * the "holder" tag. If tracking is disabled, then return true if a reference * might be held. */ boolean_t zfs_refcount_held(zfs_refcount_t *rc, const void *holder) { reference_t *ref, s; avl_index_t idx; boolean_t res; if (likely(!rc->rc_tracked)) return (zfs_refcount_count(rc) > 0); s.ref_holder = holder; s.ref_number = 0; s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); ref = avl_find(&rc->rc_tree, &s, &idx); if (likely(ref == NULL)) ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); res = ref && ref->ref_holder == holder; mutex_exit(&rc->rc_mtx); return (res); } /* * If tracking is enabled, return true if a reference does not exist that * matches the "holder" tag. If tracking is disabled, always return true * since the reference might not be held. */ boolean_t zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder) { reference_t *ref, s; avl_index_t idx; boolean_t res; if (likely(!rc->rc_tracked)) return (B_TRUE); mutex_enter(&rc->rc_mtx); s.ref_holder = holder; s.ref_number = 0; s.ref_search = B_TRUE; ref = avl_find(&rc->rc_tree, &s, &idx); if (likely(ref == NULL)) ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); res = ref == NULL || ref->ref_holder != holder; mutex_exit(&rc->rc_mtx); return (res); } EXPORT_SYMBOL(zfs_refcount_create); EXPORT_SYMBOL(zfs_refcount_destroy); EXPORT_SYMBOL(zfs_refcount_is_zero); EXPORT_SYMBOL(zfs_refcount_count); EXPORT_SYMBOL(zfs_refcount_add); EXPORT_SYMBOL(zfs_refcount_remove); EXPORT_SYMBOL(zfs_refcount_held); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, , reference_tracking_enable, INT, ZMOD_RW, "Track reference holders to refcount_t objects"); ZFS_MODULE_PARAM(zfs, , reference_history, UINT, ZMOD_RW, "Maximum reference holders being tracked"); -/* END CSTYLED */ #endif /* ZFS_DEBUG */ diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 5a616adb41a2..b83c982c13fd 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1,11072 +1,11066 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2018 Joyent, Inc. * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. * Copyright (c) 2023, 2024, Klara Inc. */ /* * SPA: Storage Pool Allocator * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" #include /* * spa_thread() existed on Illumos as a parent thread for the various worker * threads that actually run the pool, as a way to both reference the entire * pool work as a single object, and to share properties like scheduling * options. It has not yet been adapted to Linux or FreeBSD. This define is * used to mark related parts of the code to make things easier for the reader, * and to compile this code out. It can be removed when someone implements it, * moves it to some Illumos-specific place, or removes it entirely. */ #undef HAVE_SPA_THREAD /* * The "System Duty Cycle" scheduling class is an Illumos feature to help * prevent CPU-intensive kernel threads from affecting latency on interactive * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is * gated behind a define. On Illumos SDC depends on spa_thread(), but * spa_thread() also has other uses, so this is a separate define. */ #undef HAVE_SYSDC /* * The interval, in seconds, at which failed configuration cache file writes * should be retried. */ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ ZTI_MODE_SYNC, /* sync thread assigned */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } #define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) #define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { zti_modes_t zti_mode; uint_t zti_value; uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { "iss", "iss_h", "int", "int_h" }; /* * This table defines the taskq settings for each ZFS I/O type. When * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE * macros. Other operations process a large amount of data; the ZTI_SCALE * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs * that scales with the number of CPUs. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for high priority I/Os that * need to be handled with minimum delay. Illumos taskq has unfair TQ_FRONT * implementation, so separate high priority threads are used there. */ static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ #ifdef illumos { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ #else { ZTI_SYNC, ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* WRITE */ #endif { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FLUSH */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ }; static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport); static void spa_vdev_resilver_done(spa_t *spa); /* * Percentage of all CPUs that can be used by the metaslab preload taskq. */ static uint_t metaslab_preload_pct = 50; static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ static uint_t zio_taskq_batch_tpq; /* threads per taskq */ #ifdef HAVE_SYSDC static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ #endif #ifdef HAVE_SPA_THREAD static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ #endif static uint_t zio_taskq_write_tpq = 16; /* * Report any spa_load_verify errors found, but do not fail spa_load. * This is used by zdb to analyze non-idle pools. */ boolean_t spa_load_verify_dryrun = B_FALSE; /* * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). * This is used by zdb for spacemaps verification. */ boolean_t spa_mode_readable_spacemaps = B_FALSE; /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. */ #define TRYIMPORT_NAME "$import" /* * For debugging purposes: print out vdev tree during pool import. */ static int spa_load_print_vdev_tree = B_FALSE; /* * A non-zero value for zfs_max_missing_tvds means that we allow importing * pools with missing top-level vdevs. This is strictly intended for advanced * pool recovery cases since missing data is almost inevitable. Pools with * missing devices can only be imported read-only for safety reasons, and their * fail-mode will be automatically set to "continue". * * With 1 missing vdev we should be able to import the pool and mount all * datasets. User data that was not modified after the missing device has been * added should be recoverable. This means that snapshots created prior to the * addition of that device should be completely intact. * * With 2 missing vdevs, some datasets may fail to mount since there are * dataset statistics that are stored as regular metadata. Some data might be * recoverable if those vdevs were added recently. * * With 3 or more missing vdevs, the pool is severely damaged and MOS entries * may be missing entirely. Chances of data recovery are very low. Note that * there are also risks of performing an inadvertent rewind as we might be * missing all the vdevs with the latest uberblocks. */ uint64_t zfs_max_missing_tvds = 0; /* * The parameters below are similar to zfs_max_missing_tvds but are only * intended for a preliminary open of the pool with an untrusted config which * might be incomplete or out-dated. * * We are more tolerant for pools opened from a cachefile since we could have * an out-dated cachefile where a device removal was not registered. * We could have set the limit arbitrarily high but in the case where devices * are really missing we would want to return the proper error codes; we chose * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available * and we get a chance to retrieve the trusted config. */ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; /* * In the case where config was assembled by scanning device paths (/dev/dsks * by default) we are less tolerant since all the existing devices should have * been detected and we want spa_load to return the right error codes. */ uint64_t zfs_max_missing_tvds_scan = 0; /* * Debugging aid that pauses spa_sync() towards the end. */ static const boolean_t zfs_pause_spa_sync = B_FALSE; /* * Variables to indicate the livelist condense zthr func should wait at certain * points for the livelist to be removed - used to test condense/destroy races */ static int zfs_livelist_condense_zthr_pause = 0; static int zfs_livelist_condense_sync_pause = 0; /* * Variables to track whether or not condense cancellation has been * triggered in testing. */ static int zfs_livelist_condense_sync_cancel = 0; static int zfs_livelist_condense_zthr_cancel = 0; /* * Variable to track whether or not extra ALLOC blkptrs were added to a * livelist entry while it was being condensed (caused by the way we track * remapped blkptrs in dbuf_remap_impl) */ static int zfs_livelist_condense_new_alloc = 0; /* * ========================================================================== * SPA properties routines * ========================================================================== */ /* * Add a (source=src, propname=propval) list to an nvlist. */ static void spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; propval = fnvlist_alloc(); fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) fnvlist_add_string(propval, ZPROP_VALUE, strval); else fnvlist_add_uint64(propval, ZPROP_VALUE, intval); fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } static int spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl) { zpool_prop_t prop = zpool_name_to_prop(propname); zprop_source_t src = ZPROP_SRC_NONE; uint64_t intval; int err; /* * NB: Not all properties lookups via this API require * the spa props lock, so they must explicitly grab it here. */ switch (prop) { case ZPOOL_PROP_DEDUPCACHED: err = ddt_get_pool_dedup_cached(spa, &intval); if (err != 0) return (SET_ERROR(err)); break; default: return (SET_ERROR(EINVAL)); } spa_prop_add_list(outnvl, prop, NULL, intval, src); return (0); } int spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props, nvlist_t *outnvl) { int err = 0; if (props == NULL) return (0); for (unsigned int i = 0; i < n_props && err == 0; i++) { err = spa_prop_add(spa, props[i], outnvl); } return (err); } /* * Add a user property (source=src, propname=propval) to an nvlist. */ static void spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, zprop_source_t src) { nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); nvlist_free(propval); } /* * Get property values from the spa configuration. */ static void spa_prop_get_config(spa_t *spa, nvlist_t *nv) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size, alloc, cap, version; const zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { alloc = metaslab_class_get_alloc(mc); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); size += metaslab_class_get_space(spa_embedded_log_class(spa)); spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(nv, ZPOOL_PROP_FREE, NULL, size - alloc, src); spa_prop_add_list(nv, ZPOOL_PROP_CHECKPOINT, NULL, spa->spa_checkpoint_info.sci_dspace, src); spa_prop_add_list(nv, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); spa_prop_add_list(nv, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); spa_prop_add_list(nv, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == SPA_MODE_READ), src); cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(nv, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(nv, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(nv, ZPOOL_PROP_BCLONEUSED, NULL, brt_get_used(spa), src); spa_prop_add_list(nv, ZPOOL_PROP_BCLONESAVED, NULL, brt_get_saved(spa), src); spa_prop_add_list(nv, ZPOOL_PROP_BCLONERATIO, NULL, brt_get_ratio(spa), src); spa_prop_add_list(nv, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL, ddt_get_ddt_dsize(spa), src); spa_prop_add_list(nv, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); spa_prop_add_list(nv, ZPOOL_PROP_LAST_SCRUBBED_TXG, NULL, spa_get_last_scrubbed_txg(spa), src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_DEFAULT); } else { spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_LOCAL); } spa_prop_add_list(nv, ZPOOL_PROP_LOAD_GUID, NULL, spa_load_guid(spa), src); } if (pool != NULL) { /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. */ if (pool->dp_free_dir != NULL) { spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL, dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, src); } else { spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL, 0, src); } if (pool->dp_leak_dir != NULL) { spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL, dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, src); } else { spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL, 0, src); } } spa_prop_add_list(nv, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { spa_prop_add_list(nv, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_compatibility != NULL) { spa_prop_add_list(nv, ZPOOL_PROP_COMPATIBILITY, spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) spa_prop_add_list(nv, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); } else { spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MAX_SIZE, ZPROP_SRC_NONE); } else { spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MIN_SIZE, ZPROP_SRC_NONE); } if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } } /* * Get zpool property values. */ int spa_prop_get(spa_t *spa, nvlist_t *nv) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t *za; dsl_pool_t *dp; int err = 0; dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); za = zap_attribute_alloc(); mutex_enter(&spa->spa_props_lock); /* * Get properties from the spa config. */ spa_prop_get_config(spa, nv); /* If no pool property object, no more prop to get. */ if (mos == NULL || spa->spa_pool_props_object == 0) goto out; /* * Get properties from the MOS pool property object. */ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { uint64_t intval = 0; char *strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; if ((prop = zpool_name_to_prop(za->za_name)) == ZPOOL_PROP_INVAL && !zfs_prop_user(za->za_name)) continue; switch (za->za_integer_length) { case 8: /* integer property */ if (za->za_first_integer != zpool_prop_default_numeric(prop)) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { dsl_dataset_t *ds = NULL; err = dsl_dataset_hold_obj(dp, za->za_first_integer, FTAG, &ds); if (err != 0) break; strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); } else { strval = NULL; intval = za->za_first_integer; } spa_prop_add_list(nv, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); break; case 1: /* string property */ strval = kmem_alloc(za->za_num_integers, KM_SLEEP); err = zap_lookup(mos, spa->spa_pool_props_object, za->za_name, 1, za->za_num_integers, strval); if (err) { kmem_free(strval, za->za_num_integers); break; } if (prop != ZPOOL_PROP_INVAL) { spa_prop_add_list(nv, prop, strval, 0, src); } else { src = ZPROP_SRC_LOCAL; spa_prop_add_user(nv, za->za_name, strval, src); } kmem_free(strval, za->za_num_integers); break; default: break; } } zap_cursor_fini(&zc); out: mutex_exit(&spa->spa_props_lock); dsl_pool_config_exit(dp, FTAG); zap_attribute_free(za); if (err && err != ENOENT) return (err); return (0); } /* * Validate the given pool properties nvlist and modify the list * for the property values to be set. */ static int spa_prop_validate(spa_t *spa, nvlist_t *props) { nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum = 0; boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { uint64_t intval; const char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); switch (prop) { case ZPOOL_PROP_INVAL: /* * Sanitize the input. */ if (zfs_prop_user(propname)) { if (strlen(propname) >= ZAP_MAXNAMELEN) { error = SET_ERROR(ENAMETOOLONG); break; } if (strlen(fnvpair_value_string(elem)) >= ZAP_MAXVALUELEN) { error = SET_ERROR(E2BIG); break; } } else if (zpool_prop_feature(propname)) { if (nvpair_type(elem) != DATA_TYPE_UINT64) { error = SET_ERROR(EINVAL); break; } if (nvpair_value_uint64(elem, &intval) != 0) { error = SET_ERROR(EINVAL); break; } if (intval != 0) { error = SET_ERROR(EINVAL); break; } fname = strchr(propname, '@') + 1; if (zfeature_lookup_name(fname, NULL) != 0) { error = SET_ERROR(EINVAL); break; } has_feature = B_TRUE; } else { error = SET_ERROR(EINVAL); break; } break; case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < spa_version(spa) || intval > SPA_VERSION_BEFORE_FEATURES || has_feature)) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_DEDUP_TABLE_QUOTA: error = nvpair_value_uint64(elem, &intval); break; case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_AUTOEXPAND: case ZPOOL_PROP_AUTOTRIM: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_MULTIHOST: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); if (!error) { uint32_t hostid = zone_get_hostid(NULL); if (hostid) spa->spa_hostid = hostid; else error = SET_ERROR(ENOTSUP); } break; case ZPOOL_PROP_BOOTFS: /* * If the pool version is less than SPA_VERSION_BOOTFS, * or the pool is still being created (version == 0), * the bootfs property cannot be set. */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = SET_ERROR(ENOTSUP); break; } /* * Make sure the vdev config is bootable */ if (!vdev_is_bootable(spa->spa_root_vdev)) { error = SET_ERROR(ENOTSUP); break; } reset_bootfs = 1; error = nvpair_value_string(elem, &strval); if (!error) { objset_t *os; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( ZPOOL_PROP_BOOTFS); break; } error = dmu_objset_hold(strval, FTAG, &os); if (error != 0) break; /* Must be ZPL. */ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); } dmu_objset_rele(os, FTAG); } break; case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); if (!error && intval > ZIO_FAILURE_MODE_PANIC) error = SET_ERROR(EINVAL); /* * This is a special case which only occurs when * the pool has completely failed. This allows * the user to change the in-core failmode property * without syncing it out to disk (I/Os might * currently be blocked). We do this by returning * EIO to the caller (spa_prop_set) to trick it * into thinking we encountered a property validation * error. */ if (!error && spa_suspended(spa)) { spa->spa_failmode = intval; error = SET_ERROR(EIO); } break; case ZPOOL_PROP_CACHEFILE: if ((error = nvpair_value_string(elem, &strval)) != 0) break; if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { error = SET_ERROR(EINVAL); break; } slash = strrchr(strval, '/'); ASSERT(slash != NULL); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_COMMENT: if ((error = nvpair_value_string(elem, &strval)) != 0) break; for (check = strval; *check != '\0'; check++) { if (!isprint(*check)) { error = SET_ERROR(EINVAL); break; } } if (strlen(strval) > ZPROP_MAX_COMMENT) error = SET_ERROR(E2BIG); break; default: break; } if (error) break; } (void) nvlist_remove_all(props, zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); if (!error) { error = nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); } } return (error); } void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { const char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &cachefile) != 0) return; dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); if (cachefile[0] == '\0') dp->scd_path = spa_strdup(spa_config_path); else if (strcmp(cachefile, "none") == 0) dp->scd_path = NULL; else dp->scd_path = spa_strdup(cachefile); list_insert_head(&spa->spa_config_list, dp); if (need_sync) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; if (prop == ZPOOL_PROP_INVAL && zfs_prop_user(nvpair_name(elem))) { need_sync = B_TRUE; break; } if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { uint64_t ver = 0; if (prop == ZPOOL_PROP_VERSION) { VERIFY(nvpair_value_uint64(elem, &ver) == 0); } else { ASSERT(zpool_prop_feature(nvpair_name(elem))); ver = SPA_VERSION_FEATURES; need_sync = B_TRUE; } /* Save time if the version is already set. */ if (ver == spa_version(spa)) continue; /* * In addition to the pool directory object, we might * create the pool properties object, the features for * read object, the features for write object, or the * feature descriptions object. */ error = dsl_sync_task(spa->spa_name, NULL, spa_sync_version, &ver, 6, ZFS_SPACE_CHECK_RESERVED); if (error) return (error); continue; } need_sync = B_TRUE; break; } if (need_sync) { return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, nvp, 6, ZFS_SPACE_CHECK_RESERVED)); } return (0); } /* * If the bootfs property value is dsobj, clear it. */ void spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) { if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { uint64_t *newguid __maybe_unused = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { int error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (SET_ERROR(error)); } spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); if (vdev_state != VDEV_STATE_HEALTHY) return (SET_ERROR(ENXIO)); ASSERT3U(spa_guid(spa), !=, *newguid); return (0); } static void spa_change_guid_sync(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; uint64_t oldguid; vdev_t *rvd = spa->spa_root_vdev; oldguid = spa_guid(spa); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); rvd->vdev_guid = *newguid; rvd->vdev_guid_sum += (*newguid - oldguid); vdev_config_dirty(rvd); spa_config_exit(spa, SCL_STATE, FTAG); spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", (u_longlong_t)oldguid, (u_longlong_t)*newguid); } /* * Change the GUID for the pool. This is done so that we can later * re-import a pool built from a clone of our own vdevs. We will modify * the root vdev's guid, our own pool guid, and then mark all of our * vdevs dirty. Note that we must make sure that all our vdevs are * online when we do this, or else any vdevs that weren't present * would be orphaned from our pool. We are also going to issue a * sysevent to update any watchers. * * The GUID of the pool will be changed to the value pointed to by guidp. * The GUID may not be set to the reserverd value of 0. * The new GUID will be generated if guidp is NULL. */ int spa_change_guid(spa_t *spa, const uint64_t *guidp) { uint64_t guid; int error; mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); if (guidp != NULL) { guid = *guidp; if (guid == 0) { error = SET_ERROR(EINVAL); goto out; } if (spa_guid_exists(guid, 0)) { error = SET_ERROR(EEXIST); goto out; } } else { guid = spa_generate_guid(NULL); } error = dsl_sync_task(spa->spa_name, spa_change_guid_check, spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { /* * Clear the kobj flag from all the vdevs to allow * vdev_cache_process_kobj_evt() to post events to all the * vdevs since GUID is updated. */ vdev_clear_kobj_evt(spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } out: mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== */ static int spa_error_entry_compare(const void *a, const void *b) { const spa_error_entry_t *sa = (const spa_error_entry_t *)a; const spa_error_entry_t *sb = (const spa_error_entry_t *)b; int ret; ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); return (TREE_ISIGN(ret)); } /* * Utility function which retrieves copies of the current logs and * re-initializes them in the process. */ void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } static void spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t cpus, flags = TASKQ_DYNAMIC; switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >, 0); break; case ZTI_MODE_SYNC: /* * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs, * not to exceed the number of spa allocators, and align to it. */ cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq)); count = MAX(count, (zio_taskq_batch_pct + 99) / 100); count = MIN(count, spa->spa_alloc_count); while (spa->spa_alloc_count % count != 0 && spa->spa_alloc_count < count * 2) count--; /* * zio_taskq_batch_pct is unbounded and may exceed 100%, but no * single taskq may have more threads than 100% of online cpus. */ value = (zio_taskq_batch_pct + count / 2) / count; value = MIN(value, 100); flags |= TASKQ_THREADS_CPU_PCT; break; case ZTI_MODE_SCALE: flags |= TASKQ_THREADS_CPU_PCT; /* * We want more taskqs to reduce lock contention, but we want * less for better request ordering and CPU utilization. */ cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); if (zio_taskq_batch_tpq > 0) { count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / zio_taskq_batch_tpq); } else { /* * Prefer 6 threads per taskq, but no more taskqs * than threads in them on large systems. For 80%: * * taskq taskq total * cpus taskqs percent threads threads * ------- ------- ------- ------- ------- * 1 1 80% 1 1 * 2 1 80% 1 1 * 4 1 80% 3 3 * 8 2 40% 3 6 * 16 3 27% 4 12 * 32 5 16% 5 25 * 64 7 11% 7 49 * 128 10 8% 10 100 * 256 14 6% 15 210 */ count = 1 + cpus / 6; while (count * count > cpus) count--; } /* Limit each taskq within 100% to not trigger assertion. */ count = MAX(count, (zio_taskq_batch_pct + 99) / 100); value = (zio_taskq_batch_pct + count / 2) / count; break; case ZTI_MODE_NULL: tqs->stqs_count = 0; tqs->stqs_taskq = NULL; return; default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " "spa_taskqs_init()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } ASSERT3U(count, >, 0); tqs->stqs_count = count; tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); for (uint_t i = 0; i < count; i++) { taskq_t *tq; char name[32]; if (count > 1) (void) snprintf(name, sizeof (name), "%s_%s_%u", zio_type_name[t], zio_taskq_types[q], i); else (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); #ifdef HAVE_SYSDC if (zio_taskq_sysdc && spa->spa_proc != &p0) { (void) zio_taskq_basedc; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { #endif pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly less important * priority than the other taskqs. * * Under Linux and FreeBSD this means incrementing * the priority value as opposed to platforms like * illumos where it should be decremented. * * On FreeBSD, if priorities divided by four (RQ_PPQ) * are equal then a difference between them is * insignificant. */ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { #if defined(__linux__) pri++; #elif defined(__FreeBSD__) pri += 4; #else #error "unknown OS" #endif } tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); #ifdef HAVE_SYSDC } #endif tqs->stqs_taskq[i] = tq; } } static void spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; if (tqs->stqs_taskq == NULL) { ASSERT3U(tqs->stqs_count, ==, 0); return; } for (uint_t i = 0; i < tqs->stqs_count; i++) { ASSERT3P(tqs->stqs_taskq[i], !=, NULL); taskq_destroy(tqs->stqs_taskq[i]); } kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); tqs->stqs_taskq = NULL; } #ifdef _KERNEL /* * The READ and WRITE rows of zio_taskqs are configurable at module load time * by setting zio_taskq_read or zio_taskq_write. * * Example (the defaults for READ and WRITE) * zio_taskq_read='fixed,1,8 null scale null' * zio_taskq_write='sync null scale null' * * Each sets the entire row at a time. * * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number * of threads per taskq. * * 'null' can only be set on the high-priority queues (queue selection for * high-priority queues will fall back to the regular queue if the high-pri * is NULL. */ static const char *const modes[ZTI_NMODES] = { "fixed", "scale", "sync", "null" }; /* Parse the incoming config string. Modifies cfg */ static int spa_taskq_param_set(zio_type_t t, char *cfg) { int err = 0; zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; char *next = cfg, *tok, *c; /* * Parse out each element from the string and fill `row`. The entire * row has to be set at once, so any errors are flagged by just * breaking out of this loop early. */ uint_t q; for (q = 0; q < ZIO_TASKQ_TYPES; q++) { /* `next` is the start of the config */ if (next == NULL) break; /* Eat up leading space */ while (isspace(*next)) next++; if (*next == '\0') break; /* Mode ends at space or end of string */ tok = next; next = strchr(tok, ' '); if (next != NULL) *next++ = '\0'; /* Parameters start after a comma */ c = strchr(tok, ','); if (c != NULL) *c++ = '\0'; /* Match mode string */ uint_t mode; for (mode = 0; mode < ZTI_NMODES; mode++) if (strcmp(tok, modes[mode]) == 0) break; if (mode == ZTI_NMODES) break; /* Invalid canary */ row[q].zti_mode = ZTI_NMODES; /* Per-mode setup */ switch (mode) { /* * FIXED is parameterised: number of queues, and number of * threads per queue. */ case ZTI_MODE_FIXED: { /* No parameters? */ if (c == NULL || *c == '\0') break; /* Find next parameter */ tok = c; c = strchr(tok, ','); if (c == NULL) break; /* Take digits and convert */ unsigned long long nq; if (!(isdigit(*tok))) break; err = ddi_strtoull(tok, &tok, 10, &nq); /* Must succeed and also end at the next param sep */ if (err != 0 || tok != c) break; /* Move past the comma */ tok++; /* Need another number */ if (!(isdigit(*tok))) break; /* Remember start to make sure we moved */ c = tok; /* Take digits */ unsigned long long ntpq; err = ddi_strtoull(tok, &tok, 10, &ntpq); /* Must succeed, and moved forward */ if (err != 0 || tok == c || *tok != '\0') break; /* * sanity; zero queues/threads make no sense, and * 16K is almost certainly more than anyone will ever * need and avoids silly numbers like UINT32_MAX */ if (nq == 0 || nq >= 16384 || ntpq == 0 || ntpq >= 16384) break; const zio_taskq_info_t zti = ZTI_P(ntpq, nq); row[q] = zti; break; } case ZTI_MODE_SCALE: { const zio_taskq_info_t zti = ZTI_SCALE; row[q] = zti; break; } case ZTI_MODE_SYNC: { const zio_taskq_info_t zti = ZTI_SYNC; row[q] = zti; break; } case ZTI_MODE_NULL: { /* * Can only null the high-priority queues; the general- * purpose ones have to exist. */ if (q != ZIO_TASKQ_ISSUE_HIGH && q != ZIO_TASKQ_INTERRUPT_HIGH) break; const zio_taskq_info_t zti = ZTI_NULL; row[q] = zti; break; } default: break; } /* Ensure we set a mode */ if (row[q].zti_mode == ZTI_NMODES) break; } /* Didn't get a full row, fail */ if (q < ZIO_TASKQ_TYPES) return (SET_ERROR(EINVAL)); /* Eat trailing space */ if (next != NULL) while (isspace(*next)) next++; /* If there's anything left over then fail */ if (next != NULL && *next != '\0') return (SET_ERROR(EINVAL)); /* Success! Copy it into the real config */ for (q = 0; q < ZIO_TASKQ_TYPES; q++) zio_taskqs[t][q] = row[q]; return (0); } static int spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) { int pos = 0; /* Build paramater string from live config */ const char *sep = ""; for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { const zio_taskq_info_t *zti = &zio_taskqs[t][q]; if (zti->zti_mode == ZTI_MODE_FIXED) pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, modes[zti->zti_mode], zti->zti_count, zti->zti_value); else pos += sprintf(&buf[pos], "%s%s", sep, modes[zti->zti_mode]); sep = " "; } if (add_newline) buf[pos++] = '\n'; buf[pos] = '\0'; return (pos); } #ifdef __linux__ static int spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) { char *cfg = kmem_strdup(val); int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); kmem_free(cfg, strlen(val)+1); return (-err); } static int spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) { return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); } static int spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) { char *cfg = kmem_strdup(val); int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); kmem_free(cfg, strlen(val)+1); return (-err); } static int spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) { return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); } #else /* * On FreeBSD load-time parameters can be set up before malloc() is available, * so we have to do all the parsing work on the stack. */ #define SPA_TASKQ_PARAM_MAX (128) static int spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) { char buf[SPA_TASKQ_PARAM_MAX]; int err; (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err || req->newptr == NULL) return (err); return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); } static int spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) { char buf[SPA_TASKQ_PARAM_MAX]; int err; (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err || req->newptr == NULL) return (err); return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); } #endif #endif /* _KERNEL */ /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention * on the taskq itself. */ void spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, zio_t *zio, boolean_t cutinline) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. */ ASSERT(zio); ASSERT(taskq_empty_ent(&zio->io_tqent)); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && ZIO_HAS_ALLOCATOR(zio)) { tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0, &zio->io_tqent); } static void spa_create_zio_taskqs(spa_t *spa) { for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_init(spa, t, q); } } } #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) static void spa_thread(void *arg) { psetid_t zio_taskq_psrset_bind = PS_NONE; callb_cpr_t cprinfo; spa_t *spa = arg; user_t *pu = PTOU(curproc); CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, spa->spa_name); ASSERT(curproc != &p0); (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), "zpool-%s", spa->spa_name); (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); /* bind this thread to the requested psrset */ if (zio_taskq_psrset_bind != PS_NONE) { pool_lock(); mutex_enter(&cpu_lock); mutex_enter(&pidlock); mutex_enter(&curproc->p_lock); if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 0, NULL, NULL) == 0) { curthread->t_bind_pset = zio_taskq_psrset_bind; } else { cmn_err(CE_WARN, "Couldn't bind process for zfs pool \"%s\" to " "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); } mutex_exit(&curproc->p_lock); mutex_exit(&pidlock); mutex_exit(&cpu_lock); pool_unlock(); } #ifdef HAVE_SYSDC if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } #endif spa->spa_proc = curproc; spa->spa_did = curthread->t_did; spa_create_zio_taskqs(spa); mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); spa->spa_proc_state = SPA_PROC_ACTIVE; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_SAFE_BEGIN(&cprinfo); while (spa->spa_proc_state == SPA_PROC_ACTIVE) cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); spa->spa_proc_state = SPA_PROC_GONE; spa->spa_proc = &p0; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ mutex_enter(&curproc->p_lock); lwp_exit(); } #endif extern metaslab_ops_t *metaslab_allocator(spa_t *spa); /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, spa_mode_t mode) { metaslab_ops_t *msp = metaslab_allocator(spa); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_read_spacemaps = spa_mode_readable_spacemaps; spa->spa_normal_class = metaslab_class_create(spa, msp); spa->spa_log_class = metaslab_class_create(spa, msp); spa->spa_embedded_log_class = metaslab_class_create(spa, msp); spa->spa_special_class = metaslab_class_create(spa, msp); spa->spa_dedup_class = metaslab_class_create(spa, msp); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; #ifdef HAVE_SPA_THREAD /* Only create a process if we're going to be around a while. */ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, NULL, 0) == 0) { spa->spa_proc_state = SPA_PROC_CREATED; while (spa->spa_proc_state == SPA_PROC_CREATED) { cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ASSERT(spa->spa_proc != &p0); ASSERT(spa->spa_did != 0); } else { #ifdef _KERNEL cmn_err(CE_WARN, "Couldn't create process for zfs pool \"%s\"\n", spa->spa_name); #endif } } #endif /* HAVE_SPA_THREAD */ mutex_exit(&spa->spa_proc_lock); /* If we didn't create a process, we need to create our taskqs. */ if (spa->spa_proc == &p0) { spa_create_zio_taskqs(spa); } for (size_t i = 0; i < TXG_SIZE; i++) { spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_evicting_os_list, sizeof (objset_t), offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, spa, offsetof(struct vdev, vdev_txg_node)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_healed, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); spa_activate_os(spa); spa_keystore_init(&spa->spa_keystore); /* * This taskq is used to perform zvol-minor-related tasks * asynchronously. This has several advantages, including easy * resolution of various deadlocks. * * The taskq must be single threaded to ensure tasks are always * processed in the order in which they were dispatched. * * A taskq per pool allows one to keep the pools independent. * This way if one pool is suspended, it will not impact another. * * The preferred location to dispatch a zvol minor task is a sync * task. In this context, there is easy access to the spa_t and minimal * error handling is required because the sync task must succeed. */ spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1, INT_MAX, 0); /* * The taskq to preload metaslabs. */ spa->spa_metaslab_taskq = taskq_create("z_metaslab", metaslab_preload_pct, maxclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); /* * Taskq dedicated to prefetcher threads: this is used to prevent the * pool traverse code from monopolizing the global (and limited) * system_taskq by inappropriately scheduling long running tasks on it. */ spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); /* * The taskq to upgrade datasets in this pool. Currently used by * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. */ spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); } /* * Opposite of spa_activate(). */ static void spa_deactivate(spa_t *spa) { ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); spa_evicting_os_wait(spa); if (spa->spa_zvol_taskq) { taskq_destroy(spa->spa_zvol_taskq); spa->spa_zvol_taskq = NULL; } if (spa->spa_metaslab_taskq) { taskq_destroy(spa->spa_metaslab_taskq); spa->spa_metaslab_taskq = NULL; } if (spa->spa_prefetch_taskq) { taskq_destroy(spa->spa_prefetch_taskq); spa->spa_prefetch_taskq = NULL; } if (spa->spa_upgrade_taskq) { taskq_destroy(spa->spa_upgrade_taskq); spa->spa_upgrade_taskq = NULL; } txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_fini(spa, t, q); } } for (size_t i = 0; i < TXG_SIZE; i++) { ASSERT3P(spa->spa_txg_zio[i], !=, NULL); VERIFY0(zio_wait(spa->spa_txg_zio[i])); spa->spa_txg_zio[i] = NULL; } metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; metaslab_class_destroy(spa->spa_embedded_log_class); spa->spa_embedded_log_class = NULL; metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; metaslab_class_destroy(spa->spa_dedup_class); spa->spa_dedup_class = NULL; /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. */ spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); avl_destroy(&spa->spa_errlist_healed); spa_keystore_fini(&spa->spa_keystore); spa->spa_state = POOL_STATE_UNINITIALIZED; mutex_enter(&spa->spa_proc_lock); if (spa->spa_proc_state != SPA_PROC_NONE) { ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); spa->spa_proc_state = SPA_PROC_DEACTIVATE; cv_broadcast(&spa->spa_proc_cv); while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { ASSERT(spa->spa_proc != &p0); cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_GONE); spa->spa_proc_state = SPA_PROC_NONE; } ASSERT(spa->spa_proc == &p0); mutex_exit(&spa->spa_proc_lock); /* * We want to make sure spa_thread() has actually exited the ZFS * module, so that the module can't be unloaded out from underneath * it. */ if (spa->spa_did != 0) { thread_join(spa->spa_did); spa->spa_did = 0; } spa_deactivate_os(spa); } /* * Verify a pool configuration, and construct the vdev tree appropriately. This * will create all the necessary vdevs in the appropriate layout, with each vdev * in the CLOSED state. This will prep the pool before open/creation/import. * All vdev validation is done by the vdev_alloc() routine. */ int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error == ENOENT) return (0); if (error) { vdev_free(*vdp); *vdp = NULL; return (SET_ERROR(EINVAL)); } for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { vdev_free(*vdp); *vdp = NULL; return (error); } } ASSERT(*vdp != NULL); return (0); } static boolean_t spa_should_flush_logs_on_unload(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return (B_FALSE); if (!spa_writeable(spa)) return (B_FALSE); if (!spa->spa_sync_on) return (B_FALSE); if (spa_state(spa) != POOL_STATE_EXPORTED) return (B_FALSE); if (zfs_keep_log_spacemaps_at_export) return (B_FALSE); return (B_TRUE); } /* * Opens a transaction that will set the flag that will instruct * spa_sync to attempt to flush all the metaslabs for that txg. */ static void spa_unload_log_sm_flush_all(spa_t *spa) { dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); ASSERT3U(spa->spa_log_flushall_txg, ==, 0); spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); } static void spa_unload_log_sm_metadata(spa_t *spa) { void *cookie = NULL; spa_log_sm_t *sls; log_summary_entry_t *e; while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie)) != NULL) { VERIFY0(sls->sls_mscount); kmem_free(sls, sizeof (spa_log_sm_t)); } while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { VERIFY0(e->lse_mscount); kmem_free(e, sizeof (log_summary_entry_t)); } spa->spa_unflushed_stats.sus_nblocks = 0; spa->spa_unflushed_stats.sus_memused = 0; spa->spa_unflushed_stats.sus_blocklimit = 0; } static void spa_destroy_aux_threads(spa_t *spa) { if (spa->spa_condense_zthr != NULL) { zthr_destroy(spa->spa_condense_zthr); spa->spa_condense_zthr = NULL; } if (spa->spa_checkpoint_discard_zthr != NULL) { zthr_destroy(spa->spa_checkpoint_discard_zthr); spa->spa_checkpoint_discard_zthr = NULL; } if (spa->spa_livelist_delete_zthr != NULL) { zthr_destroy(spa->spa_livelist_delete_zthr); spa->spa_livelist_delete_zthr = NULL; } if (spa->spa_livelist_condense_zthr != NULL) { zthr_destroy(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = NULL; } if (spa->spa_raidz_expand_zthr != NULL) { zthr_destroy(spa->spa_raidz_expand_zthr); spa->spa_raidz_expand_zthr = NULL; } } /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock) || spa->spa_export_thread == curthread); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_import_progress_remove(spa_guid(spa)); spa_load_note(spa, "UNLOADING"); spa_wake_waiters(spa); /* * If we have set the spa_final_txg, we have already performed the * tasks below in spa_export_common(). We should not redo it here since * we delay the final TXGs beyond what spa_final_txg is set at. */ if (spa->spa_final_txg == UINT64_MAX) { /* * If the log space map feature is enabled and the pool is * getting exported (but not destroyed), we want to spend some * time flushing as many metaslabs as we can in an attempt to * destroy log space maps and save import time. */ if (spa_should_flush_logs_on_unload(spa)) spa_unload_log_sm_flush_all(spa); /* * Stop async tasks. */ spa_async_suspend(spa); if (spa->spa_root_vdev) { vdev_t *root_vdev = spa->spa_root_vdev; vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); l2arc_spa_rebuild_stop(spa); } } /* * Stop syncing. */ if (spa->spa_sync_on) { txg_sync_stop(spa->spa_dsl_pool); spa->spa_sync_on = B_FALSE; } /* * This ensures that there is no async metaslab prefetching * while we attempt to unload the spa. */ taskq_wait(spa->spa_metaslab_taskq); if (spa->spa_mmp.mmp_thread) mmp_thread_stop(spa); /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { for (int i = 0; i < max_ncpus; i++) (void) zio_wait(spa->spa_async_zio_root[i]); kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); spa->spa_async_zio_root = NULL; } if (spa->spa_vdev_removal != NULL) { spa_vdev_removal_destroy(spa->spa_vdev_removal); spa->spa_vdev_removal = NULL; } spa_destroy_aux_threads(spa); spa_condense_fini(spa); bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); /* * Close all vdevs. */ if (spa->spa_root_vdev) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; spa->spa_meta_objset = NULL; } ddt_unload(spa); brt_unload(spa); spa_unload_log_sm_metadata(spa); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); if (spa->spa_spares.sav_vdevs) { for (int i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; } if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; } spa->spa_spares.sav_count = 0; if (spa->spa_l2cache.sav_vdevs) { for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; } spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; spa->spa_indirect_vdevs_loaded = B_FALSE; if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; } if (spa->spa_compatibility != NULL) { spa_strfree(spa->spa_compatibility); spa->spa_compatibility = NULL; } spa->spa_raidz_expand = NULL; spa_config_exit(spa, SCL_ALL, spa); } /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ void spa_load_spares(spa_t *spa) { nvlist_t **spares; uint_t nspares; int i; vdev_t *vd, *tvd; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As spare vdevs are shared among open pools, we skip loading * them when we load the checkpointed state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * First, close and free any existing spare vdevs. */ if (spa->spa_spares.sav_vdevs) { for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL && tvd->vdev_isspare) spa_spare_remove(tvd); vdev_close(vd); vdev_free(vd); } kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); } if (spa->spa_spares.sav_config == NULL) nspares = 0; else VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares)); spa->spa_spares.sav_count = (int)nspares; spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; /* * Construct the array of vdevs, opening them to get status in the * process. For each spare, there is potentially two different vdev_t * structures associated with it: one in the list of spares (used only * for basic validation purposes) and one in the active vdev * configuration (if it's spared in). During this phase we open and * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); spa->spa_spares.sav_vdevs[i] = vd; if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL) { if (!tvd->vdev_isspare) spa_spare_add(tvd); /* * We only mark the spare active if we were successfully * able to load the vdev. Otherwise, importing a pool * with a bad active spare would result in strange * behavior, because multiple pool would think the spare * is actively in use. * * There is a vulnerability here to an equally bizarre * circumstance, where a dead active spare is later * brought back to life (onlined or otherwise). Given * the rarity of this scenario, and the extra complexity * it adds, we ignore the possibility. */ if (!vdev_is_dead(tvd)) spa_spare_activate(tvd); } vd->vdev_top = vd; vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; if (vdev_validate_aux(vd) == 0) spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. */ fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, spa->spa_spares.sav_count); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); } /* * Load (or re-load) the current list of vdevs describing the active l2cache for * this pool. When this is called, we have some form of basic information in * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. * Devices which are already active have their details maintained, and are * not re-opened. */ void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache = NULL; uint_t nl2cache; int i, j, oldnvdevs; uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As L2 caches are part of the ARC which is shared among open * pools, we skip loading them when we load the checkpointed * state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); oldvdevs = sav->sav_vdevs; oldnvdevs = sav->sav_count; sav->sav_vdevs = NULL; sav->sav_count = 0; if (sav->sav_config == NULL) { nl2cache = 0; newvdevs = NULL; goto out; } VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); /* * Process new nvlist of vdevs. */ for (i = 0; i < nl2cache; i++) { guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); newvdevs[i] = NULL; for (j = 0; j < oldnvdevs; j++) { vd = oldvdevs[j]; if (vd != NULL && guid == vd->vdev_guid) { /* * Retain previous vdev for add/remove ops. */ newvdevs[i] = vd; oldvdevs[j] = NULL; break; } } if (newvdevs[i] == NULL) { /* * Create new vdev */ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, VDEV_ALLOC_L2CACHE) == 0); ASSERT(vd != NULL); newvdevs[i] = vd; /* * Commit this vdev as an l2cache device, * even if it fails to open. */ spa_l2cache_add(vd); vd->vdev_top = vd; vd->vdev_aux = sav; spa_l2cache_activate(vd); if (vdev_open(vd) != 0) continue; (void) vdev_validate_aux(vd); if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); /* * Upon cache device addition to a pool or pool * creation with a cache device or if the header * of the device is invalid we issue an async * TRIM command for the whole device which will * execute if l2arc_trim_ahead > 0. */ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } sav->sav_vdevs = newvdevs; sav->sav_count = (int)nl2cache; /* * Recompute the stashed list of l2cache devices, with status * information this time. */ fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); if (sav->sav_count > 0) l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, sav->sav_count); out: /* * Purge vdevs that were dropped */ if (oldvdevs) { for (i = 0; i < oldnvdevs; i++) { uint64_t pool; vd = oldvdevs[i]; if (vd != NULL) { ASSERT(vd->vdev_isl2cache); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); vdev_clear_stats(vd); vdev_free(vd); } } kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); } for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); if (sav->sav_count) kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) { dmu_buf_t *db; char *packed = NULL; size_t nvsize = 0; int error; *value = NULL; error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); if (error) return (error); nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); packed = vmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); vmem_free(packed, nvsize); return (error); } /* * Concrete top-level vdevs that are not missing and are not logs. At every * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. */ static uint64_t spa_healthy_core_tvds(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t tvds = 0; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; if (vd->vdev_islog) continue; if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) tvds++; } return (tvds); } /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. */ static void spa_check_removed(vdev_t *vd) { for (uint64_t c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && vdev_is_concrete(vd)) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); } } static int spa_check_for_missing_logs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; /* * If we're doing a normal import, then build up any additional * diagnostic information about missing log devices. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { nvlist_t **child, *nv; uint64_t idx = 0; child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); nv = fnvlist_alloc(); for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * We consider a device as missing only if it failed * to open (i.e. offline or faulted is not considered * as missing). */ if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { child[idx++] = vdev_config_generate(spa, tvd, B_FALSE, VDEV_CONFIG_MISSING); } } if (idx > 0) { fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, (const nvlist_t * const *)child, idx); fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv); for (uint64_t i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); if (idx > 0) { spa_load_failed(spa, "some log devices are missing"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } } else { for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { spa_set_log_state(spa, SPA_LOG_CLEAR); spa_load_note(spa, "some log devices are " "missing, ZIL is dropped."); vdev_dbgmsg_print_tree(rvd, 2); break; } } } return (0); } /* * Check for missing log devices */ static boolean_t spa_check_logs(spa_t *spa) { boolean_t rv = B_FALSE; dsl_pool_t *dp = spa_get_dsl(spa); switch (spa->spa_log_state) { default: break; case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); break; } return (rv); } /* * Passivate any log vdevs (note, does not apply to embedded log metaslabs). */ static boolean_t spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog) { ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(tvd->vdev_mg); slog_found = B_TRUE; } } return (slog_found); } /* * Activate any log vdevs (note, does not apply to embedded log metaslabs). */ static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog) { ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_activate(tvd->vdev_mg); } } } int spa_reset_logs(spa_t *spa) { int error; error = dmu_objset_find(spa_name(spa), zil_reset, NULL, DS_FIND_CHILDREN); if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed * by zil_sync(). */ txg_wait_synced(spa->spa_dsl_pool, 0); } return (error); } static void spa_aux_check_removed(spa_aux_vdev_t *sav) { for (int i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } void spa_claim_notify(zio_t *zio) { spa_t *spa = zio->io_spa; if (zio->io_error) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp)) spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp); mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { boolean_t sle_verify_data; uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; static void spa_load_verify_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; spa_t *spa = zio->io_spa; abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_inc_64(&sle->sle_meta_count); else atomic_inc_64(&sle->sle_data_count); } mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } /* * Maximum number of inflight bytes is the log2 fraction of the arc size. * By default, we set it to 1/16th of the arc. */ static uint_t spa_load_verify_shift = 4; static int spa_load_verify_metadata = B_TRUE; static int spa_load_verify_data = B_TRUE; static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zio_t *rio = arg; spa_load_error_t *sle = rio->io_private; (void) zilog, (void) dnp; /* * Note: normally this routine will not be called if * spa_load_verify_metadata is not set. However, it may be useful * to manually set the flag after the traversal has begun. */ if (!spa_load_verify_metadata) return (0); /* * Sanity check the block pointer in order to detect obvious damage * before using the contents in subsequent checks or in zio_read(). * When damaged consider it to be a metadata error since we cannot * trust the BP_GET_TYPE and BP_GET_LEVEL values. */ if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { atomic_inc_64(&sle->sle_meta_count); return (0); } if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return (0); if (!BP_IS_METADATA(bp) && (!spa_load_verify_data || !sle->sle_verify_data)) return (0); uint64_t maxinflight_bytes = arc_target_bytes() >> spa_load_verify_shift; size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_bytes >= maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_load_verify_bytes += size; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); return (0); } static int verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { (void) dp, (void) arg; if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; zpool_load_policy_t policy; boolean_t verify_ok = B_FALSE; int error = 0; zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || policy.zlp_maxmeta == UINT64_MAX) return (0); dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); error = dmu_objset_find_dp(spa->spa_dsl_pool, spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, DS_FIND_CHILDREN); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); if (error != 0) return (error); /* * Verify data only if we are rewinding or error limit was set. * Otherwise nothing except dbgmsg care about it to waste time. */ sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || (policy.zlp_maxdata < UINT64_MAX); rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); if (spa_load_verify_metadata) { if (spa->spa_extreme_rewind) { spa_load_note(spa, "performing a complete scan of the " "pool since extreme rewind is on. This may take " "a very long time.\n (spa_load_verify_data=%u, " "spa_load_verify_metadata=%u)", spa_load_verify_data, spa_load_verify_metadata); } error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); } (void) zio_wait(rio); ASSERT0(spa->spa_load_verify_bytes); spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { spa_load_note(spa, "spa_load_verify found %llu metadata errors " "and %llu data errors", (u_longlong_t)sle.sle_meta_count, (u_longlong_t)sle.sle_data_count); } if (spa_load_verify_dryrun || (!error && sle.sle_meta_count <= policy.zlp_maxmeta && sle.sle_data_count <= policy.zlp_maxdata)) { int64_t loss = 0; verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts); fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (spa_load_verify_dryrun) return (0); if (error) { if (error != ENXIO && error != EIO) error = SET_ERROR(EIO); return (error); } return (verify_ok ? 0 : EIO); } /* * Find a value in the pool props object. */ static void spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); } /* * Find a value in the pool directory object. */ static int spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val); if (error != 0 && (error != ENOENT || log_enoent)) { spa_load_failed(spa, "couldn't get '%s' value in MOS directory " "[error=%d]", name, error); } return (error); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); return (SET_ERROR(err)); } boolean_t spa_livelist_delete_check(spa_t *spa) { return (spa->spa_livelists_to_delete != 0); } static boolean_t spa_livelist_delete_cb_check(void *arg, zthr_t *z) { (void) z; spa_t *spa = arg; return (spa_livelist_delete_check(spa)); } static int delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { spa_t *spa = arg; zio_free(spa, tx->tx_txg, bp); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); return (0); } static int dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) { int err; zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); zap_cursor_init(&zc, os, zap_obj); err = zap_cursor_retrieve(&zc, za); zap_cursor_fini(&zc); if (err == 0) *llp = za->za_first_integer; zap_attribute_free(za); return (err); } /* * Components of livelist deletion that must be performed in syncing * context: freeing block pointers and updating the pool-wide data * structures to indicate how much work is left to do */ typedef struct sublist_delete_arg { spa_t *spa; dsl_deadlist_t *ll; uint64_t key; bplist_t *to_free; } sublist_delete_arg_t; static void sublist_delete_sync(void *arg, dmu_tx_t *tx) { sublist_delete_arg_t *sda = arg; spa_t *spa = sda->spa; dsl_deadlist_t *ll = sda->ll; uint64_t key = sda->key; bplist_t *to_free = sda->to_free; bplist_iterate(to_free, delete_blkptr_cb, spa, tx); dsl_deadlist_remove_entry(ll, key, tx); } typedef struct livelist_delete_arg { spa_t *spa; uint64_t ll_obj; uint64_t zap_obj; } livelist_delete_arg_t; static void livelist_delete_sync(void *arg, dmu_tx_t *tx) { livelist_delete_arg_t *lda = arg; spa_t *spa = lda->spa; uint64_t ll_obj = lda->ll_obj; uint64_t zap_obj = lda->zap_obj; objset_t *mos = spa->spa_meta_objset; uint64_t count; /* free the livelist and decrement the feature count */ VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); dsl_deadlist_free(mos, ll_obj, tx); spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); VERIFY0(zap_count(mos, zap_obj, &count)); if (count == 0) { /* no more livelists to delete */ VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, tx)); VERIFY0(zap_destroy(mos, zap_obj, tx)); spa->spa_livelists_to_delete = 0; spa_notify_waiters(spa); } } /* * Load in the value for the livelist to be removed and open it. Then, * load its first sublist and determine which block pointers should actually * be freed. Then, call a synctask which performs the actual frees and updates * the pool-wide livelist data. */ static void spa_livelist_delete_cb(void *arg, zthr_t *z) { spa_t *spa = arg; uint64_t ll_obj = 0, count; objset_t *mos = spa->spa_meta_objset; uint64_t zap_obj = spa->spa_livelists_to_delete; /* * Determine the next livelist to delete. This function should only * be called if there is at least one deleted clone. */ VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); VERIFY0(zap_count(mos, ll_obj, &count)); if (count > 0) { dsl_deadlist_t *ll; dsl_deadlist_entry_t *dle; bplist_t to_free; ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); VERIFY0(dsl_deadlist_open(ll, mos, ll_obj)); dle = dsl_deadlist_first(ll); ASSERT3P(dle, !=, NULL); bplist_create(&to_free); int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, z, NULL); if (err == 0) { sublist_delete_arg_t sync_arg = { .spa = spa, .ll = ll, .key = dle->dle_mintxg, .to_free = &to_free }; zfs_dbgmsg("deleting sublist (id %llu) from" " livelist %llu, %lld remaining", (u_longlong_t)dle->dle_bpobj.bpo_object, (u_longlong_t)ll_obj, (longlong_t)count - 1); VERIFY0(dsl_sync_task(spa_name(spa), NULL, sublist_delete_sync, &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); } else { VERIFY3U(err, ==, EINTR); } bplist_clear(&to_free); bplist_destroy(&to_free); dsl_deadlist_close(ll); kmem_free(ll, sizeof (dsl_deadlist_t)); } else { livelist_delete_arg_t sync_arg = { .spa = spa, .ll_obj = ll_obj, .zap_obj = zap_obj }; zfs_dbgmsg("deletion of livelist %llu completed", (u_longlong_t)ll_obj); VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); } } static void spa_start_livelist_destroy_thread(spa_t *spa) { ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); spa->spa_livelist_delete_zthr = zthr_create("z_livelist_destroy", spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, minclsyspri); } typedef struct livelist_new_arg { bplist_t *allocs; bplist_t *frees; } livelist_new_arg_t; static int livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(tx == NULL); livelist_new_arg_t *lna = arg; if (bp_freed) { bplist_append(lna->frees, bp); } else { bplist_append(lna->allocs, bp); zfs_livelist_condense_new_alloc++; } return (0); } typedef struct livelist_condense_arg { spa_t *spa; bplist_t to_keep; uint64_t first_size; uint64_t next_size; } livelist_condense_arg_t; static void spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) { livelist_condense_arg_t *lca = arg; spa_t *spa = lca->spa; bplist_t new_frees; dsl_dataset_t *ds = spa->spa_to_condense.ds; /* Have we been cancelled? */ if (spa->spa_to_condense.cancelled) { zfs_livelist_condense_sync_cancel++; goto out; } dsl_deadlist_entry_t *first = spa->spa_to_condense.first; dsl_deadlist_entry_t *next = spa->spa_to_condense.next; dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; /* * It's possible that the livelist was changed while the zthr was * running. Therefore, we need to check for new blkptrs in the two * entries being condensed and continue to track them in the livelist. * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), * it's possible that the newly added blkptrs are FREEs or ALLOCs so * we need to sort them into two different bplists. */ uint64_t first_obj = first->dle_bpobj.bpo_object; uint64_t next_obj = next->dle_bpobj.bpo_object; uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; bplist_create(&new_frees); livelist_new_arg_t new_bps = { .allocs = &lca->to_keep, .frees = &new_frees, }; if (cur_first_size > lca->first_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, livelist_track_new_cb, &new_bps, lca->first_size)); } if (cur_next_size > lca->next_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, livelist_track_new_cb, &new_bps, lca->next_size)); } dsl_deadlist_clear_entry(first, ll, tx); ASSERT(bpobj_is_empty(&first->dle_bpobj)); dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); bplist_destroy(&new_frees); char dsname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, dsname); zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, (u_longlong_t)cur_next_size, (u_longlong_t)first->dle_bpobj.bpo_object, (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); out: dmu_buf_rele(ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); spa->spa_to_condense.syncing = B_FALSE; } static void spa_livelist_condense_cb(void *arg, zthr_t *t) { while (zfs_livelist_condense_zthr_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); spa_t *spa = arg; dsl_deadlist_entry_t *first = spa->spa_to_condense.first; dsl_deadlist_entry_t *next = spa->spa_to_condense.next; uint64_t first_size, next_size; livelist_condense_arg_t *lca = kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); bplist_create(&lca->to_keep); /* * Process the livelists (matching FREEs and ALLOCs) in open context * so we have minimal work in syncing context to condense. * * We save bpobj sizes (first_size and next_size) to use later in * syncing context to determine if entries were added to these sublists * while in open context. This is possible because the clone is still * active and open for normal writes and we want to make sure the new, * unprocessed blockpointers are inserted into the livelist normally. * * Note that dsl_process_sub_livelist() both stores the size number of * blockpointers and iterates over them while the bpobj's lock held, so * the sizes returned to us are consistent which what was actually * processed. */ int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, &first_size); if (err == 0) err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, t, &next_size); if (err == 0) { while (zfs_livelist_condense_sync_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_mark_netfree(tx); dmu_tx_hold_space(tx, 1); err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); if (err == 0) { /* * Prevent the condense zthr restarting before * the synctask completes. */ spa->spa_to_condense.syncing = B_TRUE; lca->spa = spa; lca->first_size = first_size; lca->next_size = next_size; dsl_sync_task_nowait(spa_get_dsl(spa), spa_livelist_condense_sync, lca, tx); dmu_tx_commit(tx); return; } } /* * Condensing can not continue: either it was externally stopped or * we were unable to assign to a tx because the pool has run out of * space. In the second case, we'll just end up trying to condense * again in a later txg. */ ASSERT(err != 0); bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; if (err == EINTR) zfs_livelist_condense_zthr_cancel++; } /* * Check that there is something to condense but that a condense is not * already in progress and that condensing has not been cancelled. */ static boolean_t spa_livelist_condense_cb_check(void *arg, zthr_t *z) { (void) z; spa_t *spa = arg; if ((spa->spa_to_condense.ds != NULL) && (spa->spa_to_condense.syncing == B_FALSE) && (spa->spa_to_condense.cancelled == B_FALSE)) { return (B_TRUE); } return (B_FALSE); } static void spa_start_livelist_condensing_thread(spa_t *spa) { spa->spa_to_condense.ds = NULL; spa->spa_to_condense.first = NULL; spa->spa_to_condense.next = NULL; spa->spa_to_condense.syncing = B_FALSE; spa->spa_to_condense.cancelled = B_FALSE; ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); spa->spa_livelist_condense_zthr = zthr_create("z_livelist_condense", spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa, minclsyspri); } static void spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); spa_start_raidz_expansion_thread(spa); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); spa->spa_checkpoint_discard_zthr = zthr_create("z_checkpoint_discard", spa_checkpoint_discard_thread_check, spa_checkpoint_discard_thread, spa, minclsyspri); } /* * Fix up config after a partly-completed split. This is done with the * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off * pool have that entry in their config, but only the splitting one contains * a list of all the guids of the vdevs that are being split off. * * This function determines what to do with that list: either rejoin * all the disks to the pool, or complete the splitting process. To attempt * the rejoin, each disk that is offlined is marked online again, and * we do a reopen() call. If the vdev label for every disk that was * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) * then we call vdev_split() on each disk, and complete the split. * * Otherwise we leave the config alone, with all the vdevs in place in * the original pool. */ static void spa_try_repair(spa_t *spa, nvlist_t *config) { uint_t extracted; uint64_t *glist; uint_t i, gcount; nvlist_t *nvl; vdev_t **vd; boolean_t attempt_reopen; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) return; /* check that the config is complete */ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, &glist, &gcount) != 0) return; vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* attempt to online all the vdevs & validate */ attempt_reopen = B_TRUE; for (i = 0; i < gcount; i++) { if (glist[i] == 0) /* vdev is hole */ continue; vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); if (vd[i] == NULL) { /* * Don't bother attempting to reopen the disks; * just do the split. */ attempt_reopen = B_FALSE; } else { /* attempt to re-online it */ vd[i]->vdev_offline = B_FALSE; } } if (attempt_reopen) { vdev_reopen(spa->spa_root_vdev); /* check each device to see what state it's in */ for (extracted = 0, i = 0; i < gcount; i++) { if (vd[i] != NULL && vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) break; ++extracted; } } /* * If every disk has been moved to the new pool, or if we never * even attempted to look at them, then we split them off for * good. */ if (!attempt_reopen || gcount == extracted) { for (i = 0; i < gcount; i++) if (vd[i] != NULL) vdev_split(vd[i]); vdev_reopen(spa->spa_root_vdev); } kmem_free(vd, gcount * sizeof (vdev_t *)); } static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) { const char *ereport = FM_EREPORT_ZFS_POOL; int error; spa->spa_load_state = state; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); spa_import_progress_set_notes(spa, "spa_load()"); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { (void) zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); return (error); } #ifdef ZFS_DEBUG /* * Count the number of per-vdev ZAPs associated with all of the vdevs in the * vdev tree rooted in the given vd, and ensure that each ZAP is present in the * spa's per-vdev ZAP list. */ static uint64_t vdev_count_verify_zaps(vdev_t *vd) { spa_t *spa = vd->vdev_spa; uint64_t total = 0; if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && vd->vdev_root_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_root_zap)); } if (vd->vdev_top_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_top_zap)); } if (vd->vdev_leaf_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { total += vdev_count_verify_zaps(vd->vdev_child[i]); } return (total); } #else #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) #endif /* * Determine whether the activity check is required. */ static boolean_t spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, nvlist_t *config) { uint64_t state = 0; uint64_t hostid = 0; uint64_t tryconfig_txg = 0; uint64_t tryconfig_timestamp = 0; uint16_t tryconfig_mmp_seq = 0; nvlist_t *nvinfo; if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, &tryconfig_txg); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, &tryconfig_timestamp); (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, &tryconfig_mmp_seq); } (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); /* * Disable the MMP activity check - This is used by zdb which * is intended to be used on potentially active pools. */ if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) return (B_FALSE); /* * Skip the activity check when the MMP feature is disabled. */ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) return (B_FALSE); /* * If the tryconfig_ values are nonzero, they are the results of an * earlier tryimport. If they all match the uberblock we just found, * then the pool has not changed and we return false so we do not test * a second time. */ if (tryconfig_txg && tryconfig_txg == ub->ub_txg && tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && tryconfig_mmp_seq && tryconfig_mmp_seq == (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) return (B_FALSE); /* * Allow the activity check to be skipped when importing the pool * on the same host which last imported it. Since the hostid from * configuration may be stale use the one read from the label. */ if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); if (hostid == spa_get_hostid(spa)) return (B_FALSE); /* * Skip the activity test when the pool was cleanly exported. */ if (state != POOL_STATE_ACTIVE) return (B_FALSE); return (B_TRUE); } /* * Nanoseconds the activity check must watch for changes on-disk. */ static uint64_t spa_activity_check_duration(spa_t *spa, uberblock_t *ub) { uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); uint64_t multihost_interval = MSEC2NSEC( MMP_INTERVAL_OK(zfs_multihost_interval)); uint64_t import_delay = MAX(NANOSEC, import_intervals * multihost_interval); /* * Local tunables determine a minimum duration except for the case * where we know when the remote host will suspend the pool if MMP * writes do not land. * * See Big Theory comment at the top of mmp.c for the reasoning behind * these cases and times. */ ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) > 0) { /* MMP on remote host will suspend pool after failed writes */ import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * MMP_IMPORT_SAFETY_FACTOR / 100; zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " "mmp_fails=%llu ub_mmp mmp_interval=%llu " "import_intervals=%llu", (u_longlong_t)import_delay, (u_longlong_t)MMP_FAIL_INT(ub), (u_longlong_t)MMP_INTERVAL(ub), (u_longlong_t)import_intervals); } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) == 0) { /* MMP on remote host will never suspend pool */ import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " "mmp_interval=%llu ub_mmp_delay=%llu " "import_intervals=%llu", (u_longlong_t)import_delay, (u_longlong_t)MMP_INTERVAL(ub), (u_longlong_t)ub->ub_mmp_delay, (u_longlong_t)import_intervals); } else if (MMP_VALID(ub)) { /* * zfs-0.7 compatibility case */ import_delay = MAX(import_delay, (multihost_interval + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " "import_intervals=%llu leaves=%u", (u_longlong_t)import_delay, (u_longlong_t)ub->ub_mmp_delay, (u_longlong_t)import_intervals, vdev_count_leaves(spa)); } else { /* Using local tunings is the only reasonable option */ zfs_dbgmsg("pool last imported on non-MMP aware " "host using import_delay=%llu multihost_interval=%llu " "import_intervals=%llu", (u_longlong_t)import_delay, (u_longlong_t)multihost_interval, (u_longlong_t)import_intervals); } return (import_delay); } /* * Remote host activity check. * * error results: * 0 - no activity detected * EREMOTEIO - remote activity detected * EINTR - user canceled the operation */ static int spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, boolean_t importing) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; uint64_t mmp_config = ub->ub_mmp_config; uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; uint64_t import_delay; hrtime_t import_expire, now; nvlist_t *mmp_label = NULL; vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; kmutex_t mtx; int error = 0; cv_init(&cv, NULL, CV_DEFAULT, NULL); mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); mutex_enter(&mtx); /* * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed * during the earlier tryimport. If the txg recorded there is 0 then * the pool is known to be active on another host. * * Otherwise, the pool might be in use on another host. Check for * changes in the uberblocks on disk if necessary. */ if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { vdev_uberblock_load(rvd, ub, &mmp_label); error = SET_ERROR(EREMOTEIO); goto out; } } import_delay = spa_activity_check_duration(spa, ub); /* Add a small random factor in case of simultaneous imports (0-25%) */ import_delay += import_delay * random_in_range(250) / 1000; import_expire = gethrtime() + import_delay; if (importing) { spa_import_progress_set_notes(spa, "Checking MMP activity, " "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); } int iterations = 0; while ((now = gethrtime()) < import_expire) { if (importing && iterations++ % 30 == 0) { spa_import_progress_set_notes(spa, "Checking MMP " "activity, %llu ms remaining", (u_longlong_t)NSEC2MSEC(import_expire - now)); } if (importing) { (void) spa_import_progress_set_mmp_check(spa_guid(spa), NSEC2SEC(import_expire - gethrtime())); } vdev_uberblock_load(rvd, ub, &mmp_label); if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { zfs_dbgmsg("multihost activity detected " "txg %llu ub_txg %llu " "timestamp %llu ub_timestamp %llu " "mmp_config %#llx ub_mmp_config %#llx", (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, (u_longlong_t)timestamp, (u_longlong_t)ub->ub_timestamp, (u_longlong_t)mmp_config, (u_longlong_t)ub->ub_mmp_config); error = SET_ERROR(EREMOTEIO); break; } if (mmp_label) { nvlist_free(mmp_label); mmp_label = NULL; } error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); if (error != -1) { error = SET_ERROR(EINTR); break; } error = 0; } out: mutex_exit(&mtx); mutex_destroy(&mtx); cv_destroy(&cv); /* * If the pool is determined to be active store the status in the * spa->spa_load_info nvlist. If the remote hostname or hostid are * available from configuration read from disk store them as well. * This allows 'zpool import' to generate a more useful message. * * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool */ if (error == EREMOTEIO) { const char *hostname = ""; uint64_t hostid = 0; if (mmp_label) { if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { hostname = fnvlist_lookup_string(mmp_label, ZPOOL_CONFIG_HOSTNAME); fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTNAME, hostname); } if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { hostid = fnvlist_lookup_uint64(mmp_label, ZPOOL_CONFIG_HOSTID); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTID, hostid); } } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, 0); error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); } if (mmp_label) nvlist_free(mmp_label); return (error); } /* * Called from zfs_ioc_clear for a pool that was suspended * after failing mmp write checks. */ boolean_t spa_mmp_remote_host_activity(spa_t *spa) { ASSERT(spa_multihost(spa) && spa_suspended(spa)); nvlist_t *best_label; uberblock_t best_ub; /* * Locate the best uberblock on disk */ vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); if (best_label) { /* * confirm that the best hostid matches our hostid */ if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && spa_get_hostid(spa) != fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { nvlist_free(best_label); return (B_TRUE); } nvlist_free(best_label); } else { return (B_TRUE); } if (!MMP_VALID(&best_ub) || !MMP_FAIL_INT_VALID(&best_ub) || MMP_FAIL_INT(&best_ub) == 0) { return (B_TRUE); } if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { zfs_dbgmsg("txg mismatch detected during pool clear " "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", (u_longlong_t)spa->spa_uberblock.ub_txg, (u_longlong_t)best_ub.ub_txg, (u_longlong_t)spa->spa_uberblock.ub_timestamp, (u_longlong_t)best_ub.ub_timestamp); return (B_TRUE); } /* * Perform an activity check looking for any remote writer */ return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, B_FALSE) != 0); } static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { uint64_t hostid; const char *hostname; uint64_t myhostid = 0; if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { hostname = fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME); myhostid = zone_get_hostid(NULL); if (hostid != 0 && myhostid != 0 && hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%llx). " "See: https://openzfs.github.io/openzfs-docs/msg/" "ZFS-8000-EY", spa_name(spa), hostname, (u_longlong_t)hostid); spa_load_failed(spa, "hostid verification failed: pool " "last accessed by host: %s (hostid: 0x%llx)", hostname, (u_longlong_t)hostid); return (SET_ERROR(EBADF)); } } return (0); } static int spa_ld_parse_config(spa_t *spa, spa_import_type_t type) { int error = 0; nvlist_t *nvtree, *nvl, *config = spa->spa_config; int parse; vdev_t *rvd; uint64_t pool_guid; const char *comment; const char *compatibility; /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_POOL_GUID); return (SET_ERROR(EINVAL)); } /* * If we are doing an import, ensure that the pool is not already * imported by checking if its pool guid already exists in the * spa namespace. * * The only case that we allow an already imported pool to be * imported again, is when the pool is checkpointed and we want to * look at its checkpointed state from userland tools like zdb. */ #ifdef _KERNEL if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { #else if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0) && !spa_importing_readonly_checkpoint(spa)) { #endif spa_load_failed(spa, "a pool with guid %llu is already open", (u_longlong_t)pool_guid); return (SET_ERROR(EEXIST)); } spa->spa_config_guid = pool_guid; nvlist_free(spa->spa_load_info); spa->spa_load_info = fnvlist_alloc(); ASSERT(spa->spa_comment == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) spa->spa_comment = spa_strdup(comment); ASSERT(spa->spa_compatibility == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, &compatibility) == 0) spa->spa_compatibility = spa_strdup(compatibility); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) spa->spa_config_splitting = fnvlist_dup(nvl); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_VDEV_TREE); return (SET_ERROR(EINVAL)); } /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Parse the configuration into a vdev tree. We explicitly set the * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); parse = (type == SPA_IMPORT_EXISTING ? VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "unable to parse config [error=%d]", error); return (error); } ASSERT(spa->spa_root_vdev == rvd); ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); if (type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_guid(spa) == pool_guid); } return (0); } /* * Recursively open all vdevs in the vdev tree. This function is called twice: * first with the untrusted config, then with the trusted config. */ static int spa_ld_open_vdevs(spa_t *spa) { int error = 0; /* * spa_missing_tvds_allowed defines how many top-level vdevs can be * missing/unopenable for the root vdev to be still considered openable. */ if (spa->spa_trust_config) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; } else { spa->spa_missing_tvds_allowed = 0; } spa->spa_missing_tvds_allowed = MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "vdev tree has %lld missing top-level " "vdevs.", (u_longlong_t)spa->spa_missing_tvds); if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { /* * Although theoretically we could allow users to open * incomplete pools in RW mode, we'd need to add a lot * of extra logic (e.g. adjust pool space to account * for missing vdevs). * This limitation also prevents users from accidentally * opening the pool in RW mode during data recovery and * damaging it further. */ spa_load_note(spa, "pools with missing top-level " "vdevs can only be opened in read-only mode."); error = SET_ERROR(ENXIO); } else { spa_load_note(spa, "current settings allow for maximum " "%lld missing top-level vdevs at this stage.", (u_longlong_t)spa->spa_missing_tvds_allowed); } } if (error != 0) { spa_load_failed(spa, "unable to open vdev tree [error=%d]", error); } if (spa->spa_missing_tvds != 0 || error != 0) vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); return (error); } /* * We need to validate the vdev labels against the configuration that * we have in hand. This function is called twice: first with an untrusted * config, then with a trusted config. The validation is more strict when the * config is trusted. */ static int spa_ld_validate_vdevs(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "vdev_validate failed [error=%d]", error); return (error); } if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { spa_load_failed(spa, "cannot open vdev tree after invalidating " "some vdevs"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } return (0); } static void spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) { spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_verify_min_txg = spa->spa_extreme_rewind ? TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; } static int spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) { vdev_t *rvd = spa->spa_root_vdev; nvlist_t *label; uberblock_t *ub = &spa->spa_uberblock; boolean_t activity_check = B_FALSE; /* * If we are opening the checkpointed state of the pool by * rewinding to it, at this point we will have written the * checkpointed uberblock to the vdev labels, so searching * the labels will find the right uberblock. However, if * we are opening the checkpointed state read-only, we have * not modified the labels. Therefore, we must ignore the * labels and continue using the spa_uberblock that was set * by spa_ld_checkpoint_rewind. * * Note that it would be fine to ignore the labels when * rewinding (opening writeable) as well. However, if we * crash just after writing the labels, we will end up * searching the labels. Doing so in the common case means * that this code path gets exercised normally, rather than * just in the edge case. */ if (ub->ub_checkpoint_txg != 0 && spa_importing_readonly_checkpoint(spa)) { spa_ld_select_uberblock_done(spa, ub); return (0); } /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ if (ub->ub_txg == 0) { nvlist_free(label); spa_load_failed(spa, "no valid uberblock found"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (spa->spa_load_max_txg != UINT64_MAX) { (void) spa_import_progress_set_max_txg(spa_guid(spa), (u_longlong_t)spa->spa_load_max_txg); } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); if (ub->ub_raidz_reflow_info != 0) { spa_load_note(spa, "uberblock raidz_reflow_info: " "state=%u offset=%llu", (int)RRSS_GET_STATE(ub), (u_longlong_t)RRSS_GET_OFFSET(ub)); } /* * For pools which have the multihost property on determine if the * pool is truly inactive and can be safely imported. Prevent * hosts which don't have a hostid set from importing the pool. */ activity_check = spa_activity_check_required(spa, ub, label, spa->spa_config); if (activity_check) { if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && spa_get_hostid(spa) == 0) { nvlist_free(label); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } int error = spa_activity_check(spa, ub, spa->spa_config, B_TRUE); if (error) { nvlist_free(label); return (error); } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); fnvlist_add_uint16(spa->spa_load_info, ZPOOL_CONFIG_MMP_SEQ, (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); } /* * If the pool has an unsupported version we can't open it. */ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { nvlist_free(label); spa_load_failed(spa, "version %llu is not supported", (u_longlong_t)ub->ub_version); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); } if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *features; /* * If we weren't able to find what's necessary for reading the * MOS in the label, return failure. */ if (label == NULL) { spa_load_failed(spa, "label config unavailable"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { nvlist_free(label); spa_load_failed(spa, "invalid label: '%s' missing", ZPOOL_CONFIG_FEATURES_FOR_READ); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * Update our in-core representation with the definitive values * from the label. */ nvlist_free(spa->spa_label_features); spa->spa_label_features = fnvlist_dup(features); } nvlist_free(label); /* * Look through entries in the label nvlist's features_for_read. If * there is a feature listed there which we don't understand then we * cannot open a pool. */ if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *unsup_feat; unsup_feat = fnvlist_alloc(); for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { if (!zfeature_is_supported(nvpair_name(nvp))) { fnvlist_add_string(unsup_feat, nvpair_name(nvp), ""); } } if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); nvlist_free(unsup_feat); spa_load_failed(spa, "some features are unsupported"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } nvlist_free(unsup_feat); } if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_try_repair(spa, spa->spa_config); spa_config_exit(spa, SCL_ALL, FTAG); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; } /* * Initialize internal SPA structures. */ spa_ld_select_uberblock_done(spa, ub); return (0); } static int spa_ld_open_rootbp(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error != 0) { spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; return (0); } static int spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t reloading) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; nvlist_t *nv, *mos_config, *policy; int error = 0, copy_error; uint64_t healthy_tvds, healthy_tvds_mos; uint64_t mos_config_txg; if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * If we're assembling a pool from a split, the config provided is * already trusted so there is nothing to do. */ if (type == SPA_IMPORT_ASSEMBLE) return (0); healthy_tvds = spa_healthy_core_tvds(spa); if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * If we are doing an open, pool owner wasn't verified yet, thus do * the verification here. */ if (spa->spa_load_state == SPA_LOAD_OPEN) { error = spa_verify_host(spa, mos_config); if (error != 0) { nvlist_free(mos_config); return (error); } } nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Build a new vdev tree from the trusted config */ error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); if (error != 0) { nvlist_free(mos_config); spa_config_exit(spa, SCL_ALL, FTAG); spa_load_failed(spa, "spa_config_parse failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } /* * Vdev paths in the MOS may be obsolete. If the untrusted config was * obtained by scanning /dev/dsk, then it will have the right vdev * paths. We update the trusted MOS config with this information. * We first try to copy the paths with vdev_copy_path_strict, which * succeeds only when both configs have exactly the same vdev tree. * If that fails, we fall back to a more flexible method that has a * best effort policy. */ copy_error = vdev_copy_path_strict(rvd, mrvd); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "provided vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); spa_load_note(spa, "MOS vdev tree:"); vdev_dbgmsg_print_tree(mrvd, 2); } if (copy_error != 0) { spa_load_note(spa, "vdev_copy_path_strict failed, falling " "back to vdev_copy_path_relaxed"); vdev_copy_path_relaxed(rvd, mrvd); } vdev_close(rvd); vdev_free(rvd); spa->spa_root_vdev = mrvd; rvd = mrvd; spa_config_exit(spa, SCL_ALL, FTAG); /* * If 'zpool import' used a cached config, then the on-disk hostid and * hostname may be different to the cached config in ways that should * prevent import. Userspace can't discover this without a scan, but * we know, so we add these values to LOAD_INFO so the caller can know * the difference. * * Note that we have to do this before the config is regenerated, * because the new config will have the hostid and hostname for this * host, in readiness for import. */ if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to * pass settings on how to load the pool and is not stored in the MOS. * We copy it over to our new, trusted config. */ mos_config_txg = fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_POOL_TXG); nvlist_free(mos_config); mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, &policy) == 0) fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); spa_config_set(spa, mos_config); spa->spa_config_source = SPA_CONFIG_SRC_MOS; /* * Now that we got the config from the MOS, we should be more strict * in checking blkptrs and can make assumptions about the consistency * of the vdev tree. spa_trust_config must be set to true before opening * vdevs in order for them to be writeable. */ spa->spa_trust_config = B_TRUE; /* * Open and validate the new vdev tree */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "final vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); } if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { /* * Sanity check to make sure that we are indeed loading the * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds * in the config provided and they happened to be the only ones * to have the latest uberblock, we could involuntarily perform * an extreme rewind. */ healthy_tvds_mos = spa_healthy_core_tvds(spa); if (healthy_tvds_mos - healthy_tvds >= SPA_SYNC_MIN_VDEVS) { spa_load_note(spa, "config provided misses too many " "top-level vdevs compared to MOS (%lld vs %lld). ", (u_longlong_t)healthy_tvds, (u_longlong_t)healthy_tvds_mos); spa_load_note(spa, "vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); if (reloading) { spa_load_failed(spa, "config was already " "provided from MOS. Aborting."); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_load_note(spa, "spa must be reloaded using MOS " "config"); return (SET_ERROR(EAGAIN)); } } error = spa_check_for_missing_logs(spa); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { spa_load_failed(spa, "uberblock guid sum doesn't match MOS " "guid sum (%llu != %llu)", (u_longlong_t)spa->spa_uberblock.ub_guid_sum, (u_longlong_t)rvd->vdev_guid_sum); return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); } return (0); } static int spa_ld_open_indirect_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * Everything that we read before spa_remove_init() must be stored * on concreted vdevs. Therefore we do this as early as possible. */ error = spa_remove_init(spa); if (error != 0) { spa_load_failed(spa, "spa_remove_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Retrieve information needed to condense indirect vdev mappings. */ error = spa_condense_init(spa); if (error != 0) { spa_load_failed(spa, "spa_condense_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } return (0); } static int spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, &spa->spa_feat_for_write_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, &spa->spa_feat_desc_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } enabled_feat = fnvlist_alloc(); unsup_feat = fnvlist_alloc(); if (!spa_features_check(spa, B_FALSE, unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; if (spa_writeable(spa) || spa->spa_load_state == SPA_LOAD_TRYIMPORT) { if (!spa_features_check(spa, B_TRUE, unsup_feat, enabled_feat)) { *missing_feat_writep = B_TRUE; } } fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); } fnvlist_free(enabled_feat); fnvlist_free(unsup_feat); if (!missing_feat_read) { fnvlist_add_boolean(spa->spa_load_info, ZPOOL_CONFIG_CAN_RDONLY); } /* * If the state is SPA_LOAD_TRYIMPORT, our objective is * twofold: to determine whether the pool is available for * import in read-write mode and (if it is not) whether the * pool is available for import in read-only mode. If the pool * is available for import in read-write mode, it is displayed * as available in userland; if it is not available for import * in read-only mode, it is displayed as unavailable in * userland. If the pool is available for import in read-only * mode but not read-write mode, it is displayed as unavailable * in userland with a special note that the pool is actually * available for open in read-only mode. * * As a result, if the state is SPA_LOAD_TRYIMPORT and we are * missing a feature for write, we must first determine whether * the pool can be opened read-only before returning to * userland in order to know whether to display the * abovementioned note. */ if (missing_feat_read || (*missing_feat_writep && spa_writeable(spa))) { spa_load_failed(spa, "pool uses unsupported features"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Load refcounts for ZFS features from disk into an in-memory * cache during SPA initialization. */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { uint64_t refcount; error = feature_get_refcount_from_disk(spa, &spa_feature_table[i], &refcount); if (error == 0) { spa->spa_feat_refcount_cache[i] = refcount; } else if (error == ENOTSUP) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } else { spa_load_failed(spa, "error getting refcount " "for feature %s [error=%d]", spa_feature_table[i].fi_guid, error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } } } if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Encryption was added before bookmark_v2, even though bookmark_v2 * is now a dependency. If this pool has encryption enabled without * bookmark_v2, trigger an errata message. */ if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; } return (0); } static int spa_ld_load_special_directories(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa->spa_is_initializing = B_TRUE; error = dsl_pool_open(spa->spa_dsl_pool); spa->spa_is_initializing = B_FALSE; if (error != 0) { spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_get_props(spa_t *spa) { int error = 0; uint64_t obj; vdev_t *rvd = spa->spa_root_vdev; /* Grab the checksum salt from the MOS. */ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes); if (error == ENOENT) { /* Generate a new salt for subsequent use */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); } else if (error != 0) { spa_load_failed(spa, "unable to retrieve checksum salt from " "MOS [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); if (error != 0) { spa_load_failed(spa, "error opening deferred-frees bpobj " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, &spa->spa_creation_version, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, &spa->spa_errlog_scrub, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* Load the last scrubbed txg. */ error = spa_dir_prop(spa, DMU_POOL_LAST_SCRUBBED_TXG, &spa->spa_scrubbed_last_txg, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the livelist deletion field. If a livelist is queued for * deletion, indicate that in the spa */ error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, &spa->spa_livelists_to_delete, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. */ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the per-vdev ZAP map. If we have an older pool, this will not * be present; in this case, defer its creation to a later time to * avoid dirtying the MOS this early / out of sync context. See * spa_sync_config_object. */ /* The sentinel is only available in the MOS config. */ nvlist_t *mos_config; if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, &spa->spa_all_vdev_zaps, B_FALSE); if (error == ENOENT) { VERIFY(!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); spa->spa_avz_action = AVZ_ACTION_INITIALIZE; ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } else if (error != 0) { nvlist_free(mos_config); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { /* * An older version of ZFS overwrote the sentinel value, so * we have orphaned per-vdev ZAPs in the MOS. Defer their * destruction to later; see spa_sync_config_object. */ spa->spa_avz_action = AVZ_ACTION_DESTROY; /* * We're assuming that no vdevs have had their ZAPs created * before this. Better be sure of it. */ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } nvlist_free(mos_config); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, B_FALSE); if (error && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { uint64_t autoreplace = 0; spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA, &spa->spa_dedup_table_quota); spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); spa->spa_autoreplace = (autoreplace != 0); } /* * If we are importing a pool with missing top-level vdevs, * we enforce that the pool doesn't panic or get suspended on * error since the likelihood of missing data is extremely high. */ if (spa->spa_missing_tvds > 0 && spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_load_note(spa, "forcing failmode to 'continue' " "as some top level vdevs are missing"); spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; } return (0); } static int spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache * devices. */ /* * Load any hot spares for this pool. */ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, &spa->spa_spares.sav_config) != 0) { spa_load_failed(spa, "error loading spares nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. */ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, &spa->spa_l2cache.sav_config) != 0) { spa_load_failed(spa, "error loading l2cache nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_l2cache.sav_sync = B_TRUE; } return (0); } static int spa_ld_load_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If the 'multihost' property is set, then never allow a pool to * be imported when the system hostid is zero. The exception to * this rule is zdb which is always allowed to access pools. */ if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } /* * If the 'autoreplace' property is set, then post a resource notifying * the ZFS DE that it should not issue any faults for unopenable * devices. We also iterate over the vdevs, and post a sysevent for any * unopenable vdevs so that the normal autoreplace handler can take * over. */ if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); /* * For the import case, this is done in spa_import(), because * at this point we're using the spare definitions from * the MOS config, not necessarily from the userland config. */ if (spa->spa_load_state != SPA_LOAD_IMPORT) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } } /* * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. */ error = vdev_load(rvd); if (error != 0) { spa_load_failed(spa, "vdev_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } error = spa_ld_log_spacemaps(spa); if (error != 0) { spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } /* * Propagate the leaf DTLs we just loaded all the way up the vdev tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); return (0); } static int spa_ld_load_dedup_tables(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = ddt_load(spa); if (error != 0) { spa_load_failed(spa, "ddt_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_load_brt(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = brt_load(spa); if (error != 0) { spa_load_failed(spa, "brt_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) { vdev_t *rvd = spa->spa_root_vdev; if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { boolean_t missing = spa_check_logs(spa); if (missing) { if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "spa_check_logs failed " "so dropping the logs"); } else { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; spa_load_failed(spa, "spa_check_logs failed"); return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } } return (0); } static int spa_ld_verify_pool_data(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. */ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { error = spa_load_verify(spa); if (error != 0) { spa_load_failed(spa, "spa_load_verify failed " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } } return (0); } static void spa_ld_claim_log_blocks(spa_t *spa) { dmu_tx_t *tx; dsl_pool_t *dp = spa_get_dsl(spa); /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. * Note: spa_claim_max_txg is updated by spa_claim_notify(), * invoked from zil_claim_log_block()'s i/o done callback. * Price of rollback is that we abandon the log. */ spa->spa_claiming = B_TRUE; tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); spa->spa_claiming = B_FALSE; spa_set_log_state(spa, SPA_LOG_GOOD); } static void spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, boolean_t update_config_cache) { vdev_t *rvd = spa->spa_root_vdev; int need_update = B_FALSE; /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (update_config_cache || config_cache_txg != spa->spa_config_txg || spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; /* * Update the config cache asynchronously in case we're the * root pool, in which case the config cache isn't writable yet. */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } static void spa_ld_prepare_for_reload(spa_t *spa) { spa_mode_t mode = spa->spa_mode; int async_suspended = spa->spa_async_suspended; spa_unload(spa); spa_deactivate(spa); spa_activate(spa, mode); /* * We save the value of spa_async_suspended as it gets reset to 0 by * spa_unload(). We want to restore it back to the original value before * returning as we might be calling spa_async_resume() later. */ spa->spa_async_suspended = async_suspended; } static int spa_ld_read_checkpoint_txg(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT0(spa->spa_checkpoint_txg); ASSERT(MUTEX_HELD(&spa_namespace_lock) || spa->spa_load_thread == curthread); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT) return (0); if (error != 0) return (error); ASSERT3U(checkpoint.ub_txg, !=, 0); ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); ASSERT3U(checkpoint.ub_timestamp, !=, 0); spa->spa_checkpoint_txg = checkpoint.ub_txg; spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; return (0); } static int spa_ld_mos_init(spa_t *spa, spa_import_type_t type) { int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); /* * Never trust the config that is provided unless we are assembling * a pool following a split. * This means don't trust blkptrs and the vdev tree in general. This * also effectively puts the spa in read-only mode since * spa_writeable() checks for spa_trust_config to be true. * We will later load a trusted config from the MOS. */ if (type != SPA_IMPORT_ASSEMBLE) spa->spa_trust_config = B_FALSE; /* * Parse the config provided to create a vdev tree. */ error = spa_ld_parse_config(spa, type); if (error != 0) return (error); spa_import_progress_add(spa); /* * Now that we have the vdev tree, try to open each vdev. This involves * opening the underlying physical device, retrieving its geometry and * probing the vdev with a dummy I/O. The state of each vdev will be set * based on the success of those operations. After this we'll be ready * to read from the vdevs. */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); /* * Read the label of each vdev and make sure that the GUIDs stored * there match the GUIDs in the config provided. * If we're assembling a new pool that's been split off from an * existing pool, the labels haven't yet been updated so we skip * validation for now. */ if (type != SPA_IMPORT_ASSEMBLE) { error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); } /* * Read all vdev labels to find the best uberblock (i.e. latest, * unless spa_load_max_txg is set) and store it in spa_uberblock. We * get the list of features required to read blkptrs in the MOS from * the vdev label with the best uberblock and verify that our version * of zfs supports them all. */ error = spa_ld_select_uberblock(spa, type); if (error != 0) return (error); /* * Pass that uberblock to the dsl_pool layer which will open the root * blkptr. This blkptr points to the latest version of the MOS and will * allow us to read its contents. */ error = spa_ld_open_rootbp(spa); if (error != 0) return (error); return (0); } static int spa_ld_checkpoint_rewind(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error != 0) { spa_load_failed(spa, "unable to retrieve checkpointed " "uberblock from the MOS config [error=%d]", error); if (error == ENOENT) error = ZFS_ERR_NO_CHECKPOINT; return (error); } ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); /* * We need to update the txg and timestamp of the checkpointed * uberblock to be higher than the latest one. This ensures that * the checkpointed uberblock is selected if we were to close and * reopen the pool right after we've written it in the vdev labels. * (also see block comment in vdev_uberblock_compare) */ checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; checkpoint.ub_timestamp = gethrestime_sec(); /* * Set current uberblock to be the checkpointed uberblock. */ spa->spa_uberblock = checkpoint; /* * If we are doing a normal rewind, then the pool is open for * writing and we sync the "updated" checkpointed uberblock to * disk. Once this is done, we've basically rewound the whole * pool and there is no way back. * * There are cases when we don't want to attempt and sync the * checkpointed uberblock to disk because we are opening a * pool as read-only. Specifically, verifying the checkpointed * state with zdb, and importing the checkpointed state to get * a "preview" of its content. */ if (spa_writeable(spa)) { vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = random_in_range(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "failed to write checkpointed " "uberblock to the vdev labels [error=%d]", error); return (error); } } return (0); } static int spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t *update_config_cache) { int error; /* * Parse the config for pool, open and validate vdevs, * select an uberblock, and use that uberblock to open * the MOS. */ error = spa_ld_mos_init(spa, type); if (error != 0) return (error); /* * Retrieve the trusted config stored in the MOS and use it to create * a new, exact version of the vdev tree, then reopen all vdevs. */ error = spa_ld_trusted_config(spa, type, B_FALSE); if (error == EAGAIN) { if (update_config_cache != NULL) *update_config_cache = B_TRUE; /* * Redo the loading process with the trusted config if it is * too different from the untrusted config. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "RELOADING"); error = spa_ld_mos_init(spa, type); if (error != 0) return (error); error = spa_ld_trusted_config(spa, type, B_TRUE); if (error != 0) return (error); } else if (error != 0) { return (error); } return (0); } /* * Load an existing storage pool, using the config provided. This config * describes which vdevs are part of the pool and is later validated against * partial configs present in each vdev's label and an entire copy of the * config stored in the MOS. */ static int spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) { int error = 0; boolean_t missing_feat_write = B_FALSE; boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; hrtime_t load_start = gethrtime(); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); spa_load_note(spa, "LOADING"); error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); if (error != 0) return (error); /* * If we are rewinding to the checkpoint then we need to repeat * everything we've done so far in this function but this time * selecting the checkpointed uberblock and using that to open * the MOS. */ if (checkpoint_rewind) { /* * If we are rewinding to the checkpoint update config cache * anyway. */ update_config_cache = B_TRUE; /* * Extract the checkpointed uberblock from the current MOS * and use this as the pool's uberblock from now on. If the * pool is imported as writeable we also write the checkpoint * uberblock to the labels, making the rewind permanent. */ error = spa_ld_checkpoint_rewind(spa); if (error != 0) return (error); /* * Redo the loading process again with the * checkpointed uberblock. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "LOADING checkpointed uberblock"); error = spa_ld_mos_with_trusted_config(spa, type, NULL); if (error != 0) return (error); } /* * Drop the namespace lock for the rest of the function. */ spa->spa_load_thread = curthread; mutex_exit(&spa_namespace_lock); /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ spa_import_progress_set_notes(spa, "Loading checkpoint txg"); error = spa_ld_read_checkpoint_txg(spa); if (error != 0) goto fail; /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed * from the pool and their contents were re-mapped to other vdevs. Note * that everything that we read before this step must have been * rewritten on concrete vdevs after the last device removal was * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) goto fail; /* * Retrieve the full list of active features from the MOS and check if * they are all supported. */ spa_import_progress_set_notes(spa, "Checking feature flags"); error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) goto fail; /* * Load several special directories from the MOS needed by the dsl_pool * layer. */ spa_import_progress_set_notes(spa, "Loading special MOS directories"); error = spa_ld_load_special_directories(spa); if (error != 0) goto fail; /* * Retrieve pool properties from the MOS. */ spa_import_progress_set_notes(spa, "Loading properties"); error = spa_ld_get_props(spa); if (error != 0) goto fail; /* * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ spa_import_progress_set_notes(spa, "Loading AUX vdevs"); error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) goto fail; /* * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ spa_import_progress_set_notes(spa, "Loading vdev metadata"); error = spa_ld_load_vdev_metadata(spa); if (error != 0) goto fail; spa_import_progress_set_notes(spa, "Loading dedup tables"); error = spa_ld_load_dedup_tables(spa); if (error != 0) goto fail; spa_import_progress_set_notes(spa, "Loading BRT"); error = spa_ld_load_brt(spa); if (error != 0) goto fail; /* * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. */ spa_import_progress_set_notes(spa, "Verifying Log Devices"); error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) goto fail; if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, ENOTSUP); goto fail; } /* * Traverse the last txgs to make sure the pool was left off in a safe * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ spa_import_progress_set_notes(spa, "Verifying pool data"); error = spa_ld_verify_pool_data(spa); if (error != 0) goto fail; /* * Calculate the deflated space for the pool. This must be done before * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ spa_import_progress_set_notes(spa, "Calculating deflated space"); spa_update_dspace(spa); /* * We have now retrieved all the information we needed to open the * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ spa_import_progress_set_notes(spa, "Starting import"); if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* * Before we do any zio_write's, complete the raidz expansion * scratch space copying, if necessary. */ if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) vdev_raidz_reflow_copy_scratch(spa); /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. */ if (checkpoint_rewind) { spa_history_log_internal(spa, "checkpoint rewind", NULL, "rewound state to txg=%llu", (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); /* * Traverse the ZIL and claim all blocks. */ spa_ld_claim_log_blocks(spa); /* * Kick-off the syncing thread. */ spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); mmp_thread_start(spa); /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg * will have been set for us by ZIL traversal operations * performed above. */ spa_import_progress_set_notes(spa, "Syncing ZIL claims"); txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * Check if we need to request an update of the config. On the * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ spa_import_progress_set_notes(spa, "Updating configs"); spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); /* * Check if a rebuild was in progress and if so resume it. * Then check all DTLs to see if anything needs resilvering. * The resilver will be deferred if a rebuild was started. */ spa_import_progress_set_notes(spa, "Starting resilvers"); if (vdev_rebuild_active(spa->spa_root_vdev)) { vdev_rebuild_restart(spa); } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER); } /* * Log the fact that we booted up (so that we can detect if * we rebooted in the middle of an operation). */ spa_history_log_version(spa, "open", NULL); spa_import_progress_set_notes(spa, "Restarting device removals"); spa_restart_removal(spa); spa_spawn_aux_threads(spa); /* * Delete any inconsistent datasets. * * Note: * Since we may be issuing deletes for clones here, * we make sure to do so after we've spawned all the * auxiliary threads above (from which the livelist * deletion zthr is part of). */ spa_import_progress_set_notes(spa, "Cleaning up inconsistent objsets"); (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ spa_import_progress_set_notes(spa, "Cleaning up temporary userrefs"); dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_import_progress_set_notes(spa, "Restarting initialize"); vdev_initialize_restart(spa->spa_root_vdev); spa_import_progress_set_notes(spa, "Restarting TRIM"); vdev_trim_restart(spa->spa_root_vdev); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); spa_import_progress_set_notes(spa, "Finished importing"); } zio_handle_import_delay(spa, gethrtime() - load_start); spa_import_progress_remove(spa_guid(spa)); spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_load_note(spa, "LOADED"); fail: mutex_enter(&spa_namespace_lock); spa->spa_load_thread = NULL; cv_broadcast(&spa_namespace_cv); return (error); } static int spa_load_retry(spa_t *spa, spa_load_state_t state) { spa_mode_t mode = spa->spa_mode; spa_unload(spa); spa_deactivate(spa); spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; spa_activate(spa, mode); spa_async_suspend(spa); spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", (u_longlong_t)spa->spa_load_max_txg); return (spa_load(spa, state, SPA_IMPORT_EXISTING)); } /* * If spa_load() fails this function will try loading prior txg's. If * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this * function will not rewind the pool and will return the same error as * spa_load(). */ static int spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, int rewind_flags) { nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; uint64_t min_txg; if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { spa->spa_load_max_txg = spa->spa_load_txg; spa_set_log_state(spa, SPA_LOG_CLEAR); } else { spa->spa_load_max_txg = max_request; if (max_request != UINT64_MAX) spa->spa_extreme_rewind = B_TRUE; } load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); if (load_error == 0) return (0); if (load_error == ZFS_ERR_NO_CHECKPOINT) { /* * When attempting checkpoint-rewind on a pool with no * checkpoint, we should not attempt to load uberblocks * from previous txgs when spa_load fails. */ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); spa_import_progress_remove(spa_guid(spa)); return (load_error); } if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; if (rewind_flags & ZPOOL_NEVER_REWIND) { nvlist_free(config); spa_import_progress_remove(spa_guid(spa)); return (load_error); } if (state == SPA_LOAD_RECOVER) { /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * If we aren't rolling back save the load info from our first * import attempt so that we can restore it after attempting * to rewind. */ loadinfo = spa->spa_load_info; spa->spa_load_info = fnvlist_alloc(); } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? TXG_INITIAL : safe_rewind_txg; /* * Continue as long as we're finding errors, we're still within * the acceptable rewind range, and we're still finding uberblocks */ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; rewind_error = spa_load_retry(spa, state); } spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); else nvlist_free(config); if (state == SPA_LOAD_RECOVER) { ASSERT3P(loadinfo, ==, NULL); spa_import_progress_remove(spa_guid(spa)); return (rewind_error); } else { /* Store the rewind info as part of the initial load info */ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, spa->spa_load_info); /* Restore the initial load info */ fnvlist_free(spa->spa_load_info); spa->spa_load_info = loadinfo; spa_import_progress_remove(spa_guid(spa)); return (load_error); } } /* * Pool Open/Import * * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some * ambiguous state. */ static int spa_open_common(const char *pool, spa_t **spapp, const void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; int firstopen = B_FALSE; *spapp = NULL; /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. */ if (MUTEX_NOT_HELD(&spa_namespace_lock)) { mutex_enter(&spa_namespace_lock); locked = B_TRUE; } if ((spa = spa_lookup(pool)) == NULL) { if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_load_policy_t policy; firstopen = B_TRUE; zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; zfs_dbgmsg("spa_open_common: opening %s", pool); error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); if (error == EBADF) { /* * If vdev_validate() returns failure (indicated by * EBADF), it indicates that one of the vdevs indicates * that the pool has been exported or destroyed. If * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ spa_unload(spa); spa_deactivate(spa); spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (error) { /* * We can't open the pool, but we still have useful * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ if (config != NULL && spa->spa_config) { *config = fnvlist_dup(spa->spa_config); fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); } } spa_open_ref(spa, tag); if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); /* * If we've recovered the pool, pass back any information we * gathered while doing the load. */ if (state == SPA_LOAD_RECOVER && config != NULL) { fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); } if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); } if (firstopen) zvol_create_minors_recursive(spa_name(spa)); *spapp = spa; return (0); } int spa_open_rewind(const char *name, spa_t **spapp, const void *tag, nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int spa_open(const char *name, spa_t **spapp, const void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* * Lookup the given spa_t, incrementing the inject count in the process, * preventing it from being exported or destroyed. */ spa_t * spa_inject_addref(char *name) { spa_t *spa; mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); } spa->spa_inject_ref++; mutex_exit(&spa_namespace_lock); return (spa); } void spa_inject_delref(spa_t *spa) { mutex_enter(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } /* * Add spares device information to the nvlist. */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { nvlist_t **spares; uint_t i, nspares; nvlist_t *nvroot; uint64_t guid; vdev_stat_t *vs; uint_t vsc; uint64_t pool; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_spares.sav_count == 0) return; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares)); if (nspares != 0) { fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, nspares); VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares)); /* * Go through and find any spares which have since been * repurposed as an active spare. If this is the case, update * their status appropriately. */ for (i = 0; i < nspares; i++) { guid = fnvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID); VERIFY0(nvlist_lookup_uint64_array(spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; } else { vs->vs_state = spa->spa_spares.sav_vdevs[i]->vdev_state; } } } } /* * Add l2cache device information to the nvlist, including vdev stats. */ static void spa_add_l2cache(spa_t *spa, nvlist_t *config) { nvlist_t **l2cache; uint_t i, j, nl2cache; nvlist_t *nvroot; uint64_t guid; vdev_t *vd; vdev_stat_t *vs; uint_t vsc; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_l2cache.sav_count == 0) return; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); if (nl2cache != 0) { fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, nl2cache); VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); /* * Update level 2 cache device stats. */ for (i = 0; i < nl2cache; i++) { guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); vd = NULL; for (j = 0; j < spa->spa_l2cache.sav_count; j++) { if (guid == spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { vd = spa->spa_l2cache.sav_vdevs[j]; break; } } ASSERT(vd != NULL); VERIFY0(nvlist_lookup_uint64_array(l2cache[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); vdev_get_stats(vd, vs); vdev_config_generate_stats(vd, l2cache[i]); } } } static void spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) { zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_read_obj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { ASSERT(za->za_integer_length == sizeof (uint64_t) && za->za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za->za_name, za->za_first_integer)); } zap_cursor_fini(&zc); } if (spa->spa_feat_for_write_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_write_obj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { ASSERT(za->za_integer_length == sizeof (uint64_t) && za->za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za->za_name, za->za_first_integer)); } zap_cursor_fini(&zc); } zap_attribute_free(za); } static void spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) { int i; for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t feature = spa_feature_table[i]; uint64_t refcount; if (feature_get_refcount(spa, &feature, &refcount) != 0) continue; VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); } } /* * Store a list of pool features and their reference counts in the * config. * * The first time this is called on a spa, allocate a new nvlist, fetch * the pool features and reference counts from disk, then save the list * in the spa. In subsequent calls on the same spa use the saved nvlist * and refresh its values from the cached reference counts. This * ensures we don't block here on I/O on a suspended pool so 'zpool * clear' can resume the pool. */ static void spa_add_feature_stats(spa_t *spa, nvlist_t *config) { nvlist_t *features; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); mutex_enter(&spa->spa_feat_stats_lock); features = spa->spa_feat_stats; if (features != NULL) { spa_feature_stats_from_cache(spa, features); } else { VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); spa->spa_feat_stats = features; spa_feature_stats_from_disk(spa, features); } VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, features)); mutex_exit(&spa->spa_feat_stats_lock); } int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; *config = NULL; error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* * This still leaves a window of inconsistency where the spares * or l2cache devices could change and the config would be * self-inconsistent. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { uint64_t loadtimes[2]; loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; fnvlist_add_uint64_array(*config, ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); fnvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_approx_errlog_size(spa)); if (spa_suspended(spa)) { fnvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode); fnvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED_REASON, spa->spa_suspended); } spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); spa_add_feature_stats(spa, *config); } } /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. */ if (altroot) { if (spa == NULL) { mutex_enter(&spa_namespace_lock); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } } if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); } return (error); } /* * Validate that the auxiliary device array is well formed. We must have an * array of nvlists, each which describes a valid leaf vdev. If this is an * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be * specified, as long as they are well-formed. */ static int spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, spa_aux_vdev_t *sav, const char *config, uint64_t version, vdev_labeltype_t label) { nvlist_t **dev; uint_t i, ndev; vdev_t *vd; int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * It's acceptable to have no devs specified. */ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); if (ndev == 0) return (SET_ERROR(EINVAL)); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) return (SET_ERROR(ENOTSUP)); /* * Set the pending device list so we correctly handle device in-use * checking. */ sav->sav_pending = dev; sav->sav_npending = ndev; for (i = 0; i < ndev; i++) { if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); error = SET_ERROR(EINVAL); goto out; } vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid); } vdev_free(vd); if (error && (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: sav->sav_pending = NULL; sav->sav_npending = 0; return (error); } static int spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) { int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, VDEV_LABEL_SPARE)) != 0) { return (error); } return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE)); } static void spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, const char *config) { int i; if (sav->sav_config != NULL) { nvlist_t **olddevs; uint_t oldndevs; nvlist_t **newdevs; /* * Generate new dev list by concatenating with the * current dev list. */ VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, &olddevs, &oldndevs)); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) newdevs[i] = fnvlist_dup(olddevs[i]); for (i = 0; i < ndevs; i++) newdevs[i + oldndevs] = fnvlist_dup(devs[i]); fnvlist_remove(sav->sav_config, config); fnvlist_add_nvlist_array(sav->sav_config, config, (const nvlist_t * const *)newdevs, ndevs + oldndevs); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); } else { /* * Generate a new dev list. */ sav->sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(sav->sav_config, config, (const nvlist_t * const *)devs, ndevs); } } /* * Stop and drop level 2 ARC devices */ void spa_l2cache_drop(spa_t *spa) { vdev_t *vd; int i; spa_aux_vdev_t *sav = &spa->spa_l2cache; for (i = 0; i < sav->sav_count; i++) { uint64_t pool; vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); } } /* * Verify encryption parameters for spa creation. If we are encrypting, we must * have the encryption feature flag enabled. */ static int spa_create_check_encryption_params(dsl_crypto_params_t *dcp, boolean_t has_encryption) { if (dcp->cp_crypt != ZIO_CRYPT_OFF && dcp->cp_crypt != ZIO_CRYPT_INHERIT && !has_encryption) return (SET_ERROR(ENOTSUP)); return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); } /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, dsl_crypto_params_t *dcp) { spa_t *spa; const char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj, ndraid = 0; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; spa_feature_t feat; const char *feat_name; const char *poolname; nvlist_t *nvl; if (props == NULL || nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) poolname = (char *)pool; /* * If this pool already exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(poolname) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Allocate a new spa_t structure. */ nvl = fnvlist_alloc(); fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(poolname, nvl, altroot); fnvlist_free(nvl); spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Temporary pool names should never be written to disk. */ if (poolname != pool) spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; has_features = B_FALSE; has_encryption = B_FALSE; has_allocclass = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) { has_features = B_TRUE; feat_name = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(feat_name, &feat)); if (feat == SPA_FEATURE_ENCRYPTION) has_encryption = B_TRUE; if (feat == SPA_FEATURE_ALLOCATION_CLASSES) has_allocclass = B_TRUE; } } /* verify encryption params, if they were provided */ if (dcp != NULL) { error = spa_create_check_encryption_params(dcp, has_encryption); if (error != 0) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } } if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (ENOTSUP); } if (has_features || nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; } ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; spa->spa_load_state = SPA_LOAD_CREATE; spa->spa_removing_phys.sr_state = DSS_NONE; spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; spa->spa_indirect_vdevs_loaded = B_TRUE; /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) error = SET_ERROR(EINVAL); if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point */ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_metaslab_set_size(vd); vdev_expand(vd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Get the list of spares, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { spa->spa_spares.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } /* * Get the list of level 2 cache devices, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP)); fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). */ ddt_create(spa); /* * Create BRT table and BRT table object. */ brt_create(spa); spa_update_dspace(spa); tx = dmu_tx_create_assigned(dp, txg); /* * Create the pool's history object. */ if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) spa_history_create_obj(spa, tx); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); spa_history_log_version(spa, "create", tx); /* * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool config"); } if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool version"); } /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { cmn_err(CE_PANIC, "failed to add deflate"); } } /* * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, sizeof (uint64_t), 1, &obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bpobj"); } VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj)); /* * Generate some random noise for salted checksums to operate on. */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); spa->spa_dedup_table_quota = zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); } for (int i = 0; i < ndraid; i++) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; txg_sync_start(dp); mmp_thread_start(spa); txg_wait_synced(dp, txg); spa_spawn_aux_threads(spa); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; spa_import_os(spa); mutex_exit(&spa_namespace_lock); return (0); } /* * Import a non-root pool into the system. */ int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; const char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_load_policy_t policy; spa_mode_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) mode = SPA_MODE_READ; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; /* * Verbatim import - Take a pool and insert it into the namespace * as if it had been loaded at boot. */ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { if (props != NULL) spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); return (0); } spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); zpool_get_load_policy(config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; if (state != SPA_LOAD_RECOVER) { spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; zfs_dbgmsg("spa_import: importing %s", pool); } else { zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); } error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); /* * Propagate anything learned while loading the pool and pass it * back to caller (i.e. rewind info, missing devices, etc). */ fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity * anymore, and conflicts with spa_has_spare(). */ if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); } nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) spa_configfile_set(spa, props, B_FALSE); if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } spa_async_resume(spa); /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); else spa->spa_spares.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, nspares); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; spa->spa_spares.sav_label_sync = B_TRUE; } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (spa->spa_l2cache.sav_config) fnvlist_remove(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE); else spa->spa_l2cache.sav_config = fnvlist_alloc(); fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, nl2cache); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; spa->spa_l2cache.sav_label_sync = B_TRUE; } /* * Check for any removed devices. */ if (spa->spa_autoreplace) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); spa_history_log_version(spa, "import", NULL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); mutex_exit(&spa_namespace_lock); zvol_create_minors_recursive(pool); spa_import_os(spa); return (0); } nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; const char *poolname, *cachefile; spa_t *spa; uint64_t state; int error; zpool_load_policy_t policy; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) return (NULL); /* * Create and initialize the spa structure. */ char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname); mutex_enter(&spa_namespace_lock); spa = spa_add(name, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); kmem_free(name, MAXPATHLEN); /* * Rewind pool if a max txg was provided. */ zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_txg != UINT64_MAX) { spa->spa_load_max_txg = policy.zlp_txg; spa->spa_extreme_rewind = B_TRUE; zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", poolname, (longlong_t)policy.zlp_txg); } else { zfs_dbgmsg("spa_tryimport: importing %s", poolname); } if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) == 0) { zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; } else { spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } /* * spa_import() relies on a pool config fetched by spa_try_import() * for spare/cache devices. Import flags are not passed to * spa_tryimport(), which makes it return early due to a missing log * device and missing retrieving the cache device and spare eventually. * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch * the correct configuration regardless of the missing log device. */ spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp); fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); /* * If the bootfs property exists on this pool then we * copy it out so that external consumers can tell which * pools are bootable. */ if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* * We have to play games with the name since the * pool was opened as TRYIMPORT_NAME. */ if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs, tmpname) == 0) { char *cp; char *dsname; dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); cp = strchr(tmpname, '/'); if (cp == NULL) { (void) strlcpy(dsname, tmpname, MAXPATHLEN); } else { (void) snprintf(dsname, MAXPATHLEN, "%s/%s", poolname, ++cp); } fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, dsname); kmem_free(dsname, MAXPATHLEN); } kmem_free(tmpname, MAXPATHLEN); } /* * Add the list of hot spares and level 2 cache devices. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (config); } /* * Pool export/destroy * * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the * configuration from the cache afterwards. If the 'hardforce' flag is set, then * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { int error = 0; spa_t *spa; hrtime_t export_start = gethrtime(); if (oldconfig) *oldconfig = NULL; if (!(spa_mode_global & SPA_MODE_WRITE)) return (SET_ERROR(EROFS)); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_is_exporting) { /* the pool is being exported by another thread */ mutex_exit(&spa_namespace_lock); return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); } spa->spa_is_exporting = B_TRUE; /* * Put a hold on the pool, drop the namespace lock, stop async tasks * and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); if (spa->spa_zvol_taskq) { zvol_remove_minors(spa, spa_name(spa), B_TRUE); taskq_wait(spa->spa_zvol_taskq); } mutex_enter(&spa_namespace_lock); spa->spa_export_thread = curthread; spa_close(spa, FTAG); if (spa->spa_state == POOL_STATE_UNINITIALIZED) { mutex_exit(&spa_namespace_lock); goto export_spa; } /* * The pool will be in core if it's openable, in which case we can * modify its state. Objsets may be open only because they're dirty, * so we have to force it to sync before checking spa_refcnt. */ if (spa->spa_sync_on) { txg_wait_synced(spa->spa_dsl_pool, 0); spa_evicting_os_wait(spa); } /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { error = SET_ERROR(EBUSY); goto fail; } mutex_exit(&spa_namespace_lock); /* * At this point we no longer hold the spa_namespace_lock and * there were no references on the spa. Future spa_lookups will * notice the spa->spa_export_thread and wait until we signal * that we are finshed. */ if (spa->spa_sync_on) { vdev_t *rvd = spa->spa_root_vdev; /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare * from an exported pool. At user's own will, such pool can * be forcedly exported. */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { error = SET_ERROR(EXDEV); mutex_enter(&spa_namespace_lock); goto fail; } /* * We're about to export or destroy this pool. Make sure * we stop all initialization and trim activity here before * we set the spa_final_txg. This will ensure that all * dirty data resulting from the initialization is * committed to disk before we unload the pool. */ vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); l2arc_spa_rebuild_stop(spa); /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. */ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; vdev_config_dirty(rvd); spa_config_exit(spa, SCL_ALL, FTAG); } /* * If the log space map feature is enabled and the pool is * getting exported (but not destroyed), we want to spend some * time flushing as many metaslabs as we can in an attempt to * destroy log space maps and save import time. This has to be * done before we set the spa_final_txg, otherwise * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs. * spa_should_flush_logs_on_unload() should be called after * spa_state has been set to the new_state. */ if (spa_should_flush_logs_on_unload(spa)) spa_unload_log_sm_flush_all(spa); if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; spa_config_exit(spa, SCL_ALL, FTAG); } } export_spa: spa_export_os(spa); if (new_state == POOL_STATE_DESTROYED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); else if (new_state == POOL_STATE_EXPORTED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } if (oldconfig && spa->spa_config) *oldconfig = fnvlist_dup(spa->spa_config); if (new_state == POOL_STATE_EXPORTED) zio_handle_export_delay(spa, gethrtime() - export_start); /* * Take the namespace lock for the actual spa_t removal */ mutex_enter(&spa_namespace_lock); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); } else { /* * If spa_remove() is not called for this spa_t and * there is any possibility that it can be reused, * we make sure to reset the exporting flag. */ spa->spa_is_exporting = B_FALSE; spa->spa_export_thread = NULL; } /* * Wake up any waiters in spa_lookup() */ cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (0); fail: spa->spa_is_exporting = B_FALSE; spa->spa_export_thread = NULL; spa_async_resume(spa); /* * Wake up any waiters in spa_lookup() */ cv_broadcast(&spa_namespace_cv); mutex_exit(&spa_namespace_lock); return (error); } /* * Destroy a storage pool. */ int spa_destroy(const char *pool) { return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE, B_FALSE)); } /* * Export a storage pool. */ int spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force, hardforce)); } /* * Similar to spa_export(), this unloads the spa_t without actually removing it * from the namespace in any way. */ int spa_reset(const char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE, B_FALSE)); } /* * ========================================================================== * Device manipulation * ========================================================================== */ /* * This is called as a synctask to increment the draid feature flag */ static void spa_draid_feature_incr(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; int draid = (int)(uintptr_t)arg; for (int c = 0; c < draid; c++) spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); } /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) { uint64_t txg, ndraid = 0; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) != 0) nl2cache = 0; if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && (error = vdev_create(vd, txg, B_FALSE)) != 0) { return (spa_vdev_exit(spa, vd, txg, error)); } /* * The virtual dRAID spares must be added after vdev tree is created * and the vdev guids are generated. The guid of their associated * dRAID is stored in the config and used when opening the spare. */ if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, rvd->vdev_children)) == 0) { if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; } else { return (spa_vdev_exit(spa, vd, txg, error)); } /* * We must validate the spares and l2cache devices after checking the * children. Otherwise, vdev_inuse() will blindly overwrite the spare. */ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect * vdevs, we can not add raidz or dRAID top levels. */ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* Fail if top level vdev is raidz or a dRAID */ if (vdev_get_nparity(tvd) != 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); /* * Need the top level mirror to be * a mirror of leaf vdevs only */ if (tvd->vdev_ops == &vdev_mirror_ops) { for (uint64_t cid = 0; cid < tvd->vdev_children; cid++) { vdev_t *cvd = tvd->vdev_child[cid]; if (!cvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } } } } } if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) { for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, ZFS_ERR_ASHIFT_MISMATCH)); } } } for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); tvd->vdev_id = rvd->vdev_children; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } if (nspares != 0) { spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, ZPOOL_CONFIG_SPARES); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } if (nl2cache != 0) { spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, ZPOOL_CONFIG_L2CACHE); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } /* * We can't increment a feature while holding spa_vdev so we * have to do it in a synctask. */ if (ndraid != 0) { dmu_tx_t *tx; tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, (void *)(uintptr_t)ndraid, tx); dmu_tx_commit(tx); } /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may * fail to open the pool because there are DVAs that the config cache * can't translate. Therefore, we first add the vdevs without * initializing metaslabs; sync the config cache (via spa_vdev_exit()); * and then let spa_config_update() initialize the new metaslabs. * * spa_load() checks for added-but-not-initialized vdevs, so that * if we lose power at any point in this sequence, the remaining * steps will be completed the next time we load the pool. */ (void) spa_vdev_exit(spa, vd, txg, 0); mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); return (0); } /* * Attach a device to a vdev specified by its guid. The vdev type can be * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a * single device). When the vdev is a single device, a mirror vdev will be * automatically inserted. * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. * * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) * should be performed instead of traditional healing reconstruction. From * an administrators perspective these are both resilver operations. */ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild) { uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare = B_FALSE; int error; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (rebuild) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); if (dsl_scan_resilvering(spa_get_dsl(spa)) || dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RESILVER_IN_PROGRESS)); } } else { if (vdev_rebuild_active(rvd)) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_REBUILD_IN_PROGRESS)); } if (spa->spa_vdev_removal != NULL) { return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_DEVRM_IN_PROGRESS)); } if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; if (raidz) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * Can't expand a raidz while prior expand is in progress. */ if (spa->spa_raidz_expand != NULL) { return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); } } else if (!oldvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); } if (raidz) pvd = oldvd; else pvd = oldvd->vdev_parent; if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); /* * log, dedup and special vdevs should not be replaced by spares. */ if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } /* * A dRAID spare can only replace a child of its parent dRAID vdev. */ if (newvd->vdev_ops == &vdev_draid_spare_ops && oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (rebuild) { /* * For rebuilds, the top vdev must support reconstruction * using only space maps. This means the only allowable * vdevs types are the root vdev, a mirror, or dRAID. */ tvd = pvd; if (pvd->vdev_top != NULL) tvd = pvd->vdev_top; if (tvd->vdev_ops != &vdev_mirror_ops && tvd->vdev_ops != &vdev_root_ops && tvd->vdev_ops != &vdev_draid_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } } if (!replacing) { /* * For attach, the only allowable parent is a mirror or * the root vdev. A raidz vdev can be attached to, but * you cannot attach to a raidz child. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && !raidz) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we * want to create a replacing vdev. The user is not allowed to * attach to a spared vdev child unless the 'isspare' state is * the same (spare replaces spare, non-spare replaces * non-spare). */ if (pvd->vdev_ops == &vdev_replacing_ops && spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } else if (pvd->vdev_ops == &vdev_spare_ops && newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* * Make sure the new device is big enough. */ vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) { return (spa_vdev_exit(spa, newrootvd, txg, ZFS_ERR_ASHIFT_MISMATCH)); } /* * RAIDZ-expansion-specific checks. */ if (raidz) { if (vdev_raidz_attach_check(newvd) != 0) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * Fail early if a child is not healthy or being replaced */ for (int i = 0; i < oldvd->vdev_children; i++) { if (vdev_is_dead(oldvd->vdev_child[i]) || !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, newrootvd, txg, ENXIO)); } /* Also fail if reserved boot area is in-use */ if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) != 0) { return (spa_vdev_exit(spa, newrootvd, txg, EADDRINUSE)); } } } if (raidz) { /* * Note: oldvdpath is freed by spa_strfree(), but * kmem_asprintf() is freed by kmem_strfree(), so we have to * move it to a spa_strdup-ed string. */ char *tmp = kmem_asprintf("raidz%u-%u", (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); oldvdpath = spa_strdup(tmp); kmem_strfree(tmp); } else { oldvdpath = spa_strdup(oldvd->vdev_path); } newvdpath = spa_strdup(newvd->vdev_path); /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ if (strcmp(oldvdpath, newvdpath) == 0) { spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, KM_SLEEP); (void) sprintf(oldvd->vdev_path, "%s/old", newvdpath); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } spa_strfree(oldvdpath); oldvdpath = spa_strdup(oldvd->vdev_path); } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ if (!raidz && pvd->vdev_ops != pvops) { pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); } ASSERT(pvd->vdev_top->vdev_parent == rvd); /* * Extract the new device from its root and add it to pvd. */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(pvd); tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); vdev_config_dirty(tvd); /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account * for any dmu_sync-ed blocks. It will propagate upward when * spa_vdev_exit() calls vdev_dtl_reassess(). */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; if (raidz) { /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. */ spa_vdev_config_exit(spa, NULL, txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_wait(tvd); dtl_max_txg = spa_vdev_config_enter(spa); tvd->vdev_rz_expanding = B_TRUE; vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); vdev_config_dirty(tvd); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, dtl_max_txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, newvd, tx); dmu_tx_commit(tx); } else { vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); } newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, newvd, txg); /* * Schedule the resilver or rebuild to restart in the future. * We do this to ensure that dmu_sync-ed blocks have been * stitched into the respective datasets. */ if (rebuild) { newvd->vdev_rebuild_txg = txg; vdev_rebuild(tvd); } else { newvd->vdev_resilver_txg = txg; if (dsl_scan_resilvering(spa_get_dsl(spa)) && spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { vdev_defer_resilver(newvd); } else { dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg); } } } if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); return (0); } /* * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent * is a replacing or a spare vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; vdev_t *rvd __maybe_unused = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid = 0; char *vdpath; ASSERT(spa_writeable(spa)); txg = spa_vdev_detach_enter(spa, guid); vd = spa_lookup_by_guid(spa, guid, B_FALSE); /* * Besides being called directly from the userland through the * ioctl interface, spa_vdev_detach() can be potentially called * at the end of spa_vdev_resilver_done(). * * In the regular case, when we have a checkpoint this shouldn't * happen as we never empty the DTLs of a vdev during the scrub * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() * should never get here when we have a checkpoint. * * That said, even in a case when we checkpoint the pool exactly * as spa_vdev_resilver_done() calls this function everything * should be fine as the resilver will return right away. */ ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = vd->vdev_parent; /* * If the parent/child relationship is not as expected, don't do it. * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing * vdev that's replacing B with C. The user's intent in replacing * is to go from M(A,B) to M(A,C). If the user decides to cancel * the replace by detaching C, the expected behavior is to end up * M(A,B). But suppose that right after deciding to detach C, * the replacement of B completes. We would have M(A,C), and then * ask to detach C, which would leave us with just A -- not what * the user wanted. To prevent this, we make sure that the * parent/child relationship hasn't changed -- in this example, * that C's parent is still the replacing vdev R. */ if (pvd->vdev_guid != pguid && pguid != 0) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* * Only 'replacing' or 'spare' vdevs can be replaced. */ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. */ if (pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * If this device has the only valid copy of some data, * we cannot safely detach it. */ if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && vd->vdev_path != NULL) { size_t len = strlen(vd->vdev_path); for (int c = 0; c < pvd->vdev_children; c++) { cvd = pvd->vdev_child[c]; if (cvd == vd || cvd->vdev_path == NULL) continue; if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && strcmp(cvd->vdev_path + len, "/old") == 0) { spa_strfree(cvd->vdev_path); cvd->vdev_path = spa_strdup(vd->vdev_path); break; } } } /* * If we are detaching the original disk from a normal spare, then it * implies that the spare should become a real disk, and be removed * from the active spare list for the pool. dRAID spares on the * other hand are coupled to the pool and thus should never be removed * from the spares list. */ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; if (last_cvd->vdev_isspare && last_cvd->vdev_ops != &vdev_draid_spare_ops) { unspare = B_TRUE; } } /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, * but before we disembowel vd (so we can still do I/O to it). * But if we can't do it, don't treat the error as fatal -- * it may be that the unwritability of the disk is the reason * it's being detached! */ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. */ vdev_remove_child(pvd, vd); vdev_compact_children(pvd); /* * Remember one of the remaining children so we can get tvd below. */ cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, * do it now, marking the vdev as no longer a spare in the process. * We must do this before vdev_remove_parent(), because that can * change the GUID if it creates a new toplevel GUID. For a similar * reason, we must remove the spare now, in the same txg as the detach; * otherwise someone could attach a new sibling, change the GUID, and * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ if (pvd->vdev_children == 1) { if (pvd->vdev_ops == &vdev_spare_ops) cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); } /* * We don't set tvd until now because the parent we just removed * may have been the previous top-level vdev. */ tvd = cvd->vdev_top; ASSERT(tvd->vdev_parent == rvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(cvd); /* * If the 'autoexpand' property is set on the pool then automatically * try to expand the size of the pool. For example if the device we * just detached was smaller than the others, it may be possible to * add metaslabs (i.e. grow the pool). We need to reopen the vdev * first so that we can obtain the updated sizes of the leaf vdevs. */ if (spa->spa_autoexpand) { vdev_reopen(tvd); vdev_expand(tvd, txg); } vdev_config_dirty(tvd); /* * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that * vd->vdev_detached is set and free vd's DTL object in syncing context. * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none"); for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); spa_notify_waiters(spa); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(spa, "detach", NULL, "vdev=%s", vdpath); spa_strfree(vdpath); /* * If this was the removal of the original device in a hot spare vdev, * then we want to go through and remove the device from the hot spare * list of every other pool. */ if (unspare) { spa_t *altspa = NULL; mutex_enter(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) continue; spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); /* search the rest of the vdevs for spares to remove */ spa_vdev_resilver_done(spa); } /* all done with the spa; OK to release */ mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (error); } static int spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, list_t *vd_list) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EROFS)); } mutex_enter(&vd->vdev_initialize_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate an initialize action we check to see * if the vdev_initialize_thread is NULL. We do this instead * of using the vdev_initialize_state since there might be * a previous initialization process which has completed but * the thread is not exited. */ if (cmd_type == POOL_INITIALIZE_START && (vd->vdev_initialize_thread != NULL || vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_INITIALIZE_SUSPEND && vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_INITIALIZE_UNINIT && vd->vdev_initialize_thread != NULL) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } switch (cmd_type) { case POOL_INITIALIZE_START: vdev_initialize(vd); break; case POOL_INITIALIZE_CANCEL: vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); break; case POOL_INITIALIZE_SUSPEND: vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); break; case POOL_INITIALIZE_UNINIT: vdev_uninitialize(vd); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_initialize_lock); return (0); } int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, nvlist_t *vdev_errlist) { int total_errors = 0; list_t vd_list; list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping initialization. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the initializing operation. */ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all initialize threads to stop. */ vdev_initialize_stop_wait(spa, &vd_list); /* Sync out the initializing state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } static int spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EROFS)); } else if (!vd->vdev_has_trim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } else if (secure && !vd->vdev_has_securetrim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } mutex_enter(&vd->vdev_trim_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate a TRIM action we check to see if the * vdev_trim_thread is NULL. We do this instead of using the * vdev_trim_state since there might be a previous TRIM process * which has completed but the thread is not exited. */ if (cmd_type == POOL_TRIM_START && (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_TRIM_SUSPEND && vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } switch (cmd_type) { case POOL_TRIM_START: vdev_trim(vd, rate, partial, secure); break; case POOL_TRIM_CANCEL: vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); break; case POOL_TRIM_SUSPEND: vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_trim_lock); return (0); } /* * Initiates a manual TRIM for the requested vdevs. This kicks off individual * TRIM threads for each child vdev. These threads pass over all of the free * space in the vdev's metaslabs and issues TRIM commands for that space. */ int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) { int total_errors = 0; list_t vd_list; list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping TRIM. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the TRIM operation. */ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, rate, partial, secure, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all TRIM threads to stop. */ vdev_trim_stop_wait(spa, &vd_list); /* Sync out the TRIM state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } /* * Split a set of devices from their mirrors, and create a new pool from them. */ int spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; uint64_t txg, *glist; spa_t *newspa; uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; const char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); error = spa_reset_logs(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) spa_activate_log(spa); if (error != 0) return (spa_vdev_exit(spa, NULL, txg, error)); /* check new spa name before going any further */ if (spa_lookup(newname) != NULL) return (spa_vdev_exit(spa, NULL, txg, EEXIST)); /* * scan through all the children to ensure they're all mirrors */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* first, check to ensure we've got the right child count */ rvd = spa->spa_root_vdev; lastlog = 0; for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && !vdev_is_concrete(vd))) { if (lastlog == 0) lastlog = c; continue; } lastlog = 0; } if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* next, ensure no spare or cache devices are part of the split */ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); /* then, loop over each vdev and validate it */ for (c = 0; c < children; c++) { uint64_t is_hole = 0; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole != 0) { if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || spa->spa_root_vdev->vdev_child[c]->vdev_islog) { continue; } else { error = SET_ERROR(EINVAL); break; } } /* deal with indirect vdevs */ if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == &vdev_indirect_ops) continue; /* which disk is going to be split? */ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { error = SET_ERROR(EINVAL); break; } /* look it up in the spa */ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); if (vml[c] == NULL) { error = SET_ERROR(ENODEV); break; } /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || !vdev_is_concrete(vml[c]) || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || vml[c]->vdev_children != 0 || vml[c]->vdev_state != VDEV_STATE_HEALTHY || c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { error = SET_ERROR(EINVAL); break; } if (vdev_dtl_required(vml[c]) || vdev_resilver_needed(vml[c], NULL, NULL)) { error = SET_ERROR(EBUSY); break; } /* we need certain info from the top level */ fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, vml[c]->vdev_top->vdev_ms_array); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, vml[c]->vdev_top->vdev_ms_shift); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, vml[c]->vdev_top->vdev_asize); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift); /* transfer per-vdev ZAPs */ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_TOP_ZAP, vml[c]->vdev_parent->vdev_top_zap)); } if (error != 0) { kmem_free(vml, children * sizeof (vdev_t *)); kmem_free(glist, children * sizeof (uint64_t)); return (spa_vdev_exit(spa, NULL, txg, error)); } /* stop writers from using the disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_TRUE; } vdev_reopen(spa->spa_root_vdev); /* * Temporarily record the splitting vdevs in the spa config. This * will disappear once the config is regenerated. */ nvl = fnvlist_alloc(); fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); kmem_free(glist, children * sizeof (uint64_t)); mutex_enter(&spa->spa_props_lock); fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); /* configure and create the new pool */ fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)); VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); newspa->spa_avz_action = AVZ_ACTION_REBUILD; newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); /* release the spa config lock, retaining the namespace lock */ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 1); spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); /* * Temporarily stop the initializing and TRIM activity. We set the * state to ACTIVE so that we know to resume initializing or TRIM * once the split has completed. */ list_t vd_initialize_list; list_create(&vd_initialize_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); list_t vd_trim_list; list_create(&vd_trim_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { mutex_enter(&vml[c]->vdev_initialize_lock); vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); mutex_exit(&vml[c]->vdev_initialize_lock); mutex_enter(&vml[c]->vdev_trim_lock); vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); mutex_exit(&vml[c]->vdev_trim_lock); } } vdev_initialize_stop_wait(spa, &vd_initialize_list); vdev_trim_stop_wait(spa, &vd_trim_list); list_destroy(&vd_initialize_list); list_destroy(&vd_trim_list); newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; newspa->spa_is_splitting = B_TRUE; /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); if (error) goto out; /* if that worked, generate a real config for the new pool */ if (newspa->spa_root_vdev != NULL) { newspa->spa_config_splitting = fnvlist_alloc(); fnvlist_add_uint64(newspa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, B_TRUE)); } /* set the props */ if (props != NULL) { spa_configfile_set(newspa, props, B_FALSE); error = spa_prop_set(newspa, props); if (error) goto out; } /* flush everything */ txg = spa_vdev_config_enter(newspa); vdev_config_dirty(newspa->spa_root_vdev); (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 2); spa_async_resume(newspa); /* finally, update the original pool's config */ txg = spa_vdev_config_enter(spa); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { vdev_t *tvd = vml[c]->vdev_top; /* * Need to be sure the detachable VDEV is not * on any *other* txg's DTL list to prevent it * from being accessed after it's freed. */ for (int t = 0; t < TXG_SIZE; t++) { (void) txg_list_remove_this( &tvd->vdev_dtl_list, vml[c], t); } vdev_split(vml[c]); if (error == 0) spa_history_log_internal(spa, "detach", tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } } spa->spa_avz_action = AVZ_ACTION_REBUILD; vdev_config_dirty(spa->spa_root_vdev); spa->spa_config_splitting = NULL; nvlist_free(nvl); if (error == 0) dmu_tx_commit(tx); (void) spa_vdev_exit(spa, NULL, txg, 0); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ spa_history_log_internal(newspa, "split", NULL, "from pool %s", spa_name(spa)); newspa->spa_is_splitting = B_FALSE; kmem_free(vml, children * sizeof (vdev_t *)); /* if we're not going to mount the filesystems in userland, export */ if (exp) error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, B_FALSE, B_FALSE); return (error); out: spa_unload(newspa); spa_deactivate(newspa); spa_remove(newspa); txg = spa_vdev_config_enter(spa); /* re-online all offlined disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } /* restart initializing or trimming disks as necessary */ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; (void) spa_vdev_exit(spa, NULL, txg, error); kmem_free(vml, children * sizeof (vdev_t *)); return (error); } /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. */ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } /* * Check for a completed replacement. We always consider the first * vdev in the list to be the oldest vdev, and the last one to be * the newest (see spa_vdev_attach() for how that works). In * the case where the newest vdev is faulted, we will not automatically * remove it after a resilver completes. This is OK as it will require * user intervention to determine which disk the admin wishes to keep. */ if (vd->vdev_ops == &vdev_replacing_ops) { ASSERT(vd->vdev_children > 1); newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; if (vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); } /* * Check for a completed resilver with the 'unspare' flag set. * Also potentially update faulted state. */ if (vd->vdev_ops == &vdev_spare_ops) { vdev_t *first = vd->vdev_child[0]; vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; if (last->vdev_unspare) { oldvd = first; newvd = last; } else if (first->vdev_unspare) { oldvd = last; newvd = first; } else { oldvd = NULL; } if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); vdev_propagate_state(vd); /* * If there are more than two spares attached to a disk, * and those spares are not required, then we want to * attempt to free them up now so that they can be used * by other pools. Once we're back down to a single * disk+spare, we stop removing them. */ if (vd->vdev_children > 2) { newvd = vd->vdev_child[1]; if (newvd->vdev_isspare && last->vdev_isspare && vdev_dtl_empty(last, DTL_MISSING) && vdev_dtl_empty(last, DTL_OUTAGE) && !vdev_dtl_required(newvd)) return (newvd); } } return (NULL); } static void spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd, *pvd, *ppvd; uint64_t guid, sguid, pguid, ppguid; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { pvd = vd->vdev_parent; ppvd = pvd->vdev_parent; guid = vd->vdev_guid; pguid = pvd->vdev_guid; ppguid = ppvd->vdev_guid; sguid = 0; /* * If we have just finished replacing a hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); sguid = ppvd->vdev_child[1]->vdev_guid; } ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); spa_config_exit(spa, SCL_ALL, FTAG); if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } spa_config_exit(spa, SCL_ALL, FTAG); /* * If a detach was not performed above replace waiters will not have * been notified. In which case we must do so now. */ spa_notify_waiters(spa); } /* * Update the stored path or FRU for this vdev. */ static int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { vdev_t *vd; boolean_t sync = B_FALSE; ASSERT(spa_writeable(spa)); spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { if (strcmp(value, vd->vdev_path) != 0) { spa_strfree(vd->vdev_path); vd->vdev_path = spa_strdup(value); sync = B_TRUE; } } else { if (vd->vdev_fru == NULL) { vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } } return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); } int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); } int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) { return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* * ========================================================================== * SPA Scanning * ========================================================================== */ int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); } int spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scan_cancel(spa->spa_dsl_pool)); } int spa_scan(spa_t *spa, pool_scan_func_t func) { return (spa_scan_range(spa, func, 0, 0)); } int spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart, uint64_t txgend) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (SET_ERROR(ENOTSUP)); if (func == POOL_SCAN_RESILVER && !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) return (SET_ERROR(ENOTSUP)); if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0)) return (SET_ERROR(ENOTSUP)); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } if (func == POOL_SCAN_ERRORSCRUB && !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) return (SET_ERROR(ENOTSUP)); return (dsl_scan(spa->spa_dsl_pool, func, txgstart, txgend)); } /* * ========================================================================== * SPA async task processing * ========================================================================== */ static void spa_async_remove(spa_t *spa, vdev_t *vd) { if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = B_FALSE; vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* * We want to clear the stats, but we don't want to do a full * vdev_clear() as that will cause us to throw away * degraded/faulted state as well as attempt to reopen the * device, all of which is a waste. */ vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); /* Tell userspace that the vdev is gone. */ zfs_post_remove(spa, vd); } for (int c = 0; c < vd->vdev_children; c++) spa_async_remove(spa, vd->vdev_child[c]); } static void spa_async_fault_vdev(spa_t *spa, vdev_t *vd) { if (vd->vdev_fault_wanted) { vd->vdev_fault_wanted = B_FALSE; vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); } for (int c = 0; c < vd->vdev_children; c++) spa_async_fault_vdev(spa, vd->vdev_child[c]); } static void spa_async_autoexpand(spa_t *spa, vdev_t *vd) { if (!spa->spa_autoexpand) return; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; spa_async_autoexpand(spa, cvd); } if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) return; spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); } static __attribute__((noreturn)) void spa_async_thread(void *arg) { spa_t *spa = (spa_t *)arg; dsl_pool_t *dp = spa->spa_dsl_pool; int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; spa->spa_async_tasks = 0; mutex_exit(&spa->spa_async_lock); /* * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); old_space += metaslab_class_get_space( spa_embedded_log_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); new_space += metaslab_class_get_space(spa_special_class(spa)); new_space += metaslab_class_get_space(spa_dedup_class(spa)); new_space += metaslab_class_get_space( spa_embedded_log_class(spa)); mutex_exit(&spa_namespace_lock); /* * If the pool grew as a result of the config update, * then log an internal history event. */ if (new_space != old_space) { spa_history_log_internal(spa, "vdev online", NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), (u_longlong_t)new_space, (u_longlong_t)(new_space - old_space)); } } /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); for (int i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_async_autoexpand(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* * See if any devices need to be marked faulted. */ if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_fault_vdev(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE || tasks & SPA_ASYNC_REBUILD_DONE || tasks & SPA_ASYNC_DETACH_SPARE) { spa_vdev_resilver_done(spa); } /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER && !vdev_rebuild_active(spa->spa_root_vdev) && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_TRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache whole device TRIM. */ if (tasks & SPA_ASYNC_L2CACHE_TRIM) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_l2arc(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache rebuilding. */ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); l2arc_spa_rebuild_start(spa); spa_config_exit(spa, SCL_L2ARC, FTAG); mutex_exit(&spa_namespace_lock); } /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); spa->spa_async_thread = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } void spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; while (spa->spa_async_thread != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); spa_vdev_remove_suspend(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_cancel(condense_thread); zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; if (raidz_expand_thread != NULL) zthr_cancel(raidz_expand_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_cancel(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_cancel(ll_condense_thread); } void spa_async_resume(spa_t *spa) { mutex_enter(&spa->spa_async_lock); ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); spa_restart_removal(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_resume(condense_thread); zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; if (raidz_expand_thread != NULL) zthr_resume(raidz_expand_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_resume(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_resume(ll_condense_thread); } static boolean_t spa_async_tasks_pending(spa_t *spa) { uint_t non_config_tasks; uint_t config_task; boolean_t config_task_suspended; non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; if (spa->spa_ccw_fail_time == 0) { config_task_suspended = B_FALSE; } else { config_task_suspended = (gethrtime() - spa->spa_ccw_fail_time) < ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); } return (non_config_tasks || (config_task && !config_task_suspended)); } static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if (spa_async_tasks_pending(spa) && !spa->spa_async_suspended && spa->spa_async_thread == NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } void spa_async_request(spa_t *spa, int task) { zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); } int spa_async_tasks(spa_t *spa) { return (spa->spa_async_tasks); } /* * ========================================================================== * SPA syncing routines * ========================================================================== */ static int bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { bpobj_t *bpo = arg; bpobj_enqueue(bpo, bp, bp_freed, tx); return (0); } int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); } int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); } static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zio_t *pio = arg; zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, pio->io_flags)); return (0); } static int bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (spa_free_sync_cb(arg, bp, tx)); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing frees. */ static void spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(bpl, spa_free_sync_cb, zio, tx); VERIFY(zio_wait(zio) == 0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing deferred frees. */ static void spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) { if (spa_sync_pass(spa) != 1) return; /* * Note: * If the log space map feature is active, we stop deferring * frees to the next TXG and therefore running this function * would be considered a no-op as spa_deferred_bpobj should * not have any entries. * * That said we run this function anyway (instead of returning * immediately) for the edge-case scenario where we just * activated the log space map feature in this TXG but we have * deferred frees from the previous TXG. */ zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, bpobj_spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { char *packed = NULL; size_t bufsize; size_t nvsize = 0; dmu_buf_t *db; VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration * information. This avoids the dmu_buf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = vmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); memset(packed + nvsize, 0, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); vmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; dmu_buf_rele(db, FTAG); } static void spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, const char *config, const char *entry) { nvlist_t *nvroot; nvlist_t **list; int i; if (!sav->sav_sync) return; /* * Update the MOS nvlist describing the list of available devices. * spa_validate_aux() will have already made sure this nvlist is * valid and the vdevs are labeled appropriately. */ if (sav->sav_object == 0) { sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, &sav->sav_object, tx) == 0); } nvroot = fnvlist_alloc(); if (sav->sav_count == 0) { fnvlist_add_nvlist_array(nvroot, config, (const nvlist_t * const *)NULL, 0); } else { list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); fnvlist_add_nvlist_array(nvroot, config, (const nvlist_t * const *)list, sav->sav_count); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); } spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); sav->sav_sync = B_FALSE; } /* * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. * The all-vdev ZAP must be empty. */ static void spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; if (vd->vdev_root_zap != 0 && spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_root_zap, tx)); } if (vd->vdev_top_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_top_zap, tx)); } if (vd->vdev_leaf_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_leaf_zap, tx)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { spa_avz_build(vd->vdev_child[i], avz, tx); } } static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; /* * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, * its config may not be dirty but we still need to build per-vdev ZAPs. * Similarly, if the pool is being assembled (e.g. after a split), we * need to rebuild the AVZ although the config may not be dirty. */ if (list_is_empty(&spa->spa_config_dirty_list) && spa->spa_avz_action == AVZ_ACTION_NONE) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || spa->spa_avz_action == AVZ_ACTION_INITIALIZE || spa->spa_all_vdev_zaps != 0); if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { /* Make and build the new AVZ */ uint64_t new_avz = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); spa_avz_build(spa->spa_root_vdev, new_avz, tx); /* Diff old AVZ with new one */ zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { uint64_t vdzap = za->za_first_integer; if (zap_lookup_int(spa->spa_meta_objset, new_avz, vdzap) == ENOENT) { /* * ZAP is listed in old AVZ but not in new one; * destroy it */ VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, tx)); } } zap_cursor_fini(&zc); zap_attribute_free(za); /* Destroy the old AVZ */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); /* Replace the old AVZ in the dir obj with the new one */ VERIFY0(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, sizeof (new_avz), 1, &new_avz, tx)); spa->spa_all_vdev_zaps = new_avz; } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); /* Walk through the AVZ and destroy all listed ZAPs */ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { uint64_t zap = za->za_first_integer; VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); } zap_cursor_fini(&zc); zap_attribute_free(za); /* Destroy and unlink the AVZ itself */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); spa->spa_all_vdev_zaps = 0; } if (spa->spa_all_vdev_zaps == 0) { spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx); } spa->spa_avz_action = AVZ_ACTION_NONE; /* Create ZAPs for vdevs that don't have them. */ vdev_construct_zaps(spa->spa_root_vdev, tx); config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); /* * If we're upgrading the spa version then make sure that * the config object gets updated with the correct version. */ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa->spa_uberblock.ub_version); spa_config_exit(spa, SCL_STATE, FTAG); nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } static void spa_sync_version(void *arg, dmu_tx_t *tx) { uint64_t *versionp = arg; uint64_t version = *versionp; spa_t *spa = dmu_tx_pool(tx)->dp_spa; /* * Setting the version is special cased when first creating the pool. */ ASSERT(tx->tx_txg != TXG_INITIAL); ASSERT(SPA_VERSION_IS_SUPPORTED(version)); ASSERT(version >= spa_version(spa)); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "version=%lld", (longlong_t)version); } /* * Set zpool properties. */ static void spa_sync_props(void *arg, dmu_tx_t *tx) { nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvp, elem))) { uint64_t intval; const char *strval, *fname; zpool_prop_t prop; const char *propname; const char *elemname = nvpair_name(elem); zprop_type_t proptype; spa_feature_t fid; switch (prop = zpool_name_to_prop(elemname)) { case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* * The version is synced separately before other * properties and should be correct by now. */ ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: /* * 'altroot' is a non-persistent property. It should * have been set temporarily at creation or import time. */ ASSERT(spa->spa_root != NULL); break; case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* * 'readonly' and 'cachefile' are also non-persistent * properties. */ break; case ZPOOL_PROP_COMMENT: strval = fnvpair_value_string(elem); if (spa->spa_comment != NULL) spa_strfree(spa->spa_comment); spa->spa_comment = spa_strdup(strval); /* * We need to dirty the configuration on all the vdevs * so that their labels get updated. We also need to * update the cache file to keep it in sync with the * MOS version. It's unnecessary to do this for pool * creation since the vdev's configuration has already * been dirtied. */ if (tx->tx_txg != TXG_INITIAL) { vdev_config_dirty(spa->spa_root_vdev); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } spa_history_log_internal(spa, "set", tx, "%s=%s", elemname, strval); break; case ZPOOL_PROP_COMPATIBILITY: strval = fnvpair_value_string(elem); if (spa->spa_compatibility != NULL) spa_strfree(spa->spa_compatibility); spa->spa_compatibility = spa_strdup(strval); /* * Dirty the configuration on vdevs as above. */ if (tx->tx_txg != TXG_INITIAL) { vdev_config_dirty(spa->spa_root_vdev); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; case ZPOOL_PROP_INVAL: if (zpool_prop_feature(elemname)) { fname = strchr(elemname, '@') + 1; VERIFY0(zfeature_lookup_name(fname, &fid)); spa_feature_enable(spa, fid, tx); spa_history_log_internal(spa, "set", tx, "%s=enabled", elemname); break; } else if (!zfs_prop_user(elemname)) { ASSERT(zpool_prop_feature(elemname)); break; } zfs_fallthrough; default: /* * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { spa->spa_pool_props_object = zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, tx); } /* normalize the property name */ if (prop == ZPOOL_PROP_INVAL) { propname = elemname; proptype = PROP_TYPE_STRING; } else { propname = zpool_prop_to_name(prop); proptype = zpool_prop_get_type(prop); } if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%s", elemname, strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(zpool_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%lld", elemname, (longlong_t)intval); switch (prop) { case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; break; case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; break; case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; case ZPOOL_PROP_AUTOTRIM: spa->spa_autotrim = intval; spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; if (tx->tx_txg != TXG_INITIAL) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); break; case ZPOOL_PROP_MULTIHOST: spa->spa_multihost = intval; break; case ZPOOL_PROP_DEDUP_TABLE_QUOTA: spa->spa_dedup_table_quota = intval; break; default: break; } } else { ASSERT(0); /* not allowed */ } } } mutex_exit(&spa->spa_props_lock); } /* * Perform one-time upgrade on-disk changes. spa_version() does not * reflect the new version this txg, so there must be no changes this * txg to anything that the upgrade code depends on after it executes. * Therefore this must be called after dsl_pool_sync() does the sync * tasks. */ static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { if (spa_sync_pass(spa) != 1) return; dsl_pool_t *dp = spa->spa_dsl_pool; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); /* Keeping the origin open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { dsl_pool_upgrade_clones(dp, tx); } if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { dsl_pool_upgrade_dir_clones(dp, tx); /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } /* * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable * when possibility to use lz4 compression for metadata was added * Old pools that have this feature enabled must be upgraded to have * this feature active */ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { boolean_t lz4_en = spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS); boolean_t lz4_ac = spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS); if (lz4_en && !lz4_ac) spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); } /* * If we haven't written the salt, do so now. Note that the * feature may not be activated yet, but that's fine since * the presence of this ZAP entry is backwards compatible. */ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT) == ENOENT) { VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes, tx)); } rrw_exit(&dp->dp_config_rwlock, FTAG); } static void vdev_indirect_state_sync_verify(vdev_t *vd) { vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(vim != NULL); ASSERT(vib != NULL); } uint64_t obsolete_sm_object = 0; ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, space_map_allocated(vd->vdev_obsolete_sm)); } ASSERT(vd->vdev_obsolete_segments != NULL); /* * Since frees / remaps to an indirect vdev can only * happen in syncing context, the obsolete segments * tree must be empty when we start syncing. */ ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); } /* * Set the top-level vdev's max queue depth. Evaluate each top-level's * async write queue depth in case it changed. The max queue depth will * not change in the middle of syncing out this txg. */ static void spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) { ASSERT(spa_writeable(spa)); vdev_t *rvd = spa->spa_root_vdev; uint32_t max_queue_depth = zfs_vdev_async_write_max_active * zfs_vdev_queue_depth_pct / 100; metaslab_class_t *normal = spa_normal_class(spa); metaslab_class_t *special = spa_special_class(spa); metaslab_class_t *dedup = spa_dedup_class(spa); uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || !metaslab_group_initialized(mg)) continue; metaslab_class_t *mc = mg->mg_class; if (mc != normal && mc != special && mc != dedup) continue; /* * It is safe to do a lock-free check here because only async * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). */ for (int i = 0; i < mg->mg_allocators; i++) { ASSERT0(zfs_refcount_count( &(mg->mg_allocator[i].mga_alloc_queue_depth))); } mg->mg_max_alloc_queue_depth = max_queue_depth; for (int i = 0; i < mg->mg_allocators; i++) { mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = zfs_vdev_def_queue_depth; } slots_per_allocator += zfs_vdev_def_queue_depth; } for (int i = 0; i < spa->spa_alloc_count; i++) { ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. mca_alloc_slots)); ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. mca_alloc_slots)); ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. mca_alloc_slots)); normal->mc_allocator[i].mca_alloc_max_slots = slots_per_allocator; special->mc_allocator[i].mca_alloc_max_slots = slots_per_allocator; dedup->mc_allocator[i].mca_alloc_max_slots = slots_per_allocator; } normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; } static void spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) { ASSERT(spa_writeable(spa)); vdev_t *rvd = spa->spa_root_vdev; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_indirect_state_sync_verify(vd); if (vdev_indirect_should_condense(vd)) { spa_condense_indirect_start_sync(vd, tx); break; } } } static void spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) { objset_t *mos = spa->spa_meta_objset; dsl_pool_t *dp = spa->spa_dsl_pool; uint64_t txg = tx->tx_txg; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; do { int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); if (pass < zfs_sync_pass_deferred_free || spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { /* * If the log space map feature is active we don't * care about deferred frees and the deferred bpobj * as the log space map should effectively have the * same results (i.e. appending only to one object). */ spa_sync_frees(spa, free_bpl, tx); } else { /* * We can not defer frees in pass 1, because * we sync the deferred frees later in pass 1. */ ASSERT3U(pass, >, 1); bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, &spa->spa_deferred_bpobj, tx); } brt_sync(spa, txg); ddt_sync(spa, txg); dsl_scan_sync(dp, tx); dsl_errorscrub_sync(dp, tx); svr_sync(spa, tx); spa_sync_upgrades(spa, tx); spa_flush_metaslabs(spa, tx); vdev_t *vd = NULL; while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL) vdev_sync(vd, txg); if (pass == 1) { /* * dsl_pool_sync() -> dp_sync_tasks may have dirtied * the config. If that happens, this txg should not * be a no-op. So we must sync the config to the MOS * before checking for no-op. * * Note that when the config is dirty, it will * be written to the MOS (i.e. the MOS will be * dirtied) every time we call spa_sync_config_object() * in this txg. Therefore we can't call this after * dsl_pool_sync() every pass, because it would * prevent us from converging, since we'd dirty * the MOS every pass. * * Sync tasks can only be processed in pass 1, so * there's no need to do this in later passes. */ spa_sync_config_object(spa, tx); } /* * Note: We need to check if the MOS is dirty because we could * have marked the MOS dirty without updating the uberblock * (e.g. if we have sync tasks but no dirty user data). We need * to check the uberblock's rootbp because it is updated if we * have synced out dirty data (though in this case the MOS will * most likely also be dirty due to second order effects, we * don't want to rely on that here). */ if (pass == 1 && BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, therefore this * TXG is a no-op. Avoid syncing deferred frees, so * that we can keep this TXG as a no-op. */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); break; } spa_sync_deferred_frees(spa, tx); } while (dmu_objset_is_dirty(mos, txg)); } /* * Rewrite the vdev configuration (which includes the uberblock) to * commit the transaction group. * * If there are no dirty vdevs, we sync the uberblock to a few random * top-level vdevs that are known to be visible in the config cache * (see spa_vdev_add() for a complete description). If there *are* dirty * vdevs, sync the uberblock to all vdevs. */ static void spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg = tx->tx_txg; for (;;) { int error = 0; /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = random_in_range(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, txg); } else { error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg); } if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) break; zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); zio_resume_wait(spa); } } /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ void spa_sync(spa_t *spa, uint64_t txg) { vdev_t *vd = NULL; VERIFY(spa_writeable(spa)); /* * Wait for i/os issued in open context that need to complete * before this txg syncs. */ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* * Now that there can be no more cloning in this transaction group, * but we are still before issuing frees, we can process pending BRT * updates. */ brt_pending_apply(spa, txg); /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_allocs[i].spaa_lock); VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); mutex_exit(&spa->spa_allocs[i].spaa_lock); } /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { /* Avoid holding the write lock unless actually necessary */ if (vd->vdev_aux == NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); continue; } /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. * This is ugly and will become unnecessary when we * eliminate the aux vdev wart by integrating all vdevs * into the root vdev tree. */ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); } spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); dsl_pool_t *dp = spa->spa_dsl_pool; dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(spa->spa_deadman_synctime)); /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { vdev_t *rvd = spa->spa_root_vdev; int i; for (i = 0; i < rvd->vdev_children; i++) { vd = rvd->vdev_child[i]; if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) break; } if (i == rvd->vdev_children) { spa->spa_deflate = TRUE; VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx)); } } spa_sync_adjust_vdev_max_queue_depth(spa); spa_sync_condense_indirect(spa, tx); spa_sync_iterate_to_convergence(spa, tx); #ifdef ZFS_DEBUG if (!list_is_empty(&spa->spa_config_dirty_list)) { /* * Make sure that the number of ZAPs for all the vdevs matches * the number of ZAPs in the per-vdev ZAP list. This only gets * called if the config is dirty; otherwise there may be * outstanding AVZ operations that weren't completed in * spa_sync_config_object. */ uint64_t all_vdev_zap_entry_count; ASSERT0(zap_count(spa->spa_meta_objset, spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, all_vdev_zap_entry_count); } #endif if (spa->spa_vdev_removal != NULL) { ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); } spa_sync_rewrite_vdev_config(spa, tx); dmu_tx_commit(tx); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = 0; /* * Clear the dirty config list. */ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* * Now that the new config has synced transactionally, * let it become visible to the config cache. */ if (spa->spa_config_syncing != NULL) { spa_config_set(spa, spa->spa_config_syncing); spa->spa_config_txg = txg; spa->spa_config_syncing = NULL; } dsl_pool_sync_done(dp, txg); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_allocs[i].spaa_lock); VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); mutex_exit(&spa->spa_allocs[i].spaa_lock); } /* * Update usable space statistics. */ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) vdev_sync_done(vd, txg); metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); /* spa_embedded_log_class has only one metaslab per vdev. */ metaslab_class_evict_old(spa->spa_special_class, txg); metaslab_class_evict_old(spa->spa_dedup_class, txg); spa_sync_close_syncing_log_sm(spa); spa_update_dspace(spa); if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) vdev_autotrim_kick(spa); /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); while (zfs_pause_spa_sync) delay(1); spa->spa_sync_pass = 0; /* * Update the last synced uberblock here. We want to do this at * the end of spa_sync() so that consumers of spa_last_synced_txg() * will be guaranteed that all the processing associated with * that txg has been completed. */ spa->spa_ubsync = spa->spa_uberblock; spa_config_exit(spa, SCL_CONFIG, FTAG); spa_handle_ignored_writes(spa); /* * If any async tasks have been requested, kick them off. */ spa_async_dispatch(spa); } /* * Sync all pools. We don't want to hold the namespace lock across these * operations, so we take a reference on the spa_t and drop the lock during the * sync. */ void spa_sync_allpools(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); } taskq_t * spa_sync_tq_create(spa_t *spa, const char *name) { kthread_t **kthreads; ASSERT(spa->spa_sync_tq == NULL); ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); /* * - do not allow more allocators than cpus. * - there may be more cpus than allocators. * - do not allow more sync taskq threads than allocators or cpus. */ int nthreads = spa->spa_alloc_count; spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * nthreads, KM_SLEEP); spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri, nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads); VERIFY(spa->spa_sync_tq != NULL); VERIFY(kthreads != NULL); spa_syncthread_info_t *ti = spa->spa_syncthreads; for (int i = 0; i < nthreads; i++, ti++) { ti->sti_thread = kthreads[i]; ti->sti_allocator = i; } kmem_free(kthreads, sizeof (*kthreads) * nthreads); return (spa->spa_sync_tq); } void spa_sync_tq_destroy(spa_t *spa) { ASSERT(spa->spa_sync_tq != NULL); taskq_wait(spa->spa_sync_tq); taskq_destroy(spa->spa_sync_tq); kmem_free(spa->spa_syncthreads, sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); spa->spa_sync_tq = NULL; } uint_t spa_acq_allocator(spa_t *spa) { int i; if (spa->spa_alloc_count == 1) return (0); mutex_enter(&spa->spa_allocs_use->sau_lock); uint_t r = spa->spa_allocs_use->sau_rotor; do { if (++r == spa->spa_alloc_count) r = 0; } while (spa->spa_allocs_use->sau_inuse[r]); spa->spa_allocs_use->sau_inuse[r] = B_TRUE; spa->spa_allocs_use->sau_rotor = r; mutex_exit(&spa->spa_allocs_use->sau_lock); spa_syncthread_info_t *ti = spa->spa_syncthreads; for (i = 0; i < spa->spa_alloc_count; i++, ti++) { if (ti->sti_thread == curthread) { ti->sti_allocator = r; break; } } ASSERT3S(i, <, spa->spa_alloc_count); return (r); } void spa_rel_allocator(spa_t *spa, uint_t allocator) { if (spa->spa_alloc_count > 1) spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE; } void spa_select_allocator(zio_t *zio) { zbookmark_phys_t *bm = &zio->io_bookmark; spa_t *spa = zio->io_spa; ASSERT(zio->io_type == ZIO_TYPE_WRITE); /* * A gang block (for example) may have inherited its parent's * allocator, in which case there is nothing further to do here. */ if (ZIO_HAS_ALLOCATOR(zio)) return; ASSERT(spa != NULL); ASSERT(bm != NULL); /* * First try to use an allocator assigned to the syncthread, and set * the corresponding write issue taskq for the allocator. * Note, we must have an open pool to do this. */ if (spa->spa_sync_tq != NULL) { spa_syncthread_info_t *ti = spa->spa_syncthreads; for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { if (ti->sti_thread == curthread) { zio->io_allocator = ti->sti_allocator; return; } } } /* * We want to try to use as many allocators as possible to help improve * performance, but we also want logically adjacent IOs to be physically * adjacent to improve sequential read performance. We chunk each object * into 2^20 block regions, and then hash based on the objset, object, * level, and region to accomplish both of these goals. */ uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, bm->zb_blkid >> 20); zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; } /* * ========================================================================== * Miscellaneous routines * ========================================================================== */ /* * Remove all pools in the system. */ void spa_evict_all(void) { spa_t *spa; /* * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ mutex_enter(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. The async thread may need to detach * a device that's been replaced, which requires grabbing * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } spa_remove(spa); } mutex_exit(&spa_namespace_lock); } vdev_t * spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } } return (NULL); } void spa_upgrade(spa_t *spa, uint64_t version) { ASSERT(spa_writeable(spa)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * This should only be called for a non-faulted pool, and since a * future version would result in an unopenable pool, this shouldn't be * possible. */ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); ASSERT3U(version, >=, spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); } static boolean_t spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) { (void) spa; int i; uint64_t vdev_guid; for (i = 0; i < sav->sav_count; i++) if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); for (i = 0; i < sav->sav_npending; i++) { if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, &vdev_guid) == 0 && vdev_guid == guid) return (B_TRUE); } return (B_FALSE); } boolean_t spa_has_l2cache(spa_t *spa, uint64_t guid) { return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); } boolean_t spa_has_spare(spa_t *spa, uint64_t guid) { return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); } /* * Check if a pool has an active shared spare device. * Note: reference count of an active spare is 2, as a spare and as a replace */ static boolean_t spa_has_active_shared_spare(spa_t *spa) { int i, refcnt; uint64_t pool; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) { if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, &refcnt) && pool != 0ULL && pool == spa_guid(spa) && refcnt > 2) return (B_TRUE); } return (B_FALSE); } uint64_t spa_total_metaslabs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t m = 0; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (!vdev_is_concrete(vd)) continue; m += vd->vdev_ms_count; } return (m); } /* * Notify any waiting threads that some activity has switched from being in- * progress to not-in-progress so that the thread can wake up and determine * whether it is finished waiting. */ void spa_notify_waiters(spa_t *spa) { /* * Acquiring spa_activities_lock here prevents the cv_broadcast from * happening between the waiting thread's check and cv_wait. */ mutex_enter(&spa->spa_activities_lock); cv_broadcast(&spa->spa_activities_cv); mutex_exit(&spa->spa_activities_lock); } /* * Notify any waiting threads that the pool is exporting, and then block until * they are finished using the spa_t. */ void spa_wake_waiters(spa_t *spa) { mutex_enter(&spa->spa_activities_lock); spa->spa_waiters_cancel = B_TRUE; cv_broadcast(&spa->spa_activities_cv); while (spa->spa_waiters != 0) cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); spa->spa_waiters_cancel = B_FALSE; mutex_exit(&spa->spa_activities_lock); } /* Whether the vdev or any of its descendants are being initialized/trimmed. */ static boolean_t spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); ASSERT(activity == ZPOOL_WAIT_INITIALIZE || activity == ZPOOL_WAIT_TRIM); kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? &vd->vdev_initialize_lock : &vd->vdev_trim_lock; mutex_exit(&spa->spa_activities_lock); mutex_enter(lock); mutex_enter(&spa->spa_activities_lock); boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); mutex_exit(lock); if (in_progress) return (B_TRUE); for (int i = 0; i < vd->vdev_children; i++) { if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], activity)) return (B_TRUE); } return (B_FALSE); } /* * If use_guid is true, this checks whether the vdev specified by guid is * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool * is being initialized/trimmed. The caller must hold the config lock and * spa_activities_lock. */ static int spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, zpool_wait_activity_t activity, boolean_t *in_progress) { mutex_exit(&spa->spa_activities_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); mutex_enter(&spa->spa_activities_lock); vdev_t *vd; if (use_guid) { vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (EINVAL); } } else { vd = spa->spa_root_vdev; } *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (0); } /* * Locking for waiting threads * --------------------------- * * Waiting threads need a way to check whether a given activity is in progress, * and then, if it is, wait for it to complete. Each activity will have some * in-memory representation of the relevant on-disk state which can be used to * determine whether or not the activity is in progress. The in-memory state and * the locking used to protect it will be different for each activity, and may * not be suitable for use with a cvar (e.g., some state is protected by the * config lock). To allow waiting threads to wait without any races, another * lock, spa_activities_lock, is used. * * When the state is checked, both the activity-specific lock (if there is one) * and spa_activities_lock are held. In some cases, the activity-specific lock * is acquired explicitly (e.g. the config lock). In others, the locking is * internal to some check (e.g. bpobj_is_empty). After checking, the waiting * thread releases the activity-specific lock and, if the activity is in * progress, then cv_waits using spa_activities_lock. * * The waiting thread is woken when another thread, one completing some * activity, updates the state of the activity and then calls * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only * needs to hold its activity-specific lock when updating the state, and this * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. * * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, * and because it is held when the waiting thread checks the state of the * activity, it can never be the case that the completing thread both updates * the activity state and cv_broadcasts in between the waiting thread's check * and cv_wait. Thus, a waiting thread can never miss a wakeup. * * In order to prevent deadlock, when the waiting thread does its check, in some * cases it will temporarily drop spa_activities_lock in order to acquire the * activity-specific lock. The order in which spa_activities_lock and the * activity specific lock are acquired in the waiting thread is determined by * the order in which they are acquired in the completing thread; if the * completing thread calls spa_notify_waiters with the activity-specific lock * held, then the waiting thread must also acquire the activity-specific lock * first. */ static int spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *in_progress) { int error = 0; ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); switch (activity) { case ZPOOL_WAIT_CKPT_DISCARD: *in_progress = (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == ENOENT); break; case ZPOOL_WAIT_FREE: *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || spa_livelist_delete_check(spa)); break; case ZPOOL_WAIT_INITIALIZE: case ZPOOL_WAIT_TRIM: error = spa_vdev_activity_in_progress(spa, use_tag, tag, activity, in_progress); break; case ZPOOL_WAIT_REPLACE: mutex_exit(&spa->spa_activities_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); mutex_enter(&spa->spa_activities_lock); *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); break; case ZPOOL_WAIT_REMOVE: *in_progress = (spa->spa_removing_phys.sr_state == DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: *in_progress = vdev_rebuild_active(spa->spa_root_vdev); if (*in_progress) break; zfs_fallthrough; case ZPOOL_WAIT_SCRUB: { boolean_t scanning, paused, is_scrub; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); scanning = (scn->scn_phys.scn_state == DSS_SCANNING); paused = dsl_scan_is_paused_scrub(scn); *in_progress = (scanning && !paused && is_scrub == (activity == ZPOOL_WAIT_SCRUB)); break; } case ZPOOL_WAIT_RAIDZ_EXPAND: { vdev_raidz_expand_t *vre = spa->spa_raidz_expand; *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING); break; } default: panic("unrecognized value for activity %d", activity); } return (error); } static int spa_wait_common(const char *pool, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *waited) { /* * The tag is used to distinguish between instances of an activity. * 'initialize' and 'trim' are the only activities that we use this for. * The other activities can only have a single instance in progress in a * pool at one time, making the tag unnecessary. * * There can be multiple devices being replaced at once, but since they * all finish once resilvering finishes, we don't bother keeping track * of them individually, we just wait for them all to finish. */ if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && activity != ZPOOL_WAIT_TRIM) return (EINVAL); if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) return (EINVAL); spa_t *spa; int error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); /* * Increment the spa's waiter count so that we can call spa_close and * still ensure that the spa_t doesn't get freed before this thread is * finished with it when the pool is exported. We want to call spa_close * before we start waiting because otherwise the additional ref would * prevent the pool from being exported or destroyed throughout the * potentially long wait. */ mutex_enter(&spa->spa_activities_lock); spa->spa_waiters++; spa_close(spa, FTAG); *waited = B_FALSE; for (;;) { boolean_t in_progress; error = spa_activity_in_progress(spa, activity, use_tag, tag, &in_progress); if (error || !in_progress || spa->spa_waiters_cancel) break; *waited = B_TRUE; if (cv_wait_sig(&spa->spa_activities_cv, &spa->spa_activities_lock) == 0) { error = EINTR; break; } } spa->spa_waiters--; cv_signal(&spa->spa_waiters_cv); mutex_exit(&spa->spa_activities_lock); return (error); } /* * Wait for a particular instance of the specified activity to complete, where * the instance is identified by 'tag' */ int spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, boolean_t *waited) { return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); } /* * Wait for all instances of the specified activity complete */ int spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) { return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); } sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { sysevent_t *ev = NULL; #ifdef _KERNEL nvlist_t *resource; resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); if (resource) { ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); ev->resource = resource; } #else (void) spa, (void) vd, (void) hist_nvl, (void) name; #endif return (ev); } void spa_event_post(sysevent_t *ev) { #ifdef _KERNEL if (ev) { zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); kmem_free(ev, sizeof (*ev)); } #else (void) ev; #endif } /* * Post a zevent corresponding to the given sysevent. The 'name' must be one * of the event definitions in sys/sysevent/eventdefs.h. The payload will be * filled in from the spa and (optionally) the vdev. This doesn't do anything * in the userland libzpool, as we don't want consumers to misinterpret ztest * or zdb as real changes. */ void spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); } /* state manipulation functions */ EXPORT_SYMBOL(spa_open); EXPORT_SYMBOL(spa_open_rewind); EXPORT_SYMBOL(spa_get_stats); EXPORT_SYMBOL(spa_create); EXPORT_SYMBOL(spa_import); EXPORT_SYMBOL(spa_tryimport); EXPORT_SYMBOL(spa_destroy); EXPORT_SYMBOL(spa_export); EXPORT_SYMBOL(spa_reset); EXPORT_SYMBOL(spa_async_request); EXPORT_SYMBOL(spa_async_suspend); EXPORT_SYMBOL(spa_async_resume); EXPORT_SYMBOL(spa_inject_addref); EXPORT_SYMBOL(spa_inject_delref); EXPORT_SYMBOL(spa_scan_stat_init); EXPORT_SYMBOL(spa_scan_get_stats); /* device manipulation */ EXPORT_SYMBOL(spa_vdev_add); EXPORT_SYMBOL(spa_vdev_attach); EXPORT_SYMBOL(spa_vdev_detach); EXPORT_SYMBOL(spa_vdev_setpath); EXPORT_SYMBOL(spa_vdev_setfru); EXPORT_SYMBOL(spa_vdev_split_mirror); /* spare statech is global across all pools) */ EXPORT_SYMBOL(spa_spare_add); EXPORT_SYMBOL(spa_spare_remove); EXPORT_SYMBOL(spa_spare_exists); EXPORT_SYMBOL(spa_spare_activate); /* L2ARC statech is global across all pools) */ EXPORT_SYMBOL(spa_l2cache_add); EXPORT_SYMBOL(spa_l2cache_remove); EXPORT_SYMBOL(spa_l2cache_exists); EXPORT_SYMBOL(spa_l2cache_activate); EXPORT_SYMBOL(spa_l2cache_drop); /* scanning */ EXPORT_SYMBOL(spa_scan); EXPORT_SYMBOL(spa_scan_range); EXPORT_SYMBOL(spa_scan_stop); /* spa syncing */ EXPORT_SYMBOL(spa_sync); /* only for DMU use */ EXPORT_SYMBOL(spa_sync_allpools); /* properties */ EXPORT_SYMBOL(spa_prop_set); EXPORT_SYMBOL(spa_prop_get); EXPORT_SYMBOL(spa_prop_clear_bootfs); /* asynchronous event notification */ EXPORT_SYMBOL(spa_event_notify); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW, "Percentage of CPUs to run a metaslab preload taskq"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, "log2 fraction of arc that can be used by inflight I/Os when " "verifying pool during import"); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, "Set to traverse metadata on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, "Set to traverse data on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, "Print vdev tree to zfs_dbgmsg during pool import"); ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW, "Percentage of CPUs to run an IO worker thread"); ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW, "Number of threads per IO worker taskqueue"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, "Set the livelist condense zthr to pause"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW, "Set the livelist condense synctask to pause"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the synctask"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the zthr function"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); #ifdef _KERNEL ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, "Configure IO queues for read IO"); ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW, "Configure IO queues for write IO"); #endif -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW, "Number of CPUs per write issue taskq"); diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index 1efff47f87a0..4c3721c159be 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -1,640 +1,638 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2017 by Delphix. All rights reserved. */ /* * Storage Pool Checkpoint * * A storage pool checkpoint can be thought of as a pool-wide snapshot or * a stable version of extreme rewind that guarantees no blocks from the * checkpointed state will have been overwritten. It remembers the entire * state of the storage pool (e.g. snapshots, dataset names, etc..) from the * point that it was taken and the user can rewind back to that point even if * they applied destructive operations on their datasets or even enabled new * zpool on-disk features. If a pool has a checkpoint that is no longer * needed, the user can discard it. * * == On disk data structures used == * * - The pool has a new feature flag and a new entry in the MOS. The feature * flag is set to active when we create the checkpoint and remains active * until the checkpoint is fully discarded. The entry in the MOS config * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that * references the state of the pool when we take the checkpoint. The entry * remains populated until we start discarding the checkpoint or we rewind * back to it. * * - Each vdev contains a vdev-wide space map while the pool has a checkpoint, * which persists until the checkpoint is fully discarded. The space map * contains entries that have been freed in the current state of the pool * but we want to keep around in case we decide to rewind to the checkpoint. * [see vdev_checkpoint_sm] * * - Each metaslab's ms_sm space map behaves the same as without the * checkpoint, with the only exception being the scenario when we free * blocks that belong to the checkpoint. In this case, these blocks remain * ALLOCATED in the metaslab's space map and they are added as FREE in the * vdev's checkpoint space map. * * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that * the uberblock was checkpointed. For normal uberblocks this field is 0. * * == Overview of operations == * * - To create a checkpoint, we first wait for the current TXG to be synced, * so we can use the most recently synced uberblock (spa_ubsync) as the * checkpointed uberblock. Then we use an early synctask to place that * uberblock in MOS config, increment the feature flag for the checkpoint * (marking it active), and setting spa_checkpoint_txg (see its use below) * to the TXG of the checkpointed uberblock. We use an early synctask for * the aforementioned operations to ensure that no blocks were dirtied * between the current TXG and the TXG of the checkpointed uberblock * (e.g the previous txg). * * - When a checkpoint exists, we need to ensure that the blocks that * belong to the checkpoint are freed but never reused. This means that * these blocks should never end up in the ms_allocatable or the ms_freeing * trees of a metaslab. Therefore, whenever there is a checkpoint the new * ms_checkpointing tree is used in addition to the aforementioned ones. * * Whenever a block is freed and we find out that it is referenced by the * checkpoint (we find out by comparing its birth to spa_checkpoint_txg), * we place it in the ms_checkpointing tree instead of the ms_freeingtree. * This way, we divide the blocks that are being freed into checkpointed * and not-checkpointed blocks. * * In order to persist these frees, we write the extents from the * ms_freeingtree to the ms_sm as usual, and the extents from the * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these * checkpointed extents will remain allocated in the metaslab's ms_sm space * map, and therefore won't be reused [see metaslab_sync()]. In addition, * when we discard the checkpoint, we can find the entries that have * actually been freed in vdev_checkpoint_sm. * [see spa_checkpoint_discard_thread_sync()] * * - To discard the checkpoint we use an early synctask to delete the * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0, * and wakeup the discarding zthr thread (an open-context async thread). * We use an early synctask to ensure that the operation happens before any * new data end up in the checkpoint's data structures. * * Once the synctask is done and the discarding zthr is awake, we discard * the checkpointed data over multiple TXGs by having the zthr prefetching * entries from vdev_checkpoint_sm and then starting a synctask that places * them as free blocks into their respective ms_allocatable and ms_sm * structures. * [see spa_checkpoint_discard_thread()] * * When there are no entries left in the vdev_checkpoint_sm of all * top-level vdevs, a final synctask runs that decrements the feature flag. * * - To rewind to the checkpoint, we first use the current uberblock and * open the MOS so we can access the checkpointed uberblock from the MOS * config. After we retrieve the checkpointed uberblock, we use it as the * current uberblock for the pool by writing it to disk with an updated * TXG, opening its version of the MOS, and moving on as usual from there. * [see spa_ld_checkpoint_rewind()] * * An important note on rewinding to the checkpoint has to do with how we * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL * blocks that have not been claimed by the time we took the checkpoint * as they should no longer be valid. * [see comment in zil_claim()] * * == Miscellaneous information == * * - In the hypothetical event that we take a checkpoint, remove a vdev, * and attempt to rewind, the rewind would fail as the checkpointed * uberblock would reference data in the removed device. For this reason * and others of similar nature, we disallow the following operations that * can change the config: * vdev removal and attach/detach, mirror splitting, and pool reguid. * * - As most of the checkpoint logic is implemented in the SPA and doesn't * distinguish datasets when it comes to space accounting, having a * checkpoint can potentially break the boundaries set by dataset * reservations. */ #include #include #include #include #include #include #include #include #include #include /* * The following parameter limits the amount of memory to be used for the * prefetching of the checkpoint space map done on each vdev while * discarding the checkpoint. * * The reason it exists is because top-level vdevs with long checkpoint * space maps can potentially take up a lot of memory depending on the * amount of checkpointed data that has been freed within them while * the pool had a checkpoint. */ static uint64_t zfs_spa_discard_memory_limit = 16 * 1024 * 1024; int spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) { if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); memset(pcs, 0, sizeof (pool_checkpoint_stat_t)); int error = zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT); ASSERT(error == 0 || error == ENOENT); if (error == ENOENT) pcs->pcs_state = CS_CHECKPOINT_DISCARDING; else pcs->pcs_state = CS_CHECKPOINT_EXISTS; pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace; pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp; return (0); } static void spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = arg; spa->spa_checkpoint_info.sci_timestamp = 0; spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); spa_notify_waiters(spa); spa_history_log_internal(spa, "spa discard checkpoint", tx, "finished discarding checkpointed state from the pool"); } typedef struct spa_checkpoint_discard_sync_callback_arg { vdev_t *sdc_vd; uint64_t sdc_txg; uint64_t sdc_entry_limit; } spa_checkpoint_discard_sync_callback_arg_t; static int spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) { spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; vdev_t *vd = sdc->sdc_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; if (sdc->sdc_entry_limit == 0) return (SET_ERROR(EINTR)); /* * Since the space map is not condensed, we know that * none of its entries is crossing the boundaries of * its respective metaslab. * * That said, there is no fundamental requirement that * the checkpoint's space map entries should not cross * metaslab boundaries. So if needed we could add code * that handles metaslab-crossing segments in the future. */ VERIFY3U(sme->sme_type, ==, SM_FREE); VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * At this point we should not be processing any * other frees concurrently, so the lock is technically * unnecessary. We use the lock anyway though to * potentially save ourselves from future headaches. */ mutex_enter(&ms->ms_lock); if (range_tree_is_empty(ms->ms_freeing)) vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, sme->sme_run); ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run); vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run; vd->vdev_stat.vs_checkpoint_space -= sme->sme_run; sdc->sdc_entry_limit--; return (0); } #ifdef ZFS_DEBUG static void spa_checkpoint_accounting_verify(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t ckpoint_sm_space_sum = 0; uint64_t vs_ckpoint_space_sum = 0; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (vd->vdev_checkpoint_sm != NULL) { ckpoint_sm_space_sum += -space_map_allocated(vd->vdev_checkpoint_sm); vs_ckpoint_space_sum += vd->vdev_stat.vs_checkpoint_space; ASSERT3U(ckpoint_sm_space_sum, ==, vs_ckpoint_space_sum); } else { ASSERT0(vd->vdev_stat.vs_checkpoint_space); } } ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum); } #endif static void spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) { vdev_t *vd = arg; int error; /* * The space map callback is applied only to non-debug entries. * Because the number of debug entries is less or equal to the * number of non-debug entries, we want to ensure that we only * read what we prefetched from open-context. * * Thus, we set the maximum entries that the space map callback * will be applied to be half the entries that could fit in the * imposed memory limit. * * Note that since this is a conservative estimate we also * assume the worst case scenario in our computation where each * entry is two-word. */ uint64_t max_entry_limit = (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1; /* * Iterate from the end of the space map towards the beginning, * placing its entries on ms_freeing and removing them from the * space map. The iteration stops if one of the following * conditions is true: * * 1] We reached the beginning of the space map. At this point * the space map should be completely empty and * space_map_incremental_destroy should have returned 0. * The next step would be to free and close the space map * and remove its entry from its vdev's top zap. This allows * spa_checkpoint_discard_thread() to move on to the next vdev. * * 2] We reached the memory limit (amount of memory used to hold * space map entries in memory) and space_map_incremental_destroy * returned EINTR. This means that there are entries remaining * in the space map that will be cleared in a future invocation * of this function by spa_checkpoint_discard_thread(). */ spa_checkpoint_discard_sync_callback_arg_t sdc; sdc.sdc_vd = vd; sdc.sdc_txg = tx->tx_txg; sdc.sdc_entry_limit = max_entry_limit; uint64_t words_before = space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, spa_checkpoint_discard_sync_callback, &sdc, tx); uint64_t words_after = space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); #ifdef ZFS_DEBUG spa_checkpoint_accounting_verify(vd->vdev_spa); #endif zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %lld, " "deleted %llu words - %llu words are left", (u_longlong_t)tx->tx_txg, (longlong_t)vd->vdev_id, (u_longlong_t)(words_before - words_after), (u_longlong_t)words_after); if (error != EINTR) { if (error != 0) { zfs_panic_recover("zfs: error %lld was returned " "while incrementally destroying the checkpoint " "space map of vdev %llu\n", (longlong_t)error, vd->vdev_id); } ASSERT0(words_after); ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm)); ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); space_map_free(vd->vdev_checkpoint_sm, tx); space_map_close(vd->vdev_checkpoint_sm); vd->vdev_checkpoint_sm = NULL; VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); } } static boolean_t spa_checkpoint_discard_is_done(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(!spa_has_checkpoint(spa)); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)); for (uint64_t c = 0; c < rvd->vdev_children; c++) { if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL) return (B_FALSE); ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space); } return (B_TRUE); } boolean_t spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr) { (void) zthr; spa_t *spa = arg; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (B_FALSE); if (spa_has_checkpoint(spa)) return (B_FALSE); return (B_TRUE); } void spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; while (vd->vdev_checkpoint_sm != NULL) { space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm; int numbufs; dmu_buf_t **dbp; if (zthr_iscancelled(zthr)) return; ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); uint64_t size = MIN(space_map_length(checkpoint_sm), zfs_spa_discard_memory_limit); uint64_t offset = space_map_length(checkpoint_sm) - size; /* * Ensure that the part of the space map that will * be destroyed by the synctask, is prefetched in * memory before the synctask runs. */ int error = dmu_buf_hold_array_by_bonus( checkpoint_sm->sm_dbuf, offset, size, B_TRUE, FTAG, &numbufs, &dbp); if (error != 0) { zfs_panic_recover("zfs: error %d was returned " "while prefetching checkpoint space map " "entries of vdev %llu\n", error, vd->vdev_id); } VERIFY0(dsl_sync_task(spa->spa_name, NULL, spa_checkpoint_discard_thread_sync, vd, 0, ZFS_SPACE_CHECK_NONE)); dmu_buf_rele_array(dbp, numbufs, FTAG); } } VERIFY(spa_checkpoint_discard_is_done(spa)); VERIFY0(spa->spa_checkpoint_info.sci_dspace); VERIFY0(dsl_sync_task(spa->spa_name, NULL, spa_checkpoint_discard_complete_sync, spa, 0, ZFS_SPACE_CHECK_NONE)); } static int spa_checkpoint_check(void *arg, dmu_tx_t *tx) { (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (SET_ERROR(ENOTSUP)); if (!spa_top_vdevs_spacemap_addressable(spa)) return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG)); if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); if (spa->spa_raidz_expand != NULL) return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); if (spa->spa_checkpoint_txg != 0) return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); return (0); } static void spa_checkpoint_sync(void *arg, dmu_tx_t *tx) { (void) arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; uberblock_t checkpoint = spa->spa_ubsync; /* * At this point, there should not be a checkpoint in the MOS. */ ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT); ASSERT0(spa->spa_checkpoint_info.sci_timestamp); ASSERT0(spa->spa_checkpoint_info.sci_dspace); /* * Since the checkpointed uberblock is the one that just got synced * (we use spa_ubsync), its txg must be equal to the txg number of * the txg we are syncing, minus 1. */ ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1); /* * Once the checkpoint is in place, we need to ensure that none of * its blocks will be marked for reuse after it has been freed. * When there is a checkpoint and a block is freed, we compare its * birth txg to the txg of the checkpointed uberblock to see if the * block is part of the checkpoint or not. Therefore, we have to set * spa_checkpoint_txg before any frees happen in this txg (which is * why this is done as an early_synctask as explained in the comment * in spa_checkpoint()). */ spa->spa_checkpoint_txg = checkpoint.ub_txg; spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; checkpoint.ub_checkpoint_txg = checkpoint.ub_txg; VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint, tx)); /* * Increment the feature refcount and thus activate the feature. * Note that the feature will be deactivated when we've * completely discarded all checkpointed state (both vdev * space maps and uberblock). */ spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); spa_history_log_internal(spa, "spa checkpoint", tx, "checkpointed uberblock txg=%llu", (u_longlong_t)checkpoint.ub_txg); } /* * Create a checkpoint for the pool. */ int spa_checkpoint(const char *pool) { int error; spa_t *spa; error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); mutex_enter(&spa->spa_vdev_top_lock); /* * Wait for current syncing txg to finish so the latest synced * uberblock (spa_ubsync) has all the changes that we expect * to see if we were to revert later to the checkpoint. In other * words we want the checkpointed uberblock to include/reference * all the changes that were pending at the time that we issued * the checkpoint command. */ txg_wait_synced(spa_get_dsl(spa), 0); /* * As the checkpointed uberblock references blocks from the previous * txg (spa_ubsync) we want to ensure that are not freeing any of * these blocks in the same txg that the following synctask will * run. Thus, we run it as an early synctask, so the dirty changes * that are synced to disk afterwards during zios and other synctasks * do not reuse checkpointed blocks. */ error = dsl_early_sync_task(pool, spa_checkpoint_check, spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL); mutex_exit(&spa->spa_vdev_top_lock); spa_close(spa, FTAG); return (error); } static int spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx) { (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); if (spa->spa_checkpoint_txg == 0) return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); VERIFY0(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT)); return (0); } static void spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx) { (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, tx)); spa->spa_checkpoint_txg = 0; zthr_wakeup(spa->spa_checkpoint_discard_zthr); spa_history_log_internal(spa, "spa discard checkpoint", tx, "started discarding checkpointed state from the pool"); } /* * Discard the checkpoint from a pool. */ int spa_checkpoint_discard(const char *pool) { /* * Similarly to spa_checkpoint(), we want our synctask to run * before any pending dirty data are written to disk so they * won't end up in the checkpoint's data structures (e.g. * ms_checkpointing and vdev_checkpoint_sm) and re-create any * space maps that the discarding open-context thread has * deleted. * [see spa_discard_checkpoint_sync and spa_discard_checkpoint_thread] */ return (dsl_early_sync_task(pool, spa_checkpoint_discard_check, spa_checkpoint_discard_sync, NULL, 0, ZFS_SPACE_CHECK_DISCARD_CHECKPOINT)); } EXPORT_SYMBOL(spa_checkpoint_get_stats); EXPORT_SYMBOL(spa_checkpoint_discard_thread); EXPORT_SYMBOL(spa_checkpoint_discard_thread_check); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, U64, ZMOD_RW, "Limit for memory used in prefetching the checkpoint space map done " "on each vdev while discarding the checkpoint"); -/* END CSTYLED */ diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index a49e28ee7a43..18b3970ac0dc 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -1,1498 +1,1496 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2014, Delphix. All rights reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2021, 2022, George Amanakis. All rights reserved. */ /* * Routines to manage the on-disk persistent error log. * * Each pool stores a log of all logical data errors seen during normal * operation. This is actually the union of two distinct logs: the last log, * and the current log. All errors seen are logged to the current log. When a * scrub completes, the current log becomes the last log, the last log is thrown * out, and the current log is reinitialized. This way, if an error is somehow * corrected, a new scrub will show that it no longer exists, and will be * deleted from the log when the scrub completes. * * The log is stored using a ZAP object whose key is a string form of the * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an * optional 'objset:object' human-readable string describing the data. When an * error is first logged, this string will be empty, indicating that no name is * known. This prevents us from having to issue a potentially large amount of * I/O to discover the object name during an error path. Instead, we do the * calculation when the data is requested, storing the result so future queries * will be faster. * * If the head_errlog feature is enabled, a different on-disk format is used. * The error log of each head dataset is stored separately in the zap object * and keyed by the head id. This enables listing every dataset affected in * userland. In order to be able to track whether an error block has been * modified or added to snapshots since it was marked as an error, a new tuple * is introduced: zbookmark_err_phys_t. It allows the storage of the birth * transaction group of an error block on-disk. The birth transaction group is * used by check_filesystem() to assess whether this block was freed, * re-written or added to a snapshot since its marking as an error. * * This log is then shipped into an nvlist where the key is the dataset name and * the value is the object name. Userland is then responsible for uniquifying * this list and displaying it to the user. */ #include #include #include #include #include #include #include #include #include #define NAME_MAX_LEN 64 typedef struct clones { uint64_t clone_ds; list_node_t node; } clones_t; /* * spa_upgrade_errlog_limit : A zfs module parameter that controls the number * of on-disk error log entries that will be converted to the new * format when enabling head_errlog. Defaults to 0 which converts * all log entries. */ static uint_t spa_upgrade_errlog_limit = 0; /* * Convert a bookmark to a string. */ static void bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) { (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid); } /* * Convert an err_phys to a string. */ static void errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len) { (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level, (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth); } /* * Convert a string to a err_phys. */ void name_to_errphys(char *buf, zbookmark_err_phys_t *zep) { zep->zb_object = zfs_strtonum(buf, &buf); ASSERT(*buf == ':'); zep->zb_level = (int)zfs_strtonum(buf + 1, &buf); ASSERT(*buf == ':'); zep->zb_blkid = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == ':'); zep->zb_birth = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == '\0'); } /* * Convert a string to a bookmark. */ static void name_to_bookmark(char *buf, zbookmark_phys_t *zb) { zb->zb_objset = zfs_strtonum(buf, &buf); ASSERT(*buf == ':'); zb->zb_object = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == ':'); zb->zb_level = (int)zfs_strtonum(buf + 1, &buf); ASSERT(*buf == ':'); zb->zb_blkid = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == '\0'); } void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) { zb->zb_objset = dataset; zb->zb_object = zep->zb_object; zb->zb_level = zep->zb_level; zb->zb_blkid = zep->zb_blkid; } static void name_to_object(char *buf, uint64_t *obj) { *obj = zfs_strtonum(buf, &buf); ASSERT(*buf == '\0'); } /* * Retrieve the head filesystem. */ static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds) { dsl_dataset_t *ds; int error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, dsobj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); ASSERT(head_ds); *head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } /* * Log an uncorrectable error to the persistent error log. We add it to the * spa's list of pending errors. The changes are actually synced out to disk * during spa_errlog_sync(). */ void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t birth) { spa_error_entry_t search; spa_error_entry_t *new; avl_tree_t *tree; avl_index_t where; /* * If we are trying to import a pool, ignore any errors, as we won't be * writing to the pool any time soon. */ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) return; mutex_enter(&spa->spa_errlist_lock); /* * If we have had a request to rotate the log, log it to the next list * instead of the current one. */ if (spa->spa_scrub_active || spa->spa_scrub_finished) tree = &spa->spa_errlist_scrub; else tree = &spa->spa_errlist_last; search.se_bookmark = *zb; if (avl_find(tree, &search, &where) != NULL) { mutex_exit(&spa->spa_errlist_lock); return; } new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); new->se_bookmark = *zb; /* * If the head_errlog feature is enabled, store the birth txg now. In * case the file is deleted before spa_errlog_sync() runs, we will not * be able to retrieve the birth txg. */ if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { new->se_zep.zb_object = zb->zb_object; new->se_zep.zb_level = zb->zb_level; new->se_zep.zb_blkid = zb->zb_blkid; new->se_zep.zb_birth = birth; } avl_insert(tree, new, where); mutex_exit(&spa->spa_errlist_lock); } int find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, uint64_t *birth_txg) { objset_t *os; int error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); dnode_t *dn; blkptr_t bp; error = dnode_hold(os, zep->zb_object, FTAG, &dn); if (error != 0) return (error); rw_enter(&dn->dn_struct_rwlock, RW_READER); error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, NULL); if (error == 0 && BP_IS_HOLE(&bp)) error = SET_ERROR(ENOENT); *birth_txg = BP_GET_LOGICAL_BIRTH(&bp); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * This function finds the oldest affected filesystem containing an error * block. */ int find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, uint64_t *top_affected_fs) { uint64_t oldest_dsobj; int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth, &oldest_dsobj); if (error != 0) return (error); dsl_dataset_t *ds; error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); *top_affected_fs = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (0); } #ifdef _KERNEL /* * Copy the bookmark to the end of the user-space buffer which starts at * uaddr and has *count unused entries, and decrement *count by 1. */ static int copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count) { if (*count == 0) return (SET_ERROR(ENOMEM)); *count -= 1; if (copyout(zb, (char *)uaddr + (*count) * sizeof (zbookmark_phys_t), sizeof (zbookmark_phys_t)) != 0) return (SET_ERROR(EFAULT)); return (0); } /* * Each time the error block is referenced by a snapshot or clone, add a * zbookmark_phys_t entry to the userspace array at uaddr. The array is * filled from the back and the in-out parameter *count is modified to be the * number of unused entries at the beginning of the array. The function * scrub_filesystem() is modelled after this one. */ static int check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, void *uaddr, uint64_t *count, list_t *clones_list) { dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; int error = dsl_dataset_hold_obj_flags(dp, head_ds, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); uint64_t latest_txg; uint64_t txg_to_consider = spa->spa_syncing_txg; boolean_t check_snapshot = B_TRUE; error = find_birth_txg(ds, zep, &latest_txg); /* * If find_birth_txg() errors out otherwise, let txg_to_consider be * equal to the spa's syncing txg: if check_filesystem() errors out * then affected snapshots or clones will not be checked. */ if (error == 0 && zep->zb_birth == latest_txg) { /* Block neither free nor rewritten. */ zbookmark_phys_t zb; zep_to_zb(head_ds, zep, &zb); error = copyout_entry(&zb, uaddr, count); if (error != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } check_snapshot = B_FALSE; } else if (error == 0) { txg_to_consider = latest_txg; } /* * Retrieve the number of snapshots if the dataset is not a snapshot. */ uint64_t snap_count = 0; if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { error = zap_count(spa->spa_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (error != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } } if (snap_count == 0) { /* Filesystem without snapshots. */ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (0); } uint64_t *snap_obj_array = kmem_zalloc(snap_count * sizeof (uint64_t), KM_SLEEP); int aff_snap_count = 0; uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones; dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); /* Check only snapshots created from this file system. */ while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && snap_obj_txg <= txg_to_consider) { error = dsl_dataset_hold_obj_flags(dp, snap_obj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) goto out; if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) { snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } boolean_t affected = B_TRUE; if (check_snapshot) { uint64_t blk_txg; error = find_birth_txg(ds, zep, &blk_txg); affected = (error == 0 && zep->zb_birth == blk_txg); } /* Report errors in snapshots. */ if (affected) { snap_obj_array[aff_snap_count] = snap_obj; aff_snap_count++; zbookmark_phys_t zb; zep_to_zb(snap_obj, zep, &zb); error = copyout_entry(&zb, uaddr, count); if (error != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); goto out; } } snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); } if (zap_clone == 0 || aff_snap_count == 0) { error = 0; goto out; } /* Check clones. */ zap_cursor_t *zc; zap_attribute_t *za; zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); za = zap_attribute_alloc(); for (zap_cursor_init(zc, spa->spa_meta_objset, zap_clone); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dataset_t *clone; error = dsl_dataset_hold_obj_flags(dp, za->za_first_integer, DS_HOLD_FLAG_DECRYPT, FTAG, &clone); if (error != 0) break; /* * Only clones whose origins were affected could also * have affected snapshots. */ boolean_t found = B_FALSE; for (int i = 0; i < snap_count; i++) { if (dsl_dir_phys(clone->ds_dir)->dd_origin_obj == snap_obj_array[i]) found = B_TRUE; } dsl_dataset_rele_flags(clone, DS_HOLD_FLAG_DECRYPT, FTAG); if (!found) continue; clones_t *ct = kmem_zalloc(sizeof (*ct), KM_SLEEP); ct->clone_ds = za->za_first_integer; list_insert_tail(clones_list, ct); } zap_cursor_fini(zc); zap_attribute_free(za); kmem_free(zc, sizeof (*zc)); out: kmem_free(snap_obj_array, sizeof (*snap_obj_array)); return (error); } static int process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, void *uaddr, uint64_t *count) { /* * If zb_birth == 0 or head_ds == 0 it means we failed to retrieve the * birth txg or the head filesystem of the block pointer. This may * happen e.g. when an encrypted filesystem is not mounted or when * the key is not loaded. In this case do not proceed to * check_filesystem(), instead do the accounting here. */ if (zep->zb_birth == 0 || head_ds == 0) { zbookmark_phys_t zb; zep_to_zb(head_ds, zep, &zb); int error = copyout_entry(&zb, uaddr, count); if (error != 0) { return (error); } return (0); } uint64_t top_affected_fs; uint64_t init_count = *count; int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs); if (error == 0) { clones_t *ct; list_t clones_list; list_create(&clones_list, sizeof (clones_t), offsetof(clones_t, node)); error = check_filesystem(spa, top_affected_fs, zep, uaddr, count, &clones_list); while ((ct = list_remove_head(&clones_list)) != NULL) { error = check_filesystem(spa, ct->clone_ds, zep, uaddr, count, &clones_list); kmem_free(ct, sizeof (*ct)); if (error) { while (!list_is_empty(&clones_list)) { ct = list_remove_head(&clones_list); kmem_free(ct, sizeof (*ct)); } break; } } list_destroy(&clones_list); } if (error == 0 && init_count == *count) { /* * If we reach this point, no errors have been detected * in the checked filesystems/snapshots. Before returning mark * the error block to be removed from the error lists and logs. */ zbookmark_phys_t zb; zep_to_zb(head_ds, zep, &zb); spa_remove_error(spa, &zb, zep->zb_birth); } return (error); } #endif /* Return the number of errors in the error log */ uint64_t spa_get_last_errlog_size(spa_t *spa) { uint64_t total = 0, count; mutex_enter(&spa->spa_errlog_lock); if (spa->spa_errlog_last != 0 && zap_count(spa->spa_meta_objset, spa->spa_errlog_last, &count) == 0) total += count; mutex_exit(&spa->spa_errlog_lock); return (total); } /* * If a healed bookmark matches an entry in the error log we stash it in a tree * so that we can later remove the related log entries in sync context. */ static void spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb, const uint64_t birth) { char name[NAME_MAX_LEN]; if (obj == 0) return; boolean_t held_list = B_FALSE; boolean_t held_log = B_FALSE; if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { bookmark_to_name(healed_zb, name, sizeof (name)); if (zap_contains(spa->spa_meta_objset, healed_zb->zb_objset, name) == 0) { if (!MUTEX_HELD(&spa->spa_errlog_lock)) { mutex_enter(&spa->spa_errlog_lock); held_log = B_TRUE; } /* * Found an error matching healed zb, add zb to our * tree of healed errors */ avl_tree_t *tree = &spa->spa_errlist_healed; spa_error_entry_t search; spa_error_entry_t *new; avl_index_t where; search.se_bookmark = *healed_zb; if (!MUTEX_HELD(&spa->spa_errlist_lock)) { mutex_enter(&spa->spa_errlist_lock); held_list = B_TRUE; } if (avl_find(tree, &search, &where) != NULL) { if (held_list) mutex_exit(&spa->spa_errlist_lock); if (held_log) mutex_exit(&spa->spa_errlog_lock); return; } new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); new->se_bookmark = *healed_zb; avl_insert(tree, new, where); if (held_list) mutex_exit(&spa->spa_errlist_lock); if (held_log) mutex_exit(&spa->spa_errlog_lock); } return; } zbookmark_err_phys_t healed_zep; healed_zep.zb_object = healed_zb->zb_object; healed_zep.zb_level = healed_zb->zb_level; healed_zep.zb_blkid = healed_zb->zb_blkid; healed_zep.zb_birth = birth; errphys_to_name(&healed_zep, name, sizeof (name)); zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_last); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { if (zap_contains(spa->spa_meta_objset, za->za_first_integer, name) == 0) { if (!MUTEX_HELD(&spa->spa_errlog_lock)) { mutex_enter(&spa->spa_errlog_lock); held_log = B_TRUE; } avl_tree_t *tree = &spa->spa_errlist_healed; spa_error_entry_t search; spa_error_entry_t *new; avl_index_t where; search.se_bookmark = *healed_zb; if (!MUTEX_HELD(&spa->spa_errlist_lock)) { mutex_enter(&spa->spa_errlist_lock); held_list = B_TRUE; } if (avl_find(tree, &search, &where) != NULL) { if (held_list) mutex_exit(&spa->spa_errlist_lock); if (held_log) mutex_exit(&spa->spa_errlog_lock); continue; } new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); new->se_bookmark = *healed_zb; new->se_zep = healed_zep; avl_insert(tree, new, where); if (held_list) mutex_exit(&spa->spa_errlist_lock); if (held_log) mutex_exit(&spa->spa_errlog_lock); } } zap_cursor_fini(&zc); zap_attribute_free(za); } /* * If this error exists in the given tree remove it. */ static void remove_error_from_list(spa_t *spa, avl_tree_t *t, const zbookmark_phys_t *zb) { spa_error_entry_t search, *found; avl_index_t where; mutex_enter(&spa->spa_errlist_lock); search.se_bookmark = *zb; if ((found = avl_find(t, &search, &where)) != NULL) { avl_remove(t, found); kmem_free(found, sizeof (spa_error_entry_t)); } mutex_exit(&spa->spa_errlist_lock); } /* * Removes all of the recv healed errors from both on-disk error logs */ static void spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) { char name[NAME_MAX_LEN]; spa_error_entry_t *se; void *cookie = NULL; ASSERT(MUTEX_HELD(&spa->spa_errlog_lock)); while ((se = avl_destroy_nodes(&spa->spa_errlist_healed, &cookie)) != NULL) { remove_error_from_list(spa, s, &se->se_bookmark); remove_error_from_list(spa, l, &se->se_bookmark); if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { bookmark_to_name(&se->se_bookmark, name, sizeof (name)); (void) zap_remove(spa->spa_meta_objset, spa->spa_errlog_last, name, tx); (void) zap_remove(spa->spa_meta_objset, spa->spa_errlog_scrub, name, tx); } else { errphys_to_name(&se->se_zep, name, sizeof (name)); zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_last); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { zap_remove(spa->spa_meta_objset, za->za_first_integer, name, tx); } zap_cursor_fini(&zc); for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_scrub); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { zap_remove(spa->spa_meta_objset, za->za_first_integer, name, tx); } zap_cursor_fini(&zc); zap_attribute_free(za); } kmem_free(se, sizeof (spa_error_entry_t)); } } /* * Stash away healed bookmarks to remove them from the on-disk error logs * later in spa_remove_healed_errors(). */ void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, uint64_t birth) { spa_add_healed_error(spa, spa->spa_errlog_last, zb, birth); spa_add_healed_error(spa, spa->spa_errlog_scrub, zb, birth); } static uint64_t approx_errlog_size_impl(spa_t *spa, uint64_t spa_err_obj) { if (spa_err_obj == 0) return (0); uint64_t total = 0; zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { uint64_t count; if (zap_count(spa->spa_meta_objset, za->za_first_integer, &count) == 0) total += count; } zap_cursor_fini(&zc); zap_attribute_free(za); return (total); } /* * Return the approximate number of errors currently in the error log. This * will be nonzero if there are some errors, but otherwise it may be more * or less than the number of entries returned by spa_get_errlog(). */ uint64_t spa_approx_errlog_size(spa_t *spa) { uint64_t total = 0; if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { mutex_enter(&spa->spa_errlog_lock); uint64_t count; if (spa->spa_errlog_scrub != 0 && zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, &count) == 0) total += count; if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && zap_count(spa->spa_meta_objset, spa->spa_errlog_last, &count) == 0) total += count; mutex_exit(&spa->spa_errlog_lock); } else { mutex_enter(&spa->spa_errlog_lock); total += approx_errlog_size_impl(spa, spa->spa_errlog_last); total += approx_errlog_size_impl(spa, spa->spa_errlog_scrub); mutex_exit(&spa->spa_errlog_lock); } mutex_enter(&spa->spa_errlist_lock); total += avl_numnodes(&spa->spa_errlist_last); total += avl_numnodes(&spa->spa_errlist_scrub); mutex_exit(&spa->spa_errlist_lock); return (total); } /* * This function sweeps through an on-disk error log and stores all bookmarks * as error bookmarks in a new ZAP object. At the end we discard the old one, * and spa_update_errlog() will set the spa's on-disk error log to new ZAP * object. */ static void sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, dmu_tx_t *tx) { zap_cursor_t zc; zap_attribute_t *za; zbookmark_phys_t zb; uint64_t count; *newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); /* * If we cannnot perform the upgrade we should clear the old on-disk * error logs. */ if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) { VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); return; } za = zap_attribute_alloc(); for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { if (spa_upgrade_errlog_limit != 0 && zc.zc_cd == spa_upgrade_errlog_limit) break; name_to_bookmark(za->za_name, &zb); zbookmark_err_phys_t zep; zep.zb_object = zb.zb_object; zep.zb_level = zb.zb_level; zep.zb_blkid = zb.zb_blkid; zep.zb_birth = 0; /* * In case of an error we should simply continue instead of * returning prematurely. See the next comment. */ uint64_t head_ds; dsl_pool_t *dp = spa->spa_dsl_pool; dsl_dataset_t *ds; objset_t *os; int error = dsl_dataset_hold_obj_flags(dp, zb.zb_objset, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) continue; head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; /* * The objset and the dnode are required for getting the block * pointer, which is used to determine if BP_IS_HOLE(). If * getting the objset or the dnode fails, do not create a * zap entry (presuming we know the dataset) as this may create * spurious errors that we cannot ever resolve. If an error is * truly persistent, it should re-appear after a scan. */ if (dmu_objset_from_ds(ds, &os) != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } dnode_t *dn; blkptr_t bp; if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } rw_enter(&dn->dn_struct_rwlock, RW_READER); error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp, NULL, NULL); if (error == EACCES) error = 0; else if (!error) zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); if (error != 0 || BP_IS_HOLE(&bp)) continue; uint64_t err_obj; error = zap_lookup_int_key(spa->spa_meta_objset, *newobj, head_ds, &err_obj); if (error == ENOENT) { err_obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); (void) zap_update_int_key(spa->spa_meta_objset, *newobj, head_ds, err_obj, tx); } char buf[64]; errphys_to_name(&zep, buf, sizeof (buf)); const char *name = ""; (void) zap_update(spa->spa_meta_objset, err_obj, buf, 1, strlen(name) + 1, name, tx); } zap_cursor_fini(&zc); zap_attribute_free(za); VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); } void spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx) { uint64_t newobj = 0; mutex_enter(&spa->spa_errlog_lock); if (spa->spa_errlog_last != 0) { sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx); spa->spa_errlog_last = newobj; (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, &spa->spa_errlog_last, tx); } if (spa->spa_errlog_scrub != 0) { sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx); spa->spa_errlog_scrub = newobj; (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, &spa->spa_errlog_scrub, tx); } mutex_exit(&spa->spa_errlog_lock); } #ifdef _KERNEL /* * If an error block is shared by two datasets it will be counted twice. */ static int process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) { if (obj == 0) return (0); zap_cursor_t *zc; zap_attribute_t *za; zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); za = zap_attribute_alloc(); if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (zap_cursor_init(zc, spa->spa_meta_objset, obj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { if (*count == 0) { zap_cursor_fini(zc); kmem_free(zc, sizeof (*zc)); zap_attribute_free(za); return (SET_ERROR(ENOMEM)); } zbookmark_phys_t zb; name_to_bookmark(za->za_name, &zb); int error = copyout_entry(&zb, uaddr, count); if (error != 0) { zap_cursor_fini(zc); kmem_free(zc, sizeof (*zc)); zap_attribute_free(za); return (error); } } zap_cursor_fini(zc); kmem_free(zc, sizeof (*zc)); zap_attribute_free(za); return (0); } for (zap_cursor_init(zc, spa->spa_meta_objset, obj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { zap_cursor_t *head_ds_cursor; zap_attribute_t *head_ds_attr; head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); head_ds_attr = zap_attribute_alloc(); uint64_t head_ds_err_obj = za->za_first_integer; uint64_t head_ds; name_to_object(za->za_name, &head_ds); for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { zbookmark_err_phys_t head_ds_block; name_to_errphys(head_ds_attr->za_name, &head_ds_block); int error = process_error_block(spa, head_ds, &head_ds_block, uaddr, count); if (error != 0) { zap_cursor_fini(head_ds_cursor); kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); zap_attribute_free(head_ds_attr); zap_cursor_fini(zc); zap_attribute_free(za); kmem_free(zc, sizeof (*zc)); return (error); } } zap_cursor_fini(head_ds_cursor); kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); zap_attribute_free(head_ds_attr); } zap_cursor_fini(zc); zap_attribute_free(za); kmem_free(zc, sizeof (*zc)); return (0); } static int process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count) { spa_error_entry_t *se; if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { int error = copyout_entry(&se->se_bookmark, uaddr, count); if (error != 0) { return (error); } } return (0); } for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { uint64_t head_ds = 0; int error = get_head_ds(spa, se->se_bookmark.zb_objset, &head_ds); /* * If get_head_ds() errors out, set the head filesystem * to the filesystem stored in the bookmark of the * error block. */ if (error != 0) head_ds = se->se_bookmark.zb_objset; error = process_error_block(spa, head_ds, &se->se_zep, uaddr, count); if (error != 0) return (error); } return (0); } #endif /* * Copy all known errors to userland as an array of bookmarks. This is * actually a union of the on-disk last log and current log, as well as any * pending error requests. * * Because the act of reading the on-disk log could cause errors to be * generated, we have two separate locks: one for the error log and one for the * in-core error lists. We only need the error list lock to log and error, so * we grab the error log lock while we read the on-disk logs, and only pick up * the error list lock when we are finished. */ int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) { int ret = 0; #ifdef _KERNEL /* * The pool config lock is needed to hold a dataset_t via (among other * places) process_error_list() -> process_error_block()-> * find_top_affected_fs(), and lock ordering requires that we get it * before the spa_errlog_lock. */ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); mutex_enter(&spa->spa_errlog_lock); ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); if (!ret && !spa->spa_scrub_finished) ret = process_error_log(spa, spa->spa_errlog_last, uaddr, count); mutex_enter(&spa->spa_errlist_lock); if (!ret) ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr, count); if (!ret) ret = process_error_list(spa, &spa->spa_errlist_last, uaddr, count); mutex_exit(&spa->spa_errlist_lock); mutex_exit(&spa->spa_errlog_lock); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); #else (void) spa, (void) uaddr, (void) count; #endif return (ret); } /* * Called when a scrub completes. This simply set a bit which tells which AVL * tree to add new errors. spa_errlog_sync() is responsible for actually * syncing the changes to the underlying objects. */ void spa_errlog_rotate(spa_t *spa) { mutex_enter(&spa->spa_errlist_lock); spa->spa_scrub_finished = B_TRUE; mutex_exit(&spa->spa_errlist_lock); } /* * Discard any pending errors from the spa_t. Called when unloading a faulted * pool, as the errors encountered during the open cannot be synced to disk. */ void spa_errlog_drain(spa_t *spa) { spa_error_entry_t *se; void *cookie; mutex_enter(&spa->spa_errlist_lock); cookie = NULL; while ((se = avl_destroy_nodes(&spa->spa_errlist_last, &cookie)) != NULL) kmem_free(se, sizeof (spa_error_entry_t)); cookie = NULL; while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, &cookie)) != NULL) kmem_free(se, sizeof (spa_error_entry_t)); mutex_exit(&spa->spa_errlist_lock); } /* * Process a list of errors into the current on-disk log. */ void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) { spa_error_entry_t *se; char buf[NAME_MAX_LEN]; void *cookie; if (avl_numnodes(t) == 0) return; /* create log if necessary */ if (*obj == 0) *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); /* add errors to the current log */ if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); const char *name = se->se_name ? se->se_name : ""; (void) zap_update(spa->spa_meta_objset, *obj, buf, 1, strlen(name) + 1, name, tx); } } else { for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { zbookmark_err_phys_t zep; zep.zb_object = se->se_zep.zb_object; zep.zb_level = se->se_zep.zb_level; zep.zb_blkid = se->se_zep.zb_blkid; zep.zb_birth = se->se_zep.zb_birth; uint64_t head_ds = 0; int error = get_head_ds(spa, se->se_bookmark.zb_objset, &head_ds); /* * If get_head_ds() errors out, set the head filesystem * to the filesystem stored in the bookmark of the * error block. */ if (error != 0) head_ds = se->se_bookmark.zb_objset; uint64_t err_obj; error = zap_lookup_int_key(spa->spa_meta_objset, *obj, head_ds, &err_obj); if (error == ENOENT) { err_obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); (void) zap_update_int_key(spa->spa_meta_objset, *obj, head_ds, err_obj, tx); } errphys_to_name(&zep, buf, sizeof (buf)); const char *name = se->se_name ? se->se_name : ""; (void) zap_update(spa->spa_meta_objset, err_obj, buf, 1, strlen(name) + 1, name, tx); } } /* purge the error list */ cookie = NULL; while ((se = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(se, sizeof (spa_error_entry_t)); } static void delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx) { if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { VERIFY0(dmu_object_free(spa->spa_meta_objset, za->za_first_integer, tx)); } zap_cursor_fini(&zc); zap_attribute_free(za); } VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); } /* * Sync the error log out to disk. This is a little tricky because the act of * writing the error log requires the spa_errlist_lock. So, we need to lock the * error lists, take a copy of the lists, and then reinitialize them. Then, we * drop the error list lock and take the error log lock, at which point we * do the errlog processing. Then, if we encounter an I/O error during this * process, we can successfully add the error to the list. Note that this will * result in the perpetual recycling of errors, but it is an unlikely situation * and not a performance critical operation. */ void spa_errlog_sync(spa_t *spa, uint64_t txg) { dmu_tx_t *tx; avl_tree_t scrub, last; int scrub_finished; mutex_enter(&spa->spa_errlist_lock); /* * Bail out early under normal circumstances. */ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && avl_numnodes(&spa->spa_errlist_last) == 0 && avl_numnodes(&spa->spa_errlist_healed) == 0 && !spa->spa_scrub_finished) { mutex_exit(&spa->spa_errlist_lock); return; } spa_get_errlists(spa, &last, &scrub); scrub_finished = spa->spa_scrub_finished; spa->spa_scrub_finished = B_FALSE; mutex_exit(&spa->spa_errlist_lock); /* * The pool config lock is needed to hold a dataset_t via * sync_error_list() -> get_head_ds(), and lock ordering * requires that we get it before the spa_errlog_lock. */ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); mutex_enter(&spa->spa_errlog_lock); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); /* * Remove healed errors from errors. */ spa_remove_healed_errors(spa, &last, &scrub, tx); /* * Sync out the current list of errors. */ sync_error_list(spa, &last, &spa->spa_errlog_last, tx); /* * Rotate the log if necessary. */ if (scrub_finished) { if (spa->spa_errlog_last != 0) delete_errlog(spa, spa->spa_errlog_last, tx); spa->spa_errlog_last = spa->spa_errlog_scrub; spa->spa_errlog_scrub = 0; sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); } /* * Sync out any pending scrub errors. */ sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); /* * Update the MOS to reflect the new values. */ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, &spa->spa_errlog_last, tx); (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, &spa->spa_errlog_scrub, tx); dmu_tx_commit(tx); mutex_exit(&spa->spa_errlog_lock); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); } static void delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds, dmu_tx_t *tx) { if (spa_err_obj == 0) return; zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { uint64_t head_ds; name_to_object(za->za_name, &head_ds); if (head_ds == ds) { (void) zap_remove(spa->spa_meta_objset, spa_err_obj, za->za_name, tx); VERIFY0(dmu_object_free(spa->spa_meta_objset, za->za_first_integer, tx)); break; } } zap_cursor_fini(&zc); zap_attribute_free(za); } void spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx) { mutex_enter(&spa->spa_errlog_lock); delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx); delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx); mutex_exit(&spa->spa_errlog_lock); } static int find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, uint64_t *txg) { dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; int error = dsl_dataset_hold_obj_flags(dp, old_head, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; while (prev_obj != 0) { dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); if ((error = dsl_dataset_hold_obj_flags(dp, prev_obj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) == 0 && dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head) break; if (error != 0) return (error); prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; } dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); ASSERT(prev_obj != 0); *txg = prev_obj_txg; return (0); } static void swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t old_head, dmu_tx_t *tx) { if (spa_err_obj == 0) return; uint64_t old_head_errlog; int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, old_head, &old_head_errlog); /* If no error log, then there is nothing to do. */ if (error != 0) return; uint64_t txg; error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg); if (error != 0) return; /* * Create an error log if the file system being promoted does not * already have one. */ uint64_t new_head_errlog; error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head, &new_head_errlog); if (error != 0) { new_head_errlog = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj, new_head, new_head_errlog, tx); } zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); zbookmark_err_phys_t err_block; for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { const char *name = ""; name_to_errphys(za->za_name, &err_block); if (err_block.zb_birth < txg) { (void) zap_update(spa->spa_meta_objset, new_head_errlog, za->za_name, 1, strlen(name) + 1, name, tx); (void) zap_remove(spa->spa_meta_objset, old_head_errlog, za->za_name, tx); } } zap_cursor_fini(&zc); zap_attribute_free(za); } void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, dmu_tx_t *tx) { mutex_enter(&spa->spa_errlog_lock); swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx); swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx); mutex_exit(&spa->spa_errlog_lock); } #if defined(_KERNEL) /* error handling */ EXPORT_SYMBOL(spa_log_error); EXPORT_SYMBOL(spa_approx_errlog_size); EXPORT_SYMBOL(spa_get_last_errlog_size); EXPORT_SYMBOL(spa_get_errlog); EXPORT_SYMBOL(spa_errlog_rotate); EXPORT_SYMBOL(spa_errlog_drain); EXPORT_SYMBOL(spa_errlog_sync); EXPORT_SYMBOL(spa_get_errlists); EXPORT_SYMBOL(spa_delete_dataset_errlog); EXPORT_SYMBOL(spa_swap_errlog); EXPORT_SYMBOL(sync_error_list); EXPORT_SYMBOL(spa_upgrade_errlog); EXPORT_SYMBOL(find_top_affected_fs); EXPORT_SYMBOL(find_birth_txg); EXPORT_SYMBOL(zep_to_zb); EXPORT_SYMBOL(name_to_errphys); #endif -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, UINT, ZMOD_RW, "Limit the number of errors which will be upgraded to the new " "on-disk error log when enabling head_errlog"); -/* END CSTYLED */ diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index f55218e3579b..a95152608578 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1,1408 +1,1406 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2018, 2019 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include /* * Log Space Maps * * Log space maps are an optimization in ZFS metadata allocations for pools * whose workloads are primarily random-writes. Random-write workloads are also * typically random-free, meaning that they are freeing from locations scattered * throughout the pool. This means that each TXG we will have to append some * FREE records to almost every metaslab. With log space maps, we hold their * changes in memory and log them altogether in one pool-wide space map on-disk * for persistence. As more blocks are accumulated in the log space maps and * more unflushed changes are accounted in memory, we flush a selected group * of metaslabs every TXG to relieve memory pressure and potential overheads * when loading the pool. Flushing a metaslab to disk relieves memory as we * flush any unflushed changes from memory to disk (i.e. the metaslab's space * map) and saves import time by making old log space maps obsolete and * eventually destroying them. [A log space map is said to be obsolete when all * its entries have made it to their corresponding metaslab space maps]. * * == On disk data structures used == * * - The pool has a new feature flag and a new entry in the MOS. The feature * is activated when we create the first log space map and remains active * for the lifetime of the pool. The new entry in the MOS Directory [refer * to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value * pairs are of the form . * This entry is our on-disk reference of the log space maps that exist in * the pool for each TXG and it is used during import to load all the * metaslab unflushed changes in memory. To see how this structure is first * created and later populated refer to spa_generate_syncing_log_sm(). To see * how it is used during import time refer to spa_ld_log_sm_metadata(). * * - Each vdev has a new entry in its vdev_top_zap (see field * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of * each metaslab in this vdev. This field is the on-disk counterpart of the * in-memory field ms_unflushed_txg which tells us from which TXG and onwards * the metaslab haven't had its changes flushed. During import, we use this * to ignore any entries in the space map log that are for this metaslab but * from a TXG before msp_unflushed_txg. At that point, we also populate its * in-memory counterpart and from there both fields are updated every time * we flush that metaslab. * * - A space map is created every TXG and, during that TXG, it is used to log * all incoming changes (the log space map). When created, the log space map * is referenced in memory by spa_syncing_log_sm and its object ID is inserted * to the space map ZAP mentioned above. The log space map is closed at the * end of the TXG and will be destroyed when it becomes fully obsolete. We * know when a log space map has become obsolete by looking at the oldest * (and smallest) ms_unflushed_txg in the pool. If the value of that is bigger * than the log space map's TXG, then it means that there is no metaslab who * doesn't have the changes from that log and we can therefore destroy it. * [see spa_cleanup_old_sm_logs()]. * * == Important in-memory structures == * * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in * the pool by their ms_unflushed_txg field. It is primarily used for three * reasons. First of all, it is used during flushing where we try to flush * metaslabs in-order from the oldest-flushed to the most recently flushed * every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the * oldest flushed metaslab to distinguish which log space maps have become * obsolete and which ones are still relevant. Finally it tells us which * metaslabs have unflushed changes in a pool where this feature was just * enabled, as we don't immediately add all of the pool's metaslabs but we * add them over time as they go through metaslab_sync(). The reason that * we do that is to ease these pools into the behavior of the flushing * algorithm (described later on). * * - The per-spa field spa_sm_logs_by_txg can be thought as the in-memory * counterpart of the space map ZAP mentioned above. It's an AVL tree whose * nodes represent the log space maps in the pool. This in-memory * representation of log space maps in the pool sorts the log space maps by * the TXG that they were created (which is also the TXG of their unflushed * changes). It also contains the following extra information for each * space map: * [1] The number of metaslabs that were last flushed on that TXG. This is * important because if that counter is zero and this is the oldest * log then it means that it is also obsolete. * [2] The number of blocks of that space map. This field is used by the * block heuristic of our flushing algorithm (described later on). * It represents how many blocks of metadata changes ZFS had to write * to disk for that TXG. * * - The per-spa field spa_log_summary is a list of entries that summarizes * the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg * AVL tree mentioned above. The reason this exists is that our flushing * algorithm (described later) tries to estimate how many metaslabs to flush * in each TXG by iterating over all the log space maps and looking at their * block counts. Summarizing that information means that don't have to * iterate through each space map, minimizing the runtime overhead of the * flushing algorithm which would be induced in syncing context. In terms of * implementation the log summary is used as a queue: * * we modify or pop entries from its head when we flush metaslabs * * we modify or append entries to its tail when we sync changes. * * - Each metaslab has two new range trees that hold its unflushed changes, * ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint. * * == Flushing algorithm == * * The decision of how many metaslabs to flush on a give TXG is guided by * two heuristics: * * [1] The memory heuristic - * We keep track of the memory used by the unflushed trees from all the * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it * stays below a certain threshold which is determined by an arbitrary hard * limit and an arbitrary percentage of the system's memory [see * spa_log_exceeds_memlimit()]. When we see that the memory usage of the * unflushed changes are passing that threshold, we flush metaslabs, which * empties their unflushed range trees, reducing the memory used. * * [2] The block heuristic - * We try to keep the total number of blocks in the log space maps in check * so the log doesn't grow indefinitely and we don't induce a lot of overhead * when loading the pool. At the same time we don't want to flush a lot of * metaslabs too often as this would defeat the purpose of the log space map. * As a result we set a limit in the amount of blocks that we think it's * acceptable for the log space maps to have and try not to cross it. * [see sus_blocklimit from spa_unflushed_stats]. * * In order to stay below the block limit every TXG we have to estimate how * many metaslabs we need to flush based on the current rate of incoming blocks * and our history of log space map blocks. The main idea here is to answer * the question of how many metaslabs do we need to flush in order to get rid * at least an X amount of log space map blocks. We can answer this question * by iterating backwards from the oldest log space map to the newest one * and looking at their metaslab and block counts. At this point the log summary * mentioned above comes handy as it reduces the amount of things that we have * to iterate (even though it may reduce the preciseness of our estimates due * to its aggregation of data). So with that in mind, we project the incoming * rate of the current TXG into the future and attempt to approximate how many * metaslabs would we need to flush from now in order to avoid exceeding our * block limit in different points in the future (granted that we would keep * flushing the same number of metaslabs for every TXG). Then we take the * maximum number from all these estimates to be on the safe side. For the * exact implementation details of algorithm refer to * spa_estimate_metaslabs_to_flush. */ /* * This is used as the block size for the space maps used for the * log space map feature. These space maps benefit from a bigger * block size as we expect to be writing a lot of data to them at * once. */ static const unsigned long zfs_log_sm_blksz = 1ULL << 17; /* * Percentage of the overall system's memory that ZFS allows to be * used for unflushed changes (e.g. the sum of size of all the nodes * in the unflushed trees). * * Note that this value is calculated over 1000000 for finer granularity * (thus the _ppm suffix; reads as "parts per million"). As an example, * the default of 1000 allows 0.1% of memory to be used. */ static uint64_t zfs_unflushed_max_mem_ppm = 1000; /* * Specific hard-limit in memory that ZFS allows to be used for * unflushed changes. */ static uint64_t zfs_unflushed_max_mem_amt = 1ULL << 30; /* * The following tunable determines the number of blocks that can be used for * the log space maps. It is expressed as a percentage of the total number of * metaslabs in the pool (i.e. the default of 400 means that the number of log * blocks is capped at 4 times the number of metaslabs). * * This value exists to tune our flushing algorithm, with higher values * flushing metaslabs less often (doing less I/Os) per TXG versus lower values * flushing metaslabs more aggressively with the upside of saving overheads * when loading the pool. Another factor in this tradeoff is that flushing * less often can potentially lead to better utilization of the metaslab space * map's block size as we accumulate more changes per flush. * * Given that this tunable indirectly controls the flush rate (metaslabs * flushed per txg) and that's why making it a percentage in terms of the * number of metaslabs in the pool makes sense here. * * As a rule of thumb we default this tunable to 400% based on the following: * * 1] Assuming a constant flush rate and a constant incoming rate of log blocks * it is reasonable to expect that the amount of obsolete entries changes * linearly from txg to txg (e.g. the oldest log should have the most * obsolete entries, and the most recent one the least). With this we could * say that, at any given time, about half of the entries in the whole space * map log are obsolete. Thus for every two entries for a metaslab in the * log space map, only one of them is valid and actually makes it to the * metaslab's space map. * [factor of 2] * 2] Each entry in the log space map is guaranteed to be two words while * entries in metaslab space maps are generally single-word. * [an extra factor of 2 - 400% overall] * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into * account any consolidation of segments from the log space map to the * unflushed range trees nor their history (e.g. a segment being allocated, * then freed, then allocated again means 3 log space map entries but 0 * metaslab space map entries). Depending on the workload, we've seen ~1.8 * non-obsolete log space map entries per metaslab entry, for a total of * ~600%. Since most of these estimates though are workload dependent, we * default on 400% to be conservative. * * Thus we could say that even in the worst * case of [1] and [2], the factor should end up being 4. * * That said, regardless of the number of metaslabs in the pool we need to * provide upper and lower bounds for the log block limit. * [see zfs_unflushed_log_block_{min,max}] */ static uint_t zfs_unflushed_log_block_pct = 400; /* * If the number of metaslabs is small and our incoming rate is high, we could * get into a situation that we are flushing all our metaslabs every TXG. Thus * we always allow at least this many log blocks. */ static uint64_t zfs_unflushed_log_block_min = 1000; /* * If the log becomes too big, the import time of the pool can take a hit in * terms of performance. Thus we have a hard limit in the size of the log in * terms of blocks. */ static uint64_t zfs_unflushed_log_block_max = (1ULL << 17); /* * Also we have a hard limit in the size of the log in terms of dirty TXGs. */ static uint64_t zfs_unflushed_log_txg_max = 1000; /* * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and * stability of the flushing algorithm (longer summary) vs its runtime overhead * (smaller summary is faster to traverse). */ static uint64_t zfs_max_logsm_summary_length = 10; /* * Tunable that sets the lower bound on the metaslabs to flush every TXG. * * Setting this to 0 has no effect since if the pool is idle we won't even be * creating log space maps and therefore we won't be flushing. On the other * hand if the pool has any incoming workload our block heuristic will start * flushing metaslabs anyway. * * The point of this tunable is to be used in extreme cases where we really * want to flush more metaslabs than our adaptable heuristic plans to flush. */ static uint64_t zfs_min_metaslabs_to_flush = 1; /* * Tunable that specifies how far in the past do we want to look when trying to * estimate the incoming log blocks for the current TXG. * * Setting this too high may not only increase runtime but also minimize the * effect of the incoming rates from the most recent TXGs as we take the * average over all the blocks that we walk * [see spa_estimate_incoming_log_blocks]. */ static uint64_t zfs_max_log_walking = 5; /* * This tunable exists solely for testing purposes. It ensures that the log * spacemaps are not flushed and destroyed during export in order for the * relevant log spacemap import code paths to be tested (effectively simulating * a crash). */ int zfs_keep_log_spacemaps_at_export = 0; static uint64_t spa_estimate_incoming_log_blocks(spa_t *spa) { ASSERT3U(spa_sync_pass(spa), ==, 1); uint64_t steps = 0, sum = 0; for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg); sls != NULL && steps < zfs_max_log_walking; sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) { if (sls->sls_txg == spa_syncing_txg(spa)) { /* * skip the log created in this TXG as this would * make our estimations inaccurate. */ continue; } sum += sls->sls_nblocks; steps++; } return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0); } uint64_t spa_log_sm_blocklimit(spa_t *spa) { return (spa->spa_unflushed_stats.sus_blocklimit); } void spa_log_sm_set_blocklimit(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { ASSERT0(spa_log_sm_blocklimit(spa)); return; } uint64_t msdcount = 0; for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); e; e = list_next(&spa->spa_log_summary, e)) msdcount += e->lse_msdcount; uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100; spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit, zfs_unflushed_log_block_min), zfs_unflushed_log_block_max); } uint64_t spa_log_sm_nblocks(spa_t *spa) { return (spa->spa_unflushed_stats.sus_nblocks); } /* * Ensure that the in-memory log space map structures and the summary * have the same block and metaslab counts. */ static void spa_log_summary_verify_counts(spa_t *spa) { ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0) return; uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed); uint64_t ms_in_summary = 0, blk_in_summary = 0; for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); e; e = list_next(&spa->spa_log_summary, e)) { ms_in_summary += e->lse_mscount; blk_in_summary += e->lse_blkcount; } uint64_t ms_in_logs = 0, blk_in_logs = 0; for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { ms_in_logs += sls->sls_mscount; blk_in_logs += sls->sls_nblocks; } VERIFY3U(ms_in_logs, ==, ms_in_summary); VERIFY3U(ms_in_logs, ==, ms_in_avl); VERIFY3U(blk_in_logs, ==, blk_in_summary); VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa)); } static boolean_t summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg) { if (e->lse_end == txg) return (0); if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max, zfs_max_logsm_summary_length)) return (1); uint64_t blocks_per_row = MAX(1, DIV_ROUND_UP(spa_log_sm_blocklimit(spa), zfs_max_logsm_summary_length)); return (blocks_per_row <= e->lse_blkcount); } /* * Update the log summary information to reflect the fact that a metaslab * was flushed or destroyed (e.g due to device removal or pool export/destroy). * * We typically flush the oldest flushed metaslab so the first (and oldest) * entry of the summary is updated. However if that metaslab is getting loaded * we may flush the second oldest one which may be part of an entry later in * the summary. Moreover, if we call into this function from metaslab_fini() * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask * for a txg as an argument so we can locate the appropriate summary entry for * the metaslab. */ void spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty) { /* * We don't track summary data for read-only pools and this function * can be called from metaslab_fini(). In that case return immediately. */ if (!spa_writeable(spa)) return; log_summary_entry_t *target = NULL; for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); e != NULL; e = list_next(&spa->spa_log_summary, e)) { if (e->lse_start > txg) break; target = e; } if (target == NULL || target->lse_mscount == 0) { /* * We didn't find a summary entry for this metaslab. We must be * at the teardown of a spa_load() attempt that got an error * while reading the log space maps. */ VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR); return; } target->lse_mscount--; if (dirty) target->lse_msdcount--; } /* * Update the log summary information to reflect the fact that we destroyed * old log space maps. Since we can only destroy the oldest log space maps, * we decrement the block count of the oldest summary entry and potentially * destroy it when that count hits 0. * * This function is called after a metaslab is flushed and typically that * metaslab is the oldest flushed, which means that this function will * typically decrement the block count of the first entry of the summary and * potentially free it if the block count gets to zero (its metaslab count * should be zero too at that point). * * There are certain scenarios though that don't work exactly like that so we * need to account for them: * * Scenario [1]: It is possible that after we flushed the oldest flushed * metaslab and we destroyed the oldest log space map, more recent logs had 0 * metaslabs pointing to them so we got rid of them too. This can happen due * to metaslabs being destroyed through device removal, or because the oldest * flushed metaslab was loading but we kept flushing more recently flushed * metaslabs due to the memory pressure of unflushed changes. Because of that, * we always iterate from the beginning of the summary and if blocks_gone is * bigger than the block_count of the current entry we free that entry (we * expect its metaslab count to be zero), we decrement blocks_gone and on to * the next entry repeating this procedure until blocks_gone gets decremented * to 0. Doing this also works for the typical case mentioned above. * * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by * the first (and oldest) entry in the summary. If the first few entries of * the summary were only accounting metaslabs from a device that was just * removed, then the current oldest flushed metaslab could be accounted by an * entry somewhere in the middle of the summary. Moreover flushing that * metaslab will destroy all the log space maps older than its ms_unflushed_txg * because they became obsolete after the removal. Thus, iterating as we did * for scenario [1] works out for this case too. * * Scenario [3]: At times we decide to flush all the metaslabs in the pool * in one TXG (either because we are exporting the pool or because our flushing * heuristics decided to do so). When that happens all the log space maps get * destroyed except the one created for the current TXG which doesn't have * any log blocks yet. As log space maps get destroyed with every metaslab that * we flush, entries in the summary are also destroyed. This brings a weird * corner-case when we flush the last metaslab and the log space map of the * current TXG is in the same summary entry with other log space maps that * are older. When that happens we are eventually left with this one last * summary entry whose blocks are gone (blocks_gone equals the entry's block * count) but its metaslab count is non-zero (because it accounts all the * metaslabs in the pool as they all got flushed). Under this scenario we can't * free this last summary entry as it's referencing all the metaslabs in the * pool and its block count will get incremented at the end of this sync (when * we close the syncing log space map). Thus we just decrement its current * block count and leave it alone. In the case that the pool gets exported, * its metaslab count will be decremented over time as we call metaslab_fini() * for all the metaslabs in the pool and the entry will be freed at * spa_unload_log_sm_metadata(). */ void spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone) { log_summary_entry_t *e = list_head(&spa->spa_log_summary); ASSERT3P(e, !=, NULL); if (e->lse_txgcount > 0) e->lse_txgcount--; for (; e != NULL; e = list_head(&spa->spa_log_summary)) { if (e->lse_blkcount > blocks_gone) { e->lse_blkcount -= blocks_gone; blocks_gone = 0; break; } else if (e->lse_mscount == 0) { /* remove obsolete entry */ blocks_gone -= e->lse_blkcount; list_remove(&spa->spa_log_summary, e); kmem_free(e, sizeof (log_summary_entry_t)); } else { /* Verify that this is scenario [3] mentioned above. */ VERIFY3U(blocks_gone, ==, e->lse_blkcount); /* * Assert that this is scenario [3] further by ensuring * that this is the only entry in the summary. */ VERIFY3P(e, ==, list_tail(&spa->spa_log_summary)); ASSERT3P(e, ==, list_head(&spa->spa_log_summary)); blocks_gone = e->lse_blkcount = 0; break; } } /* * Ensure that there is no way we are trying to remove more blocks * than the # of blocks in the summary. */ ASSERT0(blocks_gone); } void spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg) { spa_log_sm_t target = { .sls_txg = txg }; spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg, &target, NULL); if (sls == NULL) { /* * We must be at the teardown of a spa_load() attempt that * got an error while reading the log space maps. */ VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR); return; } ASSERT(sls->sls_mscount > 0); sls->sls_mscount--; } void spa_log_sm_increment_current_mscount(spa_t *spa) { spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg); ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa)); last_sls->sls_mscount++; } static void summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed, uint64_t metaslabs_dirty, uint64_t nblocks) { log_summary_entry_t *e = list_tail(&spa->spa_log_summary); if (e == NULL || summary_entry_is_full(spa, e, txg)) { e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP); e->lse_start = e->lse_end = txg; e->lse_txgcount = 1; list_insert_tail(&spa->spa_log_summary, e); } ASSERT3U(e->lse_start, <=, txg); if (e->lse_end < txg) { e->lse_end = txg; e->lse_txgcount++; } e->lse_mscount += metaslabs_flushed; e->lse_msdcount += metaslabs_dirty; e->lse_blkcount += nblocks; } static void spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks) { summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks); } void spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty) { summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 1 : 0, 0); } void spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg) { log_summary_entry_t *target = NULL; for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); e != NULL; e = list_next(&spa->spa_log_summary, e)) { if (e->lse_start > txg) break; target = e; } ASSERT3P(target, !=, NULL); ASSERT3U(target->lse_mscount, !=, 0); target->lse_msdcount++; } /* * This function attempts to estimate how many metaslabs should * we flush to satisfy our block heuristic for the log spacemap * for the upcoming TXGs. * * Specifically, it first tries to estimate the number of incoming * blocks in this TXG. Then by projecting that incoming rate to * future TXGs and using the log summary, it figures out how many * flushes we would need to do for future TXGs individually to * stay below our block limit and returns the maximum number of * flushes from those estimates. */ static uint64_t spa_estimate_metaslabs_to_flush(spa_t *spa) { ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(spa_log_sm_blocklimit(spa) != 0); /* * This variable contains the incoming rate that will be projected * and used for our flushing estimates in the future. */ uint64_t incoming = spa_estimate_incoming_log_blocks(spa); /* * At any point in time this variable tells us how many * TXGs in the future we are so we can make our estimations. */ uint64_t txgs_in_future = 1; /* * This variable tells us how much room do we have until we hit * our limit. When it goes negative, it means that we've exceeded * our limit and we need to flush. * * Note that since we start at the first TXG in the future (i.e. * txgs_in_future starts from 1) we already decrement this * variable by the incoming rate. */ int64_t available_blocks = spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming; int64_t available_txgs = zfs_unflushed_log_txg_max; for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); e; e = list_next(&spa->spa_log_summary, e)) available_txgs -= e->lse_txgcount; /* * This variable tells us the total number of flushes needed to * keep the log size within the limit when we reach txgs_in_future. */ uint64_t total_flushes = 0; /* Holds the current maximum of our estimates so far. */ uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush; /* * For our estimations we only look as far in the future * as the summary allows us. */ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); e; e = list_next(&spa->spa_log_summary, e)) { /* * If there is still room before we exceed our limit * then keep skipping TXGs accumulating more blocks * based on the incoming rate until we exceed it. */ if (available_blocks >= 0 && available_txgs >= 0) { uint64_t skip_txgs = (incoming == 0) ? available_txgs + 1 : MIN(available_txgs + 1, (available_blocks / incoming) + 1); available_blocks -= (skip_txgs * incoming); available_txgs -= skip_txgs; txgs_in_future += skip_txgs; ASSERT3S(available_blocks, >=, -incoming); ASSERT3S(available_txgs, >=, -1); } /* * At this point we're far enough into the future where * the limit was just exceeded and we flush metaslabs * based on the current entry in the summary, updating * our available_blocks. */ ASSERT(available_blocks < 0 || available_txgs < 0); available_blocks += e->lse_blkcount; available_txgs += e->lse_txgcount; total_flushes += e->lse_msdcount; /* * Keep the running maximum of the total_flushes that * we've done so far over the number of TXGs in the * future that we are. The idea here is to estimate * the average number of flushes that we should do * every TXG so that when we are that many TXGs in the * future we stay under the limit. */ max_flushes_pertxg = MAX(max_flushes_pertxg, DIV_ROUND_UP(total_flushes, txgs_in_future)); } return (max_flushes_pertxg); } uint64_t spa_log_sm_memused(spa_t *spa) { return (spa->spa_unflushed_stats.sus_memused); } static boolean_t spa_log_exceeds_memlimit(spa_t *spa) { if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt) return (B_TRUE); uint64_t system_mem_allowed = ((physmem * PAGESIZE) * zfs_unflushed_max_mem_ppm) / 1000000; if (spa_log_sm_memused(spa) > system_mem_allowed) return (B_TRUE); return (B_FALSE); } boolean_t spa_flush_all_logs_requested(spa_t *spa) { return (spa->spa_log_flushall_txg != 0); } void spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) { uint64_t txg = dmu_tx_get_txg(tx); if (spa_sync_pass(spa) != 1) return; if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; /* * If we don't have any metaslabs with unflushed changes * return immediately. */ if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0) return; /* * During SPA export we leave a few empty TXGs to go by [see * spa_final_dirty_txg() to understand why]. For this specific * case, it is important to not flush any metaslabs as that * would dirty this TXG. * * That said, during one of these dirty TXGs that is less or * equal to spa_final_dirty(), spa_unload() will request that * we try to flush all the metaslabs for that TXG before * exporting the pool, thus we ensure that we didn't get a * request of flushing everything before we attempt to return * immediately. */ if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && !dmu_objset_is_dirty(spa_meta_objset(spa), txg) && !spa_flush_all_logs_requested(spa)) return; /* * We need to generate a log space map before flushing because this * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg) * for this TXG's flushed metaslab count (aka sls_mscount which is * manipulated in many ways down the metaslab_flush() codepath). * * That is not to say that we may generate a log space map when we * don't need it. If we are flushing metaslabs, that means that we * were going to write changes to disk anyway, so even if we were * not flushing, a log space map would have been created anyway in * metaslab_sync(). */ spa_generate_syncing_log_sm(spa, tx); /* * This variable tells us how many metaslabs we want to flush based * on the block-heuristic of our flushing algorithm (see block comment * of log space map feature). We also decrement this as we flush * metaslabs and attempt to destroy old log space maps. */ uint64_t want_to_flush; if (spa_flush_all_logs_requested(spa)) { ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); want_to_flush = UINT64_MAX; } else { want_to_flush = spa_estimate_metaslabs_to_flush(spa); } /* Used purely for verification purposes */ uint64_t visited = 0; /* * Ideally we would only iterate through spa_metaslabs_by_flushed * using only one variable (curr). We can't do that because * metaslab_flush() mutates position of curr in the AVL when * it flushes that metaslab by moving it to the end of the tree. * Thus we always keep track of the original next node of the * current node (curr) in another variable (next). */ metaslab_t *next = NULL; for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed); curr != NULL; curr = next) { next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr); /* * If this metaslab has been flushed this txg then we've done * a full circle over the metaslabs. */ if (metaslab_unflushed_txg(curr) == txg) break; /* * If we are done flushing for the block heuristic and the * unflushed changes don't exceed the memory limit just stop. */ if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa)) break; if (metaslab_unflushed_dirty(curr)) { mutex_enter(&curr->ms_sync_lock); mutex_enter(&curr->ms_lock); metaslab_flush(curr, tx); mutex_exit(&curr->ms_lock); mutex_exit(&curr->ms_sync_lock); if (want_to_flush > 0) want_to_flush--; } else metaslab_unflushed_bump(curr, tx, B_FALSE); visited++; } ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited); spa_log_sm_set_blocklimit(spa); } /* * Close the log space map for this TXG and update the block counts * for the log's in-memory structure and the summary. */ void spa_sync_close_syncing_log_sm(spa_t *spa) { if (spa_syncing_log_sm(spa) == NULL) return; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg); ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa)); sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa)); spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; /* * Note that we can't assert that sls_mscount is not 0, * because there is the case where the first metaslab * in spa_metaslabs_by_flushed is loading and we were * not able to flush any metaslabs the current TXG. */ ASSERT(sls->sls_nblocks != 0); spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks); spa_log_summary_verify_counts(spa); space_map_close(spa->spa_syncing_log_sm); spa->spa_syncing_log_sm = NULL; /* * At this point we tried to flush as many metaslabs as we * can as the pool is getting exported. Reset the "flush all" * so the last few TXGs before closing the pool can be empty * (e.g. not dirty). */ if (spa_flush_all_logs_requested(spa)) { ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); spa->spa_log_flushall_txg = 0; } } void spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx) { objset_t *mos = spa_meta_objset(spa); uint64_t spacemap_zap; int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); if (error == ENOENT) { ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); return; } VERIFY0(error); metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed); uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest); /* Free all log space maps older than the oldest_flushed_txg. */ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls && sls->sls_txg < oldest_flushed_txg; sls = avl_first(&spa->spa_sm_logs_by_txg)) { ASSERT0(sls->sls_mscount); avl_remove(&spa->spa_sm_logs_by_txg, sls); space_map_free_obj(mos, sls->sls_sm_obj, tx); VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx)); spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks); spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks; kmem_free(sls, sizeof (spa_log_sm_t)); } } static spa_log_sm_t * spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg) { spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP); sls->sls_sm_obj = sm_obj; sls->sls_txg = txg; return (sls); } void spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx) { uint64_t txg = dmu_tx_get_txg(tx); objset_t *mos = spa_meta_objset(spa); if (spa_syncing_log_sm(spa) != NULL) return; if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)) return; uint64_t spacemap_zap; int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); if (error == ENOENT) { ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); error = 0; spacemap_zap = zap_create(mos, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap, tx)); spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx); } VERIFY0(error); uint64_t sm_obj; ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj), ==, ENOENT); sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx); VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx)); avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg)); /* * We pass UINT64_MAX as the space map's representation size * and SPA_MINBLOCKSHIFT as the shift, to make the space map * accept any sorts of segments since there's no real advantage * to being more restrictive (given that we're already going * to be using 2-word entries). */ VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); spa_log_sm_set_blocklimit(spa); } /* * Find all the log space maps stored in the space map ZAP and sort * them by their TXG in spa_sm_logs_by_txg. */ static int spa_ld_log_sm_metadata(spa_t *spa) { int error; uint64_t spacemap_zap; ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); if (error == ENOENT) { /* the space map ZAP doesn't exist yet */ return (0); } else if (error != 0) { spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at " "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]", error); return (error); } zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap); (error = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { uint64_t log_txg = zfs_strtonum(za->za_name, NULL); spa_log_sm_t *sls = spa_log_sm_alloc(za->za_first_integer, log_txg); avl_add(&spa->spa_sm_logs_by_txg, sls); } zap_cursor_fini(&zc); zap_attribute_free(za); if (error != ENOENT) { spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at " "zap_cursor_retrieve(spacemap_zap) [error %d]", error); return (error); } for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed); m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) { spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) }; spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg, &target, NULL); /* * At this point if sls is zero it means that a bug occurred * in ZFS the last time the pool was open or earlier in the * import code path. In general, we would have placed a * VERIFY() here or in this case just let the kernel panic * with NULL pointer dereference when incrementing sls_mscount, * but since this is the import code path we can be a bit more * lenient. Thus, for DEBUG bits we always cause a panic, while * in production we log the error and just fail the import. */ ASSERT(sls != NULL); if (sls == NULL) { spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug " "encountered: could not find log spacemap for " "TXG %llu [error %d]", (u_longlong_t)metaslab_unflushed_txg(m), ENOENT); return (ENOENT); } sls->sls_mscount++; } return (0); } typedef struct spa_ld_log_sm_arg { spa_t *slls_spa; uint64_t slls_txg; } spa_ld_log_sm_arg_t; static int spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg) { uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; uint32_t vdev_id = sme->sme_vdev; spa_ld_log_sm_arg_t *slls = arg; spa_t *spa = slls->slls_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); /* * If the vdev has been removed (i.e. it is indirect or a hole) * skip this entry. The contents of this vdev have already moved * elsewhere. */ if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(!ms->ms_loaded); /* * If we have already flushed entries for this TXG to this * metaslab's space map, then ignore it. Note that we flush * before processing any allocations/frees for that TXG, so * the metaslab's space map only has entries from *before* * the unflushed TXG. */ if (slls->slls_txg < metaslab_unflushed_txg(ms)) return (0); switch (sme->sme_type) { case SM_ALLOC: range_tree_remove_xor_add_segment(offset, offset + size, ms->ms_unflushed_frees, ms->ms_unflushed_allocs); break; case SM_FREE: range_tree_remove_xor_add_segment(offset, offset + size, ms->ms_unflushed_allocs, ms->ms_unflushed_frees); break; default: panic("invalid maptype_t"); break; } if (!metaslab_unflushed_dirty(ms)) { metaslab_set_unflushed_dirty(ms, B_TRUE); spa_log_summary_dirty_flushed_metaslab(spa, metaslab_unflushed_txg(ms)); } return (0); } static int spa_ld_log_sm_data(spa_t *spa) { spa_log_sm_t *sls, *psls; int error = 0; /* * If we are not going to do any writes there is no need * to read the log space maps. */ if (!spa_writeable(spa)) return (0); ASSERT0(spa->spa_unflushed_stats.sus_nblocks); ASSERT0(spa->spa_unflushed_stats.sus_memused); hrtime_t read_logs_starttime = gethrtime(); /* Prefetch log spacemaps dnodes. */ for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj, ZIO_PRIORITY_SYNC_READ); } uint_t pn = 0; uint64_t ps = 0; uint64_t nsm = 0; psls = sls = avl_first(&spa->spa_sm_logs_by_txg); while (sls != NULL) { /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */ if (psls != NULL && pn < 16 && (pn < 2 || ps < 2 * dmu_prefetch_max)) { error = space_map_open(&psls->sls_sm, spa_meta_objset(spa), psls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT); if (error != 0) { spa_load_failed(spa, "spa_ld_log_sm_data(): " "failed at space_map_open(obj=%llu) " "[error %d]", (u_longlong_t)sls->sls_sm_obj, error); goto out; } dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj, 0, 0, space_map_length(psls->sls_sm), ZIO_PRIORITY_ASYNC_READ); pn++; ps += space_map_length(psls->sls_sm); psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls); continue; } /* Load TXG log spacemap into ms_unflushed_allocs/frees. */ kpreempt(KPREEMPT_SYNC); ASSERT0(sls->sls_nblocks); sls->sls_nblocks = space_map_nblocks(sls->sls_sm); spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; summary_add_data(spa, sls->sls_txg, sls->sls_mscount, 0, sls->sls_nblocks); spa_import_progress_set_notes_nolog(spa, "Read %llu of %lu log space maps", (u_longlong_t)nsm, avl_numnodes(&spa->spa_sm_logs_by_txg)); struct spa_ld_log_sm_arg vla = { .slls_spa = spa, .slls_txg = sls->sls_txg }; error = space_map_iterate(sls->sls_sm, space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla); if (error != 0) { spa_load_failed(spa, "spa_ld_log_sm_data(): failed " "at space_map_iterate(obj=%llu) [error %d]", (u_longlong_t)sls->sls_sm_obj, error); goto out; } pn--; ps -= space_map_length(sls->sls_sm); nsm++; space_map_close(sls->sls_sm); sls->sls_sm = NULL; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls); /* Update log block limits considering just loaded. */ spa_log_sm_set_blocklimit(spa); } hrtime_t read_logs_endtime = gethrtime(); spa_load_note(spa, "Read %lu log space maps (%llu total blocks - blksz = %llu bytes) " "in %lld ms", avl_numnodes(&spa->spa_sm_logs_by_txg), (u_longlong_t)spa_log_sm_nblocks(spa), (u_longlong_t)zfs_log_sm_blksz, (longlong_t)NSEC2MSEC(read_logs_endtime - read_logs_starttime)); out: if (error != 0) { for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { if (sls->sls_sm) { space_map_close(sls->sls_sm); sls->sls_sm = NULL; } } } else { ASSERT0(pn); ASSERT0(ps); } /* * Now that the metaslabs contain their unflushed changes: * [1] recalculate their actual allocated space * [2] recalculate their weights * [3] sum up the memory usage of their unflushed range trees * [4] optionally load them, if debug_load is set * * Note that even in the case where we get here because of an * error (e.g. error != 0), we still want to update the fields * below in order to have a proper teardown in spa_unload(). */ for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed); m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) { mutex_enter(&m->ms_lock); m->ms_allocated_space = space_map_allocated(m->ms_sm) + range_tree_space(m->ms_unflushed_allocs) - range_tree_space(m->ms_unflushed_frees); vdev_t *vd = m->ms_group->mg_vd; metaslab_space_update(vd, m->ms_group->mg_class, range_tree_space(m->ms_unflushed_allocs), 0, 0); metaslab_space_update(vd, m->ms_group->mg_class, -range_tree_space(m->ms_unflushed_frees), 0, 0); ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK); metaslab_recalculate_weight_and_sort(m); spa->spa_unflushed_stats.sus_memused += metaslab_unflushed_changes_memused(m); if (metaslab_debug_load && m->ms_sm != NULL) { VERIFY0(metaslab_load(m)); metaslab_set_selected_txg(m, 0); } mutex_exit(&m->ms_lock); } return (error); } static int spa_ld_unflushed_txgs(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); if (vd->vdev_top_zap == 0) return (0); uint64_t object = 0; int error = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (error == ENOENT) return (0); else if (error != 0) { spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at " "zap_lookup(vdev_top_zap=%llu) [error %d]", (u_longlong_t)vd->vdev_top_zap, error); return (error); } for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *ms = vd->vdev_ms[m]; ASSERT(ms != NULL); metaslab_unflushed_phys_t entry; uint64_t entry_size = sizeof (entry); uint64_t entry_offset = ms->ms_id * entry_size; error = dmu_read(mos, object, entry_offset, entry_size, &entry, 0); if (error != 0) { spa_load_failed(spa, "spa_ld_unflushed_txgs(): " "failed at dmu_read(obj=%llu) [error %d]", (u_longlong_t)object, error); return (error); } ms->ms_unflushed_txg = entry.msp_unflushed_txg; ms->ms_unflushed_dirty = B_FALSE; ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(ms->ms_unflushed_frees)); if (ms->ms_unflushed_txg != 0) { mutex_enter(&spa->spa_flushed_ms_lock); avl_add(&spa->spa_metaslabs_by_flushed, ms); mutex_exit(&spa->spa_flushed_ms_lock); } } return (0); } /* * Read all the log space map entries into their respective * metaslab unflushed trees and keep them sorted by TXG in the * SPA's metadata. In addition, setup all the metadata for the * memory and the block heuristics. */ int spa_ld_log_spacemaps(spa_t *spa) { int error; spa_log_sm_set_blocklimit(spa); for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; error = spa_ld_unflushed_txgs(vd); if (error != 0) return (error); } error = spa_ld_log_sm_metadata(spa); if (error != 0) return (error); /* * Note: we don't actually expect anything to change at this point * but we grab the config lock so we don't fail any assertions * when using vdev_lookup_top(). */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); error = spa_ld_log_sm_data(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); return (error); } -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, U64, ZMOD_RW, "Specific hard-limit in memory that ZFS allows to be used for " "unflushed changes"); ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, U64, ZMOD_RW, "Percentage of the overall system memory that ZFS allows to be " "used for unflushed changes (value is calculated over 1000000 for " "finer granularity)"); ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, U64, ZMOD_RW, "Hard limit (upper-bound) in the size of the space map log " "in terms of blocks."); ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, U64, ZMOD_RW, "Lower-bound limit for the maximum amount of blocks allowed in " "log spacemap (see zfs_unflushed_log_block_max)"); ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, U64, ZMOD_RW, - "Hard limit (upper-bound) in the size of the space map log " - "in terms of dirty TXGs."); + "Hard limit (upper-bound) in the size of the space map log " + "in terms of dirty TXGs."); ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, UINT, ZMOD_RW, "Tunable used to determine the number of blocks that can be used for " "the spacemap log, expressed as a percentage of the total number of " "metaslabs in the pool (e.g. 400 means the number of log blocks is " "capped at 4 times the number of metaslabs)"); ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, U64, ZMOD_RW, "The number of past TXGs that the flushing algorithm of the log " "spacemap feature uses to estimate incoming log blocks"); ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW, "Prevent the log spacemaps from being flushed and destroyed " "during pool export/destroy"); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, U64, ZMOD_RW, "Maximum number of rows allowed in the summary of the spacemap log"); ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, U64, ZMOD_RW, "Minimum number of metaslabs to flush per dirty TXG"); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index a2b887962270..7fae51cc2c52 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1,3179 +1,3177 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2023, 2024, Klara Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include #include #include #include /* * SPA locking * * There are three basic locks for managing spa_t structures: * * spa_namespace_lock (global mutex) * * This lock must be acquired to do any of the following: * * - Lookup a spa_t by name * - Add or remove a spa_t from the namespace * - Increase spa_refcount from non-zero * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices * - Held for the duration of create/destroy * - Held at the start and end of import and export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by * definition they must have an existing reference, and will never need * to lookup a spa_t by name. * * spa_refcount (per-spa zfs_refcount_t protected by mutex) * * This reference count keep track of any active users of the spa_t. The * spa_t cannot be destroyed or freed while this is non-zero. Internally, * the refcount is never really 'zero' - opening a pool implicitly keeps * some references in the DMU. Internally we check against spa_minref, but * present the image of a zero/non-zero value to consumers. * * spa_config_lock[] (per-spa array of rwlocks) * * This protects the spa_t from config changes, and must be held in * the following circumstances: * * - RW_READER to perform I/O to the spa * - RW_WRITER to change the vdev config * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount * * The namespace lock must be acquired to increase the refcount from 0 * or to check if it is zero. * * spa_refcount -> spa_config_lock[] * * There must be at least one valid reference on the spa_t to acquire * the config lock. * * spa_namespace_lock -> spa_config_lock[] * * The namespace lock must always be taken before the config lock. * * * The spa_namespace_lock can be acquired directly and is globally visible. * * The namespace is manipulated using the following functions, all of which * require the spa_namespace_lock to be held. * * spa_lookup() Lookup a spa_t by name. * * spa_add() Create a new spa_t in the namespace. * * spa_remove() Remove a spa_t from the namespace. This also * frees up any memory associated with the spa_t. * * spa_next() Returns the next spa_t in the system, or the * first if NULL is passed. * * spa_evict_all() Shutdown and remove all spa_t structures in * the system. * * spa_guid_exists() Determine whether a pool/device guid exists. * * The spa_refcount is manipulated using the following functions: * * spa_open_ref() Adds a reference to the given spa_t. Must be * called with spa_namespace_lock held if the * refcount is currently zero. * * spa_close() Remove a reference from the spa_t. This will * not free the spa_t or remove it from the * namespace. No locking is required. * * spa_refcount_zero() Returns true if the refcount is currently * zero. Must be called with spa_namespace_lock * held. * * The spa_config_lock[] is an array of rwlocks, ordered as follows: * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). * * To read the configuration, it suffices to hold one of these locks as reader. * To modify the configuration, you must hold all locks as writer. To modify * vdev state without altering the vdev tree's topology (e.g. online/offline), * you must hold SCL_STATE and SCL_ZIO as writer. * * We use these distinct config locks to avoid recursive lock entry. * For example, spa_sync() (which holds SCL_CONFIG as reader) induces * block allocations (SCL_ALLOC), which may require reading space maps * from disk (dmu_read() -> zio_read() -> SCL_ZIO). * * The spa config locks cannot be normal rwlocks because we need the * ability to hand off ownership. For example, SCL_ZIO is acquired * by the issuing thread and later released by an interrupt thread. * They do, however, obey the usual write-wanted semantics to prevent * writer (i.e. system administrator) starvation. * * The lock acquisition rules are as follows: * * SCL_CONFIG * Protects changes to the vdev tree topology, such as vdev * add/remove/attach/detach. Protects the dirty config list * (spa_config_dirty_list) and the set of spares and l2arc devices. * * SCL_STATE * Protects changes to pool state and vdev state, such as vdev * online/offline/fault/degrade/clear. Protects the dirty state list * (spa_state_dirty_list) and global pool state (spa_state). * * SCL_ALLOC * Protects changes to metaslab groups and classes. * Held as reader by metaslab_alloc() and metaslab_claim(). * * SCL_ZIO * Held by bp-level zios (those which have no io_vd upon entry) * to prevent changes to the vdev tree. The bp-level zio implicitly * protects all of its vdev child zios, which do not hold SCL_ZIO. * * SCL_FREE * Protects changes to metaslab groups and classes. * Held as reader by metaslab_free(). SCL_FREE is distinct from * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free * blocks in zio_done() while another i/o that holds either * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. * * SCL_VDEV * Held as reader to prevent changes to the vdev tree during trivial * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the * other locks, and lower than all of them, to ensure that it's safe * to acquire regardless of caller context. * * In addition, the following rules apply: * * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. * The lock ordering is SCL_CONFIG > spa_props_lock. * * (b) I/O operations on leaf vdevs. For any zio operation that takes * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), * or zio_write_phys() -- the caller must ensure that the config cannot * cannot change in the interim, and that the vdev cannot be reopened. * SCL_STATE as reader suffices for both. * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * * spa_vdev_enter() Acquire the namespace lock and the config lock * for writing. * * spa_vdev_exit() Release the config lock, wait for all I/O * to complete, sync the updated configs to the * cache, and release the namespace lock. * * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. */ avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; kcondvar_t spa_namespace_cv; static const int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG /* * Everything except dprintf, set_error, spa, and indirect_remap is on * by default in debug builds. */ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | ZFS_DEBUG_INDIRECT_REMAP); #else int zfs_flags = 0; #endif /* * zfs_recover can be set to nonzero to attempt to recover from * otherwise-fatal errors, typically caused by on-disk corruption. When * set, calls to zfs_panic_recover() will turn into warning messages. * This should only be used as a last resort, as it typically results * in leaked space, or worse. */ int zfs_recover = B_FALSE; /* * If destroy encounters an EIO while reading metadata (e.g. indirect * blocks), space referenced by the missing metadata can not be freed. * Normally this causes the background destroy to become "stalled", as * it is unable to make forward progress. While in this stalled state, * all remaining space to free from the error-encountering filesystem is * "temporarily leaked". Set this flag to cause it to ignore the EIO, * permanently leak the space from indirect blocks that can not be read, * and continue to free everything else that it can. * * The default, "stalling" behavior is useful if the storage partially * fails (i.e. some but not all i/os fail), and then later recovers. In * this case, we will be able to continue pool operations while it is * partially failed, and when it recovers, we can continue to free the * space, with no leaks. However, note that this case is actually * fairly rare. * * Typically pools either (a) fail completely (but perhaps temporarily, * e.g. a top-level vdev going offline), or (b) have localized, * permanent errors (e.g. disk returns the wrong data due to bit flip or * firmware bug). In case (a), this setting does not matter because the * pool will be suspended and the sync thread will not be able to make * forward progress regardless. In case (b), because the error is * permanent, the best we can do is leak the minimum amount of space, * which is what setting this flag will do. Therefore, it is reasonable * for this flag to normally be set, but we chose the more conservative * approach of not setting it, so that there is no possibility of * leaking space in the "partial temporary" failure case. */ int zfs_free_leak_on_eio = B_FALSE; /* * Expiration time in milliseconds. This value has two meanings. First it is * used to determine when the spa_deadman() logic should fire. By default the * spa_deadman() will fire if spa_sync() has not completed in 600 seconds. * Secondly, the value determines if an I/O is considered "hung". Any I/O that * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting * in one of three behaviors controlled by zfs_deadman_failmode. */ uint64_t zfs_deadman_synctime_ms = 600000UL; /* 10 min. */ /* * This value controls the maximum amount of time zio_wait() will block for an * outstanding IO. By default this is 300 seconds at which point the "hung" * behavior will be applied as described for zfs_deadman_synctime_ms. */ uint64_t zfs_deadman_ziotime_ms = 300000UL; /* 5 min. */ /* * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. */ uint64_t zfs_deadman_checktime_ms = 60000UL; /* 1 min. */ /* * By default the deadman is enabled. */ int zfs_deadman_enabled = B_TRUE; /* * Controls the behavior of the deadman when it detects a "hung" I/O. * Valid values are zfs_deadman_failmode=. * * wait - Wait for the "hung" I/O (default) * continue - Attempt to recover from a "hung" I/O * panic - Panic the system */ const char *zfs_deadman_failmode = "wait"; /* * The worst case is single-sector max-parity RAID-Z blocks, in which * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) * times the size; so just assume that. Add to this the fact that * we can have up to 3 DVAs per bp, and one more factor of 2 because * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, * the worst case is: * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 */ uint_t spa_asize_inflation = 24; /* * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in * the pool to be consumed (bounded by spa_max_slop). This ensures that we * don't run the pool completely out of space, due to unaccounted changes (e.g. * to the MOS). It also limits the worst-case time to allocate space. If we * have less than this amount of free space, most ZPL operations (e.g. write, * create) will return ENOSPC. The ZIL metaslabs (spa_embedded_log_class) are * also part of this 3.2% of space which can't be consumed by normal writes; * the slop space "proper" (spa_get_slop_space()) is decreased by the embedded * log space. * * Certain operations (e.g. file removal, most administrative actions) can * use half the slop space. They will only return ENOSPC if less than half * the slop space is free. Typically, once the pool has less than the slop * space free, the user will use these operations to free up space in the pool. * These are the operations that call dsl_pool_adjustedsize() with the netfree * argument set to TRUE. * * Operations that are almost guaranteed to free up space in the absence of * a pool checkpoint can use up to three quarters of the slop space * (e.g zfs destroy). * * A very restricted set of operations are always permitted, regardless of * the amount of free space. These are the operations that call * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net * increase in the amount of space used, it is possible to run the pool * completely out of space, causing it to be permanently read-only. * * Note that on very small pools, the slop space will be larger than * 3.2%, in an effort to have it be at least spa_min_slop (128MB), * but we never allow it to be more than half the pool size. * * Further, on very large pools, the slop space will be smaller than * 3.2%, to avoid reserving much more space than we actually need; bounded * by spa_max_slop (128GB). * * See also the comments in zfs_space_check_t. */ uint_t spa_slop_shift = 5; static const uint64_t spa_min_slop = 128ULL * 1024 * 1024; static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; /* * Number of allocators to use, per spa instance */ static int spa_num_allocators = 4; static int spa_cpus_per_allocator = 4; /* * Spa active allocator. * Valid values are zfs_active_allocator=. */ const char *zfs_active_allocator = "dynamic"; void spa_load_failed(spa_t *spa, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); } void spa_load_note(spa_t *spa, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); spa_import_progress_set_notes_nolog(spa, "%s", buf); } /* * By default dedup and user data indirects land in the special class */ static int zfs_ddt_data_is_special = B_TRUE; static int zfs_user_indirect_is_special = B_TRUE; /* * The percentage of special class final space reserved for metadata only. * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only * let metadata into the class. */ static uint_t zfs_special_class_metadata_reserve_pct = 25; /* * ========================================================================== * SPA config locking * ========================================================================== */ static void spa_config_lock_init(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); scl->scl_writer = NULL; scl->scl_write_wanted = 0; scl->scl_count = 0; } } static void spa_config_lock_destroy(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_destroy(&scl->scl_lock); cv_destroy(&scl->scl_cv); ASSERT(scl->scl_writer == NULL); ASSERT(scl->scl_write_wanted == 0); ASSERT(scl->scl_count == 0); } } int spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { if (scl->scl_writer || scl->scl_write_wanted) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } } else { ASSERT(scl->scl_writer != curthread); if (scl->scl_count != 0) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } scl->scl_writer = curthread; } scl->scl_count++; mutex_exit(&scl->scl_lock); } return (1); } static void spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, int mmp_flag) { (void) tag; int wlocks_held = 0; ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || (!mmp_flag && scl->scl_write_wanted)) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { ASSERT(scl->scl_writer != curthread); while (scl->scl_count != 0) { scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } scl->scl_writer = curthread; } scl->scl_count++; mutex_exit(&scl->scl_lock); } ASSERT3U(wlocks_held, <=, locks); } void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) { spa_config_enter_impl(spa, locks, tag, rw, 0); } /* * The spa_config_enter_mmp() allows the mmp thread to cut in front of * outstanding write lock requests. This is needed since the mmp updates are * time sensitive and failure to service them promptly will result in a * suspended pool. This pool suspension has been seen in practice when there is * a single disk in a pool that is responding slowly and presumably about to * fail. */ void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) { spa_config_enter_impl(spa, locks, tag, rw, 1); } void spa_config_exit(spa_t *spa, int locks, const void *tag) { (void) tag; for (int i = SCL_LOCKS - 1; i >= 0; i--) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); ASSERT(scl->scl_count > 0); if (--scl->scl_count == 0) { ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); scl->scl_writer = NULL; /* OK in either case */ cv_broadcast(&scl->scl_cv); } mutex_exit(&scl->scl_lock); } } int spa_config_held(spa_t *spa, int locks, krw_t rw) { int locks_held = 0; for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; if ((rw == RW_READER && scl->scl_count != 0) || (rw == RW_WRITER && scl->scl_writer == curthread)) locks_held |= 1 << i; } return (locks_held); } /* * ========================================================================== * SPA namespace functions * ========================================================================== */ /* * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. * Returns NULL if no matching spa_t is found. */ spa_t * spa_lookup(const char *name) { static spa_t search; /* spa_t is large; don't allocate on stack */ spa_t *spa; avl_index_t where; char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); retry: (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* * If it's a full dataset name, figure out the pool name and * just use that. */ cp = strpbrk(search.spa_name, "/@#"); if (cp != NULL) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); if (spa == NULL) return (NULL); /* * Avoid racing with import/export, which don't hold the namespace * lock for their entire duration. */ if ((spa->spa_load_thread != NULL && spa->spa_load_thread != curthread) || (spa->spa_export_thread != NULL && spa->spa_export_thread != curthread)) { cv_wait(&spa_namespace_cv, &spa_namespace_lock); goto retry; } return (spa); } /* * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. * If the zfs_deadman_enabled flag is set then it inspects all vdev queues * looking for potentially hung I/Os. */ void spa_deadman(void *arg) { spa_t *spa = arg; /* Disable the deadman if the pool is suspended. */ if (spa_suspended(spa)) return; zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", (gethrtime() - spa->spa_sync_starttime) / NANOSEC, (u_longlong_t)++spa->spa_deadman_calls); if (zfs_deadman_enabled) vdev_deadman(spa->spa_root_vdev, FTAG); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + MSEC_TO_TICK(zfs_deadman_checktime_ms)); } static int spa_log_sm_sort_by_txg(const void *va, const void *vb) { const spa_log_sm_t *a = va; const spa_log_sm_t *b = vb; return (TREE_CMP(a->sls_txg, b->sls_txg)); } /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. */ spa_t * spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_activities_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_waiters_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < TXG_SIZE; t++) bplist_create(&spa->spa_free_bplist[t]); (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; spa->spa_load_max_txg = UINT64_MAX; spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; spa->spa_trust_config = B_TRUE; spa->spa_hostid = zone_get_hostid(NULL); spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); spa_set_deadman_failmode(spa, zfs_deadman_failmode); spa_set_allocator(spa, zfs_active_allocator); zfs_refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); spa_stats_init(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); avl_add(&spa_namespace_avl, spa); /* * Set the alternate root, if there is one. */ if (altroot) spa->spa_root = spa_strdup(altroot); /* Do not allow more allocators than fraction of CPUs. */ spa->spa_alloc_count = MAX(MIN(spa_num_allocators, boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1); spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * sizeof (spa_alloc_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } if (spa->spa_alloc_count > 1) { spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t, sau_inuse[spa->spa_alloc_count]), KM_SLEEP); mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT, NULL); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node)); list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t), offsetof(log_summary_entry_t, lse_node)); /* * Every pool starts with the default cachefile */ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), offsetof(spa_config_dirent_t, scd_link)); dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (config != NULL) { nvlist_t *features; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) == 0) { VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); } if (spa->spa_label_features == NULL) { VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, KM_SLEEP) == 0); } spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; /* * As a pool is being created, treat all features as disabled by * setting SPA_FEATURE_DISABLED for all entries in the feature * refcount cache. */ for (int i = 0; i < SPA_FEATURES; i++) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } list_create(&spa->spa_leaf_list, sizeof (vdev_t), offsetof(vdev_t, vdev_leaf_node)); return (spa); } /* * Removes a spa_t from the namespace, freeing up any memory used. Requires * spa_namespace_lock. This is called only after the spa_t has been closed and * deactivated. */ void spa_remove(spa_t *spa) { spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED); ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0); ASSERT0(spa->spa_waiters); nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); if (spa->spa_root) spa_strfree(spa->spa_root); while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); } for (int i = 0; i < spa->spa_alloc_count; i++) { avl_destroy(&spa->spa_allocs[i].spaa_tree); mutex_destroy(&spa->spa_allocs[i].spaa_lock); } kmem_free(spa->spa_allocs, spa->spa_alloc_count * sizeof (spa_alloc_t)); if (spa->spa_alloc_count > 1) { mutex_destroy(&spa->spa_allocs_use->sau_lock); kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t, sau_inuse[spa->spa_alloc_count])); } avl_destroy(&spa->spa_metaslabs_by_flushed); avl_destroy(&spa->spa_sm_logs_by_txg); list_destroy(&spa->spa_log_summary); list_destroy(&spa->spa_config_list); list_destroy(&spa->spa_leaf_list); nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); nvlist_free(spa->spa_feat_stats); spa_config_set(spa, NULL); zfs_refcount_destroy(&spa->spa_refcount); spa_stats_destroy(spa); spa_config_lock_destroy(spa); for (int t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); zio_checksum_templates_free(spa); cv_destroy(&spa->spa_async_cv); cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); cv_destroy(&spa->spa_activities_cv); cv_destroy(&spa->spa_waiters_cv); mutex_destroy(&spa->spa_flushed_ms_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_evicting_os_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); mutex_destroy(&spa->spa_cksum_tmpls_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); mutex_destroy(&spa->spa_feat_stats_lock); mutex_destroy(&spa->spa_activities_lock); kmem_free(spa, sizeof (spa_t)); } /* * Given a pool, return the next pool in the namespace, or NULL if there is * none. If 'prev' is NULL, return the first pool. */ spa_t * spa_next(spa_t *prev) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (prev) return (AVL_NEXT(&spa_namespace_avl, prev)); else return (avl_first(&spa_namespace_avl)); } /* * ========================================================================== * SPA refcount functions * ========================================================================== */ /* * Add a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_open_ref(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || MUTEX_HELD(&spa_namespace_lock) || spa->spa_load_thread == curthread); (void) zfs_refcount_add(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held or be part of a pool import/export. */ void spa_close(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock) || spa->spa_load_thread == curthread || spa->spa_export_thread == curthread); (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t held by a dsl dir that is * being asynchronously released. Async releases occur from a taskq * performing eviction of dsl datasets and dirs. The namespace lock * isn't held and the hold by the object being evicted may contribute to * spa_minref (e.g. dataset or directory released during pool export), * so the asserts in spa_close() do not apply. */ void spa_async_close(spa_t *spa, const void *tag) { (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held or be the spa export thread. We really * compare against spa_minref, which is the number of references * acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock) || spa->spa_export_thread == curthread); return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); } /* * ========================================================================== * SPA spare and l2cache tracking * ========================================================================== */ /* * Hot spares and cache devices are tracked using the same code below, * for 'auxiliary' devices. */ typedef struct spa_aux { uint64_t aux_guid; uint64_t aux_pool; avl_node_t aux_avl; int aux_count; } spa_aux_t; static inline int spa_aux_compare(const void *a, const void *b) { const spa_aux_t *sa = (const spa_aux_t *)a; const spa_aux_t *sb = (const spa_aux_t *)b; return (TREE_CMP(sa->aux_guid, sb->aux_guid)); } static void spa_aux_add(vdev_t *vd, avl_tree_t *avl) { avl_index_t where; spa_aux_t search; spa_aux_t *aux; search.aux_guid = vd->vdev_guid; if ((aux = avl_find(avl, &search, &where)) != NULL) { aux->aux_count++; } else { aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); aux->aux_guid = vd->vdev_guid; aux->aux_count = 1; avl_insert(avl, aux, where); } } static void spa_aux_remove(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search; spa_aux_t *aux; avl_index_t where; search.aux_guid = vd->vdev_guid; aux = avl_find(avl, &search, &where); ASSERT(aux != NULL); if (--aux->aux_count == 0) { avl_remove(avl, aux); kmem_free(aux, sizeof (spa_aux_t)); } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { aux->aux_pool = 0ULL; } } static boolean_t spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) { spa_aux_t search, *found; search.aux_guid = guid; found = avl_find(avl, &search, NULL); if (pool) { if (found) *pool = found->aux_pool; else *pool = 0ULL; } if (refcnt) { if (found) *refcnt = found->aux_count; else *refcnt = 0; } return (found != NULL); } static void spa_aux_activate(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search, *found; avl_index_t where; search.aux_guid = vd->vdev_guid; found = avl_find(avl, &search, &where); ASSERT(found != NULL); ASSERT(found->aux_pool == 0ULL); found->aux_pool = spa_guid(vd->vdev_spa); } /* * Spares are tracked globally due to the following constraints: * * - A spare may be part of multiple pools. * - A spare may be added to a pool even if it's actively in use within * another pool. * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. * * We keep track of all spares on the system through the use of a reference * counted AVL tree. When a vdev is added as a spare, or used as a replacement * spare, then we bump the reference count in the AVL tree. In addition, we set * the 'vdev_isspare' member to indicate that the device is a spare (active or * inactive). When a spare is made active (used to replace a device in the * pool), we also keep track of which pool its been made a part of. * * The 'spa_spare_lock' protects the AVL tree. These functions are normally * called under the spa_namespace lock as part of vdev reconfiguration. The * separate spare lock exists for the status query path, which does not need to * be completely consistent with respect to other vdev configuration changes. */ static int spa_spare_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_spare_add(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(!vd->vdev_isspare); spa_aux_add(vd, &spa_spare_avl); vd->vdev_isspare = B_TRUE; mutex_exit(&spa_spare_lock); } void spa_spare_remove(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_remove(vd, &spa_spare_avl); vd->vdev_isspare = B_FALSE; mutex_exit(&spa_spare_lock); } boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) { boolean_t found; mutex_enter(&spa_spare_lock); found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); mutex_exit(&spa_spare_lock); return (found); } void spa_spare_activate(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_activate(vd, &spa_spare_avl); mutex_exit(&spa_spare_lock); } /* * Level 2 ARC devices are tracked globally for the same reasons as spares. * Cache devices currently only support one pool per cache device, and so * for these devices the aux reference count is currently unused beyond 1. */ static int spa_l2cache_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_l2cache_add(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(!vd->vdev_isl2cache); spa_aux_add(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_TRUE; mutex_exit(&spa_l2cache_lock); } void spa_l2cache_remove(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_remove(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_FALSE; mutex_exit(&spa_l2cache_lock); } boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool) { boolean_t found; mutex_enter(&spa_l2cache_lock); found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); return (found); } void spa_l2cache_activate(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_activate(vd, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); } /* * ========================================================================== * SPA vdev locking * ========================================================================== */ /* * Lock the given spa_t for the purpose of adding or removing a vdev. * Grabs the global spa_namespace_lock plus the spa config lock for writing. * It returns the next transaction group for the spa_t. */ uint64_t spa_vdev_enter(spa_t *spa) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); ASSERT0(spa->spa_export_thread); vdev_autotrim_stop_all(spa); return (spa_vdev_config_enter(spa)); } /* * The same as spa_vdev_enter() above but additionally takes the guid of * the vdev being detached. When there is a rebuild in process it will be * suspended while the vdev tree is modified then resumed by spa_vdev_exit(). * The rebuild is canceled if only a single child remains after the detach. */ uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); ASSERT0(spa->spa_export_thread); vdev_autotrim_stop_all(spa); if (guid != 0) { vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd) { vdev_rebuild_stop_wait(vd->vdev_top); } } return (spa_vdev_config_enter(spa)); } /* * Internal implementation for spa_vdev_enter(). Used when a vdev * operation requires multiple syncs (i.e. removing a device) while * keeping the spa_namespace_lock held. */ uint64_t spa_vdev_config_enter(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); return (spa_last_synced_txg(spa) + 1); } /* * Used in combination with spa_vdev_config_enter() to allow the syncing * of multiple transactions without releasing the spa_namespace_lock. */ void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, const char *tag) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); int config_changed = B_FALSE; ASSERT(txg > spa_last_synced_txg(spa)); spa->spa_pending_vdev = NULL; /* * Reassess the DTLs. */ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE); if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; spa->spa_config_generation++; } /* * Verify the metaslab classes. */ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); spa_config_exit(spa, SCL_ALL, spa); /* * Panic the system if the specified tag requires it. This * is useful for ensuring that configurations are updated * transactionally. */ if (zio_injection_enabled) zio_handle_panic_injection(spa, tag, 0); /* * Note: this txg_wait_synced() is important because it ensures * that there won't be more than one config change per txg. * This allows us to use the txg as the generation number. */ if (error == 0) txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); if (vd->vdev_ops->vdev_op_leaf) { mutex_enter(&vd->vdev_initialize_lock); vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, NULL); mutex_exit(&vd->vdev_initialize_lock); mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); mutex_exit(&vd->vdev_trim_lock); } /* * The vdev may be both a leaf and top-level device. */ vdev_autotrim_stop_wait(vd); spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_STATE_ALL, spa); } /* * If the config changed, update the config cache. */ if (config_changed) spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); } /* * Unlock the spa_t after adding or removing a vdev. Besides undoing the * locking of spa_vdev_enter(), we also want make sure the transactions have * synced to disk, and then update the global configuration cache with the new * information. */ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { vdev_autotrim_restart(spa); vdev_rebuild_restart(spa); spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Lock the given spa_t for the purpose of changing vdev state. */ void spa_vdev_state_enter(spa_t *spa, int oplocks) { int locks = SCL_STATE_ALL | oplocks; /* * Root pools may need to read of the underlying devfs filesystem * when opening up a vdev. Unfortunately if we're holding the * SCL_ZIO lock it will result in a deadlock when we try to issue * the read from the root filesystem. Instead we "prefetch" * the associated vnodes that we need prior to opening the * underlying devices and cache them so that we can prevent * any I/O when we are doing the actual open. */ if (spa_is_root(spa)) { int low = locks & ~(SCL_ZIO - 1); int high = locks & ~low; spa_config_enter(spa, high, spa, RW_WRITER); vdev_hold(spa->spa_root_vdev); spa_config_enter(spa, low, spa, RW_WRITER); } else { spa_config_enter(spa, locks, spa, RW_WRITER); } spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { boolean_t config_changed = B_FALSE; vdev_t *vdev_top; if (vd == NULL || vd == spa->spa_root_vdev) { vdev_top = spa->spa_root_vdev; } else { vdev_top = vd->vdev_top; } if (vd != NULL || error == 0) vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE); if (vd != NULL) { if (vd != spa->spa_root_vdev) vdev_state_dirty(vdev_top); config_changed = B_TRUE; spa->spa_config_generation++; } if (spa_is_root(spa)) vdev_rele(spa->spa_root_vdev); ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); spa_config_exit(spa, spa->spa_vdev_locks, spa); /* * If anything changed, wait for it to sync. This ensures that, * from the system administrator's perspective, zpool(8) commands * are synchronous. This is important for things like zpool offline: * when the command completes, you expect no further I/O from ZFS. */ if (vd != NULL) txg_wait_synced(spa->spa_dsl_pool, 0); /* * If the config changed, update the config cache. */ if (config_changed) { mutex_enter(&spa_namespace_lock); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); mutex_exit(&spa_namespace_lock); } return (error); } /* * ========================================================================== * Miscellaneous functions * ========================================================================== */ void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) { if (!nvlist_exists(spa->spa_label_features, feature)) { fnvlist_add_boolean(spa->spa_label_features, feature); /* * When we are creating the pool (tx_txg==TXG_INITIAL), we can't * dirty the vdev config because lock SCL_CONFIG is not held. * Thankfully, in this case we don't need to dirty the config * because it will be written out anyway when we finish * creating the pool. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); } } void spa_deactivate_mos_feature(spa_t *spa, const char *feature) { if (nvlist_remove_all(spa->spa_label_features, feature) == 0) vdev_config_dirty(spa->spa_root_vdev); } /* * Return the spa_t associated with given pool_guid, if it exists. If * device_guid is non-zero, determine whether the pool exists *and* contains * a device with the specified device_guid. */ spa_t * spa_by_guid(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa->spa_state == POOL_STATE_UNINITIALIZED) continue; if (spa->spa_root_vdev == NULL) continue; if (spa_guid(spa) == pool_guid) { if (device_guid == 0) break; if (vdev_lookup_by_guid(spa->spa_root_vdev, device_guid) != NULL) break; /* * Check any devices we may be in the process of adding. */ if (spa->spa_pending_vdev) { if (vdev_lookup_by_guid(spa->spa_pending_vdev, device_guid) != NULL) break; } } } return (spa); } /* * Determine whether a pool with the given pool_guid exists. */ boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) { return (spa_by_guid(pool_guid, device_guid) != NULL); } char * spa_strdup(const char *s) { size_t len; char *new; len = strlen(s); new = kmem_alloc(len + 1, KM_SLEEP); memcpy(new, s, len + 1); return (new); } void spa_strfree(char *s) { kmem_free(s, strlen(s) + 1); } uint64_t spa_generate_guid(spa_t *spa) { uint64_t guid; if (spa != NULL) { do { (void) random_get_pseudo_bytes((void *)&guid, sizeof (guid)); } while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)); } else { do { (void) random_get_pseudo_bytes((void *)&guid, sizeof (guid)); } while (guid == 0 || spa_guid_exists(guid, 0)); } return (guid); } static boolean_t spa_load_guid_exists(uint64_t guid) { avl_tree_t *t = &spa_namespace_avl; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa_t *spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa_load_guid(spa) == guid) return (B_TRUE); } return (arc_async_flush_guid_inuse(guid)); } uint64_t spa_generate_load_guid(void) { uint64_t guid; do { (void) random_get_pseudo_bytes((void *)&guid, sizeof (guid)); } while (guid == 0 || spa_load_guid_exists(guid)); return (guid); } void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { char type[256]; const char *checksum = NULL; const char *compress = NULL; if (bp != NULL) { if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); (void) snprintf(type, sizeof (type), "bswap %s %s", DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? "metadata" : "data", dmu_ot_byteswap[bswap].ob_name); } else { (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, sizeof (type)); } if (!BP_IS_EMBEDDED(bp)) { checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; } compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } SNPRINTF_BLKPTR(kmem_scnprintf, ' ', buf, buflen, bp, type, checksum, compress); } void spa_freeze(spa_t *spa) { uint64_t freeze_txg = 0; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); if (spa->spa_freeze_txg == UINT64_MAX) { freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; spa->spa_freeze_txg = freeze_txg; } spa_config_exit(spa, SCL_ALL, FTAG); if (freeze_txg != 0) txg_wait_synced(spa_get_dsl(spa), freeze_txg); } void zfs_panic_recover(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); va_end(adx); } /* * This is a stripped-down version of strtoull, suitable only for converting * lowercase hexadecimal numbers that don't overflow. */ uint64_t zfs_strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; int digit; while ((c = *str) != '\0') { if (c >= '0' && c <= '9') digit = c - '0'; else if (c >= 'a' && c <= 'f') digit = 10 + c - 'a'; else break; val *= 16; val += digit; str++; } if (nptr) *nptr = (char *)str; return (val); } void spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx) { /* * We bump the feature refcount for each special vdev added to the pool */ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); } /* * ========================================================================== * Accessor functions * ========================================================================== */ boolean_t spa_shutting_down(spa_t *spa) { return (spa->spa_async_suspended); } dsl_pool_t * spa_get_dsl(spa_t *spa) { return (spa->spa_dsl_pool); } boolean_t spa_is_initializing(spa_t *spa) { return (spa->spa_is_initializing); } boolean_t spa_indirect_vdevs_loaded(spa_t *spa) { return (spa->spa_indirect_vdevs_loaded); } blkptr_t * spa_get_rootblkptr(spa_t *spa) { return (&spa->spa_ubsync.ub_rootbp); } void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) { spa->spa_uberblock.ub_rootbp = *bp; } void spa_altroot(spa_t *spa, char *buf, size_t buflen) { if (spa->spa_root == NULL) buf[0] = '\0'; else (void) strlcpy(buf, spa->spa_root, buflen); } uint32_t spa_sync_pass(spa_t *spa) { return (spa->spa_sync_pass); } char * spa_name(spa_t *spa) { return (spa->spa_name); } uint64_t spa_guid(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t guid; /* * If we fail to parse the config during spa_load(), we can go through * the error path (which posts an ereport) and end up here with no root * vdev. We stash the original pool guid in 'spa_config_guid' to handle * this case. */ if (spa->spa_root_vdev == NULL) return (spa->spa_config_guid); guid = spa->spa_last_synced_guid != 0 ? spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; /* * Return the most recently synced out guid unless we're * in syncing context. */ if (dp && dsl_pool_sync_context(dp)) return (spa->spa_root_vdev->vdev_guid); else return (guid); } uint64_t spa_load_guid(spa_t *spa) { /* * This is a GUID that exists solely as a reference for the * purposes of the arc. It is generated at load time, and * is never written to persistent storage. */ return (spa->spa_load_guid); } uint64_t spa_last_synced_txg(spa_t *spa) { return (spa->spa_ubsync.ub_txg); } uint64_t spa_first_txg(spa_t *spa) { return (spa->spa_first_txg); } uint64_t spa_syncing_txg(spa_t *spa) { return (spa->spa_syncing_txg); } /* * Return the last txg where data can be dirtied. The final txgs * will be used to just clear out any deferred frees that remain. */ uint64_t spa_final_dirty_txg(spa_t *spa) { return (spa->spa_final_txg - TXG_DEFER_SIZE); } pool_state_t spa_state(spa_t *spa) { return (spa->spa_state); } spa_load_state_t spa_load_state(spa_t *spa) { return (spa->spa_load_state); } uint64_t spa_freeze_txg(spa_t *spa) { return (spa->spa_freeze_txg); } /* * Return the inflated asize for a logical write in bytes. This is used by the * DMU to calculate the space a logical write will require on disk. * If lsize is smaller than the largest physical block size allocatable on this * pool we use its value instead, since the write will end up using the whole * block anyway. */ uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) { if (lsize == 0) return (0); /* No inflation needed */ return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation); } /* * Return the amount of slop space in bytes. It is typically 1/32 of the pool * (3.2%), minus the embedded log space. On very small pools, it may be * slightly larger than this. On very large pools, it will be capped to * the value of spa_max_slop. The embedded log space is not included in * spa_dspace. By subtracting it, the usable space (per "zfs list") is a * constant 97% of the total space, regardless of metaslab size (assuming the * default spa_slop_shift=5 and a non-tiny pool). * * See the comment above spa_slop_shift for more details. */ uint64_t spa_get_slop_space(spa_t *spa) { uint64_t space = 0; uint64_t slop = 0; /* * Make sure spa_dedup_dspace has been set. */ if (spa->spa_dedup_dspace == ~0ULL) spa_update_dspace(spa); space = spa->spa_rdspace; slop = MIN(space >> spa_slop_shift, spa_max_slop); /* * Subtract the embedded log space, but no more than half the (3.2%) * unusable space. Note, the "no more than half" is only relevant if * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by * default. */ uint64_t embedded_log = metaslab_class_get_dspace(spa_embedded_log_class(spa)); slop -= MIN(embedded_log, slop >> 1); /* * Slop space should be at least spa_min_slop, but no more than half * the entire pool. */ slop = MAX(slop, MIN(space >> 1, spa_min_slop)); return (slop); } uint64_t spa_get_dspace(spa_t *spa) { return (spa->spa_dspace); } uint64_t spa_get_checkpoint_space(spa_t *spa) { return (spa->spa_checkpoint_info.sci_dspace); } void spa_update_dspace(spa_t *spa) { spa->spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa)); if (spa->spa_nonallocating_dspace > 0) { /* * Subtract the space provided by all non-allocating vdevs that * contribute to dspace. If a file is overwritten, its old * blocks are freed and new blocks are allocated. If there are * no snapshots of the file, the available space should remain * the same. The old blocks could be freed from the * non-allocating vdev, but the new blocks must be allocated on * other (allocating) vdevs. By reserving the entire size of * the non-allocating vdevs (including allocated space), we * ensure that there will be enough space on the allocating * vdevs for this file overwrite to succeed. * * Note that the DMU/DSL doesn't actually know or care * how much space is allocated (it does its own tracking * of how much space has been logically used). So it * doesn't matter that the data we are moving may be * allocated twice (on the old device and the new device). */ ASSERT3U(spa->spa_rdspace, >=, spa->spa_nonallocating_dspace); spa->spa_rdspace -= spa->spa_nonallocating_dspace; } spa->spa_dspace = spa->spa_rdspace + ddt_get_dedup_dspace(spa) + brt_get_dspace(spa); } /* * Return the failure mode that has been set to this pool. The default * behavior will be to block all I/Os when a complete failure occurs. */ uint64_t spa_get_failmode(spa_t *spa) { return (spa->spa_failmode); } boolean_t spa_suspended(spa_t *spa) { return (spa->spa_suspended != ZIO_SUSPEND_NONE); } uint64_t spa_version(spa_t *spa) { return (spa->spa_ubsync.ub_version); } boolean_t spa_deflate(spa_t *spa) { return (spa->spa_deflate); } metaslab_class_t * spa_normal_class(spa_t *spa) { return (spa->spa_normal_class); } metaslab_class_t * spa_log_class(spa_t *spa) { return (spa->spa_log_class); } metaslab_class_t * spa_embedded_log_class(spa_t *spa) { return (spa->spa_embedded_log_class); } metaslab_class_t * spa_special_class(spa_t *spa) { return (spa->spa_special_class); } metaslab_class_t * spa_dedup_class(spa_t *spa) { return (spa->spa_dedup_class); } boolean_t spa_special_has_ddt(spa_t *spa) { return (zfs_ddt_data_is_special && spa->spa_special_class->mc_groups != 0); } /* * Locate an appropriate allocation class */ metaslab_class_t * spa_preferred_class(spa_t *spa, const zio_t *zio) { const zio_prop_t *zp = &zio->io_prop; /* * Override object type for the purposes of selecting a storage class. * Primarily for DMU_OTN_ types where we can't explicitly control their * storage class; instead, choose a static type most closely matches * what we want. */ dmu_object_type_t objtype = zp->zp_storage_type == DMU_OT_NONE ? zp->zp_type : zp->zp_storage_type; /* * ZIL allocations determine their class in zio_alloc_zil(). */ ASSERT(objtype != DMU_OT_INTENT_LOG); boolean_t has_special_class = spa->spa_special_class->mc_groups != 0; if (DMU_OT_IS_DDT(objtype)) { if (spa->spa_dedup_class->mc_groups != 0) return (spa_dedup_class(spa)); else if (has_special_class && zfs_ddt_data_is_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } /* Indirect blocks for user data can land in special if allowed */ if (zp->zp_level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) { if (has_special_class && zfs_user_indirect_is_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } if (DMU_OT_IS_METADATA(objtype) || zp->zp_level > 0) { if (has_special_class) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } /* * Allow small file blocks in special class in some cases (like * for the dRAID vdev feature). But always leave a reserve of * zfs_special_class_metadata_reserve_pct exclusively for metadata. */ if (DMU_OT_IS_FILE(objtype) && has_special_class && zio->io_size <= zp->zp_zpl_smallblk) { metaslab_class_t *special = spa_special_class(spa); uint64_t alloc = metaslab_class_get_alloc(special); uint64_t space = metaslab_class_get_space(special); uint64_t limit = (space * (100 - zfs_special_class_metadata_reserve_pct)) / 100; if (alloc < limit) return (special); } return (spa_normal_class(spa)); } void spa_evicting_os_register(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_insert_head(&spa->spa_evicting_os_list, os); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_deregister(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_remove(&spa->spa_evicting_os_list, os); cv_broadcast(&spa->spa_evicting_os_cv); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_wait(spa_t *spa) { mutex_enter(&spa->spa_evicting_os_lock); while (!list_is_empty(&spa->spa_evicting_os_list)) cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); mutex_exit(&spa->spa_evicting_os_lock); dmu_buf_user_evict_wait(); } int spa_max_replication(spa_t *spa) { /* * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to * handle BPs with more than one DVA allocated. Set our max * replication level accordingly. */ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) return (1); return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } int spa_prev_software_version(spa_t *spa) { return (spa->spa_prev_software_version); } uint64_t spa_deadman_synctime(spa_t *spa) { return (spa->spa_deadman_synctime); } spa_autotrim_t spa_get_autotrim(spa_t *spa) { return (spa->spa_autotrim); } uint64_t spa_deadman_ziotime(spa_t *spa) { return (spa->spa_deadman_ziotime); } uint64_t spa_get_deadman_failmode(spa_t *spa) { return (spa->spa_deadman_failmode); } void spa_set_deadman_failmode(spa_t *spa, const char *failmode) { if (strcmp(failmode, "wait") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; else if (strcmp(failmode, "continue") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE; else if (strcmp(failmode, "panic") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; else spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; } void spa_set_deadman_ziotime(hrtime_t ns) { spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_ziotime = ns; mutex_exit(&spa_namespace_lock); } } void spa_set_deadman_synctime(hrtime_t ns) { spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_synctime = ns; mutex_exit(&spa_namespace_lock); } } uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); uint64_t dsize = asize; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (asize != 0 && spa->spa_deflate) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd != NULL) dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } return (dsize); } uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); return (dsize); } uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); spa_config_exit(spa, SCL_VDEV, FTAG); return (dsize); } uint64_t spa_dirty_data(spa_t *spa) { return (spa->spa_dsl_pool->dp_dirty_total); } /* * ========================================================================== * SPA Import Progress Routines * ========================================================================== */ typedef struct spa_import_progress { uint64_t pool_guid; /* unique id for updates */ char *pool_name; spa_load_state_t spa_load_state; char *spa_load_notes; uint64_t mmp_sec_remaining; /* MMP activity check */ uint64_t spa_load_max_txg; /* rewind txg */ procfs_list_node_t smh_node; } spa_import_progress_t; spa_history_list_t *spa_import_progress_list = NULL; static int spa_import_progress_show_header(struct seq_file *f) { seq_printf(f, "%-20s %-14s %-14s %-12s %-16s %s\n", "pool_guid", "load_state", "multihost_secs", "max_txg", "pool_name", "notes"); return (0); } static int spa_import_progress_show(struct seq_file *f, void *data) { spa_import_progress_t *sip = (spa_import_progress_t *)data; seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %-16s %s\n", (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state, (u_longlong_t)sip->mmp_sec_remaining, (u_longlong_t)sip->spa_load_max_txg, (sip->pool_name ? sip->pool_name : "-"), (sip->spa_load_notes ? sip->spa_load_notes : "-")); return (0); } /* Remove oldest elements from list until there are no more than 'size' left */ static void spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size) { spa_import_progress_t *sip; while (shl->size > size) { sip = list_remove_head(&shl->procfs_list.pl_list); if (sip->pool_name) spa_strfree(sip->pool_name); if (sip->spa_load_notes) kmem_strfree(sip->spa_load_notes); kmem_free(sip, sizeof (spa_import_progress_t)); shl->size--; } IMPLY(size == 0, list_is_empty(&shl->procfs_list.pl_list)); } static void spa_import_progress_init(void) { spa_import_progress_list = kmem_zalloc(sizeof (spa_history_list_t), KM_SLEEP); spa_import_progress_list->size = 0; spa_import_progress_list->procfs_list.pl_private = spa_import_progress_list; procfs_list_install("zfs", NULL, "import_progress", 0644, &spa_import_progress_list->procfs_list, spa_import_progress_show, spa_import_progress_show_header, NULL, offsetof(spa_import_progress_t, smh_node)); } static void spa_import_progress_destroy(void) { spa_history_list_t *shl = spa_import_progress_list; procfs_list_uninstall(&shl->procfs_list); spa_import_progress_truncate(shl, 0); procfs_list_destroy(&shl->procfs_list); kmem_free(shl, sizeof (spa_history_list_t)); } int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t load_state) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_state = load_state; if (sip->spa_load_notes != NULL) { kmem_strfree(sip->spa_load_notes); sip->spa_load_notes = NULL; } error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } static void spa_import_progress_set_notes_impl(spa_t *spa, boolean_t log_dbgmsg, const char *fmt, va_list adx) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; uint64_t pool_guid = spa_guid(spa); if (shl->size == 0) return; char *notes = kmem_vasprintf(fmt, adx); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { if (sip->spa_load_notes != NULL) { kmem_strfree(sip->spa_load_notes); sip->spa_load_notes = NULL; } sip->spa_load_notes = notes; if (log_dbgmsg) zfs_dbgmsg("'%s' %s", sip->pool_name, notes); notes = NULL; break; } } mutex_exit(&shl->procfs_list.pl_lock); if (notes != NULL) kmem_strfree(notes); } void spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...) { va_list adx; va_start(adx, fmt); spa_import_progress_set_notes_impl(spa, B_TRUE, fmt, adx); va_end(adx); } void spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...) { va_list adx; va_start(adx, fmt); spa_import_progress_set_notes_impl(spa, B_FALSE, fmt, adx); va_end(adx); } int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_max_txg = load_max_txg; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } int spa_import_progress_set_mmp_check(uint64_t pool_guid, uint64_t mmp_sec_remaining) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->mmp_sec_remaining = mmp_sec_remaining; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } /* * A new import is in progress, add an entry. */ void spa_import_progress_add(spa_t *spa) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; const char *poolname = NULL; sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP); sip->pool_guid = spa_guid(spa); (void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, &poolname); if (poolname == NULL) poolname = spa_name(spa); sip->pool_name = spa_strdup(poolname); sip->spa_load_state = spa_load_state(spa); sip->spa_load_notes = NULL; mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sip); shl->size++; mutex_exit(&shl->procfs_list.pl_lock); } void spa_import_progress_remove(uint64_t pool_guid) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { if (sip->pool_name) spa_strfree(sip->pool_name); if (sip->spa_load_notes) spa_strfree(sip->spa_load_notes); list_remove(&shl->procfs_list.pl_list, sip); shl->size--; kmem_free(sip, sizeof (spa_import_progress_t)); break; } } mutex_exit(&shl->procfs_list.pl_lock); } /* * ========================================================================== * Initialization and Termination * ========================================================================== */ static int spa_name_compare(const void *a1, const void *a2) { const spa_t *s1 = a1; const spa_t *s2 = a2; int s; s = strcmp(s1->spa_name, s2->spa_name); return (TREE_ISIGN(s)); } void spa_boot_init(void) { spa_config_load(); } void spa_init(spa_mode_t mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), offsetof(spa_t, spa_avl)); avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); spa_mode_global = mode; #ifndef _KERNEL if (spa_mode_global != SPA_MODE_READ && dprintf_find_string("watch")) { struct sigaction sa; sa.sa_flags = SA_SIGINFO; sigemptyset(&sa.sa_mask); sa.sa_sigaction = arc_buf_sigsegv; if (sigaction(SIGSEGV, &sa, NULL) == -1) { perror("could not enable watchpoints: " "sigaction(SIGSEGV, ...) = "); } else { arc_watch = B_TRUE; } } #endif fm_init(); zfs_refcount_init(); unique_init(); zfs_btree_init(); metaslab_stat_init(); brt_init(); ddt_init(); zio_init(); dmu_init(); zil_init(); vdev_mirror_stat_init(); vdev_raidz_math_init(); vdev_file_init(); zfs_prop_init(); chksum_init(); zpool_prop_init(); zpool_feature_init(); spa_config_load(); vdev_prop_init(); l2arc_start(); scan_init(); qat_init(); spa_import_progress_init(); zap_init(); } void spa_fini(void) { l2arc_stop(); spa_evict_all(); vdev_file_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); chksum_fini(); zil_fini(); dmu_fini(); zio_fini(); ddt_fini(); brt_fini(); metaslab_stat_fini(); zfs_btree_fini(); unique_fini(); zfs_refcount_fini(); fm_fini(); scan_fini(); qat_fini(); spa_import_progress_destroy(); zap_fini(); avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); avl_destroy(&spa_l2cache_avl); cv_destroy(&spa_namespace_cv); mutex_destroy(&spa_namespace_lock); mutex_destroy(&spa_spare_lock); mutex_destroy(&spa_l2cache_lock); } /* * Return whether this pool has a dedicated slog device. No locking needed. * It's not a problem if the wrong answer is returned as it's only for * performance and not correctness. */ boolean_t spa_has_slogs(spa_t *spa) { return (spa->spa_log_class->mc_groups != 0); } spa_log_state_t spa_get_log_state(spa_t *spa) { return (spa->spa_log_state); } void spa_set_log_state(spa_t *spa, spa_log_state_t state) { spa->spa_log_state = state; } boolean_t spa_is_root(spa_t *spa) { return (spa->spa_is_root); } boolean_t spa_writeable(spa_t *spa) { return (!!(spa->spa_mode & SPA_MODE_WRITE) && spa->spa_trust_config); } /* * Returns true if there is a pending sync task in any of the current * syncing txg, the current quiescing txg, or the current open txg. */ boolean_t spa_has_pending_synctask(spa_t *spa) { return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); } spa_mode_t spa_mode(spa_t *spa) { return (spa->spa_mode); } uint64_t spa_get_last_scrubbed_txg(spa_t *spa) { return (spa->spa_scrubbed_last_txg); } uint64_t spa_bootfs(spa_t *spa) { return (spa->spa_bootfs); } uint64_t spa_delegation(spa_t *spa) { return (spa->spa_delegation); } objset_t * spa_meta_objset(spa_t *spa) { return (spa->spa_meta_objset); } enum zio_checksum spa_dedup_checksum(spa_t *spa) { return (spa->spa_dedup_checksum); } /* * Reset pool scan stat per scan pass (or reboot). */ void spa_scan_stat_init(spa_t *spa) { /* data not stored on disk */ spa->spa_scan_pass_start = gethrestime_sec(); if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan)) spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_scrub_pause = 0; if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_errorscrub_pause = 0; spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; // error scrub stats spa->spa_scan_pass_errorscrub_spent_paused = 0; } /* * Get scan stats for zpool status reports */ int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE && scn->errorscrub_phys.dep_func == POOL_SCAN_NONE)) return (SET_ERROR(ENOENT)); memset(ps, 0, sizeof (pool_scan_stat_t)); /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; ps->pss_state = scn->scn_phys.scn_state; ps->pss_start_time = scn->scn_phys.scn_start_time; ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; ps->pss_skipped = scn->scn_phys.scn_skipped; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; /* data not stored on disk */ ps->pss_pass_exam = spa->spa_scan_pass_exam; ps->pss_pass_start = spa->spa_scan_pass_start; ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; ps->pss_pass_issued = spa->spa_scan_pass_issued; ps->pss_issued = scn->scn_issued_before_pass + spa->spa_scan_pass_issued; /* error scrub data stored on disk */ ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func; ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state; ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time; ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time; ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined; ps->pss_error_scrub_to_be_examined = scn->errorscrub_phys.dep_to_examine; /* error scrub data not stored on disk */ ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause; return (0); } int spa_maxblocksize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) return (SPA_MAXBLOCKSIZE); else return (SPA_OLD_MAXBLOCKSIZE); } /* * Returns the txg that the last device removal completed. No indirect mappings * have been added since this txg. */ uint64_t spa_get_last_removal_txg(spa_t *spa) { uint64_t vdevid; uint64_t ret = -1ULL; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* * sr_prev_indirect_vdev is only modified while holding all the * config locks, so it is sufficient to hold SCL_VDEV as reader when * examining it. */ vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; while (vdevid != -1ULL) { vdev_t *vd = vdev_lookup_top(spa, vdevid); vdev_indirect_births_t *vib = vd->vdev_indirect_births; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); /* * If the removal did not remap any data, we don't care. */ if (vdev_indirect_births_count(vib) != 0) { ret = vdev_indirect_births_last_entry_txg(vib); break; } vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; } spa_config_exit(spa, SCL_VDEV, FTAG); IMPLY(ret != -1ULL, spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); return (ret); } int spa_maxdnodesize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) return (DNODE_MAX_SIZE); else return (DNODE_MIN_SIZE); } boolean_t spa_multihost(spa_t *spa) { return (spa->spa_multihost ? B_TRUE : B_FALSE); } uint32_t spa_get_hostid(spa_t *spa) { return (spa->spa_hostid); } boolean_t spa_trust_config(spa_t *spa) { return (spa->spa_trust_config); } uint64_t spa_missing_tvds_allowed(spa_t *spa) { return (spa->spa_missing_tvds_allowed); } space_map_t * spa_syncing_log_sm(spa_t *spa) { return (spa->spa_syncing_log_sm); } void spa_set_missing_tvds(spa_t *spa, uint64_t missing) { spa->spa_missing_tvds = missing; } /* * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc). */ const char * spa_state_to_name(spa_t *spa) { ASSERT3P(spa, !=, NULL); /* * it is possible for the spa to exist, without root vdev * as the spa transitions during import/export */ vdev_t *rvd = spa->spa_root_vdev; if (rvd == NULL) { return ("TRANSITIONING"); } vdev_state_t state = rvd->vdev_state; vdev_aux_t aux = rvd->vdev_stat.vs_aux; if (spa_suspended(spa)) return ("SUSPENDED"); switch (state) { case VDEV_STATE_CLOSED: case VDEV_STATE_OFFLINE: return ("OFFLINE"); case VDEV_STATE_REMOVED: return ("REMOVED"); case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return ("FAULTED"); else if (aux == VDEV_AUX_SPLIT_POOL) return ("SPLIT"); else return ("UNAVAIL"); case VDEV_STATE_FAULTED: return ("FAULTED"); case VDEV_STATE_DEGRADED: return ("DEGRADED"); case VDEV_STATE_HEALTHY: return ("ONLINE"); default: break; } return ("UNKNOWN"); } boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t spa_has_checkpoint(spa_t *spa) { return (spa->spa_checkpoint_txg != 0); } boolean_t spa_importing_readonly_checkpoint(spa_t *spa) { return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && spa->spa_mode == SPA_MODE_READ); } uint64_t spa_min_claim_txg(spa_t *spa) { uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; if (checkpoint_txg != 0) return (checkpoint_txg + 1); return (spa->spa_first_txg); } /* * If there is a checkpoint, async destroys may consume more space from * the pool instead of freeing it. In an attempt to save the pool from * getting suspended when it is about to run out of space, we stop * processing async destroys. */ boolean_t spa_suspend_async_destroy(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t unreserved = dsl_pool_unreserved_space(dp, ZFS_SPACE_CHECK_EXTRA_RESERVED); uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; uint64_t avail = (unreserved > used) ? (unreserved - used) : 0; if (spa_has_checkpoint(spa) && avail == 0) return (B_TRUE); return (B_FALSE); } #if defined(_KERNEL) int param_set_deadman_failmode_common(const char *val) { spa_t *spa = NULL; char *p; if (val == NULL) return (SET_ERROR(EINVAL)); if ((p = strchr(val, '\n')) != NULL) *p = '\0'; if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 && strcmp(val, "panic")) return (SET_ERROR(EINVAL)); if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa_set_deadman_failmode(spa, val); mutex_exit(&spa_namespace_lock); } return (0); } #endif /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); EXPORT_SYMBOL(spa_add); EXPORT_SYMBOL(spa_remove); EXPORT_SYMBOL(spa_next); /* Refcount functions */ EXPORT_SYMBOL(spa_open_ref); EXPORT_SYMBOL(spa_close); EXPORT_SYMBOL(spa_refcount_zero); /* Pool configuration lock */ EXPORT_SYMBOL(spa_config_tryenter); EXPORT_SYMBOL(spa_config_enter); EXPORT_SYMBOL(spa_config_exit); EXPORT_SYMBOL(spa_config_held); /* Pool vdev add/remove lock */ EXPORT_SYMBOL(spa_vdev_enter); EXPORT_SYMBOL(spa_vdev_exit); /* Pool vdev state change lock */ EXPORT_SYMBOL(spa_vdev_state_enter); EXPORT_SYMBOL(spa_vdev_state_exit); /* Accessor functions */ EXPORT_SYMBOL(spa_shutting_down); EXPORT_SYMBOL(spa_get_dsl); EXPORT_SYMBOL(spa_get_rootblkptr); EXPORT_SYMBOL(spa_set_rootblkptr); EXPORT_SYMBOL(spa_altroot); EXPORT_SYMBOL(spa_sync_pass); EXPORT_SYMBOL(spa_name); EXPORT_SYMBOL(spa_guid); EXPORT_SYMBOL(spa_last_synced_txg); EXPORT_SYMBOL(spa_first_txg); EXPORT_SYMBOL(spa_syncing_txg); EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); EXPORT_SYMBOL(spa_normal_class); EXPORT_SYMBOL(spa_log_class); EXPORT_SYMBOL(spa_special_class); EXPORT_SYMBOL(spa_preferred_class); EXPORT_SYMBOL(spa_max_replication); EXPORT_SYMBOL(spa_prev_software_version); EXPORT_SYMBOL(spa_get_failmode); EXPORT_SYMBOL(spa_suspended); EXPORT_SYMBOL(spa_bootfs); EXPORT_SYMBOL(spa_delegation); EXPORT_SYMBOL(spa_meta_objset); EXPORT_SYMBOL(spa_maxblocksize); EXPORT_SYMBOL(spa_maxdnodesize); /* Miscellaneous support routines */ EXPORT_SYMBOL(spa_guid_exists); EXPORT_SYMBOL(spa_strdup); EXPORT_SYMBOL(spa_strfree); EXPORT_SYMBOL(spa_generate_guid); EXPORT_SYMBOL(snprintf_blkptr); EXPORT_SYMBOL(spa_freeze); EXPORT_SYMBOL(spa_upgrade); EXPORT_SYMBOL(spa_evict_all); EXPORT_SYMBOL(spa_lookup_by_guid); EXPORT_SYMBOL(spa_has_spare); EXPORT_SYMBOL(dva_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize); EXPORT_SYMBOL(spa_has_slogs); EXPORT_SYMBOL(spa_is_root); EXPORT_SYMBOL(spa_writeable); EXPORT_SYMBOL(spa_mode); EXPORT_SYMBOL(spa_namespace_lock); EXPORT_SYMBOL(spa_trust_config); EXPORT_SYMBOL(spa_missing_tvds_allowed); EXPORT_SYMBOL(spa_set_missing_tvds); EXPORT_SYMBOL(spa_state_to_name); EXPORT_SYMBOL(spa_importing_readonly_checkpoint); EXPORT_SYMBOL(spa_min_claim_txg); EXPORT_SYMBOL(spa_suspend_async_destroy); EXPORT_SYMBOL(spa_has_checkpoint); EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable); ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW, "Set additional debugging flags"); ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW, "Set to attempt to recover from fatal errors"); ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW, "Set to ignore IO errors during free and permanently leak the space"); ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, U64, ZMOD_RW, "Dead I/O check interval in milliseconds"); ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW, "Enable deadman timer"); ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, UINT, ZMOD_RW, "SPA size estimate multiplication factor"); ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW, "Place DDT data into the special class"); ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW, "Place user data indirect blocks into the special class"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, param_set_deadman_failmode, param_get_charp, ZMOD_RW, "Failmode for deadman timer"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms, param_set_deadman_synctime, spl_param_get_u64, ZMOD_RW, "Pool sync expiration time in milliseconds"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms, param_set_deadman_ziotime, spl_param_get_u64, ZMOD_RW, "IO expiration time in milliseconds"); ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW, "Small file blocks in special vdevs depends on this much " "free space available"); -/* END CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, param_get_uint, ZMOD_RW, "Reserved free space in pool"); ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW, "Number of allocators per spa"); ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW, "Minimum number of CPUs per allocators"); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 1a930f1e4f1a..250590f062ea 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1,6585 +1,6581 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. * Copyright (c) 2021, Klara Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" /* * One metaslab from each (normal-class) vdev is used by the ZIL. These are * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are * part of the spa_embedded_log_class. The metaslab with the most free space * in each vdev is selected for this purpose when the pool is opened (or a * vdev is added). See vdev_metaslab_init(). * * Log blocks can be allocated from the following locations. Each one is tried * in order until the allocation succeeds: * 1. dedicated log vdevs, aka "slog" (spa_log_class) * 2. embedded slog metaslabs (spa_embedded_log_class) * 3. other metaslabs in normal vdevs (spa_normal_class) * * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer * than this number of metaslabs in the vdev. This ensures that we don't set * aside an unreasonable amount of space for the ZIL. If set to less than * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced * (by more than 1<vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %llu: %s", indent, "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } /* * Virtual device management. */ static vdev_ops_t *const vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_draid_ops, &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, *const *opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. */ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { if (mc == spa_embedded_log_class(vd->vdev_spa) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else return (vd->vdev_mg); } void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { (void) vd, (void) remain_rs; physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; } /* * Derive the enumerated allocation bias from string input. * String origin is either the per-vdev zap or zpool(8). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } return (asize); } uint64_t vdev_default_min_asize(vdev_t *vd) { return (vd->vdev_min_asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, uint64_t)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } /* * Get the minimal allocation size for the top-level vdev. */ uint64_t vdev_get_min_alloc(vdev_t *vd) { uint64_t min_alloc = 1ULL << vd->vdev_ashift; if (vd->vdev_ops->vdev_op_min_alloc != NULL) min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); return (min_alloc); } /* * Get the parity level for a top-level vdev. */ uint64_t vdev_get_nparity(vdev_t *vd) { uint64_t nparity = 0; if (vd->vdev_ops->vdev_op_nparity != NULL) nparity = vd->vdev_ops->vdev_op_nparity(vd); return (nparity); } static int vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t objid; int err; if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (EINVAL); } err = zap_lookup(mos, objid, vdev_prop_to_name(prop), sizeof (uint64_t), 1, value); if (err == ENOENT) *value = vdev_prop_default_numeric(prop); return (err); } /* * Get the number of data disks for a top-level vdev. */ uint64_t vdev_get_ndisks(vdev_t *vd) { uint64_t ndisks = 1; if (vd->vdev_ops->vdev_op_ndisks != NULL) ndisks = vd->vdev_ops->vdev_op_ndisks(vd); return (ndisks); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { int rc; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); rc = vdev_count_leaves_impl(spa->spa_root_vdev); spa_config_exit(spa, SCL_VDEV, FTAG); return (rc); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { memcpy(newchild, pvd->vdev_child, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); cvd->vdev_spa->spa_leaf_list_gen++; } } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } if (cvd->vdev_ops->vdev_op_leaf) { spa_t *spa = cvd->vdev_spa; list_remove(&spa->spa_leaf_list, cvd); spa->spa_leaf_list_gen++; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_load_guid(); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); /* * Initialize rate limit structs for events. We rate limit ZIO delay * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. */ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_dio_verify_rl, &zfs_dio_write_verify_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); /* * Default Thresholds for tuning ZED */ vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N); vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; const char *type; uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; const char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { const char *bias; /* * If creating a top-level vdev, check for allocation * classes input. */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ if (spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP)); } } /* spa_vdev_add() expects feature to be enabled */ if (ops == &vdev_draid_ops && spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { return (SET_ERROR(ENOTSUP)); } } /* * Initialize the vdev specific data. This is done before calling * vdev_alloc_common() since it may fail and this simplifies the * error reporting and cleanup code paths. */ void *tsd = NULL; if (ops->vdev_op_init != NULL) { rc = ops->vdev_op_init(spa, nv, &tsd); if (rc != 0) { return (rc); } } vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_tsd = tsd; vd->vdev_islog = islog; if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0) vd->vdev_path = spa_strdup(tmp); /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a * fault on a vdev and want it to persist across imports (like with * zpool offline -f). */ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_faulted = 1; vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0) vd->vdev_devid = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0) vd->vdev_physpath = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &tmp) == 0) vd->vdev_enc_sysfs_path = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0) vd->vdev_fru = spa_strdup(tmp); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; vic = &vd->vdev_indirect_config; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. Ignore pool ashift for vdev * attach case. */ if (alloctype != VDEV_ALLOC_ATTACH) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); } else { vd->vdev_attaching = B_TRUE; } /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); if (vd->vdev_ops == &vdev_root_ops && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, &vd->vdev_root_zap); } /* * If we're a top-level vdev, try to load the allocation parameters. */ if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, &vd->vdev_noalloc); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); vd->vdev_rz_expanding = nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, &vd->vdev_rebuild_txg); if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); /* * In general, when importing a pool we want to ignore the * persistent fault state, as the diagnosis made on another * system may not be valid in the current context. The only * exception is if we forced a vdev to a persistently faulted * state with 'zpool offline -f'. The persistent fault will * remain across imports until cleared. * * Local vdevs will remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN || spa_load_state(spa) == SPA_LOAD_IMPORT) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { const char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; else vd->vdev_faulted = 0ULL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the * queue exists here, that implies the vdev is being removed while * the scan is still running. */ if (vd->vdev_scan_io_queue != NULL) { mutex_enter(&vd->vdev_scan_io_queue_lock); dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); vd->vdev_scan_io_queue = NULL; mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); if (vd->vdev_ops->vdev_op_fini != NULL) vd->vdev_ops->vdev_op_fini(vd); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); metaslab_group_destroy(vd->vdev_log_mg); vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* * Clean up vdev structure. */ vdev_queue_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_enc_sysfs_path) spa_strfree(vd->vdev_enc_sysfs_path); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_autotrim_kick_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); cv_destroy(&vd->vdev_rebuild_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_deadman_rl); zfs_ratelimit_fini(&vd->vdev_dio_verify_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); if (tvd->vdev_log_mg) ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_log_mg = svd->vdev_log_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_log_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; if (tvd->vdev_log_mg != NULL) tvd->vdev_log_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_alloc_bias = svd->vdev_alloc_bias; svd->vdev_alloc_bias = VDEV_BIAS_NONE; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. */ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_noalloc); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); tvd->vdev_noalloc = svd->vdev_noalloc; tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_noalloc = 0; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. There is no need to * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; /* * If pool not set for autoexpand, we need to also preserve * mvd's asize to prevent automatic expansion of cvd. * Otherwise if we are adjusting the mirror by attaching and * detaching children of non-uniform sizes, the mirror could * autoexpand, unexpectedly requiring larger devices to * re-establish the mirror. */ if (!cvd->vdev_spa->spa_autoexpand) cvd->vdev_asize = mvd->vdev_asize; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } /* * Choose GCD for spa_gcd_alloc. */ static uint64_t vdev_gcd(uint64_t a, uint64_t b) { while (b != 0) { uint64_t t = b; b = a % b; a = t; } return (a); } /* * Set spa_min_alloc and spa_gcd_alloc. */ static void vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; if (spa->spa_gcd_alloc == INT_MAX) { spa->spa_gcd_alloc = min_alloc; } else { spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } } void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); if (!vd->vdev_islog) { vd->vdev_log_mg = metaslab_group_create( spa_embedded_log_class(spa), vd, 1); } /* * The spa ashift min/max only apply for the normal metaslab * class. Class destination is late binding so ashift boundary * setting had to wait until now. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); vdev_spa_set_alloc(spa, min_alloc); } } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (uint64_t m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab * group. */ if (vd->vdev_mg->mg_class == spa_normal_class(spa) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; uint64_t smallest = UINT64_MAX; /* * Note, we only search the new metaslabs, because the old * (pre-existing) ones may be active (e.g. have non-empty * range_tree's), and we don't move them to the new * metaslab_t. */ for (uint64_t m = oldc; m < newc; m++) { uint64_t alloc = space_map_allocated(vd->vdev_ms[m]->ms_sm); if (alloc < smallest) { slog_msid = m; smallest = alloc; } } metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; /* * The metaslab was marked as dirty at the end of * metaslab_init(). Remove it from the dirty list so that we * can uninitialize and reinitialize it to the new class. */ if (txg != 0) { (void) txg_list_remove_this(&vd->vdev_ms_list, slog_ms, txg); } uint64_t sm_obj = space_map_object(slog_ms->ms_sm); metaslab_fini(slog_ms); VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, &vd->vdev_ms[slog_msid])); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is marked as non-allocating then don't * activate the metaslabs since we want to ensure that * no allocations are performed on this device. */ if (vd->vdev_noalloc) { /* track non-allocating vdev space */ spa->spa_nonallocating_dspace += spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; } else if (!expanding) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. */ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); if (vd->vdev_log_mg != NULL) { ASSERT(!vd->vdev_islog); metaslab_group_passivate(vd->vdev_log_mg); } uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); if (vd->vdev_log_mg != NULL) ASSERT0(vd->vdev_log_mg->mg_histogram[i]); } } ASSERT0(vd->vdev_ms_count); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); /* * If this probe was initiated from zio pipeline, then * change the state in a spa_async_request. Probes that * were initiated from a vdev_open can change the state * as part of the open call. */ if (vps->vps_zio_done_probe) { vd->vdev_fault_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); } } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_load_child(void *arg) { vdev_t *vd = arg; vd->vdev_load_error = vdev_load(vd); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } static boolean_t vdev_uses_zvols(vdev_t *vd) { #ifdef _KERNEL if (zvol_is_zvol(vd->vdev_path)) return (B_TRUE); #endif for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Returns B_TRUE if the passed child should be opened. */ static boolean_t vdev_default_open_children_func(vdev_t *vd) { (void) vd; return (B_TRUE); } /* * Open the requested child vdevs. If any of the leaf vdevs are using * a ZFS volume then do the opens in a single thread. This avoids a * deadlock when the current thread is holding the spa_namespace_lock. */ static void vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) { int children = vd->vdev_children; taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); vd->vdev_nonrot = B_TRUE; for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (open_func(cvd) == B_FALSE) continue; if (tq == NULL || vdev_uses_zvols(vd)) { cvd->vdev_open_error = vdev_open(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_open_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } vd->vdev_nonrot &= cvd->vdev_nonrot; } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } } /* * Open all child vdevs. */ void vdev_open_children(vdev_t *vd) { vdev_open_children_impl(vd, vdev_default_open_children_func); } /* * Conditionally open a subset of child vdevs. */ void vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) { vdev_open_children_impl(vd, open_func); } /* * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE * changed, this algorithm can not change, otherwise it would inconsistently * account for existing bp's. We also hard-code txg 0 for the same reason * since expanded RAIDZ vdevs can use a different asize for different birth * txg's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> SPA_MINBLOCKSHIFT); } } /* * Choose the best of two ashifts, preferring one between logical ashift * (absolute minimum) and administrator defined maximum, otherwise take * the biggest of the two. */ uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) { if (a > logical && a <= zfs_vdev_max_auto_ashift) { if (b <= logical || b > zfs_vdev_max_auto_ashift) return (a); else return (MAX(a, b)); } else if (b <= logical || b > zfs_vdev_max_auto_ashift) return (MAX(a, b)); return (b); } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ static void vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); if (vd->vdev_ashift < vd->vdev_physical_ashift && vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * If the logical and physical ashifts are the same, then * we ensure that the top-level vdev's ashift is not smaller * than our minimum ashift value. For the unusual case * where logical ashift > physical ashift, we can't cap * the calculated ashift based on max ashift as that * would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, vd->vdev_ashift); } } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_fault_wanted = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* Keep the device in removed state if unplugged */ if (error == ENOENT && vd->vdev_removed) { vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); return (error); } /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy * or damaged: either way it's not safe for use, bail out of the open. */ if (osize > max_osize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_OPEN_FAILED); return (SET_ERROR(ENXIO)); } /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t); max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } /* * If the vdev was expanded, record this so that we can re-create the * uberblock rings in labels {2,3}, during the next sync. */ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) vd->vdev_copy_uberblocks = B_TRUE; vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } /* * We can always set the logical/physical ashift members since * their values are only used to calculate the vdev_ashift when * the device is first added to the config. These values should * not be used for anything else since they may change whenever * the device is reopened and we don't store them in the label. */ vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; /* * If the vdev_ashift was not overridden at creation time * (0) or the override value is impossible for the device, * then set it the logical ashift and optimize the ashift. */ if (vd->vdev_ashift < vd->vdev_logical_ashift) { vd->vdev_ashift = vd->vdev_logical_ashift; if (vd->vdev_logical_ashift > ASHIFT_MAX) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (SET_ERROR(EDOM)); } if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE) vdev_ashift_optimize(vd); vd->vdev_attaching = B_FALSE; } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_ASHIFT); return (SET_ERROR(EDOM)); } } else { /* * Make sure the alignment required hasn't increased. */ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based of max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the minimum allocation size. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); vdev_spa_set_alloc(spa, min_alloc); } /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since * this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); return (0); } static void vdev_validate_child(void *arg) { vdev_t *vd = arg; vd->vdev_validate_thread = curthread; vd->vdev_validate_error = vdev_validate(vd); vd->vdev_validate_thread = NULL; } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; taskq_t *tq = NULL; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; int children = vd->vdev_children; if (vdev_validate_skip) return (0); if (children > 0) { tq = taskq_create("vdev_validate", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } for (uint64_t c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { vdev_validate_child(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < children; c++) { int error = vd->vdev_child[c]->vdev_validate_error; if (error != 0) return (SET_ERROR(EBADF)); } /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. */ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. */ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid) { if (svd != NULL && *dvd != NULL) { if (strcmp(svd, *dvd) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed " "from '%s' to '%s'", (u_longlong_t)guid, prefix, *dvd, svd); spa_strfree(*dvd); *dvd = spa_strdup(svd); } } else if (svd != NULL) { *dvd = spa_strdup(svd); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)guid, *dvd); } } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { char *old, *new; vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path, dvd->vdev_guid); vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid, dvd->vdev_guid); vdev_update_path("vdev_physpath", svd->vdev_physpath, &dvd->vdev_physpath, dvd->vdev_guid); /* * Our enclosure sysfs path may have changed between imports */ old = dvd->vdev_enc_sysfs_path; new = svd->vdev_enc_sysfs_path; if ((old != NULL && new == NULL) || (old == NULL && new != NULL) || ((old != NULL && new != NULL) && strcmp(new, old) != 0)) { zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path " "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, old, new); if (dvd->vdev_enc_sysfs_path) spa_strfree(dvd->vdev_enc_sysfs_path); if (svd->vdev_enc_sysfs_path) { dvd->vdev_enc_sysfs_path = spa_strdup( svd->vdev_enc_sysfs_path); } else { dvd->vdev_enc_sysfs_path = NULL; } } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have same geometry otherwise return error. Intended to copy * paths from userland config into MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. * Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; ASSERT(vd != NULL); ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { ASSERT(spa_is_root(vd->vdev_spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache) { /* * In case the vdev is present we should evict all ARC * buffers and pointers to log blocks and reclaim their * space before restoring its contents to L2ARC. */ if (l2arc_vdev_present(vd)) { l2arc_rebuild_vdev(vd, B_TRUE); } else { l2arc_add_vdev(spa, vd); } spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } else { (void) vdev_validate(vd); } /* * Recheck if resilver is still needed and cancel any * scheduled resilver if resilver is unneeded. */ if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && spa->spa_async_tasks & SPA_ASYNC_RESILVER) { mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; mutex_exit(&spa->spa_async_lock); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : SET_ERROR(ENXIO)); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. * * The default values used below are a good balance between memory * usage (larger metaslab size means more memory needed for loaded * metaslabs; more metaslabs means more memory needed for the * metaslab_t structs), metaslab load time (larger metaslabs take * longer to load), and metaslab sync time (more metaslabs means * more time spent syncing all of them). * * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. * The range of the dimensions are as follows: * * 2^29 <= ms_size <= 2^34 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslabs sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 16GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check and let the * metaslab size grow from there if that limit is hit. * * The net effect of applying above constrains is summarized below. * * vdev size metaslab count * --------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 3TB ~200 * 3TB - 2PB one per 16GB * > 2PB ~131,072 * -------------------------------- * * Finally, note that all of the above calculate the initial * number of metaslabs. Expanding a top-level vdev will result * in additional metaslabs being allocated making it possible * to exceed the zfs_vdev_ms_count_limit. */ if (ms_count < zfs_vdev_min_ms_count) ms_shift = highbit64(asize / zfs_vdev_min_ms_count); else if (ms_count > zfs_vdev_default_ms_count) ms_shift = highbit64(asize / zfs_vdev_default_ms_count); else ms_shift = zfs_vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > zfs_vdev_max_ms_shift) { ms_shift = zfs_vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * This isn't a problem but it can result in devices being tried * which are known to not have the data. In which case, the import * is relying on the checksum to ensure that we get the right data. * Note that while importing we are only reading the MOS, which is * always checksummed. */ mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Check if the txg falls within the range which must be * resilvered. DVAs outside this range can always be skipped. */ boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { (void) dva, (void) psize; /* Set by sequential resilver. */ if (phys_birth == TXG_UNKNOWN) return (B_TRUE); return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); } /* * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. */ boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); if (vd->vdev_ops->vdev_op_need_resilver == NULL || vd->vdev_ops->vdev_op_leaf) return (B_TRUE); return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, phys_birth)); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_deferred) return (B_FALSE); if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); if (rebuild_done) { vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; /* Rebuild not initiated by attach */ if (vd->vdev_rebuild_txg == 0) return (B_TRUE); /* * When a rebuild completes without error then all missing data * up to the rebuild max txg has been reconstructed and the DTL * is eligible for excision. */ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && vdev_dtl_max(vd) <= vrp->vrp_max_txg) { ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); return (B_TRUE); } } else { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; /* Resilver not initiated by attach */ if (vd->vdev_resilver_txg == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the * scn_max_txg value to the highest txg value that exists * in all DTLs. If this device's max DTL is not part of this * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] * then it is not eligible for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); return (B_TRUE); } } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. If txg == 0 no * write operations will be issued to the pool. */ static void vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done, boolean_t faulting) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess_impl(vd->vdev_child[c], txg, scrub_txg, scrub_done, rebuild_done, faulting); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* * If requested, pretend the scan or rebuild completed cleanly. */ if (zfs_scan_ignore_errors) { if (scn != NULL) scn->scn_phys.scn_errors = 0; if (vr != NULL) vr->vr_rebuild_phys.vrp_errors = 0; } if (scrub_txg != 0 && !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { wasempty = B_FALSE; zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " "dtl:%llu/%llu errors:%llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, (u_longlong_t)scrub_txg, spa->spa_scrub_started, (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd), (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); } /* * If we've completed a scrub/resilver or a rebuild cleanly * then determine if this vdev should remove any DTLs. We * only want to excise regions on vdevs that were available * during the entire duration of this scan. */ if (rebuild_done && vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { check_excise = B_TRUE; } else { if (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) { check_excise = B_TRUE; } } if (scrub_txg && check_excise && vdev_dtl_should_excise(vd, rebuild_done)) { /* * We completed a scrub, resilver or rebuild up to * scrub_txg. If we did it without rebooting, then * the scrub dtl will be valid, so excise the old * region and fold in the scrub dtl. Otherwise, * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { zfs_dbgmsg("update DTL_MISSING:%llu/%llu", (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd)); } else if (!wasempty) { zfs_dbgmsg("DTL_MISSING is now empty"); } } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); /* * For the faulting case, treat members of a replacing vdev * as if they are not available. It's more likely than not that * a vdev in a replacing vdev could encounter read errors so * treat it as not being able to contribute. */ if (!vdev_readable(vd) || (faulting && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) { range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); } else { range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); } /* * If the vdev was resilvering or rebuilding and no longer * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. */ if (txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { if (vd->vdev_rebuild_txg != 0) { vd->vdev_rebuild_txg = 0; vdev_config_dirty(vd->vdev_top); } else if (vd->vdev_resilver_txg != 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); } else { mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) { /* leaf vdevs only */ continue; } if (t == DTL_PARTIAL) { /* i.e. non-zero */ minref = 1; } else if (vdev_get_nparity(vd) != 0) { /* RAIDZ, DRAID */ minref = vdev_get_nparity(vd) + 1; } else { /* any kind of mirror */ minref = vd->vdev_children; } space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { raidz_dtl_reassessed(vd); } } void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done) { return (vdev_dtl_reassess_impl(vd, txg, scrub_txg, scrub_done, rebuild_done, B_FALSE)); } /* * Iterate over all the vdevs except spare, and post kobj events */ void vdev_post_kobj_evt(vdev_t *vd) { if (vd->vdev_ops->vdev_op_kobj_evt_post && vd->vdev_kobj_flag == B_FALSE) { vd->vdev_kobj_flag = B_TRUE; vd->vdev_ops->vdev_op_kobj_evt_post(vd); } for (int c = 0; c < vd->vdev_children; c++) vdev_post_kobj_evt(vd->vdev_child[c]); } /* * Iterate over all the vdevs except spare, and clear kobj events */ void vdev_clear_kobj_evt(vdev_t *vd) { vd->vdev_kobj_flag = B_FALSE; for (int c = 0; c < vd->vdev_children; c++) vdev_clear_kobj_evt(vd->vdev_child[c]); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); /* * If the dtl cannot be sync'd there is no need to open it. */ if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps) return (0); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, vd->vdev_dtl[DTL_MISSING]); mutex_exit(&vd->vdev_dtl_lock); } range_tree_vacate(rt, NULL, NULL); range_tree_destroy(rt); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 && spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx); vd->vdev_root_zap = vdev_create_link_zap(vd, tx); } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); } /* * Determine whether the specified vdev can be * - offlined * - detached * - removed * - faulted * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; boolean_t faulting = vd->vdev_state == VDEV_STATE_FAULTED; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)); } return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj * will contain either the checkpoint spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } int vdev_load(vdev_t *vd) { int children = vd->vdev_children; int error = 0; taskq_t *tq = NULL; /* * It's only worthwhile to use the taskq for the root vdev, because the * slow part is metaslab_init, and that only happens for top-level * vdevs. */ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) { tq = taskq_create("vdev_load", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { cvd->vdev_load_error = vdev_load(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_load_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < vd->vdev_children; c++) { int error = vd->vdev_child[c]->vdev_load_error; if (error != 0) return (error); } vdev_set_deflate_ratio(vd); if (vd->vdev_ops == &vdev_raidz_ops) { error = vdev_raidz_load(vd); if (error != 0) return (error); } /* * On spa_load path, grab the allocation bias from our zap */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; char bias_str[64]; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str); if (error == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); } else if (error != ENOENT) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); return (error); } } if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; uint64_t failfast; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast), 1, &failfast); if (error == 0) { vd->vdev_failfast = failfast & 1; } else if (error == ENOENT) { vd->vdev_failfast = vdev_prop_default_numeric( VDEV_PROP_FAILFAST); } else { vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); } } /* * Load any rebuild state from the top-level vdev zap. */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { error = vdev_rebuild_load(vd); if (error && error != ENOTSUP) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " "failed [error=%d]", error); return (error); } } if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) { uint64_t zapobj; if (vd->vdev_top_zap != 0) zapobj = vd->vdev_top_zap; else zapobj = vd->vdev_leaf_zap; error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N, &vd->vdev_checksum_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T, &vd->vdev_checksum_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_N, &vd->vdev_io_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_T, &vd->vdev_io_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, &vd->vdev_slow_io_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T, &vd->vdev_slow_io_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); } /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); } error = vdev_metaslab_init(vd, 0); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj; error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since the checkpoint_sm contains free entries * exclusively we can use space_map_allocated() to * indicate the cumulative checkpointed space that * has been freed. */ vd->vdev_stat.vs_checkpoint_space = -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve " "checkpoint space map object from vdev ZAP " "[error=%d]", error); return (error); } } /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object; error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " "space map object from vdev ZAP [error=%d]", error); return (error); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } static void vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) { objset_t *mos = spa_meta_objset(vd->vdev_spa); if (vd->vdev_top_zap == 0) return; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) return; VERIFY0(err); VERIFY0(dmu_object_free(mos, object, tx)); VERIFY0(zap_remove(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. */ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) { metaslab_sync_reassess(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_sync_reassess(vd->vdev_log_mg); } } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; ASSERT3U(txg, ==, spa->spa_syncing_txg); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (range_tree_space(vd->vdev_obsolete_segments) > 0) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); vdev_indirect_sync_obsolete(vd, tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. */ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); dmu_tx_commit(tx); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); dmu_tx_commit(tx); } /* * Return the amount of space that should be (or was) allocated for the given * psize (compressed block size) in the given TXG. Note that for expanded * RAIDZ vdevs, the size allocated for older BP's may be larger. See * vdev_raidz_asize(). */ uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) { return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); tvd = vd->vdev_top; /* * If user did a 'zpool offline -f' then make the fault persist across * reboots. */ if (aux == VDEV_AUX_EXTERNAL_PERSIST) { /* * There are two kinds of forced faults: temporary and * persistent. Temporary faults go away at pool import, while * persistent faults stay set. Both types of faults can be * cleared with a zpool clear. * * We tell if a vdev is persistently faulted by looking at the * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at * import then it's a persistent fault. Otherwise, it's * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This * tells vdev_config_generate() (which gets run later) to set * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. */ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_tmpoffline = B_FALSE; aux = VDEV_AUX_EXTERNAL; } else { vd->vdev_tmpoffline = B_TRUE; } /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_remove_wanted(spa_t *spa, uint64_t guid) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); /* * If the vdev is already removed, or expanding which can trigger * repartition add/remove events, then don't do anything. */ if (vd->vdev_removed || vd->vdev_expanding) return (spa_vdev_state_exit(spa, NULL, 0)); /* * Confirm the vdev has been removed, otherwise don't do anything. */ if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL))) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST))); vd->vdev_remove_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_REMOVE); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand); vd->vdev_expansion_time = gethrestime_sec(); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa->spa_ccw_fail_time = 0; spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); /* * Restart trimming if necessary. We do not restart trimming for cache * devices here. This is triggered by l2arc_rebuild_vdev() * asynchronously for the whole device or in l2arc_evict() as it evicts * space for upcoming writes. */ mutex_enter(&vd->vdev_trim_lock); if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) { spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); /* * Asynchronously detach spare vdev if resilver or * rebuild is not required */ if (vd->vdev_unspare && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && !vdev_rebuild_active(tvd)) spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); } return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); if (vd->vdev_ops == &vdev_draid_spare_ops) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(space_map_allocated( tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect or removed vdev. */ if (!vdev_is_concrete(vd) || vd->vdev_removed) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_stat.vs_aux = 0; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); /* If a resilver isn't required, check if vdevs can be culled */ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; /* Clear recent error events cache (i.e. duplicate events tracking) */ zfs_ereport_clear(spa, vd); } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { /* * Exclude the dRAID spare when aggregating to avoid double counting * the ops and bytes. These IOs are counted by the physical leaves. */ if (cvd->vdev_ops == &vdev_draid_spare_ops) return; for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { (void) cvd; int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) return (B_TRUE); /* * If double-word space map entries are not enabled we assume * 47 bits of the space map entry are dedicated to the entry's * offset (see SM_OFFSET_BITS in space_map.h). We then use that * to calculate the maximum address that can be described by a * space map entry for the given device. */ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). */ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { memcpy(vs, &vd->vdev_stat, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report initializing progress. Since we don't * have the initializing locks held, this is only * an estimate (although a fairly accurate one). */ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; /* * Report manual TRIM progress. Since we don't have * the manual TRIM locks held, this is only an * estimate (although fairly accurate one). */ vs->vs_trim_notsup = !vd->vdev_has_trim; vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; /* Set when there is a deferred resilver. */ vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab * sized units since that determines how much space the pool * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN_TYPED( vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift, uint64_t); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; if (vd->vdev_physical_ashift <= ASHIFT_MAX) vs->vs_physical_ashift = vd->vdev_physical_ashift; else vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, * non-auxiliary, concrete devices. */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { /* * The vdev fragmentation rating doesn't take into * account the embedded slog metaslab (vdev_log_mg). * Since it's only one metaslab, it would have a tiny * impact on the overall fragmentation. */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } vs->vs_noalloc = MAX(vd->vdev_noalloc, tvd ? tvd->vdev_noalloc : 0); } vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; /* Suppress ASAN false positive */ #ifdef __SANITIZE_ADDRESS__ vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL; vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL; #else vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; #endif zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { /* * Repair is the result of a resilver issued by the * scan thread (spa_sync). */ if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } /* * Repair is the result of a rebuild issued by the * rebuild thread (vdev_rebuild_thread). To avoid * double counting repaired bytes the virtual dRAID * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); } vs->vs_rebuild_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). */ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as * ZIO_TYPE_FLUSH. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_FLUSH; /* * Solely for the purposes of 'zpool iostat -lqrw' * reporting use the priority to categorize the IO. * Only the following are reported to user space: * * ZIO_PRIORITY_SYNC_READ, * ZIO_PRIORITY_SYNC_WRITE, * ZIO_PRIORITY_ASYNC_READ, * ZIO_PRIORITY_ASYNC_WRITE, * ZIO_PRIORITY_SCRUB, * ZIO_PRIORITY_TRIM, * ZIO_PRIORITY_REBUILD. */ if (priority == ZIO_PRIORITY_INITIALIZING) { ASSERT3U(type, ==, ZIO_TYPE_WRITE); priority = ZIO_PRIORITY_ASYNC_WRITE; } else if (priority == ZIO_PRIORITY_REMOVAL) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_ASYNC_READ); } vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { (void) defer_delta; int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. */ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); /* ensure we won't underflow */ if (alloc_delta < 0) { ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); } vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { ASSERT(!vd->vdev_isl2cache); mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { /* * Since vdev_offline() code path is already in an offline * state we can miss a statechange event to OFFLINE. Check * the previous state to catch this condition. */ if (vd->vdev_ops->vdev_op_leaf && (state == VDEV_STATE_OFFLINE) && (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { /* post an offline state change */ zfs_post_state_change(spa, vd, vd->vdev_prevstate); } vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; case VDEV_AUX_BAD_ASHIFT: class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, save_state); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify ZED of any significant state-change on a leaf vdev. * */ if (vd->vdev_ops->vdev_op_leaf) { /* preserve original state from a vdev_reopen() */ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && (vd->vdev_prevstate != vd->vdev_state) && (save_state <= VDEV_STATE_CLOSED)) save_state = vd->vdev_prevstate; /* filter out state change due to initial vdev_open */ if (save_state > VDEV_STATE_CLOSED) zfs_post_state_change(spa, vd, save_state); } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. */ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_spa->spa_raidz_expand == NULL || vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; VERIFY3U(pvd->vdev_children, >, 1); vdev_remove_child(pvd, vd); vdev_compact_children(pvd); ASSERT3P(pvd->vdev_child, !=, NULL); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd, const char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd, tag); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; zfs_dbgmsg("slow vdev: %s has %u active IOs", vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); } mutex_exit(&vq->vq_lock); } } void vdev_defer_resilver(vdev_t *vd) { ASSERT(vd->vdev_ops->vdev_op_leaf); vd->vdev_resilver_deferred = B_TRUE; vd->vdev_spa->spa_resilver_deferred = B_TRUE; } /* * Clears the resilver deferred flag on all leaf devs under vd. Returns * B_TRUE if we have devices that need to be resilvered and are available to * accept resilver I/Os. */ boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) { boolean_t resilver_needed = B_FALSE; spa_t *spa = vd->vdev_spa; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } if (vd == spa->spa_root_vdev && spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); vdev_config_dirty(vd); spa->spa_resilver_deferred = B_FALSE; return (resilver_needed); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (resilver_needed); vd->vdev_resilver_deferred = B_FALSE; return (!vdev_is_dead(vd) && !vd->vdev_offline && vdev_resilver_needed(vd, NULL, NULL)); } boolean_t vdev_xlate_is_empty(range_seg64_t *rs) { return (rs->rs_start == rs->rs_end); } /* * Translate a logical range to the first contiguous physical range for the * specified vdev_t. This function is initially called with a leaf vdev and * will walk each parent vdev until it reaches a top-level vdev. Once the * top-level is reached the physical range is initialized and the recursive * function begins to unwind. As it unwinds it calls the parent's vdev * specific translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, remain_rs); } else { /* * We've reached the top-level vdev, initialize the physical * range to the logical range and set an empty remaining * range then start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; remain_rs->rs_start = logical_rs->rs_start; remain_rs->rs_end = logical_rs->rs_start; return; } vdev_t *pvd = vd->vdev_parent; ASSERT3P(pvd, !=, NULL); ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); /* * As this recursive function unwinds, translate the logical * range into its physical and any remaining components by calling * the vdev specific translate function. */ range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg) { range_seg64_t iter_rs = *logical_rs; range_seg64_t physical_rs; range_seg64_t remain_rs; while (!vdev_xlate_is_empty(&iter_rs)) { vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); /* * With raidz and dRAID, it's possible that the logical range * does not live on this leaf vdev. Only when there is a non- * zero physical size call the provided function. */ if (!vdev_xlate_is_empty(&physical_rs)) func(arg, &physical_rs); iter_rs = remain_rs; } } static char * vdev_name(vdev_t *vd, char *buf, int buflen) { if (vd->vdev_path == NULL) { if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) { strlcpy(buf, vd->vdev_spa->spa_name, buflen); } else if (!vd->vdev_ops->vdev_op_leaf) { snprintf(buf, buflen, "%s-%llu", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id); } } else { strlcpy(buf, vd->vdev_path, buflen); } return (buf); } /* * Look at the vdev tree and determine whether any devices are currently being * replaced. */ boolean_t vdev_replace_in_progress(vdev_t *vdev) { ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); if (vdev->vdev_ops == &vdev_replacing_ops) return (B_TRUE); /* * A 'spare' vdev indicates that we have a replace in progress, unless * it has exactly two children, and the second, the hot spare, has * finished being resilvered. */ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) return (B_TRUE); for (int i = 0; i < vdev->vdev_children; i++) { if (vdev_replace_in_progress(vdev->vdev_child[i])) return (B_TRUE); } return (B_FALSE); } /* * Add a (source=src, propname=propval) list to an nvlist. */ static void vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval, uint64_t intval, zprop_source_t src) { nvlist_t *propval; propval = fnvlist_alloc(); fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) fnvlist_add_string(propval, ZPROP_VALUE, strval); else fnvlist_add_uint64(propval, ZPROP_VALUE, intval); fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } static void vdev_props_set_sync(void *arg, dmu_tx_t *tx) { vdev_t *vd; nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; uint64_t vdev_guid; uint64_t objid; nvlist_t *nvprops; vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS); vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); /* this vdev could get removed while waiting for this sync task */ if (vd == NULL) return; /* * Set vdev property values in the vdev props mos object. */ if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { panic("unexpected vdev type"); } mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { uint64_t intval; const char *strval; vdev_prop_t prop; const char *propname = nvpair_name(elem); zprop_type_t proptype; switch (prop = vdev_name_to_prop(propname)) { case VDEV_PROP_USERPROP: if (vdev_prop_user(propname)) { strval = fnvpair_value_string(elem); if (strlen(strval) == 0) { /* remove the property if value == "" */ (void) zap_remove(mos, objid, propname, tx); } else { VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); } spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } break; default: /* normalize the property name */ propname = vdev_prop_to_name(prop); proptype = vdev_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(vdev_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, objid, propname, sizeof (uint64_t), 1, &intval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%lld", (u_longlong_t)vdev_guid, nvpair_name(elem), (longlong_t)intval); } else { panic("invalid vdev property type %u", nvpair_type(elem)); } } } mutex_exit(&spa->spa_props_lock); } int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; nvpair_t *elem = NULL; uint64_t vdev_guid; nvlist_t *nvprops; int error = 0; ASSERT(vd != NULL); /* Check that vdev has a zap we can use */ if (vd->vdev_root_zap == 0 && vd->vdev_top_zap == 0 && vd->vdev_leaf_zap == 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, &nvprops) != 0) return (SET_ERROR(EINVAL)); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) return (SET_ERROR(EINVAL)); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { const char *propname = nvpair_name(elem); vdev_prop_t prop = vdev_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) { error = EINVAL; goto end; } if (vdev_prop_readonly(prop)) { error = EROFS; goto end; } /* Special Processing */ switch (prop) { case VDEV_PROP_PATH: if (vd->vdev_path == NULL) { error = EROFS; break; } if (nvpair_value_string(elem, &strval) != 0) { error = EINVAL; break; } /* New path must start with /dev/ */ if (strncmp(strval, "/dev/", 5)) { error = EINVAL; break; } error = spa_vdev_setpath(spa, vdev_guid, strval); break; case VDEV_PROP_ALLOCATING: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } if (intval != vd->vdev_noalloc) break; if (intval == 0) error = spa_vdev_noalloc(spa, vdev_guid); else error = spa_vdev_alloc(spa, vdev_guid); break; case VDEV_PROP_FAILFAST: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_failfast = intval & 1; break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_n = intval; break; case VDEV_PROP_CHECKSUM_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_t = intval; break; case VDEV_PROP_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_n = intval; break; case VDEV_PROP_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_t = intval; break; case VDEV_PROP_SLOW_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_slow_io_n = intval; break; case VDEV_PROP_SLOW_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_slow_io_t = intval; break; default: /* Most processing is done in vdev_props_set_sync */ break; } end: if (error != 0) { intval = error; vdev_prop_add_list(outnvl, propname, strval, intval, 0); return (error); } } return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync, innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } int vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int err = 0; uint64_t objid; uint64_t vdev_guid; nvpair_t *elem = NULL; nvlist_t *nvprops = NULL; uint64_t intval = 0; char *strval = NULL; const char *propname = NULL; vdev_prop_t prop; ASSERT(vd != NULL); ASSERT(mos != NULL); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (SET_ERROR(EINVAL)); } ASSERT(objid != 0); mutex_enter(&spa->spa_props_lock); if (nvprops != NULL) { char namebuf[64] = { 0 }; while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { intval = 0; strval = NULL; propname = nvpair_name(elem); prop = vdev_name_to_prop(propname); zprop_source_t src = ZPROP_SRC_DEFAULT; uint64_t integer_size, num_integers; switch (prop) { /* Special Read-only Properties */ case VDEV_PROP_NAME: strval = vdev_name(vd, namebuf, sizeof (namebuf)); if (strval == NULL) continue; vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_CAPACITY: /* percent used */ intval = (vd->vdev_stat.vs_dspace == 0) ? 0 : (vd->vdev_stat.vs_alloc * 100 / vd->vdev_stat.vs_dspace); vdev_prop_add_list(outnvl, propname, NULL, intval, ZPROP_SRC_NONE); continue; case VDEV_PROP_STATE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_state, ZPROP_SRC_NONE); continue; case VDEV_PROP_GUID: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_guid, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_asize, ZPROP_SRC_NONE); continue; case VDEV_PROP_PSIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_psize, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASHIFT: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_ashift, ZPROP_SRC_NONE); continue; case VDEV_PROP_SIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE); continue; case VDEV_PROP_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace - vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_ALLOCATED: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_EXPANDSZ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_esize, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRAGMENTATION: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_fragmentation, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARITY: vdev_prop_add_list(outnvl, propname, NULL, vdev_get_nparity(vd), ZPROP_SRC_NONE); continue; case VDEV_PROP_PATH: if (vd->vdev_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_DEVID: if (vd->vdev_devid == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_devid, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PHYS_PATH: if (vd->vdev_physpath == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_physpath, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_ENC_PATH: if (vd->vdev_enc_sysfs_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRU: if (vd->vdev_fru == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_fru, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARENT: if (vd->vdev_parent != NULL) { strval = vdev_name(vd->vdev_parent, namebuf, sizeof (namebuf)); vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); } continue; case VDEV_PROP_CHILDREN: if (vd->vdev_children > 0) strval = kmem_zalloc(ZAP_MAXVALUELEN, KM_SLEEP); for (uint64_t i = 0; i < vd->vdev_children; i++) { const char *vname; vname = vdev_name(vd->vdev_child[i], namebuf, sizeof (namebuf)); if (vname == NULL) vname = "(unknown)"; if (strlen(strval) > 0) strlcat(strval, ",", ZAP_MAXVALUELEN); strlcat(strval, vname, ZAP_MAXVALUELEN); } if (strval != NULL) { vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); kmem_free(strval, ZAP_MAXVALUELEN); } continue; case VDEV_PROP_NUMCHILDREN: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_children, ZPROP_SRC_NONE); continue; case VDEV_PROP_READ_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_read_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_WRITE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_write_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_CHECKSUM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_checksum_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_INITIALIZE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_initialize_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_TRIM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_trim_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_SLOW_IOS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_slow_ios, ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_REMOVING: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_removing, ZPROP_SRC_NONE); continue; case VDEV_PROP_RAIDZ_EXPANDING: /* Only expose this for raidz */ if (vd->vdev_ops == &vdev_raidz_ops) { vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_rz_expanding, ZPROP_SRC_NONE); } continue; case VDEV_PROP_TRIM_SUPPORT: /* only valid for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) { vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_has_trim, ZPROP_SRC_NONE); } continue; /* Numeric Properites */ case VDEV_PROP_ALLOCATING: /* Leaf vdevs cannot have this property */ if (vd->vdev_mg == NULL && vd->vdev_top != NULL) { src = ZPROP_SRC_NONE; intval = ZPROP_BOOLEAN_NA; } else { err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; } vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; case VDEV_PROP_FAILFAST: src = ZPROP_SRC_LOCAL; strval = NULL; err = zap_lookup(mos, objid, nvpair_name(elem), sizeof (uint64_t), 1, &intval); if (err == ENOENT) { intval = vdev_prop_default_numeric( prop); err = 0; } else if (err) { break; } if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; vdev_prop_add_list(outnvl, propname, strval, intval, src); break; case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: case VDEV_PROP_SLOW_IO_N: case VDEV_PROP_SLOW_IO_T: err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; /* Text Properties */ case VDEV_PROP_COMMENT: /* Exists in the ZAP below */ /* FALLTHRU */ case VDEV_PROP_USERPROP: /* User Properites */ src = ZPROP_SRC_LOCAL; err = zap_length(mos, objid, nvpair_name(elem), &integer_size, &num_integers); if (err) break; switch (integer_size) { case 8: /* User properties cannot be integers */ err = EINVAL; break; case 1: /* string property */ strval = kmem_alloc(num_integers, KM_SLEEP); err = zap_lookup(mos, objid, nvpair_name(elem), 1, num_integers, strval); if (err) { kmem_free(strval, num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, num_integers); break; } break; default: err = ENOENT; break; } if (err) break; } } else { /* * Get all properties from the MOS vdev property object. */ zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, mos, objid); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { intval = 0; strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; propname = za->za_name; switch (za->za_integer_length) { case 8: /* We do not allow integer user properties */ /* This is likely an internal value */ break; case 1: /* string property */ strval = kmem_alloc(za->za_num_integers, KM_SLEEP); err = zap_lookup(mos, objid, za->za_name, 1, za->za_num_integers, strval); if (err) { kmem_free(strval, za->za_num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, za->za_num_integers); break; default: break; } } zap_cursor_fini(&zc); zap_attribute_free(za); } mutex_exit(&spa->spa_props_lock); if (err && err != ENOENT) { return (err); } return (0); } EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW, "Default lower limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW, "Default upper limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, "Rate limit hung IO (deadman) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, "Rate Direct I/O write verify events to this many per second"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW, "Direct I/O writes will perform for checksum verification before " "commiting write"); ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below ZED threshold)."); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, "Bypass vdev_validate()"); ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); -/* END CSTYLED */ diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index e3dba0257b21..cd24f97ae7cd 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1,1925 +1,1923 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2014, 2020 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * An indirect vdev corresponds to a vdev that has been removed. Since * we cannot rewrite block pointers of snapshots, etc., we keep a * mapping from old location on the removed device to the new location * on another device in the pool and use this mapping whenever we need * to access the DVA. Unfortunately, this mapping did not respect * logical block boundaries when it was first created, and so a DVA on * this indirect vdev may be "split" into multiple sections that each * map to a different location. As a consequence, not all DVAs can be * translated to an equivalent new DVA. Instead we must provide a * "vdev_remap" operation that executes a callback on each contiguous * segment of the new location. This function is used in multiple ways: * * - I/Os to this vdev use the callback to determine where the * data is now located, and issue child I/Os for each segment's new * location. * * - frees and claims to this vdev use the callback to free or claim * each mapped segment. (Note that we don't actually need to claim * log blocks on indirect vdevs, because we don't allocate to * removing vdevs. However, zdb uses zio_claim() for its leak * detection.) */ /* * "Big theory statement" for how we mark blocks obsolete. * * When a block on an indirect vdev is freed or remapped, a section of * that vdev's mapping may no longer be referenced (aka "obsolete"). We * keep track of how much of each mapping entry is obsolete. When * an entry becomes completely obsolete, we can remove it, thus reducing * the memory used by the mapping. The complete picture of obsolescence * is given by the following data structures, described below: * - the entry-specific obsolete count * - the vdev-specific obsolete spacemap * - the pool-specific obsolete bpobj * * == On disk data structures used == * * We track the obsolete space for the pool using several objects. Each * of these objects is created on demand and freed when no longer * needed, and is assumed to be empty if it does not exist. * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects. * * - Each vic_mapping_object (associated with an indirect vdev) can * have a vimp_counts_object. This is an array of uint32_t's * with the same number of entries as the vic_mapping_object. When * the mapping is condensed, entries from the vic_obsolete_sm_object * (see below) are folded into the counts. Therefore, each * obsolete_counts entry tells us the number of bytes in the * corresponding mapping entry that were not referenced when the * mapping was last condensed. * * - Each indirect or removing vdev can have a vic_obsolete_sm_object. * This is a space map containing an alloc entry for every DVA that * has been obsoleted since the last time this indirect vdev was * condensed. We use this object in order to improve performance * when marking a DVA as obsolete. Instead of modifying an arbitrary * offset of the vimp_counts_object, we only need to append an entry * to the end of this object. When a DVA becomes obsolete, it is * added to the obsolete space map. This happens when the DVA is * freed, remapped and not referenced by a snapshot, or the last * snapshot referencing it is destroyed. * * - Each dataset can have a ds_remap_deadlist object. This is a * deadlist object containing all blocks that were remapped in this * dataset but referenced in a previous snapshot. Blocks can *only* * appear on this list if they were remapped (dsl_dataset_block_remapped); * blocks that were killed in a head dataset are put on the normal * ds_deadlist and marked obsolete when they are freed. * * - The pool can have a dp_obsolete_bpobj. This is a list of blocks * in the pool that need to be marked obsolete. When a snapshot is * destroyed, we move some of the ds_remap_deadlist to the obsolete * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then * asynchronously process the obsolete bpobj, moving its entries to * the specific vdevs' obsolete space maps. * * == Summary of how we mark blocks as obsolete == * * - When freeing a block: if any DVA is on an indirect vdev, append to * vic_obsolete_sm_object. * - When remapping a block, add dva to ds_remap_deadlist (if prev snap * references; otherwise append to vic_obsolete_sm_object). * - When freeing a snapshot: move parts of ds_remap_deadlist to * dp_obsolete_bpobj (same algorithm as ds_deadlist). * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to * individual vdev's vic_obsolete_sm_object. */ /* * "Big theory statement" for how we condense indirect vdevs. * * Condensing an indirect vdev's mapping is the process of determining * the precise counts of obsolete space for each mapping entry (by * integrating the obsolete spacemap into the obsolete counts) and * writing out a new mapping that contains only referenced entries. * * We condense a vdev when we expect the mapping to shrink (see * vdev_indirect_should_condense()), but only perform one condense at a * time to limit the memory usage. In addition, we use a separate * open-context thread (spa_condense_indirect_thread) to incrementally * create the new mapping object in a way that minimizes the impact on * the rest of the system. * * == Generating a new mapping == * * To generate a new mapping, we follow these steps: * * 1. Save the old obsolete space map and create a new mapping object * (see spa_condense_indirect_start_sync()). This initializes the * spa_condensing_indirect_phys with the "previous obsolete space map", * which is now read only. Newly obsolete DVAs will be added to a * new (initially empty) obsolete space map, and will not be * considered as part of this condense operation. * * 2. Construct in memory the precise counts of obsolete space for each * mapping entry, by incorporating the obsolete space map into the * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().) * * 3. Iterate through each mapping entry, writing to the new mapping any * entries that are not completely obsolete (i.e. which don't have * obsolete count == mapping length). (See * spa_condense_indirect_generate_new_mapping().) * * 4. Destroy the old mapping object and switch over to the new one * (spa_condense_indirect_complete_sync). * * == Restarting from failure == * * To restart the condense when we import/open the pool, we must start * at the 2nd step above: reconstruct the precise counts in memory, * based on the space map + counts. Then in the 3rd step, we start * iterating where we left off: at vimp_max_offset of the new mapping * object. */ static int zfs_condense_indirect_vdevs_enable = B_TRUE; /* * Condense if at least this percent of the bytes in the mapping is * obsolete. With the default of 25%, the amount of space mapped * will be reduced to 1% of its original size after at most 16 * condenses. Higher values will condense less often (causing less * i/o); lower values will reduce the mapping size more quickly. */ static uint_t zfs_condense_indirect_obsolete_pct = 25; /* * Condense if the obsolete space map takes up more than this amount of * space on disk (logically). This limits the amount of disk space * consumed by the obsolete space map; the default of 1GB is small enough * that we typically don't mind "wasting" it. */ static uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; /* * Don't bother condensing if the mapping uses less than this amount of * memory. The default of 128KB is considered a "trivial" amount of * memory and not worth reducing. */ static uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; /* * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a condense (which might otherwise * complete too quickly). If used to reduce the performance impact of * condensing in production, a maximum value of 1 should be sufficient. */ static uint_t zfs_condense_indirect_commit_entry_delay_ms = 0; /* * If an indirect split block contains more than this many possible unique * combinations when being reconstructed, consider it too computationally * expensive to check them all. Instead, try at most 100 randomly-selected * combinations each time the block is accessed. This allows all segment * copies to participate fairly in the reconstruction when all combinations * cannot be checked and prevents repeated use of one bad copy. */ uint_t zfs_reconstruct_indirect_combinations_max = 4096; /* * Enable to simulate damaged segments and validate reconstruction. This * is intentionally not exposed as a module parameter. */ unsigned long zfs_reconstruct_indirect_damage_fraction = 0; /* * The indirect_child_t represents the vdev that we will read from, when we * need to read all copies of the data (e.g. for scrub or reconstruction). * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, * ic_vdev is a child of the mirror. */ typedef struct indirect_child { abd_t *ic_data; vdev_t *ic_vdev; /* * ic_duplicate is NULL when the ic_data contents are unique, when it * is determined to be a duplicate it references the primary child. */ struct indirect_child *ic_duplicate; list_node_t ic_node; /* node on is_unique_child */ int ic_error; /* set when a child does not contain the data */ } indirect_child_t; /* * The indirect_split_t represents one mapped segment of an i/o to the * indirect vdev. For non-split (contiguously-mapped) blocks, there will be * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. * For split blocks, there will be several of these. */ typedef struct indirect_split { list_node_t is_node; /* link on iv_splits */ /* * is_split_offset is the offset into the i/o. * This is the sum of the previous splits' is_size's. */ uint64_t is_split_offset; vdev_t *is_vdev; /* top-level vdev */ uint64_t is_target_offset; /* offset on is_vdev */ uint64_t is_size; int is_children; /* number of entries in is_child[] */ int is_unique_children; /* number of entries in is_unique_child */ list_t is_unique_child; /* * is_good_child is the child that we are currently using to * attempt reconstruction. */ indirect_child_t *is_good_child; indirect_child_t is_child[]; } indirect_split_t; /* * The indirect_vsd_t is associated with each i/o to the indirect vdev. * It is the "Vdev-Specific Data" in the zio_t's io_vsd. */ typedef struct indirect_vsd { boolean_t iv_split_block; boolean_t iv_reconstruct; uint64_t iv_unique_combinations; uint64_t iv_attempts; uint64_t iv_attempts_max; list_t iv_splits; /* list of indirect_split_t's */ } indirect_vsd_t; static void vdev_indirect_map_free(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; indirect_split_t *is; while ((is = list_remove_head(&iv->iv_splits)) != NULL) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data != NULL) abd_free(ic->ic_data); } indirect_child_t *ic; while ((ic = list_remove_head(&is->is_unique_child)) != NULL) ; list_destroy(&is->is_unique_child); kmem_free(is, offsetof(indirect_split_t, is_child[is->is_children])); } kmem_free(iv, sizeof (*iv)); } static const zio_vsd_ops_t vdev_indirect_vsd_ops = { .vsd_free = vdev_indirect_map_free, }; /* * Mark the given offset and size as being obsolete. */ void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(size > 0); VERIFY(vdev_indirect_mapping_entry_for_offset( vd->vdev_indirect_mapping, offset) != NULL); if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { mutex_enter(&vd->vdev_obsolete_lock); range_tree_add(vd->vdev_obsolete_segments, offset, size); mutex_exit(&vd->vdev_obsolete_lock); vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); } } /* * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This * wrapper is provided because the DMU does not know about vdev_t's and * cannot directly call vdev_indirect_mark_obsolete. */ void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, uint64_t size, dmu_tx_t *tx) { vdev_t *vd = vdev_lookup_top(spa, vdev_id); ASSERT(dmu_tx_is_syncing(tx)); /* The DMU can only remap indirect vdevs. */ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); vdev_indirect_mark_obsolete(vd, offset, size); } static spa_condensing_indirect_t * spa_condensing_indirect_create(spa_t *spa) { spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); objset_t *mos = spa->spa_meta_objset; for (int i = 0; i < TXG_SIZE; i++) { list_create(&sci->sci_new_mapping_entries[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); } sci->sci_new_mapping = vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); return (sci); } static void spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) { for (int i = 0; i < TXG_SIZE; i++) list_destroy(&sci->sci_new_mapping_entries[i]); if (sci->sci_new_mapping != NULL) vdev_indirect_mapping_close(sci->sci_new_mapping); kmem_free(sci, sizeof (*sci)); } boolean_t vdev_indirect_should_condense(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; spa_t *spa = vd->vdev_spa; ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); if (!zfs_condense_indirect_vdevs_enable) return (B_FALSE); /* * We can only condense one indirect vdev at a time. */ if (spa->spa_condensing_indirect != NULL) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); /* * The mapping object size must not change while we are * condensing, so we can only condense indirect vdevs * (not vdevs that are still in the middle of being removed). */ if (vd->vdev_ops != &vdev_indirect_ops) return (B_FALSE); /* * If nothing new has been marked obsolete, there is no * point in condensing. */ uint64_t obsolete_sm_obj __maybe_unused; ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj)); if (vd->vdev_obsolete_sm == NULL) { ASSERT0(obsolete_sm_obj); return (B_FALSE); } ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_obj, ==, space_map_object(vd->vdev_obsolete_sm)); uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); uint64_t mapping_size = vdev_indirect_mapping_size(vim); uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); ASSERT3U(bytes_obsolete, <=, bytes_mapped); /* * If a high percentage of the bytes that are mapped have become * obsolete, condense (unless the mapping is already small enough). * This has a good chance of reducing the amount of memory used * by the mapping. */ if (bytes_obsolete * 100 / bytes_mapped >= zfs_condense_indirect_obsolete_pct && mapping_size > zfs_condense_min_mapping_bytes) { zfs_dbgmsg("should condense vdev %llu because obsolete " "spacemap covers %d%% of %lluMB mapping", (u_longlong_t)vd->vdev_id, (int)(bytes_obsolete * 100 / bytes_mapped), (u_longlong_t)bytes_mapped / 1024 / 1024); return (B_TRUE); } /* * If the obsolete space map takes up too much space on disk, * condense in order to free up this disk space. */ if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { zfs_dbgmsg("should condense vdev %llu because obsolete sm " "length %lluMB >= max size %lluMB", (u_longlong_t)vd->vdev_id, (u_longlong_t)obsolete_sm_size / 1024 / 1024, (u_longlong_t)zfs_condense_max_obsolete_bytes / 1024 / 1024); return (B_TRUE); } return (B_FALSE); } /* * This sync task completes (finishes) a condense, deleting the old * mapping and replacing it with the new one. */ static void spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; objset_t *mos = spa->spa_meta_objset; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); uint64_t new_count = vdev_indirect_mapping_num_entries(sci->sci_new_mapping); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT3P(sci, ==, spa->spa_condensing_indirect); for (int i = 0; i < TXG_SIZE; i++) { ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } ASSERT(vic->vic_mapping_object != 0); ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); /* * Reset vdev_indirect_mapping to refer to the new object. */ rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vd->vdev_indirect_mapping = sci->sci_new_mapping; rw_exit(&vd->vdev_indirect_rwlock); sci->sci_new_mapping = NULL; vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); vic->vic_mapping_object = scip->scip_next_mapping_object; scip->scip_next_mapping_object = 0; space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); scip->scip_prev_obsolete_sm_object = 0; scip->scip_vdev = 0; VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, tx)); spa_condensing_indirect_destroy(spa->spa_condensing_indirect); spa->spa_condensing_indirect = NULL; zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " "new mapping object %llu has %llu entries " "(was %llu entries)", (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx), (u_longlong_t)vic->vic_mapping_object, (u_longlong_t)new_count, (u_longlong_t)old_count); vdev_config_dirty(spa->spa_root_vdev); } /* * This sync task appends entries to the new mapping object. */ static void spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa; ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(sci, ==, spa->spa_condensing_indirect); vdev_indirect_mapping_add_entries(sci->sci_new_mapping, &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); } /* * Open-context function to add one entry to the new mapping. The new * entry will be remembered and written from syncing context. */ static void spa_condense_indirect_commit_entry(spa_t *spa, vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) { spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; /* * If we are the first entry committed this txg, kick off the sync * task to write to the MOS on our behalf. */ if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { dsl_sync_task_nowait(dmu_tx_pool(tx), spa_condense_indirect_commit_sync, sci, tx); } vdev_indirect_mapping_entry_t *vime = kmem_alloc(sizeof (*vime), KM_SLEEP); vime->vime_mapping = *vimep; vime->vime_obsolete_count = count; list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime); dmu_tx_commit(tx); } static void spa_condense_indirect_generate_new_mapping(vdev_t *vd, uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr) { spa_t *spa = vd->vdev_spa; uint64_t mapi = start_index; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; uint64_t old_num_entries = vdev_indirect_mapping_num_entries(old_mapping); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev); zfs_dbgmsg("starting condense of vdev %llu from index %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)mapi); while (mapi < old_num_entries) { if (zthr_iscancelled(zthr)) { zfs_dbgmsg("pausing condense of vdev %llu " "at index %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)mapi); break; } vdev_indirect_mapping_entry_phys_t *entry = &old_mapping->vim_entries[mapi]; uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst); ASSERT3U(obsolete_counts[mapi], <=, entry_size); if (obsolete_counts[mapi] < entry_size) { spa_condense_indirect_commit_entry(spa, entry, obsolete_counts[mapi]); /* * This delay may be requested for testing, debugging, * or performance reasons. */ hrtime_t now = gethrtime(); hrtime_t sleep_until = now + MSEC2NSEC( zfs_condense_indirect_commit_entry_delay_ms); zfs_sleep_until(sleep_until); } mapi++; } } static boolean_t spa_condense_indirect_thread_check(void *arg, zthr_t *zthr) { (void) zthr; spa_t *spa = arg; return (spa->spa_condensing_indirect != NULL); } static void spa_condense_indirect_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_t *vd; ASSERT3P(spa->spa_condensing_indirect, !=, NULL); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev); ASSERT3P(vd, !=, NULL); spa_config_exit(spa, SCL_VDEV, FTAG); spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; uint32_t *counts; uint64_t start_index; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; space_map_t *prev_obsolete_sm = NULL; ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); for (int i = 0; i < TXG_SIZE; i++) { /* * The list must start out empty in order for the * _commit_sync() sync task to be properly registered * on the first call to _commit_entry(); so it's wise * to double check and ensure we actually are starting * with empty lists. */ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); if (prev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, counts, prev_obsolete_sm); } space_map_close(prev_obsolete_sm); /* * Generate new mapping. Determine what index to continue from * based on the max offset that we've already written in the * new mapping. */ uint64_t max_offset = vdev_indirect_mapping_max_offset(sci->sci_new_mapping); if (max_offset == 0) { /* We haven't written anything to the new mapping yet. */ start_index = 0; } else { /* * Pick up from where we left off. _entry_for_offset() * returns a pointer into the vim_entries array. If * max_offset is greater than any of the mappings * contained in the table NULL will be returned and * that indicates we've exhausted our iteration of the * old_mapping. */ vdev_indirect_mapping_entry_phys_t *entry = vdev_indirect_mapping_entry_for_offset_or_next(old_mapping, max_offset); if (entry == NULL) { /* * We've already written the whole new mapping. * This special value will cause us to skip the * generate_new_mapping step and just do the sync * task to complete the condense. */ start_index = UINT64_MAX; } else { start_index = entry - old_mapping->vim_entries; ASSERT3U(start_index, <, vdev_indirect_mapping_num_entries(old_mapping)); } } spa_condense_indirect_generate_new_mapping(vd, counts, start_index, zthr); vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts); /* * If the zthr has received a cancellation signal while running * in generate_new_mapping() or at any point after that, then bail * early. We don't want to complete the condense if the spa is * shutting down. */ if (zthr_iscancelled(zthr)) return; VERIFY0(dsl_sync_task(spa_name(spa), NULL, spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* * Sync task to begin the condensing process. */ void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; ASSERT0(scip->scip_next_mapping_object); ASSERT0(scip->scip_prev_obsolete_sm_object); ASSERT0(scip->scip_vdev); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); uint64_t obsolete_sm_obj; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj)); ASSERT3U(obsolete_sm_obj, !=, 0); scip->scip_vdev = vd->vdev_id; scip->scip_next_mapping_object = vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; /* * We don't need to allocate a new space map object, since * vdev_indirect_sync_obsolete will allocate one when needed. */ space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), sizeof (*scip) / sizeof (uint64_t), scip, tx)); ASSERT3P(spa->spa_condensing_indirect, ==, NULL); spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " "posm=%llu nm=%llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx), (u_longlong_t)scip->scip_prev_obsolete_sm_object, (u_longlong_t)scip->scip_next_mapping_object); zthr_wakeup(spa->spa_condense_zthr); } /* * Sync to the given vdev's obsolete space map any segments that are no longer * referenced as of the given txg. * * If the obsolete space map doesn't exist yet, create and open it. */ void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; ASSERT3U(vic->vic_mapping_object, !=, 0); ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)); uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object == 0) { obsolete_sm_object = space_map_alloc(spa->spa_meta_objset, zfs_vdev_standard_sm_blksz, tx); ASSERT(vd->vdev_top_zap != 0); VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx)); ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); ASSERT3U(obsolete_sm_object, !=, 0); spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(space_map_open(&vd->vdev_obsolete_sm, spa->spa_meta_objset, obsolete_sm_object, 0, vd->vdev_asize, 0)); } ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); space_map_write(vd->vdev_obsolete_sm, vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); } int spa_condense_init(spa_t *spa) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t), &spa->spa_condensing_indirect_phys); if (error == 0) { if (spa_writeable(spa)) { spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); } return (0); } else if (error == ENOENT) { return (0); } else { return (error); } } void spa_condense_fini(spa_t *spa) { if (spa->spa_condensing_indirect != NULL) { spa_condensing_indirect_destroy(spa->spa_condensing_indirect); spa->spa_condensing_indirect = NULL; } } void spa_start_indirect_condensing_thread(spa_t *spa) { ASSERT3P(spa->spa_condense_zthr, ==, NULL); spa->spa_condense_zthr = zthr_create("z_indirect_condense", spa_condense_indirect_thread_check, spa_condense_indirect_thread, spa, minclsyspri); } /* * Gets the obsolete spacemap object from the vdev's ZAP. On success sm_obj * will contain either the obsolete spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } /* * Gets the obsolete count are precise spacemap object from the vdev's ZAP. * On success are_precise will be set to reflect if the counts are precise. * All other errors are returned to the caller. */ int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *are_precise = B_FALSE; return (0); } uint64_t val = 0; int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val); if (error == 0) { *are_precise = (val != 0); } else if (error == ENOENT) { *are_precise = B_FALSE; error = 0; } return (error); } static void vdev_indirect_close(vdev_t *vd) { (void) vd; } static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { *psize = *max_psize = vd->vdev_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; *logical_ashift = vd->vdev_ashift; *physical_ashift = vd->vdev_physical_ashift; return (0); } typedef struct remap_segment { vdev_t *rs_vd; uint64_t rs_offset; uint64_t rs_asize; uint64_t rs_split_offset; list_node_t rs_node; } remap_segment_t; static remap_segment_t * rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) { remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP); rs->rs_vd = vd; rs->rs_offset = offset; rs->rs_asize = asize; rs->rs_split_offset = split_offset; return (rs); } /* * Given an indirect vdev and an extent on that vdev, it duplicates the * physical entries of the indirect mapping that correspond to the extent * to a new array and returns a pointer to it. In addition, copied_entries * is populated with the number of mapping entries that were duplicated. * * Note that the function assumes that the caller holds vdev_indirect_rwlock. * This ensures that the mapping won't change due to condensing as we * copy over its contents. * * Finally, since we are doing an allocation, it is up to the caller to * free the array allocated in this function. */ static vdev_indirect_mapping_entry_phys_t * vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t *copied_entries) { vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t entries = 0; ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock)); vdev_indirect_mapping_entry_phys_t *first_mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); ASSERT3P(first_mapping, !=, NULL); vdev_indirect_mapping_entry_phys_t *m = first_mapping; while (asize > 0) { uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); uint64_t inner_size = MIN(asize, size - inner_offset); offset += inner_size; asize -= inner_size; entries++; m++; } size_t copy_length = entries * sizeof (*first_mapping); duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); memcpy(duplicate_mappings, first_mapping, copy_length); *copied_entries = entries; return (duplicate_mappings); } /* * Goes through the relevant indirect mappings until it hits a concrete vdev * and issues the callback. On the way to the concrete vdev, if any other * indirect vdevs are encountered, then the callback will also be called on * each of those indirect vdevs. For example, if the segment is mapped to * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is * mapped to segment B on concrete vdev 2, then the callback will be called on * both vdev 1 and vdev 2. * * While the callback passed to vdev_indirect_remap() is called on every vdev * the function encounters, certain callbacks only care about concrete vdevs. * These types of callbacks should return immediately and explicitly when they * are called on an indirect vdev. * * Because there is a possibility that a DVA section in the indirect device * has been split into multiple sections in our mapping, we keep track * of the relevant contiguous segments of the new location (remap_segment_t) * in a stack. This way we can call the callback for each of the new sections * created by a single section of the indirect device. Note though, that in * this scenario the callbacks in each split block won't occur in-order in * terms of offset, so callers should not make any assumptions about that. * * For callbacks that don't handle split blocks and immediately return when * they encounter them (as is the case for remap_blkptr_cb), the caller can * assume that its callback will be applied from the first indirect vdev * encountered to the last one and then the concrete vdev, in that order. */ static void vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg) { list_t stack; spa_t *spa = vd->vdev_spa; list_create(&stack, sizeof (remap_segment_t), offsetof(remap_segment_t, rs_node)); for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); rs != NULL; rs = list_remove_head(&stack)) { vdev_t *v = rs->rs_vd; uint64_t num_entries = 0; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); ASSERT(rs->rs_asize > 0); /* * Note: As this function can be called from open context * (e.g. zio_read()), we need the following rwlock to * prevent the mapping from being changed by condensing. * * So we grab the lock and we make a copy of the entries * that are relevant to the extent that we are working on. * Once that is done, we drop the lock and iterate over * our copy of the mapping. Once we are done with the with * the remap segment and we free it, we also free our copy * of the indirect mapping entries that are relevant to it. * * This way we don't need to wait until the function is * finished with a segment, to condense it. In addition, we * don't need a recursive rwlock for the case that a call to * vdev_indirect_remap() needs to call itself (through the * codepath of its callback) for the same vdev in the middle * of its execution. */ rw_enter(&v->vdev_indirect_rwlock, RW_READER); ASSERT3P(v->vdev_indirect_mapping, !=, NULL); vdev_indirect_mapping_entry_phys_t *mapping = vdev_indirect_mapping_duplicate_adjacent_entries(v, rs->rs_offset, rs->rs_asize, &num_entries); ASSERT3P(mapping, !=, NULL); ASSERT3U(num_entries, >, 0); rw_exit(&v->vdev_indirect_rwlock); for (uint64_t i = 0; i < num_entries; i++) { /* * Note: the vdev_indirect_mapping can not change * while we are running. It only changes while the * removal is in progress, and then only from syncing * context. While a removal is in progress, this * function is only called for frees, which also only * happen from syncing context. */ vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; ASSERT3P(m, !=, NULL); ASSERT3U(rs->rs_asize, >, 0); uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); ASSERT3U(rs->rs_offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); ASSERT3U(rs->rs_offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); ASSERT3U(dst_vdev, !=, v->vdev_id); uint64_t inner_offset = rs->rs_offset - DVA_MAPPING_GET_SRC_OFFSET(m); uint64_t inner_size = MIN(rs->rs_asize, size - inner_offset); vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); ASSERT3P(dst_v, !=, NULL); if (dst_v->vdev_ops == &vdev_indirect_ops) { list_insert_head(&stack, rs_alloc(dst_v, dst_offset + inner_offset, inner_size, rs->rs_split_offset)); } if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) && IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) { /* * Note: This clause exists only solely for * testing purposes. We use it to ensure that * split blocks work and that the callbacks * using them yield the same result if issued * in reverse order. */ uint64_t inner_half = inner_size / 2; func(rs->rs_split_offset + inner_half, dst_v, dst_offset + inner_offset + inner_half, inner_half, arg); func(rs->rs_split_offset, dst_v, dst_offset + inner_offset, inner_half, arg); } else { func(rs->rs_split_offset, dst_v, dst_offset + inner_offset, inner_size, arg); } rs->rs_offset += inner_size; rs->rs_asize -= inner_size; rs->rs_split_offset += inner_size; } VERIFY0(rs->rs_asize); kmem_free(mapping, num_entries * sizeof (*mapping)); kmem_free(rs, sizeof (remap_segment_t)); } list_destroy(&stack); } static void vdev_indirect_child_io_done(zio_t *zio) { zio_t *pio = zio->io_private; mutex_enter(&pio->io_lock); pio->io_error = zio_worst_error(pio->io_error, zio->io_error); mutex_exit(&pio->io_lock); abd_free(zio->io_abd); } /* * This is a callback for vdev_indirect_remap() which allocates an * indirect_split_t for each split segment and adds it to iv_splits. */ static void vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { zio_t *zio = arg; indirect_vsd_t *iv = zio->io_vsd; ASSERT3P(vd, !=, NULL); if (vd->vdev_ops == &vdev_indirect_ops) return; int n = 1; if (vd->vdev_ops == &vdev_mirror_ops) n = vd->vdev_children; indirect_split_t *is = kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); is->is_children = n; is->is_size = size; is->is_split_offset = split_offset; is->is_target_offset = offset; is->is_vdev = vd; list_create(&is->is_unique_child, sizeof (indirect_child_t), offsetof(indirect_child_t, ic_node)); /* * Note that we only consider multiple copies of the data for * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even * though they use the same ops as mirror, because there's only one * "good" copy under the replacing/spare. */ if (vd->vdev_ops == &vdev_mirror_ops) { for (int i = 0; i < n; i++) { is->is_child[i].ic_vdev = vd->vdev_child[i]; list_link_init(&is->is_child[i].ic_node); } } else { is->is_child[0].ic_vdev = vd; } list_insert_tail(&iv->iv_splits, is); } static void vdev_indirect_read_split_done(zio_t *zio) { indirect_child_t *ic = zio->io_private; if (zio->io_error != 0) { /* * Clear ic_data to indicate that we do not have data for this * child. */ abd_free(ic->ic_data); ic->ic_data = NULL; } } /* * Issue reads for all copies (mirror children) of all splits. */ static void vdev_indirect_read_all(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic = &is->is_child[i]; if (!vdev_readable(ic->ic_vdev)) continue; /* * If a child is missing the data, set ic_error. Used * in vdev_indirect_repair(). We perform the read * nevertheless which provides the opportunity to * reconstruct the split block if at all possible. */ if (vdev_dtl_contains(ic->ic_vdev, DTL_MISSING, zio->io_txg, 1)) ic->ic_error = SET_ERROR(ESTALE); ic->ic_data = abd_alloc_sametype(zio->io_abd, is->is_size); ic->ic_duplicate = NULL; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, ic->ic_data, is->is_size, zio->io_type, zio->io_priority, 0, vdev_indirect_read_split_done, ic)); } } iv->iv_reconstruct = B_TRUE; } static void vdev_indirect_io_start(zio_t *zio) { spa_t *spa __maybe_unused = zio->io_spa; indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); list_create(&iv->iv_splits, sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); zio->io_vsd = iv; zio->io_vsd_ops = &vdev_indirect_vsd_ops; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (zio->io_type != ZIO_TYPE_READ) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); /* * Note: this code can handle other kinds of writes, * but we don't expect them. */ ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); } vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, vdev_indirect_gather_splits, zio); indirect_split_t *first = list_head(&iv->iv_splits); ASSERT3P(first, !=, NULL); if (first->is_size == zio->io_size) { /* * This is not a split block; we are pointing to the entire * data, which will checksum the same as the original data. * Pass the BP down so that the child i/o can verify the * checksum, and try a different location if available * (e.g. on a mirror). * * While this special case could be handled the same as the * general (split block) case, doing it this way ensures * that the vast majority of blocks on indirect vdevs * (which are not split) are handled identically to blocks * on non-indirect vdevs. This allows us to be less strict * about performance in the general (but rare) case. */ ASSERT0(first->is_split_offset); ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, first->is_vdev, first->is_target_offset, abd_get_offset(zio->io_abd, 0), zio->io_size, zio->io_type, zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } else { iv->iv_split_block = B_TRUE; if (zio->io_type == ZIO_TYPE_READ && zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { /* * Read all copies. Note that for simplicity, * we don't bother consulting the DTL in the * resilver case. */ vdev_indirect_read_all(zio); } else { /* * If this is a read zio, we read one copy of each * split segment, from the top-level vdev. Since * we don't know the checksum of each split * individually, the child zio can't ensure that * we get the right data. E.g. if it's a mirror, * it will just read from a random (healthy) leaf * vdev. We have to verify the checksum in * vdev_indirect_io_done(). * * For write zios, the vdev code will ensure we write * to all children. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { zio_nowait(zio_vdev_child_io(zio, NULL, is->is_vdev, is->is_target_offset, abd_get_offset_size(zio->io_abd, is->is_split_offset, is->is_size), is->is_size, zio->io_type, zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } } } zio_execute(zio); } /* * Report a checksum error for a child. */ static void vdev_indirect_checksum_error(zio_t *zio, indirect_split_t *is, indirect_child_t *ic) { vdev_t *vd = ic->ic_vdev; if (zio->io_flags & ZIO_FLAG_SPECULATIVE) return; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); zio_bad_cksum_t zbc = { 0 }; abd_t *bad_abd = ic->ic_data; abd_t *good_abd = is->is_good_child->ic_data; (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc); } /* * Issue repair i/os for any incorrect copies. We do this by comparing * each split segment's correct data (is_good_child's ic_data) with each * other copy of the data. If they differ, then we overwrite the bad data * with the good copy. The DTL is checked in vdev_indirect_read_all() and * if a vdev is missing a copy of the data we set ic_error and the read is * performed. This provides the opportunity to reconstruct the split block * if at all possible. ic_error is checked here and if set it suppresses * incrementing the checksum counter. Aside from this DTLs are not checked, * which simplifies this code and also issues the optimal number of writes * (based on which copies actually read bad data, as opposed to which we * think might be wrong). For the same reason, we always use * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). */ static void vdev_indirect_repair(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (!spa_writeable(zio->io_spa)) return; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic == is->is_good_child) continue; if (ic->ic_data == NULL) continue; if (ic->ic_duplicate == is->is_good_child) continue; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, is->is_good_child->ic_data, is->is_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, NULL, NULL)); /* * If ic_error is set the current child does not have * a copy of the data, so suppress incrementing the * checksum counter. */ if (ic->ic_error == ESTALE) continue; vdev_indirect_checksum_error(zio, is, ic); } } } /* * Report checksum errors on all children that we read from. */ static void vdev_indirect_all_checksum_errors(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (zio->io_flags & ZIO_FLAG_SPECULATIVE) return; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data == NULL) continue; vdev_t *vd = ic->ic_vdev; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, NULL, NULL, NULL); } } } /* * Copy data from all the splits to a main zio then validate the checksum. * If then checksum is successfully validated return success. */ static int vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio) { zio_bad_cksum_t zbc; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { ASSERT3P(is->is_good_child->ic_data, !=, NULL); ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL); abd_copy_off(zio->io_abd, is->is_good_child->ic_data, is->is_split_offset, 0, is->is_size); } return (zio_checksum_error(zio, &zbc)); } /* * There are relatively few possible combinations making it feasible to * deterministically check them all. We do this by setting the good_child * to the next unique split version. If we reach the end of the list then * "carry over" to the next unique split version (like counting in base * is_unique_children, but each digit can have a different base). */ static int vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio) { boolean_t more = B_TRUE; iv->iv_attempts = 0; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) is->is_good_child = list_head(&is->is_unique_child); while (more == B_TRUE) { iv->iv_attempts++; more = B_FALSE; if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) return (0); for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_good_child = list_next(&is->is_unique_child, is->is_good_child); if (is->is_good_child != NULL) { more = B_TRUE; break; } is->is_good_child = list_head(&is->is_unique_child); } } ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations); return (SET_ERROR(ECKSUM)); } /* * There are too many combinations to try all of them in a reasonable amount * of time. So try a fixed number of random combinations from the unique * split versions, after which we'll consider the block unrecoverable. */ static int vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio) { iv->iv_attempts = 0; while (iv->iv_attempts < iv->iv_attempts_max) { iv->iv_attempts++; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic = list_head(&is->is_unique_child); int children = is->is_unique_children; for (int i = random_in_range(children); i > 0; i--) ic = list_next(&is->is_unique_child, ic); ASSERT3P(ic, !=, NULL); is->is_good_child = ic; } if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) return (0); } return (SET_ERROR(ECKSUM)); } /* * This is a validation function for reconstruction. It randomly selects * a good combination, if one can be found, and then it intentionally * damages all other segment copes by zeroing them. This forces the * reconstruction algorithm to locate the one remaining known good copy. */ static int vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) { int error; /* Presume all the copies are unique for initial selection. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_unique_children = 0; for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic = &is->is_child[i]; if (ic->ic_data != NULL) { is->is_unique_children++; list_insert_tail(&is->is_unique_child, ic); } } if (list_is_empty(&is->is_unique_child)) { error = SET_ERROR(EIO); goto out; } } /* * Set each is_good_child to a randomly-selected child which * is known to contain validated data. */ error = vdev_indirect_splits_enumerate_randomly(iv, zio); if (error) goto out; /* * Damage all but the known good copy by zeroing it. This will * result in two or less unique copies per indirect_child_t. * Both may need to be checked in order to reconstruct the block. * Set iv->iv_attempts_max such that all unique combinations will * enumerated, but limit the damage to at most 12 indirect splits. */ iv->iv_attempts_max = 1; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic == is->is_good_child) continue; if (ic->ic_data == NULL) continue; abd_zero(ic->ic_data, abd_get_size(ic->ic_data)); } iv->iv_attempts_max *= 2; if (iv->iv_attempts_max >= (1ULL << 12)) { iv->iv_attempts_max = UINT64_MAX; break; } } out: /* Empty the unique children lists so they can be reconstructed. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic; while ((ic = list_remove_head(&is->is_unique_child)) != NULL) ; is->is_unique_children = 0; } return (error); } /* * This function is called when we have read all copies of the data and need * to try to find a combination of copies that gives us the right checksum. * * If we pointed to any mirror vdevs, this effectively does the job of the * mirror. The mirror vdev code can't do its own job because we don't know * the checksum of each split segment individually. * * We have to try every unique combination of copies of split segments, until * we find one that checksums correctly. Duplicate segment copies are first * identified and latter skipped during reconstruction. This optimization * reduces the search space and ensures that of the remaining combinations * at most one is correct. * * When the total number of combinations is small they can all be checked. * For example, if we have 3 segments in the split, and each points to a * 2-way mirror with unique copies, we will have the following pieces of data: * * | mirror child * split | [0] [1] * ======|===================== * A | data_A_0 data_A_1 * B | data_B_0 data_B_1 * C | data_C_0 data_C_1 * * We will try the following (mirror children)^(number of splits) (2^3=8) * combinations, which is similar to bitwise-little-endian counting in * binary. In general each "digit" corresponds to a split segment, and the * base of each digit is is_children, which can be different for each * digit. * * "low bit" "high bit" * v v * data_A_0 data_B_0 data_C_0 * data_A_1 data_B_0 data_C_0 * data_A_0 data_B_1 data_C_0 * data_A_1 data_B_1 data_C_0 * data_A_0 data_B_0 data_C_1 * data_A_1 data_B_0 data_C_1 * data_A_0 data_B_1 data_C_1 * data_A_1 data_B_1 data_C_1 * * Note that the split segments may be on the same or different top-level * vdevs. In either case, we may need to try lots of combinations (see * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror * has small silent errors on all of its children, we can still reconstruct * the correct data, as long as those errors are at sufficiently-separated * offsets (specifically, separated by the largest block size - default of * 128KB, but up to 16MB). */ static void vdev_indirect_reconstruct_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; boolean_t known_good = B_FALSE; int error; iv->iv_unique_combinations = 1; iv->iv_attempts_max = UINT64_MAX; if (zfs_reconstruct_indirect_combinations_max > 0) iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max; /* * If nonzero, every 1/x blocks will be damaged, in order to validate * reconstruction when there are split segments with damaged copies. * Known_good will be TRUE when reconstruction is known to be possible. */ if (zfs_reconstruct_indirect_damage_fraction != 0 && random_in_range(zfs_reconstruct_indirect_damage_fraction) == 0) known_good = (vdev_indirect_splits_damage(iv, zio) == 0); /* * Determine the unique children for a split segment and add them * to the is_unique_child list. By restricting reconstruction * to these children, only unique combinations will be considered. * This can vastly reduce the search space when there are a large * number of indirect splits. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_unique_children = 0; for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic_i = &is->is_child[i]; if (ic_i->ic_data == NULL || ic_i->ic_duplicate != NULL) continue; for (int j = i + 1; j < is->is_children; j++) { indirect_child_t *ic_j = &is->is_child[j]; if (ic_j->ic_data == NULL || ic_j->ic_duplicate != NULL) continue; if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0) ic_j->ic_duplicate = ic_i; } is->is_unique_children++; list_insert_tail(&is->is_unique_child, ic_i); } /* Reconstruction is impossible, no valid children */ EQUIV(list_is_empty(&is->is_unique_child), is->is_unique_children == 0); if (list_is_empty(&is->is_unique_child)) { zio->io_error = EIO; vdev_indirect_all_checksum_errors(zio); zio_checksum_verified(zio); return; } iv->iv_unique_combinations *= is->is_unique_children; } if (iv->iv_unique_combinations <= iv->iv_attempts_max) error = vdev_indirect_splits_enumerate_all(iv, zio); else error = vdev_indirect_splits_enumerate_randomly(iv, zio); if (error != 0) { /* All attempted combinations failed. */ ASSERT3B(known_good, ==, B_FALSE); zio->io_error = error; vdev_indirect_all_checksum_errors(zio); } else { /* * The checksum has been successfully validated. Issue * repair I/Os to any copies of splits which don't match * the validated version. */ ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio)); vdev_indirect_repair(zio); zio_checksum_verified(zio); } } static void vdev_indirect_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (iv->iv_reconstruct) { /* * We have read all copies of the data (e.g. from mirrors), * either because this was a scrub/resilver, or because the * one-copy read didn't checksum correctly. */ vdev_indirect_reconstruct_io_done(zio); return; } if (!iv->iv_split_block) { /* * This was not a split block, so we passed the BP down, * and the checksum was handled by the (one) child zio. */ return; } zio_bad_cksum_t zbc; int ret = zio_checksum_error(zio, &zbc); /* * Any Direct I/O read that has a checksum error must be treated as * suspicious as the contents of the buffer could be getting * manipulated while the I/O is taking place. The checksum verify error * will be reported to the top-level VDEV. */ if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { zio->io_error = ret; zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; zio_dio_chksum_verify_error_report(zio); ret = 0; } if (ret == 0) { zio_checksum_verified(zio); return; } /* * The checksum didn't match. Read all copies of all splits, and * then we will try to reconstruct. The next time * vdev_indirect_io_done() is called, iv_reconstruct will be set. */ vdev_indirect_read_all(zio); zio_vdev_io_redone(zio); } vdev_ops_t vdev_indirect_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_indirect_open, .vdev_op_close = vdev_indirect_close, .vdev_op_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_indirect_io_start, .vdev_op_io_done = vdev_indirect_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = vdev_indirect_remap, .vdev_op_xlate = NULL, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* leaf vdev */ }; EXPORT_SYMBOL(spa_condense_fini); EXPORT_SYMBOL(spa_start_indirect_condensing_thread); EXPORT_SYMBOL(spa_condense_indirect_start_sync); EXPORT_SYMBOL(spa_condense_init); EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete); EXPORT_SYMBOL(vdev_indirect_mark_obsolete); EXPORT_SYMBOL(vdev_indirect_should_condense); EXPORT_SYMBOL(vdev_indirect_sync_obsolete); EXPORT_SYMBOL(vdev_obsolete_counts_are_precise); EXPORT_SYMBOL(vdev_obsolete_sm_object); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, ZMOD_RW, "Whether to attempt condensing indirect vdev mappings"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_obsolete_pct, UINT, ZMOD_RW, "Minimum obsolete percent of bytes in the mapping " "to attempt condensing"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, U64, ZMOD_RW, "Don't bother condensing if the mapping uses less than this amount of " "memory"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, U64, ZMOD_RW, "Minimum size obsolete spacemap to attempt condensing"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, UINT, ZMOD_RW, "Used by tests to ensure certain actions happen in the middle of a " "condense. A maximum value of 1 should be sufficient."); ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, UINT, ZMOD_RW, "Maximum number of combinations when reconstructing split segments"); -/* END CSTYLED */ diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 65a840bf9728..850569d1a35e 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -1,1061 +1,1059 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include /* * Vdev mirror kstats */ static kstat_t *mirror_ksp = NULL; typedef struct mirror_stats { kstat_named_t vdev_mirror_stat_rotating_linear; kstat_named_t vdev_mirror_stat_rotating_offset; kstat_named_t vdev_mirror_stat_rotating_seek; kstat_named_t vdev_mirror_stat_non_rotating_linear; kstat_named_t vdev_mirror_stat_non_rotating_seek; kstat_named_t vdev_mirror_stat_preferred_found; kstat_named_t vdev_mirror_stat_preferred_not_found; } mirror_stats_t; static mirror_stats_t mirror_stats = { /* New I/O follows directly the last I/O */ { "rotating_linear", KSTAT_DATA_UINT64 }, /* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */ { "rotating_offset", KSTAT_DATA_UINT64 }, /* New I/O requires random seek */ { "rotating_seek", KSTAT_DATA_UINT64 }, /* New I/O follows directly the last I/O (nonrot) */ { "non_rotating_linear", KSTAT_DATA_UINT64 }, /* New I/O requires random seek (nonrot) */ { "non_rotating_seek", KSTAT_DATA_UINT64 }, /* Preferred child vdev found */ { "preferred_found", KSTAT_DATA_UINT64 }, /* Preferred child vdev not found or equal load */ { "preferred_not_found", KSTAT_DATA_UINT64 }, }; #define MIRROR_STAT(stat) (mirror_stats.stat.value.ui64) #define MIRROR_INCR(stat, val) atomic_add_64(&MIRROR_STAT(stat), val) #define MIRROR_BUMP(stat) MIRROR_INCR(stat, 1) void vdev_mirror_stat_init(void) { mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats", "misc", KSTAT_TYPE_NAMED, sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (mirror_ksp != NULL) { mirror_ksp->ks_data = &mirror_stats; kstat_install(mirror_ksp); } } void vdev_mirror_stat_fini(void) { if (mirror_ksp != NULL) { kstat_delete(mirror_ksp); mirror_ksp = NULL; } } /* * Virtual device vector for mirroring. */ typedef struct mirror_child { vdev_t *mc_vd; abd_t *mc_abd; uint64_t mc_offset; int mc_error; int mc_load; uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; uint8_t mc_rebuilding; } mirror_child_t; typedef struct mirror_map { int *mm_preferred; int mm_preferred_cnt; int mm_children; boolean_t mm_resilvering; boolean_t mm_rebuilding; boolean_t mm_root; mirror_child_t mm_child[]; } mirror_map_t; static const int vdev_mirror_shift = 21; /* * The load configuration settings below are tuned by default for * the case where all devices are of the same rotational type. * * If there is a mixture of rotating and non-rotating media, setting * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results * as it will direct more reads to the non-rotating vdevs which are more likely * to have a higher performance. */ /* Rotating media load calculation configuration. */ static int zfs_vdev_mirror_rotating_inc = 0; static int zfs_vdev_mirror_rotating_seek_inc = 5; static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024; /* Non-rotating media load calculation configuration. */ static int zfs_vdev_mirror_non_rotating_inc = 0; static int zfs_vdev_mirror_non_rotating_seek_inc = 1; static inline size_t vdev_mirror_map_size(int children) { return (offsetof(mirror_map_t, mm_child[children]) + sizeof (int) * children); } static inline mirror_map_t * vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root) { mirror_map_t *mm; mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP); mm->mm_children = children; mm->mm_resilvering = resilvering; mm->mm_root = root; mm->mm_preferred = (int *)((uintptr_t)mm + offsetof(mirror_map_t, mm_child[children])); return (mm); } static void vdev_mirror_map_free(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } static const zio_vsd_ops_t vdev_mirror_vsd_ops = { .vsd_free = vdev_mirror_map_free, }; static int vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) { uint64_t last_offset; int64_t offset_diff; int load; /* All DVAs have equal weight at the root. */ if (mm->mm_root) return (INT_MAX); /* * We don't return INT_MAX if the device is resilvering i.e. * vdev_resilver_txg != 0 as when tested performance was slightly * worse overall when resilvering with compared to without. */ /* Fix zio_offset for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) zio_offset += VDEV_LABEL_START_SIZE; /* Standard load based on pending queue length. */ load = vdev_queue_length(vd); last_offset = vdev_queue_last_offset(vd); if (vd->vdev_nonrot) { /* Non-rotating media. */ if (last_offset == zio_offset) { MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear); return (load + zfs_vdev_mirror_non_rotating_inc); } /* * Apply a seek penalty even for non-rotating devices as * sequential I/O's can be aggregated into fewer operations on * the device, thus avoiding unnecessary per-command overhead * and boosting performance. */ MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek); return (load + zfs_vdev_mirror_non_rotating_seek_inc); } /* Rotating media I/O's which directly follow the last I/O. */ if (last_offset == zio_offset) { MIRROR_BUMP(vdev_mirror_stat_rotating_linear); return (load + zfs_vdev_mirror_rotating_inc); } /* * Apply half the seek increment to I/O's within seek offset * of the last I/O issued to this vdev as they should incur less * of a seek increment. */ offset_diff = (int64_t)(last_offset - zio_offset); if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) { MIRROR_BUMP(vdev_mirror_stat_rotating_offset); return (load + (zfs_vdev_mirror_rotating_seek_inc / 2)); } /* Apply the full seek increment to all other I/O's. */ MIRROR_BUMP(vdev_mirror_stat_rotating_seek); return (load + zfs_vdev_mirror_rotating_seek_inc); } static boolean_t vdev_mirror_rebuilding(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg) return (B_TRUE); for (int i = 0; i < vd->vdev_children; i++) { if (vdev_mirror_rebuilding(vd->vdev_child[i])) { return (B_TRUE); } } return (B_FALSE); } /* * Avoid inlining the function to keep vdev_mirror_io_start(), which * is this functions only caller, as small as possible on the stack. */ noinline static mirror_map_t * vdev_mirror_map_init(zio_t *zio) { mirror_map_t *mm = NULL; mirror_child_t *mc; vdev_t *vd = zio->io_vd; int c; if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dva_t dva_copy[SPA_DVAS_PER_BP]; /* * The sequential scrub code sorts and issues all DVAs * of a bp separately. Each of these IOs includes all * original DVA copies so that repairs can be performed * in the event of an error, but we only actually want * to check the first DVA since the others will be * checked by their respective sorted IOs. Only if we * hit an error will we try all DVAs upon retrying. * * Note: This check is safe even if the user switches * from a legacy scrub to a sequential one in the middle * of processing, since scn_is_sorted isn't updated until * all outstanding IOs from the previous scrub pass * complete. */ if ((zio->io_flags & ZIO_FLAG_SCRUB) && !(zio->io_flags & ZIO_FLAG_IO_RETRY) && dsl_scan_scrubbing(spa->spa_dsl_pool) && scn->scn_is_sorted) { c = 1; } else { c = BP_GET_NDVAS(zio->io_bp); } /* * If the pool cannot be written to, then infer that some * DVAs might be invalid or point to vdevs that do not exist. * We skip them. */ if (!spa_writeable(spa)) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); int j = 0; for (int i = 0; i < c; i++) { if (zfs_dva_valid(spa, &dva[i], zio->io_bp)) dva_copy[j++] = dva[i]; } if (j == 0) { zio->io_vsd = NULL; zio->io_error = ENXIO; return (NULL); } if (j < c) { dva = dva_copy; c = j; } } mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); mc->mc_offset = DVA_GET_OFFSET(&dva[c]); if (mc->mc_vd == NULL) { kmem_free(mm, vdev_mirror_map_size( mm->mm_children)); zio->io_vsd = NULL; zio->io_error = ENXIO; return (NULL); } } } else { /* * If we are resilvering, then we should handle scrub reads * differently; we shouldn't issue them to the resilvering * device because it might not have those blocks. * * We are resilvering iff: * 1) We are a replacing vdev (ie our name is "replacing-1" or * "spare-1" or something like that), and * 2) The pool is currently being resilvered. * * We cannot simply check vd->vdev_resilver_txg, because it's * not set in this path. * * Nor can we just check our vdev_ops; there are cases (such as * when a user types "zpool replace pool odev spare_dev" and * spare_dev is in the spare list, or when a spare device is * automatically used to replace a DEGRADED device) when * resilvering is complete but both the original vdev and the * spare vdev remain in the pool. That behavior is intentional. * It helps implement the policy that a spare should be * automatically removed from the pool after the user replaces * the device that originally failed. * * If a spa load is in progress, then spa_dsl_pool may be * uninitialized. But we shouldn't be resilvering during a spa * load anyway. */ boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops) && spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE && dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool); mm = vdev_mirror_map_alloc(vd->vdev_children, replacing, B_FALSE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; if (vdev_mirror_rebuilding(mc->mc_vd)) mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE; } } return (mm); } static int vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { int numerrors = 0; int lasterror = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } vdev_open_children(vd); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error) { lasterror = cvd->vdev_open_error; numerrors++; continue; } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); } for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error) continue; *physical_ashift = vdev_best_ashift(*logical_ashift, *physical_ashift, cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { if (vdev_children_are_offline(vd)) vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE; else vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); } return (0); } static void vdev_mirror_close(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } static void vdev_mirror_child_done(zio_t *zio) { mirror_child_t *mc = zio->io_private; mc->mc_error = zio->io_error; mc->mc_tried = 1; mc->mc_skipped = 0; } /* * Check the other, lower-index DVAs to see if they're on the same * vdev as the child we picked. If they are, use them since they * are likely to have been allocated from the primary metaslab in * use at the time, and hence are more likely to have locality with * single-copy data. */ static int vdev_mirror_dva_select(zio_t *zio, int p) { dva_t *dva = zio->io_bp->blk_dva; mirror_map_t *mm = zio->io_vsd; int preferred; int c; preferred = mm->mm_preferred[p]; for (p--; p >= 0; p--) { c = mm->mm_preferred[p]; if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred])) preferred = c; } return (preferred); } static int vdev_mirror_preferred_child_randomize(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; int p; if (mm->mm_root) { p = random_in_range(mm->mm_preferred_cnt); return (vdev_mirror_dva_select(zio, p)); } /* * To ensure we don't always favour the first matching vdev, * which could lead to wear leveling issues on SSD's, we * use the I/O offset as a pseudo random seed into the vdevs * which have the lowest load. */ p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt; return (mm->mm_preferred[p]); } static boolean_t vdev_mirror_child_readable(mirror_child_t *mc) { vdev_t *vd = mc->mc_vd; if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) return (vdev_draid_readable(vd, mc->mc_offset)); else return (vdev_readable(vd)); } static boolean_t vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size) { vdev_t *vd = mc->mc_vd; if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) return (vdev_draid_missing(vd, mc->mc_offset, txg, size)); else return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); } /* * Try to find a vdev whose DTL doesn't contain the block we want to read * preferring vdevs based on determined load. If we can't, try the read on * any vdev we haven't already tried. * * Distributed spares are an exception to the above load rule. They are * always preferred in order to detect gaps in the distributed spare which * are created when another disk in the dRAID fails. In order to restore * redundancy those gaps must be read to trigger the required repair IO. */ static int vdev_mirror_child_select(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; uint64_t txg = zio->io_txg; int c, lowest_load; ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg); lowest_load = INT_MAX; mm->mm_preferred_cnt = 0; for (c = 0; c < mm->mm_children; c++) { mirror_child_t *mc; mc = &mm->mm_child[c]; if (mc->mc_tried || mc->mc_skipped) continue; if (mc->mc_vd == NULL || !vdev_mirror_child_readable(mc)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } if (vdev_mirror_child_missing(mc, txg, 1)) { mc->mc_error = SET_ERROR(ESTALE); mc->mc_skipped = 1; mc->mc_speculative = 1; continue; } if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) { mm->mm_preferred[0] = c; mm->mm_preferred_cnt = 1; break; } mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); if (mc->mc_load > lowest_load) continue; if (mc->mc_load < lowest_load) { lowest_load = mc->mc_load; mm->mm_preferred_cnt = 0; } mm->mm_preferred[mm->mm_preferred_cnt] = c; mm->mm_preferred_cnt++; } if (mm->mm_preferred_cnt == 1) { MIRROR_BUMP(vdev_mirror_stat_preferred_found); return (mm->mm_preferred[0]); } if (mm->mm_preferred_cnt > 1) { MIRROR_BUMP(vdev_mirror_stat_preferred_not_found); return (vdev_mirror_preferred_child_randomize(zio)); } /* * Every device is either missing or has this txg in its DTL. * Look for any child we haven't already tried before giving up. */ for (c = 0; c < mm->mm_children; c++) { if (!mm->mm_child[c].mc_tried) return (c); } /* * Every child failed. There's no place left to look. */ return (-1); } static void vdev_mirror_io_start(zio_t *zio) { mirror_map_t *mm; mirror_child_t *mc; int c, children; mm = vdev_mirror_map_init(zio); zio->io_vsd = mm; zio->io_vsd_ops = &vdev_mirror_vsd_ops; if (mm == NULL) { ASSERT(!spa_trust_config(zio->io_spa)); ASSERT(zio->io_type == ZIO_TYPE_READ); zio_execute(zio); return; } if (zio->io_type == ZIO_TYPE_READ) { if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) { /* * For scrubbing reads we need to issue reads to all * children. One child can reuse parent buffer, but * for others we have to allocate separate ones to * verify checksums if io_bp is non-NULL, or compare * them in vdev_mirror_io_done() otherwise. */ boolean_t first = B_TRUE; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; /* Don't issue ZIOs to offline children */ if (!vdev_mirror_child_readable(mc)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; mc->mc_skipped = 1; continue; } mc->mc_abd = first ? zio->io_abd : abd_alloc_sametype(zio->io_abd, zio->io_size); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, mc->mc_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); first = B_FALSE; } zio_execute(zio); return; } /* * For normal reads just pick one child. */ c = vdev_mirror_child_select(zio); children = (c >= 0); } else { ASSERT(zio->io_type == ZIO_TYPE_WRITE); /* * Writes go to all children. */ c = 0; children = mm->mm_children; } while (children--) { mc = &mm->mm_child[c]; c++; /* * When sequentially resilvering only issue write repair * IOs to the vdev which is being rebuilt since performance * is limited by the slowest child. This is an issue for * faster replacement devices such as distributed spares. */ if ((zio->io_priority == ZIO_PRIORITY_REBUILD) && (zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_rebuilding && !mc->mc_rebuilding) { continue; } zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); } zio_execute(zio); } static int vdev_mirror_worst_error(mirror_map_t *mm) { int error[2] = { 0, 0 }; for (int c = 0; c < mm->mm_children; c++) { mirror_child_t *mc = &mm->mm_child[c]; int s = mc->mc_speculative; error[s] = zio_worst_error(error[s], mc->mc_error); } return (error[0] ? error[0] : error[1]); } static void vdev_mirror_io_done(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; mirror_child_t *mc; int c; int good_copies = 0; int unexpected_errors = 0; int last_good_copy = -1; if (mm == NULL) return; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; if (mc->mc_error) { if (!mc->mc_skipped) unexpected_errors++; } else if (mc->mc_tried) { last_good_copy = c; good_copies++; } } if (zio->io_type == ZIO_TYPE_WRITE) { /* * XXX -- for now, treat partial writes as success. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ if (good_copies != mm->mm_children) { /* * Always require at least one good copy. * * For ditto blocks (io_vd == NULL), require * all copies to be good. * * XXX -- for replacing vdevs, there's no great answer. * If the old device is really dead, we may not even * be able to access it -- so we only want to * require good writes to the new device. But if * the new device turns out to be flaky, we want * to be able to detach it -- which requires all * writes to the old device to have succeeded. */ if (good_copies == 0 || zio->io_vd == NULL) zio->io_error = vdev_mirror_worst_error(mm); } return; } ASSERT(zio->io_type == ZIO_TYPE_READ); /* * Any Direct I/O read that has a checksum error must be treated as * suspicious as the contents of the buffer could be getting * manipulated while the I/O is taking place. The checksum verify error * will be reported to the top-level Mirror VDEV. * * There will be no attampt at reading any additional data copies. If * the buffer is still being manipulated while attempting to read from * another child, there exists a possibly that the checksum could be * verified as valid. However, the buffer contents could again get * manipulated after verifying the checksum. This would lead to bad data * being written out during self healing. */ if ((zio->io_flags & ZIO_FLAG_DIO_READ) && (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { zio_dio_chksum_verify_error_report(zio); zio->io_error = vdev_mirror_worst_error(mm); ASSERT3U(zio->io_error, ==, ECKSUM); return; } /* * If we don't have a good copy yet, keep trying other children. */ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { ASSERT(c >= 0 && c < mm->mm_children); mc = &mm->mm_child[c]; zio_vdev_io_redone(zio); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, ZIO_TYPE_READ, zio->io_priority, 0, vdev_mirror_child_done, mc)); return; } if (zio->io_flags & ZIO_FLAG_SCRUB && !mm->mm_resilvering) { abd_t *best_abd = NULL; if (last_good_copy >= 0) best_abd = mm->mm_child[last_good_copy].mc_abd; /* * If we're scrubbing but don't have a BP available (because * this vdev is under a raidz or draid vdev) then the best we * can do is compare all of the copies read. If they're not * identical then return a checksum error and the most likely * correct data. The raidz code will issue a repair I/O if * possible. */ if (zio->io_bp == NULL) { ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops || zio->io_vd->vdev_ops == &vdev_spare_ops); abd_t *pref_abd = NULL; for (c = 0; c < last_good_copy; c++) { mc = &mm->mm_child[c]; if (mc->mc_error || !mc->mc_tried) continue; if (abd_cmp(mc->mc_abd, best_abd) != 0) zio->io_error = SET_ERROR(ECKSUM); /* * The distributed spare is always prefered * by vdev_mirror_child_select() so it's * considered to be the best candidate. */ if (pref_abd == NULL && mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) pref_abd = mc->mc_abd; /* * In the absence of a preferred copy, use * the parent pointer to avoid a memory copy. */ if (mc->mc_abd == zio->io_abd) best_abd = mc->mc_abd; } if (pref_abd) best_abd = pref_abd; } else { /* * If we have a BP available, then checksums are * already verified and we just need a buffer * with valid data, preferring parent one to * avoid a memory copy. */ for (c = 0; c < last_good_copy; c++) { mc = &mm->mm_child[c]; if (mc->mc_error || !mc->mc_tried) continue; if (mc->mc_abd == zio->io_abd) { best_abd = mc->mc_abd; break; } } } if (best_abd && best_abd != zio->io_abd) abd_copy(zio->io_abd, best_abd, zio->io_size); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; if (mc->mc_abd != zio->io_abd) abd_free(mc->mc_abd); mc->mc_abd = NULL; } } if (good_copies == 0) { zio->io_error = vdev_mirror_worst_error(mm); ASSERT(zio->io_error != 0); } if (good_copies && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER) || ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) { /* * Use the good data we have in hand to repair damaged children. */ for (c = 0; c < mm->mm_children; c++) { /* * Don't rewrite known good children. * Not only is it unnecessary, it could * actually be harmful: if the system lost * power while rewriting the only good copy, * there would be no good copies left! */ mc = &mm->mm_child[c]; if (mc->mc_error == 0) { vdev_ops_t *ops = mc->mc_vd->vdev_ops; if (mc->mc_tried) continue; /* * We didn't try this child. We need to * repair it if: * 1. it's a scrub (in which case we have * tried everything that was healthy) * - or - * 2. it's an indirect or distributed spare * vdev (in which case it could point to any * other vdev, which might have a bad DTL) * - or - * 3. the DTL indicates that this data is * missing from this vdev */ if (!(zio->io_flags & ZIO_FLAG_SCRUB) && ops != &vdev_indirect_ops && ops != &vdev_draid_spare_ops && !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; mc->mc_error = SET_ERROR(ESTALE); } zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, ZIO_TYPE_WRITE, zio->io_priority == ZIO_PRIORITY_REBUILD ? ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } static void vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) { if (faulted == vd->vdev_children) { if (vdev_children_are_offline(vd)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE, VDEV_AUX_CHILDREN_OFFLINE); } else { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); } } else if (degraded + faulted != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); } else { vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } } /* * Return the maximum asize for a rebuild zio in the provided range. */ static uint64_t vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, uint64_t max_segment) { (void) start; uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE); return (MIN(asize, vdev_psize_to_asize(vd, psize))); } vdev_ops_t vdev_mirror_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_replacing_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_spare_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW, "Rotating media load increment for non-seeking I/Os"); ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, ZMOD_RW, "Rotating media load increment for seeking I/Os"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, ZMOD_RW, "Offset in bytes from the last I/O which triggers " "a reduced rotating media seek increment"); -/* END CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, ZMOD_RW, "Non-rotating media load increment for non-seeking I/Os"); ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT, ZMOD_RW, "Non-rotating media load increment for seeking I/Os"); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 045395549577..e4487c485075 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1,5124 +1,5122 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_raidz_io_verify() */ #endif /* * Virtual device vector for RAID-Z. * * This vdev supports single, double, and triple parity. For single parity, * we use a simple XOR of all the data columns. For double or triple parity, * we use a special case of Reed-Solomon coding. This extends the * technique described in "The mathematics of RAID-6" by H. Peter Anvin by * drawing on the system described in "A Tutorial on Reed-Solomon Coding for * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the * former is also based. The latter is designed to provide higher performance * for writes. * * Note that the Plank paper claimed to support arbitrary N+M, but was then * amended six years later identifying a critical flaw that invalidates its * claims. Nevertheless, the technique can be adapted to work for up to * triple parity. For additional parity, the amendment "Note: Correction to * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding * is viable, but the additional complexity means that write performance will * suffer. * * All of the methods above operate on a Galois field, defined over the * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements * can be expressed with a single byte. Briefly, the operations on the * field are defined as follows: * * o addition (+) is represented by a bitwise XOR * o subtraction (-) is therefore identical to addition: A + B = A - B * o multiplication of A by 2 is defined by the following bitwise expression: * * (A * 2)_7 = A_6 * (A * 2)_6 = A_5 * (A * 2)_5 = A_4 * (A * 2)_4 = A_3 + A_7 * (A * 2)_3 = A_2 + A_7 * (A * 2)_2 = A_1 + A_7 * (A * 2)_1 = A_0 * (A * 2)_0 = A_7 * * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). * As an aside, this multiplication is derived from the error correcting * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. * * Observe that any number in the field (except for 0) can be expressed as a * power of 2 -- a generator for the field. We store a table of the powers of * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather * than field addition). The inverse of a field element A (A^-1) is therefore * A ^ (255 - 1) = A^254. * * The up-to-three parity columns, P, Q, R over several data columns, * D_0, ... D_n-1, can be expressed by field operations: * * P = D_0 + D_1 + ... + D_n-2 + D_n-1 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 * * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial * XOR operation, and 2 and 4 can be computed quickly and generate linearly- * independent coefficients. (There are no additional coefficients that have * this property which is why the uncorrected Plank method breaks down.) * * See the reconstruction code below for how P, Q and R can used individually * or in concert to recover missing data columns. */ #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 #define VDEV_RAIDZ_R 2 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) /* * We provide a mechanism to perform the field multiplication operation on a * 64-bit value all at once rather than a byte at a time. This works by * creating a mask from the top bit in each byte and using that to * conditionally apply the XOR of 0x1d. */ #define VDEV_RAIDZ_64MUL_2(x, mask) \ { \ (mask) = (x) & 0x8080808080808080ULL; \ (mask) = ((mask) << 1) - ((mask) >> 7); \ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ } #define VDEV_RAIDZ_64MUL_4(x, mask) \ { \ VDEV_RAIDZ_64MUL_2((x), mask); \ VDEV_RAIDZ_64MUL_2((x), mask); \ } /* * Big Theory Statement for how a RAIDZ VDEV is expanded * * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs * that have been previously expanded can be expanded again. * * The RAIDZ VDEV must be healthy (must be able to write to all the drives in * the VDEV) when an expansion starts. And the expansion will pause if any * disk in the VDEV fails, and resume once the VDEV is healthy again. All other * operations on the pool can continue while an expansion is in progress (e.g. * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim, * and zpool initialize which can't be run during an expansion. Following a * reboot or export/import, the expansion resumes where it left off. * * == Reflowing the Data == * * The expansion involves reflowing (copying) the data from the current set * of disks to spread it across the new set which now has one more disk. This * reflow operation is similar to reflowing text when the column width of a * text editor window is expanded. The text doesn’t change but the location of * the text changes to accommodate the new width. An example reflow result for * a 4-wide RAIDZ1 to a 5-wide is shown below. * * Reflow End State * Each letter indicates a parity group (logical stripe) * * Before expansion After Expansion * D1 D2 D3 D4 D1 D2 D3 D4 D5 * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | A | A | A | A | | A | A | A | A | B | * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | B | B | C | C | | B | C | C | C | C | * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | C | C | D | D | | D | D | E | E | E | * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | E | E | E | E | --> | E | F | F | G | G | * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | F | F | G | G | | G | G | H | H | H | * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | G | G | H | H | | H | I | I | J | J | * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| * +------+------+------+------+ +------+------+------+------+------+ * | | | | | | | | | | | * | H | H | I | I | | J | J | | | K | * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| * +------+------+------+------+ +------+------+------+------+------+ * * This reflow approach has several advantages. There is no need to read or * modify the block pointers or recompute any block checksums. The reflow * doesn’t need to know where the parity sectors reside. We can read and write * data sequentially and the copy can occur in a background thread in open * context. The design also allows for fast discovery of what data to copy. * * The VDEV metaslabs are processed, one at a time, to copy the block data to * have it flow across all the disks. The metaslab is disabled for allocations * during the copy. As an optimization, we only copy the allocated data which * can be determined by looking at the metaslab range tree. During the copy we * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still * need to be able to survive losing parity count disks). This means we * cannot overwrite data during the reflow that would be needed if a disk is * lost. * * After the reflow completes, all newly-written blocks will have the new * layout, i.e., they will have the parity to data ratio implied by the new * number of disks in the RAIDZ group. Even though the reflow copies all of * the allocated space (data and parity), it is only rearranged, not changed. * * This act of reflowing the data has a few implications about blocks * that were written before the reflow completes: * * - Old blocks will still use the same amount of space (i.e., they will have * the parity to data ratio implied by the old number of disks in the RAIDZ * group). * - Reading old blocks will be slightly slower than before the reflow, for * two reasons. First, we will have to read from all disks in the RAIDZ * VDEV, rather than being able to skip the children that contain only * parity of this block (because the data of a single block is now spread * out across all the disks). Second, in most cases there will be an extra * bcopy, needed to rearrange the data back to its original layout in memory. * * == Scratch Area == * * As we copy the block data, we can only progress to the point that writes * will not overlap with blocks whose progress has not yet been recorded on * disk. Since partially-copied rows are always read from the old location, * we need to stop one row before the sector-wise overlap, to prevent any * row-wise overlap. For example, in the diagram above, when we reflow sector * B6 it will overwite the original location for B5. * * To get around this, a scratch space is used so that we can start copying * without risking data loss by overlapping the row. As an added benefit, it * improves performance at the beginning of the reflow, but that small perf * boost wouldn't be worth the complexity on its own. * * Ideally we want to copy at least 2 * (new_width)^2 so that we have a * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice * the widths will likely be single digits so we can get a substantial chuck * size using only a few MB of scratch per disk. * * The scratch area is persisted to disk which holds a large amount of reflowed * state. We can always read the partially written stripes when a disk fails or * the copy is interrupted (crash) during the initial copying phase and also * get past a small chunk size restriction. At a minimum, the scratch space * must be large enough to get us to the point that one row does not overlap * itself when moved (i.e new_width^2). But going larger is even better. We * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels * as our scratch space to handle overwriting the initial part of the VDEV. * * 0 256K 512K 4M * +------+------+-----------------------+----------------------------- * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ... * | L0 | L1 | Reserved | (Metaslabs) * +------+------+-----------------------+------------------------------- * Scratch Area * * == Reflow Progress Updates == * After the initial scratch-based reflow, the expansion process works * similarly to device removal. We create a new open context thread which * reflows the data, and periodically kicks off sync tasks to update logical * state. In this case, state is the committed progress (offset of next data * to copy). We need to persist the completed offset on disk, so that if we * crash we know which format each VDEV offset is in. * * == Time Dependent Geometry == * * In non-expanded RAIDZ, blocks are read from disk in a column by column * fashion. For a multi-row block, the second sector is in the first column * not in the second column. This allows us to issue full reads for each * column directly into the request buffer. The block data is thus laid out * sequentially in a column-by-column fashion. * * For example, in the before expansion diagram above, one logical block might * be sectors G19-H26. The parity is in G19,H23; and the data is in * G20,H24,G21,H25,G22,H26. * * After a block is reflowed, the sectors that were all in the original column * data can now reside in different columns. When reading from an expanded * VDEV, we need to know the logical stripe width for each block so we can * reconstitute the block’s data after the reads are completed. Likewise, * when we perform the combinatorial reconstruction we need to know the * original width so we can retry combinations from the past layouts. * * Time dependent geometry is what we call having blocks with different layouts * (stripe widths) in the same VDEV. This time-dependent geometry uses the * block’s birth time (+ the time expansion ended) to establish the correct * width for a given block. After an expansion completes, we record the time * for blocks written with a particular width (geometry). * * == On Disk Format Changes == * * New pool feature flag, 'raidz_expansion' whose reference count is the number * of RAIDZ VDEVs that have been expanded. * * The blocks on expanded RAIDZ VDEV can have different logical stripe widths. * * Since the uberblock can point to arbitrary blocks, which might be on the * expanding RAIDZ, and might or might not have been expanded. We need to know * which way a block is laid out before reading it. This info is the next * offset that needs to be reflowed and we persist that in the uberblock, in * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label. * After the expansion is complete, we then use the raidz_expand_txgs array * (see below) to determine how to read a block and the ub_raidz_reflow_info * field no longer required. * * The uberblock's ub_raidz_reflow_info field also holds the scratch space * state (i.e., active or not) which is also required before reading a block * during the initial phase of reflowing the data. * * The top-level RAIDZ VDEV has two new entries in the nvlist: * * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here * and used after the expansion is complete to * determine how to read a raidz block * 'raidz_expanding' boolean: present during reflow and removed after completion * used during a spa import to resume an unfinished * expansion * * And finally the VDEVs top zap adds the following informational entries: * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED */ /* * For testing only: pause the raidz expansion after reflowing this amount. * (accessed by ZTS and ztest) */ #ifdef _KERNEL static #endif /* _KERNEL */ unsigned long raidz_expand_max_reflow_bytes = 0; /* * For testing only: pause the raidz expansion at a certain point. */ uint_t raidz_expand_pause_point = 0; /* * Maximum amount of copy io's outstanding at once. */ #ifdef _ILP32 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE; #else static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; #endif /* * Apply raidz map abds aggregation if the number of rows in the map is equal * or greater than the value below. */ static unsigned long raidz_io_aggregate_rows = 4; /* * Automatically start a pool scrub when a RAIDZ expansion completes in * order to verify the checksums of all blocks which have been copied * during the expansion. Automatic scrubbing is enabled by default and * is strongly recommended. */ static int zfs_scrub_after_expand = 1; static void vdev_raidz_row_free(raidz_row_t *rr) { for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size != 0) abd_free(rc->rc_abd); if (rc->rc_orig_data != NULL) abd_free(rc->rc_orig_data); } if (rr->rr_abd_empty != NULL) abd_free(rr->rr_abd_empty); kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); } void vdev_raidz_map_free(raidz_map_t *rm) { for (int i = 0; i < rm->rm_nrows; i++) vdev_raidz_row_free(rm->rm_row[i]); if (rm->rm_nphys_cols) { for (int i = 0; i < rm->rm_nphys_cols; i++) { if (rm->rm_phys_col[i].rc_abd != NULL) abd_free(rm->rm_phys_col[i].rc_abd); } kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * rm->rm_nphys_cols); } ASSERT3P(rm->rm_lr, ==, NULL); kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } static void vdev_raidz_map_free_vsd(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; vdev_raidz_map_free(rm); } static int vdev_raidz_reflow_compare(const void *x1, const void *x2) { const reflow_node_t *l = x1; const reflow_node_t *r = x2; return (TREE_CMP(l->re_txg, r->re_txg)); } const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, }; raidz_row_t * vdev_raidz_row_alloc(int cols, zio_t *zio) { raidz_row_t *rr = kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); rr->rr_cols = cols; rr->rr_scols = cols; for (int c = 0; c < cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; rc->rc_shadow_devidx = INT_MAX; rc->rc_shadow_offset = UINT64_MAX; /* * We can not allow self healing to take place for Direct I/O * reads. There is nothing that stops the buffer contents from * being manipulated while the I/O is in flight. It is possible * that the checksum could be verified on the buffer and then * the contents of that buffer are manipulated afterwards. This * could lead to bad data being written out during self * healing. */ if (!(zio->io_flags & ZIO_FLAG_DIO_READ)) rc->rc_allow_repair = 1; } return (rr); } static void vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) { int c; int nwrapped = 0; uint64_t off = 0; raidz_row_t *rr = rm->rm_row[0]; ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(rm->rm_nrows, ==, 1); /* * Pad any parity columns with additional space to account for skip * sectors. */ if (rm->rm_skipstart < rr->rr_firstdatacol) { ASSERT0(rm->rm_skipstart); nwrapped = rm->rm_nskip; } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) { nwrapped = (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols; } /* * Optional single skip sectors (rc_size == 0) will be handled in * vdev_raidz_io_start_write(). */ int skipped = rr->rr_scols - rr->rr_cols; /* Allocate buffers for the parity columns */ for (c = 0; c < rr->rr_firstdatacol; c++) { raidz_col_t *rc = &rr->rr_col[c]; /* * Parity columns will pad out a linear ABD to account for * the skip sector. A linear ABD is used here because * parity calculations use the ABD buffer directly to calculate * parity. This avoids doing a memcpy back to the ABD after the * parity has been calculated. By issuing the parity column * with the skip sector we can reduce contention on the child * VDEV queue locks (vq_lock). */ if (c < nwrapped) { rc->rc_abd = abd_alloc_linear( rc->rc_size + (1ULL << ashift), B_FALSE); abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift); skipped++; } else { rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); } } for (off = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct, zio->io_abd, off, rc->rc_size); /* * Generate I/O for skip sectors to improve aggregation * continuity. We will use gang ABD's to reduce contention * on the child VDEV queue locks (vq_lock) by issuing * a single I/O that contains the data and skip sector. * * It is important to make sure that rc_size is not updated * even though we are adding a skip sector to the ABD. When * calculating the parity in vdev_raidz_generate_parity_row() * the rc_size is used to iterate through the ABD's. We can * not have zero'd out skip sectors used for calculating * parity for raidz, because those same sectors are not used * during reconstruction. */ if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) { rc->rc_abd = abd_alloc_gang(); abd_gang_add(rc->rc_abd, abd, B_TRUE); abd_gang_add(rc->rc_abd, abd_get_zeros(1ULL << ashift), B_TRUE); skipped++; } else { rc->rc_abd = abd; } off += rc->rc_size; } ASSERT3U(off, ==, zio->io_size); ASSERT3S(skipped, ==, rm->rm_nskip); } static void vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm) { int c; raidz_row_t *rr = rm->rm_row[0]; ASSERT3U(rm->rm_nrows, ==, 1); /* Allocate buffers for the parity columns */ for (c = 0; c < rr->rr_firstdatacol; c++) rr->rr_col[c].rc_abd = abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); for (uint64_t off = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, zio->io_abd, off, rc->rc_size); off += rc->rc_size; } } /* * Divides the IO evenly across all child vdevs; usually, dcols is * the number of children in the target vdev. * * Avoid inlining the function to keep vdev_raidz_io_start(), which * is this functions only caller, as small as possible on the stack. */ noinline raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t nparity) { raidz_row_t *rr; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = zio->io_offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = zio->io_size >> ashift; /* The first column for this stripe. */ uint64_t f = b % dcols; /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << ashift; uint64_t acols, scols; raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); rm->rm_nrows = 1; /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. */ uint64_t q = s / (dcols - nparity); /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs. */ uint64_t r = s - q * (dcols - nparity); /* The number of "big columns" - those which contain remainder data. */ uint64_t bc = (r == 0 ? 0 : r + nparity); /* * The total number of data and parity sectors associated with * this I/O. */ uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* * acols: The columns that will be accessed. * scols: The columns that will be accessed or skipped. */ if (q == 0) { /* Our I/O request doesn't span all child vdevs. */ acols = bc; scols = MIN(dcols, roundup(bc, nparity + 1)); } else { acols = dcols; scols = dcols; } ASSERT3U(acols, <=, scols); rr = vdev_raidz_row_alloc(scols, zio); rm->rm_row[0] = rr; rr->rr_cols = acols; rr->rr_bigcols = bc; rr->rr_firstdatacol = nparity; #ifdef ZFS_DEBUG rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; #endif uint64_t asize = 0; for (uint64_t c = 0; c < scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; uint64_t col = f + c; uint64_t coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } rc->rc_devidx = col; rc->rc_offset = coff; if (c >= acols) rc->rc_size = 0; else if (c < bc) rc->rc_size = (q + 1) << ashift; else rc->rc_size = q << ashift; asize += rc->rc_size; } ASSERT3U(asize, ==, tot << ashift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; rm->rm_skipstart = bc; /* * If all data stored spans all columns, there's a danger that parity * will always be on the same device and, since parity isn't read * during normal operation, that device's I/O bandwidth won't be * used effectively. We therefore switch the parity every 1MB. * * ... at least that was, ostensibly, the theory. As a practical * matter unless we juggle the parity between all devices evenly, we * won't see any benefit. Further, occasional writes that aren't a * multiple of the LCM of the number of children and the minimum * stripe width are sufficient to avoid pessimal behavior. * Unfortunately, this decision created an implicit on-disk format * requirement that we need to support for all eternity, but only * for single-parity RAID-Z. * * If we intend to skip a sector in the zeroth column for padding * we must make sure to note this swap. We will never intend to * skip the first column since at least one data and one parity * column must appear in each row. */ ASSERT(rr->rr_cols >= 2); ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { uint64_t devidx = rr->rr_col[0].rc_devidx; o = rr->rr_col[0].rc_offset; rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[1].rc_devidx = devidx; rr->rr_col[1].rc_offset = o; if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } if (zio->io_type == ZIO_TYPE_WRITE) { vdev_raidz_map_alloc_write(zio, rm, ashift); } else { vdev_raidz_map_alloc_read(zio, rm); } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); return (rm); } /* * Everything before reflow_offset_synced should have been moved to the new * location (read and write completed). However, this may not yet be reflected * in the on-disk format (e.g. raidz_reflow_sync() has been called but the * uberblock has not yet been written). If reflow is not in progress, * reflow_offset_synced should be UINT64_MAX. For each row, if the row is * entirely before reflow_offset_synced, it will come from the new location. * Otherwise this row will come from the old location. Therefore, rows that * straddle the reflow_offset_synced will come from the old location. * * For writes, reflow_offset_next is the next offset to copy. If a sector has * been copied, but not yet reflected in the on-disk progress * (reflow_offset_synced), it will also be written to the new (already copied) * offset. */ noinline raidz_map_t * vdev_raidz_map_alloc_expanded(zio_t *zio, uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, uint64_t nparity, uint64_t reflow_offset_synced, uint64_t reflow_offset_next, boolean_t use_scratch) { abd_t *abd = zio->io_abd; uint64_t offset = zio->io_offset; uint64_t size = zio->io_size; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = size >> ashift; /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. * AKA "full rows" */ uint64_t q = s / (logical_cols - nparity); /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs. */ uint64_t r = s - q * (logical_cols - nparity); /* The number of "big columns" - those which contain remainder data. */ uint64_t bc = (r == 0 ? 0 : r + nparity); /* * The total number of data and parity sectors associated with * this I/O. */ uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* How many rows contain data (not skip) */ uint64_t rows = howmany(tot, logical_cols); int cols = MIN(tot, logical_cols); raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), KM_SLEEP); rm->rm_nrows = rows; rm->rm_nskip = roundup(tot, nparity + 1) - tot; rm->rm_skipstart = bc; uint64_t asize = 0; for (uint64_t row = 0; row < rows; row++) { boolean_t row_use_scratch = B_FALSE; raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio); rm->rm_row[row] = rr; /* The starting RAIDZ (parent) vdev sector of the row. */ uint64_t b = (offset >> ashift) + row * logical_cols; /* * If we are in the middle of a reflow, and the copying has * not yet completed for any part of this row, then use the * old location of this row. Note that reflow_offset_synced * reflects the i/o that's been completed, because it's * updated by a synctask, after zio_wait(spa_txg_zio[]). * This is sufficient for our check, even if that progress * has not yet been recorded to disk (reflected in * spa_ubsync). Also note that we consider the last row to * be "full width" (`cols`-wide rather than `bc`-wide) for * this calculation. This causes a tiny bit of unnecessary * double-writes but is safe and simpler to calculate. */ int row_phys_cols = physical_cols; if (b + cols > reflow_offset_synced >> ashift) row_phys_cols--; else if (use_scratch) row_use_scratch = B_TRUE; /* starting child of this row */ uint64_t child_id = b % row_phys_cols; /* The starting byte offset on each child vdev. */ uint64_t child_offset = (b / row_phys_cols) << ashift; /* * Note, rr_cols is the entire width of the block, even * if this row is shorter. This is needed because parity * generation (for Q and R) needs to know the entire width, * because it treats the short row as though it was * full-width (and the "phantom" sectors were zero-filled). * * Another approach to this would be to set cols shorter * (to just the number of columns that we might do i/o to) * and have another mechanism to tell the parity generation * about the "entire width". Reconstruction (at least * vdev_raidz_reconstruct_general()) would also need to * know about the "entire width". */ rr->rr_firstdatacol = nparity; #ifdef ZFS_DEBUG /* * note: rr_size is PSIZE, not ASIZE */ rr->rr_offset = b << ashift; rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; #endif for (int c = 0; c < rr->rr_cols; c++, child_id++) { if (child_id >= row_phys_cols) { child_id -= row_phys_cols; child_offset += 1ULL << ashift; } raidz_col_t *rc = &rr->rr_col[c]; rc->rc_devidx = child_id; rc->rc_offset = child_offset; /* * Get this from the scratch space if appropriate. * This only happens if we crashed in the middle of * raidz_reflow_scratch_sync() (while it's running, * the rangelock prevents us from doing concurrent * io), and even then only during zpool import or * when the pool is imported readonly. */ if (row_use_scratch) rc->rc_offset -= VDEV_BOOT_SIZE; uint64_t dc = c - rr->rr_firstdatacol; if (c < rr->rr_firstdatacol) { rc->rc_size = 1ULL << ashift; /* * Parity sectors' rc_abd's are set below * after determining if this is an aggregation. */ } else if (row == rows - 1 && bc != 0 && c >= bc) { /* * Past the end of the block (even including * skip sectors). This sector is part of the * map so that we have full rows for p/q parity * generation. */ rc->rc_size = 0; rc->rc_abd = NULL; } else { /* "data column" (col excluding parity) */ uint64_t off; if (c < bc || r == 0) { off = dc * rows + row; } else { off = r * rows + (dc - r) * (rows - 1) + row; } rc->rc_size = 1ULL << ashift; rc->rc_abd = abd_get_offset_struct( &rc->rc_abdstruct, abd, off << ashift, rc->rc_size); } if (rc->rc_size == 0) continue; /* * If any part of this row is in both old and new * locations, the primary location is the old * location. If this sector was already copied to the * new location, we need to also write to the new, * "shadow" location. * * Note, `row_phys_cols != physical_cols` indicates * that the primary location is the old location. * `b+c < reflow_offset_next` indicates that the copy * to the new location has been initiated. We know * that the copy has completed because we have the * rangelock, which is held exclusively while the * copy is in progress. */ if (row_use_scratch || (row_phys_cols != physical_cols && b + c < reflow_offset_next >> ashift)) { rc->rc_shadow_devidx = (b + c) % physical_cols; rc->rc_shadow_offset = ((b + c) / physical_cols) << ashift; if (row_use_scratch) rc->rc_shadow_offset -= VDEV_BOOT_SIZE; } asize += rc->rc_size; } /* * See comment in vdev_raidz_map_alloc() */ if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && (offset & (1ULL << 20))) { ASSERT(rr->rr_cols >= 2); ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); int devidx0 = rr->rr_col[0].rc_devidx; uint64_t offset0 = rr->rr_col[0].rc_offset; int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; uint64_t shadow_offset0 = rr->rr_col[0].rc_shadow_offset; rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[0].rc_shadow_devidx = rr->rr_col[1].rc_shadow_devidx; rr->rr_col[0].rc_shadow_offset = rr->rr_col[1].rc_shadow_offset; rr->rr_col[1].rc_devidx = devidx0; rr->rr_col[1].rc_offset = offset0; rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; rr->rr_col[1].rc_shadow_offset = shadow_offset0; } } ASSERT3U(asize, ==, tot << ashift); /* * Determine if the block is contiguous, in which case we can use * an aggregation. */ if (rows >= raidz_io_aggregate_rows) { rm->rm_nphys_cols = physical_cols; rm->rm_phys_col = kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, KM_SLEEP); /* * Determine the aggregate io's offset and size, and check * that the io is contiguous. */ for (int i = 0; i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; raidz_col_t *prc = &rm->rm_phys_col[rc->rc_devidx]; if (rc->rc_size == 0) continue; if (prc->rc_size == 0) { ASSERT0(prc->rc_offset); prc->rc_offset = rc->rc_offset; } else if (prc->rc_offset + prc->rc_size != rc->rc_offset) { /* * This block is not contiguous and * therefore can't be aggregated. * This is expected to be rare, so * the cost of allocating and then * freeing rm_phys_col is not * significant. */ kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * rm->rm_nphys_cols); rm->rm_phys_col = NULL; rm->rm_nphys_cols = 0; break; } prc->rc_size += rc->rc_size; } } } if (rm->rm_phys_col != NULL) { /* * Allocate aggregate ABD's. */ for (int i = 0; i < rm->rm_nphys_cols; i++) { raidz_col_t *prc = &rm->rm_phys_col[i]; prc->rc_devidx = i; if (prc->rc_size == 0) continue; prc->rc_abd = abd_alloc_linear(rm->rm_phys_col[i].rc_size, B_FALSE); } /* * Point the parity abd's into the aggregate abd's. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_firstdatacol; c++) { raidz_col_t *rc = &rr->rr_col[c]; raidz_col_t *prc = &rm->rm_phys_col[rc->rc_devidx]; rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, prc->rc_abd, rc->rc_offset - prc->rc_offset, rc->rc_size); } } } else { /* * Allocate new abd's for the parity sectors. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_firstdatacol; c++) { raidz_col_t *rc = &rr->rr_col[c]; rc->rc_abd = abd_alloc_linear(rc->rc_size, B_TRUE); } } } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); return (rm); } struct pqr_struct { uint64_t *p; uint64_t *q; uint64_t *r; }; static int vdev_raidz_p_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; int cnt = size / sizeof (src[0]); ASSERT(pqr->p && !pqr->q && !pqr->r); for (int i = 0; i < cnt; i++, src++, pqr->p++) *pqr->p ^= *src; return (0); } static int vdev_raidz_pq_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && !pqr->r); for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; } return (0); } static int vdev_raidz_pqr_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && pqr->r); for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; VDEV_RAIDZ_64MUL_4(*pqr->r, mask); *pqr->r ^= *src; } return (0); } static void vdev_raidz_generate_parity_p(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { abd_t *src = rr->rr_col[c].rc_abd; if (c == rr->rr_firstdatacol) { abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_p_func, &pqr); } } } static void vdev_raidz_generate_parity_pq(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == rr->rr_col[VDEV_RAIDZ_Q].rc_size); for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { abd_t *src = rr->rr_col[c].rc_abd; uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); (void) memcpy(q, p, rr->rr_col[c].rc_size); for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; } } else { struct pqr_struct pqr = { p, q, NULL }; ASSERT(ccnt <= pcnt); (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pq_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ uint64_t mask; for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); } } } } static void vdev_raidz_generate_parity_pqr(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == rr->rr_col[VDEV_RAIDZ_Q].rc_size); ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == rr->rr_col[VDEV_RAIDZ_R].rc_size); for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { abd_t *src = rr->rr_col[c].rc_abd; uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); (void) memcpy(q, p, rr->rr_col[c].rc_size); (void) memcpy(r, p, rr->rr_col[c].rc_size); for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; r[i] = 0; } } else { struct pqr_struct pqr = { p, q, r }; ASSERT(ccnt <= pcnt); (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pqr_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ uint64_t mask; for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); VDEV_RAIDZ_64MUL_4(r[i], mask); } } } } /* * Generate RAID parity in the first virtual columns according to the number of * parity columns available. */ void vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { if (rr->rr_cols == 0) { /* * We are handling this block one row at a time (because * this block has a different logical vs physical width, * due to RAIDZ expansion), and this is a pad-only row, * which has no parity. */ return; } /* Generate using the new math implementation */ if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) return; switch (rr->rr_firstdatacol) { case 1: vdev_raidz_generate_parity_p(rr); break; case 2: vdev_raidz_generate_parity_pq(rr); break; case 3: vdev_raidz_generate_parity_pqr(rr); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration"); } } void vdev_raidz_generate_parity(raidz_map_t *rm) { for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_generate_parity_row(rm, rr); } } static int vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) { (void) private; uint64_t *dst = dbuf; uint64_t *src = sbuf; int cnt = size / sizeof (src[0]); for (int i = 0; i < cnt; i++) { dst[i] ^= src[i]; } return (0); } static int vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, void *private) { (void) private; uint64_t *dst = dbuf; uint64_t *src = sbuf; uint64_t mask; int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++, src++) { VDEV_RAIDZ_64MUL_2(*dst, mask); *dst ^= *src; } return (0); } static int vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) { (void) private; uint64_t *dst = buf; uint64_t mask; int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++) { /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ VDEV_RAIDZ_64MUL_2(*dst, mask); } return (0); } struct reconst_q_struct { uint64_t *q; int exp; }; static int vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) { struct reconst_q_struct *rq = private; uint64_t *dst = buf; int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++, rq->q++) { int j; uint8_t *b; *dst ^= *rq->q; for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { *b = vdev_raidz_exp2(*b, rq->exp); } } return (0); } struct reconst_pq_struct { uint8_t *p; uint8_t *q; uint8_t *pxy; uint8_t *qxy; int aexp; int bexp; }; static int vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) { struct reconst_pq_struct *rpq = private; uint8_t *xd = xbuf; uint8_t *yd = ybuf; for (int i = 0; i < size; i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); *yd = *rpq->p ^ *rpq->pxy ^ *xd; } return (0); } static int vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) { struct reconst_pq_struct *rpq = private; uint8_t *xd = xbuf; for (int i = 0; i < size; i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { /* same operation as vdev_raidz_reconst_pq_func() on xd */ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); } return (0); } static void vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; abd_t *dst, *src; if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); ASSERT3U(ntgts, ==, 1); ASSERT3U(x, >=, rr->rr_firstdatacol); ASSERT3U(x, <, rr->rr_cols); ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; dst = rr->rr_col[x].rc_abd; abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { uint64_t size = MIN(rr->rr_col[x].rc_size, rr->rr_col[c].rc_size); src = rr->rr_col[c].rc_abd; if (c == x) continue; (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_p_func, NULL); } } static void vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; int c, exp; abd_t *dst, *src; if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); ASSERT(ntgts == 1); ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, rr->rr_col[c].rc_size); src = rr->rr_col[c].rc_abd; dst = rr->rr_col[x].rc_abd; if (c == rr->rr_firstdatacol) { abd_copy(dst, src, size); if (rr->rr_col[x].rc_size > size) { abd_zero_off(dst, size, rr->rr_col[x].rc_size - size); } } else { ASSERT3U(size, <=, rr->rr_col[x].rc_size); (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_q_pre_func, NULL); (void) abd_iterate_func(dst, size, rr->rr_col[x].rc_size - size, vdev_raidz_reconst_q_pre_tail_func, NULL); } } src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; dst = rr->rr_col[x].rc_abd; exp = 255 - (rr->rr_cols - 1 - x); struct reconst_q_struct rq = { abd_to_buf(src), exp }; (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, vdev_raidz_reconst_q_post_func, &rq); } static void vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; abd_t *pdata, *qdata; uint64_t xsize, ysize; int x = tgts[0]; int y = tgts[1]; abd_t *xd, *yd; if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rr->rr_firstdatacol); ASSERT(y < rr->rr_cols); ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); /* * Move the parity data aside -- we're going to compute parity as * though columns x and y were full of zeros -- Pxy and Qxy. We want to * reuse the parity generation mechanism without trashing the actual * parity so we make those columns appear to be full of zeros by * setting their lengths to zero. */ pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; xsize = rr->rr_col[x].rc_size; ysize = rr->rr_col[y].rc_size; rr->rr_col[VDEV_RAIDZ_P].rc_abd = abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); rr->rr_col[VDEV_RAIDZ_Q].rc_abd = abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); rr->rr_col[x].rc_size = 0; rr->rr_col[y].rc_size = 0; vdev_raidz_generate_parity_pq(rr); rr->rr_col[x].rc_size = xsize; rr->rr_col[y].rc_size = ysize; p = abd_to_buf(pdata); q = abd_to_buf(qdata); pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); xd = rr->rr_col[x].rc_abd; yd = rr->rr_col[y].rc_abd; /* * We now have: * Pxy = P + D_x + D_y * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y * * We can then solve for D_x: * D_x = A * (P + Pxy) + B * (Q + Qxy) * where * A = 2^(x - y) * (2^(x - y) + 1)^-1 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 * * With D_x in hand, we can easily solve for D_y: * D_y = P + Pxy + D_x */ a = vdev_raidz_pow2[255 + x - y]; b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; tmp = 255 - vdev_raidz_log2[a ^ 1]; aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; ASSERT3U(xsize, >=, ysize); struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; (void) abd_iterate_func2(xd, yd, 0, 0, ysize, vdev_raidz_reconst_pq_func, &rpq); (void) abd_iterate_func(xd, ysize, xsize - ysize, vdev_raidz_reconst_pq_tail_func, &rpq); abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; } /* * In the general case of reconstruction, we must solve the system of linear * equations defined by the coefficients used to generate parity as well as * the contents of the data and parity disks. This can be expressed with * vectors for the original data (D) and the actual data (d) and parity (p) * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): * * __ __ __ __ * | | __ __ | p_0 | * | V | | D_0 | | p_m-1 | * | | x | : | = | d_0 | * | I | | D_n-1 | | : | * | | ~~ ~~ | d_n-1 | * ~~ ~~ ~~ ~~ * * I is simply a square identity matrix of size n, and V is a vandermonde * matrix defined by the coefficients we chose for the various parity columns * (1, 2, 4). Note that these values were chosen both for simplicity, speedy * computation as well as linear separability. * * __ __ __ __ * | 1 .. 1 1 1 | | p_0 | * | 2^n-1 .. 4 2 1 | __ __ | : | * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | * | 1 .. 0 0 0 | | D_1 | | d_0 | * | 0 .. 0 0 0 | x | D_2 | = | d_1 | * | : : : : | | : | | d_2 | * | 0 .. 1 0 0 | | D_n-1 | | : | * | 0 .. 0 1 0 | ~~ ~~ | : | * | 0 .. 0 0 1 | | d_n-1 | * ~~ ~~ ~~ ~~ * * Note that I, V, d, and p are known. To compute D, we must invert the * matrix and use the known data and parity values to reconstruct the unknown * data values. We begin by removing the rows in V|I and d|p that correspond * to failed or missing columns; we then make V|I square (n x n) and d|p * sized n by removing rows corresponding to unused parity from the bottom up * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' * using Gauss-Jordan elimination. In the example below we use m=3 parity * columns, n=8 data columns, with errors in d_1, d_2, and p_1: * __ __ * | 1 1 1 1 1 1 1 1 | * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks * | 19 205 116 29 64 16 4 1 | / / * | 1 0 0 0 0 0 0 0 | / / * | 0 1 0 0 0 0 0 0 | <--' / * (V|I) = | 0 0 1 0 0 0 0 0 | <---' * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 1 1 1 1 1 1 1 | * | 128 64 32 16 8 4 2 1 | * | 19 205 116 29 64 16 4 1 | * | 1 0 0 0 0 0 0 0 | * | 0 1 0 0 0 0 0 0 | * (V|I)' = | 0 0 1 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We * have carefully chosen the seed values 1, 2, and 4 to ensure that this * matrix is not singular. * __ __ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 0 0 1 0 0 0 0 0 | * | 167 100 5 41 159 169 217 208 | * | 166 100 4 40 158 168 216 209 | * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values * of the missing data. * * As is apparent from the example above, the only non-trivial rows in the * inverse matrix correspond to the data disks that we're trying to * reconstruct. Indeed, those are the only rows we need as the others would * only be useful for reconstructing data known or assumed to be valid. For * that reason, we only build the coefficients in the rows that correspond to * targeted columns. */ static void vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, uint8_t **rows) { int i, j; int pow; ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); /* * Fill in the missing rows of interest. */ for (i = 0; i < nmap; i++) { ASSERT3S(0, <=, map[i]); ASSERT3S(map[i], <=, 2); pow = map[i] * n; if (pow > 255) pow -= 255; ASSERT(pow <= 255); for (j = 0; j < n; j++) { pow -= map[i]; if (pow < 0) pow += 255; rows[i][j] = vdev_raidz_pow2[pow]; } } } static void vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **rows, uint8_t **invrows, const uint8_t *used) { int i, j, ii, jj; uint8_t log; /* * Assert that the first nmissing entries from the array of used * columns correspond to parity columns and that subsequent entries * correspond to data columns. */ for (i = 0; i < nmissing; i++) { ASSERT3S(used[i], <, rr->rr_firstdatacol); } for (; i < n; i++) { ASSERT3S(used[i], >=, rr->rr_firstdatacol); } /* * First initialize the storage where we'll compute the inverse rows. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { invrows[i][j] = (i == j) ? 1 : 0; } } /* * Subtract all trivial rows from the rows of consequence. */ for (i = 0; i < nmissing; i++) { for (j = nmissing; j < n; j++) { ASSERT3U(used[j], >=, rr->rr_firstdatacol); jj = used[j] - rr->rr_firstdatacol; ASSERT3S(jj, <, n); invrows[i][j] = rows[i][jj]; rows[i][jj] = 0; } } /* * For each of the rows of interest, we must normalize it and subtract * a multiple of it from the other rows. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < missing[i]; j++) { ASSERT0(rows[i][j]); } ASSERT3U(rows[i][missing[i]], !=, 0); /* * Compute the inverse of the first element and multiply each * element in the row by that value. */ log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; for (j = 0; j < n; j++) { rows[i][j] = vdev_raidz_exp2(rows[i][j], log); invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); } for (ii = 0; ii < nmissing; ii++) { if (i == ii) continue; ASSERT3U(rows[ii][missing[i]], !=, 0); log = vdev_raidz_log2[rows[ii][missing[i]]]; for (j = 0; j < n; j++) { rows[ii][j] ^= vdev_raidz_exp2(rows[i][j], log); invrows[ii][j] ^= vdev_raidz_exp2(invrows[i][j], log); } } } /* * Verify that the data that is left in the rows are properly part of * an identity matrix. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { if (j == missing[i]) { ASSERT3U(rows[i][j], ==, 1); } else { ASSERT0(rows[i][j]); } } } } static void vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **invrows, const uint8_t *used) { int i, j, x, cc, c; uint8_t *src; uint64_t ccount; uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; uint8_t log = 0; uint8_t val; int ll; uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; size_t psize; psize = sizeof (invlog[0][0]) * n * nmissing; p = kmem_alloc(psize, KM_SLEEP); for (pp = p, i = 0; i < nmissing; i++) { invlog[i] = pp; pp += n; } for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { ASSERT3U(invrows[i][j], !=, 0); invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; } } for (i = 0; i < n; i++) { c = used[i]; ASSERT3U(c, <, rr->rr_cols); ccount = rr->rr_col[c].rc_size; ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); if (ccount == 0) continue; src = abd_to_buf(rr->rr_col[c].rc_abd); for (j = 0; j < nmissing; j++) { cc = missing[j] + rr->rr_firstdatacol; ASSERT3U(cc, >=, rr->rr_firstdatacol); ASSERT3U(cc, <, rr->rr_cols); ASSERT3U(cc, !=, c); dcount[j] = rr->rr_col[cc].rc_size; if (dcount[j] != 0) dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); } for (x = 0; x < ccount; x++, src++) { if (*src != 0) log = vdev_raidz_log2[*src]; for (cc = 0; cc < nmissing; cc++) { if (x >= dcount[cc]) continue; if (*src == 0) { val = 0; } else { if ((ll = log + invlog[cc][i]) >= 255) ll -= 255; val = vdev_raidz_pow2[ll]; } if (i == 0) dst[cc][x] = val; else dst[cc][x] ^= val; } } } kmem_free(p, psize); } static void vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { int i, c, t, tt; unsigned int n; unsigned int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; size_t psize; uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; abd_t **bufs = NULL; if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate * temporary linear ABDs if any non-linear ABDs are found. */ for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { ASSERT(rr->rr_col[i].rc_abd != NULL); if (!abd_is_linear(rr->rr_col[i].rc_abd)) { bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), KM_PUSHPAGE); for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *col = &rr->rr_col[c]; bufs[c] = col->rc_abd; if (bufs[c] != NULL) { col->rc_abd = abd_alloc_linear( col->rc_size, B_TRUE); abd_copy(col->rc_abd, bufs[c], col->rc_size); } } break; } } n = rr->rr_cols - rr->rr_firstdatacol; /* * Figure out which data columns are missing. */ nmissing_rows = 0; for (t = 0; t < ntgts; t++) { if (tgts[t] >= rr->rr_firstdatacol) { missing_rows[nmissing_rows++] = tgts[t] - rr->rr_firstdatacol; } } /* * Figure out which parity columns to use to help generate the missing * data columns. */ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ASSERT(tt < ntgts); ASSERT(c < rr->rr_firstdatacol); /* * Skip any targeted parity columns. */ if (c == tgts[tt]) { tt++; continue; } parity_map[i] = c; i++; } psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * nmissing_rows * n + sizeof (used[0]) * n; p = kmem_alloc(psize, KM_SLEEP); for (pp = p, i = 0; i < nmissing_rows; i++) { rows[i] = pp; pp += n; invrows[i] = pp; pp += n; } used = pp; for (i = 0; i < nmissing_rows; i++) { used[i] = parity_map[i]; } for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { if (tt < nmissing_rows && c == missing_rows[tt] + rr->rr_firstdatacol) { tt++; continue; } ASSERT3S(i, <, n); used[i] = c; i++; } /* * Initialize the interesting rows of the matrix. */ vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); /* * Invert the matrix. */ vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, invrows, used); /* * Reconstruct the missing data using the generated matrix. */ vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, invrows, used); kmem_free(p, psize); /* * copy back from temporary linear abds and free them */ if (bufs) { for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *col = &rr->rr_col[c]; if (bufs[c] != NULL) { abd_copy(bufs[c], col->rc_abd, col->rc_size); abd_free(col->rc_abd); } col->rc_abd = bufs[c]; } kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); } } static void vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, const int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; int i, c, ret; int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, (int)rr->rr_missingparity); } nbadparity = rr->rr_firstdatacol; nbaddata = rr->rr_cols - nbadparity; ntgts = 0; for (i = 0, c = 0; c < rr->rr_cols; c++) { if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " "offset=%llx error=%u)", rr, c, (int)rr->rr_col[c].rc_devidx, (long long)rr->rr_col[c].rc_offset, (int)rr->rr_col[c].rc_error); } if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; if (i < nt && c == t[i]) { tgts[ntgts++] = c; i++; } else if (rr->rr_col[c].rc_error != 0) { tgts[ntgts++] = c; } else if (c >= rr->rr_firstdatacol) { nbaddata--; } else { parity_valid[c] = B_TRUE; nbadparity--; } } ASSERT(ntgts >= nt); ASSERT(nbaddata >= 0); ASSERT(nbaddata + nbadparity == ntgts); dt = &tgts[nbadparity]; /* Reconstruct using the new math implementation */ ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); if (ret != RAIDZ_ORIGINAL_IMPL) return; /* * See if we can use any of our optimized reconstruction routines. */ switch (nbaddata) { case 1: if (parity_valid[VDEV_RAIDZ_P]) { vdev_raidz_reconstruct_p(rr, dt, 1); return; } ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_Q]) { vdev_raidz_reconstruct_q(rr, dt, 1); return; } ASSERT(rr->rr_firstdatacol > 2); break; case 2: ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && parity_valid[VDEV_RAIDZ_Q]) { vdev_raidz_reconstruct_pq(rr, dt, 2); return; } ASSERT(rr->rr_firstdatacol > 2); break; } vdev_raidz_reconstruct_general(rr, tgts, ntgts); } static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t nparity = vdrz->vd_nparity; int c; int lasterror = 0; int numerrors = 0; ASSERT(nparity > 0); if (nparity > VDEV_RAIDZ_MAXPARITY || vd->vdev_children < nparity + 1) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } vdev_open_children(vd); for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) { lasterror = cvd->vdev_open_error; numerrors++; continue; } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); } for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) continue; *physical_ashift = vdev_best_ashift(*logical_ashift, *physical_ashift, cvd->vdev_physical_ashift); } if (vd->vdev_rz_expanding) { *asize *= vd->vdev_children - 1; *max_asize *= vd->vdev_children - 1; vd->vdev_min_asize = *asize; } else { *asize *= vd->vdev_children; *max_asize *= vd->vdev_children; } if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); } return (0); } static void vdev_raidz_close(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c] != NULL) vdev_close(vd->vdev_child[c]); } } /* * Return the logical width to use, given the txg in which the allocation * happened. Note that BP_GET_BIRTH() is usually the txg in which the * BP was allocated. Remapped BP's (that were relocated due to device * removal, see remap_blkptr_cb()), will have a more recent physical birth * which reflects when the BP was relocated, but we can ignore these because * they can't be on RAIDZ (device removal doesn't support RAIDZ). */ static uint64_t vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) { reflow_node_t lookup = { .re_txg = txg, }; avl_index_t where; uint64_t width; mutex_enter(&vdrz->vd_expand_lock); reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); if (re != NULL) { width = re->re_logical_width; } else { re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); if (re != NULL) width = re->re_logical_width; else width = vdrz->vd_original_width; } mutex_exit(&vdrz->vd_expand_lock); return (width); } /* * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated * more space due to the lower data-to-parity ratio. In this case it's * important to pass in the correct txg. Note that vdev_gang_header_asize() * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, * regardless of txg. This is assured because for a single data sector, we * allocate P+1 sectors regardless of width ("cols", which is at least P+1). */ static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t cols = vdrz->vd_original_width; uint64_t nparity = vdrz->vd_nparity; cols = vdev_raidz_get_logical_width(vdrz, txg); asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); asize = roundup(asize, nparity + 1) << ashift; #ifdef ZFS_DEBUG uint64_t asize_new = ((psize - 1) >> ashift) + 1; uint64_t ncols_new = vdrz->vd_physical_width; asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / (ncols_new - nparity)); asize_new = roundup(asize_new, nparity + 1) << ashift; VERIFY3U(asize_new, <=, asize); #endif return (asize); } /* * The allocatable space for a raidz vdev is N * sizeof(smallest child) * so each child must provide at least 1/Nth of its asize. */ static uint64_t vdev_raidz_min_asize(vdev_t *vd) { return ((vd->vdev_min_asize + vd->vdev_children - 1) / vd->vdev_children); } void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; ASSERT3P(rc->rc_abd, !=, NULL); rc->rc_error = zio->io_error; rc->rc_tried = 1; rc->rc_skipped = 0; } static void vdev_raidz_shadow_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; rc->rc_shadow_error = zio->io_error; } static void vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) { (void) rm; #ifdef ZFS_DEBUG range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + vdev_raidz_asize(zio->io_vd, rr->rr_size, BP_GET_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); ASSERT(vdev_xlate_is_empty(&remain_rs)); if (vdev_xlate_is_empty(&physical_rs)) { /* * If we are in the middle of expansion, the * physical->logical mapping is changing so vdev_xlate() * can't give us a reliable answer. */ return; } ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* * It would be nice to assert that rs_end is equal * to rc_offset + rc_size but there might be an * optional I/O at the end that is not accounted in * rc_size. */ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); } else { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); } #endif } static void vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; vdev_raidz_generate_parity_row(rm, rr); for (int c = 0; c < rr->rr_scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; /* Verify physical to logical translation */ vdev_raidz_io_verify(zio, rm, rr, c); if (rc->rc_size == 0) continue; ASSERT3U(rc->rc_offset + rc->rc_size, <, cvd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3P(rc->rc_abd, !=, NULL); zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, abd_get_size(rc->rc_abd), zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); if (rc->rc_shadow_devidx != INT_MAX) { vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; ASSERT3U( rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, cvd2->vdev_psize - VDEV_LABEL_END_SIZE); zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, rc->rc_shadow_offset, rc->rc_abd, abd_get_size(rc->rc_abd), zio->io_type, zio->io_priority, 0, vdev_raidz_shadow_child_done, rc)); } } } /* * Generate optional I/Os for skip sectors to improve aggregation contiguity. * This only works for vdev_raidz_map_alloc() (not _expanded()). */ static void raidz_start_skip_writes(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t ashift = vd->vdev_top->vdev_ashift; raidz_map_t *rm = zio->io_vsd; ASSERT3U(rm->rm_nrows, ==, 1); raidz_row_t *rr = rm->rm_row[0]; for (int c = 0; c < rr->rr_scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (rc->rc_size != 0) continue; ASSERT3P(rc->rc_abd, ==, NULL); ASSERT3U(rc->rc_offset, <, cvd->vdev_psize - VDEV_LABEL_END_SIZE); zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, NULL, 1ULL << ashift, zio->io_type, zio->io_priority, ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); } } static void vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) { vdev_t *vd = zio->io_vd; /* * Iterate over the columns in reverse order so that we hit the parity * last -- any errors along the way will force us to read the parity. */ for (int c = rr->rr_cols - 1; c >= 0; c--) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size == 0) continue; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (!vdev_readable(cvd)) { if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; else rr->rr_missingparity++; rc->rc_error = SET_ERROR(ENXIO); rc->rc_tried = 1; /* don't even try */ rc->rc_skipped = 1; continue; } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; else rr->rr_missingparity++; rc->rc_error = SET_ERROR(ESTALE); rc->rc_skipped = 1; continue; } if (forceparity || c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } } } static void vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) { vdev_t *vd = zio->io_vd; for (int i = 0; i < rm->rm_nphys_cols; i++) { raidz_col_t *prc = &rm->rm_phys_col[i]; if (prc->rc_size == 0) continue; ASSERT3U(prc->rc_devidx, ==, i); vdev_t *cvd = vd->vdev_child[i]; if (!vdev_readable(cvd)) { prc->rc_error = SET_ERROR(ENXIO); prc->rc_tried = 1; /* don't even try */ prc->rc_skipped = 1; continue; } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { prc->rc_error = SET_ERROR(ESTALE); prc->rc_skipped = 1; continue; } zio_nowait(zio_vdev_child_io(zio, NULL, cvd, prc->rc_offset, prc->rc_abd, prc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, prc)); } } static void vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) { /* * If there are multiple rows, we will be hitting * all disks, so go ahead and read the parity so * that we are reading in decent size chunks. */ boolean_t forceparity = rm->rm_nrows > 1; if (rm->rm_phys_col) { vdev_raidz_io_start_read_phys_cols(zio, rm); } else { for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_start_read_row(zio, rr, forceparity); } } } /* * Start an IO operation on a RAIDZ VDev * * Outline: * - For write operations: * 1. Generate the parity data * 2. Create child zio write operations to each column's vdev, for both * data and parity. * 3. If the column skips any sectors for padding, create optional dummy * write zio children for those areas to improve aggregation continuity. * - For read operations: * 1. Create child zio read operations to each data column's vdev to read * the range of data required for zio. * 2. If this is a scrub or resilver operation, or if any of the data * vdevs have had errors, then create zio read operations to the parity * columns' VDevs as well. */ static void vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_raidz_t *vdrz = vd->vdev_tsd; raidz_map_t *rm; uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, BP_GET_BIRTH(zio->io_bp)); if (logical_width != vdrz->vd_physical_width) { zfs_locked_range_t *lr = NULL; uint64_t synced_offset = UINT64_MAX; uint64_t next_offset = UINT64_MAX; boolean_t use_scratch = B_FALSE; /* * Note: when the expansion is completing, we set * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) * in a later txg than when we last update spa_ubsync's state * (see the end of spa_raidz_expand_thread()). Therefore we * may see vre_state!=SCANNING before * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected * on disk, but the copying progress has been synced to disk * (and reflected in spa_ubsync). In this case it's fine to * treat the expansion as completed, since if we crash there's * no additional copying to do. */ if (vdrz->vn_vre.vre_state == DSS_SCANNING) { ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, &vdrz->vn_vre); lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, zio->io_offset, zio->io_size, RL_READER); use_scratch = (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == RRSS_SCRATCH_VALID); synced_offset = RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); next_offset = vdrz->vn_vre.vre_offset; /* * If we haven't resumed expanding since importing the * pool, vre_offset won't have been set yet. In * this case the next offset to be copied is the same * as what was synced. */ if (next_offset == UINT64_MAX) { next_offset = synced_offset; } } if (use_scratch) { zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" "%lld next_offset=%lld use_scratch=%u", zio, zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", (long long)zio->io_offset, (long long)synced_offset, (long long)next_offset, use_scratch); } rm = vdev_raidz_map_alloc_expanded(zio, tvd->vdev_ashift, vdrz->vd_physical_width, logical_width, vdrz->vd_nparity, synced_offset, next_offset, use_scratch); rm->rm_lr = lr; } else { rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, logical_width, vdrz->vd_nparity); } rm->rm_original_width = vdrz->vd_original_width; zio->io_vsd = rm; zio->io_vsd_ops = &vdev_raidz_vsd_ops; if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { vdev_raidz_io_start_write(zio, rm->rm_row[i]); } if (logical_width == vdrz->vd_physical_width) { raidz_start_skip_writes(zio); } } else { ASSERT(zio->io_type == ZIO_TYPE_READ); vdev_raidz_io_start_read(zio, rm); } zio_execute(zio); } /* * Report a checksum error for a child of a RAID-Z device. */ void vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && zio->io_priority != ZIO_PRIORITY_REBUILD) { zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); (void) zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, rc->rc_abd, bad_data, &zbc); } } /* * We keep track of whether or not there were any injected errors, so that * any ereports we generate can note it. */ static int raidz_checksum_verify(zio_t *zio) { zio_bad_cksum_t zbc = {0}; raidz_map_t *rm = zio->io_vsd; int ret = zio_checksum_error(zio, &zbc); /* * Any Direct I/O read that has a checksum error must be treated as * suspicious as the contents of the buffer could be getting * manipulated while the I/O is taking place. The checksum verify error * will be reported to the top-level RAIDZ VDEV. */ if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { zio->io_error = ret; zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; zio_dio_chksum_verify_error_report(zio); zio_checksum_verified(zio); return (0); } if (ret != 0 && zbc.zbc_injected != 0) rm->rm_ecksuminjected = 1; return (ret); } /* * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the * data we read. If it doesn't, we fire off a checksum error. Return the * number of such failures. */ static int raidz_parity_verify(zio_t *zio, raidz_row_t *rr) { abd_t *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; blkptr_t *bp = zio->io_bp; enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); if (checksum == ZIO_CHECKSUM_NOPARITY) return (ret); for (c = 0; c < rr->rr_firstdatacol; c++) { rc = &rr->rr_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = rc->rc_abd; ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); } /* * Verify any empty sectors are zero filled to ensure the parity * is calculated correctly even if these non-data sectors are damaged. */ if (rr->rr_nempty && rr->rr_abd_empty != NULL) ret += vdev_draid_map_verify_empty(zio, rr); /* * Regenerates parity even for !tried||rc_error!=0 columns. This * isn't harmful but it does have the side effect of fixing stuff * we didn't realize was necessary (i.e. even if we return 0). */ vdev_raidz_generate_parity_row(rm, rr); for (c = 0; c < rr->rr_firstdatacol; c++) { rc = &rr->rr_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; if (abd_cmp(orig[c], rc->rc_abd) != 0) { zfs_dbgmsg("found error on col=%u devidx=%u off %llx", c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); vdev_raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; } abd_free(orig[c]); } return (ret); } static int vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; for (int c = 0; c < rr->rr_cols; c++) { error = zio_worst_error(error, rr->rr_col[c].rc_error); error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); } return (error); } static void vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error) { if (c < rr->rr_firstdatacol) parity_errors++; else data_errors++; if (!rc->rc_skipped) unexpected_errors++; } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } if (rc->rc_force_repair) unexpected_errors++; } /* * If we read more parity disks than were used for * reconstruction, confirm that the other parity disks produced * correct data. * * Note that we also regenerate parity when resilvering so we * can write it out to failed devices later. */ if (parity_errors + parity_untried < rr->rr_firstdatacol - data_errors || (zio->io_flags & ZIO_FLAG_RESILVER)) { int n = raidz_parity_verify(zio, rr); unexpected_errors += n; } if (zio->io_error == 0 && spa_writeable(zio->io_spa) && (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* * Use the good data we have in hand to repair damaged children. */ for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *vd = zio->io_vd; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (!rc->rc_allow_repair) { continue; } else if (!rc->rc_force_repair && (rc->rc_error == 0 || rc->rc_size == 0)) { continue; } /* * We do not allow self healing for Direct I/O reads. * See comment in vdev_raid_row_alloc(). */ ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ); zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " "offset=%llx", zio, c, rc->rc_devidx, (long long)rc->rc_offset); zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, zio->io_priority == ZIO_PRIORITY_REBUILD ? ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } /* * Scrub or resilver i/o's: overwrite any shadow locations with the * good data. This ensures that if we've already copied this sector, * it will be corrected if it was damaged. This writes more than is * necessary, but since expansion is paused during scrub/resilver, at * most a single row will have a shadow location. */ if (zio->io_error == 0 && spa_writeable(zio->io_spa) && (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *vd = zio->io_vd; if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) continue; vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; /* * Note: We don't want to update the repair stats * because that would incorrectly indicate that there * was bad data to repair, which we aren't sure about. * By clearing the SCAN_THREAD flag, we prevent this * from happening, despite having the REPAIR flag set. * We need to set SELF_HEAL so that this i/o can't be * bypassed by zio_vdev_io_start(). */ zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, NULL, NULL); cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; zio_nowait(cio); } } } static void raidz_restore_orig_data(raidz_map_t *rm) { for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_need_orig_restore) { abd_copy(rc->rc_abd, rc->rc_orig_data, rc->rc_size); rc->rc_need_orig_restore = B_FALSE; } } } } /* * During raidz_reconstruct() for expanded VDEV, we need special consideration * failure simulations. See note in raidz_reconstruct() on simulating failure * of a pre-expansion device. * * Treating logical child i as failed, return TRUE if the given column should * be treated as failed. The idea of logical children allows us to imagine * that a disk silently failed before a RAIDZ expansion (reads from this disk * succeed but return the wrong data). Since the expansion doesn't verify * checksums, the incorrect data will be moved to new locations spread among * the children (going diagonally across them). * * Higher "logical child failures" (values of `i`) indicate these * "pre-expansion failures". The first physical_width values imagine that a * current child failed; the next physical_width-1 values imagine that a * child failed before the most recent expansion; the next physical_width-2 * values imagine a child failed in the expansion before that, etc. */ static boolean_t raidz_simulate_failure(int physical_width, int original_width, int ashift, int i, raidz_col_t *rc) { uint64_t sector_id = physical_width * (rc->rc_offset >> ashift) + rc->rc_devidx; for (int w = physical_width; w >= original_width; w--) { if (i < w) { return (sector_id % w == i); } else { i -= w; } } ASSERT(!"invalid logical child id"); return (B_FALSE); } /* * returns EINVAL if reconstruction of the block will not be possible * returns ECKSUM if this specific reconstruction failed * returns 0 on successful reconstruction */ static int raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) { raidz_map_t *rm = zio->io_vsd; int physical_width = zio->io_vd->vdev_children; int original_width = (rm->rm_original_width != 0) ? rm->rm_original_width : physical_width; int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; if (dbgmsg) { zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); } /* Reconstruct each row */ for (int r = 0; r < rm->rm_nrows; r++) { raidz_row_t *rr = rm->rm_row[r]; int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ int t = 0; int dead = 0; int dead_data = 0; if (dbgmsg) zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; ASSERT0(rc->rc_need_orig_restore); if (rc->rc_error != 0) { dead++; if (c >= nparity) dead_data++; continue; } if (rc->rc_size == 0) continue; for (int lt = 0; lt < ntgts; lt++) { if (raidz_simulate_failure(physical_width, original_width, zio->io_vd->vdev_top->vdev_ashift, ltgts[lt], rc)) { if (rc->rc_orig_data == NULL) { rc->rc_orig_data = abd_alloc_linear( rc->rc_size, B_TRUE); abd_copy(rc->rc_orig_data, rc->rc_abd, rc->rc_size); } rc->rc_need_orig_restore = B_TRUE; dead++; if (c >= nparity) dead_data++; /* * Note: simulating failure of a * pre-expansion device can hit more * than one column, in which case we * might try to simulate more failures * than can be reconstructed, which is * also more than the size of my_tgts. * This check prevents accessing past * the end of my_tgts. The "dead > * nparity" check below will fail this * reconstruction attempt. */ if (t < VDEV_RAIDZ_MAXPARITY) { my_tgts[t++] = c; if (dbgmsg) { zfs_dbgmsg("simulating " "failure of col %u " "devidx %u", c, (int)rc->rc_devidx); } } break; } } } if (dead > nparity) { /* reconstruction not possible */ if (dbgmsg) { zfs_dbgmsg("reconstruction not possible; " "too many failures"); } raidz_restore_orig_data(rm); return (EINVAL); } if (dead_data > 0) vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); } /* Check for success */ if (raidz_checksum_verify(zio) == 0) { if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) return (0); /* Reconstruction succeeded - report errors */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_need_orig_restore) { /* * Note: if this is a parity column, * we don't really know if it's wrong. * We need to let * vdev_raidz_io_done_verified() check * it, and if we set rc_error, it will * think that it is a "known" error * that doesn't need to be checked * or corrected. */ if (rc->rc_error == 0 && c >= rr->rr_firstdatacol) { vdev_raidz_checksum_error(zio, rc, rc->rc_orig_data); rc->rc_error = SET_ERROR(ECKSUM); } rc->rc_need_orig_restore = B_FALSE; } } vdev_raidz_io_done_verified(zio, rr); } zio_checksum_verified(zio); if (dbgmsg) { zfs_dbgmsg("reconstruction successful " "(checksum verified)"); } return (0); } /* Reconstruction failed - restore original data */ raidz_restore_orig_data(rm); if (dbgmsg) { zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " "failed", zio); } return (ECKSUM); } /* * Iterate over all combinations of N bad vdevs and attempt a reconstruction. * Note that the algorithm below is non-optimal because it doesn't take into * account how reconstruction is actually performed. For example, with * triple-parity RAID-Z the reconstruction procedure is the same if column 4 * is targeted as invalid as if columns 1 and 4 are targeted since in both * cases we'd only use parity information in column 0. * * The order that we find the various possible combinations of failed * disks is dictated by these rules: * - Examine each "slot" (the "i" in tgts[i]) * - Try to increment this slot (tgts[i] += 1) * - if we can't increment because it runs into the next slot, * reset our slot to the minimum, and examine the next slot * * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose * 3 columns to reconstruct), we will generate the following sequence: * * STATE ACTION * 0 1 2 special case: skip since these are all parity * 0 1 3 first slot: reset to 0; middle slot: increment to 2 * 0 2 3 first slot: increment to 1 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 * 0 1 4 first: reset to 0; middle: increment to 2 * 0 2 4 first: increment to 1 * 1 2 4 first: reset to 0; middle: increment to 3 * 0 3 4 first: increment to 1 * 1 3 4 first: increment to 2 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 * 0 1 5 first: reset to 0; middle: increment to 2 * 0 2 5 first: increment to 1 * 1 2 5 first: reset to 0; middle: increment to 3 * 0 3 5 first: increment to 1 * 1 3 5 first: increment to 2 * 2 3 5 first: reset to 0; middle: increment to 4 * 0 4 5 first: increment to 1 * 1 4 5 first: increment to 2 * 2 4 5 first: increment to 3 * 3 4 5 done * * This strategy works for dRAID but is less efficient when there are a large * number of child vdevs and therefore permutations to check. Furthermore, * since the raidz_map_t rows likely do not overlap, reconstruction would be * possible as long as there are no more than nparity data errors per row. * These additional permutations are not currently checked but could be as * a future improvement. * * Returns 0 on success, ECKSUM on failure. */ static int vdev_raidz_combrec(zio_t *zio) { int nparity = vdev_get_nparity(zio->io_vd); raidz_map_t *rm = zio->io_vsd; int physical_width = zio->io_vd->vdev_children; int original_width = (rm->rm_original_width != 0) ? rm->rm_original_width : physical_width; for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; int total_errors = 0; for (int c = 0; c < rr->rr_cols; c++) { if (rr->rr_col[c].rc_error) total_errors++; } if (total_errors > nparity) return (vdev_raidz_worst_error(rr)); } for (int num_failures = 1; num_failures <= nparity; num_failures++) { int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *ltgts = &tstore[1]; /* value is logical child ID */ /* * Determine number of logical children, n. See comment * above raidz_simulate_failure(). */ int n = 0; for (int w = physical_width; w >= original_width; w--) { n += w; } ASSERT3U(num_failures, <=, nparity); ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); /* Handle corner cases in combrec logic */ ltgts[-1] = -1; for (int i = 0; i < num_failures; i++) { ltgts[i] = i; } ltgts[num_failures] = n; for (;;) { int err = raidz_reconstruct(zio, ltgts, num_failures, nparity); if (err == EINVAL) { /* * Reconstruction not possible with this # * failures; try more failures. */ break; } else if (err == 0) return (0); /* Compute next targets to try */ for (int t = 0; ; t++) { ASSERT3U(t, <, num_failures); ltgts[t]++; if (ltgts[t] == n) { /* try more failures */ ASSERT3U(t, ==, num_failures - 1); if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { zfs_dbgmsg("reconstruction " "failed for num_failures=" "%u; tried all " "combinations", num_failures); } break; } ASSERT3U(ltgts[t], <, n); ASSERT3U(ltgts[t], <=, ltgts[t + 1]); /* * If that spot is available, we're done here. * Try the next combination. */ if (ltgts[t] != ltgts[t + 1]) break; // found next combination /* * Otherwise, reset this tgt to the minimum, * and move on to the next tgt. */ ltgts[t] = ltgts[t - 1] + 1; ASSERT3U(ltgts[t], ==, t); } /* Increase the number of failures and keep trying. */ if (ltgts[num_failures - 1] == n) break; } } if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) zfs_dbgmsg("reconstruction failed for all num_failures"); return (ECKSUM); } void vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) { for (uint64_t row = 0; row < rm->rm_nrows; row++) { raidz_row_t *rr = rm->rm_row[row]; vdev_raidz_reconstruct_row(rm, rr, t, nt); } } /* * Complete a write IO operation on a RAIDZ VDev * * Outline: * 1. Check for errors on the child IOs. * 2. Return, setting an error code if too few child VDevs were written * to reconstruct the data later. Note that partial writes are * considered successful if they can be reconstructed at all. */ static void vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) { int normal_errors = 0; int shadow_errors = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error != 0) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ normal_errors++; } if (rc->rc_shadow_error != 0) { ASSERT(rc->rc_shadow_error != ECKSUM); shadow_errors++; } } /* * Treat partial writes as a success. If we couldn't write enough * columns to reconstruct the data, the I/O failed. Otherwise, good * enough. Note that in the case of a shadow write (during raidz * expansion), depending on if we crash, either the normal (old) or * shadow (new) location may become the "real" version of the block, * so both locations must have sufficient redundancy. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ if (normal_errors > rr->rr_firstdatacol || shadow_errors > rr->rr_firstdatacol) { zio->io_error = zio_worst_error(zio->io_error, vdev_raidz_worst_error(rr)); } } static void vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr) { int parity_errors = 0; int parity_untried = 0; int data_errors = 0; int total_errors = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; /* * If scrubbing and a replacing/sparing child vdev determined * that not all of its children have an identical copy of the * data, then clear the error so the column is treated like * any other read and force a repair to correct the damage. */ if (rc->rc_error == ECKSUM) { ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); vdev_raidz_checksum_error(zio, rc, rc->rc_abd); rc->rc_force_repair = 1; rc->rc_error = 0; } if (rc->rc_error) { if (c < rr->rr_firstdatacol) parity_errors++; else data_errors++; total_errors++; } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } } /* * If there were data errors and the number of errors we saw was * correctable -- less than or equal to the number of parity disks read * -- reconstruct based on the missing data. */ if (data_errors != 0 && total_errors <= rr->rr_firstdatacol - parity_untried) { /* * We either attempt to read all the parity columns or * none of them. If we didn't try to read parity, we * wouldn't be here in the correctable case. There must * also have been fewer parity errors than parity * columns or, again, we wouldn't be in this code path. */ ASSERT(parity_untried == 0); ASSERT(parity_errors < rr->rr_firstdatacol); /* * Identify the data columns that reported an error. */ int n = 0; int tgts[VDEV_RAIDZ_MAXPARITY]; for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error != 0) { ASSERT(n < VDEV_RAIDZ_MAXPARITY); tgts[n++] = c; } } ASSERT(rr->rr_firstdatacol >= n); vdev_raidz_reconstruct_row(rm, rr, tgts, n); } } /* * Return the number of reads issued. */ static int vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; int nread = 0; rr->rr_missingdata = 0; rr->rr_missingparity = 0; /* * If this rows contains empty sectors which are not required * for a normal read then allocate an ABD for them now so they * may be read, verified, and any needed repairs performed. */ if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) vdev_draid_map_alloc_empty(zio, rr); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_tried || rc->rc_size == 0) continue; zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[rc->rc_devidx], rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); nread++; } return (nread); } /* * We're here because either there were too many errors to even attempt * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() * failed. In either case, there is enough bad data to prevent reconstruction. * Start checksum ereports for all children which haven't failed. */ static void vdev_raidz_io_done_unrecoverable(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; if (rc->rc_error != 0) continue; zio_bad_cksum_t zbc; zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; mutex_enter(&cvd->vdev_stat_lock); cvd->vdev_stat.vs_checksum_errors++; mutex_exit(&cvd->vdev_stat_lock); (void) zfs_ereport_start_checksum(zio->io_spa, cvd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, &zbc); } } } void vdev_raidz_io_done(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; ASSERT(zio->io_bp != NULL); if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); } } else { if (rm->rm_phys_col) { /* * This is an aggregated read. Copy the data and status * from the aggregate abd's to the individual rows. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_tried || rc->rc_size == 0) continue; raidz_col_t *prc = &rm->rm_phys_col[rc->rc_devidx]; rc->rc_error = prc->rc_error; rc->rc_tried = prc->rc_tried; rc->rc_skipped = prc->rc_skipped; if (c >= rr->rr_firstdatacol) { /* * Note: this is slightly faster * than using abd_copy_off(). */ char *physbuf = abd_to_buf( prc->rc_abd); void *physloc = physbuf + rc->rc_offset - prc->rc_offset; abd_copy_from_buf(rc->rc_abd, physloc, rc->rc_size); } } } } for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_reconstruct_known_missing(zio, rm, rr); } if (raidz_checksum_verify(zio) == 0) { if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) goto done; for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_verified(zio, rr); } zio_checksum_verified(zio); } else { /* * A sequential resilver has no checksum which makes * combinatoral reconstruction impossible. This code * path is unreachable since raidz_checksum_verify() * has no checksum to verify and must succeed. */ ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); /* * This isn't a typical situation -- either we got a * read error or a child silently returned bad data. * Read every block so we can try again with as much * data and parity as we can track down. If we've * already been through once before, all children will * be marked as tried so we'll proceed to combinatorial * reconstruction. */ int nread = 0; for (int i = 0; i < rm->rm_nrows; i++) { nread += vdev_raidz_read_all(zio, rm->rm_row[i]); } if (nread != 0) { /* * Normally our stage is VDEV_IO_DONE, but if * we've already called redone(), it will have * changed to VDEV_IO_START, in which case we * don't want to call redone() again. */ if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) zio_vdev_io_redone(zio); return; } /* * It would be too expensive to try every possible * combination of failed sectors in every row, so * instead we try every combination of failed current or * past physical disk. This means that if the incorrect * sectors were all on Nparity disks at any point in the * past, we will find the correct data. The only known * case where this is less durable than a non-expanded * RAIDZ, is if we have a silent failure during * expansion. In that case, one block could be * partially in the old format and partially in the * new format, so we'd lost some sectors from the old * format and some from the new format. * * e.g. logical_width=4 physical_width=6 * the 15 (6+5+4) possible failed disks are: * width=6 child=0 * width=6 child=1 * width=6 child=2 * width=6 child=3 * width=6 child=4 * width=6 child=5 * width=5 child=0 * width=5 child=1 * width=5 child=2 * width=5 child=3 * width=5 child=4 * width=4 child=0 * width=4 child=1 * width=4 child=2 * width=4 child=3 * And we will try every combination of Nparity of these * failing. * * As a first pass, we can generate every combo, * and try reconstructing, ignoring any known * failures. If any row has too many known + simulated * failures, then we bail on reconstructing with this * number of simulated failures. As an improvement, * we could detect the number of whole known failures * (i.e. we have known failures on these disks for * every row; the disks never succeeded), and * subtract that from the max # failures to simulate. * We could go even further like the current * combrec code, but that doesn't seem like it * gains us very much. If we simulate a failure * that is also a known failure, that's fine. */ zio->io_error = vdev_raidz_combrec(zio); if (zio->io_error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { vdev_raidz_io_done_unrecoverable(zio); } } } done: if (rm->rm_lr != NULL) { zfs_rangelock_exit(rm->rm_lr); rm->rm_lr = NULL; } } static void vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) { vdev_raidz_t *vdrz = vd->vdev_tsd; if (faulted > vdrz->vd_nparity) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); else vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } /* * Determine if any portion of the provided block resides on a child vdev * with a dirty DTL and therefore needs to be resilvered. The function * assumes that at least one DTL is dirty which implies that full stripe * width blocks must be resilvered. */ static boolean_t vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_raidz_t *vdrz = vd->vdev_tsd; /* * If we're in the middle of a RAIDZ expansion, this block may be in * the old and/or new location. For simplicity, always resilver it. */ if (vdrz->vn_vre.vre_state == DSS_SCANNING) return (B_TRUE); uint64_t dcols = vd->vdev_children; uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = DVA_GET_OFFSET(dva) >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = ((psize - 1) >> ashift) + 1; /* The first column for this stripe. */ uint64_t f = b % dcols; /* Unreachable by sequential resilver. */ ASSERT3U(phys_birth, !=, TXG_UNKNOWN); if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) return (B_FALSE); if (s + nparity >= dcols) return (B_TRUE); for (uint64_t c = 0; c < s + nparity; c++) { uint64_t devidx = (f + c) % dcols; vdev_t *cvd = vd->vdev_child[devidx]; /* * dsl_scan_need_resilver() already checked vd with * vdev_dtl_contains(). So here just check cvd with * vdev_dtl_empty(), cheaper and a good approximation. */ if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) return (B_TRUE); } return (B_FALSE); } static void vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { (void) remain_rs; vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); vdev_raidz_t *vdrz = raidvd->vdev_tsd; if (vdrz->vn_vre.vre_state == DSS_SCANNING) { /* * We're in the middle of expansion, in which case the * translation is in flux. Any answer we give may be wrong * by the time we return, so it isn't safe for the caller to * act on it. Therefore we say that this range isn't present * on any children. The only consumers of this are "zpool * initialize" and trimming, both of which are "best effort" * anyway. */ physical_rs->rs_start = physical_rs->rs_end = 0; remain_rs->rs_start = remain_rs->rs_end = 0; return; } uint64_t width = vdrz->vd_physical_width; uint64_t tgt_col = cvd->vdev_id; uint64_t ashift = raidvd->vdev_top->vdev_ashift; /* make sure the offsets are block-aligned */ ASSERT0(logical_rs->rs_start % (1 << ashift)); ASSERT0(logical_rs->rs_end % (1 << ashift)); uint64_t b_start = logical_rs->rs_start >> ashift; uint64_t b_end = logical_rs->rs_end >> ashift; uint64_t start_row = 0; if (b_start > tgt_col) /* avoid underflow */ start_row = ((b_start - tgt_col - 1) / width) + 1; uint64_t end_row = 0; if (b_end > tgt_col) end_row = ((b_end - tgt_col - 1) / width) + 1; physical_rs->rs_start = start_row << ashift; physical_rs->rs_end = end_row << ashift; ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, logical_rs->rs_end - logical_rs->rs_start); } static void raidz_reflow_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = arg; int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; vdev_raidz_expand_t *vre = spa->spa_raidz_expand; /* * Ensure there are no i/os to the range that is being committed. */ uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); mutex_enter(&vre->vre_lock); uint64_t new_offset = MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); /* * We should not have committed anything that failed. */ VERIFY3U(vre->vre_failed_offset, >=, old_offset); mutex_exit(&vre->vre_lock); zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, old_offset, new_offset - old_offset, RL_WRITER); /* * Update the uberblock that will be written when this txg completes. */ RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); vre->vre_offset_pertxg[txgoff] = 0; zfs_rangelock_exit(lr); mutex_enter(&vre->vre_lock); vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; vre->vre_bytes_copied_pertxg[txgoff] = 0; mutex_exit(&vre->vre_lock); vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); VERIFY0(zap_update(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); } static void raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = arg; vdev_raidz_expand_t *vre = spa->spa_raidz_expand; vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); vdev_raidz_t *vdrz = raidvd->vdev_tsd; for (int i = 0; i < TXG_SIZE; i++) VERIFY0(vre->vre_offset_pertxg[i]); reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; re->re_logical_width = vdrz->vd_physical_width; mutex_enter(&vdrz->vd_expand_lock); avl_add(&vdrz->vd_expand_txgs, re); mutex_exit(&vdrz->vd_expand_lock); vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); /* * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS * will get written (based on vd_expand_txgs). */ vdev_config_dirty(vd); /* * Before we change vre_state, the on-disk state must reflect that we * have completed all copying, so that vdev_raidz_io_start() can use * vre_state to determine if the reflow is in progress. See also the * end of spa_raidz_expand_thread(). */ VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, raidvd->vdev_ms_count << raidvd->vdev_ms_shift); vre->vre_end_time = gethrestime_sec(); vre->vre_state = DSS_FINISHED; uint64_t state = vre->vre_state; VERIFY0(zap_update(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, sizeof (state), 1, &state, tx)); uint64_t end_time = vre->vre_end_time; VERIFY0(zap_update(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, sizeof (end_time), 1, &end_time, tx)); spa->spa_uberblock.ub_raidz_reflow_info = 0; spa_history_log_internal(spa, "raidz vdev expansion completed", tx, "%s vdev %llu new width %llu", spa_name(spa), (unsigned long long)vd->vdev_id, (unsigned long long)vd->vdev_children); spa->spa_raidz_expand = NULL; raidvd->vdev_rz_expanding = B_FALSE; spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); spa_notify_waiters(spa); /* * While we're in syncing context take the opportunity to * setup a scrub. All the data has been sucessfully copied * but we have not validated any checksums. */ setup_sync_arg_t setup_sync_arg = { .func = POOL_SCAN_SCRUB, .txgstart = 0, .txgend = 0, }; if (zfs_scrub_after_expand && dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) { dsl_scan_setup_sync(&setup_sync_arg, tx); } } /* * State of one copy batch. */ typedef struct raidz_reflow_arg { vdev_raidz_expand_t *rra_vre; /* Global expantion state. */ zfs_locked_range_t *rra_lr; /* Range lock of this batch. */ uint64_t rra_txg; /* TXG of this batch. */ uint_t rra_ashift; /* Ashift of the vdev. */ uint32_t rra_tbd; /* Number of in-flight ZIOs. */ uint32_t rra_writes; /* Number of write ZIOs. */ zio_t *rra_zio[]; /* Write ZIO pointers. */ } raidz_reflow_arg_t; /* * Write of the new location on one child is done. Once all of them are done * we can unlock and free everything. */ static void raidz_reflow_write_done(zio_t *zio) { raidz_reflow_arg_t *rra = zio->io_private; vdev_raidz_expand_t *vre = rra->rra_vre; abd_free(zio->io_abd); mutex_enter(&vre->vre_lock); if (zio->io_error != 0) { /* Force a reflow pause on errors */ vre->vre_failed_offset = MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); } ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); vre->vre_outstanding_bytes -= zio->io_size; if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < vre->vre_failed_offset) { vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += zio->io_size; } cv_signal(&vre->vre_cv); boolean_t done = (--rra->rra_tbd == 0); mutex_exit(&vre->vre_lock); if (!done) return; spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); zfs_rangelock_exit(rra->rra_lr); kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes); } /* * Read of the old location on one child is done. Once all of them are done * writes should have all the data and we can issue them. */ static void raidz_reflow_read_done(zio_t *zio) { raidz_reflow_arg_t *rra = zio->io_private; vdev_raidz_expand_t *vre = rra->rra_vre; /* Reads of only one block use write ABDs. For bigger free gangs. */ if (zio->io_size > (1 << rra->rra_ashift)) abd_free(zio->io_abd); /* * If the read failed, or if it was done on a vdev that is not fully * healthy (e.g. a child that has a resilver in progress), we may not * have the correct data. Note that it's OK if the write proceeds. * It may write garbage but the location is otherwise unused and we * will retry later due to vre_failed_offset. */ if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", (long long)rra->rra_lr->lr_offset, (long long)rra->rra_lr->lr_length, (long long)rra->rra_txg, zio->io_error, vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), vdev_dtl_empty(zio->io_vd, DTL_MISSING)); mutex_enter(&vre->vre_lock); /* Force a reflow pause on errors */ vre->vre_failed_offset = MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); mutex_exit(&vre->vre_lock); } if (atomic_dec_32_nv(&rra->rra_tbd) > 0) return; rra->rra_tbd = rra->rra_writes; for (uint64_t i = 0; i < rra->rra_writes; i++) zio_nowait(rra->rra_zio[i]); } static void raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, dmu_tx_t *tx) { int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (offset == 0) return; mutex_enter(&vre->vre_lock); ASSERT3U(vre->vre_offset, <=, offset); vre->vre_offset = offset; mutex_exit(&vre->vre_lock); if (vre->vre_offset_pertxg[txgoff] == 0) { dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, spa, tx); } vre->vre_offset_pertxg[txgoff] = offset; } static boolean_t vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) { for (int i = 0; i < raidz_vd->vdev_children; i++) { /* Quick check if a child is being replaced */ if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) return (B_TRUE); } return (B_FALSE); } static boolean_t raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint_t ashift = vd->vdev_top->vdev_ashift; range_seg_t *rs = range_tree_first(rt); if (rt == NULL) return (B_FALSE); uint64_t offset = rs_get_start(rs, rt); ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); uint64_t size = rs_get_end(rs, rt) - offset; ASSERT3U(size, >=, 1 << ashift); ASSERT(IS_P2ALIGNED(size, 1 << ashift)); uint64_t blkid = offset >> ashift; uint_t old_children = vd->vdev_children - 1; /* * We can only progress to the point that writes will not overlap * with blocks whose progress has not yet been recorded on disk. * Since partially-copied rows are still read from the old location, * we need to stop one row before the sector-wise overlap, to prevent * row-wise overlap. * * Note that even if we are skipping over a large unallocated region, * we can't move the on-disk progress to `offset`, because concurrent * writes/allocations could still use the currently-unallocated * region. */ uint64_t ubsync_blkid = RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; uint64_t next_overwrite_blkid = ubsync_blkid + ubsync_blkid / old_children - old_children; VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); if (blkid >= next_overwrite_blkid) { raidz_reflow_record_progress(vre, next_overwrite_blkid << ashift, tx); return (B_TRUE); } size = MIN(size, raidz_expand_max_copy_bytes); size = MIN(size, (uint64_t)old_children * MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE)); size = MAX(size, 1 << ashift); uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid); size = (uint64_t)blocks << ashift; range_tree_remove(rt, offset, size); uint_t reads = MIN(blocks, old_children); uint_t writes = MIN(blocks, vd->vdev_children); raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) + sizeof (zio_t *) * writes, KM_SLEEP); rra->rra_vre = vre; rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, offset, size, RL_WRITER); rra->rra_txg = dmu_tx_get_txg(tx); rra->rra_ashift = ashift; rra->rra_tbd = reads; rra->rra_writes = writes; raidz_reflow_record_progress(vre, offset + size, tx); /* * SCL_STATE will be released when the read and write are done, * by raidz_reflow_write_done(). */ spa_config_enter(spa, SCL_STATE, spa, RW_READER); /* check if a replacing vdev was added, if so treat it as an error */ if (vdev_raidz_expand_child_replacing(vd)) { zfs_dbgmsg("replacing vdev encountered, reflow paused at " "offset=%llu txg=%llu", (long long)rra->rra_lr->lr_offset, (long long)rra->rra_txg); mutex_enter(&vre->vre_lock); vre->vre_failed_offset = MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); cv_signal(&vre->vre_cv); mutex_exit(&vre->vre_lock); /* drop everything we acquired */ spa_config_exit(spa, SCL_STATE, spa); zfs_rangelock_exit(rra->rra_lr); kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes); return (B_TRUE); } mutex_enter(&vre->vre_lock); vre->vre_outstanding_bytes += size; mutex_exit(&vre->vre_lock); /* Allocate ABD and ZIO for each child we write. */ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; zio_t *pio = spa->spa_txg_zio[txgoff]; uint_t b = blocks / vd->vdev_children; uint_t bb = blocks % vd->vdev_children; for (uint_t i = 0; i < writes; i++) { uint_t n = b + (i < bb); abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE); rra->rra_zio[i] = zio_vdev_child_io(pio, NULL, vd->vdev_child[(blkid + i) % vd->vdev_children], ((blkid + i) / vd->vdev_children) << ashift, abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra); } /* * Allocate and issue ZIO for each child we read. For reads of only * one block we can use respective writer ABDs, since they will also * have only one block. For bigger reads create gang ABDs and fill * them with respective blocks from writer ABDs. */ b = blocks / old_children; bb = blocks % old_children; for (uint_t i = 0; i < reads; i++) { uint_t n = b + (i < bb); abd_t *abd; if (n > 1) { abd = abd_alloc_gang(); for (uint_t j = 0; j < n; j++) { uint_t b = j * old_children + i; abd_t *cabd = abd_get_offset_size( rra->rra_zio[b % vd->vdev_children]->io_abd, (b / vd->vdev_children) << ashift, 1 << ashift); abd_gang_add(abd, cabd, B_TRUE); } } else { abd = rra->rra_zio[i]->io_abd; } zio_nowait(zio_vdev_child_io(pio, NULL, vd->vdev_child[(blkid + i) % old_children], ((blkid + i) / old_children) << ashift, abd, n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra)); } return (B_FALSE); } /* * For testing (ztest specific) */ static void raidz_expand_pause(uint_t pause_point) { while (raidz_expand_pause_point != 0 && raidz_expand_pause_point <= pause_point) delay(hz); } static void raidz_scratch_child_done(zio_t *zio) { zio_t *pio = zio->io_private; mutex_enter(&pio->io_lock); pio->io_error = zio_worst_error(pio->io_error, zio->io_error); mutex_exit(&pio->io_lock); } /* * Reflow the beginning portion of the vdev into an intermediate scratch area * in memory and on disk. This operation must be persisted on disk before we * proceed to overwrite the beginning portion with the reflowed data. * * This multi-step task can fail to complete if disk errors are encountered * and we can return here after a pause (waiting for disk to become healthy). */ static void raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) { vdev_raidz_expand_t *vre = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; zio_t *pio; int error; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); int ashift = raidvd->vdev_ashift; uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, uint64_t); uint64_t logical_size = write_size * raidvd->vdev_children; uint64_t read_size = P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), 1 << ashift); /* * The scratch space must be large enough to get us to the point * that one row does not overlap itself when moved. This is checked * by vdev_raidz_attach_check(). */ VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); VERIFY3U(write_size, <=, read_size); zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, 0, logical_size, RL_WRITER); abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), KM_SLEEP); for (int i = 0; i < raidvd->vdev_children; i++) { abds[i] = abd_alloc_linear(read_size, B_FALSE); } raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); /* * If we have already written the scratch area then we must read from * there, since new writes were redirected there while we were paused * or the original location may have been partially overwritten with * reflowed data. */ if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); /* * Read from scratch space. */ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (int i = 0; i < raidvd->vdev_children; i++) { /* * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE * to the offset to calculate the physical offset to * write to. Passing in a negative offset makes us * access the scratch area. */ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); } error = zio_wait(pio); if (error != 0) { zfs_dbgmsg("reflow: error %d reading scratch location", error); goto io_error_exit; } goto overwrite; } /* * Read from original location. */ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (int i = 0; i < raidvd->vdev_children - 1; i++) { ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 0, abds[i], read_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); } error = zio_wait(pio); if (error != 0) { zfs_dbgmsg("reflow: error %d reading original location", error); io_error_exit: for (int i = 0; i < raidvd->vdev_children; i++) abd_free(abds[i]); kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); zfs_rangelock_exit(lr); spa_config_exit(spa, SCL_STATE, FTAG); return; } raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); /* * Reflow in memory. */ uint64_t logical_sectors = logical_size >> ashift; for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { int oldchild = i % (raidvd->vdev_children - 1); uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; int newchild = i % raidvd->vdev_children; uint64_t newoff = (i / raidvd->vdev_children) << ashift; /* a single sector should not be copying over itself */ ASSERT(!(newchild == oldchild && newoff == oldoff)); abd_copy_off(abds[newchild], abds[oldchild], newoff, oldoff, 1 << ashift); } /* * Verify that we filled in everything we intended to (write_size on * each child). */ VERIFY0(logical_sectors % raidvd->vdev_children); VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, write_size); /* * Write to scratch location (boot area). */ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (int i = 0; i < raidvd->vdev_children; i++) { /* * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to * the offset to calculate the physical offset to write to. * Passing in a negative offset lets us access the boot area. */ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); } error = zio_wait(pio); if (error != 0) { zfs_dbgmsg("reflow: error %d writing scratch location", error); goto io_error_exit; } pio = zio_root(spa, NULL, NULL, 0); zio_flush(pio, raidvd); zio_wait(pio); zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", (long long)logical_size); raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); /* * Update uberblock to indicate that scratch space is valid. This is * needed because after this point, the real location may be * overwritten. If we crash, we need to get the data from the * scratch space, rather than the real location. * * Note: ub_timestamp is bumped so that vdev_uberblock_compare() * will prefer this uberblock. */ RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); spa->spa_ubsync.ub_timestamp++; ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); if (spa_multihost(spa)) mmp_update_uberblock(spa, &spa->spa_ubsync); zfs_dbgmsg("reflow: uberblock updated " "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", (long long)spa->spa_ubsync.ub_txg, (long long)logical_size, (long long)spa->spa_ubsync.ub_timestamp); raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); /* * Overwrite with reflow'ed data. */ overwrite: pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (int i = 0; i < raidvd->vdev_children; i++) { zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 0, abds[i], write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); } error = zio_wait(pio); if (error != 0) { /* * When we exit early here and drop the range lock, new * writes will go into the scratch area so we'll need to * read from there when we return after pausing. */ zfs_dbgmsg("reflow: error %d writing real location", error); /* * Update the uberblock that is written when this txg completes. */ RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, logical_size); goto io_error_exit; } pio = zio_root(spa, NULL, NULL, 0); zio_flush(pio, raidvd); zio_wait(pio); zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", (long long)logical_size); for (int i = 0; i < raidvd->vdev_children; i++) abd_free(abds[i]); kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); /* * Update uberblock to indicate that the initial part has been * reflow'ed. This is needed because after this point (when we exit * the rangelock), we allow regular writes to this region, which will * be written to the new location only (because reflow_offset_next == * reflow_offset_synced). If we crashed and re-copied from the * scratch space, we would lose the regular writes. */ RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, logical_size); spa->spa_ubsync.ub_timestamp++; ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); if (spa_multihost(spa)) mmp_update_uberblock(spa, &spa->spa_ubsync); zfs_dbgmsg("reflow: uberblock updated " "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", (long long)spa->spa_ubsync.ub_txg, (long long)logical_size, (long long)spa->spa_ubsync.ub_timestamp); raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); /* * Update progress. */ vre->vre_offset = logical_size; zfs_rangelock_exit(lr); spa_config_exit(spa, SCL_STATE, FTAG); int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; vre->vre_offset_pertxg[txgoff] = vre->vre_offset; vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; /* * Note - raidz_reflow_sync() will update the uberblock state to * RRSS_SCRATCH_INVALID_SYNCED_REFLOW */ raidz_reflow_sync(spa, tx); raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); } /* * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work * here. No other i/o can be in progress, so we don't need the vre_rangelock. */ void vdev_raidz_reflow_copy_scratch(spa_t *spa) { vdev_raidz_expand_t *vre = spa->spa_raidz_expand; uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); ASSERT0(logical_size % raidvd->vdev_children); uint64_t write_size = logical_size / raidvd->vdev_children; zio_t *pio; /* * Read from scratch space. */ abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), KM_SLEEP); for (int i = 0; i < raidvd->vdev_children; i++) { abds[i] = abd_alloc_linear(write_size, B_FALSE); } pio = zio_root(spa, NULL, NULL, 0); for (int i = 0; i < raidvd->vdev_children; i++) { /* * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to * the offset to calculate the physical offset to write to. * Passing in a negative offset lets us access the boot area. */ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0, raidz_scratch_child_done, pio)); } zio_wait(pio); /* * Overwrite real location with reflow'ed data. */ pio = zio_root(spa, NULL, NULL, 0); for (int i = 0; i < raidvd->vdev_children; i++) { zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 0, abds[i], write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 0, raidz_scratch_child_done, pio)); } zio_wait(pio); pio = zio_root(spa, NULL, NULL, 0); zio_flush(pio, raidvd); zio_wait(pio); zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " "to real location", (long long)logical_size); for (int i = 0; i < raidvd->vdev_children; i++) abd_free(abds[i]); kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); /* * Update uberblock. */ RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); spa->spa_ubsync.ub_timestamp++; VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); if (spa_multihost(spa)) mmp_update_uberblock(spa, &spa->spa_ubsync); zfs_dbgmsg("reflow recovery: uberblock updated " "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", (long long)spa->spa_ubsync.ub_txg, (long long)logical_size, (long long)spa->spa_ubsync.ub_timestamp); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, spa_first_txg(spa)); int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; vre->vre_offset = logical_size; vre->vre_offset_pertxg[txgoff] = vre->vre_offset; vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; /* * Note that raidz_reflow_sync() will update the uberblock once more */ raidz_reflow_sync(spa, tx); dmu_tx_commit(tx); spa_config_exit(spa, SCL_STATE, FTAG); } static boolean_t spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) { (void) zthr; spa_t *spa = arg; return (spa->spa_raidz_expand != NULL && !spa->spa_raidz_expand->vre_waiting_for_resilver); } /* * RAIDZ expansion background thread * * Can be called multiple times if the reflow is paused */ static void spa_raidz_expand_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_raidz_expand_t *vre = spa->spa_raidz_expand; if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) vre->vre_offset = 0; else vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); /* Reflow the begining portion using the scratch area */ if (vre->vre_offset == 0) { VERIFY0(dsl_sync_task(spa_name(spa), NULL, raidz_reflow_scratch_sync, vre, 0, ZFS_SPACE_CHECK_NONE)); /* if we encountered errors then pause */ if (vre->vre_offset == 0) { mutex_enter(&vre->vre_lock); vre->vre_waiting_for_resilver = B_TRUE; mutex_exit(&vre->vre_lock); return; } } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); uint64_t guid = raidvd->vdev_guid; /* Iterate over all the remaining metaslabs */ for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; i < raidvd->vdev_ms_count && !zthr_iscancelled(zthr) && vre->vre_failed_offset == UINT64_MAX; i++) { metaslab_t *msp = raidvd->vdev_ms[i]; metaslab_disable(msp); mutex_enter(&msp->ms_lock); /* * The metaslab may be newly created (for the expanded * space), in which case its trees won't exist yet, * so we need to bail out early. */ if (msp->ms_new) { mutex_exit(&msp->ms_lock); metaslab_enable(msp, B_FALSE, B_FALSE); continue; } VERIFY0(metaslab_load(msp)); /* * We want to copy everything except the free (allocatable) * space. Note that there may be a little bit more free * space (e.g. in ms_defer), and it's fine to copy that too. */ uint64_t shift, start; range_seg_type_t type = metaslab_calculate_range_tree_type( raidvd, msp, &start, &shift); range_tree_t *rt = range_tree_create(NULL, type, NULL, start, shift); range_tree_add(rt, msp->ms_start, msp->ms_size); range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); mutex_exit(&msp->ms_lock); /* * Force the last sector of each metaslab to be copied. This * ensures that we advance the on-disk progress to the end of * this metaslab while the metaslab is disabled. Otherwise, we * could move past this metaslab without advancing the on-disk * progress, and then an allocation to this metaslab would not * be copied. */ int sectorsz = 1 << raidvd->vdev_ashift; uint64_t ms_last_offset = msp->ms_start + msp->ms_size - sectorsz; if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { range_tree_add(rt, ms_last_offset, sectorsz); } /* * When we are resuming from a paused expansion (i.e. * when importing a pool with a expansion in progress), * discard any state that we have already processed. */ if (vre->vre_offset > msp->ms_start) { range_tree_clear(rt, msp->ms_start, vre->vre_offset - msp->ms_start); } while (!zthr_iscancelled(zthr) && !range_tree_is_empty(rt) && vre->vre_failed_offset == UINT64_MAX) { /* * We need to periodically drop the config lock so that * writers can get in. Additionally, we can't wait * for a txg to sync while holding a config lock * (since a waiting writer could cause a 3-way deadlock * with the sync thread, which also gets a config * lock for reader). So we can't hold the config lock * while calling dmu_tx_assign(). */ spa_config_exit(spa, SCL_CONFIG, FTAG); /* * If requested, pause the reflow when the amount * specified by raidz_expand_max_reflow_bytes is reached * * This pause is only used during testing or debugging. */ while (raidz_expand_max_reflow_bytes != 0 && raidz_expand_max_reflow_bytes <= vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { delay(hz); } mutex_enter(&vre->vre_lock); while (vre->vre_outstanding_bytes > raidz_expand_max_copy_bytes) { cv_wait(&vre->vre_cv, &vre->vre_lock); } mutex_exit(&vre->vre_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); /* * Reacquire the vdev_config lock. Theoretically, the * vdev_t that we're expanding may have changed. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); boolean_t needsync = raidz_reflow_impl(raidvd, vre, rt, tx); dmu_tx_commit(tx); if (needsync) { spa_config_exit(spa, SCL_CONFIG, FTAG); txg_wait_synced(spa->spa_dsl_pool, txg); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); } } spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_enable(msp, B_FALSE, B_FALSE); range_tree_vacate(rt, NULL, NULL); range_tree_destroy(rt); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); } spa_config_exit(spa, SCL_CONFIG, FTAG); /* * The txg_wait_synced() here ensures that all reflow zio's have * completed, and vre_failed_offset has been set if necessary. It * also ensures that the progress of the last raidz_reflow_sync() is * written to disk before raidz_reflow_complete_sync() changes the * in-memory vre_state. vdev_raidz_io_start() uses vre_state to * determine if a reflow is in progress, in which case we may need to * write to both old and new locations. Therefore we can only change * vre_state once this is not necessary, which is once the on-disk * progress (in spa_ubsync) has been set past any possible writes (to * the end of the last metaslab). */ txg_wait_synced(spa->spa_dsl_pool, 0); if (!zthr_iscancelled(zthr) && vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { /* * We are not being canceled or paused, so the reflow must be * complete. In that case also mark it as completed on disk. */ ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); VERIFY0(dsl_sync_task(spa_name(spa), NULL, raidz_reflow_complete_sync, spa, 0, ZFS_SPACE_CHECK_NONE)); (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); } else { /* * Wait for all copy zio's to complete and for all the * raidz_reflow_sync() synctasks to be run. */ spa_history_log_internal(spa, "reflow pause", NULL, "offset=%llu failed_offset=%lld", (long long)vre->vre_offset, (long long)vre->vre_failed_offset); mutex_enter(&vre->vre_lock); if (vre->vre_failed_offset != UINT64_MAX) { /* * Reset progress so that we will retry everything * after the point that something failed. */ vre->vre_offset = vre->vre_failed_offset; vre->vre_failed_offset = UINT64_MAX; vre->vre_waiting_for_resilver = B_TRUE; } mutex_exit(&vre->vre_lock); } } void spa_start_raidz_expansion_thread(spa_t *spa) { ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", spa_raidz_expand_thread_check, spa_raidz_expand_thread, spa, defclsyspri); } void raidz_dtl_reassessed(vdev_t *vd) { spa_t *spa = vd->vdev_spa; if (spa->spa_raidz_expand != NULL) { vdev_raidz_expand_t *vre = spa->spa_raidz_expand; /* * we get called often from vdev_dtl_reassess() so make * sure it's our vdev and any replacing is complete */ if (vd->vdev_top->vdev_id == vre->vre_vdev_id && !vdev_raidz_expand_child_replacing(vd->vdev_top)) { mutex_enter(&vre->vre_lock); if (vre->vre_waiting_for_resilver) { vdev_dbgmsg(vd, "DTL reassessed, " "continuing raidz expansion"); vre->vre_waiting_for_resilver = B_FALSE; zthr_wakeup(spa->spa_raidz_expand_zthr); } mutex_exit(&vre->vre_lock); } } } int vdev_raidz_attach_check(vdev_t *new_child) { vdev_t *raidvd = new_child->vdev_parent; uint64_t new_children = raidvd->vdev_children; /* * We use the "boot" space as scratch space to handle overwriting the * initial part of the vdev. If it is too small, then this expansion * is not allowed. This would be very unusual (e.g. ashift > 13 and * >200 children). */ if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { return (EINVAL); } return (0); } void vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) { vdev_t *new_child = arg; spa_t *spa = new_child->vdev_spa; vdev_t *raidvd = new_child->vdev_parent; vdev_raidz_t *vdrz = raidvd->vdev_tsd; ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); ASSERT3P(raidvd->vdev_top, ==, raidvd); ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, new_child); spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); vdrz->vd_physical_width++; VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; vdrz->vn_vre.vre_offset = 0; vdrz->vn_vre.vre_failed_offset = UINT64_MAX; spa->spa_raidz_expand = &vdrz->vn_vre; zthr_wakeup(spa->spa_raidz_expand_zthr); /* * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get * written to the config. */ vdev_config_dirty(raidvd); vdrz->vn_vre.vre_start_time = gethrestime_sec(); vdrz->vn_vre.vre_end_time = 0; vdrz->vn_vre.vre_state = DSS_SCANNING; vdrz->vn_vre.vre_bytes_copied = 0; uint64_t state = vdrz->vn_vre.vre_state; VERIFY0(zap_update(spa->spa_meta_objset, raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, sizeof (state), 1, &state, tx)); uint64_t start_time = vdrz->vn_vre.vre_start_time; VERIFY0(zap_update(spa->spa_meta_objset, raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, sizeof (start_time), 1, &start_time, tx)); (void) zap_remove(spa->spa_meta_objset, raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); (void) zap_remove(spa->spa_meta_objset, raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); spa_history_log_internal(spa, "raidz vdev expansion started", tx, "%s vdev %llu new width %llu", spa_name(spa), (unsigned long long)raidvd->vdev_id, (unsigned long long)raidvd->vdev_children); } int vdev_raidz_load(vdev_t *vd) { vdev_raidz_t *vdrz = vd->vdev_tsd; int err; uint64_t state = DSS_NONE; uint64_t start_time = 0; uint64_t end_time = 0; uint64_t bytes_copied = 0; if (vd->vdev_top_zap != 0) { err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, sizeof (state), 1, &state); if (err != 0 && err != ENOENT) return (err); err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, sizeof (start_time), 1, &start_time); if (err != 0 && err != ENOENT) return (err); err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, sizeof (end_time), 1, &end_time); if (err != 0 && err != ENOENT) return (err); err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, sizeof (bytes_copied), 1, &bytes_copied); if (err != 0 && err != ENOENT) return (err); } /* * If we are in the middle of expansion, vre_state should have * already been set by vdev_raidz_init(). */ EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; vdrz->vn_vre.vre_start_time = start_time; vdrz->vn_vre.vre_end_time = end_time; vdrz->vn_vre.vre_bytes_copied = bytes_copied; return (0); } int spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) { vdev_raidz_expand_t *vre = spa->spa_raidz_expand; if (vre == NULL) { /* no removal in progress; find most recent completed */ for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; if (vd->vdev_ops == &vdev_raidz_ops) { vdev_raidz_t *vdrz = vd->vdev_tsd; if (vdrz->vn_vre.vre_end_time != 0 && (vre == NULL || vdrz->vn_vre.vre_end_time > vre->vre_end_time)) { vre = &vdrz->vn_vre; } } } } if (vre == NULL) { return (SET_ERROR(ENOENT)); } pres->pres_state = vre->vre_state; pres->pres_expanding_vdev = vre->vre_vdev_id; vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); pres->pres_to_reflow = vd->vdev_stat.vs_alloc; mutex_enter(&vre->vre_lock); pres->pres_reflowed = vre->vre_bytes_copied; for (int i = 0; i < TXG_SIZE; i++) pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; mutex_exit(&vre->vre_lock); pres->pres_start_time = vre->vre_start_time; pres->pres_end_time = vre->vre_end_time; pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; return (0); } /* * Initialize private RAIDZ specific fields from the nvlist. */ static int vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) { uint_t children; nvlist_t **child; int error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error != 0) return (SET_ERROR(EINVAL)); uint64_t nparity; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); /* * Previous versions could only support 1 or 2 parity * device. */ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) return (SET_ERROR(EINVAL)); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); /* * Otherwise, we default to 1 parity device for RAID-Z. */ nparity = 1; } vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); vdrz->vn_vre.vre_vdev_id = -1; vdrz->vn_vre.vre_offset = UINT64_MAX; vdrz->vn_vre.vre_failed_offset = UINT64_MAX; mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); vdrz->vd_physical_width = children; vdrz->vd_nparity = nparity; /* note, the ID does not exist when creating a pool */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &vdrz->vn_vre.vre_vdev_id); boolean_t reflow_in_progress = nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); if (reflow_in_progress) { spa->spa_raidz_expand = &vdrz->vn_vre; vdrz->vn_vre.vre_state = DSS_SCANNING; } vdrz->vd_original_width = children; uint64_t *txgs; unsigned int txgs_size = 0; error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, &txgs, &txgs_size); if (error == 0) { for (int i = 0; i < txgs_size; i++) { reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); re->re_txg = txgs[txgs_size - i - 1]; re->re_logical_width = vdrz->vd_physical_width - i; if (reflow_in_progress) re->re_logical_width--; avl_add(&vdrz->vd_expand_txgs, re); } vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; } if (reflow_in_progress) { vdrz->vd_original_width--; zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", children, txgs_size); } *tsd = vdrz; return (0); } static void vdev_raidz_fini(vdev_t *vd) { vdev_raidz_t *vdrz = vd->vdev_tsd; if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) vd->vdev_spa->spa_raidz_expand = NULL; reflow_node_t *re; void *cookie = NULL; avl_tree_t *tree = &vdrz->vd_expand_txgs; while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) kmem_free(re, sizeof (*re)); avl_destroy(&vdrz->vd_expand_txgs); mutex_destroy(&vdrz->vd_expand_lock); mutex_destroy(&vdrz->vn_vre.vre_lock); cv_destroy(&vdrz->vn_vre.vre_cv); zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); kmem_free(vdrz, sizeof (*vdrz)); } /* * Add RAIDZ specific fields to the config nvlist. */ static void vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) { ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); vdev_raidz_t *vdrz = vd->vdev_tsd; /* * Make sure someone hasn't managed to sneak a fancy new vdev * into a crufty old storage pool. */ ASSERT(vdrz->vd_nparity == 1 || (vdrz->vd_nparity <= 2 && spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || (vdrz->vd_nparity <= 3 && spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); /* * Note that we'll add these even on storage pools where they * aren't strictly required -- older software will just ignore * it. */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); if (vdrz->vn_vre.vre_state == DSS_SCANNING) { fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); } mutex_enter(&vdrz->vd_expand_lock); if (!avl_is_empty(&vdrz->vd_expand_txgs)) { uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, KM_SLEEP); uint64_t i = 0; for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { txgs[i++] = re->re_txg; } fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, txgs, count); kmem_free(txgs, sizeof (uint64_t) * count); } mutex_exit(&vdrz->vd_expand_lock); } static uint64_t vdev_raidz_nparity(vdev_t *vd) { vdev_raidz_t *vdrz = vd->vdev_tsd; return (vdrz->vd_nparity); } static uint64_t vdev_raidz_ndisks(vdev_t *vd) { return (vd->vdev_children); } vdev_ops_t vdev_raidz_ops = { .vdev_op_init = vdev_raidz_init, .vdev_op_fini = vdev_raidz_fini, .vdev_op_open = vdev_raidz_open, .vdev_op_close = vdev_raidz_close, .vdev_op_asize = vdev_raidz_asize, .vdev_op_min_asize = vdev_raidz_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_raidz_io_start, .vdev_op_io_done = vdev_raidz_io_done, .vdev_op_state_change = vdev_raidz_state_change, .vdev_op_need_resilver = vdev_raidz_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_raidz_xlate, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = vdev_raidz_config_generate, .vdev_op_nparity = vdev_raidz_nparity, .vdev_op_ndisks = vdev_raidz_ndisks, .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, "For testing, pause RAIDZ expansion after reflowing this many bytes"); ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, "Max amount of concurrent i/o for RAIDZ expansion"); ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, "For expanded RAIDZ, aggregate reads that have more rows than this"); ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, "For expanded RAIDZ, automatically start a pool scrub when expansion " "completes"); -/* END CSTYLED */ diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 1249657f9d72..08c85a874803 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1,2568 +1,2566 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This file contains the necessary logic to remove vdevs from a * storage pool. Currently, the only devices that can be removed * are log, cache, and spare devices; and top level vdevs from a pool * w/o raidz or mirrors. (Note that members of a mirror can be removed * by the detach operation.) * * Log vdevs are removed by evacuating them and then turning the vdev * into a hole vdev while holding spa config locks. * * Top level vdevs are removed and converted into an indirect vdev via * a multi-step process: * * - Disable allocations from this device (spa_vdev_remove_top). * * - From a new thread (spa_vdev_remove_thread), copy data from * the removing vdev to a different vdev. The copy happens in open * context (spa_vdev_copy_impl) and issues a sync task * (vdev_mapping_sync) so the sync thread can update the partial * indirect mappings in core and on disk. * * - If a free happens during a removal, it is freed from the * removing vdev, and if it has already been copied, from the new * location as well (free_from_removing_vdev). * * - After the removal is completed, the copy thread converts the vdev * into an indirect vdev (vdev_remove_complete) before instructing * the sync thread to destroy the space maps and finish the removal * (spa_finish_removal). */ typedef struct vdev_copy_arg { metaslab_t *vca_msp; uint64_t vca_outstanding_bytes; uint64_t vca_read_error_bytes; uint64_t vca_write_error_bytes; kcondvar_t vca_cv; kmutex_t vca_lock; } vdev_copy_arg_t; /* * The maximum amount of memory we can use for outstanding i/o while * doing a device removal. This determines how much i/o we can have * in flight concurrently. */ static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024; /* * The largest contiguous segment that we will attempt to allocate when * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If * there is a performance problem with attempting to allocate large blocks, * consider decreasing this. * * See also the accessor function spa_remove_max_segment(). */ uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; /* * Ignore hard IO errors during device removal. When set if a device * encounters hard IO error during the removal process the removal will * not be cancelled. This can result in a normally recoverable block * becoming permanently damaged and is not recommended. */ static int zfs_removal_ignore_errors = 0; /* * Allow a remap segment to span free chunks of at most this size. The main * impact of a larger span is that we will read and write larger, more * contiguous chunks, with more "unnecessary" data -- trading off bandwidth * for iops. The value here was chosen to align with * zfs_vdev_read_gap_limit, which is a similar concept when doing regular * reads (but there's no reason it has to be the same). * * Additionally, a higher span will have the following relatively minor * effects: * - the mapping will be smaller, since one entry can cover more allocated * segments * - more of the fragmentation in the removing device will be preserved * - we'll do larger allocations, which may fail and fall back on smaller * allocations */ uint_t vdev_removal_max_span = 32 * 1024; /* * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a removal. */ int zfs_removal_suspend_progress = 0; #define VDEV_REMOVAL_ZAP_OBJS "lzap" static __attribute__((noreturn)) void spa_vdev_remove_thread(void *arg); static int spa_vdev_remove_cancel_impl(spa_t *spa); static void spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) { VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_REMOVING, sizeof (uint64_t), sizeof (spa->spa_removing_phys) / sizeof (uint64_t), &spa->spa_removing_phys, tx)); } static nvlist_t * spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) { for (int i = 0; i < count; i++) { uint64_t guid = fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); if (guid == target_guid) return (nvpp[i]); } return (NULL); } static void vdev_activate(vdev_t *vd) { metaslab_group_t *mg = vd->vdev_mg; spa_t *spa = vd->vdev_spa; uint64_t vdev_space = spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; ASSERT(!vd->vdev_islog); ASSERT(vd->vdev_noalloc); metaslab_group_activate(mg); metaslab_group_activate(vd->vdev_log_mg); ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space); spa->spa_nonallocating_dspace -= vdev_space; vd->vdev_noalloc = B_FALSE; } static int vdev_passivate(vdev_t *vd, uint64_t *txg) { spa_t *spa = vd->vdev_spa; int error; ASSERT(!vd->vdev_noalloc); vdev_t *rvd = spa->spa_root_vdev; metaslab_group_t *mg = vd->vdev_mg; metaslab_class_t *normal = spa_normal_class(spa); if (mg->mg_class == normal) { /* * We must check that this is not the only allocating device in * the pool before passivating, otherwise we will not be able * to make progress because we can't allocate from any vdevs. */ boolean_t last = B_TRUE; for (uint64_t id = 0; id < rvd->vdev_children; id++) { vdev_t *cvd = rvd->vdev_child[id]; if (cvd == vd || cvd->vdev_ops == &vdev_indirect_ops) continue; metaslab_class_t *mc = cvd->vdev_mg->mg_class; if (mc != normal) continue; if (!cvd->vdev_noalloc) { last = B_FALSE; break; } } if (last) return (SET_ERROR(EINVAL)); } metaslab_group_passivate(mg); ASSERT(!vd->vdev_islog); metaslab_group_passivate(vd->vdev_log_mg); /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. */ spa_vdev_config_exit(spa, NULL, *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); /* * We must ensure that no "stubby" log blocks are allocated * on the device to be removed. These blocks could be * written at any time, including while we are in the middle * of copying them. */ error = spa_reset_logs(spa); *txg = spa_vdev_config_enter(spa); if (error != 0) { metaslab_group_activate(mg); ASSERT(!vd->vdev_islog); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); return (error); } spa->spa_nonallocating_dspace += spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; vd->vdev_noalloc = B_TRUE; return (0); } /* * Turn off allocations for a top-level device from the pool. * * Turning off allocations for a top-level device can take a significant * amount of time. As a result we use the spa_vdev_config_[enter/exit] * functions which allow us to grab and release the spa_config_lock while * still holding the namespace lock. During each step the configuration * is synced out. */ int spa_vdev_noalloc(spa_t *spa, uint64_t guid) { vdev_t *vd; uint64_t txg; int error = 0; ASSERT(!MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL) error = SET_ERROR(ENOENT); else if (vd->vdev_mg == NULL) error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP); else if (!vd->vdev_noalloc) error = vdev_passivate(vd, &txg); if (error == 0) { vdev_dirty_leaves(vd, VDD_DTL, txg); vdev_config_dirty(vd); } error = spa_vdev_exit(spa, NULL, txg, error); return (error); } int spa_vdev_alloc(spa_t *spa, uint64_t guid) { vdev_t *vd; uint64_t txg; int error = 0; ASSERT(!MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL) error = SET_ERROR(ENOENT); else if (vd->vdev_mg == NULL) error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP); else if (!vd->vdev_removing) vdev_activate(vd); if (error == 0) { vdev_dirty_leaves(vd, VDD_DTL, txg); vdev_config_dirty(vd); } (void) spa_vdev_exit(spa, NULL, txg, error); return (error); } static void spa_vdev_remove_aux(nvlist_t *config, const char *name, nvlist_t **dev, int count, nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; if (count > 1) newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); for (int i = 0, j = 0; i < count; i++) { if (dev[i] == dev_to_remove) continue; VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); } VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); fnvlist_add_nvlist_array(config, name, (const nvlist_t * const *)newdev, count - 1); for (int i = 0; i < count - 1; i++) nvlist_free(newdev[i]); if (count > 1) kmem_free(newdev, (count - 1) * sizeof (void *)); } static spa_vdev_removal_t * spa_vdev_removal_create(vdev_t *vd) { spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); list_create(&svr->svr_new_segments[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); } return (svr); } void spa_vdev_removal_destroy(spa_vdev_removal_t *svr) { for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(svr->svr_bytes_done[i]); ASSERT0(svr->svr_max_offset_to_sync[i]); range_tree_destroy(svr->svr_frees[i]); list_destroy(&svr->svr_new_segments[i]); } range_tree_destroy(svr->svr_allocd_segs); mutex_destroy(&svr->svr_lock); cv_destroy(&svr->svr_cv); kmem_free(svr, sizeof (*svr)); } /* * This is called as a synctask in the txg in which we will mark this vdev * as removing (in the config stored in the MOS). * * It begins the evacuation of a toplevel vdev by: * - initializing the spa_removing_phys which tracks this removal * - computing the amount of space to remove for accounting purposes * - dirtying all dbufs in the spa_config_object * - creating the spa_vdev_removal * - starting the spa_vdev_remove_thread */ static void vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; spa_vdev_removal_t *svr = NULL; uint64_t txg __maybe_unused = dmu_tx_get_txg(tx); ASSERT0(vdev_get_nparity(vd)); svr = spa_vdev_removal_create(vd); ASSERT(vd->vdev_removing); ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { /* * By activating the OBSOLETE_COUNTS feature, we prevent * the pool from being downgraded and ensure that the * refcounts are precise. */ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); uint64_t one = 1; VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, &one, tx)); boolean_t are_precise __maybe_unused; ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise)); ASSERT3B(are_precise, ==, B_TRUE); } vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx); vd->vdev_indirect_mapping = vdev_indirect_mapping_open(mos, vic->vic_mapping_object); vic->vic_births_object = vdev_indirect_births_alloc(mos, tx); vd->vdev_indirect_births = vdev_indirect_births_open(mos, vic->vic_births_object); spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id; spa->spa_removing_phys.sr_start_time = gethrestime_sec(); spa->spa_removing_phys.sr_end_time = 0; spa->spa_removing_phys.sr_state = DSS_SCANNING; spa->spa_removing_phys.sr_to_copy = 0; spa->spa_removing_phys.sr_copied = 0; /* * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because * there may be space in the defer tree, which is free, but still * counted in vs_alloc. */ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { metaslab_t *ms = vd->vdev_ms[i]; if (ms->ms_sm == NULL) continue; spa->spa_removing_phys.sr_to_copy += metaslab_allocated_space(ms); /* * Space which we are freeing this txg does not need to * be copied. */ spa->spa_removing_phys.sr_to_copy -= range_tree_space(ms->ms_freeing); ASSERT0(range_tree_space(ms->ms_freed)); for (int t = 0; t < TXG_SIZE; t++) ASSERT0(range_tree_space(ms->ms_allocating[t])); } /* * Sync tasks are called before metaslab_sync(), so there should * be no already-synced metaslabs in the TXG_CLEAN list. */ ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL); spa_sync_removing_state(spa, tx); /* * All blocks that we need to read the most recent mapping must be * stored on concrete vdevs. Therefore, we must dirty anything that * is read before spa_remove_init(). Specifically, the * spa_config_object. (Note that although we already modified the * spa_config_object in spa_sync_removing_state, that may not have * modified all blocks of the object.) */ dmu_object_info_t doi; VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi)); for (uint64_t offset = 0; offset < doi.doi_max_offset; ) { dmu_buf_t *dbuf; VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT, offset, FTAG, &dbuf, 0)); dmu_buf_will_dirty(dbuf, tx); offset += dbuf->db_size; dmu_buf_rele(dbuf, FTAG); } /* * Now that we've allocated the im_object, dirty the vdev to ensure * that the object gets written to the config on disk. */ vdev_config_dirty(vd); zfs_dbgmsg("starting removal thread for vdev %llu (%px) in txg %llu " "im_obj=%llu", (u_longlong_t)vd->vdev_id, vd, (u_longlong_t)dmu_tx_get_txg(tx), (u_longlong_t)vic->vic_mapping_object); spa_history_log_internal(spa, "vdev remove started", tx, "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); /* * Setting spa_vdev_removal causes subsequent frees to call * free_from_removing_vdev(). Note that we don't need any locking * because we are the sync thread, and metaslab_free_impl() is only * called from syncing context (potentially from a zio taskq thread, * but in any case only when there are outstanding free i/os, which * there are not). */ ASSERT3P(spa->spa_vdev_removal, ==, NULL); spa->spa_vdev_removal = svr; svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } /* * When we are opening a pool, we must read the mapping for each * indirect vdev in order from most recently removed to least * recently removed. We do this because the blocks for the mapping * of older indirect vdevs may be stored on more recently removed vdevs. * In order to read each indirect mapping object, we must have * initialized all more recently removed vdevs. */ int spa_remove_init(spa_t *spa) { int error; error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_REMOVING, sizeof (uint64_t), sizeof (spa->spa_removing_phys) / sizeof (uint64_t), &spa->spa_removing_phys); if (error == ENOENT) { spa->spa_removing_phys.sr_state = DSS_NONE; spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; spa->spa_indirect_vdevs_loaded = B_TRUE; return (0); } else if (error != 0) { return (error); } if (spa->spa_removing_phys.sr_state == DSS_SCANNING) { /* * We are currently removing a vdev. Create and * initialize a spa_vdev_removal_t from the bonus * buffer of the removing vdevs vdev_im_object, and * initialize its partial mapping. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, spa->spa_removing_phys.sr_removing_vdev); if (vd == NULL) { spa_config_exit(spa, SCL_STATE, FTAG); return (EINVAL); } vdev_indirect_config_t *vic = &vd->vdev_indirect_config; ASSERT(vdev_is_concrete(vd)); spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id); ASSERT(vd->vdev_removing); vd->vdev_indirect_mapping = vdev_indirect_mapping_open( spa->spa_meta_objset, vic->vic_mapping_object); vd->vdev_indirect_births = vdev_indirect_births_open( spa->spa_meta_objset, vic->vic_births_object); spa_config_exit(spa, SCL_STATE, FTAG); spa->spa_vdev_removal = svr; } spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); uint64_t indirect_vdev_id = spa->spa_removing_phys.sr_prev_indirect_vdev; while (indirect_vdev_id != UINT64_MAX) { vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); vd->vdev_indirect_mapping = vdev_indirect_mapping_open( spa->spa_meta_objset, vic->vic_mapping_object); vd->vdev_indirect_births = vdev_indirect_births_open( spa->spa_meta_objset, vic->vic_births_object); indirect_vdev_id = vic->vic_prev_indirect_vdev; } spa_config_exit(spa, SCL_STATE, FTAG); /* * Now that we've loaded all the indirect mappings, we can allow * reads from other blocks (e.g. via predictive prefetch). */ spa->spa_indirect_vdevs_loaded = B_TRUE; return (0); } void spa_restart_removal(spa_t *spa) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; if (svr == NULL) return; /* * In general when this function is called there is no * removal thread running. The only scenario where this * is not true is during spa_import() where this function * is called twice [once from spa_import_impl() and * spa_async_resume()]. Thus, in the scenario where we * import a pool that has an ongoing removal we don't * want to spawn a second thread. */ if (svr->svr_thread != NULL) return; if (!spa_writeable(spa)) return; zfs_dbgmsg("restarting removal of %llu", (u_longlong_t)svr->svr_vdev_id); svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } /* * Process freeing from a device which is in the middle of being removed. * We must handle this carefully so that we attempt to copy freed data, * and we correctly free already-copied data. */ void free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t txg = spa_syncing_txg(spa); uint64_t max_offset_yet = 0; ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, vdev_indirect_mapping_object(vim)); ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); mutex_enter(&svr->svr_lock); /* * Remove the segment from the removing vdev's spacemap. This * ensures that we will not attempt to copy this space (if the * removal thread has not yet visited it), and also ensures * that we know what is actually allocated on the new vdevs * (needed if we cancel the removal). * * Note: we must do the metaslab_free_concrete() with the svr_lock * held, so that the remove_thread can not load this metaslab and then * visit this offset between the time that we metaslab_free_concrete() * and when we check to see if it has been visited. * * Note: The checkpoint flag is set to false as having/taking * a checkpoint and removing a device can't happen at the same * time. */ ASSERT(!spa_has_checkpoint(spa)); metaslab_free_concrete(vd, offset, size, B_FALSE); uint64_t synced_size = 0; uint64_t synced_offset = 0; uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim); if (offset < max_offset_synced) { /* * The mapping for this offset is already on disk. * Free from the new location. * * Note that we use svr_max_synced_offset because it is * updated atomically with respect to the in-core mapping. * By contrast, vim_max_offset is not. * * This block may be split between a synced entry and an * in-flight or unvisited entry. Only process the synced * portion of it here. */ synced_size = MIN(size, max_offset_synced - offset); synced_offset = offset; ASSERT3U(max_offset_yet, <=, max_offset_synced); max_offset_yet = max_offset_synced; DTRACE_PROBE3(remove__free__synced, spa_t *, spa, uint64_t, offset, uint64_t, synced_size); size -= synced_size; offset += synced_size; } /* * Look at all in-flight txgs starting from the currently syncing one * and see if a section of this free is being copied. By starting from * this txg and iterating forward, we might find that this region * was copied in two different txgs and handle it appropriately. */ for (int i = 0; i < TXG_CONCURRENT_STATES; i++) { int txgoff = (txg + i) & TXG_MASK; if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) { /* * The mapping for this offset is in flight, and * will be synced in txg+i. */ uint64_t inflight_size = MIN(size, svr->svr_max_offset_to_sync[txgoff] - offset); DTRACE_PROBE4(remove__free__inflight, spa_t *, spa, uint64_t, offset, uint64_t, inflight_size, uint64_t, txg + i); /* * We copy data in order of increasing offset. * Therefore the max_offset_to_sync[] must increase * (or be zero, indicating that nothing is being * copied in that txg). */ if (svr->svr_max_offset_to_sync[txgoff] != 0) { ASSERT3U(svr->svr_max_offset_to_sync[txgoff], >=, max_offset_yet); max_offset_yet = svr->svr_max_offset_to_sync[txgoff]; } /* * We've already committed to copying this segment: * we have allocated space elsewhere in the pool for * it and have an IO outstanding to copy the data. We * cannot free the space before the copy has * completed, or else the copy IO might overwrite any * new data. To free that space, we record the * segment in the appropriate svr_frees tree and free * the mapped space later, in the txg where we have * completed the copy and synced the mapping (see * vdev_mapping_sync). */ range_tree_add(svr->svr_frees[txgoff], offset, inflight_size); size -= inflight_size; offset += inflight_size; /* * This space is already accounted for as being * done, because it is being copied in txg+i. * However, if i!=0, then it is being copied in * a future txg. If we crash after this txg * syncs but before txg+i syncs, then the space * will be free. Therefore we must account * for the space being done in *this* txg * (when it is freed) rather than the future txg * (when it will be copied). */ ASSERT3U(svr->svr_bytes_done[txgoff], >=, inflight_size); svr->svr_bytes_done[txgoff] -= inflight_size; svr->svr_bytes_done[txg & TXG_MASK] += inflight_size; } } ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]); if (size > 0) { /* * The copy thread has not yet visited this offset. Ensure * that it doesn't. */ DTRACE_PROBE3(remove__free__unvisited, spa_t *, spa, uint64_t, offset, uint64_t, size); if (svr->svr_allocd_segs != NULL) range_tree_clear(svr->svr_allocd_segs, offset, size); /* * Since we now do not need to copy this data, for * accounting purposes we have done our job and can count * it as completed. */ svr->svr_bytes_done[txg & TXG_MASK] += size; } mutex_exit(&svr->svr_lock); /* * Now that we have dropped svr_lock, process the synced portion * of this free. */ if (synced_size > 0) { vdev_indirect_mark_obsolete(vd, synced_offset, synced_size); /* * Note: this can only be called from syncing context, * and the vdev_indirect_mapping is only changed from the * sync thread, so we don't need svr_lock while doing * metaslab_free_impl_cb. */ boolean_t checkpoint = B_FALSE; vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, metaslab_free_impl_cb, &checkpoint); } } /* * Stop an active removal and update the spa_removing phys. */ static void spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa)); /* Ensure the removal thread has completed before we free the svr. */ spa_vdev_remove_suspend(spa); ASSERT(state == DSS_FINISHED || state == DSS_CANCELED); if (state == DSS_FINISHED) { spa_removing_phys_t *srp = &spa->spa_removing_phys; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (srp->sr_prev_indirect_vdev != -1) { vdev_t *pvd; pvd = vdev_lookup_top(spa, srp->sr_prev_indirect_vdev); ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops); } vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev; srp->sr_prev_indirect_vdev = vd->vdev_id; } spa->spa_removing_phys.sr_state = state; spa->spa_removing_phys.sr_end_time = gethrestime_sec(); spa->spa_vdev_removal = NULL; spa_vdev_removal_destroy(svr); spa_sync_removing_state(spa, tx); spa_notify_waiters(spa); vdev_config_dirty(spa->spa_root_vdev); } static void free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) { vdev_t *vd = arg; vdev_indirect_mark_obsolete(vd, offset, size); boolean_t checkpoint = B_FALSE; vdev_indirect_ops.vdev_op_remap(vd, offset, size, metaslab_free_impl_cb, &checkpoint); } /* * On behalf of the removal thread, syncs an incremental bit more of * the indirect mapping to disk and updates the in-memory mapping. * Called as a sync task in every txg that the removal thread makes progress. */ static void vdev_mapping_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; uint64_t txg = dmu_tx_get_txg(tx); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT(vic->vic_mapping_object != 0); ASSERT3U(txg, ==, spa_syncing_txg(spa)); vdev_indirect_mapping_add_entries(vim, &svr->svr_new_segments[txg & TXG_MASK], tx); vdev_indirect_births_add_entry(vd->vdev_indirect_births, vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); /* * Free the copied data for anything that was freed while the * mapping entries were in flight. */ mutex_enter(&svr->svr_lock); range_tree_vacate(svr->svr_frees[txg & TXG_MASK], free_mapped_segment_cb, vd); ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, vdev_indirect_mapping_max_offset(vim)); svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; mutex_exit(&svr->svr_lock); spa_sync_removing_state(spa, tx); } typedef struct vdev_copy_segment_arg { spa_t *vcsa_spa; dva_t *vcsa_dest_dva; uint64_t vcsa_txg; range_tree_t *vcsa_obsolete_segs; } vdev_copy_segment_arg_t; static void unalloc_seg(void *arg, uint64_t start, uint64_t size) { vdev_copy_segment_arg_t *vcsa = arg; spa_t *spa = vcsa->vcsa_spa; blkptr_t bp = { { { {0} } } }; BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(&bp, size); BP_SET_PSIZE(&bp, size); BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(&bp, DMU_OT_NONE); BP_SET_LEVEL(&bp, 0); BP_SET_DEDUP(&bp, 0); BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER); DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva)); DVA_SET_OFFSET(&bp.blk_dva[0], DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start); DVA_SET_ASIZE(&bp.blk_dva[0], size); zio_free(spa, vcsa->vcsa_txg, &bp); } /* * All reads and writes associated with a call to spa_vdev_copy_segment() * are done. */ static void spa_vdev_copy_segment_done(zio_t *zio) { vdev_copy_segment_arg_t *vcsa = zio->io_private; range_tree_vacate(vcsa->vcsa_obsolete_segs, unalloc_seg, vcsa); range_tree_destroy(vcsa->vcsa_obsolete_segs); kmem_free(vcsa, sizeof (*vcsa)); spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); } /* * The write of the new location is done. */ static void spa_vdev_copy_segment_write_done(zio_t *zio) { vdev_copy_arg_t *vca = zio->io_private; abd_free(zio->io_abd); mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes -= zio->io_size; if (zio->io_error != 0) vca->vca_write_error_bytes += zio->io_size; cv_signal(&vca->vca_cv); mutex_exit(&vca->vca_lock); } /* * The read of the old location is done. The parent zio is the write to * the new location. Allow it to start. */ static void spa_vdev_copy_segment_read_done(zio_t *zio) { vdev_copy_arg_t *vca = zio->io_private; if (zio->io_error != 0) { mutex_enter(&vca->vca_lock); vca->vca_read_error_bytes += zio->io_size; mutex_exit(&vca->vca_lock); } zio_nowait(zio_unique_parent(zio)); } /* * If the old and new vdevs are mirrors, we will read both sides of the old * mirror, and write each copy to the corresponding side of the new mirror. * If the old and new vdevs have a different number of children, we will do * this as best as possible. Since we aren't verifying checksums, this * ensures that as long as there's a good copy of the data, we'll have a * good copy after the removal, even if there's silent damage to one side * of the mirror. If we're removing a mirror that has some silent damage, * we'll have exactly the same damage in the new location (assuming that * the new location is also a mirror). * * We accomplish this by creating a tree of zio_t's, with as many writes as * there are "children" of the new vdev (a non-redundant vdev counts as one * child, a 2-way mirror has 2 children, etc). Each write has an associated * read from a child of the old vdev. Typically there will be the same * number of children of the old and new vdevs. However, if there are more * children of the new vdev, some child(ren) of the old vdev will be issued * multiple reads. If there are more children of the old vdev, some copies * will be dropped. * * For example, the tree of zio_t's for a 2-way mirror is: * * null * / \ * write(new vdev, child 0) write(new vdev, child 1) * | | * read(old vdev, child 0) read(old vdev, child 1) * * Child zio's complete before their parents complete. However, zio's * created with zio_vdev_child_io() may be issued before their children * complete. In this case we need to make sure that the children (reads) * complete before the parents (writes) are *issued*. We do this by not * calling zio_nowait() on each write until its corresponding read has * completed. * * The spa_config_lock must be held while zio's created by * zio_vdev_child_io() are in progress, to ensure that the vdev tree does * not change (e.g. due to a concurrent "zpool attach/detach"). The "null" * zio is needed to release the spa_config_lock after all the reads and * writes complete. (Note that we can't grab the config lock for each read, * because it is not reentrant - we could deadlock with a thread waiting * for a write lock.) */ static void spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, vdev_t *source_vd, uint64_t source_offset, vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size) { ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0); /* * If the destination child in unwritable then there is no point * in issuing the source reads which cannot be written. */ if (!vdev_writeable(dest_child_vd)) return; mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes += size; mutex_exit(&vca->vca_lock); abd_t *abd = abd_alloc_for_io(size, B_FALSE); vdev_t *source_child_vd = NULL; if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) { /* * Source and dest are both mirrors. Copy from the same * child id as we are copying to (wrapping around if there * are more dest children than source children). If the * preferred source child is unreadable select another. */ for (int i = 0; i < source_vd->vdev_children; i++) { source_child_vd = source_vd->vdev_child[ (dest_id + i) % source_vd->vdev_children]; if (vdev_readable(source_child_vd)) break; } } else { source_child_vd = source_vd; } /* * There should always be at least one readable source child or * the pool would be in a suspended state. Somehow selecting an * unreadable child would result in IO errors, the removal process * being cancelled, and the pool reverting to its pre-removal state. */ ASSERT3P(source_child_vd, !=, NULL); zio_t *write_zio = zio_vdev_child_io(nzio, NULL, dest_child_vd, dest_offset, abd, size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, spa_vdev_copy_segment_write_done, vca); zio_nowait(zio_vdev_child_io(write_zio, NULL, source_child_vd, source_offset, abd, size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, spa_vdev_copy_segment_read_done, vca)); } /* * Allocate a new location for this segment, and create the zio_t's to * read from the old location and write to the new location. */ static int spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, uint64_t maxalloc, uint64_t txg, vdev_copy_arg_t *vca, zio_alloc_list_t *zal) { metaslab_group_t *mg = vd->vdev_mg; spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_entry_t *entry; dva_t dst = {{ 0 }}; uint64_t start = range_tree_min(segs); ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift)); ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift)); uint64_t size = range_tree_span(segs); if (range_tree_span(segs) > maxalloc) { /* * We can't allocate all the segments. Prefer to end * the allocation at the end of a segment, thus avoiding * additional split blocks. */ range_seg_max_t search; zfs_btree_index_t where; rs_set_start(&search, segs, start + maxalloc); rs_set_end(&search, segs, start + maxalloc); (void) zfs_btree_find(&segs->rt_root, &search, &where); range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where, &where); if (rs != NULL) { size = rs_get_end(rs, segs) - start; } else { /* * There are no segments that end before maxalloc. * I.e. the first segment is larger than maxalloc, * so we must split it. */ size = maxalloc; } } ASSERT3U(size, <=, maxalloc); ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift)); /* * An allocation class might not have any remaining vdevs or space */ metaslab_class_t *mc = mg->mg_class; if (mc->mc_groups == 0) mc = spa_normal_class(spa); int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0); if (error == ENOSPC && mc != spa_normal_class(spa)) { error = metaslab_alloc_dva(spa, spa_normal_class(spa), size, &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0); } if (error != 0) return (error); /* * Determine the ranges that are not actually needed. Offsets are * relative to the start of the range to be copied (i.e. relative to the * local variable "start"). */ range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); zfs_btree_index_t where; range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where); ASSERT3U(rs_get_start(rs, segs), ==, start); uint64_t prev_seg_end = rs_get_end(rs, segs); while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) { if (rs_get_start(rs, segs) >= start + size) { break; } else { range_tree_add(obsolete_segs, prev_seg_end - start, rs_get_start(rs, segs) - prev_seg_end); } prev_seg_end = rs_get_end(rs, segs); } /* We don't end in the middle of an obsolete range */ ASSERT3U(start + size, <=, prev_seg_end); range_tree_clear(segs, start, size); /* * We can't have any padding of the allocated size, otherwise we will * misunderstand what's allocated, and the size of the mapping. We * prevent padding by ensuring that all devices in the pool have the * same ashift, and the allocation size is a multiple of the ashift. */ VERIFY3U(DVA_GET_ASIZE(&dst), ==, size); entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); entry->vime_mapping.vimep_dst = dst; if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { entry->vime_obsolete_count = range_tree_space(obsolete_segs); } vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP); vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; vcsa->vcsa_obsolete_segs = obsolete_segs; vcsa->vcsa_spa = spa; vcsa->vcsa_txg = txg; /* * See comment before spa_vdev_copy_one_child(). */ spa_config_enter(spa, SCL_STATE, spa, RW_READER); zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL, spa_vdev_copy_segment_done, vcsa, 0); vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst)); if (dest_vd->vdev_ops == &vdev_mirror_ops) { for (int i = 0; i < dest_vd->vdev_children; i++) { vdev_t *child = dest_vd->vdev_child[i]; spa_vdev_copy_one_child(vca, nzio, vd, start, child, DVA_GET_OFFSET(&dst), i, size); } } else { spa_vdev_copy_one_child(vca, nzio, vd, start, dest_vd, DVA_GET_OFFSET(&dst), -1, size); } zio_nowait(nzio); list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); vdev_dirty(vd, 0, NULL, txg); return (0); } /* * Complete the removal of a toplevel vdev. This is called as a * synctask in the same txg that we will sync out the new config (to the * MOS object) which indicates that this vdev is indirect. */ static void vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(svr->svr_bytes_done[i]); } ASSERT3U(spa->spa_removing_phys.sr_copied, ==, spa->spa_removing_phys.sr_to_copy); vdev_destroy_spacemaps(vd, tx); /* destroy leaf zaps, if any */ ASSERT3P(svr->svr_zaplist, !=, NULL); for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL); pair != NULL; pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) { vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx); } fnvlist_free(svr->svr_zaplist); spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx); /* vd->vdev_path is not available here */ spa_history_log_internal(spa, "vdev remove completed", tx, "%s vdev %llu", spa_name(spa), (u_longlong_t)vd->vdev_id); } static void vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) { ASSERT3P(zlist, !=, NULL); ASSERT0(vdev_get_nparity(vd)); if (vd->vdev_leaf_zap != 0) { char zkey[32]; (void) snprintf(zkey, sizeof (zkey), "%s-%llu", VDEV_REMOVAL_ZAP_OBJS, (u_longlong_t)vd->vdev_leaf_zap); fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap); } for (uint64_t id = 0; id < vd->vdev_children; id++) { vdev_remove_enlist_zaps(vd->vdev_child[id], zlist); } } static void vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) { vdev_t *ivd; dmu_tx_t *tx; spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; /* * First, build a list of leaf zaps to be destroyed. * This is passed to the sync context thread, * which does the actual unlinking. */ svr->svr_zaplist = fnvlist_alloc(); vdev_remove_enlist_zaps(vd, svr->svr_zaplist); ivd = vdev_add_parent(vd, &vdev_indirect_ops); ivd->vdev_removing = 0; vd->vdev_leaf_zap = 0; vdev_remove_child(ivd, vd); vdev_compact_children(ivd); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); mutex_enter(&svr->svr_lock); svr->svr_thread = NULL; cv_broadcast(&svr->svr_cv); mutex_exit(&svr->svr_lock); /* After this, we can not use svr. */ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, tx); dmu_tx_commit(tx); } /* * Complete the removal of a toplevel vdev. This is called in open * context by the removal thread after we have copied all vdev's data. */ static void vdev_remove_complete(spa_t *spa) { uint64_t txg; /* * Wait for any deferred frees to be synced before we call * vdev_metaslab_fini() */ txg_wait_synced(spa->spa_dsl_pool, 0); txg = spa_vdev_enter(spa); vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); vdev_rebuild_stop_wait(vd); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); uint64_t vdev_space = spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)txg); ASSERT3U(0, !=, vdev_space); ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space); /* the vdev is no longer part of the dspace */ spa->spa_nonallocating_dspace -= vdev_space; /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); metaslab_group_destroy(vd->vdev_log_mg); vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); vdev_remove_replace_with_indirect(vd, txg); /* * We now release the locks, allowing spa_sync to run and finish the * removal via vdev_remove_complete_sync in syncing context. * * Note that we hold on to the vdev_t that has been replaced. Since * it isn't part of the vdev tree any longer, it can't be concurrently * manipulated, even while we don't have the config lock. */ (void) spa_vdev_exit(spa, NULL, txg, 0); /* * Top ZAP should have been transferred to the indirect vdev in * vdev_remove_replace_with_indirect. */ ASSERT0(vd->vdev_top_zap); /* * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect. */ ASSERT0(vd->vdev_leaf_zap); txg = spa_vdev_enter(spa); (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Request to update the config and the config cachefile. */ vdev_config_dirty(spa->spa_root_vdev); (void) spa_vdev_exit(spa, vd, txg, 0); if (ev != NULL) spa_event_post(ev); } /* * Evacuates a segment of size at most max_alloc from the vdev * via repeated calls to spa_vdev_copy_segment. If an allocation * fails, the pool is probably too fragmented to handle such a * large size, so decrease max_alloc so that the caller will not try * this size again this txg. */ static void spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, uint64_t *max_alloc, dmu_tx_t *tx) { uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = dmu_tx_pool(tx)->dp_spa; mutex_enter(&svr->svr_lock); /* * Determine how big of a chunk to copy. We can allocate up * to max_alloc bytes, and we can span up to vdev_removal_max_span * bytes of unallocated space at a time. "segs" will track the * allocated segments that we are copying. We may also be copying * free segments (of up to vdev_removal_max_span bytes). */ range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); for (;;) { range_tree_t *rt = svr->svr_allocd_segs; range_seg_t *rs = range_tree_first(rt); if (rs == NULL) break; uint64_t seg_length; if (range_tree_is_empty(segs)) { /* need to truncate the first seg based on max_alloc */ seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs, rt), *max_alloc); } else { if (rs_get_start(rs, rt) - range_tree_max(segs) > vdev_removal_max_span) { /* * Including this segment would cause us to * copy a larger unneeded chunk than is allowed. */ break; } else if (rs_get_end(rs, rt) - range_tree_min(segs) > *max_alloc) { /* * This additional segment would extend past * max_alloc. Rather than splitting this * segment, leave it for the next mapping. */ break; } else { seg_length = rs_get_end(rs, rt) - rs_get_start(rs, rt); } } range_tree_add(segs, rs_get_start(rs, rt), seg_length); range_tree_remove(svr->svr_allocd_segs, rs_get_start(rs, rt), seg_length); } if (range_tree_is_empty(segs)) { mutex_exit(&svr->svr_lock); range_tree_destroy(segs); return; } if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, svr, tx); } svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs); /* * Note: this is the amount of *allocated* space * that we are taking care of each txg. */ svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs); mutex_exit(&svr->svr_lock); zio_alloc_list_t zal; metaslab_trace_init(&zal); uint64_t thismax = SPA_MAXBLOCKSIZE; while (!range_tree_is_empty(segs)) { int error = spa_vdev_copy_segment(vd, segs, thismax, txg, vca, &zal); if (error == ENOSPC) { /* * Cut our segment in half, and don't try this * segment size again this txg. Note that the * allocation size must be aligned to the highest * ashift in the pool, so that the allocation will * not be padded out to a multiple of the ashift, * which could cause us to think that this mapping * is larger than we intended. */ ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); uint64_t attempted = MIN(range_tree_span(segs), thismax); thismax = P2ROUNDUP(attempted / 2, 1 << spa->spa_max_ashift); /* * The minimum-size allocation can not fail. */ ASSERT3U(attempted, >, 1 << spa->spa_max_ashift); *max_alloc = attempted - (1 << spa->spa_max_ashift); } else { ASSERT0(error); /* * We've performed an allocation, so reset the * alloc trace list. */ metaslab_trace_fini(&zal); metaslab_trace_init(&zal); } } metaslab_trace_fini(&zal); range_tree_destroy(segs); } /* * The size of each removal mapping is limited by the tunable * zfs_remove_max_segment, but we must adjust this to be a multiple of the * pool's ashift, so that we don't try to split individual sectors regardless * of the tunable value. (Note that device removal requires that all devices * have the same ashift, so there's no difference between spa_min_ashift and * spa_max_ashift.) The raw tunable should not be used elsewhere. */ uint64_t spa_remove_max_segment(spa_t *spa) { return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift)); } /* * The removal thread operates in open context. It iterates over all * allocated space in the vdev, by loading each metaslab's spacemap. * For each contiguous segment of allocated space (capping the segment * size at SPA_MAXBLOCKSIZE), we: * - Allocate space for it on another vdev. * - Create a new mapping from the old location to the new location * (as a record in svr_new_segments). * - Initiate a physical read zio to get the data off the removing disk. * - In the read zio's done callback, initiate a physical write zio to * write it to the new vdev. * Note that all of this will take effect when a particular TXG syncs. * The sync thread ensures that all the phys reads and writes for the syncing * TXG have completed (see spa_txg_zio) and writes the new mappings to disk * (see vdev_mapping_sync()). */ static __attribute__((noreturn)) void spa_vdev_remove_thread(void *arg) { spa_t *spa = arg; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_copy_arg_t vca; uint64_t max_alloc = spa_remove_max_segment(spa); uint64_t last_txg = 0; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_removing); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT(vim != NULL); mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL); vca.vca_outstanding_bytes = 0; vca.vca_read_error_bytes = 0; vca.vca_write_error_bytes = 0; mutex_enter(&svr->svr_lock); /* * Start from vim_max_offset so we pick up where we left off * if we are restarting the removal after opening the pool. */ uint64_t msi; for (msi = start_offset >> vd->vdev_ms_shift; msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; ASSERT3U(msi, <=, vd->vdev_ms_count); ASSERT0(range_tree_space(svr->svr_allocd_segs)); mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * Assert nothing in flight -- ms_*tree is empty. */ for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(range_tree_space(msp->ms_allocating[i])); } /* * If the metaslab has ever been allocated from (ms_sm!=NULL), * read the allocated segments from the space map object * into svr_allocd_segs. Since we do this while holding * svr_lock and ms_sync_lock, concurrent frees (which * would have modified the space map) will wait for us * to finish loading the spacemap, and then take the * appropriate action (see free_from_removing_vdev()). */ if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); range_tree_walk(msp->ms_unflushed_allocs, range_tree_add, svr->svr_allocd_segs); range_tree_walk(msp->ms_unflushed_frees, range_tree_remove, svr->svr_allocd_segs); range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); /* * When we are resuming from a paused removal (i.e. * when importing a pool with a removal in progress), * discard any state that we have already processed. */ range_tree_clear(svr->svr_allocd_segs, 0, start_offset); } mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_sync_lock); vca.vca_msp = msp; zfs_dbgmsg("copying %llu segments for metaslab %llu", (u_longlong_t)zfs_btree_numnodes( &svr->svr_allocd_segs->rt_root), (u_longlong_t)msp->ms_id); while (!svr->svr_thread_exit && !range_tree_is_empty(svr->svr_allocd_segs)) { mutex_exit(&svr->svr_lock); /* * We need to periodically drop the config lock so that * writers can get in. Additionally, we can't wait * for a txg to sync while holding a config lock * (since a waiting writer could cause a 3-way deadlock * with the sync thread, which also gets a config * lock for reader). So we can't hold the config lock * while calling dmu_tx_assign(). */ spa_config_exit(spa, SCL_CONFIG, FTAG); /* * This delay will pause the removal around the point * specified by zfs_removal_suspend_progress. We do this * solely from the test suite or during debugging. */ while (zfs_removal_suspend_progress && !svr->svr_thread_exit) delay(hz); mutex_enter(&vca.vca_lock); while (vca.vca_outstanding_bytes > zfs_remove_max_copy_bytes) { cv_wait(&vca.vca_cv, &vca.vca_lock); } mutex_exit(&vca.vca_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); /* * Reacquire the vdev_config lock. The vdev_t * that we're removing may have changed, e.g. due * to a vdev_attach or vdev_detach. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vd = vdev_lookup_top(spa, svr->svr_vdev_id); if (txg != last_txg) max_alloc = spa_remove_max_segment(spa); last_txg = txg; spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); dmu_tx_commit(tx); mutex_enter(&svr->svr_lock); } mutex_enter(&vca.vca_lock); if (zfs_removal_ignore_errors == 0 && (vca.vca_read_error_bytes > 0 || vca.vca_write_error_bytes > 0)) { svr->svr_thread_exit = B_TRUE; } mutex_exit(&vca.vca_lock); } mutex_exit(&svr->svr_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); /* * Wait for all copies to finish before cleaning up the vca. */ txg_wait_synced(spa->spa_dsl_pool, 0); ASSERT0(vca.vca_outstanding_bytes); mutex_destroy(&vca.vca_lock); cv_destroy(&vca.vca_cv); if (svr->svr_thread_exit) { mutex_enter(&svr->svr_lock); range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); svr->svr_thread = NULL; cv_broadcast(&svr->svr_cv); mutex_exit(&svr->svr_lock); /* * During the removal process an unrecoverable read or write * error was encountered. The removal process must be * cancelled or this damage may become permanent. */ if (zfs_removal_ignore_errors == 0 && (vca.vca_read_error_bytes > 0 || vca.vca_write_error_bytes > 0)) { zfs_dbgmsg("canceling removal due to IO errors: " "[read_error_bytes=%llu] [write_error_bytes=%llu]", (u_longlong_t)vca.vca_read_error_bytes, (u_longlong_t)vca.vca_write_error_bytes); spa_vdev_remove_cancel_impl(spa); } } else { ASSERT0(range_tree_space(svr->svr_allocd_segs)); vdev_remove_complete(spa); } thread_exit(); } void spa_vdev_remove_suspend(spa_t *spa) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; if (svr == NULL) return; mutex_enter(&svr->svr_lock); svr->svr_thread_exit = B_TRUE; while (svr->svr_thread != NULL) cv_wait(&svr->svr_cv, &svr->svr_lock); svr->svr_thread_exit = B_FALSE; mutex_exit(&svr->svr_lock); } /* * Return true if the "allocating" property has been set to "off" */ static boolean_t vdev_prop_allocating_off(vdev_t *vd) { uint64_t objid = vd->vdev_top_zap; uint64_t allocating = 1; /* no vdev property object => no props */ if (objid != 0) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; mutex_enter(&spa->spa_props_lock); (void) zap_lookup(mos, objid, "allocating", sizeof (uint64_t), 1, &allocating); mutex_exit(&spa->spa_props_lock); } return (allocating == 0); } static int spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) { (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (spa->spa_vdev_removal == NULL) return (ENOTACTIVE); return (0); } /* * Cancel a removal by freeing all entries from the partial mapping * and marking the vdev as no longer being removing. */ static void spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) { (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; objset_t *mos = spa->spa_meta_objset; ASSERT3P(svr->svr_thread, ==, NULL); spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); boolean_t are_precise; VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (are_precise) { spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); } uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); space_map_free(vd->vdev_obsolete_sm, tx); VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } for (int i = 0; i < TXG_SIZE; i++) { ASSERT(list_is_empty(&svr->svr_new_segments[i])); ASSERT3U(svr->svr_max_offset_to_sync[i], <=, vdev_indirect_mapping_max_offset(vim)); } for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) break; ASSERT0(range_tree_space(svr->svr_allocd_segs)); mutex_enter(&msp->ms_lock); /* * Assert nothing in flight -- ms_*tree is empty. */ for (int i = 0; i < TXG_SIZE; i++) ASSERT0(range_tree_space(msp->ms_allocating[i])); for (int i = 0; i < TXG_DEFER_SIZE; i++) ASSERT0(range_tree_space(msp->ms_defer[i])); ASSERT0(range_tree_space(msp->ms_freed)); if (msp->ms_sm != NULL) { mutex_enter(&svr->svr_lock); VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); range_tree_walk(msp->ms_unflushed_allocs, range_tree_add, svr->svr_allocd_segs); range_tree_walk(msp->ms_unflushed_frees, range_tree_remove, svr->svr_allocd_segs); range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); /* * Clear everything past what has been synced, * because we have not allocated mappings for it yet. */ uint64_t syncd = vdev_indirect_mapping_max_offset(vim); uint64_t sm_end = msp->ms_sm->sm_start + msp->ms_sm->sm_size; if (sm_end > syncd) range_tree_clear(svr->svr_allocd_segs, syncd, sm_end - syncd); mutex_exit(&svr->svr_lock); } mutex_exit(&msp->ms_lock); mutex_enter(&svr->svr_lock); range_tree_vacate(svr->svr_allocd_segs, free_mapped_segment_cb, vd); mutex_exit(&svr->svr_lock); } /* * Note: this must happen after we invoke free_mapped_segment_cb, * because it adds to the obsolete_segments. */ range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); ASSERT3U(vic->vic_mapping_object, ==, vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vd->vdev_indirect_mapping = NULL; vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); vic->vic_mapping_object = 0; ASSERT3U(vic->vic_births_object, ==, vdev_indirect_births_object(vd->vdev_indirect_births)); vdev_indirect_births_close(vd->vdev_indirect_births); vd->vdev_indirect_births = NULL; vdev_indirect_births_free(mos, vic->vic_births_object, tx); vic->vic_births_object = 0; /* * We may have processed some frees from the removing vdev in this * txg, thus increasing svr_bytes_done; discard that here to * satisfy the assertions in spa_vdev_removal_destroy(). * Note that future txg's can not have any bytes_done, because * future TXG's are only modified from open context, and we have * already shut down the copying thread. */ svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; spa_finish_removal(spa, DSS_CANCELED, tx); vd->vdev_removing = B_FALSE; if (!vdev_prop_allocating_off(vd)) { spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); vdev_activate(vd); spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); } vdev_config_dirty(vd); zfs_dbgmsg("canceled device removal for vdev %llu in %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx)); spa_history_log_internal(spa, "vdev remove canceled", tx, "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); } static int spa_vdev_remove_cancel_impl(spa_t *spa) { int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED); return (error); } int spa_vdev_remove_cancel(spa_t *spa) { spa_vdev_remove_suspend(spa); if (spa->spa_vdev_removal == NULL) return (ENOTACTIVE); return (spa_vdev_remove_cancel_impl(spa)); } void svr_sync(spa_t *spa, dmu_tx_t *tx) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; if (svr == NULL) return; /* * This check is necessary so that we do not dirty the * DIRECTORY_OBJECT via spa_sync_removing_state() when there * is nothing to do. Dirtying it every time would prevent us * from syncing-to-convergence. */ if (svr->svr_bytes_done[txgoff] == 0) return; /* * Update progress accounting. */ spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; svr->svr_bytes_done[txgoff] = 0; spa_sync_removing_state(spa, tx); } static void vdev_remove_make_hole_and_free(vdev_t *vd) { uint64_t id = vd->vdev_id; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); vdev_free(vd); vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); vdev_add_child(rvd, vd); vdev_config_dirty(rvd); /* * Reassess the health of our root vdev. */ vdev_reopen(rvd); } /* * Remove a log device. The config lock is held for the specified TXG. */ static int spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) { metaslab_group_t *mg = vd->vdev_mg; spa_t *spa = vd->vdev_spa; int error = 0; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3P(vd->vdev_log_mg, ==, NULL); ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Stop allocating from this vdev. */ metaslab_group_passivate(mg); /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. */ spa_vdev_config_exit(spa, NULL, *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); /* * Cancel any initialize or TRIM which was in progress. */ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED); vdev_autotrim_stop_wait(vd); /* * Evacuate the device. We don't hold the config lock as * writer since we need to do I/O but we do keep the * spa_namespace_lock held. Once this completes the device * should no longer have any blocks allocated on it. */ ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (vd->vdev_stat.vs_alloc != 0) error = spa_reset_logs(spa); *txg = spa_vdev_config_enter(spa); if (error != 0) { metaslab_group_activate(mg); ASSERT3P(vd->vdev_log_mg, ==, NULL); return (error); } ASSERT0(vd->vdev_stat.vs_alloc); /* * The evacuation succeeded. Remove any remaining MOS metadata * associated with this vdev, and wait for these changes to sync. */ vd->vdev_removing = B_TRUE; vdev_dirty_leaves(vd, VDD_DTL, *txg); vdev_config_dirty(vd); /* * When the log space map feature is enabled we look at * the vdev's top_zap to find the on-disk flush data of * the metaslab we just flushed. Thus, while removing a * log vdev we make sure to call vdev_metaslab_fini() * first, which removes all metaslabs of this vdev from * spa_metaslabs_by_flushed before vdev_remove_empty() * destroys the top_zap of this log vdev. * * This avoids the scenario where we flush a metaslab * from the log vdev being removed that doesn't have a * top_zap and end up failing to lookup its on-disk flush * data. * * We don't call metaslab_group_destroy() right away * though (it will be called in vdev_free() later) as * during metaslab_sync() of metaslabs from other vdevs * we may touch the metaslab group of this vdev through * metaslab_class_histogram_verify() */ vdev_metaslab_fini(vd); spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); *txg = spa_vdev_config_enter(spa); sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* The top ZAP should have been destroyed by vdev_remove_empty. */ ASSERT0(vd->vdev_top_zap); /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */ ASSERT0(vd->vdev_leaf_zap); (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); if (list_link_active(&vd->vdev_state_dirty_node)) vdev_state_clean(vd); if (list_link_active(&vd->vdev_config_dirty_node)) vdev_config_clean(vd); ASSERT0(vd->vdev_stat.vs_alloc); /* * Clean up the vdev namespace. */ vdev_remove_make_hole_and_free(vd); if (ev != NULL) spa_event_post(ev); return (0); } static int spa_vdev_remove_top_check(vdev_t *vd) { spa_t *spa = vd->vdev_spa; if (vd != vd->vdev_top) return (SET_ERROR(ENOTSUP)); if (!vdev_is_concrete(vd)) return (SET_ERROR(ENOTSUP)); if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) return (SET_ERROR(ENOTSUP)); /* * This device is already being removed */ if (vd->vdev_removing) return (SET_ERROR(EALREADY)); metaslab_class_t *mc = vd->vdev_mg->mg_class; metaslab_class_t *normal = spa_normal_class(spa); if (mc != normal) { /* * Space allocated from the special (or dedup) class is * included in the DMU's space usage, but it's not included * in spa_dspace (or dsl_pool_adjustedsize()). Therefore * there is always at least as much free space in the normal * class, as is allocated from the special (and dedup) class. * As a backup check, we will return ENOSPC if this is * violated. See also spa_update_dspace(). */ uint64_t available = metaslab_class_get_space(normal) - metaslab_class_get_alloc(normal); ASSERT3U(available, >=, vd->vdev_stat.vs_alloc); if (available < vd->vdev_stat.vs_alloc) return (SET_ERROR(ENOSPC)); } else if (!vd->vdev_noalloc) { /* available space in the pool's normal class */ uint64_t available = dsl_dir_space_available( spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); if (available < vd->vdev_stat.vs_dspace) return (SET_ERROR(ENOSPC)); } /* * There can not be a removal in progress. */ if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(EBUSY)); /* * The device must have all its data. */ if (!vdev_dtl_empty(vd, DTL_MISSING) || !vdev_dtl_empty(vd, DTL_OUTAGE)) return (SET_ERROR(EBUSY)); /* * The device must be healthy. */ if (!vdev_readable(vd)) return (SET_ERROR(EIO)); /* * All vdevs in normal class must have the same ashift. */ if (spa->spa_max_ashift != spa->spa_min_ashift) { return (SET_ERROR(EINVAL)); } /* * A removed special/dedup vdev must have same ashift as normal class. */ ASSERT(!vd->vdev_islog); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE && vd->vdev_ashift != spa->spa_max_ashift) { return (SET_ERROR(EINVAL)); } /* * All vdevs in normal class must have the same ashift * and not be raidz or draid. */ vdev_t *rvd = spa->spa_root_vdev; for (uint64_t id = 0; id < rvd->vdev_children; id++) { vdev_t *cvd = rvd->vdev_child[id]; /* * A removed special/dedup vdev must have the same ashift * across all vdevs in its class. */ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE && cvd->vdev_alloc_bias == vd->vdev_alloc_bias && cvd->vdev_ashift != vd->vdev_ashift) { return (SET_ERROR(EINVAL)); } if (cvd->vdev_ashift != 0 && cvd->vdev_alloc_bias == VDEV_BIAS_NONE) ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); if (!vdev_is_concrete(cvd)) continue; if (vdev_get_nparity(cvd) != 0) return (SET_ERROR(EINVAL)); /* * Need the mirror to be mirror of leaf vdevs only */ if (cvd->vdev_ops == &vdev_mirror_ops) { for (uint64_t cid = 0; cid < cvd->vdev_children; cid++) { if (!cvd->vdev_child[cid]->vdev_ops-> vdev_op_leaf) return (SET_ERROR(EINVAL)); } } } return (0); } /* * Initiate removal of a top-level vdev, reducing the total space in the pool. * The config lock is held for the specified TXG. Once initiated, * evacuation of all allocated space (copying it to other vdevs) happens * in the background (see spa_vdev_remove_thread()), and can be canceled * (see spa_vdev_remove_cancel()). If successful, the vdev will * be transformed to an indirect vdev (see spa_vdev_remove_complete()). */ static int spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) { spa_t *spa = vd->vdev_spa; boolean_t set_noalloc = B_FALSE; int error; /* * Check for errors up-front, so that we don't waste time * passivating the metaslab group and clearing the ZIL if there * are errors. */ error = spa_vdev_remove_top_check(vd); /* * Stop allocating from this vdev. Note that we must check * that this is not the only device in the pool before * passivating, otherwise we will not be able to make * progress because we can't allocate from any vdevs. * The above check for sufficient free space serves this * purpose. */ if (error == 0 && !vd->vdev_noalloc) { set_noalloc = B_TRUE; error = vdev_passivate(vd, txg); } if (error != 0) return (error); /* * We stop any initializing and TRIM that is currently in progress * but leave the state as "active". This will allow the process to * resume if the removal is canceled sometime later. */ spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_wait(vd); *txg = spa_vdev_config_enter(spa); /* * Things might have changed while the config lock was dropped * (e.g. space usage). Check for errors again. */ error = spa_vdev_remove_top_check(vd); if (error != 0) { if (set_noalloc) vdev_activate(vd); spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); return (error); } vd->vdev_removing = B_TRUE; vdev_dirty_leaves(vd, VDD_DTL, *txg); vdev_config_dirty(vd); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx); dmu_tx_commit(tx); return (0); } /* * Remove a device from the pool. * * Removing a device from the vdev namespace requires several steps * and can take a significant amount of time. As a result we use * the spa_vdev_config_[enter/exit] functions which allow us to * grab and release the spa_config_lock while still holding the namespace * lock. During each step the configuration is synced out. */ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; nvlist_t **spares, **l2cache, *nv; uint64_t txg = 0; uint_t nspares, nl2cache; int error = 0, error_log; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); sysevent_t *ev = NULL; const char *vd_type = NULL; char *vd_path = NULL; ASSERT(spa_writeable(spa)); if (!locked) txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; if (!locked) return (spa_vdev_exit(spa, NULL, txg, error)); return (error); } vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (spa->spa_spares.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { /* * Only remove the hot spare if it's not currently in use * in this pool. */ if (vd == NULL || unspare) { const char *type; boolean_t draid_spare = B_FALSE; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) draid_spare = B_TRUE; if (vd == NULL && draid_spare) { error = SET_ERROR(ENOTSUP); } else { if (vd == NULL) vd = spa_lookup_by_guid(spa, guid, B_TRUE); ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); vd_type = VDEV_TYPE_SPARE; vd_path = spa_strdup(fnvlist_lookup_string( nv, ZPOOL_CONFIG_PATH)); spa_vdev_remove_aux(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares, nv); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } } else { error = SET_ERROR(EBUSY); } } else if (spa->spa_l2cache.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { vd_type = VDEV_TYPE_L2CACHE; vd_path = spa_strdup(fnvlist_lookup_string( nv, ZPOOL_CONFIG_PATH)); /* * Cache devices can always be removed. */ vd = spa_lookup_by_guid(spa, guid, B_TRUE); /* * Stop trimming the cache device. We need to release the * config lock to allow the syncing of TRIM transactions * without releasing the spa_namespace_lock. The same * strategy is employed in spa_vdev_remove_top(). */ spa_vdev_config_exit(spa, NULL, txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); mutex_exit(&vd->vdev_trim_lock); txg = spa_vdev_config_enter(spa); ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } else if (vd != NULL && vd->vdev_islog) { ASSERT(!locked); vd_type = VDEV_TYPE_LOG; vd_path = spa_strdup((vd->vdev_path != NULL) ? vd->vdev_path : "-"); error = spa_vdev_remove_log(vd, &txg); } else if (vd != NULL) { ASSERT(!locked); error = spa_vdev_remove_top(vd, &txg); } else { /* * There is no vdev of any kind with the specified guid. */ error = SET_ERROR(ENOENT); } error_log = error; if (!locked) error = spa_vdev_exit(spa, NULL, txg, error); /* * Logging must be done outside the spa config lock. Otherwise, * this code path could end up holding the spa config lock while * waiting for a txg_sync so it can write to the internal log. * Doing that would prevent the txg sync from actually happening, * causing a deadlock. */ if (error_log == 0 && vd_type != NULL && vd_path != NULL) { spa_history_log_internal(spa, "vdev remove", NULL, "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path); } if (vd_path != NULL) spa_strfree(vd_path); if (ev != NULL) spa_event_post(ev); return (error); } int spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) { prs->prs_state = spa->spa_removing_phys.sr_state; if (prs->prs_state == DSS_NONE) return (SET_ERROR(ENOENT)); prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; prs->prs_start_time = spa->spa_removing_phys.sr_start_time; prs->prs_end_time = spa->spa_removing_phys.sr_end_time; prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; prs->prs_copied = spa->spa_removing_phys.sr_copied; prs->prs_mapping_memory = 0; uint64_t indirect_vdev_id = spa->spa_removing_phys.sr_prev_indirect_vdev; while (indirect_vdev_id != -1) { vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); indirect_vdev_id = vic->vic_prev_indirect_vdev; } return (0); } ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW, "Ignore hard IO errors when removing device"); ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, UINT, ZMOD_RW, "Largest contiguous segment to allocate when removing device"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, UINT, ZMOD_RW, "Largest span of free chunks a remap segment can span"); -/* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, UINT, ZMOD_RW, "Pause device removal after this many bytes are copied " "(debug use only - causes removal to hang)"); -/* END CSTYLED */ EXPORT_SYMBOL(free_from_removing_vdev); EXPORT_SYMBOL(spa_removal_get_stats); EXPORT_SYMBOL(spa_remove_init); EXPORT_SYMBOL(spa_restart_removal); EXPORT_SYMBOL(spa_vdev_removal_destroy); EXPORT_SYMBOL(spa_vdev_remove); EXPORT_SYMBOL(spa_vdev_remove_cancel); EXPORT_SYMBOL(spa_vdev_remove_suspend); EXPORT_SYMBOL(svr_sync); diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 40e7bcf3ed1f..99fc4ec1928f 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -1,1715 +1,1713 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2023 Alexander Stetsenko * Copyright (c) 2023, Klara Inc. */ /* * This file contains the top half of the zfs directory structure * implementation. The bottom half is in zap_leaf.c. * * The zdir is an extendable hash data structure. There is a table of * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are * each a constant size and hold a variable number of directory entries. * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. * * The pointer table holds a power of 2 number of pointers. * (1<zd_data->zd_phys->zd_prefix_len). The bucket pointed to * by the pointer at index i in the table holds entries whose hash value * has a zd_prefix_len - bit prefix */ #include #include #include #include #include #include #include #include #include /* * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object * (all leaf blocks) when we start iterating over it. * * For zap_cursor_init(), the callers all intend to iterate through all the * entries. There are a few cases where an error (typically i/o error) could * cause it to bail out early. * * For zap_cursor_init_serialized(), there are callers that do the iteration * outside of ZFS. Typically they would iterate over everything, but we * don't have control of that. E.g. zfs_ioc_snapshot_list_next(), * zcp_snapshots_iter(), and other iterators over things in the MOS - these * are called by /sbin/zfs and channel programs. The other example is * zfs_readdir() which iterates over directory entries for the getdents() * syscall. /sbin/ls iterates to the end (unless it receives a signal), but * userland doesn't have to. * * Given that the ZAP entries aren't returned in a specific order, the only * legitimate use cases for partial iteration would be: * * 1. Pagination: e.g. you only want to display 100 entries at a time, so you * get the first 100 and then wait for the user to hit "next page", which * they may never do). * * 2. You want to know if there are more than X entries, without relying on * the zfs-specific implementation of the directory's st_size (which is * the number of entries). */ static int zap_iterate_prefetch = B_TRUE; /* * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be * collapsed into a single block. */ int zap_shrink_enabled = B_TRUE; int fzap_default_block_shift = 14; /* 16k blocksize */ static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx); void fzap_byteswap(void *vbuf, size_t size) { uint64_t block_type = *(uint64_t *)vbuf; if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) zap_leaf_byteswap(vbuf, size); else { /* it's a ptrtbl block */ byteswap_uint64_array(vbuf, size); } } void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) { ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); zap->zap_ismicro = FALSE; zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync; zap->zap_dbu.dbu_evict_func_async = NULL; mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0); zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; zap_phys_t *zp = zap_f_phys(zap); /* * explicitly zero it since it might be coming from an * initialized microzap */ memset(zap->zap_dbuf->db_data, 0, zap->zap_dbuf->db_size); zp->zap_block_type = ZBT_HEADER; zp->zap_magic = ZAP_MAGIC; zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); zp->zap_freeblk = 2; /* block 1 will be the first leaf */ zp->zap_num_leafs = 1; zp->zap_num_entries = 0; zp->zap_salt = zap->zap_salt; zp->zap_normflags = zap->zap_normflags; zp->zap_flags = flags; /* block 1 will be the first leaf */ for (int i = 0; i < (1<zap_ptrtbl.zt_shift); i++) ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; /* * set up block 1 - the first leaf */ dmu_buf_t *db; VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, 1<l_dbuf = db; zap_leaf_init(l, zp->zap_normflags != 0); kmem_free(l, sizeof (zap_leaf_t)); dmu_buf_rele(db, FTAG); } static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) { if (RW_WRITE_HELD(&zap->zap_rwlock)) return (1); if (rw_tryupgrade(&zap->zap_rwlock)) { dmu_buf_will_dirty(zap->zap_dbuf, tx); return (1); } return (0); } /* * Generic routines for dealing with the pointer & cookie tables. */ static int zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), dmu_tx_t *tx) { uint64_t newblk; int bs = FZAP_BLOCK_SHIFT(zap); int hepb = 1<<(bs-4); /* hepb = half the number of entries in a block */ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ASSERT(tbl->zt_blk != 0); ASSERT(tbl->zt_numblks > 0); if (tbl->zt_nextblk != 0) { newblk = tbl->zt_nextblk; } else { newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); dmu_prefetch_by_dnode(zap->zap_dnode, 0, tbl->zt_blk << bs, tbl->zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); } /* * Copy the ptrtbl from the old to new location. */ uint64_t b = tbl->zt_blks_copied; dmu_buf_t *db_old; int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err != 0) return (err); /* first half of entries in old[b] go to new[2*b+0] */ dmu_buf_t *db_new; VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); dmu_buf_rele(db_old, FTAG); tbl->zt_blks_copied++; dprintf("copied block %llu of %llu\n", (u_longlong_t)tbl->zt_blks_copied, (u_longlong_t)tbl->zt_numblks); if (tbl->zt_blks_copied == tbl->zt_numblks) { (void) dmu_free_range(zap->zap_objset, zap->zap_object, tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); tbl->zt_blk = newblk; tbl->zt_numblks *= 2; tbl->zt_shift++; tbl->zt_nextblk = 0; tbl->zt_blks_copied = 0; dprintf("finished; numblocks now %llu (%uk entries)\n", (u_longlong_t)tbl->zt_numblks, 1<<(tbl->zt_shift-10)); } return (0); } static int zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, dmu_tx_t *tx) { int bs = FZAP_BLOCK_SHIFT(zap); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(tbl->zt_blk != 0); dprintf("storing %llx at index %llx\n", (u_longlong_t)val, (u_longlong_t)idx); uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); dmu_buf_t *db; int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); dmu_buf_will_dirty(db, tx); if (tbl->zt_nextblk != 0) { uint64_t idx2 = idx * 2; uint64_t blk2 = idx2 >> (bs-3); uint64_t off2 = idx2 & ((1<<(bs-3))-1); dmu_buf_t *db2; err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); if (err != 0) { dmu_buf_rele(db, FTAG); return (err); } dmu_buf_will_dirty(db2, tx); ((uint64_t *)db2->db_data)[off2] = val; ((uint64_t *)db2->db_data)[off2+1] = val; dmu_buf_rele(db2, FTAG); } ((uint64_t *)db->db_data)[off] = val; dmu_buf_rele(db, FTAG); return (0); } static int zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) { int bs = FZAP_BLOCK_SHIFT(zap); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); dmu_buf_t *db; int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); *valp = ((uint64_t *)db->db_data)[off]; dmu_buf_rele(db, FTAG); if (tbl->zt_nextblk != 0) { /* * read the nextblk for the sake of i/o error checking, * so that zap_table_load() will catch errors for * zap_table_store. */ blk = (idx*2) >> (bs-3); err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) dmu_buf_rele(db, FTAG); } return (err); } /* * Routines for growing the ptrtbl. */ static void zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) { for (int i = 0; i < n; i++) { uint64_t lb = src[i]; dst[2 * i + 0] = lb; dst[2 * i + 1] = lb; } } static int zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) { /* * The pointer table should never use more hash bits than we * have (otherwise we'd be using useless zero bits to index it). * If we are within 2 bits of running out, stop growing, since * this is already an aberrant condition. */ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) return (SET_ERROR(ENOSPC)); if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { /* * We are outgrowing the "embedded" ptrtbl (the one * stored in the header block). Give it its own entire * block, which will double the size of the ptrtbl. */ ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); uint64_t newblk = zap_allocate_blocks(zap, 1); dmu_buf_t *db_new; int err = dmu_buf_hold_by_dnode(zap->zap_dnode, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); if (err != 0) return (err); dmu_buf_will_dirty(db_new, tx); zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); dmu_buf_rele(db_new, FTAG); zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; zap_f_phys(zap)->zap_ptrtbl.zt_shift++; ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << (FZAP_BLOCK_SHIFT(zap)-3)); return (0); } else { return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, zap_ptrtbl_transfer, tx)); } } static void zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) { dmu_buf_will_dirty(zap->zap_dbuf, tx); mutex_enter(&zap->zap_f.zap_num_entries_mtx); ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); zap_f_phys(zap)->zap_num_entries += delta; mutex_exit(&zap->zap_f.zap_num_entries_mtx); } static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks) { ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); uint64_t newblk = zap_f_phys(zap)->zap_freeblk; zap_f_phys(zap)->zap_freeblk += nblocks; return (newblk); } static void zap_leaf_evict_sync(void *dbu) { zap_leaf_t *l = dbu; rw_destroy(&l->l_rwlock); kmem_free(l, sizeof (zap_leaf_t)); } static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); uint64_t blkid = zap_allocate_blocks(zap, 1); dmu_buf_t *db = NULL; VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, blkid << FZAP_BLOCK_SHIFT(zap), NULL, &db, DMU_READ_NO_PREFETCH)); /* * Create the leaf structure and stash it on the dbuf. If zap was * recent shrunk or truncated, the dbuf might have been sitting in the * cache waiting to be evicted, and so still have the old leaf attached * to it. If so, just reuse it. */ zap_leaf_t *l = dmu_buf_get_user(db); if (l == NULL) { l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); l->l_blkid = blkid; l->l_dbuf = db; rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); dmu_buf_set_user(l->l_dbuf, &l->l_dbu); } else { ASSERT3U(l->l_blkid, ==, blkid); ASSERT3P(l->l_dbuf, ==, db); } rw_enter(&l->l_rwlock, RW_WRITER); dmu_buf_will_dirty(l->l_dbuf, tx); zap_leaf_init(l, zap->zap_normflags != 0); zap_f_phys(zap)->zap_num_leafs++; return (l); } int fzap_count(zap_t *zap, uint64_t *count) { ASSERT(!zap->zap_ismicro); mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ *count = zap_f_phys(zap)->zap_num_entries; mutex_exit(&zap->zap_f.zap_num_entries_mtx); return (0); } /* * Routines for obtaining zap_leaf_t's */ void zap_put_leaf(zap_leaf_t *l) { rw_exit(&l->l_rwlock); dmu_buf_rele(l->l_dbuf, NULL); } static zap_leaf_t * zap_open_leaf(uint64_t blkid, dmu_buf_t *db) { ASSERT(blkid != 0); zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = blkid; l->l_bs = highbit64(db->db_size) - 1; l->l_dbuf = db; dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu); rw_exit(&l->l_rwlock); if (winner != NULL) { /* someone else set it first */ zap_leaf_evict_sync(&l->l_dbu); l = winner; } /* * lhr_pad was previously used for the next leaf in the leaf * chain. There should be no chained leafs (as we have removed * support for them). */ ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); /* * There should be more hash entries than there can be * chunks to put in the hash table */ ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); /* The chunks should begin at the end of the hash table */ ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *) &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); /* The chunks should end at the end of the block */ ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); return (l); } static int zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { dmu_buf_t *db; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); /* * If system crashed just after dmu_free_long_range in zfs_rmnode, we * would be left with an empty xattr dir in delete queue. blkid=0 * would be passed in when doing zfs_purgedir. If that's the case we * should just return immediately. The underlying objects should * already be freed, so this should be perfectly fine. */ if (blkid == 0) return (SET_ERROR(ENOENT)); int bs = FZAP_BLOCK_SHIFT(zap); int err = dmu_buf_hold_by_dnode(zap->zap_dnode, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); ASSERT3U(db->db_object, ==, zap->zap_object); ASSERT3U(db->db_offset, ==, blkid << bs); ASSERT3U(db->db_size, ==, 1 << bs); ASSERT(blkid != 0); zap_leaf_t *l = dmu_buf_get_user(db); if (l == NULL) l = zap_open_leaf(blkid, db); rw_enter(&l->l_rwlock, lt); /* * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, * causing ASSERT below to fail. */ if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3U(l->l_blkid, ==, blkid); ASSERT3P(l->l_dbuf, ==, db); ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); *lp = l; return (0); } static int zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) { ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { ASSERT3U(idx, <, (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); return (0); } else { return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, idx, valp)); } } static int zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) { ASSERT(tx != NULL); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; return (0); } else { return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, idx, blk, tx)); } } static int zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk, dmu_tx_t *tx) { int bs = FZAP_BLOCK_SHIFT(zap); int epb = bs >> 3; /* entries per block */ int err = 0; ASSERT(tx != NULL); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); /* * Check for i/o errors */ for (int i = 0; i < nptrs; i += epb) { uint64_t blk; err = zap_idx_to_blk(zap, idx + i, &blk); if (err != 0) { return (err); } } for (int i = 0; i < nptrs; i++) { err = zap_set_idx_to_blk(zap, idx + i, blk, tx); ASSERT0(err); /* we checked for i/o errors above */ if (err != 0) break; } return (err); } #define ZAP_PREFIX_HASH(pref, pref_len) ((pref) << (64 - (pref_len))) /* * Each leaf has single range of entries (block pointers) in the ZAP ptrtbl. * If two leaves are siblings, their ranges are adjecent and contain the same * number of entries. In order to find out if a leaf has a sibling, we need to * check the range corresponding to the sibling leaf. There is no need to check * all entries in the range, we only need to check the frist and the last one. */ static uint64_t check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len) { ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len); uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len; uint64_t nptrs = (1 << pref_diff); uint64_t first; uint64_t last; ASSERT3U(idx+nptrs, <=, (1UL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); if (zap_idx_to_blk(zap, idx, &first) != 0) return (0); if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0) return (0); if (first != last) return (0); return (first); } static int zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { uint64_t blk; ASSERT(zap->zap_dbuf == NULL || zap_f_phys(zap) == zap->zap_dbuf->db_data); /* Reality check for corrupt zap objects (leaf or header). */ if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF && zap_f_phys(zap)->zap_block_type != ZBT_HEADER) || zap_f_phys(zap)->zap_magic != ZAP_MAGIC) { return (SET_ERROR(EIO)); } uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); int err = zap_idx_to_blk(zap, idx, &blk); if (err != 0) return (err); err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); ASSERT(err || ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == zap_leaf_phys(*lp)->l_hdr.lh_prefix); return (err); } static int zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, const void *tag, dmu_tx_t *tx, zap_leaf_t **lp) { zap_t *zap = zn->zn_zap; uint64_t hash = zn->zn_hash; int err; int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, zap_leaf_phys(l)->l_hdr.lh_prefix); if (zap_tryupgradedir(zap, tx) == 0 || old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { /* We failed to upgrade, or need to grow the pointer table */ objset_t *os = zap->zap_objset; uint64_t object = zap->zap_object; zap_put_leaf(l); *lp = l = NULL; zap_unlockdir(zap, tag); err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; if (err != 0) return (err); ASSERT(!zap->zap_ismicro); while (old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { err = zap_grow_ptrtbl(zap, tx); if (err != 0) return (err); } err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); if (err != 0) return (err); if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { /* it split while our locks were down */ *lp = l; return (0); } } ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, zap_leaf_phys(l)->l_hdr.lh_prefix); int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - (old_prefix_len + 1); uint64_t sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; /* check for i/o errors before doing zap_leaf_split */ for (int i = 0; i < (1ULL << prefix_diff); i++) { uint64_t blk; err = zap_idx_to_blk(zap, sibling + i, &blk); if (err != 0) return (err); ASSERT3U(blk, ==, l->l_blkid); } zap_leaf_t *nl = zap_create_leaf(zap, tx); zap_leaf_split(l, nl, zap->zap_normflags != 0); /* set sibling pointers */ for (int i = 0; i < (1ULL << prefix_diff); i++) { err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx); ASSERT0(err); /* we checked for i/o errors above */ } ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_prefix_len, >, 0); if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { /* we want the sibling */ zap_put_leaf(l); *lp = nl; } else { zap_put_leaf(nl); *lp = l; } return (0); } static void zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, const void *tag, dmu_tx_t *tx) { zap_t *zap = zn->zn_zap; int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); zap_put_leaf(l); if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { /* * We are in the middle of growing the pointer table, or * this leaf will soon make us grow it. */ if (zap_tryupgradedir(zap, tx) == 0) { objset_t *os = zap->zap_objset; uint64_t zapobj = zap->zap_object; zap_unlockdir(zap, tag); int err = zap_lockdir(os, zapobj, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; if (err != 0) return; } /* could have finished growing while our locks were down */ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) (void) zap_grow_ptrtbl(zap, tx); } } static int fzap_checkname(zap_name_t *zn) { uint32_t maxnamelen = zn->zn_normbuf_len; uint64_t len = (uint64_t)zn->zn_key_orig_numints * zn->zn_key_intlen; /* Only allow directory zap to have longname */ if (len > maxnamelen || (len > ZAP_MAXNAMELEN && zn->zn_zap->zap_dnode->dn_type != DMU_OT_DIRECTORY_CONTENTS)) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int fzap_checksize(uint64_t integer_size, uint64_t num_integers) { /* Only integer sizes supported by C */ switch (integer_size) { case 1: case 2: case 4: case 8: break; default: return (SET_ERROR(EINVAL)); } if (integer_size * num_integers > ZAP_MAXVALUELEN) return (SET_ERROR(E2BIG)); return (0); } static int fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) { int err = fzap_checkname(zn); if (err != 0) return (err); return (fzap_checksize(integer_size, num_integers)); } /* * Routines for manipulating attributes. */ int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *ncp) { zap_leaf_t *l; zap_entry_handle_t zeh; int err = fzap_checkname(zn); if (err != 0) return (err); err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { if ((err = fzap_checksize(integer_size, num_integers)) != 0) { zap_put_leaf(l); return (err); } err = zap_entry_read(&zeh, integer_size, num_integers, buf); (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); if (ncp) { *ncp = zap_entry_normalization_conflict(&zeh, zn, NULL, zn->zn_zap); } } zap_put_leaf(l); return (err); } int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err; zap_entry_handle_t zeh; zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(!zap->zap_ismicro); ASSERT(fzap_check(zn, integer_size, num_integers) == 0); err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); retry: err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { err = SET_ERROR(EEXIST); goto out; } if (err != ENOENT) goto out; err = zap_entry_create(l, zn, cd, integer_size, num_integers, val, &zeh); if (err == 0) { zap_increment_num_entries(zap, 1, tx); } else if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } out: if (l != NULL) { if (err == ENOSPC) zap_put_leaf(l); else zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); } return (err); } int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, const void *tag, dmu_tx_t *tx) { int err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); return (fzap_add_cd(zn, integer_size, num_integers, val, ZAP_NEED_CD, tag, tx)); } int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, const void *val, const void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err; boolean_t create; zap_entry_handle_t zeh; zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); retry: err = zap_leaf_lookup(l, zn, &zeh); create = (err == ENOENT); ASSERT(err == 0 || err == ENOENT); if (create) { err = zap_entry_create(l, zn, ZAP_NEED_CD, integer_size, num_integers, val, &zeh); if (err == 0) zap_increment_num_entries(zap, 1, tx); } else { err = zap_entry_update(&zeh, integer_size, num_integers, val); } if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } if (l != NULL) { if (err == ENOSPC) zap_put_leaf(l); else zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); } return (err); } int fzap_length(zap_name_t *zn, uint64_t *integer_size, uint64_t *num_integers) { zap_leaf_t *l; int err; zap_entry_handle_t zeh; err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err != 0) goto out; if (integer_size != NULL) *integer_size = zeh.zeh_integer_size; if (num_integers != NULL) *num_integers = zeh.zeh_num_integers; out: zap_put_leaf(l); return (err); } int fzap_remove(zap_name_t *zn, dmu_tx_t *tx) { zap_leaf_t *l; int err; zap_entry_handle_t zeh; err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { zap_entry_remove(&zeh); zap_increment_num_entries(zn->zn_zap, -1, tx); if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 && zap_shrink_enabled) return (zap_shrink(zn, l, tx)); } zap_put_leaf(l); return (err); } void fzap_prefetch(zap_name_t *zn) { uint64_t blk; zap_t *zap = zn->zn_zap; uint64_t idx = ZAP_HASH_IDX(zn->zn_hash, zap_f_phys(zap)->zap_ptrtbl.zt_shift); if (zap_idx_to_blk(zap, idx, &blk) != 0) return; int bs = FZAP_BLOCK_SHIFT(zap); dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs, ZIO_PRIORITY_SYNC_READ); } /* * Helper functions for consumers. */ uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx) { return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx)); } uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx) { uint64_t new_obj; new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0, dnodesize, tx); VERIFY(new_obj != 0); VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, tx)); return (new_obj); } int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name, uint64_t namelen) { zap_cursor_t zc; int err; if (mask == 0) mask = -1ULL; zap_attribute_t *za = zap_attribute_long_alloc(); for (zap_cursor_init(&zc, os, zapobj); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { if ((za->za_first_integer & mask) == (value & mask)) { if (strlcpy(name, za->za_name, namelen) >= namelen) err = SET_ERROR(ENAMETOOLONG); break; } } zap_cursor_fini(&zc); zap_attribute_free(za); return (err); } int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) { zap_cursor_t zc; int err = 0; zap_attribute_t *za = zap_attribute_long_alloc(); for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } err = zap_add(os, intoobj, za->za_name, 8, 1, &za->za_first_integer, tx); if (err != 0) break; } zap_cursor_fini(&zc); zap_attribute_free(za); return (err); } int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, uint64_t value, dmu_tx_t *tx) { zap_cursor_t zc; int err = 0; zap_attribute_t *za = zap_attribute_long_alloc(); for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } err = zap_add(os, intoobj, za->za_name, 8, 1, &value, tx); if (err != 0) break; } zap_cursor_fini(&zc); zap_attribute_free(za); return (err); } int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) { zap_cursor_t zc; int err = 0; zap_attribute_t *za = zap_attribute_long_alloc(); for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { uint64_t delta = 0; if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta); if (err != 0 && err != ENOENT) break; delta += za->za_first_integer; err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx); if (err != 0) break; } zap_cursor_fini(&zc); zap_attribute_free(za); return (err); } int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); return (zap_add(os, obj, name, 8, 1, &value, tx)); } int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); return (zap_remove(os, obj, name, tx)); } int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); return (zap_lookup(os, obj, name, 8, 1, &value)); } int zap_add_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_add(os, obj, name, 8, 1, &value, tx)); } int zap_update_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_update(os, obj, name, 8, 1, &value, tx)); } int zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_lookup(os, obj, name, 8, 1, valuep)); } int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, dmu_tx_t *tx) { uint64_t value = 0; if (delta == 0) return (0); int err = zap_lookup(os, obj, name, 8, 1, &value); if (err != 0 && err != ENOENT) return (err); value += delta; if (value == 0) err = zap_remove(os, obj, name, tx); else err = zap_update(os, obj, name, 8, 1, &value, tx); return (err); } int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_increment(os, obj, name, delta, tx)); } /* * Routines for iterating over the attributes. */ int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) { int err = ENOENT; zap_entry_handle_t zeh; zap_leaf_t *l; /* retrieve the next entry at or after zc_hash/zc_cd */ /* if no entry, return ENOENT */ /* * If we are reading from the beginning, we're almost certain to * iterate over the entire ZAP object. If there are multiple leaf * blocks (freeblk > 2), prefetch the whole object (up to * dmu_prefetch_max bytes), so that we read the leaf blocks * concurrently. (Unless noprefetch was requested via * zap_cursor_init_noprefetch()). */ if (zc->zc_hash == 0 && zap_iterate_prefetch && zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0, zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), ZIO_PRIORITY_ASYNC_READ); } if (zc->zc_leaf) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); /* * The leaf was either shrunk or split. */ if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) || (ZAP_HASH_IDX(zc->zc_hash, zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; } } again: if (zc->zc_leaf == NULL) { err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, &zc->zc_leaf); if (err != 0) return (err); } l = zc->zc_leaf; err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); if (err == ENOENT) { if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0) { zc->zc_hash = -1ULL; zc->zc_cd = 0; } else { uint64_t nocare = (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; zc->zc_cd = 0; if (zc->zc_hash == 0) { zc->zc_hash = -1ULL; } else { zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; goto again; } } } if (err == 0) { zc->zc_hash = zeh.zeh_hash; zc->zc_cd = zeh.zeh_cd; za->za_integer_length = zeh.zeh_integer_size; za->za_num_integers = zeh.zeh_num_integers; if (zeh.zeh_num_integers == 0) { za->za_first_integer = 0; } else { err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); ASSERT(err == 0 || err == EOVERFLOW); } err = zap_entry_read_name(zap, &zeh, za->za_name_len, za->za_name); ASSERT(err == 0); za->za_normalization_conflict = zap_entry_normalization_conflict(&zeh, NULL, za->za_name, zap); } rw_exit(&zc->zc_leaf->l_rwlock); return (err); } static void zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) { uint64_t lastblk = 0; /* * NB: if a leaf has more pointers than an entire ptrtbl block * can hold, then it'll be accounted for more than once, since * we won't have lastblk. */ for (int i = 0; i < len; i++) { zap_leaf_t *l; if (tbl[i] == lastblk) continue; lastblk = tbl[i]; int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); if (err == 0) { zap_leaf_stats(zap, l, zs); zap_put_leaf(l); } } } void fzap_get_stats(zap_t *zap, zap_stats_t *zs) { int bs = FZAP_BLOCK_SHIFT(zap); zs->zs_blocksize = 1ULL << bs; /* * Set zap_phys_t fields */ zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; zs->zs_block_type = zap_f_phys(zap)->zap_block_type; zs->zs_magic = zap_f_phys(zap)->zap_magic; zs->zs_salt = zap_f_phys(zap)->zap_salt; /* * Set zap_ptrtbl fields */ zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; zs->zs_ptrtbl_blks_copied = zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { /* the ptrtbl is entirely in the header block. */ zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { dmu_prefetch_by_dnode(zap->zap_dnode, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; b++) { dmu_buf_t *db; int err; err = dmu_buf_hold_by_dnode(zap->zap_dnode, (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { zap_stats_ptrtbl(zap, db->db_data, 1<<(bs-3), zs); dmu_buf_rele(db, FTAG); } } } } /* * Find last allocated block and update freeblk. */ static void zap_trunc(zap_t *zap) { uint64_t nentries; uint64_t lastblk; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) { /* External ptrtbl */ nentries = (1 << zap_f_phys(zap)->zap_ptrtbl.zt_shift); lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk + zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1; } else { /* Embedded ptrtbl */ nentries = (1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); lastblk = 0; } for (uint64_t idx = 0; idx < nentries; idx++) { uint64_t blk; if (zap_idx_to_blk(zap, idx, &blk) != 0) return; if (blk > lastblk) lastblk = blk; } ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk); zap_f_phys(zap)->zap_freeblk = lastblk + 1; } /* * ZAP shrinking algorithm. * * We shrink ZAP recuresively removing empty leaves. We can remove an empty leaf * only if it has a sibling. Sibling leaves have the same prefix length and * their prefixes differ only by the least significant (sibling) bit. We require * both siblings to be empty. This eliminates a need to rehash the non-empty * remaining leaf. When we have removed one of two empty sibling, we set ptrtbl * entries of the removed leaf to point out to the remaining leaf. Prefix length * of the remaining leaf is decremented. As a result, it has a new prefix and it * might have a new sibling. So, we repeat the process. * * Steps: * 1. Check if a sibling leaf (sl) exists and it is empty. * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1. * 3. Release the sibling (sl) to derefer it again with WRITER lock. * 4. Upgrade zapdir lock to WRITER (once). * 5. Derefer released leaves again. * 6. If it is needed, recheck whether both leaves are still siblings and empty. * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point out to blkid of * the remaining leaf (slbit 0). * 8. Free disk block of the removed leaf (dmu_free_range). * 9. Decrement prefix_len of the remaining leaf. * 10. Repeat the steps. */ static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) { zap_t *zap = zn->zn_zap; int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; uint64_t hash = zn->zn_hash; uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; boolean_t trunc = B_FALSE; int err = 0; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0); ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix); boolean_t writer = B_FALSE; /* * To avoid deadlock always deref leaves in the same order - * sibling 0 first, then sibling 1. */ while (prefix_len) { zap_leaf_t *sl; int64_t prefix_diff = zt_shift - prefix_len; uint64_t sl_prefix = prefix ^ 1; uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len); int slbit = prefix & 1; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0); /* * Check if there is a sibling by reading ptrtbl ptrs. */ if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0) break; /* * sibling 1, unlock it - we haven't yet dereferenced sibling 0. */ if (slbit == 1) { zap_put_leaf(l); l = NULL; } /* * Dereference sibling leaf and check if it is empty. */ if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER, &sl)) != 0) break; ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix); /* * Check if we have a sibling and it is empty. */ if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len || zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) { zap_put_leaf(sl); break; } zap_put_leaf(sl); /* * If there two empty sibling, we have work to do, so * we need to lock ZAP ptrtbl as WRITER. */ if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) { /* We failed to upgrade */ if (l != NULL) { zap_put_leaf(l); l = NULL; } /* * Usually, the right way to upgrade from a READER lock * to a WRITER lock is to call zap_unlockdir() and * zap_lockdir(), but we do not have a tag. Instead, * we do it in more sophisticated way. */ rw_exit(&zap->zap_rwlock); rw_enter(&zap->zap_rwlock, RW_WRITER); dmu_buf_will_dirty(zap->zap_dbuf, tx); zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; writer = B_TRUE; } /* * Here we have WRITER lock for ptrtbl. * Now, we need a WRITER lock for both siblings leaves. * Also, we have to recheck if the leaves are still siblings * and still empty. */ if (l == NULL) { /* sibling 0 */ if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash), tx, RW_WRITER, &l)) != 0) break; /* * The leaf isn't empty anymore or * it was shrunk/split while our locks were down. */ if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 || zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len) break; } /* sibling 1 */ if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx, RW_WRITER, &sl)) != 0) break; /* * The leaf isn't empty anymore or * it was shrunk/split while our locks were down. */ if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 || zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) { zap_put_leaf(sl); break; } /* If we have gotten here, we have a leaf to collapse */ uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff; uint64_t nptrs = (1ULL << prefix_diff); uint64_t sl_blkid = sl->l_blkid; /* * Set ptrtbl entries to point out to the slibling 0 blkid */ if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid, tx)) != 0) { zap_put_leaf(sl); break; } /* * Free sibling 1 disk block. */ int bs = FZAP_BLOCK_SHIFT(zap); if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1) trunc = B_TRUE; (void) dmu_free_range(zap->zap_objset, zap->zap_object, sl_blkid << bs, 1 << bs, tx); zap_put_leaf(sl); zap_f_phys(zap)->zap_num_leafs--; /* * Update prefix and prefix_len. */ zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1; zap_leaf_phys(l)->l_hdr.lh_prefix_len--; prefix = zap_leaf_phys(l)->l_hdr.lh_prefix; prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; } if (trunc) zap_trunc(zap); if (l != NULL) zap_put_leaf(l); return (err); } -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW, "When iterating ZAP object, prefetch it"); -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW, "Enable ZAP shrinking"); diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index dfe309aa551f..55b60006e58c 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -1,2036 +1,2035 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2024, Klara, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* * The maximum size (in bytes) of a microzap before it is converted to a * fatzap. It will be rounded up to next multiple of 512 (SPA_MINBLOCKSIZE). * * By definition, a microzap must fit into a single block, so this has * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default. * Setting this higher requires both the large_blocks feature (to even create * blocks that large) and the large_microzap feature (to enable the stream * machinery to understand not to try to split a microzap block). * * If large_microzap is enabled, this value will be clamped to * spa_maxblocksize(). If not, it will be clamped to SPA_OLD_MAXBLOCKSIZE. */ static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE; uint64_t zap_get_micro_max_size(spa_t *spa) { uint64_t maxsz = P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE); if (maxsz <= SPA_OLD_MAXBLOCKSIZE) return (maxsz); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP)) return (MIN(maxsz, spa_maxblocksize(spa))); return (SPA_OLD_MAXBLOCKSIZE); } static int mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags); uint64_t zap_getflags(zap_t *zap) { if (zap->zap_ismicro) return (0); return (zap_f_phys(zap)->zap_flags); } int zap_hashbits(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return (48); else return (28); } uint32_t zap_maxcd(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return ((1<<16)-1); else return (-1U); } static uint64_t zap_hash(zap_name_t *zn) { zap_t *zap = zn->zn_zap; uint64_t h = 0; if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); h = *(uint64_t *)zn->zn_key_orig; } else { h = zap->zap_salt; ASSERT(h != 0); ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { const uint64_t *wp = zn->zn_key_norm; ASSERT(zn->zn_key_intlen == 8); for (int i = 0; i < zn->zn_key_norm_numints; wp++, i++) { uint64_t word = *wp; for (int j = 0; j < 8; j++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ word) & 0xFF]; word >>= NBBY; } } } else { const uint8_t *cp = zn->zn_key_norm; /* * We previously stored the terminating null on * disk, but didn't hash it, so we need to * continue to not hash it. (The * zn_key_*_numints includes the terminating * null for non-binary keys.) */ int len = zn->zn_key_norm_numints - 1; ASSERT(zn->zn_key_intlen == 1); for (int i = 0; i < len; cp++, i++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ *cp) & 0xFF]; } } } /* * Don't use all 64 bits, since we need some in the cookie for * the collision differentiator. We MUST use the high bits, * since those are the ones that we first pay attention to when * choosing the bucket. */ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); return (h); } static int zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags, size_t outlen) { ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); size_t inlen = strlen(name) + 1; int err = 0; (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); return (err); } boolean_t zap_match(zap_name_t *zn, const char *matchname) { boolean_t res = B_FALSE; ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); if (zn->zn_matchtype & MT_NORMALIZE) { size_t namelen = zn->zn_normbuf_len; char normbuf[ZAP_MAXNAMELEN]; char *norm = normbuf; /* * Cannot allocate this on-stack as it exceed the stack-limit of * 1024. */ if (namelen > ZAP_MAXNAMELEN) norm = kmem_alloc(namelen, KM_SLEEP); if (zap_normalize(zn->zn_zap, matchname, norm, zn->zn_normflags, namelen) != 0) { res = B_FALSE; } else { res = (strcmp(zn->zn_key_norm, norm) == 0); } if (norm != normbuf) kmem_free(norm, namelen); } else { res = (strcmp(zn->zn_key_orig, matchname) == 0); } return (res); } static kmem_cache_t *zap_name_cache; static kmem_cache_t *zap_attr_cache; static kmem_cache_t *zap_name_long_cache; static kmem_cache_t *zap_attr_long_cache; void zap_init(void) { zap_name_cache = kmem_cache_create("zap_name", sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL, NULL, NULL, NULL, 0); zap_attr_cache = kmem_cache_create("zap_attr_cache", sizeof (zap_attribute_t) + ZAP_MAXNAMELEN, 0, NULL, NULL, NULL, NULL, NULL, 0); zap_name_long_cache = kmem_cache_create("zap_name_long", sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL, NULL, NULL, NULL, 0); zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache", sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL, NULL, NULL, NULL, 0); } void zap_fini(void) { kmem_cache_destroy(zap_name_cache); kmem_cache_destroy(zap_attr_cache); kmem_cache_destroy(zap_name_long_cache); kmem_cache_destroy(zap_attr_long_cache); } static zap_name_t * zap_name_alloc(zap_t *zap, boolean_t longname) { kmem_cache_t *cache = longname ? zap_name_long_cache : zap_name_cache; zap_name_t *zn = kmem_cache_alloc(cache, KM_SLEEP); zn->zn_zap = zap; zn->zn_normbuf_len = longname ? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN; return (zn); } void zap_name_free(zap_name_t *zn) { if (zn->zn_normbuf_len == ZAP_MAXNAMELEN) { kmem_cache_free(zap_name_cache, zn); } else { ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW); kmem_cache_free(zap_name_long_cache, zn); } } static int zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt) { zap_t *zap = zn->zn_zap; size_t key_len = strlen(key) + 1; /* Make sure zn is allocated for longname if key is long */ IMPLY(key_len > ZAP_MAXNAMELEN, zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW); zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = key; zn->zn_key_orig_numints = key_len; zn->zn_matchtype = mt; zn->zn_normflags = zap->zap_normflags; /* * If we're dealing with a case sensitive lookup on a mixed or * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup * will fold case to all caps overriding the lookup request. */ if (mt & MT_MATCH_CASE) zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; if (zap->zap_normflags) { /* * We *must* use zap_normflags because this normalization is * what the hash is computed from. */ if (zap_normalize(zap, key, zn->zn_normbuf, zap->zap_normflags, zn->zn_normbuf_len) != 0) return (SET_ERROR(ENOTSUP)); zn->zn_key_norm = zn->zn_normbuf; zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { if (mt != 0) return (SET_ERROR(ENOTSUP)); zn->zn_key_norm = zn->zn_key_orig; zn->zn_key_norm_numints = zn->zn_key_orig_numints; } zn->zn_hash = zap_hash(zn); if (zap->zap_normflags != zn->zn_normflags) { /* * We *must* use zn_normflags because this normalization is * what the matching is based on. (Not the hash!) */ if (zap_normalize(zap, key, zn->zn_normbuf, zn->zn_normflags, zn->zn_normbuf_len) != 0) return (SET_ERROR(ENOTSUP)); zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } return (0); } zap_name_t * zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt) { size_t key_len = strlen(key) + 1; zap_name_t *zn = zap_name_alloc(zap, (key_len > ZAP_MAXNAMELEN)); if (zap_name_init_str(zn, key, mt) != 0) { zap_name_free(zn); return (NULL); } return (zn); } static zap_name_t * zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) { zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP); ASSERT(zap->zap_normflags == 0); zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = zn->zn_key_norm = key; zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; zn->zn_matchtype = 0; zn->zn_normbuf_len = ZAP_MAXNAMELEN; zn->zn_hash = zap_hash(zn); return (zn); } static void mzap_byteswap(mzap_phys_t *buf, size_t size) { buf->mz_block_type = BSWAP_64(buf->mz_block_type); buf->mz_salt = BSWAP_64(buf->mz_salt); buf->mz_normflags = BSWAP_64(buf->mz_normflags); int max = (size / MZAP_ENT_LEN) - 1; for (int i = 0; i < max; i++) { buf->mz_chunk[i].mze_value = BSWAP_64(buf->mz_chunk[i].mze_value); buf->mz_chunk[i].mze_cd = BSWAP_32(buf->mz_chunk[i].mze_cd); } } void zap_byteswap(void *buf, size_t size) { uint64_t block_type = *(uint64_t *)buf; if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { /* ASSERT(magic == ZAP_LEAF_MAGIC); */ mzap_byteswap(buf, size); } else { fzap_byteswap(buf, size); } } __attribute__((always_inline)) inline static int mze_compare(const void *arg1, const void *arg2) { const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd, (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd)); } ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t, mze_compare) static void mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash) { mzap_ent_t mze; ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); mze.mze_chunkid = chunkid; ASSERT0(hash & 0xffffffff); mze.mze_hash = hash >> 32; ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff); mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd; ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0); zfs_btree_add(&zap->zap_m.zap_tree, &mze); } static mzap_ent_t * mze_find(zap_name_t *zn, zfs_btree_index_t *idx) { mzap_ent_t mze_tofind; mzap_ent_t *mze; zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree; ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); ASSERT0(zn->zn_hash & 0xffffffff); mze_tofind.mze_hash = zn->zn_hash >> 32; mze_tofind.mze_cd = 0; mze = zfs_btree_find(tree, &mze_tofind, idx); if (mze == NULL) mze = zfs_btree_next(tree, idx, idx); for (; mze && mze->mze_hash == mze_tofind.mze_hash; mze = zfs_btree_next(tree, idx, idx)) { ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); } return (NULL); } static uint32_t mze_find_unused_cd(zap_t *zap, uint64_t hash) { mzap_ent_t mze_tofind; zfs_btree_index_t idx; zfs_btree_t *tree = &zap->zap_m.zap_tree; ASSERT(zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT0(hash & 0xffffffff); hash >>= 32; mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; uint32_t cd = 0; for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = zfs_btree_next(tree, &idx, &idx)) { if (mze->mze_cd != cd) break; cd++; } return (cd); } /* * Each mzap entry requires at max : 4 chunks * 3 chunks for names + 1 chunk for value. */ #define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \ ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t))) /* * Check if the current entry keeps the colliding entries under the fatzap leaf * size. */ static boolean_t mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) { zap_t *zap = zn->zn_zap; mzap_ent_t mze_tofind; zfs_btree_index_t idx; zfs_btree_t *tree = &zap->zap_m.zap_tree; uint32_t mzap_ents = 0; ASSERT0(hash & 0xffffffff); hash >>= 32; mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = zfs_btree_next(tree, &idx, &idx)) { mzap_ents++; } /* Include the new entry being added */ mzap_ents++; return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS)); } static void mze_destroy(zap_t *zap) { zfs_btree_clear(&zap->zap_m.zap_tree); zfs_btree_destroy(&zap->zap_m.zap_tree); } static zap_t * mzap_open(dmu_buf_t *db) { zap_t *winner; uint64_t *zap_hdr = (uint64_t *)db->db_data; uint64_t zap_block_type = zap_hdr[0]; uint64_t zap_magic = zap_hdr[1]; ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&zap->zap_rwlock, RW_WRITER); zap->zap_objset = dmu_buf_get_objset(db); zap->zap_object = db->db_object; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0); zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) { winner = NULL; /* No actual winner here... */ goto handle_winner; } } else { zap->zap_ismicro = TRUE; } /* * Make sure that zap_ismicro is set before we let others see * it, because zap_lockdir() checks zap_ismicro without the lock * held. */ dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); winner = dmu_buf_set_user(db, &zap->zap_dbu); if (winner != NULL) goto handle_winner; if (zap->zap_ismicro) { zap->zap_salt = zap_m_phys(zap)->mz_salt; zap->zap_normflags = zap_m_phys(zap)->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; /* * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove() * overhead on massive inserts below. It still allows to store * 62 entries before we have to add 2KB B-tree core node. */ zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare, mze_find_in_buf, sizeof (mzap_ent_t), 512); zap_name_t *zn = zap_name_alloc(zap, B_FALSE); for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { zap->zap_m.zap_num_entries++; zap_name_init_str(zn, mze->mze_name, 0); mze_insert(zap, i, zn->zn_hash); } } zap_name_free(zn); } else { zap->zap_salt = zap_f_phys(zap)->zap_salt; zap->zap_normflags = zap_f_phys(zap)->zap_normflags; ASSERT3U(sizeof (struct zap_leaf_header), ==, 2*ZAP_LEAF_CHUNKSIZE); /* * The embedded pointer table should not overlap the * other members. */ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, &zap_f_phys(zap)->zap_salt); /* * The embedded pointer table should end at the end of * the block */ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 1<zap_dbuf->db_size); } rw_exit(&zap->zap_rwlock); return (zap); handle_winner: rw_exit(&zap->zap_rwlock); rw_destroy(&zap->zap_rwlock); if (!zap->zap_ismicro) mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); return (winner); } /* * This routine "consumes" the caller's hold on the dbuf, which must * have the specified tag. */ static int zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { ASSERT0(db->db_offset); objset_t *os = dmu_buf_get_objset(db); uint64_t obj = db->db_object; dmu_object_info_t doi; *zapp = NULL; dmu_object_info_from_dnode(dn, &doi); if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) return (SET_ERROR(EINVAL)); zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { zap = mzap_open(db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. * Check for corruption! */ return (SET_ERROR(EIO)); } } /* * We're checking zap_ismicro without the lock held, in order to * tell what type of lock we want. Once we have some sort of * lock, see if it really is the right type. In practice this * can only be different if it was upgraded from micro to fat, * and micro wanted WRITER but fat only needs READER. */ krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; rw_enter(&zap->zap_rwlock, lt); if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { /* it was upgraded, now we only need reader */ ASSERT(lt == RW_WRITER); ASSERT(RW_READER == ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)); rw_downgrade(&zap->zap_rwlock); lt = RW_READER; } zap->zap_objset = os; zap->zap_dnode = dn; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3P(zap->zap_dbuf, ==, db); ASSERT(!zap->zap_ismicro || zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) { dprintf("upgrading obj %llu: num_entries=%u\n", (u_longlong_t)obj, zap->zap_m.zap_num_entries); *zapp = zap; int err = mzap_upgrade(zapp, tag, tx, 0); if (err != 0) rw_exit(&zap->zap_rwlock); return (err); } VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; if (newsz > SPA_OLD_MAXBLOCKSIZE) { dsl_dataset_t *ds = dmu_objset_ds(os); if (!dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_MICROZAP)) { /* * A microzap just grew beyond the old limit * for the first time, so we have to ensure the * feature flag is activated. * zap_get_micro_max_size() won't let us get * here if the feature is not enabled, so we * don't need any other checks beforehand. * * Since we're in open context, we can't * activate the feature directly, so we instead * flag it on the dataset for next sync. */ dsl_dataset_dirty(ds, tx); mutex_enter(&ds->ds_lock); ds->ds_feature_activation [SPA_FEATURE_LARGE_MICROZAP] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); } } } *zapp = zap; return (0); } static int zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, zap_t **zapp) { dmu_buf_t *db; int err; err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) dmu_buf_rele(db, tag); else VERIFY(dnode_add_ref(dn, tag)); return (err); } int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, zap_t **zapp) { dnode_t *dn; dmu_buf_t *db; int err; err = dnode_hold(os, obj, tag, &dn); if (err != 0) return (err); err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) { dnode_rele(dn, tag); return (err); } err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) { dmu_buf_rele(db, tag); dnode_rele(dn, tag); } return (err); } void zap_unlockdir(zap_t *zap, const void *tag) { rw_exit(&zap->zap_rwlock); dnode_rele(zap->zap_dnode, tag); dmu_buf_rele(zap->zap_dbuf, tag); } static int mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags) { int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); int sz = zap->zap_dbuf->db_size; mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP); memcpy(mzp, zap->zap_dbuf->db_data, sz); int nchunks = zap->zap_m.zap_num_chunks; if (!flags) { err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); if (err != 0) { vmem_free(mzp, sz); return (err); } } dprintf("upgrading obj=%llu with %u chunks\n", (u_longlong_t)zap->zap_object, nchunks); /* XXX destroy the tree later, so we can use the stored hash value */ mze_destroy(zap); fzap_upgrade(zap, tx, flags); zap_name_t *zn = zap_name_alloc(zap, B_FALSE); for (int i = 0; i < nchunks; i++) { mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, (u_longlong_t)mze->mze_value); zap_name_init_str(zn, mze->mze_name, 0); /* If we fail here, we would end up losing entries */ VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tag, tx)); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ } zap_name_free(zn); vmem_free(mzp, sz); *zapp = zap; return (0); } /* * The "normflags" determine the behavior of the matchtype_t which is * passed to zap_lookup_norm(). Names which have the same normalized * version will be stored with the same hash value, and therefore we can * perform normalization-insensitive lookups. We can be Unicode form- * insensitive and/or case-insensitive. The following flags are valid for * "normflags": * * U8_TEXTPREP_NFC * U8_TEXTPREP_NFD * U8_TEXTPREP_NFKC * U8_TEXTPREP_NFKD * U8_TEXTPREP_TOUPPER * * The *_NF* (Normalization Form) flags are mutually exclusive; at most one * of them may be supplied. */ void mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) { dmu_buf_t *db; VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db, tx); mzap_phys_t *zp = db->db_data; zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL; zp->mz_normflags = normflags; if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ VERIFY(dnode_add_ref(dn, FTAG)); VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); } else { dmu_buf_rele(db, FTAG); } } static uint64_t zap_create_impl(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { uint64_t obj; ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); if (allocated_dnode == NULL) { dnode_t *dn; obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, &dn, FTAG, tx); mzap_create_impl(dn, normflags, flags, tx); dnode_rele(dn, FTAG); } else { obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx); mzap_create_impl(*allocated_dnode, normflags, flags, tx); } return (obj); } int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, 0, tx)); } int zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_claim_norm_dnsize(os, obj, 0, ot, bonustype, bonuslen, dnodesize, tx)); } int zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, bonuslen, 0, tx)); } int zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { dnode_t *dn; int error; ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, dnodesize, tx); if (error != 0) return (error); error = dnode_hold(os, obj, FTAG, &dn); if (error != 0) return (error); mzap_create_impl(dn, normflags, 0, tx); dnode_rele(dn, FTAG); return (0); } uint64_t zap_create(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); } uint64_t zap_create_dnsize(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, dnodesize, tx)); } uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, 0, tx)); } uint64_t zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_impl(os, normflags, 0, ot, 0, 0, bonustype, bonuslen, dnodesize, NULL, NULL, tx)); } uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_flags_dnsize(os, normflags, flags, ot, leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); } uint64_t zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL, tx)); } /* * Create a zap object and return a pointer to the newly allocated dnode via * the allocated_dnode argument. The returned dnode will be held and the * caller is responsible for releasing the hold by calling dnode_rele(). */ uint64_t zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx) { return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx)); } int zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) { /* * dmu_object_free will free the object number and free the * data. Freeing the data will cause our pageout function to be * called, which will destroy our data (zap_leaf_t's and zap_t). */ return (dmu_object_free(os, zapobj, tx)); } void zap_evict_sync(void *dbu) { zap_t *zap = dbu; rw_destroy(&zap->zap_rwlock); if (zap->zap_ismicro) mze_destroy(zap); else mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); } int zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); if (!zap->zap_ismicro) { err = fzap_count(zap, count); } else { *count = zap->zap_m.zap_num_entries; } zap_unlockdir(zap, FTAG); return (err); } /* * zn may be NULL; if not specified, it will be computed if needed. * See also the comment above zap_entry_normalization_conflict(). */ static boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze, zfs_btree_index_t *idx) { boolean_t allocdzn = B_FALSE; mzap_ent_t *other; zfs_btree_index_t oidx; if (zap->zap_normflags == 0) return (B_FALSE); for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx); other && other->mze_hash == mze->mze_hash; other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) { if (zn == NULL) { zn = zap_name_alloc_str(zap, MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); } } for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx); other && other->mze_hash == mze->mze_hash; other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) { if (zn == NULL) { zn = zap_name_alloc_str(zap, MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); } } if (allocdzn) zap_name_free(zn); return (B_FALSE); } /* * Routines for manipulating attributes. */ int zap_lookup(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { return (zap_lookup_norm(os, zapobj, name, integer_size, num_integers, buf, 0, NULL, 0, NULL)); } static int zap_lookup_impl(zap_t *zap, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { int err = 0; zap_name_t *zn = zap_name_alloc_str(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_lookup(zn, integer_size, num_integers, buf, realname, rn_len, ncp); } else { zfs_btree_index_t idx; mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { if (num_integers < 1) { err = SET_ERROR(EOVERFLOW); } else if (integer_size != 8) { err = SET_ERROR(EINVAL); } else { *(uint64_t *)buf = MZE_PHYS(zap, mze)->mze_value; if (realname != NULL) (void) strlcpy(realname, MZE_PHYS(zap, mze)->mze_name, rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, zn, mze, &idx); } } } } zap_name_free(zn); return (err); } int zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_impl(zap, name, integer_size, num_integers, buf, mt, realname, rn_len, ncp); zap_unlockdir(zap, FTAG); return (err); } int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) { zap_t *zap; int err; zap_name_t *zn; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } fzap_prefetch(zn); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_prefetch_object(objset_t *os, uint64_t zapobj) { int error; dmu_object_info_t doi; error = dmu_object_info(os, zapobj, &doi); if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) error = SET_ERROR(EINVAL); if (error == 0) dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset); return (error); } int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { return (zap_lookup_norm_by_dnode(dn, name, integer_size, num_integers, buf, 0, NULL, 0, NULL)); } int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { zap_t *zap; int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_impl(zap, name, integer_size, num_integers, buf, mt, realname, rn_len, ncp); zap_unlockdir(zap, FTAG); return (err); } static int zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints) { zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } fzap_prefetch(zn); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (0); } int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_prefetch_uint64_impl(zap, key, key_numints); /* zap_prefetch_uint64_impl() calls zap_unlockdir() */ return (err); } int zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints) { zap_t *zap; int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_prefetch_uint64_impl(zap, key, key_numints); /* zap_prefetch_uint64_impl() calls zap_unlockdir() */ return (err); } static int zap_lookup_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) { zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } int err = fzap_lookup(zn, integer_size, num_integers, buf, NULL, 0, NULL); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size, num_integers, buf); /* zap_lookup_uint64_impl() calls zap_unlockdir() */ return (err); } int zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) { zap_t *zap; int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size, num_integers, buf); /* zap_lookup_uint64_impl() calls zap_unlockdir() */ return (err); } int zap_contains(objset_t *os, uint64_t zapobj, const char *name) { int err = zap_lookup_norm(os, zapobj, name, 0, 0, NULL, 0, NULL, 0, NULL); if (err == EOVERFLOW || err == EINVAL) err = 0; /* found, but skipped reading the value */ return (err); } int zap_length(objset_t *os, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_length(zn, integer_size, num_integers); } else { zfs_btree_index_t idx; mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { if (integer_size) *integer_size = 8; if (num_integers) *num_integers = 1; } } zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_length(zn, integer_size, num_integers); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } static void mzap_addent(zap_name_t *zn, uint64_t value) { zap_t *zap = zn->zn_zap; uint16_t start = zap->zap_m.zap_alloc_next; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } #endif uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ ASSERT(cd < zap_maxcd(zap)); again: for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0] == 0) { mze->mze_value = value; mze->mze_cd = cd; (void) strlcpy(mze->mze_name, zn->zn_key_orig, sizeof (mze->mze_name)); zap->zap_m.zap_num_entries++; zap->zap_m.zap_alloc_next = i+1; if (zap->zap_m.zap_alloc_next == zap->zap_m.zap_num_chunks) zap->zap_m.zap_alloc_next = 0; mze_insert(zap, i, zn->zn_hash); return; } } if (start != 0) { start = 0; goto again; } cmn_err(CE_PANIC, "out of entries!"); } static int zap_add_impl(zap_t *zap, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, const void *tag) { const uint64_t *intval = val; int err = 0; zap_name_t *zn = zap_name_alloc_str(zap, key, 0); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_add(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(key) >= MZAP_NAME_LEN || !mze_canfit_fzap_leaf(zn, zn->zn_hash)) { err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); if (err == 0) { err = fzap_add(zn, integer_size, num_integers, val, tag, tx); } zap = zn->zn_zap; /* fzap_add() may change zap */ } else { zfs_btree_index_t idx; if (mze_find(zn, &idx) != NULL) { err = SET_ERROR(EEXIST); } else { mzap_addent(zn, *intval); } } ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ zap_unlockdir(zap, tag); return (err); } int zap_add(objset_t *os, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); /* zap_add_impl() calls zap_unlockdir() */ return (err); } int zap_add_by_dnode(dnode_t *dn, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); /* zap_add_impl() calls zap_unlockdir() */ return (err); } static int zap_add_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, const void *tag) { int err; zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } err = fzap_add(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ zap_unlockdir(zap, tag); return (err); } int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_uint64_impl(zap, key, key_numints, integer_size, num_integers, val, tx, FTAG); /* zap_add_uint64_impl() calls zap_unlockdir() */ return (err); } int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_uint64_impl(zap, key, key_numints, integer_size, num_integers, val, tx, FTAG); /* zap_add_uint64_impl() calls zap_unlockdir() */ return (err); } int zap_update(objset_t *os, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; const uint64_t *intval = val; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", (u_longlong_t)zapobj, integer_size, (u_longlong_t)num_integers, name); err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); if (err == 0) { err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); } zap = zn->zn_zap; /* fzap_update() may change zap */ } else { zfs_btree_index_t idx; mzap_ent_t *mze = mze_find(zn, &idx); if (mze != NULL) { MZE_PHYS(zap, mze)->mze_value = *intval; } else { mzap_addent(zn, *intval); } } ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ zap_unlockdir(zap, FTAG); return (err); } static int zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, const void *tag) { int err; zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } err = fzap_update(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ zap_unlockdir(zap, tag); return (err); } int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_update_uint64_impl(zap, key, key_numints, integer_size, num_integers, val, tx, FTAG); /* zap_update_uint64_impl() calls zap_unlockdir() */ return (err); } int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_update_uint64_impl(zap, key, key_numints, integer_size, num_integers, val, tx, FTAG); /* zap_update_uint64_impl() calls zap_unlockdir() */ return (err); } int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { return (zap_remove_norm(os, zapobj, name, 0, tx)); } static int zap_remove_impl(zap_t *zap, const char *name, matchtype_t mt, dmu_tx_t *tx) { int err = 0; zap_name_t *zn = zap_name_alloc_str(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { zfs_btree_index_t idx; mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { zap->zap_m.zap_num_entries--; memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t)); zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx); } } zap_name_free(zn); return (err); } int zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err) return (err); err = zap_remove_impl(zap, name, mt, tx); zap_unlockdir(zap, FTAG); return (err); } int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err) return (err); err = zap_remove_impl(zap, name, 0, tx); zap_unlockdir(zap, FTAG); return (err); } static int zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, dmu_tx_t *tx, const void *tag) { int err; zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } err = fzap_remove(zn, tx); zap_name_free(zn); zap_unlockdir(zap, tag); return (err); } int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); /* zap_remove_uint64_impl() calls zap_unlockdir() */ return (err); } int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); /* zap_remove_uint64_impl() calls zap_unlockdir() */ return (err); } static zap_attribute_t * zap_attribute_alloc_impl(boolean_t longname) { zap_attribute_t *za; za = kmem_cache_alloc((longname)? zap_attr_long_cache : zap_attr_cache, KM_SLEEP); za->za_name_len = (longname)? ZAP_MAXNAMELEN_NEW : ZAP_MAXNAMELEN; return (za); } zap_attribute_t * zap_attribute_alloc(void) { return (zap_attribute_alloc_impl(B_FALSE)); } zap_attribute_t * zap_attribute_long_alloc(void) { return (zap_attribute_alloc_impl(B_TRUE)); } void zap_attribute_free(zap_attribute_t *za) { if (za->za_name_len == ZAP_MAXNAMELEN) { kmem_cache_free(zap_attr_cache, za); } else { ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW); kmem_cache_free(zap_attr_long_cache, za); } } /* * Routines for iterating over the attributes. */ static void zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized, boolean_t prefetch) { zc->zc_objset = os; zc->zc_zap = NULL; zc->zc_leaf = NULL; zc->zc_zapobj = zapobj; zc->zc_serialized = serialized; zc->zc_hash = 0; zc->zc_cd = 0; zc->zc_prefetch = prefetch; } void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized) { zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); } /* * Initialize a cursor at the beginning of the ZAP object. The entire * ZAP object will be prefetched. */ void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); } /* * Initialize a cursor at the beginning, but request that we not prefetch * the entire ZAP object. */ void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); } void zap_cursor_fini(zap_cursor_t *zc) { if (zc->zc_zap) { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); zap_unlockdir(zc->zc_zap, NULL); zc->zc_zap = NULL; } if (zc->zc_leaf) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; } zc->zc_objset = NULL; } uint64_t zap_cursor_serialize(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return (-1ULL); if (zc->zc_zap == NULL) return (zc->zc_serialized); ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); /* * We want to keep the high 32 bits of the cursor zero if we can, so * that 32-bit programs can access this. So usually use a small * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits * of the cursor. * * [ collision differentiator | zap_hashbits()-bit hash value ] */ return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); } int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) { int err; if (zc->zc_hash == -1ULL) return (SET_ERROR(ENOENT)); if (zc->zc_zap == NULL) { int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); if (err != 0) return (err); /* * To support zap_cursor_init_serialized, advance, retrieve, * we must add to the existing zc_cd, which may already * be 1 due to the zap_cursor_advance. */ ASSERT(zc->zc_hash == 0); hb = zap_hashbits(zc->zc_zap); zc->zc_hash = zc->zc_serialized << (64 - hb); zc->zc_cd += zc->zc_serialized >> hb; if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ zc->zc_cd = 0; } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { zfs_btree_index_t idx; mzap_ent_t mze_tofind; mze_tofind.mze_hash = zc->zc_hash >> 32; mze_tofind.mze_cd = zc->zc_cd; mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree, &mze_tofind, &idx); if (mze == NULL) { mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree, &idx, &idx); } if (mze) { mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = mzap_normalization_conflict(zc->zc_zap, NULL, mze, &idx); za->za_integer_length = 8; za->za_num_integers = 1; za->za_first_integer = mzep->mze_value; (void) strlcpy(za->za_name, mzep->mze_name, za->za_name_len); zc->zc_hash = (uint64_t)mze->mze_hash << 32; zc->zc_cd = mze->mze_cd; err = 0; } else { zc->zc_hash = -1ULL; err = SET_ERROR(ENOENT); } } rw_exit(&zc->zc_zap->zap_rwlock); return (err); } void zap_cursor_advance(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return; zc->zc_cd++; } int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); memset(zs, 0, sizeof (zap_stats_t)); if (zap->zap_ismicro) { zs->zs_blocksize = zap->zap_dbuf->db_size; zs->zs_num_entries = zap->zap_m.zap_num_entries; zs->zs_num_blocks = 1; } else { fzap_get_stats(zap, zs); } zap_unlockdir(zap, FTAG); return (0); } #if defined(_KERNEL) EXPORT_SYMBOL(zap_create); EXPORT_SYMBOL(zap_create_dnsize); EXPORT_SYMBOL(zap_create_norm); EXPORT_SYMBOL(zap_create_norm_dnsize); EXPORT_SYMBOL(zap_create_flags); EXPORT_SYMBOL(zap_create_flags_dnsize); EXPORT_SYMBOL(zap_create_claim); EXPORT_SYMBOL(zap_create_claim_norm); EXPORT_SYMBOL(zap_create_claim_norm_dnsize); EXPORT_SYMBOL(zap_create_hold); EXPORT_SYMBOL(zap_destroy); EXPORT_SYMBOL(zap_lookup); EXPORT_SYMBOL(zap_lookup_by_dnode); EXPORT_SYMBOL(zap_lookup_norm); EXPORT_SYMBOL(zap_lookup_uint64); EXPORT_SYMBOL(zap_contains); EXPORT_SYMBOL(zap_prefetch); EXPORT_SYMBOL(zap_prefetch_uint64); EXPORT_SYMBOL(zap_prefetch_object); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); EXPORT_SYMBOL(zap_add_uint64_by_dnode); EXPORT_SYMBOL(zap_update); EXPORT_SYMBOL(zap_update_uint64); EXPORT_SYMBOL(zap_update_uint64_by_dnode); EXPORT_SYMBOL(zap_length); EXPORT_SYMBOL(zap_length_uint64); EXPORT_SYMBOL(zap_remove); EXPORT_SYMBOL(zap_remove_by_dnode); EXPORT_SYMBOL(zap_remove_norm); EXPORT_SYMBOL(zap_remove_uint64); EXPORT_SYMBOL(zap_remove_uint64_by_dnode); EXPORT_SYMBOL(zap_count); EXPORT_SYMBOL(zap_value_search); EXPORT_SYMBOL(zap_join); EXPORT_SYMBOL(zap_join_increment); EXPORT_SYMBOL(zap_add_int); EXPORT_SYMBOL(zap_remove_int); EXPORT_SYMBOL(zap_lookup_int); EXPORT_SYMBOL(zap_increment_int); EXPORT_SYMBOL(zap_add_int_key); EXPORT_SYMBOL(zap_lookup_int_key); EXPORT_SYMBOL(zap_increment); EXPORT_SYMBOL(zap_cursor_init); EXPORT_SYMBOL(zap_cursor_fini); EXPORT_SYMBOL(zap_cursor_retrieve); EXPORT_SYMBOL(zap_cursor_advance); EXPORT_SYMBOL(zap_cursor_serialize); EXPORT_SYMBOL(zap_cursor_init_serialized); EXPORT_SYMBOL(zap_get_stats); -/* CSTYLED */ ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW, "Maximum micro ZAP size, before converting to a fat ZAP, in bytes"); #endif