diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c index 9d90914aacfa..9daa975226fe 100644 --- a/module/icp/algs/aes/aes_impl.c +++ b/module/icp/algs/aes/aes_impl.c @@ -1,444 +1,444 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include /* * Initialize AES encryption and decryption key schedules. * * Parameters: * cipherKey User key * keyBits AES key size (128, 192, or 256 bits) * keysched AES key schedule to be initialized, of type aes_key_t. * Allocated by aes_alloc_keysched(). */ void aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched) { const aes_impl_ops_t *ops = aes_impl_get_ops(); aes_key_t *newbie = keysched; uint_t keysize, i, j; union { uint64_t ka64[4]; uint32_t ka32[8]; } keyarr; switch (keyBits) { case 128: newbie->nr = 10; break; case 192: newbie->nr = 12; break; case 256: newbie->nr = 14; break; default: /* should never get here */ return; } keysize = CRYPTO_BITS2BYTES(keyBits); /* * Generic C implementation requires byteswap for little endian * machines, various accelerated implementations for various * architectures may not. */ if (!ops->needs_byteswap) { /* no byteswap needed */ if (IS_P2ALIGNED(cipherKey, sizeof (uint64_t))) { for (i = 0, j = 0; j < keysize; i++, j += 8) { /* LINTED: pointer alignment */ keyarr.ka64[i] = *((uint64_t *)&cipherKey[j]); } } else { memcpy(keyarr.ka32, cipherKey, keysize); } } else { /* byte swap */ for (i = 0, j = 0; j < keysize; i++, j += 4) { keyarr.ka32[i] = htonl(*(uint32_t *)(void *)&cipherKey[j]); } } ops->generate(newbie, keyarr.ka32, keyBits); newbie->ops = ops; /* * Note: if there are systems that need the AES_64BIT_KS type in the * future, move setting key schedule type to individual implementations */ newbie->type = AES_32BIT_KS; } /* * Encrypt one block using AES. * Align if needed and (for x86 32-bit only) byte-swap. * * Parameters: * ks Key schedule, of type aes_key_t * pt Input block (plain text) * ct Output block (crypto text). Can overlap with pt */ int aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct) { aes_key_t *ksch = (aes_key_t *)ks; const aes_impl_ops_t *ops = ksch->ops; if (IS_P2ALIGNED2(pt, ct, sizeof (uint32_t)) && !ops->needs_byteswap) { /* LINTED: pointer alignment */ ops->encrypt(&ksch->encr_ks.ks32[0], ksch->nr, /* LINTED: pointer alignment */ (uint32_t *)pt, (uint32_t *)ct); } else { uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)]; /* Copy input block into buffer */ if (ops->needs_byteswap) { buffer[0] = htonl(*(uint32_t *)(void *)&pt[0]); buffer[1] = htonl(*(uint32_t *)(void *)&pt[4]); buffer[2] = htonl(*(uint32_t *)(void *)&pt[8]); buffer[3] = htonl(*(uint32_t *)(void *)&pt[12]); } else memcpy(&buffer, pt, AES_BLOCK_LEN); ops->encrypt(&ksch->encr_ks.ks32[0], ksch->nr, buffer, buffer); /* Copy result from buffer to output block */ if (ops->needs_byteswap) { *(uint32_t *)(void *)&ct[0] = htonl(buffer[0]); *(uint32_t *)(void *)&ct[4] = htonl(buffer[1]); *(uint32_t *)(void *)&ct[8] = htonl(buffer[2]); *(uint32_t *)(void *)&ct[12] = htonl(buffer[3]); } else memcpy(ct, &buffer, AES_BLOCK_LEN); } return (CRYPTO_SUCCESS); } /* * Decrypt one block using AES. * Align and byte-swap if needed. * * Parameters: * ks Key schedule, of type aes_key_t * ct Input block (crypto text) * pt Output block (plain text). Can overlap with pt */ int aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt) { aes_key_t *ksch = (aes_key_t *)ks; const aes_impl_ops_t *ops = ksch->ops; if (IS_P2ALIGNED2(ct, pt, sizeof (uint32_t)) && !ops->needs_byteswap) { /* LINTED: pointer alignment */ ops->decrypt(&ksch->decr_ks.ks32[0], ksch->nr, /* LINTED: pointer alignment */ (uint32_t *)ct, (uint32_t *)pt); } else { uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)]; /* Copy input block into buffer */ if (ops->needs_byteswap) { buffer[0] = htonl(*(uint32_t *)(void *)&ct[0]); buffer[1] = htonl(*(uint32_t *)(void *)&ct[4]); buffer[2] = htonl(*(uint32_t *)(void *)&ct[8]); buffer[3] = htonl(*(uint32_t *)(void *)&ct[12]); } else memcpy(&buffer, ct, AES_BLOCK_LEN); ops->decrypt(&ksch->decr_ks.ks32[0], ksch->nr, buffer, buffer); /* Copy result from buffer to output block */ if (ops->needs_byteswap) { *(uint32_t *)(void *)&pt[0] = htonl(buffer[0]); *(uint32_t *)(void *)&pt[4] = htonl(buffer[1]); *(uint32_t *)(void *)&pt[8] = htonl(buffer[2]); *(uint32_t *)(void *)&pt[12] = htonl(buffer[3]); } else memcpy(pt, &buffer, AES_BLOCK_LEN); } return (CRYPTO_SUCCESS); } /* * Allocate key schedule for AES. * * Return the pointer and set size to the number of bytes allocated. * Memory allocated must be freed by the caller when done. * * Parameters: * size Size of key schedule allocated, in bytes * kmflag Flag passed to kmem_alloc(9F); ignored in userland. */ void * aes_alloc_keysched(size_t *size, int kmflag) { aes_key_t *keysched; - keysched = (aes_key_t *)kmem_alloc(sizeof (aes_key_t), kmflag); + keysched = kmem_alloc(sizeof (aes_key_t), kmflag); if (keysched != NULL) { *size = sizeof (aes_key_t); return (keysched); } return (NULL); } /* AES implementation that contains the fastest methods */ static aes_impl_ops_t aes_fastest_impl = { .name = "fastest" }; /* All compiled in implementations */ static const aes_impl_ops_t *aes_all_impl[] = { &aes_generic_impl, #if defined(__x86_64) &aes_x86_64_impl, #endif #if defined(__x86_64) && defined(HAVE_AES) &aes_aesni_impl, #endif }; /* Indicate that benchmark has been completed */ static boolean_t aes_impl_initialized = B_FALSE; /* Select aes implementation */ #define IMPL_FASTEST (UINT32_MAX) #define IMPL_CYCLE (UINT32_MAX-1) #define AES_IMPL_READ(i) (*(volatile uint32_t *) &(i)) static uint32_t icp_aes_impl = IMPL_FASTEST; static uint32_t user_sel_impl = IMPL_FASTEST; /* Hold all supported implementations */ static size_t aes_supp_impl_cnt = 0; static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)]; /* * Returns the AES operations for encrypt/decrypt/key setup. When a * SIMD implementation is not allowed in the current context, then * fallback to the fastest generic implementation. */ const aes_impl_ops_t * aes_impl_get_ops(void) { if (!kfpu_allowed()) return (&aes_generic_impl); const aes_impl_ops_t *ops = NULL; const uint32_t impl = AES_IMPL_READ(icp_aes_impl); switch (impl) { case IMPL_FASTEST: ASSERT(aes_impl_initialized); ops = &aes_fastest_impl; break; case IMPL_CYCLE: /* Cycle through supported implementations */ ASSERT(aes_impl_initialized); ASSERT3U(aes_supp_impl_cnt, >, 0); static size_t cycle_impl_idx = 0; size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt; ops = aes_supp_impl[idx]; break; default: ASSERT3U(impl, <, aes_supp_impl_cnt); ASSERT3U(aes_supp_impl_cnt, >, 0); if (impl < ARRAY_SIZE(aes_all_impl)) ops = aes_supp_impl[impl]; break; } ASSERT3P(ops, !=, NULL); return (ops); } /* * Initialize all supported implementations. */ void aes_impl_init(void) { aes_impl_ops_t *curr_impl; int i, c; /* Move supported implementations into aes_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) { curr_impl = (aes_impl_ops_t *)aes_all_impl[i]; if (curr_impl->is_supported()) aes_supp_impl[c++] = (aes_impl_ops_t *)curr_impl; } aes_supp_impl_cnt = c; /* * Set the fastest implementation given the assumption that the * hardware accelerated version is the fastest. */ #if defined(__x86_64) #if defined(HAVE_AES) if (aes_aesni_impl.is_supported()) { memcpy(&aes_fastest_impl, &aes_aesni_impl, sizeof (aes_fastest_impl)); } else #endif { memcpy(&aes_fastest_impl, &aes_x86_64_impl, sizeof (aes_fastest_impl)); } #else memcpy(&aes_fastest_impl, &aes_generic_impl, sizeof (aes_fastest_impl)); #endif strlcpy(aes_fastest_impl.name, "fastest", AES_IMPL_NAME_MAX); /* Finish initialization */ atomic_swap_32(&icp_aes_impl, user_sel_impl); aes_impl_initialized = B_TRUE; } static const struct { const char *name; uint32_t sel; } aes_impl_opts[] = { { "cycle", IMPL_CYCLE }, { "fastest", IMPL_FASTEST }, }; /* * Function sets desired aes implementation. * * If we are called before init(), user preference will be saved in * user_sel_impl, and applied in later init() call. This occurs when module * parameter is specified on module load. Otherwise, directly update * icp_aes_impl. * * @val Name of aes implementation to use * @param Unused. */ int aes_impl_set(const char *val) { int err = -EINVAL; char req_name[AES_IMPL_NAME_MAX]; uint32_t impl = AES_IMPL_READ(user_sel_impl); size_t i; /* sanitize input */ i = strnlen(val, AES_IMPL_NAME_MAX); if (i == 0 || i >= AES_IMPL_NAME_MAX) return (err); strlcpy(req_name, val, AES_IMPL_NAME_MAX); while (i > 0 && isspace(req_name[i-1])) i--; req_name[i] = '\0'; /* Check mandatory options */ for (i = 0; i < ARRAY_SIZE(aes_impl_opts); i++) { if (strcmp(req_name, aes_impl_opts[i].name) == 0) { impl = aes_impl_opts[i].sel; err = 0; break; } } /* check all supported impl if init() was already called */ if (err != 0 && aes_impl_initialized) { /* check all supported implementations */ for (i = 0; i < aes_supp_impl_cnt; i++) { if (strcmp(req_name, aes_supp_impl[i]->name) == 0) { impl = i; err = 0; break; } } } if (err == 0) { if (aes_impl_initialized) atomic_swap_32(&icp_aes_impl, impl); else atomic_swap_32(&user_sel_impl, impl); } return (err); } #if defined(_KERNEL) && defined(__linux__) static int icp_aes_impl_set(const char *val, zfs_kernel_param_t *kp) { return (aes_impl_set(val)); } static int icp_aes_impl_get(char *buffer, zfs_kernel_param_t *kp) { int i, cnt = 0; char *fmt; const uint32_t impl = AES_IMPL_READ(icp_aes_impl); ASSERT(aes_impl_initialized); /* list mandatory options */ for (i = 0; i < ARRAY_SIZE(aes_impl_opts); i++) { fmt = (impl == aes_impl_opts[i].sel) ? "[%s] " : "%s "; cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, aes_impl_opts[i].name); } /* list all supported implementations */ for (i = 0; i < aes_supp_impl_cnt; i++) { fmt = (i == impl) ? "[%s] " : "%s "; cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, aes_supp_impl[i]->name); } return (cnt); } module_param_call(icp_aes_impl, icp_aes_impl_set, icp_aes_impl_get, NULL, 0644); MODULE_PARM_DESC(icp_aes_impl, "Select aes implementation."); #endif diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c index 558a578090b2..a8792c4555e2 100644 --- a/module/icp/algs/modes/gcm.c +++ b/module/icp/algs/modes/gcm.c @@ -1,1596 +1,1596 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #ifdef CAN_USE_GCM_ASM #include #endif #define GHASH(c, d, t, o) \ xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \ (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \ (uint64_t *)(void *)(t)); /* Select GCM implementation */ #define IMPL_FASTEST (UINT32_MAX) #define IMPL_CYCLE (UINT32_MAX-1) #ifdef CAN_USE_GCM_ASM #define IMPL_AVX (UINT32_MAX-2) #endif #define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) static uint32_t icp_gcm_impl = IMPL_FASTEST; static uint32_t user_sel_impl = IMPL_FASTEST; #ifdef CAN_USE_GCM_ASM /* Does the architecture we run on support the MOVBE instruction? */ boolean_t gcm_avx_can_use_movbe = B_FALSE; /* * Whether to use the optimized openssl gcm and ghash implementations. * Set to true if module parameter icp_gcm_impl == "avx". */ static boolean_t gcm_use_avx = B_FALSE; #define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); static inline boolean_t gcm_avx_will_work(void); static inline void gcm_set_avx(boolean_t); static inline boolean_t gcm_toggle_avx(void); static inline size_t gcm_simd_get_htab_size(boolean_t); static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, crypto_data_t *, size_t); static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *, size_t, size_t); #endif /* ifdef CAN_USE_GCM_ASM */ /* * Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode * is done in another function. */ int gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { #ifdef CAN_USE_GCM_ASM if (ctx->gcm_use_avx == B_TRUE) return (gcm_mode_encrypt_contiguous_blocks_avx( ctx, data, length, out, block_size)); #endif const gcm_impl_ops_t *gops; size_t remainder = length; size_t need = 0; uint8_t *datap = (uint8_t *)data; uint8_t *blockp; uint8_t *lastp; void *iov_or_mp; offset_t offset; uint8_t *out_data_1; uint8_t *out_data_2; size_t out_data_1_len; uint64_t counter; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); if (length + ctx->gcm_remainder_len < block_size) { /* accumulate bytes here and return */ memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len, datap, length); ctx->gcm_remainder_len += length; if (ctx->gcm_copy_to == NULL) { ctx->gcm_copy_to = datap; } return (CRYPTO_SUCCESS); } crypto_init_ptrs(out, &iov_or_mp, &offset); gops = gcm_impl_get_ops(); do { /* Unprocessed data from last call. */ if (ctx->gcm_remainder_len > 0) { need = block_size - ctx->gcm_remainder_len; if (need > remainder) return (CRYPTO_DATA_LEN_RANGE); memcpy(&((uint8_t *)ctx->gcm_remainder) [ctx->gcm_remainder_len], datap, need); blockp = (uint8_t *)ctx->gcm_remainder; } else { blockp = datap; } /* * Increment counter. Counter bits are confined * to the bottom 32 bits of the counter block. */ counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + 1); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, (uint8_t *)ctx->gcm_tmp); xor_block(blockp, (uint8_t *)ctx->gcm_tmp); lastp = (uint8_t *)ctx->gcm_tmp; ctx->gcm_processed_data_len += block_size; crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, &out_data_1_len, &out_data_2, block_size); /* copy block to where it belongs */ if (out_data_1_len == block_size) { copy_block(lastp, out_data_1); } else { memcpy(out_data_1, lastp, out_data_1_len); if (out_data_2 != NULL) { memcpy(out_data_2, lastp + out_data_1_len, block_size - out_data_1_len); } } /* update offset */ out->cd_offset += block_size; /* add ciphertext to the hash */ GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops); /* Update pointer to next block of data to be processed. */ if (ctx->gcm_remainder_len != 0) { datap += need; ctx->gcm_remainder_len = 0; } else { datap += block_size; } remainder = (size_t)&data[length] - (size_t)datap; /* Incomplete last block. */ if (remainder > 0 && remainder < block_size) { memcpy(ctx->gcm_remainder, datap, remainder); ctx->gcm_remainder_len = remainder; ctx->gcm_copy_to = datap; goto out; } ctx->gcm_copy_to = NULL; } while (remainder > 0); out: return (CRYPTO_SUCCESS); } int gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { (void) copy_block; #ifdef CAN_USE_GCM_ASM if (ctx->gcm_use_avx == B_TRUE) return (gcm_encrypt_final_avx(ctx, out, block_size)); #endif const gcm_impl_ops_t *gops; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); uint8_t *ghash, *macp = NULL; int i, rv; if (out->cd_length < (ctx->gcm_remainder_len + ctx->gcm_tag_len)) { return (CRYPTO_DATA_LEN_RANGE); } gops = gcm_impl_get_ops(); ghash = (uint8_t *)ctx->gcm_ghash; if (ctx->gcm_remainder_len > 0) { uint64_t counter; uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp; /* * Here is where we deal with data that is not a * multiple of the block size. */ /* * Increment counter. */ counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + 1); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, (uint8_t *)ctx->gcm_tmp); macp = (uint8_t *)ctx->gcm_remainder; memset(macp + ctx->gcm_remainder_len, 0, block_size - ctx->gcm_remainder_len); /* XOR with counter block */ for (i = 0; i < ctx->gcm_remainder_len; i++) { macp[i] ^= tmpp[i]; } /* add ciphertext to the hash */ GHASH(ctx, macp, ghash, gops); ctx->gcm_processed_data_len += ctx->gcm_remainder_len; } ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len)); GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops); encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0, (uint8_t *)ctx->gcm_J0); xor_block((uint8_t *)ctx->gcm_J0, ghash); if (ctx->gcm_remainder_len > 0) { rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len); if (rv != CRYPTO_SUCCESS) return (rv); } out->cd_offset += ctx->gcm_remainder_len; ctx->gcm_remainder_len = 0; rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len); if (rv != CRYPTO_SUCCESS) return (rv); out->cd_offset += ctx->gcm_tag_len; return (CRYPTO_SUCCESS); } /* * This will only deal with decrypting the last block of the input that * might not be a multiple of block length. */ static void gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { uint8_t *datap, *outp, *counterp; uint64_t counter; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); int i; /* * Increment counter. * Counter bits are confined to the bottom 32 bits */ counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + 1); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; datap = (uint8_t *)ctx->gcm_remainder; outp = &((ctx->gcm_pt_buf)[index]); counterp = (uint8_t *)ctx->gcm_tmp; /* authentication tag */ memset((uint8_t *)ctx->gcm_tmp, 0, block_size); memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len); /* add ciphertext to the hash */ GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops()); /* decrypt remaining ciphertext */ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp); /* XOR with counter block */ for (i = 0; i < ctx->gcm_remainder_len; i++) { outp[i] = datap[i] ^ counterp[i]; } } int gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { (void) out, (void) block_size, (void) encrypt_block, (void) copy_block, (void) xor_block; size_t new_len; uint8_t *new; /* * Copy contiguous ciphertext input blocks to plaintext buffer. * Ciphertext will be decrypted in the final. */ if (length > 0) { new_len = ctx->gcm_pt_buf_len + length; new = vmem_alloc(new_len, KM_SLEEP); if (new == NULL) { vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); ctx->gcm_pt_buf = NULL; return (CRYPTO_HOST_MEMORY); } if (ctx->gcm_pt_buf != NULL) { memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); } else { ASSERT0(ctx->gcm_pt_buf_len); } ctx->gcm_pt_buf = new; ctx->gcm_pt_buf_len = new_len; memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data, length); ctx->gcm_processed_data_len += length; } ctx->gcm_remainder_len = 0; return (CRYPTO_SUCCESS); } int gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { #ifdef CAN_USE_GCM_ASM if (ctx->gcm_use_avx == B_TRUE) return (gcm_decrypt_final_avx(ctx, out, block_size)); #endif const gcm_impl_ops_t *gops; size_t pt_len; size_t remainder; uint8_t *ghash; uint8_t *blockp; uint8_t *cbp; uint64_t counter; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); int processed = 0, rv; ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len); gops = gcm_impl_get_ops(); pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; ghash = (uint8_t *)ctx->gcm_ghash; blockp = ctx->gcm_pt_buf; remainder = pt_len; while (remainder > 0) { /* Incomplete last block */ if (remainder < block_size) { memcpy(ctx->gcm_remainder, blockp, remainder); ctx->gcm_remainder_len = remainder; /* * not expecting anymore ciphertext, just * compute plaintext for the remaining input */ gcm_decrypt_incomplete_block(ctx, block_size, processed, encrypt_block, xor_block); ctx->gcm_remainder_len = 0; goto out; } /* add ciphertext to the hash */ GHASH(ctx, blockp, ghash, gops); /* * Increment counter. * Counter bits are confined to the bottom 32 bits */ counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + 1); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; cbp = (uint8_t *)ctx->gcm_tmp; encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp); /* XOR with ciphertext */ xor_block(cbp, blockp); processed += block_size; blockp += block_size; remainder -= block_size; } out: ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len)); GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops); encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0, (uint8_t *)ctx->gcm_J0); xor_block((uint8_t *)ctx->gcm_J0, ghash); /* compare the input authentication tag with what we calculated */ if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) { /* They don't match */ return (CRYPTO_INVALID_MAC); } else { rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len); if (rv != CRYPTO_SUCCESS) return (rv); out->cd_offset += pt_len; } return (CRYPTO_SUCCESS); } static int gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param) { size_t tag_len; /* * Check the length of the authentication tag (in bits). */ tag_len = gcm_param->ulTagBits; switch (tag_len) { case 32: case 64: case 96: case 104: case 112: case 120: case 128: break; default: return (CRYPTO_MECHANISM_PARAM_INVALID); } if (gcm_param->ulIvLen == 0) return (CRYPTO_MECHANISM_PARAM_INVALID); return (CRYPTO_SUCCESS); } static void gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len, gcm_ctx_t *ctx, size_t block_size, void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { const gcm_impl_ops_t *gops; uint8_t *cb; ulong_t remainder = iv_len; ulong_t processed = 0; uint8_t *datap, *ghash; uint64_t len_a_len_c[2]; gops = gcm_impl_get_ops(); ghash = (uint8_t *)ctx->gcm_ghash; cb = (uint8_t *)ctx->gcm_cb; if (iv_len == 12) { memcpy(cb, iv, 12); cb[12] = 0; cb[13] = 0; cb[14] = 0; cb[15] = 1; /* J0 will be used again in the final */ copy_block(cb, (uint8_t *)ctx->gcm_J0); } else { /* GHASH the IV */ do { if (remainder < block_size) { memset(cb, 0, block_size); memcpy(cb, &(iv[processed]), remainder); datap = (uint8_t *)cb; remainder = 0; } else { datap = (uint8_t *)(&(iv[processed])); processed += block_size; remainder -= block_size; } GHASH(ctx, datap, ghash, gops); } while (remainder > 0); len_a_len_c[0] = 0; len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len)); GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops); /* J0 will be used again in the final */ copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb); } } static int gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, unsigned char *auth_data, size_t auth_data_len, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { const gcm_impl_ops_t *gops; uint8_t *ghash, *datap, *authp; size_t remainder, processed; /* encrypt zero block to get subkey H */ memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H)); encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H, (uint8_t *)ctx->gcm_H); gcm_format_initial_blocks(iv, iv_len, ctx, block_size, copy_block, xor_block); gops = gcm_impl_get_ops(); authp = (uint8_t *)ctx->gcm_tmp; ghash = (uint8_t *)ctx->gcm_ghash; memset(authp, 0, block_size); memset(ghash, 0, block_size); processed = 0; remainder = auth_data_len; do { if (remainder < block_size) { /* * There's not a block full of data, pad rest of * buffer with zero */ if (auth_data != NULL) { memset(authp, 0, block_size); memcpy(authp, &(auth_data[processed]), remainder); } else { ASSERT0(remainder); } datap = (uint8_t *)authp; remainder = 0; } else { datap = (uint8_t *)(&(auth_data[processed])); processed += block_size; remainder -= block_size; } /* add auth data to the hash */ GHASH(ctx, datap, ghash, gops); } while (remainder > 0); return (CRYPTO_SUCCESS); } /* * The following function is called at encrypt or decrypt init time * for AES GCM mode. * * Init the GCM context struct. Handle the cycle and avx implementations here. */ int gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { int rv; CK_AES_GCM_PARAMS *gcm_param; if (param != NULL) { gcm_param = (CK_AES_GCM_PARAMS *)(void *)param; if ((rv = gcm_validate_args(gcm_param)) != 0) { return (rv); } gcm_ctx->gcm_tag_len = gcm_param->ulTagBits; gcm_ctx->gcm_tag_len >>= 3; gcm_ctx->gcm_processed_data_len = 0; /* these values are in bits */ gcm_ctx->gcm_len_a_len_c[0] = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen)); rv = CRYPTO_SUCCESS; gcm_ctx->gcm_flags |= GCM_MODE; } else { return (CRYPTO_MECHANISM_PARAM_INVALID); } #ifdef CAN_USE_GCM_ASM if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; } else { /* * Handle the "cycle" implementation by creating avx and * non-avx contexts alternately. */ gcm_ctx->gcm_use_avx = gcm_toggle_avx(); /* * We don't handle byte swapped key schedules in the avx * code path. */ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; if (ks->ops->needs_byteswap == B_TRUE) { gcm_ctx->gcm_use_avx = B_FALSE; } /* Use the MOVBE and the BSWAP variants alternately. */ if (gcm_ctx->gcm_use_avx == B_TRUE && zfs_movbe_available() == B_TRUE) { (void) atomic_toggle_boolean_nv( (volatile boolean_t *)&gcm_avx_can_use_movbe); } } /* Allocate Htab memory as needed. */ if (gcm_ctx->gcm_use_avx == B_TRUE) { size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); if (htab_len == 0) { return (CRYPTO_MECHANISM_PARAM_INVALID); } gcm_ctx->gcm_htab_len = htab_len; gcm_ctx->gcm_Htable = - (uint64_t *)kmem_alloc(htab_len, KM_SLEEP); + kmem_alloc(htab_len, KM_SLEEP); if (gcm_ctx->gcm_Htable == NULL) { return (CRYPTO_HOST_MEMORY); } } /* Avx and non avx context initialization differs from here on. */ if (gcm_ctx->gcm_use_avx == B_FALSE) { #endif /* ifdef CAN_USE_GCM_ASM */ if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, gcm_param->pAAD, gcm_param->ulAADLen, block_size, encrypt_block, copy_block, xor_block) != 0) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } #ifdef CAN_USE_GCM_ASM } else { if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } } #endif /* ifdef CAN_USE_GCM_ASM */ return (rv); } int gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { int rv; CK_AES_GMAC_PARAMS *gmac_param; if (param != NULL) { gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param; gcm_ctx->gcm_tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS); gcm_ctx->gcm_processed_data_len = 0; /* these values are in bits */ gcm_ctx->gcm_len_a_len_c[0] = htonll(CRYPTO_BYTES2BITS(gmac_param->ulAADLen)); rv = CRYPTO_SUCCESS; gcm_ctx->gcm_flags |= GMAC_MODE; } else { return (CRYPTO_MECHANISM_PARAM_INVALID); } #ifdef CAN_USE_GCM_ASM /* * Handle the "cycle" implementation by creating avx and non avx * contexts alternately. */ if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; } else { gcm_ctx->gcm_use_avx = gcm_toggle_avx(); } /* We don't handle byte swapped key schedules in the avx code path. */ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; if (ks->ops->needs_byteswap == B_TRUE) { gcm_ctx->gcm_use_avx = B_FALSE; } /* Allocate Htab memory as needed. */ if (gcm_ctx->gcm_use_avx == B_TRUE) { size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); if (htab_len == 0) { return (CRYPTO_MECHANISM_PARAM_INVALID); } gcm_ctx->gcm_htab_len = htab_len; gcm_ctx->gcm_Htable = - (uint64_t *)kmem_alloc(htab_len, KM_SLEEP); + kmem_alloc(htab_len, KM_SLEEP); if (gcm_ctx->gcm_Htable == NULL) { return (CRYPTO_HOST_MEMORY); } } /* Avx and non avx context initialization differs from here on. */ if (gcm_ctx->gcm_use_avx == B_FALSE) { #endif /* ifdef CAN_USE_GCM_ASM */ if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, gmac_param->pAAD, gmac_param->ulAADLen, block_size, encrypt_block, copy_block, xor_block) != 0) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } #ifdef CAN_USE_GCM_ASM } else { if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } } #endif /* ifdef CAN_USE_GCM_ASM */ return (rv); } void * gcm_alloc_ctx(int kmflag) { gcm_ctx_t *gcm_ctx; if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL) return (NULL); gcm_ctx->gcm_flags = GCM_MODE; return (gcm_ctx); } void * gmac_alloc_ctx(int kmflag) { gcm_ctx_t *gcm_ctx; if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL) return (NULL); gcm_ctx->gcm_flags = GMAC_MODE; return (gcm_ctx); } /* GCM implementation that contains the fastest methods */ static gcm_impl_ops_t gcm_fastest_impl = { .name = "fastest" }; /* All compiled in implementations */ static const gcm_impl_ops_t *gcm_all_impl[] = { &gcm_generic_impl, #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) &gcm_pclmulqdq_impl, #endif }; /* Indicate that benchmark has been completed */ static boolean_t gcm_impl_initialized = B_FALSE; /* Hold all supported implementations */ static size_t gcm_supp_impl_cnt = 0; static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)]; /* * Returns the GCM operations for encrypt/decrypt/key setup. When a * SIMD implementation is not allowed in the current context, then * fallback to the fastest generic implementation. */ const gcm_impl_ops_t * gcm_impl_get_ops(void) { if (!kfpu_allowed()) return (&gcm_generic_impl); const gcm_impl_ops_t *ops = NULL; const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); switch (impl) { case IMPL_FASTEST: ASSERT(gcm_impl_initialized); ops = &gcm_fastest_impl; break; case IMPL_CYCLE: /* Cycle through supported implementations */ ASSERT(gcm_impl_initialized); ASSERT3U(gcm_supp_impl_cnt, >, 0); static size_t cycle_impl_idx = 0; size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt; ops = gcm_supp_impl[idx]; break; #ifdef CAN_USE_GCM_ASM case IMPL_AVX: /* * Make sure that we return a valid implementation while * switching to the avx implementation since there still * may be unfinished non-avx contexts around. */ ops = &gcm_generic_impl; break; #endif default: ASSERT3U(impl, <, gcm_supp_impl_cnt); ASSERT3U(gcm_supp_impl_cnt, >, 0); if (impl < ARRAY_SIZE(gcm_all_impl)) ops = gcm_supp_impl[impl]; break; } ASSERT3P(ops, !=, NULL); return (ops); } /* * Initialize all supported implementations. */ void gcm_impl_init(void) { gcm_impl_ops_t *curr_impl; int i, c; /* Move supported implementations into gcm_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) { curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i]; if (curr_impl->is_supported()) gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl; } gcm_supp_impl_cnt = c; /* * Set the fastest implementation given the assumption that the * hardware accelerated version is the fastest. */ #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) if (gcm_pclmulqdq_impl.is_supported()) { memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl, sizeof (gcm_fastest_impl)); } else #endif { memcpy(&gcm_fastest_impl, &gcm_generic_impl, sizeof (gcm_fastest_impl)); } strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX); #ifdef CAN_USE_GCM_ASM /* * Use the avx implementation if it's available and the implementation * hasn't changed from its default value of fastest on module load. */ if (gcm_avx_will_work()) { #ifdef HAVE_MOVBE if (zfs_movbe_available() == B_TRUE) { atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE); } #endif if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { gcm_set_avx(B_TRUE); } } #endif /* Finish initialization */ atomic_swap_32(&icp_gcm_impl, user_sel_impl); gcm_impl_initialized = B_TRUE; } static const struct { const char *name; uint32_t sel; } gcm_impl_opts[] = { { "cycle", IMPL_CYCLE }, { "fastest", IMPL_FASTEST }, #ifdef CAN_USE_GCM_ASM { "avx", IMPL_AVX }, #endif }; /* * Function sets desired gcm implementation. * * If we are called before init(), user preference will be saved in * user_sel_impl, and applied in later init() call. This occurs when module * parameter is specified on module load. Otherwise, directly update * icp_gcm_impl. * * @val Name of gcm implementation to use * @param Unused. */ int gcm_impl_set(const char *val) { int err = -EINVAL; char req_name[GCM_IMPL_NAME_MAX]; uint32_t impl = GCM_IMPL_READ(user_sel_impl); size_t i; /* sanitize input */ i = strnlen(val, GCM_IMPL_NAME_MAX); if (i == 0 || i >= GCM_IMPL_NAME_MAX) return (err); strlcpy(req_name, val, GCM_IMPL_NAME_MAX); while (i > 0 && isspace(req_name[i-1])) i--; req_name[i] = '\0'; /* Check mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM /* Ignore avx implementation if it won't work. */ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { continue; } #endif if (strcmp(req_name, gcm_impl_opts[i].name) == 0) { impl = gcm_impl_opts[i].sel; err = 0; break; } } /* check all supported impl if init() was already called */ if (err != 0 && gcm_impl_initialized) { /* check all supported implementations */ for (i = 0; i < gcm_supp_impl_cnt; i++) { if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) { impl = i; err = 0; break; } } } #ifdef CAN_USE_GCM_ASM /* * Use the avx implementation if available and the requested one is * avx or fastest. */ if (gcm_avx_will_work() == B_TRUE && (impl == IMPL_AVX || impl == IMPL_FASTEST)) { gcm_set_avx(B_TRUE); } else { gcm_set_avx(B_FALSE); } #endif if (err == 0) { if (gcm_impl_initialized) atomic_swap_32(&icp_gcm_impl, impl); else atomic_swap_32(&user_sel_impl, impl); } return (err); } #if defined(_KERNEL) && defined(__linux__) static int icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp) { return (gcm_impl_set(val)); } static int icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp) { int i, cnt = 0; char *fmt; const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); ASSERT(gcm_impl_initialized); /* list mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM /* Ignore avx implementation if it won't work. */ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { continue; } #endif fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s "; cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, gcm_impl_opts[i].name); } /* list all supported implementations */ for (i = 0; i < gcm_supp_impl_cnt; i++) { fmt = (i == impl) ? "[%s] " : "%s "; cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, gcm_supp_impl[i]->name); } return (cnt); } module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get, NULL, 0644); MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); #endif /* defined(__KERNEL) */ #ifdef CAN_USE_GCM_ASM #define GCM_BLOCK_LEN 16 /* * The openssl asm routines are 6x aggregated and need that many bytes * at minimum. */ #define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6) #define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3) /* * Ensure the chunk size is reasonable since we are allocating a * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts. */ #define GCM_AVX_MAX_CHUNK_SIZE \ (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES) /* Clear the FPU registers since they hold sensitive internal state. */ #define clear_fpu_regs() clear_fpu_regs_avx() #define GHASH_AVX(ctx, in, len) \ gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \ in, len) #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) /* Get the chunk size module parameter. */ #define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size /* * Module parameter: number of bytes to process at once while owning the FPU. * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is * ensured to be greater or equal than GCM_AVX_MIN_DECRYPT_BYTES. */ static uint32_t gcm_avx_chunk_size = ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; extern void clear_fpu_regs_avx(void); extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst); extern void aes_encrypt_intel(const uint32_t rk[], int nr, const uint32_t pt[4], uint32_t ct[4]); extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]); extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable, const uint8_t *in, size_t len); extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); static inline boolean_t gcm_avx_will_work(void) { /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */ return (kfpu_allowed() && zfs_avx_available() && zfs_aes_available() && zfs_pclmulqdq_available()); } static inline void gcm_set_avx(boolean_t val) { if (gcm_avx_will_work() == B_TRUE) { atomic_swap_32(&gcm_use_avx, val); } } static inline boolean_t gcm_toggle_avx(void) { if (gcm_avx_will_work() == B_TRUE) { return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX)); } else { return (B_FALSE); } } static inline size_t gcm_simd_get_htab_size(boolean_t simd_mode) { switch (simd_mode) { case B_TRUE: return (2 * 6 * 2 * sizeof (uint64_t)); default: return (0); } } /* * Clear sensitive data in the context. * * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and * ctx->gcm_Htable contain the hash sub key which protects authentication. * * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used for * a known plaintext attack, they consists of the IV and the first and last * counter respectively. If they should be cleared is debatable. */ static inline void gcm_clear_ctx(gcm_ctx_t *ctx) { memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder)); memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H)); memset(ctx->gcm_J0, 0, sizeof (ctx->gcm_J0)); memset(ctx->gcm_tmp, 0, sizeof (ctx->gcm_tmp)); } /* Increment the GCM counter block by n. */ static inline void gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n) { uint64_t counter_mask = ntohll(0x00000000ffffffffULL); uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + n); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; } /* * Encrypt multiple blocks of data in GCM mode. * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines * if possible. While processing a chunk the FPU is "locked". */ static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size) { size_t bleft = length; size_t need = 0; size_t done = 0; uint8_t *datap = (uint8_t *)data; size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); uint64_t *ghash = ctx->gcm_ghash; uint64_t *cb = ctx->gcm_cb; uint8_t *ct_buf = NULL; uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; int rv = CRYPTO_SUCCESS; ASSERT(block_size == GCM_BLOCK_LEN); /* * If the last call left an incomplete block, try to fill * it first. */ if (ctx->gcm_remainder_len > 0) { need = block_size - ctx->gcm_remainder_len; if (length < need) { /* Accumulate bytes here and return. */ memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len, datap, length); ctx->gcm_remainder_len += length; if (ctx->gcm_copy_to == NULL) { ctx->gcm_copy_to = datap; } return (CRYPTO_SUCCESS); } else { /* Complete incomplete block. */ memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len, datap, need); ctx->gcm_copy_to = NULL; } } /* Allocate a buffer to encrypt to if there is enough input. */ if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { ct_buf = vmem_alloc(chunk_size, KM_SLEEP); if (ct_buf == NULL) { return (CRYPTO_HOST_MEMORY); } } /* If we completed an incomplete block, encrypt and write it out. */ if (ctx->gcm_remainder_len > 0) { kfpu_begin(); aes_encrypt_intel(key->encr_ks.ks32, key->nr, (const uint32_t *)cb, (uint32_t *)tmp); gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp); GHASH_AVX(ctx, tmp, block_size); clear_fpu_regs(); kfpu_end(); rv = crypto_put_output_data(tmp, out, block_size); out->cd_offset += block_size; gcm_incr_counter_block(ctx); ctx->gcm_processed_data_len += block_size; bleft -= need; datap += need; ctx->gcm_remainder_len = 0; } /* Do the bulk encryption in chunk_size blocks. */ for (; bleft >= chunk_size; bleft -= chunk_size) { kfpu_begin(); done = aesni_gcm_encrypt( datap, ct_buf, chunk_size, key, cb, ghash); clear_fpu_regs(); kfpu_end(); if (done != chunk_size) { rv = CRYPTO_FAILED; goto out_nofpu; } rv = crypto_put_output_data(ct_buf, out, chunk_size); if (rv != CRYPTO_SUCCESS) { goto out_nofpu; } out->cd_offset += chunk_size; datap += chunk_size; ctx->gcm_processed_data_len += chunk_size; } /* Check if we are already done. */ if (bleft == 0) { goto out_nofpu; } /* Bulk encrypt the remaining data. */ kfpu_begin(); if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash); if (done == 0) { rv = CRYPTO_FAILED; goto out; } rv = crypto_put_output_data(ct_buf, out, done); if (rv != CRYPTO_SUCCESS) { goto out; } out->cd_offset += done; ctx->gcm_processed_data_len += done; datap += done; bleft -= done; } /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */ while (bleft > 0) { if (bleft < block_size) { memcpy(ctx->gcm_remainder, datap, bleft); ctx->gcm_remainder_len = bleft; ctx->gcm_copy_to = datap; goto out; } /* Encrypt, hash and write out. */ aes_encrypt_intel(key->encr_ks.ks32, key->nr, (const uint32_t *)cb, (uint32_t *)tmp); gcm_xor_avx(datap, tmp); GHASH_AVX(ctx, tmp, block_size); rv = crypto_put_output_data(tmp, out, block_size); if (rv != CRYPTO_SUCCESS) { goto out; } out->cd_offset += block_size; gcm_incr_counter_block(ctx); ctx->gcm_processed_data_len += block_size; datap += block_size; bleft -= block_size; } out: clear_fpu_regs(); kfpu_end(); out_nofpu: if (ct_buf != NULL) { vmem_free(ct_buf, chunk_size); } return (rv); } /* * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual * incomplete last block. Encrypt the ICB. Calculate the tag and write it out. */ static int gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) { uint8_t *ghash = (uint8_t *)ctx->gcm_ghash; uint32_t *J0 = (uint32_t *)ctx->gcm_J0; uint8_t *remainder = (uint8_t *)ctx->gcm_remainder; size_t rem_len = ctx->gcm_remainder_len; const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; int aes_rounds = ((aes_key_t *)keysched)->nr; int rv; ASSERT(block_size == GCM_BLOCK_LEN); if (out->cd_length < (rem_len + ctx->gcm_tag_len)) { return (CRYPTO_DATA_LEN_RANGE); } kfpu_begin(); /* Pad last incomplete block with zeros, encrypt and hash. */ if (rem_len > 0) { uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; const uint32_t *cb = (uint32_t *)ctx->gcm_cb; aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp); memset(remainder + rem_len, 0, block_size - rem_len); for (int i = 0; i < rem_len; i++) { remainder[i] ^= tmp[i]; } GHASH_AVX(ctx, remainder, block_size); ctx->gcm_processed_data_len += rem_len; /* No need to increment counter_block, it's the last block. */ } /* Finish tag. */ ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len)); GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size); aes_encrypt_intel(keysched, aes_rounds, J0, J0); gcm_xor_avx((uint8_t *)J0, ghash); clear_fpu_regs(); kfpu_end(); /* Output remainder. */ if (rem_len > 0) { rv = crypto_put_output_data(remainder, out, rem_len); if (rv != CRYPTO_SUCCESS) return (rv); } out->cd_offset += rem_len; ctx->gcm_remainder_len = 0; rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len); if (rv != CRYPTO_SUCCESS) return (rv); out->cd_offset += ctx->gcm_tag_len; /* Clear sensitive data in the context before returning. */ gcm_clear_ctx(ctx); return (CRYPTO_SUCCESS); } /* * Finalize decryption: We just have accumulated crypto text, so now we * decrypt it here inplace. */ static int gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) { ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len); ASSERT3U(block_size, ==, 16); size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; uint8_t *datap = ctx->gcm_pt_buf; const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); uint32_t *cb = (uint32_t *)ctx->gcm_cb; uint64_t *ghash = ctx->gcm_ghash; uint32_t *tmp = (uint32_t *)ctx->gcm_tmp; int rv = CRYPTO_SUCCESS; size_t bleft, done; /* * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of * GCM_AVX_MIN_DECRYPT_BYTES. */ for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) { kfpu_begin(); done = aesni_gcm_decrypt(datap, datap, chunk_size, (const void *)key, ctx->gcm_cb, ghash); clear_fpu_regs(); kfpu_end(); if (done != chunk_size) { return (CRYPTO_FAILED); } datap += done; } /* Decrypt remainder, which is less than chunk size, in one go. */ kfpu_begin(); if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) { done = aesni_gcm_decrypt(datap, datap, bleft, (const void *)key, ctx->gcm_cb, ghash); if (done == 0) { clear_fpu_regs(); kfpu_end(); return (CRYPTO_FAILED); } datap += done; bleft -= done; } ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES); /* * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain, * decrypt them block by block. */ while (bleft > 0) { /* Incomplete last block. */ if (bleft < block_size) { uint8_t *lastb = (uint8_t *)ctx->gcm_remainder; memset(lastb, 0, block_size); memcpy(lastb, datap, bleft); /* The GCM processing. */ GHASH_AVX(ctx, lastb, block_size); aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); for (size_t i = 0; i < bleft; i++) { datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i]; } break; } /* The GCM processing. */ GHASH_AVX(ctx, datap, block_size); aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); gcm_xor_avx((uint8_t *)tmp, datap); gcm_incr_counter_block(ctx); datap += block_size; bleft -= block_size; } if (rv != CRYPTO_SUCCESS) { clear_fpu_regs(); kfpu_end(); return (rv); } /* Decryption done, finish the tag. */ ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len)); GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size); aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0, (uint32_t *)ctx->gcm_J0); gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash); /* We are done with the FPU, restore its state. */ clear_fpu_regs(); kfpu_end(); /* Compare the input authentication tag with what we calculated. */ if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) { /* They don't match. */ return (CRYPTO_INVALID_MAC); } rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len); if (rv != CRYPTO_SUCCESS) { return (rv); } out->cd_offset += pt_len; gcm_clear_ctx(ctx); return (CRYPTO_SUCCESS); } /* * Initialize the GCM params H, Htabtle and the counter block. Save the * initial counter block. */ static int gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, unsigned char *auth_data, size_t auth_data_len, size_t block_size) { uint8_t *cb = (uint8_t *)ctx->gcm_cb; uint64_t *H = ctx->gcm_H; const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr; uint8_t *datap = auth_data; size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; size_t bleft; ASSERT(block_size == GCM_BLOCK_LEN); /* Init H (encrypt zero block) and create the initial counter block. */ memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash)); memset(H, 0, sizeof (ctx->gcm_H)); kfpu_begin(); aes_encrypt_intel(keysched, aes_rounds, (const uint32_t *)H, (uint32_t *)H); gcm_init_htab_avx(ctx->gcm_Htable, H); if (iv_len == 12) { memcpy(cb, iv, 12); cb[12] = 0; cb[13] = 0; cb[14] = 0; cb[15] = 1; /* We need the ICB later. */ memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0)); } else { /* * Most consumers use 12 byte IVs, so it's OK to use the * original routines for other IV sizes, just avoid nesting * kfpu_begin calls. */ clear_fpu_regs(); kfpu_end(); gcm_format_initial_blocks(iv, iv_len, ctx, block_size, aes_copy_block, aes_xor_block); kfpu_begin(); } /* Openssl post increments the counter, adjust for that. */ gcm_incr_counter_block(ctx); /* Ghash AAD in chunk_size blocks. */ for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) { GHASH_AVX(ctx, datap, chunk_size); datap += chunk_size; clear_fpu_regs(); kfpu_end(); kfpu_begin(); } /* Ghash the remainder and handle possible incomplete GCM block. */ if (bleft > 0) { size_t incomp = bleft % block_size; bleft -= incomp; if (bleft > 0) { GHASH_AVX(ctx, datap, bleft); datap += bleft; } if (incomp > 0) { /* Zero pad and hash incomplete last block. */ uint8_t *authp = (uint8_t *)ctx->gcm_tmp; memset(authp, 0, block_size); memcpy(authp, datap, incomp); GHASH_AVX(ctx, authp, block_size); } } clear_fpu_regs(); kfpu_end(); return (CRYPTO_SUCCESS); } #if defined(_KERNEL) static int icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) { unsigned long val; char val_rounded[16]; int error = 0; error = kstrtoul(buf, 0, &val); if (error) return (error); val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE) return (-EINVAL); snprintf(val_rounded, 16, "%u", (uint32_t)val); error = param_set_uint(val_rounded, kp); return (error); } module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size, param_get_uint, &gcm_avx_chunk_size, 0644); MODULE_PARM_DESC(icp_gcm_avx_chunk_size, "How many bytes to process while owning the FPU"); #endif /* defined(__KERNEL) */ #endif /* ifdef CAN_USE_GCM_ASM */ diff --git a/module/icp/api/kcf_ctxops.c b/module/icp/api/kcf_ctxops.c index 4fa281676b81..b8cd67ea7f67 100644 --- a/module/icp/api/kcf_ctxops.c +++ b/module/icp/api/kcf_ctxops.c @@ -1,149 +1,149 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include /* * Crypto contexts manipulation routines */ /* * crypto_create_ctx_template() * * Arguments: * * mech: crypto_mechanism_t pointer. * mech_type is a valid value previously returned by * crypto_mech2id(); * When the mech's parameter is not NULL, its definition depends * on the standard definition of the mechanism. * key: pointer to a crypto_key_t structure. * ptmpl: a storage for the opaque crypto_ctx_template_t, allocated and * initialized by the software provider this routine is * dispatched to. * * Description: * Redirects the call to the software provider of the specified * mechanism. That provider will allocate and pre-compute/pre-expand * the context template, reusable by later calls to crypto_xxx_init(). * The size and address of that provider context template are stored * in an internal structure, kcf_ctx_template_t. The address of that * structure is given back to the caller in *ptmpl. * * Context: * Process or interrupt. * * Returns: * CRYPTO_SUCCESS when the context template is successfully created. * CRYPTO_HOST_MEMORY: mem alloc failure * CRYPTO_ARGUMENTS_BAD: NULL storage for the ctx template. * RYPTO_MECHANISM_INVALID: invalid mechanism 'mech'. */ int crypto_create_ctx_template(crypto_mechanism_t *mech, crypto_key_t *key, crypto_ctx_template_t *ptmpl) { int error; kcf_mech_entry_t *me; kcf_provider_desc_t *pd; kcf_ctx_template_t *ctx_tmpl; crypto_mechanism_t prov_mech; /* A few args validation */ if (ptmpl == NULL) return (CRYPTO_ARGUMENTS_BAD); if (mech == NULL) return (CRYPTO_MECHANISM_INVALID); error = kcf_get_sw_prov(mech->cm_type, &pd, &me, B_TRUE); if (error != CRYPTO_SUCCESS) return (error); - if ((ctx_tmpl = (kcf_ctx_template_t *)kmem_alloc( + if ((ctx_tmpl = kmem_alloc( sizeof (kcf_ctx_template_t), KM_SLEEP)) == NULL) { KCF_PROV_REFRELE(pd); return (CRYPTO_HOST_MEMORY); } /* Pass a mechtype that the provider understands */ prov_mech.cm_type = KCF_TO_PROV_MECHNUM(pd, mech->cm_type); prov_mech.cm_param = mech->cm_param; prov_mech.cm_param_len = mech->cm_param_len; error = KCF_PROV_CREATE_CTX_TEMPLATE(pd, &prov_mech, key, &(ctx_tmpl->ct_prov_tmpl), &(ctx_tmpl->ct_size)); if (error == CRYPTO_SUCCESS) { *ptmpl = ctx_tmpl; } else { kmem_free(ctx_tmpl, sizeof (kcf_ctx_template_t)); } KCF_PROV_REFRELE(pd); return (error); } /* * crypto_destroy_ctx_template() * * Arguments: * * tmpl: an opaque crypto_ctx_template_t previously created by * crypto_create_ctx_template() * * Description: * Frees the embedded crypto_spi_ctx_template_t, then the * kcf_ctx_template_t. * * Context: * Process or interrupt. * */ void crypto_destroy_ctx_template(crypto_ctx_template_t tmpl) { kcf_ctx_template_t *ctx_tmpl = (kcf_ctx_template_t *)tmpl; if (ctx_tmpl == NULL) return; ASSERT(ctx_tmpl->ct_prov_tmpl != NULL); memset(ctx_tmpl->ct_prov_tmpl, 0, ctx_tmpl->ct_size); kmem_free(ctx_tmpl->ct_prov_tmpl, ctx_tmpl->ct_size); kmem_free(ctx_tmpl, sizeof (kcf_ctx_template_t)); } #if defined(_KERNEL) EXPORT_SYMBOL(crypto_create_ctx_template); EXPORT_SYMBOL(crypto_destroy_ctx_template); #endif diff --git a/module/os/freebsd/spl/callb.c b/module/os/freebsd/spl/callb.c index 47f3ccc0c7fa..2bfd4ea169bb 100644 --- a/module/os/freebsd/spl/callb.c +++ b/module/os/freebsd/spl/callb.c @@ -1,372 +1,372 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for delay() */ #include /* For TASKQ_NAMELEN */ #include #define CB_MAXNAME TASKQ_NAMELEN /* * The callb mechanism provides generic event scheduling/echoing. * A callb function is registered and called on behalf of the event. */ typedef struct callb { struct callb *c_next; /* next in class or on freelist */ kthread_id_t c_thread; /* ptr to caller's thread struct */ char c_flag; /* info about the callb state */ uchar_t c_class; /* this callb's class */ kcondvar_t c_done_cv; /* signal callb completion */ boolean_t (*c_func)(void *, int); /* cb function: returns true if ok */ void *c_arg; /* arg to c_func */ char c_name[CB_MAXNAME+1]; /* debug:max func name length */ } callb_t; /* * callb c_flag bitmap definitions */ #define CALLB_FREE 0x0 #define CALLB_TAKEN 0x1 #define CALLB_EXECUTING 0x2 /* * Basic structure for a callb table. * All callbs are organized into different class groups described * by ct_class array. * The callbs within a class are single-linked and normally run by a * serial execution. */ typedef struct callb_table { kmutex_t ct_lock; /* protect all callb states */ callb_t *ct_freelist; /* free callb structures */ boolean_t ct_busy; /* B_TRUE prevents additions */ kcondvar_t ct_busy_cv; /* to wait for not busy */ int ct_ncallb; /* num of callbs allocated */ callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */ } callb_table_t; int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC; static callb_id_t callb_add_common(boolean_t (*)(void *, int), void *, int, char *, kthread_id_t); static callb_table_t callb_table; /* system level callback table */ static callb_table_t *ct = &callb_table; static kmutex_t callb_safe_mutex; callb_cpr_t callb_cprinfo_safe = { &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, {0, 0} }; /* * Init all callb tables in the system. */ static void callb_init(void *dummy __unused) { callb_table.ct_busy = B_FALSE; /* mark table open for additions */ mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL); mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL); } static void callb_fini(void *dummy __unused) { callb_t *cp; int i; mutex_enter(&ct->ct_lock); for (i = 0; i < 16; i++) { while ((cp = ct->ct_freelist) != NULL) { ct->ct_freelist = cp->c_next; ct->ct_ncallb--; kmem_free(cp, sizeof (callb_t)); } if (ct->ct_ncallb == 0) break; /* Not all callbacks finished, waiting for the rest. */ mutex_exit(&ct->ct_lock); tsleep(ct, 0, "callb", hz / 4); mutex_enter(&ct->ct_lock); } if (ct->ct_ncallb > 0) printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb); mutex_exit(&ct->ct_lock); mutex_destroy(&callb_safe_mutex); mutex_destroy(&callb_table.ct_lock); } /* * callout_add() is called to register func() be called later. */ static callb_id_t callb_add_common(boolean_t (*func)(void *arg, int code), void *arg, int class, char *name, kthread_id_t t) { callb_t *cp; ASSERT3S(class, <, NCBCLASS); mutex_enter(&ct->ct_lock); while (ct->ct_busy) cv_wait(&ct->ct_busy_cv, &ct->ct_lock); if ((cp = ct->ct_freelist) == NULL) { ct->ct_ncallb++; - cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP); + cp = kmem_zalloc(sizeof (callb_t), KM_SLEEP); } ct->ct_freelist = cp->c_next; cp->c_thread = t; cp->c_func = func; cp->c_arg = arg; cp->c_class = (uchar_t)class; cp->c_flag |= CALLB_TAKEN; #ifdef ZFS_DEBUG if (strlen(name) > CB_MAXNAME) cmn_err(CE_WARN, "callb_add: name of callback function '%s' " "too long -- truncated to %d chars", name, CB_MAXNAME); #endif (void) strlcpy(cp->c_name, name, sizeof (cp->c_name)); /* * Insert the new callb at the head of its class list. */ cp->c_next = ct->ct_first_cb[class]; ct->ct_first_cb[class] = cp; mutex_exit(&ct->ct_lock); return ((callb_id_t)cp); } /* * The default function to add an entry to the callback table. Since * it uses curthread as the thread identifier to store in the table, * it should be used for the normal case of a thread which is calling * to add ITSELF to the table. */ callb_id_t callb_add(boolean_t (*func)(void *arg, int code), void *arg, int class, char *name) { return (callb_add_common(func, arg, class, name, curthread)); } /* * A special version of callb_add() above for use by threads which * might be adding an entry to the table on behalf of some other * thread (for example, one which is constructed but not yet running). * In this version the thread id is an argument. */ callb_id_t callb_add_thread(boolean_t (*func)(void *arg, int code), void *arg, int class, char *name, kthread_id_t t) { return (callb_add_common(func, arg, class, name, t)); } /* * callout_delete() is called to remove an entry identified by id * that was originally placed there by a call to callout_add(). * return -1 if fail to delete a callb entry otherwise return 0. */ int callb_delete(callb_id_t id) { callb_t **pp; callb_t *me = (callb_t *)id; mutex_enter(&ct->ct_lock); for (;;) { pp = &ct->ct_first_cb[me->c_class]; while (*pp != NULL && *pp != me) pp = &(*pp)->c_next; #ifdef ZFS_DEBUG if (*pp != me) { cmn_err(CE_WARN, "callb delete bogus entry 0x%p", (void *)me); mutex_exit(&ct->ct_lock); return (-1); } #endif /* DEBUG */ /* * It is not allowed to delete a callb in the middle of * executing otherwise, the callb_execute() will be confused. */ if (!(me->c_flag & CALLB_EXECUTING)) break; cv_wait(&me->c_done_cv, &ct->ct_lock); } /* relink the class list */ *pp = me->c_next; /* clean up myself and return the free callb to the head of freelist */ me->c_flag = CALLB_FREE; me->c_next = ct->ct_freelist; ct->ct_freelist = me; mutex_exit(&ct->ct_lock); return (0); } /* * class: indicates to execute all callbs in the same class; * code: optional argument for the callb functions. * return: = 0: success * != 0: ptr to string supplied when callback was registered */ void * callb_execute_class(int class, int code) { callb_t *cp; void *ret = NULL; ASSERT3S(class, <, NCBCLASS); mutex_enter(&ct->ct_lock); for (cp = ct->ct_first_cb[class]; cp != NULL && ret == 0; cp = cp->c_next) { while (cp->c_flag & CALLB_EXECUTING) cv_wait(&cp->c_done_cv, &ct->ct_lock); /* * cont if the callb is deleted while we're sleeping */ if (cp->c_flag == CALLB_FREE) continue; cp->c_flag |= CALLB_EXECUTING; #ifdef CALLB_DEBUG printf("callb_execute: name=%s func=%p arg=%p\n", cp->c_name, (void *)cp->c_func, (void *)cp->c_arg); #endif /* CALLB_DEBUG */ mutex_exit(&ct->ct_lock); /* If callback function fails, pass back client's name */ if (!(*cp->c_func)(cp->c_arg, code)) ret = cp->c_name; mutex_enter(&ct->ct_lock); cp->c_flag &= ~CALLB_EXECUTING; cv_broadcast(&cp->c_done_cv); } mutex_exit(&ct->ct_lock); return (ret); } /* * callers make sure no recursive entries to this func. * dp->cc_lockp is registered by callb_add to protect callb_cpr_t structure. * * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we * use a cv_timedwait() in case the kernel thread is blocked. * * Note that this is a generic callback handler for daemon CPR and * should NOT be changed to accommodate any specific requirement in a daemon. * Individual daemons that require changes to the handler shall write * callback routines in their own daemon modules. */ boolean_t callb_generic_cpr(void *arg, int code) { callb_cpr_t *cp = (callb_cpr_t *)arg; clock_t ret = 0; /* assume success */ mutex_enter(cp->cc_lockp); switch (code) { case CB_CODE_CPR_CHKPT: cp->cc_events |= CALLB_CPR_START; #ifdef CPR_NOT_THREAD_SAFE while (!(cp->cc_events & CALLB_CPR_SAFE)) /* cv_timedwait() returns -1 if it times out. */ if ((ret = cv_reltimedwait(&cp->cc_callb_cv, cp->cc_lockp, (callb_timeout_sec * hz), TR_CLOCK_TICK)) == -1) break; #endif break; case CB_CODE_CPR_RESUME: cp->cc_events &= ~CALLB_CPR_START; cv_signal(&cp->cc_stop_cv); break; } mutex_exit(cp->cc_lockp); return (ret != -1); } /* * The generic callback function associated with kernel threads which * are always considered safe. */ boolean_t callb_generic_cpr_safe(void *arg, int code) { (void) arg, (void) code; return (B_TRUE); } /* * Prevent additions to callback table. */ void callb_lock_table(void) { mutex_enter(&ct->ct_lock); ASSERT(!ct->ct_busy); ct->ct_busy = B_TRUE; mutex_exit(&ct->ct_lock); } /* * Allow additions to callback table. */ void callb_unlock_table(void) { mutex_enter(&ct->ct_lock); ASSERT(ct->ct_busy); ct->ct_busy = B_FALSE; cv_broadcast(&ct->ct_busy_cv); mutex_exit(&ct->ct_lock); } SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL); SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL); diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index edd04783b363..4cceeb8bdffd 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -1,1465 +1,1465 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #include #include #include #include #include #include #include #include #include #include /* * Within the scope of spl-kmem.c file the kmem_cache_* definitions * are removed to allow access to the real Linux slab allocator. */ #undef kmem_cache_destroy #undef kmem_cache_create #undef kmem_cache_alloc #undef kmem_cache_free /* * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() * with smp_mb__{before,after}_atomic() because they were redundant. This is * only used inside our SLAB allocator, so we implement an internal wrapper * here to give us smp_mb__{before,after}_atomic() on older kernels. */ #ifndef smp_mb__before_atomic #define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x) #endif #ifndef smp_mb__after_atomic #define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) #endif /* BEGIN CSTYLED */ /* * Cache magazines are an optimization designed to minimize the cost of * allocating memory. They do this by keeping a per-cpu cache of recently * freed objects, which can then be reallocated without taking a lock. This * can improve performance on highly contended caches. However, because * objects in magazines will prevent otherwise empty slabs from being * immediately released this may not be ideal for low memory machines. * * For this reason spl_kmem_cache_magazine_size can be used to set a maximum * magazine size. When this value is set to 0 the magazine size will be * automatically determined based on the object size. Otherwise magazines * will be limited to 2-256 objects per magazine (i.e per cpu). Magazines * may never be entirely disabled in this implementation. */ static unsigned int spl_kmem_cache_magazine_size = 0; module_param(spl_kmem_cache_magazine_size, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_magazine_size, "Default magazine size (2-256), set automatically (0)"); /* * The default behavior is to report the number of objects remaining in the * cache. This allows the Linux VM to repeatedly reclaim objects from the * cache when memory is low satisfy other memory allocations. Alternately, * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache * is reclaimed. This may increase the likelihood of out of memory events. */ static unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; module_param(spl_kmem_cache_reclaim, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; module_param(spl_kmem_cache_obj_per_slab, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); static unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE; module_param(spl_kmem_cache_max_size, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); /* * For small objects the Linux slab allocator should be used to make the most * efficient use of the memory. However, large objects are not supported by * the Linux slab and therefore the SPL implementation is preferred. A cutoff * of 16K was determined to be optimal for architectures using 4K pages and * to also work well on architecutres using larger 64K page sizes. */ static unsigned int spl_kmem_cache_slab_limit = 16384; module_param(spl_kmem_cache_slab_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_slab_limit, "Objects less than N bytes use the Linux slab"); /* * The number of threads available to allocate new slabs for caches. This * should not need to be tuned but it is available for performance analysis. */ static unsigned int spl_kmem_cache_kmem_threads = 4; module_param(spl_kmem_cache_kmem_threads, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, "Number of spl_kmem_cache threads"); /* END CSTYLED */ /* * Slab allocation interfaces * * While the Linux slab implementation was inspired by the Solaris * implementation I cannot use it to emulate the Solaris APIs. I * require two features which are not provided by the Linux slab. * * 1) Constructors AND destructors. Recent versions of the Linux * kernel have removed support for destructors. This is a deal * breaker for the SPL which contains particularly expensive * initializers for mutex's, condition variables, etc. We also * require a minimal level of cleanup for these data types unlike * many Linux data types which do need to be explicitly destroyed. * * 2) Virtual address space backed slab. Callers of the Solaris slab * expect it to work well for both small are very large allocations. * Because of memory fragmentation the Linux slab which is backed * by kmalloc'ed memory performs very badly when confronted with * large numbers of large allocations. Basing the slab on the * virtual address space removes the need for contiguous pages * and greatly improve performance for large allocations. * * For these reasons, the SPL has its own slab implementation with * the needed features. It is not as highly optimized as either the * Solaris or Linux slabs, but it should get me most of what is * needed until it can be optimized or obsoleted by another approach. * * One serious concern I do have about this method is the relatively * small virtual address space on 32bit arches. This will seriously * constrain the size of the slab caches and their performance. */ struct list_head spl_kmem_cache_list; /* List of caches */ struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ static taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); static void * kv_alloc(spl_kmem_cache_t *skc, int size, int flags) { gfp_t lflags = kmem_flags_convert(flags); void *ptr; ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM); /* Resulting allocated memory will be page aligned */ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); return (ptr); } static void kv_free(spl_kmem_cache_t *skc, void *ptr, int size) { ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); /* * The Linux direct reclaim path uses this out of band value to * determine if forward progress is being made. Normally this is * incremented by kmem_freepages() which is part of the various * Linux slab implementations. However, since we are using none * of that infrastructure we are responsible for incrementing it. */ if (current->reclaim_state) current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; vfree(ptr); } /* * Required space for each aligned sks. */ static inline uint32_t spl_sks_size(spl_kmem_cache_t *skc) { return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t), skc->skc_obj_align, uint32_t)); } /* * Required space for each aligned object. */ static inline uint32_t spl_obj_size(spl_kmem_cache_t *skc) { uint32_t align = skc->skc_obj_align; return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t)); } uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache) { return (cache->skc_obj_total); } EXPORT_SYMBOL(spl_kmem_cache_inuse); uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache) { return (cache->skc_obj_size); } EXPORT_SYMBOL(spl_kmem_cache_entry_size); /* * Lookup the spl_kmem_object_t for an object given that object. */ static inline spl_kmem_obj_t * spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) { return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size, skc->skc_obj_align, uint32_t)); } /* * It's important that we pack the spl_kmem_obj_t structure and the * actual objects in to one large address space to minimize the number * of calls to the allocator. It is far better to do a few large * allocations and then subdivide it ourselves. Now which allocator * we use requires balancing a few trade offs. * * For small objects we use kmem_alloc() because as long as you are * only requesting a small number of pages (ideally just one) its cheap. * However, when you start requesting multiple pages with kmem_alloc() * it gets increasingly expensive since it requires contiguous pages. * For this reason we shift to vmem_alloc() for slabs of large objects * which removes the need for contiguous pages. We do not use * vmem_alloc() in all cases because there is significant locking * overhead in __get_vm_area_node(). This function takes a single * global lock when acquiring an available virtual address range which * serializes all vmem_alloc()'s for all slab caches. Using slightly * different allocation functions for small and large objects should * give us the best of both worlds. * * +------------------------+ * | spl_kmem_slab_t --+-+ | * | skc_obj_size <-+ | | * | spl_kmem_obj_t | | * | skc_obj_size <---+ | * | spl_kmem_obj_t | | * | ... v | * +------------------------+ */ static spl_kmem_slab_t * spl_slab_alloc(spl_kmem_cache_t *skc, int flags) { spl_kmem_slab_t *sks; void *base; uint32_t obj_size; base = kv_alloc(skc, skc->skc_slab_size, flags); if (base == NULL) return (NULL); sks = (spl_kmem_slab_t *)base; sks->sks_magic = SKS_MAGIC; sks->sks_objs = skc->skc_slab_objs; sks->sks_age = jiffies; sks->sks_cache = skc; INIT_LIST_HEAD(&sks->sks_list); INIT_LIST_HEAD(&sks->sks_free_list); sks->sks_ref = 0; obj_size = spl_obj_size(skc); for (int i = 0; i < sks->sks_objs; i++) { void *obj = base + spl_sks_size(skc) + (i * obj_size); ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); spl_kmem_obj_t *sko = spl_sko_from_obj(skc, obj); sko->sko_addr = obj; sko->sko_magic = SKO_MAGIC; sko->sko_slab = sks; INIT_LIST_HEAD(&sko->sko_list); list_add_tail(&sko->sko_list, &sks->sks_free_list); } return (sks); } /* * Remove a slab from complete or partial list, it must be called with * the 'skc->skc_lock' held but the actual free must be performed * outside the lock to prevent deadlocking on vmem addresses. */ static void spl_slab_free(spl_kmem_slab_t *sks, struct list_head *sks_list, struct list_head *sko_list) { spl_kmem_cache_t *skc; ASSERT(sks->sks_magic == SKS_MAGIC); ASSERT(sks->sks_ref == 0); skc = sks->sks_cache; ASSERT(skc->skc_magic == SKC_MAGIC); /* * Update slab/objects counters in the cache, then remove the * slab from the skc->skc_partial_list. Finally add the slab * and all its objects in to the private work lists where the * destructors will be called and the memory freed to the system. */ skc->skc_obj_total -= sks->sks_objs; skc->skc_slab_total--; list_del(&sks->sks_list); list_add(&sks->sks_list, sks_list); list_splice_init(&sks->sks_free_list, sko_list); } /* * Reclaim empty slabs at the end of the partial list. */ static void spl_slab_reclaim(spl_kmem_cache_t *skc) { spl_kmem_slab_t *sks = NULL, *m = NULL; spl_kmem_obj_t *sko = NULL, *n = NULL; LIST_HEAD(sks_list); LIST_HEAD(sko_list); /* * Empty slabs and objects must be moved to a private list so they * can be safely freed outside the spin lock. All empty slabs are * at the end of skc->skc_partial_list, therefore once a non-empty * slab is found we can stop scanning. */ spin_lock(&skc->skc_lock); list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list, sks_list) { if (sks->sks_ref > 0) break; spl_slab_free(sks, &sks_list, &sko_list); } spin_unlock(&skc->skc_lock); /* * The following two loops ensure all the object destructors are run, * and the slabs themselves are freed. This is all done outside the * skc->skc_lock since this allows the destructor to sleep, and * allows us to perform a conditional reschedule when a freeing a * large number of objects and slabs back to the system. */ list_for_each_entry_safe(sko, n, &sko_list, sko_list) { ASSERT(sko->sko_magic == SKO_MAGIC); } list_for_each_entry_safe(sks, m, &sks_list, sks_list) { ASSERT(sks->sks_magic == SKS_MAGIC); kv_free(skc, sks, skc->skc_slab_size); } } static spl_kmem_emergency_t * spl_emergency_search(struct rb_root *root, void *obj) { struct rb_node *node = root->rb_node; spl_kmem_emergency_t *ske; unsigned long address = (unsigned long)obj; while (node) { ske = container_of(node, spl_kmem_emergency_t, ske_node); if (address < ske->ske_obj) node = node->rb_left; else if (address > ske->ske_obj) node = node->rb_right; else return (ske); } return (NULL); } static int spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) { struct rb_node **new = &(root->rb_node), *parent = NULL; spl_kmem_emergency_t *ske_tmp; unsigned long address = ske->ske_obj; while (*new) { ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); parent = *new; if (address < ske_tmp->ske_obj) new = &((*new)->rb_left); else if (address > ske_tmp->ske_obj) new = &((*new)->rb_right); else return (0); } rb_link_node(&ske->ske_node, parent, new); rb_insert_color(&ske->ske_node, root); return (1); } /* * Allocate a single emergency object and track it in a red black tree. */ static int spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) { gfp_t lflags = kmem_flags_convert(flags); spl_kmem_emergency_t *ske; int order = get_order(skc->skc_obj_size); int empty; /* Last chance use a partial slab if one now exists */ spin_lock(&skc->skc_lock); empty = list_empty(&skc->skc_partial_list); spin_unlock(&skc->skc_lock); if (!empty) return (-EEXIST); ske = kmalloc(sizeof (*ske), lflags); if (ske == NULL) return (-ENOMEM); ske->ske_obj = __get_free_pages(lflags, order); if (ske->ske_obj == 0) { kfree(ske); return (-ENOMEM); } spin_lock(&skc->skc_lock); empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); if (likely(empty)) { skc->skc_obj_total++; skc->skc_obj_emergency++; if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) skc->skc_obj_emergency_max = skc->skc_obj_emergency; } spin_unlock(&skc->skc_lock); if (unlikely(!empty)) { free_pages(ske->ske_obj, order); kfree(ske); return (-EINVAL); } *obj = (void *)ske->ske_obj; return (0); } /* * Locate the passed object in the red black tree and free it. */ static int spl_emergency_free(spl_kmem_cache_t *skc, void *obj) { spl_kmem_emergency_t *ske; int order = get_order(skc->skc_obj_size); spin_lock(&skc->skc_lock); ske = spl_emergency_search(&skc->skc_emergency_tree, obj); if (ske) { rb_erase(&ske->ske_node, &skc->skc_emergency_tree); skc->skc_obj_emergency--; skc->skc_obj_total--; } spin_unlock(&skc->skc_lock); if (ske == NULL) return (-ENOENT); free_pages(ske->ske_obj, order); kfree(ske); return (0); } /* * Release objects from the per-cpu magazine back to their slab. The flush * argument contains the max number of entries to remove from the magazine. */ static void spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) { spin_lock(&skc->skc_lock); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skm->skm_magic == SKM_MAGIC); int count = MIN(flush, skm->skm_avail); for (int i = 0; i < count; i++) spl_cache_shrink(skc, skm->skm_objs[i]); skm->skm_avail -= count; memmove(skm->skm_objs, &(skm->skm_objs[count]), sizeof (void *) * skm->skm_avail); spin_unlock(&skc->skc_lock); } /* * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, * for very small objects we may end up with more than this so as not * to waste space in the minimal allocation of a single page. */ static int spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) { uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs; sks_size = spl_sks_size(skc); obj_size = spl_obj_size(skc); max_size = (spl_kmem_cache_max_size * 1024 * 1024); tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); if (tgt_size <= max_size) { tgt_objs = (tgt_size - sks_size) / obj_size; } else { tgt_objs = (max_size - sks_size) / obj_size; tgt_size = (tgt_objs * obj_size) + sks_size; } if (tgt_objs == 0) return (-ENOSPC); *objs = tgt_objs; *size = tgt_size; return (0); } /* * Make a guess at reasonable per-cpu magazine size based on the size of * each object and the cost of caching N of them in each magazine. Long * term this should really adapt based on an observed usage heuristic. */ static int spl_magazine_size(spl_kmem_cache_t *skc) { uint32_t obj_size = spl_obj_size(skc); int size; if (spl_kmem_cache_magazine_size > 0) return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2)); /* Per-magazine sizes below assume a 4Kib page size */ if (obj_size > (PAGE_SIZE * 256)) size = 4; /* Minimum 4Mib per-magazine */ else if (obj_size > (PAGE_SIZE * 32)) size = 16; /* Minimum 2Mib per-magazine */ else if (obj_size > (PAGE_SIZE)) size = 64; /* Minimum 256Kib per-magazine */ else if (obj_size > (PAGE_SIZE / 4)) size = 128; /* Minimum 128Kib per-magazine */ else size = 256; return (size); } /* * Allocate a per-cpu magazine to associate with a specific core. */ static spl_kmem_magazine_t * spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) { spl_kmem_magazine_t *skm; int size = sizeof (spl_kmem_magazine_t) + sizeof (void *) * skc->skc_mag_size; skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (skm) { skm->skm_magic = SKM_MAGIC; skm->skm_avail = 0; skm->skm_size = skc->skc_mag_size; skm->skm_refill = skc->skc_mag_refill; skm->skm_cache = skc; skm->skm_cpu = cpu; } return (skm); } /* * Free a per-cpu magazine associated with a specific core. */ static void spl_magazine_free(spl_kmem_magazine_t *skm) { ASSERT(skm->skm_magic == SKM_MAGIC); ASSERT(skm->skm_avail == 0); kfree(skm); } /* * Create all pre-cpu magazines of reasonable sizes. */ static int spl_magazine_create(spl_kmem_cache_t *skc) { int i = 0; ASSERT((skc->skc_flags & KMC_SLAB) == 0); skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) * num_possible_cpus(), kmem_flags_convert(KM_SLEEP)); skc->skc_mag_size = spl_magazine_size(skc); skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; for_each_possible_cpu(i) { skc->skc_mag[i] = spl_magazine_alloc(skc, i); if (!skc->skc_mag[i]) { for (i--; i >= 0; i--) spl_magazine_free(skc->skc_mag[i]); kfree(skc->skc_mag); return (-ENOMEM); } } return (0); } /* * Destroy all pre-cpu magazines. */ static void spl_magazine_destroy(spl_kmem_cache_t *skc) { spl_kmem_magazine_t *skm; int i = 0; ASSERT((skc->skc_flags & KMC_SLAB) == 0); for_each_possible_cpu(i) { skm = skc->skc_mag[i]; spl_cache_flush(skc, skm, skm->skm_avail); spl_magazine_free(skm); } kfree(skc->skc_mag); } /* * Create a object cache based on the following arguments: * name cache name * size cache object size * align cache object alignment * ctor cache object constructor * dtor cache object destructor * reclaim cache object reclaim * priv cache private data for ctor/dtor/reclaim * vmp unused must be NULL * flags * KMC_KVMEM Force kvmem backed SPL cache * KMC_SLAB Force Linux slab backed cache * KMC_NODEBUG Disable debugging (unsupported) */ spl_kmem_cache_t * spl_kmem_cache_create(const char *name, size_t size, size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim, void *priv, void *vmp, int flags) { gfp_t lflags = kmem_flags_convert(KM_SLEEP); spl_kmem_cache_t *skc; int rc; /* * Unsupported flags */ ASSERT(vmp == NULL); ASSERT(reclaim == NULL); might_sleep(); skc = kzalloc(sizeof (*skc), lflags); if (skc == NULL) return (NULL); skc->skc_magic = SKC_MAGIC; skc->skc_name_size = strlen(name) + 1; - skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags); + skc->skc_name = kmalloc(skc->skc_name_size, lflags); if (skc->skc_name == NULL) { kfree(skc); return (NULL); } strlcpy(skc->skc_name, name, skc->skc_name_size); skc->skc_ctor = ctor; skc->skc_dtor = dtor; skc->skc_private = priv; skc->skc_vmp = vmp; skc->skc_linux_cache = NULL; skc->skc_flags = flags; skc->skc_obj_size = size; skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; atomic_set(&skc->skc_ref, 0); INIT_LIST_HEAD(&skc->skc_list); INIT_LIST_HEAD(&skc->skc_complete_list); INIT_LIST_HEAD(&skc->skc_partial_list); skc->skc_emergency_tree = RB_ROOT; spin_lock_init(&skc->skc_lock); init_waitqueue_head(&skc->skc_waitq); skc->skc_slab_fail = 0; skc->skc_slab_create = 0; skc->skc_slab_destroy = 0; skc->skc_slab_total = 0; skc->skc_slab_alloc = 0; skc->skc_slab_max = 0; skc->skc_obj_total = 0; skc->skc_obj_alloc = 0; skc->skc_obj_max = 0; skc->skc_obj_deadlock = 0; skc->skc_obj_emergency = 0; skc->skc_obj_emergency_max = 0; rc = percpu_counter_init_common(&skc->skc_linux_alloc, 0, GFP_KERNEL); if (rc != 0) { kfree(skc); return (NULL); } /* * Verify the requested alignment restriction is sane. */ if (align) { VERIFY(ISP2(align)); VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); VERIFY3U(align, <=, PAGE_SIZE); skc->skc_obj_align = align; } /* * When no specific type of slab is requested (kmem, vmem, or * linuxslab) then select a cache type based on the object size * and default tunables. */ if (!(skc->skc_flags & (KMC_SLAB | KMC_KVMEM))) { if (spl_kmem_cache_slab_limit && size <= (size_t)spl_kmem_cache_slab_limit) { /* * Objects smaller than spl_kmem_cache_slab_limit can * use the Linux slab for better space-efficiency. */ skc->skc_flags |= KMC_SLAB; } else { /* * All other objects are considered large and are * placed on kvmem backed slabs. */ skc->skc_flags |= KMC_KVMEM; } } /* * Given the type of slab allocate the required resources. */ if (skc->skc_flags & KMC_KVMEM) { rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size); if (rc) goto out; rc = spl_magazine_create(skc); if (rc) goto out; } else { unsigned long slabflags = 0; if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) { rc = EINVAL; goto out; } #if defined(SLAB_USERCOPY) /* * Required for PAX-enabled kernels if the slab is to be * used for copying between user and kernel space. */ slabflags |= SLAB_USERCOPY; #endif #if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY) /* * Newer grsec patchset uses kmem_cache_create_usercopy() * instead of SLAB_USERCOPY flag */ skc->skc_linux_cache = kmem_cache_create_usercopy( skc->skc_name, size, align, slabflags, 0, size, NULL); #else skc->skc_linux_cache = kmem_cache_create( skc->skc_name, size, align, slabflags, NULL); #endif if (skc->skc_linux_cache == NULL) { rc = ENOMEM; goto out; } } down_write(&spl_kmem_cache_sem); list_add_tail(&skc->skc_list, &spl_kmem_cache_list); up_write(&spl_kmem_cache_sem); return (skc); out: kfree(skc->skc_name); percpu_counter_destroy(&skc->skc_linux_alloc); kfree(skc); return (NULL); } EXPORT_SYMBOL(spl_kmem_cache_create); /* * Register a move callback for cache defragmentation. * XXX: Unimplemented but harmless to stub out for now. */ void spl_kmem_cache_set_move(spl_kmem_cache_t *skc, kmem_cbrc_t (move)(void *, void *, size_t, void *)) { ASSERT(move != NULL); } EXPORT_SYMBOL(spl_kmem_cache_set_move); /* * Destroy a cache and all objects associated with the cache. */ void spl_kmem_cache_destroy(spl_kmem_cache_t *skc) { DECLARE_WAIT_QUEUE_HEAD(wq); taskqid_t id; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skc->skc_flags & (KMC_KVMEM | KMC_SLAB)); down_write(&spl_kmem_cache_sem); list_del_init(&skc->skc_list); up_write(&spl_kmem_cache_sem); /* Cancel any and wait for any pending delayed tasks */ VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); spin_lock(&skc->skc_lock); id = skc->skc_taskqid; spin_unlock(&skc->skc_lock); taskq_cancel_id(spl_kmem_cache_taskq, id); /* * Wait until all current callers complete, this is mainly * to catch the case where a low memory situation triggers a * cache reaping action which races with this destroy. */ wait_event(wq, atomic_read(&skc->skc_ref) == 0); if (skc->skc_flags & KMC_KVMEM) { spl_magazine_destroy(skc); spl_slab_reclaim(skc); } else { ASSERT(skc->skc_flags & KMC_SLAB); kmem_cache_destroy(skc->skc_linux_cache); } spin_lock(&skc->skc_lock); /* * Validate there are no objects in use and free all the * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. */ ASSERT3U(skc->skc_slab_alloc, ==, 0); ASSERT3U(skc->skc_obj_alloc, ==, 0); ASSERT3U(skc->skc_slab_total, ==, 0); ASSERT3U(skc->skc_obj_total, ==, 0); ASSERT3U(skc->skc_obj_emergency, ==, 0); ASSERT(list_empty(&skc->skc_complete_list)); ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0); percpu_counter_destroy(&skc->skc_linux_alloc); spin_unlock(&skc->skc_lock); kfree(skc->skc_name); kfree(skc); } EXPORT_SYMBOL(spl_kmem_cache_destroy); /* * Allocate an object from a slab attached to the cache. This is used to * repopulate the per-cpu magazine caches in batches when they run low. */ static void * spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) { spl_kmem_obj_t *sko; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(sks->sks_magic == SKS_MAGIC); sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); ASSERT(sko->sko_magic == SKO_MAGIC); ASSERT(sko->sko_addr != NULL); /* Remove from sks_free_list */ list_del_init(&sko->sko_list); sks->sks_age = jiffies; sks->sks_ref++; skc->skc_obj_alloc++; /* Track max obj usage statistics */ if (skc->skc_obj_alloc > skc->skc_obj_max) skc->skc_obj_max = skc->skc_obj_alloc; /* Track max slab usage statistics */ if (sks->sks_ref == 1) { skc->skc_slab_alloc++; if (skc->skc_slab_alloc > skc->skc_slab_max) skc->skc_slab_max = skc->skc_slab_alloc; } return (sko->sko_addr); } /* * Generic slab allocation function to run by the global work queues. * It is responsible for allocating a new slab, linking it in to the list * of partial slabs, and then waking any waiters. */ static int __spl_cache_grow(spl_kmem_cache_t *skc, int flags) { spl_kmem_slab_t *sks; fstrans_cookie_t cookie = spl_fstrans_mark(); sks = spl_slab_alloc(skc, flags); spl_fstrans_unmark(cookie); spin_lock(&skc->skc_lock); if (sks) { skc->skc_slab_total++; skc->skc_obj_total += sks->sks_objs; list_add_tail(&sks->sks_list, &skc->skc_partial_list); smp_mb__before_atomic(); clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); smp_mb__after_atomic(); } spin_unlock(&skc->skc_lock); return (sks == NULL ? -ENOMEM : 0); } static void spl_cache_grow_work(void *data) { spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; spl_kmem_cache_t *skc = ska->ska_cache; int error = __spl_cache_grow(skc, ska->ska_flags); atomic_dec(&skc->skc_ref); smp_mb__before_atomic(); clear_bit(KMC_BIT_GROWING, &skc->skc_flags); smp_mb__after_atomic(); if (error == 0) wake_up_all(&skc->skc_waitq); kfree(ska); } /* * Returns non-zero when a new slab should be available. */ static int spl_cache_grow_wait(spl_kmem_cache_t *skc) { return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags)); } /* * No available objects on any slabs, create a new slab. Note that this * functionality is disabled for KMC_SLAB caches which are backed by the * Linux slab. */ static int spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) { int remaining, rc = 0; ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT((skc->skc_flags & KMC_SLAB) == 0); might_sleep(); *obj = NULL; /* * Before allocating a new slab wait for any reaping to complete and * then return so the local magazine can be rechecked for new objects. */ if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING, TASK_UNINTERRUPTIBLE); return (rc ? rc : -EAGAIN); } /* * Note: It would be nice to reduce the overhead of context switch * and improve NUMA locality, by trying to allocate a new slab in the * current process context with KM_NOSLEEP flag. * * However, this can't be applied to vmem/kvmem due to a bug that * spl_vmalloc() doesn't honor gfp flags in page table allocation. */ /* * This is handled by dispatching a work request to the global work * queue. This allows us to asynchronously allocate a new slab while * retaining the ability to safely fall back to a smaller synchronous * allocations to ensure forward progress is always maintained. */ if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { spl_kmem_alloc_t *ska; ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags)); if (ska == NULL) { clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags); smp_mb__after_atomic(); wake_up_all(&skc->skc_waitq); return (-ENOMEM); } atomic_inc(&skc->skc_ref); ska->ska_cache = skc; ska->ska_flags = flags; taskq_init_ent(&ska->ska_tqe); taskq_dispatch_ent(spl_kmem_cache_taskq, spl_cache_grow_work, ska, 0, &ska->ska_tqe); } /* * The goal here is to only detect the rare case where a virtual slab * allocation has deadlocked. We must be careful to minimize the use * of emergency objects which are more expensive to track. Therefore, * we set a very long timeout for the asynchronous allocation and if * the timeout is reached the cache is flagged as deadlocked. From * this point only new emergency objects will be allocated until the * asynchronous allocation completes and clears the deadlocked flag. */ if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) { rc = spl_emergency_alloc(skc, flags, obj); } else { remaining = wait_event_timeout(skc->skc_waitq, spl_cache_grow_wait(skc), HZ / 10); if (!remaining) { spin_lock(&skc->skc_lock); if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); skc->skc_obj_deadlock++; } spin_unlock(&skc->skc_lock); } rc = -ENOMEM; } return (rc); } /* * Refill a per-cpu magazine with objects from the slabs for this cache. * Ideally the magazine can be repopulated using existing objects which have * been released, however if we are unable to locate enough free objects new * slabs of objects will be created. On success NULL is returned, otherwise * the address of a single emergency object is returned for use by the caller. */ static void * spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) { spl_kmem_slab_t *sks; int count = 0, rc, refill; void *obj = NULL; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(skm->skm_magic == SKM_MAGIC); refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); spin_lock(&skc->skc_lock); while (refill > 0) { /* No slabs available we may need to grow the cache */ if (list_empty(&skc->skc_partial_list)) { spin_unlock(&skc->skc_lock); local_irq_enable(); rc = spl_cache_grow(skc, flags, &obj); local_irq_disable(); /* Emergency object for immediate use by caller */ if (rc == 0 && obj != NULL) return (obj); if (rc) goto out; /* Rescheduled to different CPU skm is not local */ if (skm != skc->skc_mag[smp_processor_id()]) goto out; /* * Potentially rescheduled to the same CPU but * allocations may have occurred from this CPU while * we were sleeping so recalculate max refill. */ refill = MIN(refill, skm->skm_size - skm->skm_avail); spin_lock(&skc->skc_lock); continue; } /* Grab the next available slab */ sks = list_entry((&skc->skc_partial_list)->next, spl_kmem_slab_t, sks_list); ASSERT(sks->sks_magic == SKS_MAGIC); ASSERT(sks->sks_ref < sks->sks_objs); ASSERT(!list_empty(&sks->sks_free_list)); /* * Consume as many objects as needed to refill the requested * cache. We must also be careful not to overfill it. */ while (sks->sks_ref < sks->sks_objs && refill-- > 0 && ++count) { ASSERT(skm->skm_avail < skm->skm_size); ASSERT(count < skm->skm_size); skm->skm_objs[skm->skm_avail++] = spl_cache_obj(skc, sks); } /* Move slab to skc_complete_list when full */ if (sks->sks_ref == sks->sks_objs) { list_del(&sks->sks_list); list_add(&sks->sks_list, &skc->skc_complete_list); } } spin_unlock(&skc->skc_lock); out: return (NULL); } /* * Release an object back to the slab from which it came. */ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) { spl_kmem_slab_t *sks = NULL; spl_kmem_obj_t *sko = NULL; ASSERT(skc->skc_magic == SKC_MAGIC); sko = spl_sko_from_obj(skc, obj); ASSERT(sko->sko_magic == SKO_MAGIC); sks = sko->sko_slab; ASSERT(sks->sks_magic == SKS_MAGIC); ASSERT(sks->sks_cache == skc); list_add(&sko->sko_list, &sks->sks_free_list); sks->sks_age = jiffies; sks->sks_ref--; skc->skc_obj_alloc--; /* * Move slab to skc_partial_list when no longer full. Slabs * are added to the head to keep the partial list is quasi-full * sorted order. Fuller at the head, emptier at the tail. */ if (sks->sks_ref == (sks->sks_objs - 1)) { list_del(&sks->sks_list); list_add(&sks->sks_list, &skc->skc_partial_list); } /* * Move empty slabs to the end of the partial list so * they can be easily found and freed during reclamation. */ if (sks->sks_ref == 0) { list_del(&sks->sks_list); list_add_tail(&sks->sks_list, &skc->skc_partial_list); skc->skc_slab_alloc--; } } /* * Allocate an object from the per-cpu magazine, or if the magazine * is empty directly allocate from a slab and repopulate the magazine. */ void * spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) { spl_kmem_magazine_t *skm; void *obj = NULL; ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); /* * Allocate directly from a Linux slab. All optimizations are left * to the underlying cache we only need to guarantee that KM_SLEEP * callers will never fail. */ if (skc->skc_flags & KMC_SLAB) { struct kmem_cache *slc = skc->skc_linux_cache; do { obj = kmem_cache_alloc(slc, kmem_flags_convert(flags)); } while ((obj == NULL) && !(flags & KM_NOSLEEP)); if (obj != NULL) { /* * Even though we leave everything up to the * underlying cache we still keep track of * how many objects we've allocated in it for * better debuggability. */ percpu_counter_inc(&skc->skc_linux_alloc); } goto ret; } local_irq_disable(); restart: /* * Safe to update per-cpu structure without lock, but * in the restart case we must be careful to reacquire * the local magazine since this may have changed * when we need to grow the cache. */ skm = skc->skc_mag[smp_processor_id()]; ASSERT(skm->skm_magic == SKM_MAGIC); if (likely(skm->skm_avail)) { /* Object available in CPU cache, use it */ obj = skm->skm_objs[--skm->skm_avail]; } else { obj = spl_cache_refill(skc, skm, flags); if ((obj == NULL) && !(flags & KM_NOSLEEP)) goto restart; local_irq_enable(); goto ret; } local_irq_enable(); ASSERT(obj); ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); ret: /* Pre-emptively migrate object to CPU L1 cache */ if (obj) { if (obj && skc->skc_ctor) skc->skc_ctor(obj, skc->skc_private, flags); else prefetchw(obj); } return (obj); } EXPORT_SYMBOL(spl_kmem_cache_alloc); /* * Free an object back to the local per-cpu magazine, there is no * guarantee that this is the same magazine the object was originally * allocated from. We may need to flush entire from the magazine * back to the slabs to make space. */ void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) { spl_kmem_magazine_t *skm; unsigned long flags; int do_reclaim = 0; int do_emergency = 0; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); /* * Run the destructor */ if (skc->skc_dtor) skc->skc_dtor(obj, skc->skc_private); /* * Free the object from the Linux underlying Linux slab. */ if (skc->skc_flags & KMC_SLAB) { kmem_cache_free(skc->skc_linux_cache, obj); percpu_counter_dec(&skc->skc_linux_alloc); return; } /* * While a cache has outstanding emergency objects all freed objects * must be checked. However, since emergency objects will never use * a virtual address these objects can be safely excluded as an * optimization. */ if (!is_vmalloc_addr(obj)) { spin_lock(&skc->skc_lock); do_emergency = (skc->skc_obj_emergency > 0); spin_unlock(&skc->skc_lock); if (do_emergency && (spl_emergency_free(skc, obj) == 0)) return; } local_irq_save(flags); /* * Safe to update per-cpu structure without lock, but * no remote memory allocation tracking is being performed * it is entirely possible to allocate an object from one * CPU cache and return it to another. */ skm = skc->skc_mag[smp_processor_id()]; ASSERT(skm->skm_magic == SKM_MAGIC); /* * Per-CPU cache full, flush it to make space for this object, * this may result in an empty slab which can be reclaimed once * interrupts are re-enabled. */ if (unlikely(skm->skm_avail >= skm->skm_size)) { spl_cache_flush(skc, skm, skm->skm_refill); do_reclaim = 1; } /* Available space in cache, use it */ skm->skm_objs[skm->skm_avail++] = obj; local_irq_restore(flags); if (do_reclaim) spl_slab_reclaim(skc); } EXPORT_SYMBOL(spl_kmem_cache_free); /* * Depending on how many and which objects are released it may simply * repopulate the local magazine which will then need to age-out. Objects * which cannot fit in the magazine will be released back to their slabs * which will also need to age out before being released. This is all just * best effort and we do not want to thrash creating and destroying slabs. */ void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) { ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); if (skc->skc_flags & KMC_SLAB) return; atomic_inc(&skc->skc_ref); /* * Prevent concurrent cache reaping when contended. */ if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) goto out; /* Reclaim from the magazine and free all now empty slabs. */ unsigned long irq_flags; local_irq_save(irq_flags); spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; spl_cache_flush(skc, skm, skm->skm_avail); local_irq_restore(irq_flags); spl_slab_reclaim(skc); clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); smp_mb__after_atomic(); wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); out: atomic_dec(&skc->skc_ref); } EXPORT_SYMBOL(spl_kmem_cache_reap_now); /* * This is stubbed out for code consistency with other platforms. There * is existing logic to prevent concurrent reaping so while this is ugly * it should do no harm. */ int spl_kmem_cache_reap_active(void) { return (0); } EXPORT_SYMBOL(spl_kmem_cache_reap_active); /* * Reap all free slabs from all registered caches. */ void spl_kmem_reap(void) { spl_kmem_cache_t *skc = NULL; down_read(&spl_kmem_cache_sem); list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { spl_kmem_cache_reap_now(skc); } up_read(&spl_kmem_cache_sem); } EXPORT_SYMBOL(spl_kmem_reap); int spl_kmem_cache_init(void) { init_rwsem(&spl_kmem_cache_sem); INIT_LIST_HEAD(&spl_kmem_cache_list); spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", spl_kmem_cache_kmem_threads, maxclsyspri, spl_kmem_cache_kmem_threads * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (spl_kmem_cache_taskq == NULL) return (-ENOMEM); return (0); } void spl_kmem_cache_fini(void) { taskq_destroy(spl_kmem_cache_taskq); } diff --git a/module/zfs/zfs_chksum.c b/module/zfs/zfs_chksum.c index 4a9a36d87e66..91247f29278f 100644 --- a/module/zfs/zfs_chksum.c +++ b/module/zfs/zfs_chksum.c @@ -1,357 +1,357 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2021-2022 Tino Reichardt */ #include #include #include #include #include #include /* limit benchmarking to max 256KiB, when EdonR is slower then this: */ #define LIMIT_PERF_MBS 300 typedef struct { const char *name; const char *impl; uint64_t bs1k; uint64_t bs4k; uint64_t bs16k; uint64_t bs64k; uint64_t bs256k; uint64_t bs1m; uint64_t bs4m; uint64_t bs16m; zio_cksum_salt_t salt; zio_checksum_t *(func); zio_checksum_tmpl_init_t *(init); zio_checksum_tmpl_free_t *(free); } chksum_stat_t; static chksum_stat_t *chksum_stat_data = 0; static int chksum_stat_cnt = 0; static kstat_t *chksum_kstat = NULL; /* * i3-1005G1 test output: * * implementation 1k 4k 16k 64k 256k 1m 4m * fletcher-4 5421 15001 26468 32555 34720 32801 18847 * edonr-generic 1196 1602 1761 1749 1762 1759 1751 * skein-generic 546 591 608 615 619 612 616 * sha256-generic 246 270 274 274 277 275 276 * sha256-avx 262 296 304 307 307 307 306 * sha256-sha-ni 769 1072 1172 1220 1219 1232 1228 * sha256-openssl 240 300 316 314 304 285 276 * sha512-generic 333 374 385 392 391 393 392 * sha512-openssl 353 441 467 476 472 467 426 * sha512-avx 362 444 473 475 479 476 478 * sha512-avx2 394 500 530 538 543 545 542 * blake3-generic 308 313 313 313 312 313 312 * blake3-sse2 402 1289 1423 1446 1432 1458 1413 * blake3-sse41 427 1470 1625 1704 1679 1607 1629 * blake3-avx2 428 1920 3095 3343 3356 3318 3204 * blake3-avx512 473 2687 4905 5836 5844 5643 5374 */ static int chksum_kstat_headers(char *buf, size_t size) { ssize_t off = 0; off += kmem_scnprintf(buf + off, size, "%-23s", "implementation"); off += kmem_scnprintf(buf + off, size - off, "%8s", "1k"); off += kmem_scnprintf(buf + off, size - off, "%8s", "4k"); off += kmem_scnprintf(buf + off, size - off, "%8s", "16k"); off += kmem_scnprintf(buf + off, size - off, "%8s", "64k"); off += kmem_scnprintf(buf + off, size - off, "%8s", "256k"); off += kmem_scnprintf(buf + off, size - off, "%8s", "1m"); off += kmem_scnprintf(buf + off, size - off, "%8s", "4m"); (void) kmem_scnprintf(buf + off, size - off, "%8s\n", "16m"); return (0); } static int chksum_kstat_data(char *buf, size_t size, void *data) { chksum_stat_t *cs; ssize_t off = 0; char b[24]; cs = (chksum_stat_t *)data; kmem_scnprintf(b, 23, "%s-%s", cs->name, cs->impl); off += kmem_scnprintf(buf + off, size - off, "%-23s", b); off += kmem_scnprintf(buf + off, size - off, "%8llu", (u_longlong_t)cs->bs1k); off += kmem_scnprintf(buf + off, size - off, "%8llu", (u_longlong_t)cs->bs4k); off += kmem_scnprintf(buf + off, size - off, "%8llu", (u_longlong_t)cs->bs16k); off += kmem_scnprintf(buf + off, size - off, "%8llu", (u_longlong_t)cs->bs64k); off += kmem_scnprintf(buf + off, size - off, "%8llu", (u_longlong_t)cs->bs256k); off += kmem_scnprintf(buf + off, size - off, "%8llu", (u_longlong_t)cs->bs1m); off += kmem_scnprintf(buf + off, size - off, "%8llu", (u_longlong_t)cs->bs4m); (void) kmem_scnprintf(buf + off, size - off, "%8llu\n", (u_longlong_t)cs->bs16m); return (0); } static void * chksum_kstat_addr(kstat_t *ksp, loff_t n) { if (n < chksum_stat_cnt) ksp->ks_private = (void *)(chksum_stat_data + n); else ksp->ks_private = NULL; return (ksp->ks_private); } static void chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round, uint64_t *result) { hrtime_t start; uint64_t run_bw, run_time_ns, run_count = 0, size = 0; uint32_t l, loops = 0; zio_cksum_t zcp; switch (round) { case 1: /* 1k */ size = 1<<10; loops = 128; break; case 2: /* 2k */ size = 1<<12; loops = 64; break; case 3: /* 4k */ size = 1<<14; loops = 32; break; case 4: /* 16k */ size = 1<<16; loops = 16; break; case 5: /* 256k */ size = 1<<18; loops = 8; break; case 6: /* 1m */ size = 1<<20; loops = 4; break; case 7: /* 4m */ size = 1<<22; loops = 1; break; case 8: /* 16m */ size = 1<<24; loops = 1; break; } kpreempt_disable(); start = gethrtime(); do { for (l = 0; l < loops; l++, run_count++) cs->func(abd, size, ctx, &zcp); run_time_ns = gethrtime() - start; } while (run_time_ns < MSEC2NSEC(1)); kpreempt_enable(); run_bw = size * run_count * NANOSEC; run_bw /= run_time_ns; /* B/s */ *result = run_bw/1024/1024; /* MiB/s */ } #define LIMIT_INIT 0 #define LIMIT_NEEDED 1 #define LIMIT_NOLIMIT 2 static void chksum_benchit(chksum_stat_t *cs) { abd_t *abd; void *ctx = 0; void *salt = &cs->salt.zcs_bytes; static int chksum_stat_limit = LIMIT_INIT; memset(salt, 0, sizeof (cs->salt.zcs_bytes)); if (cs->init) ctx = cs->init(&cs->salt); /* allocate test memory via abd linear interface */ abd = abd_alloc_linear(1<<20, B_FALSE); chksum_run(cs, abd, ctx, 1, &cs->bs1k); chksum_run(cs, abd, ctx, 2, &cs->bs4k); chksum_run(cs, abd, ctx, 3, &cs->bs16k); chksum_run(cs, abd, ctx, 4, &cs->bs64k); chksum_run(cs, abd, ctx, 5, &cs->bs256k); /* check if we ran on a slow cpu */ if (chksum_stat_limit == LIMIT_INIT) { if (cs->bs1k < LIMIT_PERF_MBS) { chksum_stat_limit = LIMIT_NEEDED; } else { chksum_stat_limit = LIMIT_NOLIMIT; } } /* skip benchmarks >= 1MiB when the CPU is to slow */ if (chksum_stat_limit == LIMIT_NEEDED) goto abort; chksum_run(cs, abd, ctx, 6, &cs->bs1m); abd_free(abd); /* allocate test memory via abd non linear interface */ abd = abd_alloc(1<<24, B_FALSE); chksum_run(cs, abd, ctx, 7, &cs->bs4m); chksum_run(cs, abd, ctx, 8, &cs->bs16m); abort: abd_free(abd); /* free up temp memory */ if (cs->free) cs->free(ctx); } /* * Initialize and benchmark all supported implementations. */ static void chksum_benchmark(void) { #ifndef _KERNEL /* we need the benchmark only for the kernel module */ return; #endif chksum_stat_t *cs; int cbid = 0; uint64_t max = 0; uint32_t id, id_save; /* space for the benchmark times */ chksum_stat_cnt = 4; chksum_stat_cnt += blake3_impl_getcnt(); - chksum_stat_data = (chksum_stat_t *)kmem_zalloc( + chksum_stat_data = kmem_zalloc( sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP); /* edonr - needs to be the first one here (slow CPU check) */ cs = &chksum_stat_data[cbid++]; cs->init = abd_checksum_edonr_tmpl_init; cs->func = abd_checksum_edonr_native; cs->free = abd_checksum_edonr_tmpl_free; cs->name = "edonr"; cs->impl = "generic"; chksum_benchit(cs); /* skein */ cs = &chksum_stat_data[cbid++]; cs->init = abd_checksum_skein_tmpl_init; cs->func = abd_checksum_skein_native; cs->free = abd_checksum_skein_tmpl_free; cs->name = "skein"; cs->impl = "generic"; chksum_benchit(cs); /* sha256 */ cs = &chksum_stat_data[cbid++]; cs->init = 0; cs->func = abd_checksum_SHA256; cs->free = 0; cs->name = "sha256"; cs->impl = "generic"; chksum_benchit(cs); /* sha512 */ cs = &chksum_stat_data[cbid++]; cs->init = 0; cs->func = abd_checksum_SHA512_native; cs->free = 0; cs->name = "sha512"; cs->impl = "generic"; chksum_benchit(cs); /* blake3 */ id_save = blake3_impl_getid(); for (id = 0; id < blake3_impl_getcnt(); id++) { blake3_impl_setid(id); cs = &chksum_stat_data[cbid++]; cs->init = abd_checksum_blake3_tmpl_init; cs->func = abd_checksum_blake3_native; cs->free = abd_checksum_blake3_tmpl_free; cs->name = "blake3"; cs->impl = blake3_impl_getname(); chksum_benchit(cs); if (cs->bs256k > max) { max = cs->bs256k; blake3_impl_set_fastest(id); } } /* restore initial value */ blake3_impl_setid(id_save); } void chksum_init(void) { #ifdef _KERNEL blake3_per_cpu_ctx_init(); #endif /* Benchmark supported implementations */ chksum_benchmark(); /* Install kstats for all implementations */ chksum_kstat = kstat_create("zfs", 0, "chksum_bench", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); if (chksum_kstat != NULL) { chksum_kstat->ks_data = NULL; chksum_kstat->ks_ndata = UINT32_MAX; kstat_set_raw_ops(chksum_kstat, chksum_kstat_headers, chksum_kstat_data, chksum_kstat_addr); kstat_install(chksum_kstat); } } void chksum_fini(void) { if (chksum_kstat != NULL) { kstat_delete(chksum_kstat); chksum_kstat = NULL; } if (chksum_stat_cnt) { kmem_free(chksum_stat_data, sizeof (chksum_stat_t) * chksum_stat_cnt); chksum_stat_cnt = 0; chksum_stat_data = 0; } #ifdef _KERNEL blake3_per_cpu_ctx_fini(); #endif } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 45ecb0773260..0c392b9da0fb 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1,1005 +1,1005 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static ulong_t zfs_fsync_sync_cnt = 4; int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) { int error = 0; zfsvfs_t *zfsvfs = ZTOZSB(zp); (void) tsd_set(zfs_fsyncer_key, (void *)(uintptr_t)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) goto out; atomic_inc_32(&zp->z_sync_writes_cnt); zil_commit(zfsvfs->z_log, zp->z_id); atomic_dec_32(&zp->z_sync_writes_cnt); zfs_exit(zfsvfs, FTAG); } out: tsd_set(zfs_fsyncer_key, NULL); return (error); } #if defined(SEEK_HOLE) && defined(SEEK_DATA) /* * Lseek support for finding holes (cmd == SEEK_HOLE) and * data (cmd == SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) { zfs_locked_range_t *lr; uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == F_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; /* Flush any mmap()'d data to disk */ if (zn_has_cached_data(zp)) zn_flush_cached_data(zp, B_FALSE); lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER); error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); zfs_rangelock_exit(lr); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* File was dirty, so fall back to using generic logic */ if (error == EBUSY) { if (hole) *off = file_sz; return (0); } /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } int zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); error = zfs_holey_common(zp, cmd, off); zfs_exit(zfsvfs, FTAG); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ int zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if (flag & V_ACE_MASK) #if defined(__linux__) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, kcred->user_ns); #else error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, NULL); #endif else #if defined(__linux__) error = zfs_zaccess_rwx(zp, mode, flag, cr, kcred->user_ns); #else error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL); #endif zfs_exit(zfsvfs, FTAG); return (error); } static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: zp - inode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - O_SYNC flags; used to provide FRSYNC semantics. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. * * Side Effects: * inode - atime updated if byte count > 0 */ int zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { (void) cr; int error = 0; boolean_t frsync = B_FALSE; zfsvfs_t *zfsvfs = ZTOZSB(zp); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if (zp->z_pflags & ZFS_AV_QUARANTINED) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EACCES)); } /* We don't copy out anything useful for directories. */ if (Z_ISDIR(ZTOTYPE(zp))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } /* * Validate file offset */ if (zfs_uio_offset(uio) < (offset_t)0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (zfs_uio_resid(uio) == 0) { zfs_exit(zfsvfs, FTAG); return (0); } #ifdef FRSYNC /* * If we're in FRSYNC mode, sync out this znode before reading it. * Only do this for non-snapshots. * * Some platforms do not support FRSYNC and instead map it * to O_SYNC, which results in unnecessary calls to zil_commit. We * only honor FRSYNC requests on platforms which support it. */ frsync = !!(ioflag & FRSYNC); #endif if (zfsvfs->z_log && (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (zfs_uio_offset(uio) >= zp->z_size) { error = 0; goto out; } ASSERT(zfs_uio_offset(uio) < zp->z_size); #if defined(__linux__) ssize_t start_offset = zfs_uio_offset(uio); #endif ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio)); ssize_t start_resid = n; while (n > 0) { ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); #ifdef UIO_NOCOPY if (zfs_uio_segflg(uio) == UIO_NOCOPY) error = mappedread_sf(zp, nbytes, uio); else #endif if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { error = mappedread(zp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); #if defined(__linux__) /* * if we actually read some bytes, bubbling EFAULT * up to become EAGAIN isn't what we want here... * * ...on Linux, at least. On FBSD, doing this breaks. */ if (error == EFAULT && (zfs_uio_offset(uio) - start_offset) != 0) error = 0; #endif break; } n -= nbytes; } int64_t nread = start_resid - n; dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); task_io_account_read(nread); out: zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); zfs_exit(zfsvfs, FTAG); return (error); } static void zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr, uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx) { zilog_t *zilog = zfsvfs->z_log; const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); ASSERT(clear_setid_bits_txgp != NULL); ASSERT(tx != NULL); /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the execute bits is set. * * It would be nice to do this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(zp, cr, ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); mutex_exit(&zp->z_acl_lock); /* * Make sure SUID/SGID bits will be removed when we replay the * log. If the setid bits are keep coming back, don't log more * than one TX_SETATTR per transaction group. */ if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) { vattr_t va = {0}; va.va_mask = ATTR_MODE; va.va_nodeid = zp->z_id; va.va_mode = newmode; zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, ATTR_MODE, NULL); *clear_setid_bits_txgp = dmu_tx_get_txg(tx); } } else { mutex_exit(&zp->z_acl_lock); } } /* * Write the bytes to a file. * * IN: zp - znode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - O_APPEND flag set if in append mode. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated if byte count > 0 */ int zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { int error = 0, error1; ssize_t start_resid = zfs_uio_resid(uio); uint64_t clear_setid_bits_txg = 0; /* * Fasttrack empty write */ ssize_t n = start_resid; if (n == 0) return (0); zfsvfs_t *zfsvfs = ZTOZSB(zp); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); sa_bulk_attr_t bulk[4]; int count = 0; uint64_t mtime[2], ctime[2]; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM. * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common() */ if ((zp->z_pflags & ZFS_IMMUTABLE) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && (zfs_uio_offset(uio) < zp->z_size))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } /* * Validate file offset */ offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); if (woff < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } const uint64_t max_blksz = zfsvfs->z_max_blksz; /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFAULT)); } /* * If in append mode, set the io offset pointer to eof. */ zfs_locked_range_t *lr; if (ioflag & O_APPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); woff = lr->lr_offset; if (lr->lr_length == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } zfs_uio_setoffset(uio, woff); } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } if (zn_rlimit_fsize(zp, uio)) { zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFBIG)); } const rlim64_t limit = MAXOFFSET_T; if (woff >= limit) { zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFBIG)); } if (n > limit - woff) n = limit - woff; uint64_t end_size = MAX(zp->z_size, woff + n); zilog_t *zilog = zfsvfs->z_log; const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); const uint64_t projid = zp->z_projid; /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { woff = zfs_uio_offset(uio); if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || (projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid))) { error = SET_ERROR(EDQUOT); break; } arc_buf_t *abuf = NULL; if (n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ size_t cbytes; abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if ((error = zfs_uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes))) { dmu_return_arcbuf(abuf); break; } ASSERT3S(cbytes, ==, max_blksz); } /* * Start a transaction. */ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, MIN(n, max_blksz)); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * NB: We must call zfs_clear_setid_bits_if_necessary before * committing the transaction! */ /* * If rangelock_enter() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since rangelock_reduce() will * shrink down lr_length to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_rangelock_reduce(lr, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ const ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = zfs_uio_resid(uio); zfs_uio_fault_disable(uio, B_TRUE); error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); zfs_uio_fault_disable(uio, B_FALSE); #ifdef __linux__ if (error == EFAULT) { zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, &clear_setid_bits_txg, tx); dmu_tx_commit(tx); /* * Account for partial writes before * continuing the loop. * Update needs to occur before the next * zfs_uio_prefaultpages, or prefaultpages may * error, and we may break the loop early. */ if (tx_bytes != zfs_uio_resid(uio)) n -= tx_bytes - zfs_uio_resid(uio); if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { break; } continue; } #endif /* * On FreeBSD, EFAULT should be propagated back to the * VFS, which will handle faulting and will retry. */ if (error != 0 && error != EFAULT) { zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, &clear_setid_bits_txg, tx); dmu_tx_commit(tx); break; } tx_bytes -= zfs_uio_resid(uio); } else { /* Implied by abuf != NULL: */ ASSERT3S(n, >=, max_blksz); ASSERT0(P2PHASE(woff, max_blksz)); /* * We can simplify nbytes to MIN(n, max_blksz) since * P2PHASE(woff, max_blksz) is 0, and knowing * n >= max_blksz lets us simplify further: */ ASSERT3S(nbytes, ==, max_blksz); /* * Thus, we're writing a full block at a block-aligned * offset and extending the file past EOF. * * dmu_assign_arcbuf_by_dbuf() will directly assign the * arc buffer to a dbuf. */ error = dmu_assign_arcbuf_by_dbuf( sa_get_db(zp->z_sa_hdl), woff, abuf, tx); if (error != 0) { /* * XXX This might not be necessary if * dmu_assign_arcbuf_by_dbuf is guaranteed * to be atomic. */ zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, &clear_setid_bits_txg, tx); dmu_return_arcbuf(abuf); dmu_tx_commit(tx); break; } ASSERT3S(nbytes, <=, zfs_uio_resid(uio)); zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } if (tx_bytes && zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, &clear_setid_bits_txg, tx); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { (void) atomic_cas_64(&zp->z_size, end_size, zfs_uio_offset(uio)); ASSERT(error == 0 || error == EFAULT); } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); if (error1 != 0) /* Avoid clobbering EFAULT. */ error = error1; /* * NB: During replay, the TX_SETATTR record logged by * zfs_clear_setid_bits_if_necessary must precede any of * the TX_WRITE records logged here. */ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, NULL, NULL); dmu_tx_commit(tx); if (error != 0) break; ASSERT3S(tx_bytes, ==, nbytes); n -= nbytes; if (n > 0) { if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { error = SET_ERROR(EFAULT); break; } } } zfs_znode_update_vfs(zp); zfs_rangelock_exit(lr); /* * If we're in replay mode, or we made no progress, or the * uio data is inaccessible return an error. Otherwise, it's * at least a partial write, so it's successful. */ if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || error == EFAULT) { zfs_exit(zfsvfs, FTAG); return (error); } if (ioflag & (O_SYNC | O_DSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); const int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); task_io_account_write(nwritten); zfs_exit(zfsvfs, FTAG); return (0); } int zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); error = zfs_getacl(zp, vsecp, skipaclchk, cr); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } #ifdef ZFS_DEBUG static int zil_fault_io = 0; #endif static void zfs_get_done(zgd_t *zgd, int error); /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error = 0; uint64_t zp_gen; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ zfs_zrele_async(zp); return (SET_ERROR(ENOENT)); } /* check if generation number matches */ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (zp_gen)) != 0) { zfs_zrele_async(zp); return (SET_ERROR(EIO)); } if (zp_gen != gen) { zfs_zrele_async(zp); return (SET_ERROR(ENOENT)); } - zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_rangelock_exit(zgd->zgd_lr); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef ZFS_DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; /* * TX_WRITE2 relies on the data previously * written by the TX_WRITE that caused * EALREADY. We zero out the BP because * it is the old, currently-on-disk BP. */ zgd->zgd_bp = NULL; BP_ZERO(bp); error = 0; } } } zfs_get_done(zgd, error); return (error); } static void zfs_get_done(zgd_t *zgd, int error) { (void) error; znode_t *zp = zgd->zgd_private; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ zfs_zrele_async(zp); kmem_free(zgd, sizeof (zgd_t)); } EXPORT_SYMBOL(zfs_access); EXPORT_SYMBOL(zfs_fsync); EXPORT_SYMBOL(zfs_holey); EXPORT_SYMBOL(zfs_read); EXPORT_SYMBOL(zfs_write); EXPORT_SYMBOL(zfs_getsecattr); EXPORT_SYMBOL(zfs_setsecattr); ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, "Bytes to read per chunk"); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 20578a8223b2..1371e5187516 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -1,1739 +1,1739 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * * /dev// * * Volumes are persistent through reboot and module load. No user command * needs to be run before opening and using a device. * * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ /* * Note on locking of zvol state structures. * * These structures are used to maintain internal state used to emulate block * devices on top of zvols. In particular, management of device minor number * operations - create, remove, rename, and set_snapdev - involves access to * these structures. The zvol_state_lock is primarily used to protect the * zvol_state_list. The zv->zv_state_lock is used to protect the contents * of the zvol_state_t structures, as well as to make sure that when the * time comes to remove the structure from the list, it is not in use, and * therefore, it can be taken off zvol_state_list and freed. * * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol, * e.g. for the duration of receive and rollback operations. This lock can be * held for significant periods of time. Given that it is undesirable to hold * mutexes for long periods of time, the following lock ordering applies: * - take zvol_state_lock if necessary, to protect zvol_state_list * - take zv_suspend_lock if necessary, by the code path in question * - take zv_state_lock to protect zvol_state_t * * The minor operations are issued to spa->spa_zvol_taskq queues, that are * single-threaded (to preserve order of minor operations), and are executed * through the zvol_task_cb that dispatches the specific operations. Therefore, * these operations are serialized per pool. Consequently, we can be certain * that for a given zvol, there is only one operation at a time in progress. * That is why one can be sure that first, zvol_state_t for a given zvol is * allocated and placed on zvol_state_list, and then other minor operations * for this zvol are going to proceed in the order of issue. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; struct hlist_head *zvol_htable; static list_t zvol_state_list; krwlock_t zvol_state_lock; typedef enum { ZVOL_ASYNC_REMOVE_MINORS, ZVOL_ASYNC_RENAME_MINORS, ZVOL_ASYNC_SET_SNAPDEV, ZVOL_ASYNC_SET_VOLMODE, ZVOL_ASYNC_MAX } zvol_async_op_t; typedef struct { zvol_async_op_t op; char name1[MAXNAMELEN]; char name2[MAXNAMELEN]; uint64_t value; } zvol_task_t; uint64_t zvol_name_hash(const char *name) { int i; uint64_t crc = -1ULL; const uint8_t *p = (const uint8_t *)name; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) { crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; } return (crc); } /* * Find a zvol_state_t given the name and hash generated by zvol_name_hash. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ zvol_state_t * zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) { zvol_state_t *zv; struct hlist_node *p = NULL; rw_enter(&zvol_state_lock, RW_READER); hlist_for_each(p, ZVOL_HT_HEAD(hash)) { zv = hlist_entry(p, zvol_state_t, zv_hlink); mutex_enter(&zv->zv_state_lock); if (zv->zv_hash == hash && strncmp(zv->zv_name, name, MAXNAMELEN) == 0) { /* * this is the right zvol, take the locks in the * right order */ if (mode != RW_NONE && !rw_tryenter(&zv->zv_suspend_lock, mode)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, mode); mutex_enter(&zv->zv_state_lock); /* * zvol cannot be renamed as we continue * to hold zvol_state_lock */ ASSERT(zv->zv_hash == hash && strncmp(zv->zv_name, name, MAXNAMELEN) == 0); } rw_exit(&zvol_state_lock); return (zv); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (NULL); } /* * Find a zvol_state_t given the name. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ static zvol_state_t * zvol_find_by_name(const char *name, int mode) { return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode)); } /* * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. */ void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; nvlist_t *nvprops = zct->zct_props; int error; uint64_t volblocksize, volsize; VERIFY(nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); /* * These properties must be removed from the list so the generic * property setting step won't apply to them. */ VERIFY(nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); (void) nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); ASSERT(error == 0); } /* * ZFS_IOC_OBJSET_STATS entry point. */ int zvol_get_stats(objset_t *os, nvlist_t *nv) { int error; dmu_object_info_t *doi; uint64_t val; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) return (SET_ERROR(error)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_object_info(os, ZVOL_OBJ, doi); if (error == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, doi->doi_data_block_size); } kmem_free(doi, sizeof (dmu_object_info_t)); return (SET_ERROR(error)); } /* * Sanity check volume size. */ int zvol_check_volsize(uint64_t volsize, uint64_t blocksize) { if (volsize == 0) return (SET_ERROR(EINVAL)); if (volsize % blocksize != 0) return (SET_ERROR(EINVAL)); #ifdef _ILP32 if (volsize - 1 > SPEC_MAXOFFSET_T) return (SET_ERROR(EOVERFLOW)); #endif return (0); } /* * Ensure the zap is flushed then inform the VFS of the capacity change. */ static int zvol_update_volsize(uint64_t volsize, objset_t *os) { dmu_tx_t *tx; int error; uint64_t txg; tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (SET_ERROR(error)); } txg = dmu_tx_get_txg(tx); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); if (error == 0) error = dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END); return (error); } /* * Set ZFS_PROP_VOLSIZE set entry point. Note that modifying the volume * size will result in a udev "change" event being generated. */ int zvol_set_volsize(const char *name, uint64_t volsize) { objset_t *os = NULL; uint64_t readonly; int error; boolean_t owned = B_FALSE; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); if (error != 0) return (SET_ERROR(error)); if (readonly) return (SET_ERROR(EROFS)); zvol_state_t *zv = zvol_find_by_name(name, RW_READER); ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) && RW_READ_HELD(&zv->zv_suspend_lock))); if (zv == NULL || zv->zv_objset == NULL) { if (zv != NULL) rw_exit(&zv->zv_suspend_lock); if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE, FTAG, &os)) != 0) { if (zv != NULL) mutex_exit(&zv->zv_state_lock); return (SET_ERROR(error)); } owned = B_TRUE; if (zv != NULL) zv->zv_objset = os; } else { os = zv->zv_objset; } dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP); if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) || (error = zvol_check_volsize(volsize, doi->doi_data_block_size))) goto out; error = zvol_update_volsize(volsize, os); if (error == 0 && zv != NULL) { zv->zv_volsize = volsize; zv->zv_changed = 1; } out: kmem_free(doi, sizeof (dmu_object_info_t)); if (owned) { dmu_objset_disown(os, B_TRUE, FTAG); if (zv != NULL) zv->zv_objset = NULL; } else { rw_exit(&zv->zv_suspend_lock); } if (zv != NULL) mutex_exit(&zv->zv_state_lock); if (error == 0 && zv != NULL) zvol_os_update_volsize(zv, volsize); return (SET_ERROR(error)); } /* * Sanity check volume block size. */ int zvol_check_volblocksize(const char *name, uint64_t volblocksize) { /* Record sizes above 128k need the feature to be enabled */ if (volblocksize > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (volblocksize > zfs_max_recordsize) return (SET_ERROR(EDOM)); spa_close(spa, FTAG); } if (volblocksize < SPA_MINBLOCKSIZE || volblocksize > SPA_MAXBLOCKSIZE || !ISP2(volblocksize)) return (SET_ERROR(EDOM)); return (0); } /* * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we * implement DKIOCFREE/free-long-range. */ static int zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_truncate_t *lr = arg2; uint64_t offset, length; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); int error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length); } return (error); } /* * Replay a TX_WRITE ZIL transaction that didn't get committed * after a system failure */ static int zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_write_t *lr = arg2; objset_t *os = zv->zv_objset; char *data = (char *)(lr + 1); /* data follows lr_write_t */ uint64_t offset, length; dmu_tx_t *tx; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, offset, length, data, tx); (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); } return (error); } static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { (void) arg1, (void) arg2, (void) byteswap; return (SET_ERROR(ENOTSUP)); } /* * Callback vectors for replaying records. * Only TX_WRITE and TX_TRUNCATE are needed for zvol. */ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* no such transaction type */ zvol_replay_err, /* TX_CREATE */ zvol_replay_err, /* TX_MKDIR */ zvol_replay_err, /* TX_MKXATTR */ zvol_replay_err, /* TX_SYMLINK */ zvol_replay_err, /* TX_REMOVE */ zvol_replay_err, /* TX_RMDIR */ zvol_replay_err, /* TX_LINK */ zvol_replay_err, /* TX_RENAME */ zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ zvol_replay_err, /* TX_MKDIR_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ }; /* * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. * * We store data in the log buffers if it's small enough. * Otherwise we will later flush the data out via dmu_sync(). */ static const ssize_t zvol_immediate_write_sz = 32768; void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, int sync) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; uint64_t sz = size; if (zil_replaying(zilog, tx)) return; if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) write_state = WR_INDIRECT; else if (!spa_has_slogs(zilog->zl_spa) && size >= blocksize && blocksize > zvol_immediate_write_sz) write_state = WR_INDIRECT; else if (sync) write_state = WR_COPIED; else write_state = WR_NEED_COPY; while (size) { itx_t *itx; lr_write_t *lr; itx_wr_state_t wr_state = write_state; ssize_t len = size; if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog)) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(offset, blocksize), size); itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (wr_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; } itx->itx_wr_state = wr_state; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = offset; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zv; itx->itx_sync = sync; (void) zil_itx_assign(zilog, itx, tx); offset += len; size -= len; } if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg); } } /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. */ void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, boolean_t sync) { itx_t *itx; lr_truncate_t *lr; zilog_t *zilog = zv->zv_zilog; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; lr->lr_length = len; itx->itx_sync = sync; zil_itx_assign(zilog, itx, tx); } static void zvol_get_done(zgd_t *zgd, int error) { (void) error; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); kmem_free(zgd, sizeof (zgd_t)); } /* * Get data to generate a TX_WRITE intent log record. */ int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); - zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change * the data. Contrarily to zfs_get_data we need not re-check * blocksize after we get the lock because it cannot be changed. */ size = zv->zv_volblocksize; offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db != NULL); ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zvol_get_done, zgd); if (error == 0) return (0); } } zvol_get_done(zgd, error); return (SET_ERROR(error)); } /* * The zvol_state_t's are inserted into zvol_state_list and zvol_htable. */ void zvol_insert(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_insert_head(&zvol_state_list, zv); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); } /* * Simply remove the zvol from to list of zvols. */ static void zvol_remove(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_remove(&zvol_state_list, zv); hlist_del(&zv->zv_hlink); } /* * Setup zv after we just own the zv->objset */ static int zvol_setup_zv(zvol_state_t *zv) { uint64_t volsize; int error; uint64_t ro; objset_t *os = zv->zv_objset; ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock)); zv->zv_zilog = NULL; zv->zv_flags &= ~ZVOL_WRITTEN_TO; error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) return (SET_ERROR(error)); error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) return (SET_ERROR(error)); error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); if (error) return (SET_ERROR(error)); zvol_os_set_capacity(zv, volsize >> 9); zv->zv_volsize = volsize; if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { zvol_os_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { zvol_os_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } return (0); } /* * Shutdown every zv_objset related stuff except zv_objset itself. * The is the reverse of zvol_setup_zv. */ static void zvol_shutdown_zv(zvol_state_t *zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock) && RW_LOCK_HELD(&zv->zv_suspend_lock)); if (zv->zv_flags & ZVOL_WRITTEN_TO) { ASSERT(zv->zv_zilog != NULL); zil_close(zv->zv_zilog); } zv->zv_zilog = NULL; dnode_rele(zv->zv_dn, zv); zv->zv_dn = NULL; /* * Evict cached data. We must write out any dirty data before * disowning the dataset. */ if (zv->zv_flags & ZVOL_WRITTEN_TO) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); (void) dmu_objset_evict_dbufs(zv->zv_objset); } /* * return the proper tag for rollback and recv */ void * zvol_tag(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); return (zv->zv_open_count > 0 ? zv : NULL); } /* * Suspend the zvol for recv and rollback. */ zvol_state_t * zvol_suspend(const char *name) { zvol_state_t *zv; zv = zvol_find_by_name(name, RW_WRITER); if (zv == NULL) return (NULL); /* block all I/O, release in zvol_resume. */ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) zvol_shutdown_zv(zv); /* * do not hold zv_state_lock across suspend/resume to * avoid locking up zvol lookups */ mutex_exit(&zv->zv_state_lock); /* zv_suspend_lock is released in zvol_resume() */ return (zv); } int zvol_resume(zvol_state_t *zv) { int error = 0; ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); mutex_enter(&zv->zv_state_lock); if (zv->zv_open_count > 0) { VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset)); VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv); VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset)); dmu_objset_rele(zv->zv_objset, zv); error = zvol_setup_zv(zv); } mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); /* * We need this because we don't hold zvol_state_lock while releasing * zv_suspend_lock. zvol_remove_minors_impl thus cannot check * zv_suspend_lock to determine it is safe to free because rwlock is * not inherent atomic. */ atomic_dec(&zv->zv_suspend_ref); return (SET_ERROR(error)); } int zvol_first_open(zvol_state_t *zv, boolean_t readonly) { objset_t *os; int error; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(mutex_owned(&spa_namespace_lock)); boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); if (error) return (SET_ERROR(error)); zv->zv_objset = os; error = zvol_setup_zv(zv); if (error) { dmu_objset_disown(os, 1, zv); zv->zv_objset = NULL; } return (error); } void zvol_last_close(zvol_state_t *zv) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); zvol_shutdown_zv(zv); dmu_objset_disown(zv->zv_objset, 1, zv); zv->zv_objset = NULL; } typedef struct minors_job { list_t *list; list_node_t link; /* input */ char *name; /* output */ int error; } minors_job_t; /* * Prefetch zvol dnodes for the minors_job */ static void zvol_prefetch_minors_impl(void *arg) { minors_job_t *job = arg; char *dsname = job->name; objset_t *os = NULL; job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_snap_minor_cb(const char *dsname, void *arg) { minors_job_t *j = arg; list_t *minors_list = j->list; const char *name = j->name; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); /* skip the designated dataset */ if (name && strcmp(dsname, name) == 0) return (0); /* at this point, the dsname should name a snapshot */ if (strchr(dsname, '@') == 0) { dprintf("zvol_create_snap_minor_cb(): " "%s is not a snapshot name\n", dsname); } else { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); } return (0); } /* * If spa_keystore_load_wkey() is called for an encrypted zvol, * we need to look for any clones also using the key. This function * is "best effort" - so we just skip over it if there are failures. */ static void zvol_add_clones(const char *dsname, list_t *minors_list) { /* Also check if it has clones */ dsl_dir_t *dd = NULL; dsl_pool_t *dp = NULL; if (dsl_pool_hold(dsname, FTAG, &dp) != 0) return; if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) goto out; if (dsl_dir_hold(dp, dsname, FTAG, &dd, NULL) != 0) goto out; if (dsl_dir_phys(dd)->dd_clones == 0) goto out; zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); objset_t *mos = dd->dd_pool->dp_meta_objset; for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dataset_t *clone; minors_job_t *job; if (dsl_dataset_hold_obj(dd->dd_pool, za->za_first_integer, FTAG, &clone) == 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(clone, name); char *n = kmem_strdup(name); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); dsl_dataset_rele(clone, FTAG); } } zap_cursor_fini(zc); kmem_free(za, sizeof (zap_attribute_t)); kmem_free(zc, sizeof (zap_cursor_t)); out: if (dd != NULL) dsl_dir_rele(dd, FTAG); dsl_pool_rele(dp, FTAG); } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_minors_cb(const char *dsname, void *arg) { uint64_t snapdev; int error; list_t *minors_list = arg; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); if (error) return (0); /* * Given the name and the 'snapdev' property, create device minor nodes * with the linkages to zvols/snapshots as needed. * If the name represents a zvol, create a minor node for the zvol, then * check if its snapshots are 'visible', and if so, iterate over the * snapshots and create device minor nodes for those. */ if (strchr(dsname, '@') == 0) { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); zvol_add_clones(dsname, minors_list); if (snapdev == ZFS_SNAPDEV_VISIBLE) { /* * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ error = dmu_objset_find(dsname, zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); } } else { dprintf("zvol_create_minors_cb(): %s is not a zvol name\n", dsname); } return (0); } /* * Create minors for the specified dataset, including children and snapshots. * Pay attention to the 'snapdev' property and iterate over the snapshots * only if they are 'visible'. This approach allows one to assure that the * snapshot metadata is read from disk only if it is needed. * * The name can represent a dataset to be recursively scanned for zvols and * their snapshots, or a single zvol snapshot. If the name represents a * dataset, the scan is performed in two nested stages: * - scan the dataset for zvols, and * - for each zvol, create a minor node, then check if the zvol's snapshots * are 'visible', and only then iterate over the snapshots if needed * * If the name represents a snapshot, a check is performed if the snapshot is * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. */ void zvol_create_minors_recursive(const char *name) { list_t minors_list; minors_job_t *job; if (zvol_inhibit_dev) return; /* * This is the list for prefetch jobs. Whenever we found a match * during dmu_objset_find, we insert a minors_job to the list and do * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need * any lock because all list operation is done on the current thread. * * We will use this list to do zvol_os_create_minor after prefetch * so we don't have to traverse using dmu_objset_find again. */ list_create(&minors_list, sizeof (minors_job_t), offsetof(minors_job_t, link)); if (strchr(name, '@') != NULL) { uint64_t snapdev; int error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) (void) zvol_os_create_minor(name); } else { fstrans_cookie_t cookie = spl_fstrans_mark(); (void) dmu_objset_find(name, zvol_create_minors_cb, &minors_list, DS_FIND_CHILDREN); spl_fstrans_unmark(cookie); } taskq_wait_outstanding(system_taskq, 0); /* * Prefetch is completed, we can do zvol_os_create_minor * sequentially. */ while ((job = list_head(&minors_list)) != NULL) { list_remove(&minors_list, job); if (!job->error) (void) zvol_os_create_minor(job->name); kmem_strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } list_destroy(&minors_list); } void zvol_create_minor(const char *name) { /* * Note: the dsl_pool_config_lock must not be held. * Minor node creation needs to obtain the zvol_state_lock. * zvol_open() obtains the zvol_state_lock and then the dsl pool * config lock. Therefore, we can't have the config lock now if * we are going to wait for the zvol_state_lock, because it * would be a lock order inversion which could lead to deadlock. */ if (zvol_inhibit_dev) return; if (strchr(name, '@') != NULL) { uint64_t snapdev; int error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) (void) zvol_os_create_minor(name); } else { (void) zvol_os_create_minor(name); } } /* * Remove minors for specified dataset including children and snapshots. */ static void zvol_free_task(void *arg) { zvol_os_free(arg); } void zvol_remove_minors_impl(const char *name) { zvol_state_t *zv, *zv_next; int namelen = ((name) ? strlen(name) : 0); taskqid_t t; list_t free_list; if (zvol_inhibit_dev) return; list_create(&free_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_enter(&zvol_state_lock, RW_WRITER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (name == NULL || strcmp(zv->zv_name, name) == 0 || (strncmp(zv->zv_name, name, namelen) == 0 && (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) { /* * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { mutex_exit(&zv->zv_state_lock); continue; } zvol_remove(zv); /* * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. */ zvol_os_clear_private(zv); /* Drop zv_state_lock before zvol_free() */ mutex_exit(&zv->zv_state_lock); /* Try parallel zv_free, if failed do it in place */ t = taskq_dispatch(system_taskq, zvol_free_task, zv, TQ_SLEEP); if (t == TASKQID_INVALID) list_insert_head(&free_list, zv); } else { mutex_exit(&zv->zv_state_lock); } } rw_exit(&zvol_state_lock); /* Drop zvol_state_lock before calling zvol_free() */ while ((zv = list_head(&free_list)) != NULL) { list_remove(&free_list, zv); zvol_os_free(zv); } } /* Remove minor for this specific volume only */ static void zvol_remove_minor_impl(const char *name) { zvol_state_t *zv = NULL, *zv_next; if (zvol_inhibit_dev) return; rw_enter(&zvol_state_lock, RW_WRITER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, name) == 0) { /* * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { mutex_exit(&zv->zv_state_lock); continue; } zvol_remove(zv); zvol_os_clear_private(zv); mutex_exit(&zv->zv_state_lock); break; } else { mutex_exit(&zv->zv_state_lock); } } /* Drop zvol_state_lock before calling zvol_free() */ rw_exit(&zvol_state_lock); if (zv != NULL) zvol_os_free(zv); } /* * Rename minors for specified dataset including children and snapshots. */ static void zvol_rename_minors_impl(const char *oldname, const char *newname) { zvol_state_t *zv, *zv_next; int oldnamelen; if (zvol_inhibit_dev) return; oldnamelen = strlen(oldname); rw_enter(&zvol_state_lock, RW_READER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, oldname) == 0) { zvol_os_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { char *name = kmem_asprintf("%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); zvol_os_rename_minor(zv, name); kmem_strfree(name); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); } typedef struct zvol_snapdev_cb_arg { uint64_t snapdev; } zvol_snapdev_cb_arg_t; static int zvol_set_snapdev_cb(const char *dsname, void *param) { zvol_snapdev_cb_arg_t *arg = param; if (strchr(dsname, '@') == NULL) return (0); switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: (void) zvol_os_create_minor(dsname); break; case ZFS_SNAPDEV_HIDDEN: (void) zvol_remove_minor_impl(dsname); break; } return (0); } static void zvol_set_snapdev_impl(char *name, uint64_t snapdev) { zvol_snapdev_cb_arg_t arg = {snapdev}; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * The zvol_set_snapdev_sync() sets snapdev appropriately * in the dataset hierarchy. Here, we only scan snapshots. */ dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS); spl_fstrans_unmark(cookie); } static void zvol_set_volmode_impl(char *name, uint64_t volmode) { fstrans_cookie_t cookie; uint64_t old_volmode; zvol_state_t *zv; if (strchr(name, '@') != NULL) return; /* * It's unfortunate we need to remove minors before we create new ones: * this is necessary because our backing gendisk (zvol_state->zv_disk) * could be different when we set, for instance, volmode from "geom" * to "dev" (or vice versa). */ zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL && volmode == ZFS_VOLMODE_NONE) return; if (zv != NULL) { old_volmode = zv->zv_volmode; mutex_exit(&zv->zv_state_lock); if (old_volmode == volmode) return; zvol_wait_close(zv); } cookie = spl_fstrans_mark(); switch (volmode) { case ZFS_VOLMODE_NONE: (void) zvol_remove_minor_impl(name); break; case ZFS_VOLMODE_GEOM: case ZFS_VOLMODE_DEV: (void) zvol_remove_minor_impl(name); (void) zvol_os_create_minor(name); break; case ZFS_VOLMODE_DEFAULT: (void) zvol_remove_minor_impl(name); if (zvol_volmode == ZFS_VOLMODE_NONE) break; else /* if zvol_volmode is invalid defaults to "geom" */ (void) zvol_os_create_minor(name); break; } spl_fstrans_unmark(cookie); } static zvol_task_t * zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, uint64_t value) { zvol_task_t *task; /* Never allow tasks on hidden names. */ if (name1[0] == '$') return (NULL); task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->op = op; task->value = value; strlcpy(task->name1, name1, MAXNAMELEN); if (name2 != NULL) strlcpy(task->name2, name2, MAXNAMELEN); return (task); } static void zvol_task_free(zvol_task_t *task) { kmem_free(task, sizeof (zvol_task_t)); } /* * The worker thread function performed asynchronously. */ static void zvol_task_cb(void *arg) { zvol_task_t *task = arg; switch (task->op) { case ZVOL_ASYNC_REMOVE_MINORS: zvol_remove_minors_impl(task->name1); break; case ZVOL_ASYNC_RENAME_MINORS: zvol_rename_minors_impl(task->name1, task->name2); break; case ZVOL_ASYNC_SET_SNAPDEV: zvol_set_snapdev_impl(task->name1, task->value); break; case ZVOL_ASYNC_SET_VOLMODE: zvol_set_volmode_impl(task->name1, task->value); break; default: VERIFY(0); break; } zvol_task_free(task); } typedef struct zvol_set_prop_int_arg { const char *zsda_name; uint64_t zsda_value; zprop_source_t zsda_source; dmu_tx_t *zsda_tx; } zvol_set_prop_int_arg_t; /* * Sanity check the dataset for safe use by the sync task. No additional * conditions are imposed. */ static int zvol_set_snapdev_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; int error; error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); if (error != 0) return (error); dsl_dir_rele(dd, FTAG); return (error); } static int zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { (void) arg; char dsname[MAXNAMELEN]; zvol_task_t *task; uint64_t snapdev; dsl_dataset_name(ds, dsname); if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0) return (0); task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev); if (task == NULL) return (0); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); } /* * Traverse all child datasets and apply snapdev appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel * dataset and read the effective "snapdev" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. */ static void zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; dsl_dataset_t *ds; int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); zsda->zsda_tx = tx; error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV), zsda->zsda_source, sizeof (zsda->zsda_value), 1, &zsda->zsda_value, zsda->zsda_tx); dsl_dataset_rele(ds, FTAG); } dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; zsda.zsda_value = snapdev; return (dsl_sync_task(ddname, zvol_set_snapdev_check, zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } /* * Sanity check the dataset for safe use by the sync task. No additional * conditions are imposed. */ static int zvol_set_volmode_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; int error; error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); if (error != 0) return (error); dsl_dir_rele(dd, FTAG); return (error); } static int zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { (void) arg; char dsname[MAXNAMELEN]; zvol_task_t *task; uint64_t volmode; dsl_dataset_name(ds, dsname); if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0) return (0); task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode); if (task == NULL) return (0); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); } /* * Traverse all child datasets and apply volmode appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel * dataset and read the effective "volmode" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. */ static void zvol_set_volmode_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; dsl_dataset_t *ds; int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); zsda->zsda_tx = tx; error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE), zsda->zsda_source, sizeof (zsda->zsda_value), 1, &zsda->zsda_value, zsda->zsda_tx); dsl_dataset_rele(ds, FTAG); } dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; zsda.zsda_value = volmode; return (dsl_sync_task(ddname, zvol_set_volmode_check, zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } void zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } boolean_t zvol_is_zvol(const char *name) { return (zvol_os_is_zvol(name)); } int zvol_init_impl(void) { int i; list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL); zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), KM_SLEEP); for (i = 0; i < ZVOL_HT_SIZE; i++) INIT_HLIST_HEAD(&zvol_htable[i]); return (0); } void zvol_fini_impl(void) { zvol_remove_minors_impl(NULL); /* * The call to "zvol_remove_minors_impl" may dispatch entries to * the system_taskq, but it doesn't wait for those entries to * complete before it returns. Thus, we must wait for all of the * removals to finish, before we can continue. */ taskq_wait_outstanding(system_taskq, 0); kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); rw_destroy(&zvol_state_lock); } diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c index 1bb95e460a81..ed0271a8d683 100644 --- a/module/zstd/zfs_zstd.c +++ b/module/zstd/zfs_zstd.c @@ -1,904 +1,904 @@ /* * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 2016-2018, Klara Inc. * Copyright (c) 2016-2018, Allan Jude * Copyright (c) 2018-2020, Sebastian Gottschall * Copyright (c) 2019-2020, Michael Niewöhner * Copyright (c) 2020, The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ #include #include #include #include #include #include #define ZSTD_STATIC_LINKING_ONLY #include "lib/zstd.h" #include "lib/common/zstd_errors.h" static uint_t zstd_earlyabort_pass = 1; static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3; static unsigned int zstd_abort_size = (128 * 1024); static kstat_t *zstd_ksp = NULL; typedef struct zstd_stats { kstat_named_t zstd_stat_alloc_fail; kstat_named_t zstd_stat_alloc_fallback; kstat_named_t zstd_stat_com_alloc_fail; kstat_named_t zstd_stat_dec_alloc_fail; kstat_named_t zstd_stat_com_inval; kstat_named_t zstd_stat_dec_inval; kstat_named_t zstd_stat_dec_header_inval; kstat_named_t zstd_stat_com_fail; kstat_named_t zstd_stat_dec_fail; /* * LZ4 first-pass early abort verdict */ kstat_named_t zstd_stat_lz4pass_allowed; kstat_named_t zstd_stat_lz4pass_rejected; /* * zstd-1 second-pass early abort verdict */ kstat_named_t zstd_stat_zstdpass_allowed; kstat_named_t zstd_stat_zstdpass_rejected; /* * We excluded this from early abort for some reason */ kstat_named_t zstd_stat_passignored; kstat_named_t zstd_stat_passignored_size; kstat_named_t zstd_stat_buffers; kstat_named_t zstd_stat_size; } zstd_stats_t; static zstd_stats_t zstd_stats = { { "alloc_fail", KSTAT_DATA_UINT64 }, { "alloc_fallback", KSTAT_DATA_UINT64 }, { "compress_alloc_fail", KSTAT_DATA_UINT64 }, { "decompress_alloc_fail", KSTAT_DATA_UINT64 }, { "compress_level_invalid", KSTAT_DATA_UINT64 }, { "decompress_level_invalid", KSTAT_DATA_UINT64 }, { "decompress_header_invalid", KSTAT_DATA_UINT64 }, { "compress_failed", KSTAT_DATA_UINT64 }, { "decompress_failed", KSTAT_DATA_UINT64 }, { "lz4pass_allowed", KSTAT_DATA_UINT64 }, { "lz4pass_rejected", KSTAT_DATA_UINT64 }, { "zstdpass_allowed", KSTAT_DATA_UINT64 }, { "zstdpass_rejected", KSTAT_DATA_UINT64 }, { "passignored", KSTAT_DATA_UINT64 }, { "passignored_size", KSTAT_DATA_UINT64 }, { "buffers", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, }; #ifdef _KERNEL static int kstat_zstd_update(kstat_t *ksp, int rw) { ASSERT(ksp != NULL); if (rw == KSTAT_WRITE && ksp == zstd_ksp) { ZSTDSTAT_ZERO(zstd_stat_alloc_fail); ZSTDSTAT_ZERO(zstd_stat_alloc_fallback); ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail); ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail); ZSTDSTAT_ZERO(zstd_stat_com_inval); ZSTDSTAT_ZERO(zstd_stat_dec_inval); ZSTDSTAT_ZERO(zstd_stat_dec_header_inval); ZSTDSTAT_ZERO(zstd_stat_com_fail); ZSTDSTAT_ZERO(zstd_stat_dec_fail); ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed); ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected); ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed); ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected); ZSTDSTAT_ZERO(zstd_stat_passignored); ZSTDSTAT_ZERO(zstd_stat_passignored_size); } return (0); } #endif /* Enums describing the allocator type specified by kmem_type in zstd_kmem */ enum zstd_kmem_type { ZSTD_KMEM_UNKNOWN = 0, /* Allocation type using kmem_vmalloc */ ZSTD_KMEM_DEFAULT, /* Pool based allocation using mempool_alloc */ ZSTD_KMEM_POOL, /* Reserved fallback memory for decompression only */ ZSTD_KMEM_DCTX, ZSTD_KMEM_COUNT, }; /* Structure for pooled memory objects */ struct zstd_pool { void *mem; size_t size; kmutex_t barrier; hrtime_t timeout; }; /* Global structure for handling memory allocations */ struct zstd_kmem { enum zstd_kmem_type kmem_type; size_t kmem_size; struct zstd_pool *pool; }; /* Fallback memory structure used for decompression only if memory runs out */ struct zstd_fallback_mem { size_t mem_size; void *mem; kmutex_t barrier; }; struct zstd_levelmap { int16_t zstd_level; enum zio_zstd_levels level; }; /* * ZSTD memory handlers * * For decompression we use a different handler which also provides fallback * memory allocation in case memory runs out. * * The ZSTD handlers were split up for the most simplified implementation. */ static void *zstd_alloc(void *opaque, size_t size); static void *zstd_dctx_alloc(void *opaque, size_t size); static void zstd_free(void *opaque, void *ptr); /* Compression memory handler */ static const ZSTD_customMem zstd_malloc = { zstd_alloc, zstd_free, NULL, }; /* Decompression memory handler */ static const ZSTD_customMem zstd_dctx_malloc = { zstd_dctx_alloc, zstd_free, NULL, }; /* Level map for converting ZFS internal levels to ZSTD levels and vice versa */ static struct zstd_levelmap zstd_levels[] = { {ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1}, {ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2}, {ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3}, {ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4}, {ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5}, {ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6}, {ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7}, {ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8}, {ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9}, {ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10}, {ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11}, {ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12}, {ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13}, {ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14}, {ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15}, {ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16}, {ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17}, {ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18}, {ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19}, {-1, ZIO_ZSTD_LEVEL_FAST_1}, {-2, ZIO_ZSTD_LEVEL_FAST_2}, {-3, ZIO_ZSTD_LEVEL_FAST_3}, {-4, ZIO_ZSTD_LEVEL_FAST_4}, {-5, ZIO_ZSTD_LEVEL_FAST_5}, {-6, ZIO_ZSTD_LEVEL_FAST_6}, {-7, ZIO_ZSTD_LEVEL_FAST_7}, {-8, ZIO_ZSTD_LEVEL_FAST_8}, {-9, ZIO_ZSTD_LEVEL_FAST_9}, {-10, ZIO_ZSTD_LEVEL_FAST_10}, {-20, ZIO_ZSTD_LEVEL_FAST_20}, {-30, ZIO_ZSTD_LEVEL_FAST_30}, {-40, ZIO_ZSTD_LEVEL_FAST_40}, {-50, ZIO_ZSTD_LEVEL_FAST_50}, {-60, ZIO_ZSTD_LEVEL_FAST_60}, {-70, ZIO_ZSTD_LEVEL_FAST_70}, {-80, ZIO_ZSTD_LEVEL_FAST_80}, {-90, ZIO_ZSTD_LEVEL_FAST_90}, {-100, ZIO_ZSTD_LEVEL_FAST_100}, {-500, ZIO_ZSTD_LEVEL_FAST_500}, {-1000, ZIO_ZSTD_LEVEL_FAST_1000}, }; /* * This variable represents the maximum count of the pool based on the number * of CPUs plus some buffer. We default to cpu count * 4, see init_zstd. */ static int pool_count = 16; #define ZSTD_POOL_MAX pool_count #define ZSTD_POOL_TIMEOUT 60 * 2 static struct zstd_fallback_mem zstd_dctx_fallback; static struct zstd_pool *zstd_mempool_cctx; static struct zstd_pool *zstd_mempool_dctx; /* * The library zstd code expects these if ADDRESS_SANITIZER gets defined, * and while ASAN does this, KASAN defines that and does not. So to avoid * changing the external code, we do this. */ #if defined(ZFS_ASAN_ENABLED) #define ADDRESS_SANITIZER 1 #endif #if defined(_KERNEL) && defined(ADDRESS_SANITIZER) void __asan_unpoison_memory_region(void const volatile *addr, size_t size); void __asan_poison_memory_region(void const volatile *addr, size_t size); void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {}; void __asan_poison_memory_region(void const volatile *addr, size_t size) {}; #endif static void zstd_mempool_reap(struct zstd_pool *zstd_mempool) { struct zstd_pool *pool; if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) { return; } /* free obsolete slots */ for (int i = 0; i < ZSTD_POOL_MAX; i++) { pool = &zstd_mempool[i]; if (pool->mem && mutex_tryenter(&pool->barrier)) { /* Free memory if unused object older than 2 minutes */ if (pool->mem && gethrestime_sec() > pool->timeout) { vmem_free(pool->mem, pool->size); ZSTDSTAT_SUB(zstd_stat_buffers, 1); ZSTDSTAT_SUB(zstd_stat_size, pool->size); pool->mem = NULL; pool->size = 0; pool->timeout = 0; } mutex_exit(&pool->barrier); } } } /* * Try to get a cached allocated buffer from memory pool or allocate a new one * if necessary. If a object is older than 2 minutes and does not fit the * requested size, it will be released and a new cached entry will be allocated. * If other pooled objects are detected without being used for 2 minutes, they * will be released, too. * * The concept is that high frequency memory allocations of bigger objects are * expensive. So if a lot of work is going on, allocations will be kept for a * while and can be reused in that time frame. * * The scheduled release will be updated every time a object is reused. */ static void * zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size) { struct zstd_pool *pool; struct zstd_kmem *mem = NULL; if (!zstd_mempool) { return (NULL); } /* Seek for preallocated memory slot and free obsolete slots */ for (int i = 0; i < ZSTD_POOL_MAX; i++) { pool = &zstd_mempool[i]; /* * This lock is simply a marker for a pool object being in use. * If it's already hold, it will be skipped. * * We need to create it before checking it to avoid race * conditions caused by running in a threaded context. * * The lock is later released by zstd_mempool_free. */ if (mutex_tryenter(&pool->barrier)) { /* * Check if objects fits the size, if so we take it and * update the timestamp. */ if (pool->mem && size <= pool->size) { pool->timeout = gethrestime_sec() + ZSTD_POOL_TIMEOUT; mem = pool->mem; return (mem); } mutex_exit(&pool->barrier); } } /* * If no preallocated slot was found, try to fill in a new one. * * We run a similar algorithm twice here to avoid pool fragmentation. * The first one may generate holes in the list if objects get released. * We always make sure that these holes get filled instead of adding new * allocations constantly at the end. */ for (int i = 0; i < ZSTD_POOL_MAX; i++) { pool = &zstd_mempool[i]; if (mutex_tryenter(&pool->barrier)) { /* Object is free, try to allocate new one */ if (!pool->mem) { mem = vmem_alloc(size, KM_SLEEP); if (mem) { ZSTDSTAT_ADD(zstd_stat_buffers, 1); ZSTDSTAT_ADD(zstd_stat_size, size); pool->mem = mem; pool->size = size; /* Keep track for later release */ mem->pool = pool; mem->kmem_type = ZSTD_KMEM_POOL; mem->kmem_size = size; } } if (size <= pool->size) { /* Update timestamp */ pool->timeout = gethrestime_sec() + ZSTD_POOL_TIMEOUT; return (pool->mem); } mutex_exit(&pool->barrier); } } /* * If the pool is full or the allocation failed, try lazy allocation * instead. */ if (!mem) { mem = vmem_alloc(size, KM_NOSLEEP); if (mem) { mem->pool = NULL; mem->kmem_type = ZSTD_KMEM_DEFAULT; mem->kmem_size = size; } } return (mem); } /* Mark object as released by releasing the barrier mutex */ static void zstd_mempool_free(struct zstd_kmem *z) { mutex_exit(&z->pool->barrier); } /* Convert ZFS internal enum to ZSTD level */ static int zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level) { if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) { *zstd_level = zstd_levels[level - 1].zstd_level; return (0); } if (level >= ZIO_ZSTD_LEVEL_FAST_1 && level <= ZIO_ZSTD_LEVEL_FAST_1000) { *zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1 + ZIO_ZSTD_LEVEL_19].zstd_level; return (0); } /* Invalid/unknown zfs compression enum - this should never happen. */ return (1); } size_t zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len, int level) { int16_t zstd_level; if (zstd_enum_to_level(level, &zstd_level)) { ZSTDSTAT_BUMP(zstd_stat_com_inval); return (s_len); } /* * A zstd early abort heuristic. * * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently * 128k), don't try any of this, just go. * (because experimentally that was a reasonable cutoff for a perf win * with tiny ratio change) * - First, we try LZ4 compression, and if it doesn't early abort, we * jump directly to whatever compression level we intended to try. * - Second, we try zstd-1 - if that errors out (usually, but not * exclusively, if it would overflow), we give up early. * * If it works, instead we go on and compress anyway. * * Why two passes? LZ4 alone gets you a lot of the way, but on highly * compressible data, it was losing up to 8.5% of the compressed * savings versus no early abort, and all the zstd-fast levels are * worse indications on their own than LZ4, and don't improve the LZ4 * pass noticably if stacked like this. */ size_t actual_abort_size = zstd_abort_size; if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level && s_len >= actual_abort_size) { int pass_len = 1; pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0); if (pass_len < d_len) { ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed); goto keep_trying; } ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected); pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len, ZIO_ZSTD_LEVEL_1); if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) { ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected); return (s_len); } ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed); } else { ZSTDSTAT_BUMP(zstd_stat_passignored); if (s_len < actual_abort_size) { ZSTDSTAT_BUMP(zstd_stat_passignored_size); } } keep_trying: return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level)); } /* Compress block using zstd */ size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int level) { size_t c_len; int16_t zstd_level; zfs_zstdhdr_t *hdr; ZSTD_CCtx *cctx; hdr = (zfs_zstdhdr_t *)d_start; /* Skip compression if the specified level is invalid */ if (zstd_enum_to_level(level, &zstd_level)) { ZSTDSTAT_BUMP(zstd_stat_com_inval); return (s_len); } ASSERT3U(d_len, >=, sizeof (*hdr)); ASSERT3U(d_len, <=, s_len); ASSERT3U(zstd_level, !=, 0); cctx = ZSTD_createCCtx_advanced(zstd_malloc); /* * Out of kernel memory, gently fall through - this will disable * compression in zio_compress_data */ if (!cctx) { ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail); return (s_len); } /* Set the compression level */ ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level); /* Use the "magicless" zstd header which saves us 4 header bytes */ ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless); /* * Disable redundant checksum calculation and content size storage since * this is already done by ZFS itself. */ ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0); ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0); c_len = ZSTD_compress2(cctx, hdr->data, d_len - sizeof (*hdr), s_start, s_len); ZSTD_freeCCtx(cctx); /* Error in the compression routine, disable compression. */ if (ZSTD_isError(c_len)) { /* * If we are aborting the compression because the saves are * too small, that is not a failure. Everything else is a * failure, so increment the compression failure counter. */ int err = ZSTD_getErrorCode(c_len); if (err != ZSTD_error_dstSize_tooSmall) { ZSTDSTAT_BUMP(zstd_stat_com_fail); dprintf("Error: %s", ZSTD_getErrorString(err)); } return (s_len); } /* * Encode the compressed buffer size at the start. We'll need this in * decompression to counter the effects of padding which might be added * to the compressed buffer and which, if unhandled, would confuse the * hell out of our decompression function. */ hdr->c_len = BE_32(c_len); /* * Check version for overflow. * The limit of 24 bits must not be exceeded. This allows a maximum * version 1677.72.15 which we don't expect to be ever reached. */ ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF); /* * Encode the compression level as well. We may need to know the * original compression level if compressed_arc is disabled, to match * the compression settings to write this block to the L2ARC. * * Encode the actual level, so if the enum changes in the future, we * will be compatible. * * The upper 24 bits store the ZSTD version to be able to provide * future compatibility, since new versions might enhance the * compression algorithm in a way, where the compressed data will * change. * * As soon as such incompatibility occurs, handling code needs to be * added, differentiating between the versions. */ zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER); zfs_set_hdrlevel(hdr, level); hdr->raw_version_level = BE_32(hdr->raw_version_level); return (c_len + sizeof (*hdr)); } /* Decompress block using zstd and return its stored level */ int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, size_t d_len, uint8_t *level) { ZSTD_DCtx *dctx; size_t result; int16_t zstd_level; uint32_t c_len; const zfs_zstdhdr_t *hdr; zfs_zstdhdr_t hdr_copy; hdr = (const zfs_zstdhdr_t *)s_start; c_len = BE_32(hdr->c_len); /* * Make a copy instead of directly converting the header, since we must * not modify the original data that may be used again later. */ hdr_copy.raw_version_level = BE_32(hdr->raw_version_level); uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy); /* * NOTE: We ignore the ZSTD version for now. As soon as any * incompatibility occurs, it has to be handled accordingly. * The version can be accessed via `hdr_copy.version`. */ /* * Convert and check the level * An invalid level is a strong indicator for data corruption! In such * case return an error so the upper layers can try to fix it. */ if (zstd_enum_to_level(curlevel, &zstd_level)) { ZSTDSTAT_BUMP(zstd_stat_dec_inval); return (1); } ASSERT3U(d_len, >=, s_len); ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT); /* Invalid compressed buffer size encoded at start */ if (c_len + sizeof (*hdr) > s_len) { ZSTDSTAT_BUMP(zstd_stat_dec_header_inval); return (1); } dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc); if (!dctx) { ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail); return (1); } /* Set header type to "magicless" */ ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless); /* Decompress the data and release the context */ result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len); ZSTD_freeDCtx(dctx); /* * Returns 0 on success (decompression function returned non-negative) * and non-zero on failure (decompression function returned negative. */ if (ZSTD_isError(result)) { ZSTDSTAT_BUMP(zstd_stat_dec_fail); return (1); } if (level) { *level = curlevel; } return (0); } /* Decompress datablock using zstd */ int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int level __maybe_unused) { return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len, NULL)); } /* Allocator for zstd compression context using mempool_allocator */ static void * zstd_alloc(void *opaque __maybe_unused, size_t size) { size_t nbytes = sizeof (struct zstd_kmem) + size; struct zstd_kmem *z = NULL; z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes); if (!z) { ZSTDSTAT_BUMP(zstd_stat_alloc_fail); return (NULL); } return ((void*)z + (sizeof (struct zstd_kmem))); } /* * Allocator for zstd decompression context using mempool_allocator with * fallback to reserved memory if allocation fails */ static void * zstd_dctx_alloc(void *opaque __maybe_unused, size_t size) { size_t nbytes = sizeof (struct zstd_kmem) + size; struct zstd_kmem *z = NULL; enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT; z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes); if (!z) { /* Try harder, decompression shall not fail */ z = vmem_alloc(nbytes, KM_SLEEP); if (z) { z->pool = NULL; } ZSTDSTAT_BUMP(zstd_stat_alloc_fail); } else { return ((void*)z + (sizeof (struct zstd_kmem))); } /* Fallback if everything fails */ if (!z) { /* * Barrier since we only can handle it in a single thread. All * other following threads need to wait here until decompression * is completed. zstd_free will release this barrier later. */ mutex_enter(&zstd_dctx_fallback.barrier); z = zstd_dctx_fallback.mem; type = ZSTD_KMEM_DCTX; ZSTDSTAT_BUMP(zstd_stat_alloc_fallback); } /* Allocation should always be successful */ if (!z) { return (NULL); } z->kmem_type = type; z->kmem_size = nbytes; return ((void*)z + (sizeof (struct zstd_kmem))); } /* Free allocated memory by its specific type */ static void zstd_free(void *opaque __maybe_unused, void *ptr) { struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem)); enum zstd_kmem_type type; ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT); ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN); type = z->kmem_type; switch (type) { case ZSTD_KMEM_DEFAULT: vmem_free(z, z->kmem_size); break; case ZSTD_KMEM_POOL: zstd_mempool_free(z); break; case ZSTD_KMEM_DCTX: mutex_exit(&zstd_dctx_fallback.barrier); break; default: break; } } /* Allocate fallback memory to ensure safe decompression */ static void __init create_fallback_mem(struct zstd_fallback_mem *mem, size_t size) { mem->mem_size = size; mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP); mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL); } /* Initialize memory pool barrier mutexes */ static void __init zstd_mempool_init(void) { - zstd_mempool_cctx = (struct zstd_pool *) + zstd_mempool_cctx = kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); - zstd_mempool_dctx = (struct zstd_pool *) + zstd_mempool_dctx = kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); for (int i = 0; i < ZSTD_POOL_MAX; i++) { mutex_init(&zstd_mempool_cctx[i].barrier, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zstd_mempool_dctx[i].barrier, NULL, MUTEX_DEFAULT, NULL); } } /* Initialize zstd-related memory handling */ static int __init zstd_meminit(void) { zstd_mempool_init(); /* * Estimate the size of the fallback decompression context. * The expected size on x64 with current ZSTD should be about 160 KB. */ create_fallback_mem(&zstd_dctx_fallback, P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem), PAGESIZE)); return (0); } /* Release object from pool and free memory */ static void release_pool(struct zstd_pool *pool) { mutex_destroy(&pool->barrier); vmem_free(pool->mem, pool->size); pool->mem = NULL; pool->size = 0; } /* Release memory pool objects */ static void zstd_mempool_deinit(void) { for (int i = 0; i < ZSTD_POOL_MAX; i++) { release_pool(&zstd_mempool_cctx[i]); release_pool(&zstd_mempool_dctx[i]); } kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); zstd_mempool_dctx = NULL; zstd_mempool_cctx = NULL; } /* release unused memory from pool */ void zfs_zstd_cache_reap_now(void) { /* * calling alloc with zero size seeks * and releases old unused objects */ zstd_mempool_reap(zstd_mempool_cctx); zstd_mempool_reap(zstd_mempool_dctx); } extern int __init zstd_init(void) { /* Set pool size by using maximum sane thread count * 4 */ pool_count = (boot_ncpus * 4); zstd_meminit(); /* Initialize kstat */ zstd_ksp = kstat_create("zfs", 0, "zstd", "misc", KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zstd_ksp != NULL) { zstd_ksp->ks_data = &zstd_stats; kstat_install(zstd_ksp); #ifdef _KERNEL zstd_ksp->ks_update = kstat_zstd_update; #endif } return (0); } extern void zstd_fini(void) { /* Deinitialize kstat */ if (zstd_ksp != NULL) { kstat_delete(zstd_ksp); zstd_ksp = NULL; } /* Release fallback memory */ vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size); mutex_destroy(&zstd_dctx_fallback.barrier); /* Deinit memory pool */ zstd_mempool_deinit(); } #if defined(_KERNEL) #ifdef __FreeBSD__ module_init(zstd_init); module_exit(zstd_fini); #endif ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW, "Enable early abort attempts when using zstd"); ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW, "Minimal size of block to attempt early abort"); #endif