diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h index db5b539648a6..e913a0bd7fda 100644 --- a/include/zfs_fletcher.h +++ b/include/zfs_fletcher.h @@ -1,179 +1,180 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. */ #ifndef _ZFS_FLETCHER_H #define _ZFS_FLETCHER_H extern __attribute__((visibility("default"))) #include #include #ifdef __cplusplus extern "C" { #endif /* * fletcher checksum functions * * Note: Fletcher checksum methods expect buffer size to be 4B aligned. This * limitation stems from the algorithm design. Performing incremental checksum * without said alignment would yield different results. Therefore, the code * includes assertions for the size alignment. * For compatibility, it is required that some code paths calculate checksum of * non-aligned buffer sizes. For this purpose, `fletcher_4_native_varsize()` * checksum method is added. This method will ignore last (size % 4) bytes of * the data buffer. 
*/ _ZFS_FLETCHER_H void fletcher_init(zio_cksum_t *); _ZFS_FLETCHER_H void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *); _ZFS_FLETCHER_H void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); _ZFS_FLETCHER_H void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *); _ZFS_FLETCHER_H int fletcher_2_incremental_native(void *, size_t, void *); _ZFS_FLETCHER_H int fletcher_2_incremental_byteswap(void *, size_t, void *); _ZFS_FLETCHER_H void fletcher_4_native_varsize(const void *, uint64_t, zio_cksum_t *); _ZFS_FLETCHER_H void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); _ZFS_FLETCHER_H int fletcher_4_incremental_native(void *, size_t, void *); _ZFS_FLETCHER_H int fletcher_4_incremental_byteswap(void *, size_t, void *); _ZFS_FLETCHER_H int fletcher_4_impl_set(const char *selector); _ZFS_FLETCHER_H void fletcher_4_init(void); _ZFS_FLETCHER_H void fletcher_4_fini(void); /* Internal fletcher ctx */ typedef struct zfs_fletcher_superscalar { uint64_t v[4]; } zfs_fletcher_superscalar_t; typedef struct zfs_fletcher_sse { uint64_t v[2] __attribute__((aligned(16))); } zfs_fletcher_sse_t; typedef struct zfs_fletcher_avx { uint64_t v[4] __attribute__((aligned(32))); } zfs_fletcher_avx_t; typedef struct zfs_fletcher_avx512 { uint64_t v[8] __attribute__((aligned(64))); } zfs_fletcher_avx512_t; typedef struct zfs_fletcher_aarch64_neon { uint64_t v[2] __attribute__((aligned(16))); } zfs_fletcher_aarch64_neon_t; typedef union fletcher_4_ctx { zio_cksum_t scalar; zfs_fletcher_superscalar_t superscalar[4]; #if defined(HAVE_SSE2) || (defined(HAVE_SSE2) && defined(HAVE_SSSE3)) zfs_fletcher_sse_t sse[4]; #endif #if defined(HAVE_AVX) && defined(HAVE_AVX2) zfs_fletcher_avx_t avx[4]; #endif #if defined(__x86_64) && defined(HAVE_AVX512F) zfs_fletcher_avx512_t avx512[4]; #endif #if defined(__aarch64__) zfs_fletcher_aarch64_neon_t aarch64_neon[4]; #endif } fletcher_4_ctx_t; /* * fletcher checksum 
struct */ typedef void (*fletcher_4_init_f)(fletcher_4_ctx_t *); typedef void (*fletcher_4_fini_f)(fletcher_4_ctx_t *, zio_cksum_t *); typedef void (*fletcher_4_compute_f)(fletcher_4_ctx_t *, const void *, uint64_t); typedef struct fletcher_4_func { fletcher_4_init_f init_native; fletcher_4_fini_f fini_native; fletcher_4_compute_f compute_native; fletcher_4_init_f init_byteswap; fletcher_4_fini_f fini_byteswap; fletcher_4_compute_f compute_byteswap; boolean_t (*valid)(void); + boolean_t uses_fpu; const char *name; -} fletcher_4_ops_t; +} __attribute__((aligned(64))) fletcher_4_ops_t; _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar_ops; _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar4_ops; #if defined(HAVE_SSE2) _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_sse2_ops; #endif #if defined(HAVE_SSE2) && defined(HAVE_SSSE3) _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_ssse3_ops; #endif #if defined(HAVE_AVX) && defined(HAVE_AVX2) _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_avx2_ops; #endif #if defined(__x86_64) && defined(HAVE_AVX512F) _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_avx512f_ops; #endif #if defined(__x86_64) && defined(HAVE_AVX512BW) _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_avx512bw_ops; #endif #if defined(__aarch64__) _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_aarch64_neon_ops; #endif #ifdef __cplusplus } #endif #if defined(ZFS_UBSAN_ENABLED) #if defined(__has_attribute) #if __has_attribute(no_sanitize_undefined) #define ZFS_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize_undefined)) #elif __has_attribute(no_sanitize) #define ZFS_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize("undefined"))) #else #error "Compiler has to support attribute " "`no_sanitize_undefined` or `no_sanitize(\"undefined\")`" "when compiling with UBSan enabled" #endif /* __has_attribute(no_sanitize_undefined) */ #endif /* defined(__has_attribute) */ #else #define ZFS_NO_SANITIZE_UNDEFINED #endif /* 
defined(ZFS_UBSAN_ENABLED) */ #endif /* _ZFS_FLETCHER_H */ diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 918a9105b2d1..9d3c0379da18 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -1,9335 +1,9338 @@ - - - - - - - + + + + + + + - + + + + diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index fa9b8447e983..eae854f3d452 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -1,992 +1,1015 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (C) 2016 Gvozden Nešković. All rights reserved. */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. */ /* * Copyright (c) 2016 by Delphix. All rights reserved. */ /* * Fletcher Checksums * ------------------ * * ZFS's 2nd and 4th order Fletcher checksums are defined by the following * recurrence relations: * * a = a + f * i i-1 i-1 * * b = b + a * i i-1 i * * c = c + b (fletcher-4 only) * i i-1 i * * d = d + c (fletcher-4 only) * i i-1 i * * Where * a_0 = b_0 = c_0 = d_0 = 0 * and * f_0 .. f_(n-1) are the input data. 
 *
 * Using standard techniques, these translate into the following series:
 *
 *           __n_                            __n_
 *           \   |                           \   |
 *      a  =  >     f                   b  =  >     i * f
 *       n   /___|   n - i               n   /___|       n - i
 *           i = 1                           i = 1
 *
 *
 *           __n_                            __n_
 *           \   |  i*(i+1)                  \   |  i*(i+1)*(i+2)
 *      c  =  >     ------- f           d  =  >     ------------- f
 *       n   /___|     2     n - i       n   /___|        6        n - i
 *           i = 1                           i = 1
 *
 * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
 * Since the additions are done mod (2^64), errors in the high bits may not
 * be noticed. For this reason, fletcher-2 is deprecated.
 *
 * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
 * A conservative estimate of how big the buffer can get before we overflow
 * can be computed using f_i = 0xffffffff for all i:
 *
 * % bc
 *  f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
 * 2264
 *  quit
 * %
 *
 * So blocks of up to 2k will not overflow. Our largest block size is
 * 128k, which has 32k 4-byte words, so we can compute the largest possible
 * accumulators, then divide by 2^64 to figure the max amount of overflow:
 *
 * % bc
 *  a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
 *  a/2^64;b/2^64;c/2^64;d/2^64
 * 0
 * 0
 * 1365
 * 11186858
 *  quit
 * %
 *
 * So a and b cannot overflow. To make sure each bit of input has some
 * effect on the contents of c and d, we can look at what the factors of
 * the coefficients in the equations for c_n and d_n are. The number of 2s
 * in the factors determines the lowest set bit in the multiplier. Running
 * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
 * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow
 * the 64-bit accumulators, every bit of every f_i affects every accumulator,
 * even for 128k blocks.
 *
 * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
 * we could do our calculations mod (2^32 - 1) by adding in the carries
 * periodically, and store the number of carries in the top 32-bits.
* * -------------------- * Checksum Performance * -------------------- * * There are two interesting components to checksum performance: cached and * uncached performance. With cached data, fletcher-2 is about four times * faster than fletcher-4. With uncached data, the performance difference is * negligible, since the cost of a cache fill dominates the processing time. * Even though fletcher-4 is slower than fletcher-2, it is still a pretty * efficient pass over the data. * * In normal operation, the data which is being checksummed is in a buffer * which has been filled either by: * * 1. a compression step, which will be mostly cached, or * 2. a memcpy() or copyin(), which will be uncached * (because the copy is cache-bypassing). * * For both cached and uncached data, both fletcher checksums are much faster * than sha-256, and slower than 'off', which doesn't touch the data at all. */ #include #include #include #include #include #include #include #include #define FLETCHER_MIN_SIMD_SIZE 64 static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx); static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp); static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size); static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size); static boolean_t fletcher_4_scalar_valid(void); static const fletcher_4_ops_t fletcher_4_scalar_ops = { .init_native = fletcher_4_scalar_init, .fini_native = fletcher_4_scalar_fini, .compute_native = fletcher_4_scalar_native, .init_byteswap = fletcher_4_scalar_init, .fini_byteswap = fletcher_4_scalar_fini, .compute_byteswap = fletcher_4_scalar_byteswap, .valid = fletcher_4_scalar_valid, + .uses_fpu = B_FALSE, .name = "scalar" }; static fletcher_4_ops_t fletcher_4_fastest_impl = { .name = "fastest", .valid = fletcher_4_scalar_valid }; static const fletcher_4_ops_t *fletcher_4_impls[] = { &fletcher_4_scalar_ops, &fletcher_4_superscalar_ops, 
&fletcher_4_superscalar4_ops, #if defined(HAVE_SSE2) &fletcher_4_sse2_ops, #endif #if defined(HAVE_SSE2) && defined(HAVE_SSSE3) &fletcher_4_ssse3_ops, #endif #if defined(HAVE_AVX) && defined(HAVE_AVX2) &fletcher_4_avx2_ops, #endif #if defined(__x86_64) && defined(HAVE_AVX512F) &fletcher_4_avx512f_ops, #endif #if defined(__x86_64) && defined(HAVE_AVX512BW) &fletcher_4_avx512bw_ops, #endif #if defined(__aarch64__) && !defined(__FreeBSD__) &fletcher_4_aarch64_neon_ops, #endif }; /* Hold all supported implementations */ static uint32_t fletcher_4_supp_impls_cnt = 0; static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)]; /* Select fletcher4 implementation */ #define IMPL_FASTEST (UINT32_MAX) #define IMPL_CYCLE (UINT32_MAX - 1) #define IMPL_SCALAR (0) static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST; #define IMPL_READ(i) (*(volatile uint32_t *) &(i)) static struct fletcher_4_impl_selector { const char *fis_name; uint32_t fis_sel; } fletcher_4_impl_selectors[] = { { "cycle", IMPL_CYCLE }, { "fastest", IMPL_FASTEST }, { "scalar", IMPL_SCALAR } }; #if defined(_KERNEL) static kstat_t *fletcher_4_kstat; static struct fletcher_4_kstat { uint64_t native; uint64_t byteswap; } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1]; #endif /* Indicate that benchmark has been completed */ static boolean_t fletcher_4_initialized = B_FALSE; void fletcher_init(zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } int fletcher_2_incremental_native(void *buf, size_t size, void *data) { zio_cksum_t *zcp = data; const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; a0 = zcp->zc_word[0]; a1 = zcp->zc_word[1]; b0 = zcp->zc_word[2]; b1 = zcp->zc_word[3]; for (; ip < ipend; ip += 2) { a0 += ip[0]; a1 += ip[1]; b0 += a0; b1 += a1; } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); return (0); } void fletcher_2_native(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; 
fletcher_init(zcp); (void) fletcher_2_incremental_native((void *) buf, size, zcp); } int fletcher_2_incremental_byteswap(void *buf, size_t size, void *data) { zio_cksum_t *zcp = data; const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; a0 = zcp->zc_word[0]; a1 = zcp->zc_word[1]; b0 = zcp->zc_word[2]; b1 = zcp->zc_word[3]; for (; ip < ipend; ip += 2) { a0 += BSWAP_64(ip[0]); a1 += BSWAP_64(ip[1]); b0 += a0; b1 += a1; } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); return (0); } void fletcher_2_byteswap(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; fletcher_init(zcp); (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp); } ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx) { ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0); } ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) { memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t)); } ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; a = ctx->scalar.zc_word[0]; b = ctx->scalar.zc_word[1]; c = ctx->scalar.zc_word[2]; d = ctx->scalar.zc_word[3]; for (; ip < ipend; ip++) { a += ip[0]; b += a; c += b; d += c; } ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d); } ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; a = ctx->scalar.zc_word[0]; b = ctx->scalar.zc_word[1]; c = ctx->scalar.zc_word[2]; d = ctx->scalar.zc_word[3]; for (; ip < ipend; ip++) { a += BSWAP_32(ip[0]); b += a; c += b; d += c; } ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d); } static boolean_t fletcher_4_scalar_valid(void) { return 
(B_TRUE); } int fletcher_4_impl_set(const char *val) { int err = -EINVAL; uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); size_t i, val_len; val_len = strlen(val); while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */ val_len--; /* check mandatory implementations */ for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) { const char *name = fletcher_4_impl_selectors[i].fis_name; if (val_len == strlen(name) && strncmp(val, name, val_len) == 0) { impl = fletcher_4_impl_selectors[i].fis_sel; err = 0; break; } } if (err != 0 && fletcher_4_initialized) { /* check all supported implementations */ for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { const char *name = fletcher_4_supp_impls[i]->name; if (val_len == strlen(name) && strncmp(val, name, val_len) == 0) { impl = i; err = 0; break; } } } if (err == 0) { atomic_swap_32(&fletcher_4_impl_chosen, impl); membar_producer(); } return (err); } /* * Returns the Fletcher 4 operations for checksums. When a SIMD * implementation is not allowed in the current context, then fallback * to the fastest generic implementation. 
*/ static inline const fletcher_4_ops_t * fletcher_4_impl_get(void) { if (!kfpu_allowed()) return (&fletcher_4_superscalar4_ops); const fletcher_4_ops_t *ops = NULL; uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); switch (impl) { case IMPL_FASTEST: ASSERT(fletcher_4_initialized); ops = &fletcher_4_fastest_impl; break; case IMPL_CYCLE: /* Cycle through supported implementations */ ASSERT(fletcher_4_initialized); ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); static uint32_t cycle_count = 0; uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt; ops = fletcher_4_supp_impls[idx]; break; default: ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); ASSERT3U(impl, <, fletcher_4_supp_impls_cnt); ops = fletcher_4_supp_impls[impl]; break; } ASSERT3P(ops, !=, NULL); return (ops); } static inline void fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) { fletcher_4_ctx_t ctx; const fletcher_4_ops_t *ops = fletcher_4_impl_get(); + if (ops->uses_fpu == B_TRUE) { + kfpu_begin(); + } ops->init_native(&ctx); ops->compute_native(&ctx, buf, size); ops->fini_native(&ctx, zcp); + if (ops->uses_fpu == B_TRUE) { + kfpu_end(); + } } void fletcher_4_native(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); if (size == 0 || p2size == 0) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); if (size > 0) fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); } else { fletcher_4_native_impl(buf, p2size, zcp); if (p2size < size) fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, (char *)buf + p2size, size - p2size); } } void fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); } static inline void fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) { fletcher_4_ctx_t ctx; const 
fletcher_4_ops_t *ops = fletcher_4_impl_get(); + if (ops->uses_fpu == B_TRUE) { + kfpu_begin(); + } ops->init_byteswap(&ctx); ops->compute_byteswap(&ctx, buf, size); ops->fini_byteswap(&ctx, zcp); + if (ops->uses_fpu == B_TRUE) { + kfpu_end(); + } } void fletcher_4_byteswap(const void *buf, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); if (size == 0 || p2size == 0) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); if (size > 0) fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size); } else { fletcher_4_byteswap_impl(buf, p2size, zcp); if (p2size < size) fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, (char *)buf + p2size, size - p2size); } } /* Incremental Fletcher 4 */ #define ZFS_FLETCHER_4_INC_MAX_SIZE (8ULL << 20) static inline void fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size, const zio_cksum_t *nzcp) { const uint64_t c1 = size / sizeof (uint32_t); const uint64_t c2 = c1 * (c1 + 1) / 2; const uint64_t c3 = c2 * (c1 + 2) / 3; /* * Value of 'c3' overflows on buffer sizes close to 16MiB. For that * reason we split incremental fletcher4 computation of large buffers * to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size. 
*/ ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE); zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] + c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0]; zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] + c2 * zcp->zc_word[0]; zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0]; zcp->zc_word[0] += nzcp->zc_word[0]; } static inline void fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size, zio_cksum_t *zcp) { while (size > 0) { zio_cksum_t nzc; uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE); if (native) fletcher_4_native(buf, len, NULL, &nzc); else fletcher_4_byteswap(buf, len, NULL, &nzc); fletcher_4_incremental_combine(zcp, len, &nzc); size -= len; buf += len; } } int fletcher_4_incremental_native(void *buf, size_t size, void *data) { zio_cksum_t *zcp = data; /* Use scalar impl to directly update cksum of small blocks */ if (size < SPA_MINBLOCKSIZE) fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); else fletcher_4_incremental_impl(B_TRUE, buf, size, zcp); return (0); } int fletcher_4_incremental_byteswap(void *buf, size_t size, void *data) { zio_cksum_t *zcp = data; /* Use scalar impl to directly update cksum of small blocks */ if (size < SPA_MINBLOCKSIZE) fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size); else fletcher_4_incremental_impl(B_FALSE, buf, size, zcp); return (0); } #if defined(_KERNEL) /* * Fletcher 4 kstats */ static int fletcher_4_kstat_headers(char *buf, size_t size) { ssize_t off = 0; off += snprintf(buf + off, size, "%-17s", "implementation"); off += snprintf(buf + off, size - off, "%-15s", "native"); (void) snprintf(buf + off, size - off, "%-15s\n", "byteswap"); return (0); } static int fletcher_4_kstat_data(char *buf, size_t size, void *data) { struct fletcher_4_kstat *fastest_stat = &fletcher_4_stat_data[fletcher_4_supp_impls_cnt]; struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *)data; ssize_t off = 0; if (curr_stat == fastest_stat) { off += 
snprintf(buf + off, size - off, "%-17s", "fastest"); off += snprintf(buf + off, size - off, "%-15s", fletcher_4_supp_impls[fastest_stat->native]->name); (void) snprintf(buf + off, size - off, "%-15s\n", fletcher_4_supp_impls[fastest_stat->byteswap]->name); } else { ptrdiff_t id = curr_stat - fletcher_4_stat_data; off += snprintf(buf + off, size - off, "%-17s", fletcher_4_supp_impls[id]->name); off += snprintf(buf + off, size - off, "%-15llu", (u_longlong_t)curr_stat->native); (void) snprintf(buf + off, size - off, "%-15llu\n", (u_longlong_t)curr_stat->byteswap); } return (0); } static void * fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) { if (n <= fletcher_4_supp_impls_cnt) ksp->ks_private = (void *) (fletcher_4_stat_data + n); else ksp->ks_private = NULL; return (ksp->ks_private); } #endif #define FLETCHER_4_FASTEST_FN_COPY(type, src) \ { \ fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \ fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \ fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \ + fletcher_4_fastest_impl.uses_fpu = src->uses_fpu; \ } #define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */ typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, zio_cksum_t *); #if defined(_KERNEL) static void fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) { struct fletcher_4_kstat *fastest_stat = &fletcher_4_stat_data[fletcher_4_supp_impls_cnt]; hrtime_t start; uint64_t run_bw, run_time_ns, best_run = 0; zio_cksum_t zc; uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen); fletcher_checksum_func_t *fletcher_4_test = native ? 
fletcher_4_native : fletcher_4_byteswap; for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i]; uint64_t run_count = 0; /* temporary set an implementation */ fletcher_4_impl_chosen = i; kpreempt_disable(); start = gethrtime(); do { for (l = 0; l < 32; l++, run_count++) fletcher_4_test(data, data_size, NULL, &zc); run_time_ns = gethrtime() - start; } while (run_time_ns < FLETCHER_4_BENCH_NS); kpreempt_enable(); run_bw = data_size * run_count * NANOSEC; run_bw /= run_time_ns; /* B/s */ if (native) stat->native = run_bw; else stat->byteswap = run_bw; if (run_bw > best_run) { best_run = run_bw; if (native) { fastest_stat->native = i; FLETCHER_4_FASTEST_FN_COPY(native, fletcher_4_supp_impls[i]); } else { fastest_stat->byteswap = i; FLETCHER_4_FASTEST_FN_COPY(byteswap, fletcher_4_supp_impls[i]); } } } /* restore original selection */ atomic_swap_32(&fletcher_4_impl_chosen, sel_save); } #endif /* _KERNEL */ /* * Initialize and benchmark all supported implementations. 
*/ static void fletcher_4_benchmark(void) { fletcher_4_ops_t *curr_impl; int i, c; /* Move supported implementations into fletcher_4_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) { curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i]; if (curr_impl->valid && curr_impl->valid()) fletcher_4_supp_impls[c++] = curr_impl; } membar_producer(); /* complete fletcher_4_supp_impls[] init */ fletcher_4_supp_impls_cnt = c; /* number of supported impl */ #if defined(_KERNEL) static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ char *databuf = vmem_alloc(data_size, KM_SLEEP); for (i = 0; i < data_size / sizeof (uint64_t); i++) ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */ fletcher_4_benchmark_impl(B_FALSE, databuf, data_size); fletcher_4_benchmark_impl(B_TRUE, databuf, data_size); vmem_free(databuf, data_size); #else /* * Skip the benchmark in user space to avoid impacting libzpool * consumers (zdb, zhack, zinject, ztest). The last implementation * is assumed to be the fastest and used by default. */ memcpy(&fletcher_4_fastest_impl, fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1], sizeof (fletcher_4_fastest_impl)); fletcher_4_fastest_impl.name = "fastest"; membar_producer(); #endif /* _KERNEL */ } void fletcher_4_init(void) { /* Determine the fastest available implementation. 
*/ fletcher_4_benchmark(); #if defined(_KERNEL) /* Install kstats for all implementations */ fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); if (fletcher_4_kstat != NULL) { fletcher_4_kstat->ks_data = NULL; fletcher_4_kstat->ks_ndata = UINT32_MAX; kstat_set_raw_ops(fletcher_4_kstat, fletcher_4_kstat_headers, fletcher_4_kstat_data, fletcher_4_kstat_addr); kstat_install(fletcher_4_kstat); } #endif /* Finish initialization */ fletcher_4_initialized = B_TRUE; } void fletcher_4_fini(void) { #if defined(_KERNEL) if (fletcher_4_kstat != NULL) { kstat_delete(fletcher_4_kstat); fletcher_4_kstat = NULL; } #endif } /* ABD adapters */ static void abd_fletcher_4_init(zio_abd_checksum_data_t *cdp) { const fletcher_4_ops_t *ops = fletcher_4_impl_get(); cdp->acd_private = (void *) ops; + if (ops->uses_fpu == B_TRUE) { + kfpu_begin(); + } if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) ops->init_native(cdp->acd_ctx); else ops->init_byteswap(cdp->acd_ctx); + } static void abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp) { fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private; ASSERT(ops); if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) ops->fini_native(cdp->acd_ctx, cdp->acd_zcp); else ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp); + + if (ops->uses_fpu == B_TRUE) { + kfpu_end(); + } } + static void abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size, zio_abd_checksum_data_t *cdp) { zio_cksum_t *zcp = cdp->acd_zcp; ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE); abd_fletcher_4_fini(cdp); cdp->acd_private = (void *)&fletcher_4_scalar_ops; if (native) fletcher_4_incremental_native(data, size, zcp); else fletcher_4_incremental_byteswap(data, size, zcp); } static int abd_fletcher_4_iter(void *data, size_t size, void *private) { zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private; fletcher_4_ctx_t *ctx = cdp->acd_ctx; fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private; boolean_t 
native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE; uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); if (asize > 0) { if (native) ops->compute_native(ctx, data, asize); else ops->compute_byteswap(ctx, data, asize); size -= asize; data = (char *)data + asize; } if (size > 0) { ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE); /* At this point we have to switch to scalar impl */ abd_fletcher_4_simd2scalar(native, data, size, cdp); } return (0); } zio_abd_checksum_func_t fletcher_4_abd_ops = { .acf_init = abd_fletcher_4_init, .acf_fini = abd_fletcher_4_fini, .acf_iter = abd_fletcher_4_iter }; #if defined(_KERNEL) #define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") #if defined(__linux__) static int fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused) { const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); char *fmt; int cnt = 0; /* list fastest */ fmt = IMPL_FMT(impl, IMPL_FASTEST); cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest"); /* list all supported implementations */ for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) { fmt = IMPL_FMT(impl, i); cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, fletcher_4_supp_impls[i]->name); } return (cnt); } static int fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused) { return (fletcher_4_impl_set(val)); } #else #include static int fletcher_4_param(ZFS_MODULE_PARAM_ARGS) { int err; if (req->newptr == NULL) { const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); const int init_buflen = 64; const char *fmt; struct sbuf *s; s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); /* list fastest */ fmt = IMPL_FMT(impl, IMPL_FASTEST); (void) sbuf_printf(s, fmt, "fastest"); /* list all supported implementations */ for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) { fmt = IMPL_FMT(impl, i); (void) sbuf_printf(s, fmt, fletcher_4_supp_impls[i]->name); } err = sbuf_finish(s); sbuf_delete(s); return (err); } char 
buf[16]; err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err) return (err); return (-fletcher_4_impl_set(buf)); } #endif #undef IMPL_FMT /* * Choose a fletcher 4 implementation in ZFS. * Users can choose "cycle" to exercise all implementations, but this is * for testing purpose therefore it can only be set in user space. */ ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, fletcher_4_impl, fletcher_4_param_set, fletcher_4_param_get, ZMOD_RW, "Select fletcher 4 implementation."); EXPORT_SYMBOL(fletcher_init); EXPORT_SYMBOL(fletcher_2_incremental_native); EXPORT_SYMBOL(fletcher_2_incremental_byteswap); EXPORT_SYMBOL(fletcher_4_init); EXPORT_SYMBOL(fletcher_4_fini); EXPORT_SYMBOL(fletcher_2_native); EXPORT_SYMBOL(fletcher_2_byteswap); EXPORT_SYMBOL(fletcher_4_native); EXPORT_SYMBOL(fletcher_4_native_varsize); EXPORT_SYMBOL(fletcher_4_byteswap); EXPORT_SYMBOL(fletcher_4_incremental_native); EXPORT_SYMBOL(fletcher_4_incremental_byteswap); EXPORT_SYMBOL(fletcher_4_abd_ops); #endif diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c index 8f033972886c..cd5fe545a19d 100644 --- a/module/zcommon/zfs_fletcher_aarch64_neon.c +++ b/module/zcommon/zfs_fletcher_aarch64_neon.c @@ -1,211 +1,210 @@ /* * Implement fast Fletcher4 with NEON instructions. (aarch64) * * Use the 128-bit NEON SIMD instructions and registers to compute * Fletcher4 in two incremental 64-bit parallel accumulator streams, * and then combine the streams to form the final four checksum words. * This implementation is a derivative of the AVX SIMD implementation by * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). * * Copyright (C) 2016 Romain Dolbeau. * * Authors: * Romain Dolbeau * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/

 #if defined(__aarch64__)

 #include
 #include
 #include
 #include

 /*
  * Reset the NEON context: zero all four accumulator slots of
  * ctx->aarch64_neon.
  */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx)
 {
-	kfpu_begin();
 	memset(ctx->aarch64_neon, 0, 4 * sizeof (zfs_fletcher_aarch64_neon_t));
 }

 /*
  * Combine the two 64-bit lanes (v[0], v[1]) of each of the four accumulators
  * into the final checksum words A, B, C and D and store them in zcp with
  * ZIO_SET_CHECKSUM.
  */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 {
 	uint64_t A, B, C, D;
 	A = ctx->aarch64_neon[0].v[0] + ctx->aarch64_neon[0].v[1];
 	B = 2 * ctx->aarch64_neon[1].v[0] + 2 * ctx->aarch64_neon[1].v[1] -
 	    ctx->aarch64_neon[0].v[1];
 	C = 4 * ctx->aarch64_neon[2].v[0] - ctx->aarch64_neon[1].v[0] +
 	    4 * ctx->aarch64_neon[2].v[1] - 3 * ctx->aarch64_neon[1].v[1];
 	D = 8 * ctx->aarch64_neon[3].v[0] - 4 * ctx->aarch64_neon[2].v[0] +
 	    8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] +
 	    ctx->aarch64_neon[1].v[1];
 	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
-	kfpu_end();
 }

 /*
  * Clear ZERO (eor of the register with itself) and load ACC0..ACC3 from
  * ctx->aarch64_neon[0..3]. Expects ctx, ZERO and ACC0..ACC3 to be in scope
  * at the expansion site.
  */
 #define	NEON_INIT_LOOP()			\
 	asm("eor %[ZERO].16b,%[ZERO].16b,%[ZERO].16b\n"	\
 	"ld1 { %[ACC0].4s }, %[CTX0]\n"		\
 	"ld1 { %[ACC1].4s }, %[CTX1]\n"		\
 	"ld1 { %[ACC2].4s }, %[CTX2]\n"		\
 	"ld1 { %[ACC3].4s }, %[CTX3]\n"		\
 	: [ZERO] "=w" (ZERO),			\
 	[ACC0] "=w" (ACC0), [ACC1] "=w" (ACC1),	\
 	[ACC2] "=w" (ACC2), [ACC3] "=w" (ACC3)	\
 	: [CTX0] "Q" (ctx->aarch64_neon[0]),	\
 	[CTX1] "Q" (ctx->aarch64_neon[1]),	\
 	[CTX2] "Q" (ctx->aarch64_neon[2]),	\
 	[CTX3] "Q" (ctx->aarch64_neon[3]))

 /*
  * Instruction text spliced into NEON_MAIN_LOOP right after the load:
  * the byteswap variant passes the rev32 on SRC, the native variant passes
  * the empty string.
  */
 #define	NEON_DO_REVERSE "rev32 %[SRC].16b, %[SRC].16b\n"

 #define	NEON_DONT_REVERSE ""

 /*
  * One iteration of the checksum loop: load one vector from *ip into SRC,
  * apply REVERSE, interleave SRC with ZERO into TMP1 (zip1) and TMP2 (zip2),
  * then run the cascade ACC0 += TMPn; ACC1 += ACC0; ACC2 += ACC1;
  * ACC3 += ACC2 once for TMP1 and once for TMP2.
  */
 #define	NEON_MAIN_LOOP(REVERSE)				\
 	asm("ld1 { %[SRC].4s }, %[IP]\n"		\
 	REVERSE						\
 	"zip1 %[TMP1].4s, %[SRC].4s, %[ZERO].4s\n"	\
 	"zip2 %[TMP2].4s, %[SRC].4s, %[ZERO].4s\n"	\
 	"add %[ACC0].2d, %[ACC0].2d, %[TMP1].2d\n"	\
 	"add %[ACC1].2d, %[ACC1].2d, %[ACC0].2d\n"	\
 	"add %[ACC2].2d, %[ACC2].2d, %[ACC1].2d\n"	\
 	"add %[ACC3].2d, %[ACC3].2d, %[ACC2].2d\n"	\
 	"add %[ACC0].2d, %[ACC0].2d, %[TMP2].2d\n"	\
 	"add %[ACC1].2d, %[ACC1].2d, %[ACC0].2d\n"	\
 	"add %[ACC2].2d, %[ACC2].2d, %[ACC1].2d\n"	\
 	"add %[ACC3].2d, %[ACC3].2d, %[ACC2].2d\n"	\
 	: [SRC] "=&w" (SRC),				\
 	[TMP1] "=&w" (TMP1), [TMP2] "=&w" (TMP2),	\
 	[ACC0] "+w" (ACC0), [ACC1] "+w" (ACC1),		\
 	[ACC2] "+w" (ACC2), [ACC3] "+w" (ACC3)		\
 	: [ZERO] "w" (ZERO), [IP] "Q" (*ip))

 /* Store ACC0..ACC3 back into ctx->aarch64_neon[0..3]. */
 #define	NEON_FINI_LOOP()			\
 	asm("st1 { %[ACC0].4s },%[DST0]\n"	\
 	"st1 { %[ACC1].4s },%[DST1]\n"		\
 	"st1 { %[ACC2].4s },%[DST2]\n"		\
 	"st1 { %[ACC3].4s },%[DST3]\n"		\
 	: [DST0] "=Q" (ctx->aarch64_neon[0]),	\
 	[DST1] "=Q" (ctx->aarch64_neon[1]),	\
 	[DST2] "=Q" (ctx->aarch64_neon[2]),	\
 	[DST3] "=Q" (ctx->aarch64_neon[3])	\
 	: [ACC0] "w" (ACC0), [ACC1] "w" (ACC1),	\
 	[ACC2] "w" (ACC2), [ACC3] "w" (ACC3))

 /*
  * Accumulate the bytes buf[0 .. size) into ctx without byte swapping.
  * ip advances by two uint64_t per iteration. Under _KERNEL the vector
  * temporaries are pinned to v0..v7 through register asm declarations;
  * otherwise they are ordinary vector-typed locals.
  *
  * NOTE(review): the do/while runs its body before comparing ip with ipend,
  * so one chunk is always read; presumably callers never pass size == 0 --
  * confirm.
  */
 static void
 fletcher_4_aarch64_neon_native(fletcher_4_ctx_t *ctx,
     const void *buf, uint64_t size)
 {
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
 #if defined(_KERNEL)
 register unsigned char ZERO asm("v0") __attribute__((vector_size(16)));
 register unsigned char ACC0 asm("v1") __attribute__((vector_size(16)));
 register unsigned char ACC1 asm("v2") __attribute__((vector_size(16)));
 register unsigned char ACC2 asm("v3") __attribute__((vector_size(16)));
 register unsigned char ACC3 asm("v4") __attribute__((vector_size(16)));
 register unsigned char TMP1 asm("v5") __attribute__((vector_size(16)));
 register unsigned char TMP2 asm("v6") __attribute__((vector_size(16)));
 register unsigned char SRC asm("v7") __attribute__((vector_size(16)));
 #else
 unsigned char ZERO __attribute__((vector_size(16)));
 unsigned char ACC0 __attribute__((vector_size(16)));
 unsigned char ACC1 __attribute__((vector_size(16)));
 unsigned char ACC2 __attribute__((vector_size(16)));
 unsigned char ACC3 __attribute__((vector_size(16)));
 unsigned char TMP1 __attribute__((vector_size(16)));
 unsigned char TMP2 __attribute__((vector_size(16)));
 unsigned char SRC __attribute__((vector_size(16)));
 #endif

 	NEON_INIT_LOOP();

 	do {
 		NEON_MAIN_LOOP(NEON_DONT_REVERSE);
 	} while ((ip += 2) < ipend);

 	NEON_FINI_LOOP();
 }

 /*
  * Same as fletcher_4_aarch64_neon_native, except that NEON_DO_REVERSE is
  * applied to every loaded vector before it is accumulated.
  */
 static void
 fletcher_4_aarch64_neon_byteswap(fletcher_4_ctx_t *ctx,
     const void *buf, uint64_t size)
 {
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
 #if defined(_KERNEL)
 register unsigned char ZERO asm("v0") __attribute__((vector_size(16)));
 register unsigned char ACC0 asm("v1") __attribute__((vector_size(16)));
 register unsigned char ACC1 asm("v2") __attribute__((vector_size(16)));
 register unsigned char ACC2 asm("v3") __attribute__((vector_size(16)));
 register unsigned char ACC3 asm("v4") __attribute__((vector_size(16)));
 register unsigned char TMP1 asm("v5") __attribute__((vector_size(16)));
 register unsigned char TMP2 asm("v6") __attribute__((vector_size(16)));
 register unsigned char SRC asm("v7") __attribute__((vector_size(16)));
 #else
 unsigned char ZERO __attribute__((vector_size(16)));
 unsigned char ACC0 __attribute__((vector_size(16)));
 unsigned char ACC1 __attribute__((vector_size(16)));
 unsigned char ACC2 __attribute__((vector_size(16)));
 unsigned char ACC3 __attribute__((vector_size(16)));
 unsigned char TMP1 __attribute__((vector_size(16)));
 unsigned char TMP2 __attribute__((vector_size(16)));
 unsigned char SRC __attribute__((vector_size(16)));
 #endif

 	NEON_INIT_LOOP();

 	do {
 		NEON_MAIN_LOOP(NEON_DO_REVERSE);
 	} while ((ip += 2) < ipend);

 	NEON_FINI_LOOP();
 }

 /* This implementation is usable whenever kfpu_allowed() returns true. */
 static boolean_t fletcher_4_aarch64_neon_valid(void)
 {
 	return (kfpu_allowed());
 }

 /*
  * Operations table for the "aarch64_neon" implementation. The native and
  * byteswap paths share init/fini and differ only in the compute routine.
  */
 const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
 	.init_native = fletcher_4_aarch64_neon_init,
 	.compute_native = fletcher_4_aarch64_neon_native,
 	.fini_native = fletcher_4_aarch64_neon_fini,
 	.init_byteswap = fletcher_4_aarch64_neon_init,
 	.compute_byteswap = fletcher_4_aarch64_neon_byteswap,
 	.fini_byteswap = fletcher_4_aarch64_neon_fini,
 	.valid = fletcher_4_aarch64_neon_valid,
+	.uses_fpu = B_TRUE,
 	.name = "aarch64_neon"
 };

 #endif /* defined(__aarch64__) */
diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c
index 4a3d5cb24ab5..81182ead2caf 100644
--- a/module/zcommon/zfs_fletcher_avx512.c
+++ b/module/zcommon/zfs_fletcher_avx512.c
@@ -1,223 +1,223 @@
 /*
  * CDDL HEADER START
  *
  *
The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
  */

 #if defined(__x86_64) && defined(HAVE_AVX512F)

 #include
 #include
 #include
 #include
 #include
 #include

 #ifdef __linux__
 #define	__asm __asm__ __volatile__
 #endif

 /*
  * Reset the AVX512 context: zero all four accumulator slots of ctx->avx512.
  */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
 {
-	kfpu_begin();
 	memset(ctx->avx512, 0, 4 * sizeof (zfs_fletcher_avx512_t));
 }

 /*
  * Combine the eight 64-bit lanes of each of the four accumulators into the
  * final checksum words A, B, C and D and store them in zcp with
  * ZIO_SET_CHECKSUM. Lane 0 seeds the sums; lanes 1..7 are folded in by the
  * loop, weighted by the per-lane coefficient tables CcA/CcB (for C) and
  * DcA/DcB/DcC (for D).
  */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 {
 	static const uint64_t
 	CcA[] = {   0,   0,   1,   3,   6,  10,  15,  21 },
 	CcB[] = {  28,  36,  44,  52,  60,  68,  76,  84 },
 	DcA[] = {   0,   0,   0,   1,   4,  10,  20,  35 },
 	DcB[] = {  56,  84, 120, 164, 216, 276, 344, 420 },
 	DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 };

 	uint64_t A, B, C, D;
 	uint64_t i;

 	A = ctx->avx512[0].v[0];
 	B = 8 * ctx->avx512[1].v[0];
 	C = 64 * ctx->avx512[2].v[0] - CcB[0] * ctx->avx512[1].v[0];
 	D = 512 * ctx->avx512[3].v[0] - DcC[0] * ctx->avx512[2].v[0] +
 	    DcB[0] * ctx->avx512[1].v[0];

 	for (i = 1; i < 8; i++) {
 		A += ctx->avx512[0].v[i];
 		B += 8 * ctx->avx512[1].v[i] - i * ctx->avx512[0].v[i];
 		C += 64 * ctx->avx512[2].v[i] - CcB[i] * ctx->avx512[1].v[i] +
 		    CcA[i] * ctx->avx512[0].v[i];
 		D += 512 * ctx->avx512[3].v[i] - DcC[i] * ctx->avx512[2].v[i] +
 		    DcB[i] * ctx->avx512[1].v[i] - DcA[i] * ctx->avx512[0].v[i];
 	}

 	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
-	kfpu_end();
 }

 /* Load the four accumulators ctx->avx512[0..3] into zmm0..zmm3. */
 #define	FLETCHER_4_AVX512_RESTORE_CTX(ctx)				\
 {									\
 	__asm("vmovdqu64 %0, %%zmm0" :: "m" ((ctx)->avx512[0]));	\
 	__asm("vmovdqu64 %0, %%zmm1" :: "m" ((ctx)->avx512[1]));	\
 	__asm("vmovdqu64 %0, %%zmm2" :: "m" ((ctx)->avx512[2]));	\
 	__asm("vmovdqu64 %0, %%zmm3" :: "m" ((ctx)->avx512[3]));	\
 }

 /* Store zmm0..zmm3 back into ctx->avx512[0..3]. */
 #define	FLETCHER_4_AVX512_SAVE_CTX(ctx)					\
 {									\
 	__asm("vmovdqu64 %%zmm0, %0" : "=m" ((ctx)->avx512[0]));	\
 	__asm("vmovdqu64 %%zmm1, %0" : "=m" ((ctx)->avx512[1]));	\
 	__asm("vmovdqu64 %%zmm2, %0" : "=m" ((ctx)->avx512[2]));	\
 	__asm("vmovdqu64 %%zmm3, %0" : "=m" ((ctx)->avx512[3]));	\
 }

 /*
  * Accumulate the bytes buf[0 .. size) into ctx without byte swapping.
  * Each iteration loads from *ip into zmm4 and runs the cascade
  * zmm0 += zmm4; zmm1 += zmm0; zmm2 += zmm1; zmm3 += zmm2; ip advances by
  * eight uint32_t per iteration. The accumulators live in zmm0..zmm3 between
  * the separate asm statements, so their order must not be changed.
  *
  * NOTE(review): the do/while runs its body before comparing ip with ipend,
  * so one chunk is always read; presumably callers never pass size == 0 --
  * confirm.
  */
 static void
 fletcher_4_avx512f_native(fletcher_4_ctx_t *ctx, const void *buf,
     uint64_t size)
 {
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);

 	FLETCHER_4_AVX512_RESTORE_CTX(ctx);

 	do {
 		__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
 		__asm("vpaddq %zmm4, %zmm0, %zmm0");
 		__asm("vpaddq %zmm0, %zmm1, %zmm1");
 		__asm("vpaddq %zmm1, %zmm2, %zmm2");
 		__asm("vpaddq %zmm2, %zmm3, %zmm3");
 	} while ((ip += 8) < ipend);

 	FLETCHER_4_AVX512_SAVE_CTX(ctx);
 }
 STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_native);

 /*
  * Byte-swapping variant of fletcher_4_avx512f_native using only AVX512F
  * instructions. Before the loop, zmm8 is filled with the 0xFF mask and
  * zmm9..zmm11 with that mask shifted left by 8, 16 and 24 bits. In the
  * loop the loaded value (zmm5) is shifted right by 24 and 8 and left by 8
  * and 24, each result is masked with the matching byte mask, and the four
  * pieces are OR-ed together into zmm4, which then feeds the same add
  * cascade as the native variant.
  */
 static void
 fletcher_4_avx512f_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
     uint64_t size)
 {
 	static const uint64_t byteswap_mask = 0xFFULL;
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);

 	FLETCHER_4_AVX512_RESTORE_CTX(ctx);

 	__asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask));
 	__asm("vpsllq $8, %zmm8, %zmm9");
 	__asm("vpsllq $16, %zmm8, %zmm10");
 	__asm("vpsllq $24, %zmm8, %zmm11");

 	do {
 		__asm("vpmovzxdq %0, %%zmm5"::"m" (*ip));

 		__asm("vpsrlq $24, %zmm5, %zmm6");
 		__asm("vpandd %zmm8, %zmm6, %zmm6");
 		__asm("vpsrlq $8, %zmm5, %zmm7");
 		__asm("vpandd %zmm9, %zmm7, %zmm7");
 		__asm("vpord %zmm6, %zmm7, %zmm4");
 		__asm("vpsllq $8, %zmm5, %zmm6");
 		__asm("vpandd %zmm10, %zmm6, %zmm6");
 		__asm("vpord %zmm6, %zmm4, %zmm4");
 		__asm("vpsllq $24, %zmm5, %zmm5");
 		__asm("vpandd %zmm11, %zmm5, %zmm5");
 		__asm("vpord %zmm5, %zmm4, %zmm4");

 		__asm("vpaddq %zmm4, %zmm0, %zmm0");
 		__asm("vpaddq %zmm0, %zmm1, %zmm1");
 		__asm("vpaddq %zmm1, %zmm2, %zmm2");
 		__asm("vpaddq %zmm2, %zmm3, %zmm3");
 	} while ((ip += 8) < ipend);

 	FLETCHER_4_AVX512_SAVE_CTX(ctx)
 }
 STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);

 /* Usable when kfpu_allowed() and zfs_avx512f_available() both hold. */
 static boolean_t
 fletcher_4_avx512f_valid(void)
 {
 	return (kfpu_allowed() && zfs_avx512f_available());
 }

 /* Operations table for the "avx512f" implementation. */
 const fletcher_4_ops_t fletcher_4_avx512f_ops = {
 	.init_native = fletcher_4_avx512f_init,
 	.fini_native = fletcher_4_avx512f_fini,
 	.compute_native = fletcher_4_avx512f_native,
 	.init_byteswap = fletcher_4_avx512f_init,
 	.fini_byteswap = fletcher_4_avx512f_fini,
 	.compute_byteswap = fletcher_4_avx512f_byteswap,
 	.valid = fletcher_4_avx512f_valid,
+	.uses_fpu = B_TRUE,
 	.name = "avx512f"
 };

 #if defined(HAVE_AVX512BW)
 /*
  * Byte-swapping variant that replaces the shift/mask/or sequence of
  * fletcher_4_avx512f_byteswap with a single vpshufb against a constant
  * shuffle mask held in zmm5.
  */
 static void
 fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
     uint64_t size)
 {
 	static const zfs_fletcher_avx512_t mask = {
 		.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
 		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
 		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
 		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
 	};
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);

 	FLETCHER_4_AVX512_RESTORE_CTX(ctx);

 	__asm("vmovdqu64 %0, %%zmm5" :: "m" (mask));

 	do {
 		__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));

 		__asm("vpshufb %zmm5, %zmm4, %zmm4");

 		__asm("vpaddq %zmm4, %zmm0, %zmm0");
 		__asm("vpaddq %zmm0, %zmm1, %zmm1");
 		__asm("vpaddq %zmm1, %zmm2, %zmm2");
 		__asm("vpaddq %zmm2, %zmm3, %zmm3");
 	} while ((ip += 8) < ipend);

 	FLETCHER_4_AVX512_SAVE_CTX(ctx)
 }
 STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap);

 /* Usable when the avx512f variant is valid and AVX512BW is available. */
 static boolean_t
 fletcher_4_avx512bw_valid(void)
 {
 	return (fletcher_4_avx512f_valid() && zfs_avx512bw_available());
 }

 /*
  * Operations table for the "avx512bw" implementation; it reuses the avx512f
  * routines for everything except compute_byteswap and valid.
  */
 const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
 	.init_native = fletcher_4_avx512f_init,
 	.fini_native = fletcher_4_avx512f_fini,
 	.compute_native = fletcher_4_avx512f_native,
 	.init_byteswap = fletcher_4_avx512f_init,
 	.fini_byteswap = fletcher_4_avx512f_fini,
 	.compute_byteswap = fletcher_4_avx512bw_byteswap,
 	.valid = fletcher_4_avx512bw_valid,
+	.uses_fpu = B_TRUE,
 	.name = "avx512bw"
 };
 #endif

 #endif /* defined(__x86_64) && defined(HAVE_AVX512F) */
diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c
index c124d49280c1..6108bda7a07c 100644
--- a/module/zcommon/zfs_fletcher_intel.c
+++ b/module/zcommon/zfs_fletcher_intel.c
@@ -1,169 +1,168 @@
 /*
  * Implement fast Fletcher4 with AVX2 instructions. (x86_64)
  *
  * Use the 256-bit AVX2 SIMD instructions and registers to compute
  * Fletcher4 in four incremental 64-bit parallel accumulator streams,
  * and then combine the streams to form the final four checksum words.
  *
  * Copyright (C) 2015 Intel Corporation.
  *
  * Authors:
  *      James Guilford
  *      Jinshan Xiong
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer in the documentation and/or other materials
  *        provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */

 #if defined(HAVE_AVX) && defined(HAVE_AVX2)

 #include
 #include
 #include
 #include

 /* Reset the AVX2 context: zero all four accumulator slots of ctx->avx. */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_avx2_init(fletcher_4_ctx_t *ctx)
 {
-	kfpu_begin();
 	memset(ctx->avx, 0, 4 * sizeof (zfs_fletcher_avx_t));
 }

 /*
  * Combine the four 64-bit lanes of each of the four accumulators into the
  * final checksum words A, B, C and D, using the fixed integer weights
  * below, and store them in zcp with ZIO_SET_CHECKSUM.
  */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 {
 	uint64_t A, B, C, D;

 	A = ctx->avx[0].v[0] + ctx->avx[0].v[1] +
 	    ctx->avx[0].v[2] + ctx->avx[0].v[3];
 	B = 0 - ctx->avx[0].v[1] - 2 * ctx->avx[0].v[2] - 3 * ctx->avx[0].v[3] +
 	    4 * ctx->avx[1].v[0] + 4 * ctx->avx[1].v[1] + 4 * ctx->avx[1].v[2] +
 	    4 * ctx->avx[1].v[3];

 	C = ctx->avx[0].v[2] + 3 * ctx->avx[0].v[3] - 6 * ctx->avx[1].v[0] -
 	    10 * ctx->avx[1].v[1] - 14 * ctx->avx[1].v[2] -
 	    18 * ctx->avx[1].v[3] + 16 * ctx->avx[2].v[0] +
 	    16 * ctx->avx[2].v[1] + 16 * ctx->avx[2].v[2] +
 	    16 * ctx->avx[2].v[3];

 	D = 0 - ctx->avx[0].v[3] + 4 * ctx->avx[1].v[0] +
 	    10 * ctx->avx[1].v[1] + 20 * ctx->avx[1].v[2] +
 	    34 * ctx->avx[1].v[3] - 48 * ctx->avx[2].v[0] -
 	    64 * ctx->avx[2].v[1] - 80 * ctx->avx[2].v[2] -
 	    96 * ctx->avx[2].v[3] + 64 * ctx->avx[3].v[0] +
 	    64 * ctx->avx[3].v[1] + 64 * ctx->avx[3].v[2] +
 	    64 * ctx->avx[3].v[3];

 	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
-	kfpu_end();
 }

 /* Load the four accumulators ctx->avx[0..3] into ymm0..ymm3. */
 #define	FLETCHER_4_AVX2_RESTORE_CTX(ctx)				\
 {									\
 	asm volatile("vmovdqu %0, %%ymm0" :: "m" ((ctx)->avx[0]));	\
 	asm volatile("vmovdqu %0, %%ymm1" :: "m" ((ctx)->avx[1]));	\
 	asm volatile("vmovdqu %0, %%ymm2" :: "m" ((ctx)->avx[2]));	\
 	asm volatile("vmovdqu %0, %%ymm3" :: "m" ((ctx)->avx[3]));	\
 }

 /* Store ymm0..ymm3 back into ctx->avx[0..3]. */
 #define	FLETCHER_4_AVX2_SAVE_CTX(ctx)					\
 {									\
 	asm volatile("vmovdqu %%ymm0, %0" : "=m" ((ctx)->avx[0]));	\
 	asm volatile("vmovdqu %%ymm1, %0" : "=m" ((ctx)->avx[1]));	\
 	asm volatile("vmovdqu %%ymm2, %0" : "=m" ((ctx)->avx[2]));	\
 	asm volatile("vmovdqu %%ymm3, %0" : "=m" ((ctx)->avx[3]));	\
 }


 /*
  * Accumulate the bytes buf[0 .. size) into ctx without byte swapping.
  * Each iteration loads from *ip into ymm4 and runs the cascade
  * ymm0 += ymm4; ymm1 += ymm0; ymm2 += ymm1; ymm3 += ymm2; ip advances by
  * two uint64_t per iteration. The accumulators live in ymm0..ymm3 between
  * the separate asm statements, so their order must not be changed.
  * vzeroupper is issued after the context has been saved.
  *
  * NOTE(review): the do/while runs its body before comparing ip with ipend,
  * so one chunk is always read; presumably callers never pass size == 0 --
  * confirm.
  */
 static void
 fletcher_4_avx2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);

 	FLETCHER_4_AVX2_RESTORE_CTX(ctx);

 	do {
 		asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip));
 		asm volatile("vpaddq %ymm4, %ymm0, %ymm0");
 		asm volatile("vpaddq %ymm0, %ymm1, %ymm1");
 		asm volatile("vpaddq %ymm1, %ymm2, %ymm2");
 		asm volatile("vpaddq %ymm2, %ymm3, %ymm3");
 	} while ((ip += 2) < ipend);

 	FLETCHER_4_AVX2_SAVE_CTX(ctx);
 	asm volatile("vzeroupper");
 }

 /*
  * Byte-swapping variant of fletcher_4_avx2_native: the constant shuffle
  * mask is loaded into ymm5 once, and every loaded vector is passed through
  * vpshufb with that mask before entering the add cascade.
  */
 static void
 fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
 	static const zfs_fletcher_avx_t mask = {
 		.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
 		    0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
 	};
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);

 	FLETCHER_4_AVX2_RESTORE_CTX(ctx);

 	asm volatile("vmovdqu %0, %%ymm5" :: "m" (mask));

 	do {
 		asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip));
 		asm volatile("vpshufb %ymm5, %ymm4, %ymm4");

 		asm volatile("vpaddq %ymm4, %ymm0, %ymm0");
 		asm volatile("vpaddq %ymm0, %ymm1, %ymm1");
 		asm volatile("vpaddq %ymm1, %ymm2, %ymm2");
 		asm volatile("vpaddq %ymm2, %ymm3, %ymm3");
 	} while ((ip += 2) < ipend);

 	FLETCHER_4_AVX2_SAVE_CTX(ctx);
 	asm volatile("vzeroupper");
 }

 /* Usable when kfpu_allowed(), AVX and AVX2 availability all hold. */
 static boolean_t fletcher_4_avx2_valid(void)
 {
 	return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
 }

 /* Operations table for the "avx2" implementation. */
 const fletcher_4_ops_t fletcher_4_avx2_ops = {
 	.init_native = fletcher_4_avx2_init,
 	.fini_native = fletcher_4_avx2_fini,
 	.compute_native = fletcher_4_avx2_native,
 	.init_byteswap = fletcher_4_avx2_init,
 	.fini_byteswap = fletcher_4_avx2_fini,
 	.compute_byteswap = fletcher_4_avx2_byteswap,
 	.valid = fletcher_4_avx2_valid,
+	.uses_fpu = B_TRUE,
 	.name = "avx2"
 };

 #endif /* defined(HAVE_AVX) && defined(HAVE_AVX2) */
diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c
index
6c78830be994..096472c9af5f 100644 --- a/module/zcommon/zfs_fletcher_sse.c +++ b/module/zcommon/zfs_fletcher_sse.c @@ -1,224 +1,224 @@ /* * Implement fast Fletcher4 with SSE2,SSSE3 instructions. (x86) * * Use the 128-bit SSE2/SSSE3 SIMD instructions and registers to compute * Fletcher4 in two incremental 64-bit parallel accumulator streams, * and then combine the streams to form the final four checksum words. * This implementation is a derivative of the AVX SIMD implementation by * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). * * Copyright (C) 2016 Tyler J. Stachecki. * * Authors: * Tyler J. Stachecki * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/

 #if defined(HAVE_SSE2)

 #include
 #include
 #include
 #include
 #include

 /* Reset the SSE context: zero all four accumulator slots of ctx->sse. */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_sse2_init(fletcher_4_ctx_t *ctx)
 {
-	kfpu_begin();
 	memset(ctx->sse, 0, 4 * sizeof (zfs_fletcher_sse_t));
 }

 /*
  * Combine the two 64-bit lanes of each of the four accumulators into the
  * final checksum words A, B, C and D and store them in zcp with
  * ZIO_SET_CHECKSUM.
  */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 {
 	uint64_t A, B, C, D;

 	/*
 	 * The mixing matrix for checksum calculation is:
 	 * a = a0 + a1
 	 * b = 2b0 + 2b1 - a1
 	 * c = 4c0 - b0 + 4c1 -3b1
 	 * d = 8d0 - 4c0 + 8d1 - 8c1 + b1;
 	 *
 	 * c and d are multiplied by 4 and 8, respectively,
 	 * before spilling the vectors out to memory.
 	 */
 	A = ctx->sse[0].v[0] + ctx->sse[0].v[1];
 	B = 2 * ctx->sse[1].v[0] + 2 * ctx->sse[1].v[1] - ctx->sse[0].v[1];
 	C = 4 * ctx->sse[2].v[0] - ctx->sse[1].v[0] + 4 * ctx->sse[2].v[1] -
 	    3 * ctx->sse[1].v[1];
 	D = 8 * ctx->sse[3].v[0] - 4 * ctx->sse[2].v[0] + 8 * ctx->sse[3].v[1] -
 	    8 * ctx->sse[2].v[1] + ctx->sse[1].v[1];

 	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
-	kfpu_end();
 }

 /* Load the four accumulators ctx->sse[0..3] into xmm0..xmm3. */
 #define	FLETCHER_4_SSE_RESTORE_CTX(ctx)					\
 {									\
 	asm volatile("movdqu %0, %%xmm0" :: "m" ((ctx)->sse[0]));	\
 	asm volatile("movdqu %0, %%xmm1" :: "m" ((ctx)->sse[1]));	\
 	asm volatile("movdqu %0, %%xmm2" :: "m" ((ctx)->sse[2]));	\
 	asm volatile("movdqu %0, %%xmm3" :: "m" ((ctx)->sse[3]));	\
 }

 /* Store xmm0..xmm3 back into ctx->sse[0..3]. */
 #define	FLETCHER_4_SSE_SAVE_CTX(ctx)					\
 {									\
 	asm volatile("movdqu %%xmm0, %0" : "=m" ((ctx)->sse[0]));	\
 	asm volatile("movdqu %%xmm1, %0" : "=m" ((ctx)->sse[1]));	\
 	asm volatile("movdqu %%xmm2, %0" : "=m" ((ctx)->sse[2]));	\
 	asm volatile("movdqu %%xmm3, %0" : "=m" ((ctx)->sse[3]));	\
 }

 /*
  * Accumulate the bytes buf[0 .. size) into ctx without byte swapping.
  * xmm4 is cleared once and used as the zero operand of the unpack
  * instructions. Each iteration loads from *ip into xmm5, copies it to xmm6,
  * unpacks the low half (xmm5) and high half (xmm6) against xmm4, and runs
  * the cascade xmm0 += x; xmm1 += xmm0; xmm2 += xmm1; xmm3 += xmm2 once for
  * each half. ip advances by two uint64_t per iteration.
  *
  * NOTE(review): the do/while runs its body before comparing ip with ipend,
  * so one chunk is always read; presumably callers never pass size == 0 --
  * confirm.
  */
 static void
 fletcher_4_sse2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);

 	FLETCHER_4_SSE_RESTORE_CTX(ctx);

 	asm volatile("pxor %xmm4, %xmm4");

 	do {
 		asm volatile("movdqu %0, %%xmm5" :: "m"(*ip));
 		asm volatile("movdqa %xmm5, %xmm6");
 		asm volatile("punpckldq %xmm4, %xmm5");
 		asm volatile("punpckhdq %xmm4, %xmm6");
 		asm volatile("paddq %xmm5, %xmm0");
 		asm volatile("paddq %xmm0, %xmm1");
 		asm volatile("paddq %xmm1, %xmm2");
 		asm volatile("paddq %xmm2, %xmm3");
 		asm volatile("paddq %xmm6, %xmm0");
 		asm volatile("paddq %xmm0, %xmm1");
 		asm volatile("paddq %xmm1, %xmm2");
 		asm volatile("paddq %xmm2, %xmm3");
 	} while ((ip += 2) < ipend);

 	FLETCHER_4_SSE_SAVE_CTX(ctx);
 }

 /*
  * Byte-swapping variant for plain SSE2. Each iteration swaps ip[0] and
  * ip[1] in C with BSWAP_32, moves the two results into xmm5 and xmm6,
  * merges them with punpcklqdq and runs one add cascade. Here ip is a
  * uint32_t pointer, so one iteration consumes two 32-bit words.
  */
 static void
 fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);

 	FLETCHER_4_SSE_RESTORE_CTX(ctx);

 	do {
 		uint32_t scratch1 = BSWAP_32(ip[0]);
 		uint32_t scratch2 = BSWAP_32(ip[1]);
 		asm volatile("movd %0, %%xmm5" :: "r"(scratch1));
 		asm volatile("movd %0, %%xmm6" :: "r"(scratch2));
 		asm volatile("punpcklqdq %xmm6, %xmm5");
 		asm volatile("paddq %xmm5, %xmm0");
 		asm volatile("paddq %xmm0, %xmm1");
 		asm volatile("paddq %xmm1, %xmm2");
 		asm volatile("paddq %xmm2, %xmm3");
 	} while ((ip += 2) < ipend);

 	FLETCHER_4_SSE_SAVE_CTX(ctx);
 }

 /* Usable when kfpu_allowed() and zfs_sse2_available() both hold. */
 static boolean_t fletcher_4_sse2_valid(void)
 {
 	return (kfpu_allowed() && zfs_sse2_available());
 }

 /* Operations table for the "sse2" implementation. */
 const fletcher_4_ops_t fletcher_4_sse2_ops = {
 	.init_native = fletcher_4_sse2_init,
 	.fini_native = fletcher_4_sse2_fini,
 	.compute_native = fletcher_4_sse2_native,
 	.init_byteswap = fletcher_4_sse2_init,
 	.fini_byteswap = fletcher_4_sse2_fini,
 	.compute_byteswap = fletcher_4_sse2_byteswap,
 	.valid = fletcher_4_sse2_valid,
+	.uses_fpu = B_TRUE,
 	.name = "sse2"
 };

 #endif /* defined(HAVE_SSE2) */

 #if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
 /*
  * Byte-swapping variant using SSSE3: identical to fletcher_4_sse2_native
  * except that the constant shuffle mask is loaded into xmm7 once and each
  * loaded vector is passed through pshufb with that mask before it is
  * unpacked and accumulated.
  */
 static void
 fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
 	static const zfs_fletcher_sse_t mask = {
 		.v = { 0x0405060700010203, 0x0C0D0E0F08090A0B }
 	};

 	const uint64_t *ip = buf;
 	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);

 	FLETCHER_4_SSE_RESTORE_CTX(ctx);

 	asm volatile("movdqu %0, %%xmm7"::"m" (mask));
 	asm volatile("pxor %xmm4, %xmm4");

 	do {
 		asm volatile("movdqu %0, %%xmm5"::"m" (*ip));
 		asm volatile("pshufb %xmm7, %xmm5");
 		asm volatile("movdqa %xmm5, %xmm6");
 		asm volatile("punpckldq %xmm4, %xmm5");
 		asm volatile("punpckhdq %xmm4, %xmm6");
 		asm volatile("paddq %xmm5, %xmm0");
 		asm volatile("paddq %xmm0, %xmm1");
 		asm volatile("paddq %xmm1, %xmm2");
 		asm volatile("paddq %xmm2, %xmm3");
 		asm volatile("paddq %xmm6, %xmm0");
 		asm volatile("paddq %xmm0, %xmm1");
 		asm volatile("paddq %xmm1, %xmm2");
 		asm volatile("paddq %xmm2, %xmm3");
 	} while ((ip += 2) < ipend);

 	FLETCHER_4_SSE_SAVE_CTX(ctx);
 }

 /* Usable when kfpu_allowed(), SSE2 and SSSE3 availability all hold. */
 static boolean_t fletcher_4_ssse3_valid(void)
 {
 	return (kfpu_allowed() && zfs_sse2_available() &&
 	    zfs_ssse3_available());
 }

 /*
  * Operations table for the "ssse3" implementation; it reuses the sse2
  * routines for everything except compute_byteswap and valid.
  */
 const fletcher_4_ops_t fletcher_4_ssse3_ops = {
 	.init_native = fletcher_4_sse2_init,
 	.fini_native = fletcher_4_sse2_fini,
 	.compute_native = fletcher_4_sse2_native,
 	.init_byteswap = fletcher_4_sse2_init,
 	.fini_byteswap = fletcher_4_sse2_fini,
 	.compute_byteswap = fletcher_4_ssse3_byteswap,
 	.valid = fletcher_4_ssse3_valid,
+	.uses_fpu = B_TRUE,
 	.name = "ssse3"
 };

 #endif /* defined(HAVE_SSE2) && defined(HAVE_SSSE3) */
diff --git a/module/zcommon/zfs_fletcher_superscalar.c b/module/zcommon/zfs_fletcher_superscalar.c
index 67dc095927f1..8b5b72a7b8b5 100644
--- a/module/zcommon/zfs_fletcher_superscalar.c
+++ b/module/zcommon/zfs_fletcher_superscalar.c
@@ -1,167 +1,168 @@
 /*
  * Implement fast Fletcher4 using superscalar pipelines.
  *
  * Use regular C code to compute
  * Fletcher4 in two incremental 64-bit parallel accumulator streams,
  * and then combine the streams to form the final four checksum words.
  * This implementation is a derivative of the AVX SIMD implementation by
  * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).
  *
  * Copyright (C) 2016 Romain Dolbeau.
  *
  * Authors:
  *	Romain Dolbeau
  *
  * This software is available to you under a choice of one of two
  * licenses.
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/

 #include
 #include
 #include
 #include
 #include

 /*
  * Reset the superscalar context: zero all four accumulator slots of
  * ctx->superscalar.
  */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_superscalar_init(fletcher_4_ctx_t *ctx)
 {
 	memset(ctx->superscalar, 0, 4 * sizeof (zfs_fletcher_superscalar_t));
 }

 /*
  * Combine the two streams (v[0] and v[1]) of each of the four accumulators
  * into the final checksum words A, B, C and D and store them in zcp with
  * ZIO_SET_CHECKSUM.
  */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_superscalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 {
 	uint64_t A, B, C, D;
 	A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1];
 	B = 2 * ctx->superscalar[1].v[0] + 2 * ctx->superscalar[1].v[1] -
 	    ctx->superscalar[0].v[1];
 	C = 4 * ctx->superscalar[2].v[0] - ctx->superscalar[1].v[0] +
 	    4 * ctx->superscalar[2].v[1] - 3 * ctx->superscalar[1].v[1];
 	D = 8 * ctx->superscalar[3].v[0] - 4 * ctx->superscalar[2].v[0] +
 	    8 * ctx->superscalar[3].v[1] - 8 * ctx->superscalar[2].v[1] +
 	    ctx->superscalar[1].v[1];
 	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
 }

 /*
  * Accumulate the 32-bit words of buf into ctx without byte swapping, using
  * two independent streams: even-indexed words feed a/b/c/d (slot v[0]) and
  * odd-indexed words feed a2/b2/c2/d2 (slot v[1]). The running sums are
  * copied into locals, updated two words per iteration, and written back.
  *
  * NOTE(review): the do/while runs its body before comparing ip with ipend,
  * so ip[0] and ip[1] are always read; presumably callers never pass
  * size == 0 -- confirm.
  */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_superscalar_native(fletcher_4_ctx_t *ctx,
     const void *buf, uint64_t size)
 {
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 	uint64_t a, b, c, d;
 	uint64_t a2, b2, c2, d2;

 	a = ctx->superscalar[0].v[0];
 	b = ctx->superscalar[1].v[0];
 	c = ctx->superscalar[2].v[0];
 	d = ctx->superscalar[3].v[0];
 	a2 = ctx->superscalar[0].v[1];
 	b2 = ctx->superscalar[1].v[1];
 	c2 = ctx->superscalar[2].v[1];
 	d2 = ctx->superscalar[3].v[1];

 	do {
 		a += ip[0];
 		a2 += ip[1];
 		b += a;
 		b2 += a2;
 		c += b;
 		c2 += b2;
 		d += c;
 		d2 += c2;
 	} while ((ip += 2) < ipend);

 	ctx->superscalar[0].v[0] = a;
 	ctx->superscalar[1].v[0] = b;
 	ctx->superscalar[2].v[0] = c;
 	ctx->superscalar[3].v[0] = d;
 	ctx->superscalar[0].v[1] = a2;
 	ctx->superscalar[1].v[1] = b2;
 	ctx->superscalar[2].v[1] = c2;
 	ctx->superscalar[3].v[1] = d2;
 }

 /*
  * Same as fletcher_4_superscalar_native, except that every input word is
  * passed through BSWAP_32 before it is added.
  */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_superscalar_byteswap(fletcher_4_ctx_t *ctx,
     const void *buf, uint64_t size)
 {
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 	uint64_t a, b, c, d;
 	uint64_t a2, b2, c2, d2;

 	a = ctx->superscalar[0].v[0];
 	b = ctx->superscalar[1].v[0];
 	c = ctx->superscalar[2].v[0];
 	d = ctx->superscalar[3].v[0];
 	a2 = ctx->superscalar[0].v[1];
 	b2 = ctx->superscalar[1].v[1];
 	c2 = ctx->superscalar[2].v[1];
 	d2 = ctx->superscalar[3].v[1];

 	do {
 		a += BSWAP_32(ip[0]);
 		a2 += BSWAP_32(ip[1]);
 		b += a;
 		b2 += a2;
 		c += b;
 		c2 += b2;
 		d += c;
 		d2 += c2;
 	} while ((ip += 2) < ipend);

 	ctx->superscalar[0].v[0] = a;
 	ctx->superscalar[1].v[0] = b;
 	ctx->superscalar[2].v[0] = c;
 	ctx->superscalar[3].v[0] = d;
 	ctx->superscalar[0].v[1] = a2;
 	ctx->superscalar[1].v[1] = b2;
 	ctx->superscalar[2].v[1] = c2;
 	ctx->superscalar[3].v[1] = d2;
 }

 /* Plain C implementation: always reported as valid. */
 static boolean_t fletcher_4_superscalar_valid(void)
 {
 	return (B_TRUE);
 }

 /* Operations table for the "superscalar" implementation. */
 const fletcher_4_ops_t fletcher_4_superscalar_ops = {
 	.init_native = fletcher_4_superscalar_init,
 	.compute_native = fletcher_4_superscalar_native,
 	.fini_native = fletcher_4_superscalar_fini,
 	.init_byteswap = fletcher_4_superscalar_init,
 	.compute_byteswap = fletcher_4_superscalar_byteswap,
 	.fini_byteswap = fletcher_4_superscalar_fini,
 	.valid = fletcher_4_superscalar_valid,
+	.uses_fpu = B_FALSE,
 	.name = "superscalar"
 };
diff --git a/module/zcommon/zfs_fletcher_superscalar4.c b/module/zcommon/zfs_fletcher_superscalar4.c
index d2067c12f85b..bef387933917 100644
--- a/module/zcommon/zfs_fletcher_superscalar4.c
+++ b/module/zcommon/zfs_fletcher_superscalar4.c
@@ -1,233 +1,234 @@
 /*
  * Implement fast Fletcher4 using superscalar pipelines.
  *
  * Use regular C code to compute
  * Fletcher4 in four incremental 64-bit parallel accumulator streams,
  * and then combine the streams to form the final four checksum words.
  * This implementation is a derivative of the AVX SIMD implementation by
  * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).
  *
  * Copyright (C) 2016 Romain Dolbeau.
  *
  * Authors:
  *	Romain Dolbeau
  *
  * This software is available to you under a choice of one of two
  * licenses.
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/

 #include
 #include
 #include
 #include
 #include

 /*
  * Reset the superscalar context: zero all four accumulator slots of
  * ctx->superscalar.
  */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_superscalar4_init(fletcher_4_ctx_t *ctx)
 {
 	memset(ctx->superscalar, 0, 4 * sizeof (zfs_fletcher_superscalar_t));
 }

 /*
  * Combine the four streams (v[0]..v[3]) of each of the four accumulators
  * into the final checksum words A, B, C and D, using the fixed integer
  * weights below, and store them in zcp with ZIO_SET_CHECKSUM.
  */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_superscalar4_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 {
 	uint64_t A, B, C, D;

 	A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1] +
 	    ctx->superscalar[0].v[2] + ctx->superscalar[0].v[3];
 	B = 0 - ctx->superscalar[0].v[1] - 2 * ctx->superscalar[0].v[2] -
 	    3 * ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] +
 	    4 * ctx->superscalar[1].v[1] + 4 * ctx->superscalar[1].v[2] +
 	    4 * ctx->superscalar[1].v[3];

 	C = ctx->superscalar[0].v[2] + 3 * ctx->superscalar[0].v[3] -
 	    6 * ctx->superscalar[1].v[0] - 10 * ctx->superscalar[1].v[1] -
 	    14 * ctx->superscalar[1].v[2] - 18 * ctx->superscalar[1].v[3] +
 	    16 * ctx->superscalar[2].v[0] + 16 * ctx->superscalar[2].v[1] +
 	    16 * ctx->superscalar[2].v[2] + 16 * ctx->superscalar[2].v[3];

 	D = 0 - ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] +
 	    10 * ctx->superscalar[1].v[1] + 20 * ctx->superscalar[1].v[2] +
 	    34 * ctx->superscalar[1].v[3] - 48 * ctx->superscalar[2].v[0] -
 	    64 * ctx->superscalar[2].v[1] - 80 * ctx->superscalar[2].v[2] -
 	    96 * ctx->superscalar[2].v[3] + 64 * ctx->superscalar[3].v[0] +
 	    64 * ctx->superscalar[3].v[1] + 64 * ctx->superscalar[3].v[2] +
 	    64 * ctx->superscalar[3].v[3];

 	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
 }

 /*
  * Accumulate the 32-bit words of buf into ctx without byte swapping, using
  * four independent streams: word ip[k] of each group of four feeds the
  * sums kept in slot v[k]. The running sums are copied into locals, updated
  * four words per iteration, and written back to ctx.
  *
  * NOTE(review): the do/while runs its body before comparing ip with ipend,
  * so ip[0]..ip[3] are always read; presumably callers never pass
  * size == 0 -- confirm.
  */
 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_superscalar4_native(fletcher_4_ctx_t *ctx,
     const void *buf, uint64_t size)
 {
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 	uint64_t a, b, c, d;
 	uint64_t a2, b2, c2, d2;
 	uint64_t a3, b3, c3, d3;
 	uint64_t a4, b4, c4, d4;

 	a = ctx->superscalar[0].v[0];
 	b = ctx->superscalar[1].v[0];
 	c = ctx->superscalar[2].v[0];
 	d = ctx->superscalar[3].v[0];
 	a2 = ctx->superscalar[0].v[1];
 	b2 = ctx->superscalar[1].v[1];
 	c2 = ctx->superscalar[2].v[1];
 	d2 = ctx->superscalar[3].v[1];
 	a3 = ctx->superscalar[0].v[2];
 	b3 = ctx->superscalar[1].v[2];
 	c3 = ctx->superscalar[2].v[2];
 	d3 = ctx->superscalar[3].v[2];
 	a4 = ctx->superscalar[0].v[3];
 	b4 = ctx->superscalar[1].v[3];
 	c4 = ctx->superscalar[2].v[3];
 	d4 = ctx->superscalar[3].v[3];

 	do {
 		a += ip[0];
 		a2 += ip[1];
 		a3 += ip[2];
 		a4 += ip[3];
 		b += a;
 		b2 += a2;
 		b3 += a3;
 		b4 += a4;
 		c += b;
 		c2 += b2;
 		c3 += b3;
 		c4 += b4;
 		d += c;
 		d2 += c2;
 		d3 += c3;
 		d4 += c4;
 	} while ((ip += 4) < ipend);

 	ctx->superscalar[0].v[0] = a;
 	ctx->superscalar[1].v[0] = b;
 	ctx->superscalar[2].v[0] = c;
 	ctx->superscalar[3].v[0] = d;
 	ctx->superscalar[0].v[1] = a2;
 	ctx->superscalar[1].v[1] = b2;
 	ctx->superscalar[2].v[1] = c2;
 	ctx->superscalar[3].v[1] = d2;
 	ctx->superscalar[0].v[2] = a3;
 	ctx->superscalar[1].v[2] = b3;
 	ctx->superscalar[2].v[2] = c3;
 	ctx->superscalar[3].v[2] = d3;
 	ctx->superscalar[0].v[3] = a4;
 	ctx->superscalar[1].v[3] = b4;
 	ctx->superscalar[2].v[3] = c4;
 	ctx->superscalar[3].v[3] = d4;
 }

 ZFS_NO_SANITIZE_UNDEFINED
 static void
 fletcher_4_superscalar4_byteswap(fletcher_4_ctx_t *ctx,
     const void *buf, uint64_t size)
 {
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 	uint64_t a, b, c, d;
 	uint64_t a2, b2, c2, d2;
 	uint64_t a3, b3, c3, d3;
 	uint64_t a4, b4, c4, d4;

 	a = ctx->superscalar[0].v[0];
 	b = ctx->superscalar[1].v[0];
 	c = ctx->superscalar[2].v[0];
 	d = ctx->superscalar[3].v[0];
 	a2 = ctx->superscalar[0].v[1];
 	b2 = ctx->superscalar[1].v[1];
 	c2 = ctx->superscalar[2].v[1];
 	d2 = ctx->superscalar[3].v[1];
 	a3 = ctx->superscalar[0].v[2];
 	b3 = ctx->superscalar[1].v[2];
 	c3 = ctx->superscalar[2].v[2];
 	d3 = ctx->superscalar[3].v[2];
 	a4 = ctx->superscalar[0].v[3];
 	b4 = ctx->superscalar[1].v[3];
 	c4 = ctx->superscalar[2].v[3];
 	d4 = ctx->superscalar[3].v[3];

 	do {
 		a += BSWAP_32(ip[0]);
 		a2 += BSWAP_32(ip[1]);
 		a3 += BSWAP_32(ip[2]);
 		a4 += BSWAP_32(ip[3]);
 		b += a;
 		b2 += a2;
 		b3 += a3;
 		b4 += a4;
 		c += b;
 		c2 += b2;
 		c3 += b3;
 		c4 += b4;
 		d += c;
 		d2 += c2;
 		d3 += c3;
 		d4 += c4;
 	} while ((ip += 4) < ipend);

 	ctx->superscalar[0].v[0]
= a; ctx->superscalar[1].v[0] = b; ctx->superscalar[2].v[0] = c; ctx->superscalar[3].v[0] = d; ctx->superscalar[0].v[1] = a2; ctx->superscalar[1].v[1] = b2; ctx->superscalar[2].v[1] = c2; ctx->superscalar[3].v[1] = d2; ctx->superscalar[0].v[2] = a3; ctx->superscalar[1].v[2] = b3; ctx->superscalar[2].v[2] = c3; ctx->superscalar[3].v[2] = d3; ctx->superscalar[0].v[3] = a4; ctx->superscalar[1].v[3] = b4; ctx->superscalar[2].v[3] = c4; ctx->superscalar[3].v[3] = d4; } static boolean_t fletcher_4_superscalar4_valid(void) { return (B_TRUE); } const fletcher_4_ops_t fletcher_4_superscalar4_ops = { .init_native = fletcher_4_superscalar4_init, .compute_native = fletcher_4_superscalar4_native, .fini_native = fletcher_4_superscalar4_fini, .init_byteswap = fletcher_4_superscalar4_init, .compute_byteswap = fletcher_4_superscalar4_byteswap, .fini_byteswap = fletcher_4_superscalar4_fini, .valid = fletcher_4_superscalar4_valid, + .uses_fpu = B_FALSE, .name = "superscalar4" };