diff --git a/include/sys/abd.h b/include/sys/abd.h index 750f9986c1da..b48dc36423f7 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -1,226 +1,226 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2016, 2019 by Delphix. All rights reserved. */ #ifndef _ABD_H #define _ABD_H #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef enum abd_flags { ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? 
*/ ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */ ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ ABD_FLAG_GANG = 1 << 6, /* mult ABDs chained together */ ABD_FLAG_GANG_FREE = 1 << 7, /* gang ABD is responsible for mem */ ABD_FLAG_ZEROS = 1 << 8, /* ABD for zero-filled buffer */ ABD_FLAG_ALLOCD = 1 << 9, /* we allocated the abd_t */ } abd_flags_t; typedef struct abd { abd_flags_t abd_flags; uint_t abd_size; /* excludes scattered abd_offset */ list_node_t abd_gang_link; #ifdef ZFS_DEBUG struct abd *abd_parent; zfs_refcount_t abd_children; #endif kmutex_t abd_mtx; union { struct abd_scatter { uint_t abd_offset; #if defined(__FreeBSD__) && defined(_KERNEL) void *abd_chunks[1]; /* actually variable-length */ #else uint_t abd_nents; struct scatterlist *abd_sgl; #endif } abd_scatter; struct abd_linear { void *abd_buf; struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ } abd_linear; struct abd_gang { list_t abd_gang_chain; } abd_gang; } abd_u; } abd_t; typedef int abd_iter_func_t(void *buf, size_t len, void *priv); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); extern int zfs_abd_scatter_enabled; /* * Allocations and deallocations */ __attribute__((malloc)) abd_t *abd_alloc(size_t, boolean_t); __attribute__((malloc)) abd_t *abd_alloc_linear(size_t, boolean_t); __attribute__((malloc)) abd_t *abd_alloc_gang(void); __attribute__((malloc)) abd_t *abd_alloc_for_io(size_t, boolean_t); __attribute__((malloc)) abd_t *abd_alloc_sametype(abd_t *, size_t); boolean_t abd_size_alloc_linear(size_t); void abd_gang_add(abd_t *, abd_t *, boolean_t); void abd_free(abd_t *); abd_t *abd_get_offset(abd_t *, size_t); abd_t *abd_get_offset_size(abd_t *, size_t, size_t); abd_t *abd_get_offset_struct(abd_t *, abd_t *, size_t, size_t); abd_t *abd_get_zeros(size_t); abd_t *abd_get_from_buf(void *, size_t); void abd_cache_reap_now(void); /* * Conversion to and from 
a normal buffer */ void *abd_to_buf(abd_t *); void *abd_borrow_buf(abd_t *, size_t); void *abd_borrow_buf_copy(abd_t *, size_t); void abd_return_buf(abd_t *, void *, size_t); void abd_return_buf_copy(abd_t *, void *, size_t); void abd_take_ownership_of_buf(abd_t *, boolean_t); void abd_release_ownership_of_buf(abd_t *); /* * ABD operations */ int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, abd_iter_func2_t *, void *); void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); int abd_cmp(abd_t *, abd_t *); int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); void abd_zero_off(abd_t *, size_t, size_t); void abd_verify(abd_t *); -void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, - ssize_t csize, ssize_t dsize, const unsigned parity, +void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off, + size_t csize, size_t dsize, const unsigned parity, void (*func_raidz_gen)(void **, const void *, size_t, size_t)); void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, - ssize_t tsize, const unsigned parity, + size_t tsize, const unsigned parity, void (*func_raidz_rec)(void **t, const size_t tsize, void **c, const unsigned *mul), const unsigned *mul); /* * Wrappers for calls with offsets of 0 */ static inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size) { abd_copy_off(dabd, sabd, 0, 0, size); } static inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size) { abd_copy_from_buf_off(abd, buf, 0, size); } static inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size) { abd_copy_to_buf_off(buf, abd, 0, size); } static inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size) { return (abd_cmp_buf_off(abd, buf, 0, size)); } static inline void abd_zero(abd_t *abd, size_t size) { abd_zero_off(abd, 0, size); } 
/* * ABD type check functions */ static inline boolean_t abd_is_linear(abd_t *abd) { return ((abd->abd_flags & ABD_FLAG_LINEAR) ? B_TRUE : B_FALSE); } static inline boolean_t abd_is_linear_page(abd_t *abd) { return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) ? B_TRUE : B_FALSE); } static inline boolean_t abd_is_gang(abd_t *abd) { return ((abd->abd_flags & ABD_FLAG_GANG) ? B_TRUE : B_FALSE); } static inline uint_t abd_get_size(abd_t *abd) { return (abd->abd_size); } /* * Module lifecycle * Defined in each specific OS's abd_os.c */ void abd_init(void); void abd_fini(void); /* * Linux ABD bio functions */ #if defined(__linux__) && defined(_KERNEL) unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); #endif #ifdef __cplusplus } #endif #endif /* _ABD_H */ diff --git a/module/zfs/abd.c b/module/zfs/abd.c index d982f201c930..bcc6ddd5e81b 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -1,1177 +1,1173 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. */ /* * ARC buffer data (ABD). 
* * ABDs are an abstract data structure for the ARC which can use two * different ways of storing the underlying data: * * (a) Linear buffer. In this case, all the data in the ABD is stored in one * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). * * +-------------------+ * | ABD (linear) | * | abd_flags = ... | * | abd_size = ... | +--------------------------------+ * | abd_buf ------------->| raw buffer of size abd_size | * +-------------------+ +--------------------------------+ * no abd_chunks * * (b) Scattered buffer. In this case, the data in the ABD is split into * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers * to the chunks recorded in an array at the end of the ABD structure. * * +-------------------+ * | ABD (scattered) | * | abd_flags = ... | * | abd_size = ... | * | abd_offset = 0 | +-----------+ * | abd_chunks[0] ----------------------------->| chunk 0 | * | abd_chunks[1] ---------------------+ +-----------+ * | ... | | +-----------+ * | abd_chunks[N-1] ---------+ +------->| chunk 1 | * +-------------------+ | +-----------+ * | ... * | +-----------+ * +----------------->| chunk N-1 | * +-----------+ * * In addition to directly allocating a linear or scattered ABD, it is also * possible to create an ABD by requesting the "sub-ABD" starting at an offset * within an existing ABD. In linear buffers this is simple (set abd_buf of * the new ABD to the starting point within the original raw buffer), but * scattered ABDs are a little more complex. The new ABD makes a copy of the * relevant abd_chunks pointers (but not the underlying data). However, to * provide arbitrary rather than only chunk-aligned starting offsets, it also * tracks an abd_offset field which represents the starting point of the data * within the first chunk in abd_chunks. For both linear and scattered ABDs, * creating an offset ABD marks the original ABD as the offset's parent, and the * original ABD's abd_children refcount is incremented. 
This data allows us to * ensure the root ABD isn't deleted before its children. * * Most consumers should never need to know what type of ABD they're using -- * the ABD public API ensures that it's possible to transparently switch from * using a linear ABD to a scattered one when doing so would be beneficial. * * If you need to use the data within an ABD directly, if you know it's linear * (because you allocated it) you can use abd_to_buf() to access the underlying * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions * which will allocate a raw buffer if necessary. Use the abd_return_buf* * functions to return any raw buffers that are no longer necessary when you're * done using them. * * There are a variety of ABD APIs that implement basic buffer operations: * compare, copy, read, write, and fill with zeroes. If you need a custom * function which progressively accesses the whole ABD, use the abd_iterate_* * functions. * * As an additional feature, linear and scatter ABD's can be stitched together * by using the gang ABD type (abd_alloc_gang_abd()). This allows for * multiple ABDs to be viewed as a singular ABD. * * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to * B_FALSE. 
*/ #include #include #include #include #include /* see block comment above for description */ int zfs_abd_scatter_enabled = B_TRUE; void abd_verify(abd_t *abd) { #ifdef ZFS_DEBUG ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG | ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { ASSERT3U(abd->abd_size, >, 0); ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL); } else if (abd_is_gang(abd)) { uint_t child_sizes = 0; for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL; cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { ASSERT(list_link_active(&cabd->abd_gang_link)); child_sizes += cabd->abd_size; abd_verify(cabd); } ASSERT3U(abd->abd_size, ==, child_sizes); } else { ASSERT3U(abd->abd_size, >, 0); abd_verify_scatter(abd); } #endif } static void abd_init_struct(abd_t *abd) { list_link_init(&abd->abd_gang_link); mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL); abd->abd_flags = 0; #ifdef ZFS_DEBUG zfs_refcount_create(&abd->abd_children); abd->abd_parent = NULL; #endif abd->abd_size = 0; } static void abd_fini_struct(abd_t *abd) { mutex_destroy(&abd->abd_mtx); ASSERT(!list_link_active(&abd->abd_gang_link)); #ifdef ZFS_DEBUG zfs_refcount_destroy(&abd->abd_children); #endif } abd_t * abd_alloc_struct(size_t size) { abd_t *abd = abd_alloc_struct_impl(size); abd_init_struct(abd); abd->abd_flags |= ABD_FLAG_ALLOCD; return (abd); } void abd_free_struct(abd_t *abd) { abd_fini_struct(abd); abd_free_struct_impl(abd); } /* * Allocate an ABD, along with its own underlying data buffers. Use this if you * don't care whether the ABD is linear or not. 
*/ abd_t * abd_alloc(size_t size, boolean_t is_metadata) { if (abd_size_alloc_linear(size)) return (abd_alloc_linear(size, is_metadata)); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); abd_t *abd = abd_alloc_struct(size); abd->abd_flags |= ABD_FLAG_OWNER; abd->abd_u.abd_scatter.abd_offset = 0; abd_alloc_chunks(abd, size); if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } abd->abd_size = size; abd_update_scatter_stats(abd, ABDSTAT_INCR); return (abd); } /* * Allocate an ABD that must be linear, along with its own underlying data * buffer. Only use this when it would be very annoying to write your ABD * consumer with a scattered ABD. */ abd_t * abd_alloc_linear(size_t size, boolean_t is_metadata) { abd_t *abd = abd_alloc_struct(0); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_OWNER; if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } abd->abd_size = size; if (is_metadata) { ABD_LINEAR_BUF(abd) = zio_buf_alloc(size); } else { ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size); } abd_update_linear_stats(abd, ABDSTAT_INCR); return (abd); } static void abd_free_linear(abd_t *abd) { if (abd_is_linear_page(abd)) { abd_free_linear_page(abd); return; } if (abd->abd_flags & ABD_FLAG_META) { zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } else { zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } abd_update_linear_stats(abd, ABDSTAT_DECR); } static void abd_free_gang(abd_t *abd) { ASSERT(abd_is_gang(abd)); abd_t *cabd; while ((cabd = list_head(&ABD_GANG(abd).abd_gang_chain)) != NULL) { /* * We must acquire the child ABDs mutex to ensure that if it * is being added to another gang ABD we will set the link * as inactive when removing it from this gang ABD and before * adding it to the other gang ABD. 
*/ mutex_enter(&cabd->abd_mtx); ASSERT(list_link_active(&cabd->abd_gang_link)); list_remove(&ABD_GANG(abd).abd_gang_chain, cabd); mutex_exit(&cabd->abd_mtx); if (cabd->abd_flags & ABD_FLAG_GANG_FREE) abd_free(cabd); } list_destroy(&ABD_GANG(abd).abd_gang_chain); } static void abd_free_scatter(abd_t *abd) { abd_free_chunks(abd); abd_update_scatter_stats(abd, ABDSTAT_DECR); } /* * Free an ABD. Use with any kind of abd: those created with abd_alloc_*() * and abd_get_*(), including abd_get_offset_struct(). * * If the ABD was created with abd_alloc_*(), the underlying data * (scatterlist or linear buffer) will also be freed. (Subject to ownership * changes via abd_*_ownership_of_buf().) * * Unless the ABD was created with abd_get_offset_struct(), the abd_t will * also be freed. */ void abd_free(abd_t *abd) { if (abd == NULL) return; abd_verify(abd); #ifdef ZFS_DEBUG IMPLY(abd->abd_flags & ABD_FLAG_OWNER, abd->abd_parent == NULL); #endif if (abd_is_gang(abd)) { abd_free_gang(abd); } else if (abd_is_linear(abd)) { if (abd->abd_flags & ABD_FLAG_OWNER) abd_free_linear(abd); } else { if (abd->abd_flags & ABD_FLAG_OWNER) abd_free_scatter(abd); } #ifdef ZFS_DEBUG if (abd->abd_parent != NULL) { (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, abd->abd_size, abd); } #endif abd_fini_struct(abd); if (abd->abd_flags & ABD_FLAG_ALLOCD) abd_free_struct_impl(abd); } /* * Allocate an ABD of the same format (same metadata flag, same scatterize * setting) as another ABD. */ abd_t * abd_alloc_sametype(abd_t *sabd, size_t size) { boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; if (abd_is_linear(sabd) && !abd_is_linear_page(sabd)) { return (abd_alloc_linear(size, is_metadata)); } else { return (abd_alloc(size, is_metadata)); } } /* * Create gang ABD that will be the head of a list of ABD's. This is used * to "chain" scatter/gather lists together when constructing aggregated * IO's. To free this abd, abd_free() must be called. 
*/ abd_t * abd_alloc_gang(void) { abd_t *abd = abd_alloc_struct(0); abd->abd_flags |= ABD_FLAG_GANG | ABD_FLAG_OWNER; list_create(&ABD_GANG(abd).abd_gang_chain, sizeof (abd_t), offsetof(abd_t, abd_gang_link)); return (abd); } /* * Add a child gang ABD to a parent gang ABDs chained list. */ static void abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) { ASSERT(abd_is_gang(pabd)); ASSERT(abd_is_gang(cabd)); if (free_on_free) { /* * If the parent is responsible for freeing the child gang * ABD we will just splice the child's children ABD list to * the parent's list and immediately free the child gang ABD * struct. The parent gang ABDs children from the child gang * will retain all the free_on_free settings after being * added to the parents list. */ #ifdef ZFS_DEBUG /* * If cabd had abd_parent, we have to drop it here. We can't * transfer it to pabd, nor we can clear abd_size leaving it. */ if (cabd->abd_parent != NULL) { (void) zfs_refcount_remove_many( &cabd->abd_parent->abd_children, cabd->abd_size, cabd); cabd->abd_parent = NULL; } #endif pabd->abd_size += cabd->abd_size; cabd->abd_size = 0; list_move_tail(&ABD_GANG(pabd).abd_gang_chain, &ABD_GANG(cabd).abd_gang_chain); ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); abd_verify(pabd); abd_free(cabd); } else { for (abd_t *child = list_head(&ABD_GANG(cabd).abd_gang_chain); child != NULL; child = list_next(&ABD_GANG(cabd).abd_gang_chain, child)) { /* * We always pass B_FALSE for free_on_free as it is the * original child gang ABDs responsibility to determine * if any of its child ABDs should be free'd on the call * to abd_free(). */ abd_gang_add(pabd, child, B_FALSE); } abd_verify(pabd); } } /* * Add a child ABD to a gang ABD's chained list. */ void abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) { ASSERT(abd_is_gang(pabd)); abd_t *child_abd = NULL; /* * If the child being added is a gang ABD, we will add the * child's ABDs to the parent gang ABD. 
This allows us to account * for the offset correctly in the parent gang ABD. */ if (abd_is_gang(cabd)) { ASSERT(!list_link_active(&cabd->abd_gang_link)); return (abd_gang_add_gang(pabd, cabd, free_on_free)); } ASSERT(!abd_is_gang(cabd)); /* * In order to verify that an ABD is not already part of * another gang ABD, we must lock the child ABD's abd_mtx * to check its abd_gang_link status. We unlock the abd_mtx * only after it is has been added to a gang ABD, which * will update the abd_gang_link's status. See comment below * for how an ABD can be in multiple gang ABD's simultaneously. */ mutex_enter(&cabd->abd_mtx); if (list_link_active(&cabd->abd_gang_link)) { /* * If the child ABD is already part of another * gang ABD then we must allocate a new * ABD to use a separate link. We mark the newly * allocated ABD with ABD_FLAG_GANG_FREE, before * adding it to the gang ABD's list, to make the * gang ABD aware that it is responsible to call * abd_free(). We use abd_get_offset() in order * to just allocate a new ABD but avoid copying the * data over into the newly allocated ABD. * * An ABD may become part of multiple gang ABD's. For * example, when writing ditto bocks, the same ABD * is used to write 2 or 3 locations with 2 or 3 * zio_t's. Each of the zio's may be aggregated with * different adjacent zio's. zio aggregation uses gang * zio's, so the single ABD can become part of multiple * gang zio's. * * The ASSERT below is to make sure that if * free_on_free is passed as B_TRUE, the ABD can * not be in multiple gang ABD's. The gang ABD * can not be responsible for cleaning up the child * ABD memory allocation if the ABD can be in * multiple gang ABD's at one time. 
*/ ASSERT3B(free_on_free, ==, B_FALSE); child_abd = abd_get_offset(cabd, 0); child_abd->abd_flags |= ABD_FLAG_GANG_FREE; } else { child_abd = cabd; if (free_on_free) child_abd->abd_flags |= ABD_FLAG_GANG_FREE; } ASSERT3P(child_abd, !=, NULL); list_insert_tail(&ABD_GANG(pabd).abd_gang_chain, child_abd); mutex_exit(&cabd->abd_mtx); pabd->abd_size += child_abd->abd_size; } /* * Locate the ABD for the supplied offset in the gang ABD. * Return a new offset relative to the returned ABD. */ abd_t * abd_gang_get_offset(abd_t *abd, size_t *off) { abd_t *cabd; ASSERT(abd_is_gang(abd)); ASSERT3U(*off, <, abd->abd_size); for (cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL; cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { if (*off >= cabd->abd_size) *off -= cabd->abd_size; else return (cabd); } VERIFY3P(cabd, !=, NULL); return (cabd); } /* * Allocate a new ABD, using the provided struct (if non-NULL, and if * circumstances allow - otherwise allocate the struct). The returned ABD will * point to offset off of sabd. It shares the underlying buffer data with sabd. * Use abd_free() to free. sabd must not be freed while any derived ABDs exist. */ static abd_t * abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size) { abd_verify(sabd); ASSERT3U(off + size, <=, sabd->abd_size); if (abd_is_linear(sabd)) { if (abd == NULL) abd = abd_alloc_struct(0); /* * Even if this buf is filesystem metadata, we only track that * if we own the underlying data buffer, which is not true in * this case. Therefore, we don't ever use ABD_FLAG_META here. 
*/ abd->abd_flags |= ABD_FLAG_LINEAR; ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off; } else if (abd_is_gang(sabd)) { size_t left = size; if (abd == NULL) { abd = abd_alloc_gang(); } else { abd->abd_flags |= ABD_FLAG_GANG; list_create(&ABD_GANG(abd).abd_gang_chain, sizeof (abd_t), offsetof(abd_t, abd_gang_link)); } abd->abd_flags &= ~ABD_FLAG_OWNER; for (abd_t *cabd = abd_gang_get_offset(sabd, &off); cabd != NULL && left > 0; cabd = list_next(&ABD_GANG(sabd).abd_gang_chain, cabd)) { int csize = MIN(left, cabd->abd_size - off); abd_t *nabd = abd_get_offset_size(cabd, off, csize); abd_gang_add(abd, nabd, B_TRUE); left -= csize; off = 0; } ASSERT3U(left, ==, 0); } else { abd = abd_get_offset_scatter(abd, sabd, off, size); } ASSERT3P(abd, !=, NULL); abd->abd_size = size; #ifdef ZFS_DEBUG abd->abd_parent = sabd; (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); #endif return (abd); } /* * Like abd_get_offset_size(), but memory for the abd_t is provided by the * caller. Using this routine can improve performance by avoiding the cost * of allocating memory for the abd_t struct, and updating the abd stats. * Usually, the provided abd is returned, but in some circumstances (FreeBSD, * if sabd is scatter and size is more than 2 pages) a new abd_t may need to * be allocated. Therefore callers should be careful to use the returned * abd_t*. */ abd_t * abd_get_offset_struct(abd_t *abd, abd_t *sabd, size_t off, size_t size) { abd_t *result; abd_init_struct(abd); result = abd_get_offset_impl(abd, sabd, off, size); if (result != abd) abd_fini_struct(abd); return (result); } abd_t * abd_get_offset(abd_t *sabd, size_t off) { size_t size = sabd->abd_size > off ? 
sabd->abd_size - off : 0; VERIFY3U(size, >, 0); return (abd_get_offset_impl(NULL, sabd, off, size)); } abd_t * abd_get_offset_size(abd_t *sabd, size_t off, size_t size) { ASSERT3U(off + size, <=, sabd->abd_size); return (abd_get_offset_impl(NULL, sabd, off, size)); } /* * Return a size scatter ABD containing only zeros. */ abd_t * abd_get_zeros(size_t size) { ASSERT3P(abd_zero_scatter, !=, NULL); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); return (abd_get_offset_size(abd_zero_scatter, 0, size)); } /* * Allocate a linear ABD structure for buf. */ abd_t * abd_get_from_buf(void *buf, size_t size) { abd_t *abd = abd_alloc_struct(0); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); /* * Even if this buf is filesystem metadata, we only track that if we * own the underlying data buffer, which is not true in this case. * Therefore, we don't ever use ABD_FLAG_META here. */ abd->abd_flags |= ABD_FLAG_LINEAR; abd->abd_size = size; ABD_LINEAR_BUF(abd) = buf; return (abd); } /* * Get the raw buffer associated with a linear ABD. */ void * abd_to_buf(abd_t *abd) { ASSERT(abd_is_linear(abd)); abd_verify(abd); return (ABD_LINEAR_BUF(abd)); } /* * Borrow a raw buffer from an ABD without copying the contents of the ABD * into the buffer. If the ABD is scattered, this will allocate a raw buffer * whose contents are undefined. To copy over the existing data in the ABD, use * abd_borrow_buf_copy() instead. */ void * abd_borrow_buf(abd_t *abd, size_t n) { void *buf; abd_verify(abd); ASSERT3U(abd->abd_size, >=, n); if (abd_is_linear(abd)) { buf = abd_to_buf(abd); } else { buf = zio_buf_alloc(n); } #ifdef ZFS_DEBUG (void) zfs_refcount_add_many(&abd->abd_children, n, buf); #endif return (buf); } void * abd_borrow_buf_copy(abd_t *abd, size_t n) { void *buf = abd_borrow_buf(abd, n); if (!abd_is_linear(abd)) { abd_copy_to_buf(buf, abd, n); } return (buf); } /* * Return a borrowed raw buffer to an ABD. 
If the ABD is scattered, this will * not change the contents of the ABD and will ASSERT that you didn't modify * the buffer since it was borrowed. If you want any changes you made to buf to * be copied back to abd, use abd_return_buf_copy() instead. */ void abd_return_buf(abd_t *abd, void *buf, size_t n) { abd_verify(abd); ASSERT3U(abd->abd_size, >=, n); #ifdef ZFS_DEBUG (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); #endif if (abd_is_linear(abd)) { ASSERT3P(buf, ==, abd_to_buf(abd)); } else { ASSERT0(abd_cmp_buf(abd, buf, n)); zio_buf_free(buf, n); } } void abd_return_buf_copy(abd_t *abd, void *buf, size_t n) { if (!abd_is_linear(abd)) { abd_copy_from_buf(abd, buf, n); } abd_return_buf(abd, buf, n); } void abd_release_ownership_of_buf(abd_t *abd) { ASSERT(abd_is_linear(abd)); ASSERT(abd->abd_flags & ABD_FLAG_OWNER); /* * abd_free() needs to handle LINEAR_PAGE ABD's specially. * Since that flag does not survive the * abd_release_ownership_of_buf() -> abd_get_from_buf() -> * abd_take_ownership_of_buf() sequence, we don't allow releasing * these "linear but not zio_[data_]buf_alloc()'ed" ABD's. */ ASSERT(!abd_is_linear_page(abd)); abd_verify(abd); abd->abd_flags &= ~ABD_FLAG_OWNER; /* Disable this flag since we no longer own the data buffer */ abd->abd_flags &= ~ABD_FLAG_META; abd_update_linear_stats(abd, ABDSTAT_DECR); } /* * Give this ABD ownership of the buffer that it's storing. Can only be used on * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated * with abd_alloc_linear() which subsequently released ownership of their buf * with abd_release_ownership_of_buf(). 
*/ void abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) { ASSERT(abd_is_linear(abd)); ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); abd_verify(abd); abd->abd_flags |= ABD_FLAG_OWNER; if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } abd_update_linear_stats(abd, ABDSTAT_INCR); } /* * Initializes an abd_iter based on whether the abd is a gang ABD * or just a single ABD. */ static inline abd_t * abd_init_abd_iter(abd_t *abd, struct abd_iter *aiter, size_t off) { abd_t *cabd = NULL; if (abd_is_gang(abd)) { cabd = abd_gang_get_offset(abd, &off); if (cabd) { abd_iter_init(aiter, cabd); abd_iter_advance(aiter, off); } } else { abd_iter_init(aiter, abd); abd_iter_advance(aiter, off); } return (cabd); } /* * Advances an abd_iter. We have to be careful with gang ABD as * advancing could mean that we are at the end of a particular ABD and * must grab the ABD in the gang ABD's list. */ static inline abd_t * abd_advance_abd_iter(abd_t *abd, abd_t *cabd, struct abd_iter *aiter, size_t len) { abd_iter_advance(aiter, len); if (abd_is_gang(abd) && abd_iter_at_end(aiter)) { ASSERT3P(cabd, !=, NULL); cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd); if (cabd) { abd_iter_init(aiter, cabd); abd_iter_advance(aiter, 0); } } return (cabd); } int abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_iter_func_t *func, void *private) { struct abd_iter aiter; int ret = 0; if (size == 0) return (0); abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { IMPLY(abd_is_gang(abd), c_abd != NULL); abd_iter_map(&aiter); size_t len = MIN(aiter.iter_mapsize, size); ASSERT3U(len, >, 0); ret = func(aiter.iter_mapaddr, len, private); abd_iter_unmap(&aiter); if (ret != 0) break; size -= len; c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); } return (ret); } struct buf_arg { void *arg_buf; }; static int abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) { struct buf_arg *ba_ptr = private; 
(void) memcpy(ba_ptr->arg_buf, buf, size); ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; return (0); } /* * Copy abd to buf. (off is the offset in abd.) */ void abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) { struct buf_arg ba_ptr = { buf }; (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, &ba_ptr); } static int abd_cmp_buf_off_cb(void *buf, size_t size, void *private) { int ret; struct buf_arg *ba_ptr = private; ret = memcmp(buf, ba_ptr->arg_buf, size); ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; return (ret); } /* * Compare the contents of abd to buf. (off is the offset in abd.) */ int abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) { struct buf_arg ba_ptr = { (void *) buf }; return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); } static int abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) { struct buf_arg *ba_ptr = private; (void) memcpy(buf, ba_ptr->arg_buf, size); ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; return (0); } /* * Copy from buf to abd. (off is the offset in abd.) */ void abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) { struct buf_arg ba_ptr = { (void *) buf }; (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, &ba_ptr); } static int abd_zero_off_cb(void *buf, size_t size, void *private) { (void) private; (void) memset(buf, 0, size); return (0); } /* * Zero out the abd from a particular offset to the end. */ void abd_zero_off(abd_t *abd, size_t off, size_t size) { (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); } /* * Iterate over two ABDs and call func incrementally on the two ABDs' data in * equal-sized chunks (passed to func as raw buffers). func could be called many * times during this iteration. 
*/ int abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size, abd_iter_func2_t *func, void *private) { int ret = 0; struct abd_iter daiter, saiter; abd_t *c_dabd, *c_sabd; if (size == 0) return (0); abd_verify(dabd); abd_verify(sabd); ASSERT3U(doff + size, <=, dabd->abd_size); ASSERT3U(soff + size, <=, sabd->abd_size); c_dabd = abd_init_abd_iter(dabd, &daiter, doff); c_sabd = abd_init_abd_iter(sabd, &saiter, soff); while (size > 0) { IMPLY(abd_is_gang(dabd), c_dabd != NULL); IMPLY(abd_is_gang(sabd), c_sabd != NULL); abd_iter_map(&daiter); abd_iter_map(&saiter); size_t dlen = MIN(daiter.iter_mapsize, size); size_t slen = MIN(saiter.iter_mapsize, size); size_t len = MIN(dlen, slen); ASSERT(dlen > 0 || slen > 0); ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, private); abd_iter_unmap(&saiter); abd_iter_unmap(&daiter); if (ret != 0) break; size -= len; c_dabd = abd_advance_abd_iter(dabd, c_dabd, &daiter, len); c_sabd = abd_advance_abd_iter(sabd, c_sabd, &saiter, len); } return (ret); } static int abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) { (void) private; (void) memcpy(dbuf, sbuf, size); return (0); } /* * Copy from sabd to dabd starting from soff and doff. */ void abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) { (void) abd_iterate_func2(dabd, sabd, doff, soff, size, abd_copy_off_cb, NULL); } static int abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) { (void) private; return (memcmp(bufa, bufb, size)); } /* * Compares the contents of two ABDs. */ int abd_cmp(abd_t *dabd, abd_t *sabd) { ASSERT3U(dabd->abd_size, ==, sabd->abd_size); return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, abd_cmp_cb, NULL)); } /* * Iterate over code ABDs and a data ABD and call @func_raidz_gen. * * @cabds parity ABDs, must have equal size * @dabd data ABD. 
Can be NULL (in this case @dsize = 0) * @func_raidz_gen should be implemented so that its behaviour * is the same when taking linear and when taking scatter */ void -abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, - ssize_t csize, ssize_t dsize, const unsigned parity, +abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off, + size_t csize, size_t dsize, const unsigned parity, void (*func_raidz_gen)(void **, const void *, size_t, size_t)) { int i; - ssize_t len, dlen; + size_t len, dlen; struct abd_iter caiters[3]; struct abd_iter daiter; void *caddrs[3]; unsigned long flags __maybe_unused = 0; abd_t *c_cabds[3]; abd_t *c_dabd = NULL; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { abd_verify(cabds[i]); - ASSERT3U(csize, <=, cabds[i]->abd_size); - c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0); + ASSERT3U(off + csize, <=, cabds[i]->abd_size); + c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], off); } - ASSERT3S(dsize, >=, 0); if (dsize > 0) { ASSERT(dabd); abd_verify(dabd); - ASSERT3U(dsize, <=, dabd->abd_size); - c_dabd = abd_init_abd_iter(dabd, &daiter, 0); + ASSERT3U(off + dsize, <=, dabd->abd_size); + c_dabd = abd_init_abd_iter(dabd, &daiter, off); } abd_enter_critical(flags); while (csize > 0) { len = csize; for (i = 0; i < parity; i++) { IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); abd_iter_map(&caiters[i]); caddrs[i] = caiters[i].iter_mapaddr; len = MIN(caiters[i].iter_mapsize, len); } if (dsize > 0) { IMPLY(abd_is_gang(dabd), c_dabd != NULL); abd_iter_map(&daiter); len = MIN(daiter.iter_mapsize, len); dlen = len; } else dlen = 0; /* must be progressive */ - ASSERT3S(len, >, 0); + ASSERT3U(len, >, 0); /* * The iterated function likely will not do well if each * segment except the last one is not multiple of 512 (raidz). 
*/ ASSERT3U(((uint64_t)len & 511ULL), ==, 0); func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&caiters[i]); c_cabds[i] = abd_advance_abd_iter(cabds[i], c_cabds[i], &caiters[i], len); } if (dsize > 0) { abd_iter_unmap(&daiter); c_dabd = abd_advance_abd_iter(dabd, c_dabd, &daiter, dlen); dsize -= dlen; } csize -= len; - - ASSERT3S(dsize, >=, 0); - ASSERT3S(csize, >=, 0); } abd_exit_critical(flags); } /* * Iterate over code ABDs and data reconstruction target ABDs and call * @func_raidz_rec. Function maps at most 6 pages atomically. * * @cabds parity ABDs, must have equal size * @tabds rec target ABDs, at most 3 * @tsize size of data target columns * @func_raidz_rec expects syndrome data in target columns. Function * reconstructs data and overwrites target columns. */ void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, - ssize_t tsize, const unsigned parity, + size_t tsize, const unsigned parity, void (*func_raidz_rec)(void **t, const size_t tsize, void **c, const unsigned *mul), const unsigned *mul) { int i; - ssize_t len; + size_t len; struct abd_iter citers[3]; struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; unsigned long flags __maybe_unused = 0; abd_t *c_cabds[3]; abd_t *c_tabds[3]; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { abd_verify(cabds[i]); abd_verify(tabds[i]); ASSERT3U(tsize, <=, cabds[i]->abd_size); ASSERT3U(tsize, <=, tabds[i]->abd_size); c_cabds[i] = abd_init_abd_iter(cabds[i], &citers[i], 0); c_tabds[i] = abd_init_abd_iter(tabds[i], &xiters[i], 0); } abd_enter_critical(flags); while (tsize > 0) { len = tsize; for (i = 0; i < parity; i++) { IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL); abd_iter_map(&citers[i]); abd_iter_map(&xiters[i]); caddrs[i] = citers[i].iter_mapaddr; xaddrs[i] = xiters[i].iter_mapaddr; len = MIN(citers[i].iter_mapsize, len); len = MIN(xiters[i].iter_mapsize, len); } /* must be progressive */ 
ASSERT3S(len, >, 0); /* * The iterated function likely will not do well if each * segment except the last one is not multiple of 512 (raidz). */ ASSERT3U(((uint64_t)len & 511ULL), ==, 0); func_raidz_rec(xaddrs, len, caddrs, mul); for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&xiters[i]); abd_iter_unmap(&citers[i]); c_tabds[i] = abd_advance_abd_iter(tabds[i], c_tabds[i], &xiters[i], len); c_cabds[i] = abd_advance_abd_iter(cabds[i], c_cabds[i], &citers[i], len); } tsize -= len; ASSERT3S(tsize, >=, 0); } abd_exit_critical(flags); } diff --git a/module/zfs/vdev_raidz_math_impl.h b/module/zfs/vdev_raidz_math_impl.h index 8ba7e0cd769d..5d77c5d046d5 100644 --- a/module/zfs/vdev_raidz_math_impl.h +++ b/module/zfs/vdev_raidz_math_impl.h @@ -1,1502 +1,1528 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2016 Gvozden Nešković. All rights reserved. */ #ifndef _VDEV_RAIDZ_MATH_IMPL_H #define _VDEV_RAIDZ_MATH_IMPL_H #include #include #define raidz_inline inline __attribute__((always_inline)) #ifndef noinline #define noinline __attribute__((noinline)) #endif /* * Functions calculate multiplication constants for data reconstruction. 
 * Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and
 * used parity columns for reconstruction.
 * @rr		RAIDZ row
 * @tgtidx	array of missing data indexes
 * @coeff	output array of coefficients. Array must be provided by
 *		user and must hold minimum MUL_CNT values.
 */
static noinline void
raidz_rec_q_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
	const unsigned ncols = rr->rr_cols;
	const unsigned x = tgtidx[TARGET_X];

	/* Inverse of the generator power applied to column x in Q. */
	coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1));
}

static noinline void
raidz_rec_r_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
	const unsigned ncols = rr->rr_cols;
	const unsigned x = tgtidx[TARGET_X];

	/* Same as Q, but with the 4^i generator used by the R column. */
	coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1));
}

static noinline void
raidz_rec_pq_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
	const unsigned ncols = rr->rr_cols;
	const unsigned x = tgtidx[TARGET_X];
	const unsigned y = tgtidx[TARGET_Y];
	gf_t a, b, e;

	a = gf_exp2(x + 255 - y);
	b = gf_exp2(255 - (ncols - x - 1));
	e = a ^ 0x01;

	coeff[MUL_PQ_X] = gf_div(a, e);
	coeff[MUL_PQ_Y] = gf_div(b, e);
}

static noinline void
raidz_rec_pr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
	const unsigned ncols = rr->rr_cols;
	const unsigned x = tgtidx[TARGET_X];
	const unsigned y = tgtidx[TARGET_Y];
	gf_t a, b, e;

	/* Mirrors raidz_rec_pq_coeff() with the R-column 4^i generator. */
	a = gf_exp4(x + 255 - y);
	b = gf_exp4(255 - (ncols - x - 1));
	e = a ^ 0x01;

	coeff[MUL_PR_X] = gf_div(a, e);
	coeff[MUL_PR_Y] = gf_div(b, e);
}

static noinline void
raidz_rec_qr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
	const unsigned ncols = rr->rr_cols;
	const unsigned x = tgtidx[TARGET_X];
	const unsigned y = tgtidx[TARGET_Y];
	gf_t nx, ny, nxxy, nxyy, d;

	/* nx/ny are the generator powers of the two missing columns. */
	nx = gf_exp2(ncols - x - 1);
	ny = gf_exp2(ncols - y - 1);
	nxxy = gf_mul(gf_mul(nx, nx), ny);
	nxyy = gf_mul(gf_mul(nx, ny), ny);
	d = nxxy ^ nxyy;

	coeff[MUL_QR_XQ] = ny;
	coeff[MUL_QR_X] = gf_div(ny, d);
	coeff[MUL_QR_YQ] = nx;
	coeff[MUL_QR_Y] = gf_div(nx, d);
}

static noinline void
raidz_rec_pqr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
	const unsigned ncols = rr->rr_cols;
	const unsigned x = tgtidx[TARGET_X];
	const unsigned y = tgtidx[TARGET_Y];
	const unsigned z = tgtidx[TARGET_Z];
	gf_t nx, ny, nz, nxx, nyy, nzz, nyyz, nyzz, xd, yd;

	/* 2^i and 4^i generator powers of the three missing columns. */
	nx = gf_exp2(ncols - x - 1);
	ny = gf_exp2(ncols - y - 1);
	nz = gf_exp2(ncols - z - 1);
	nxx = gf_exp4(ncols - x - 1);
	nyy = gf_exp4(ncols - y - 1);
	nzz = gf_exp4(ncols - z - 1);

	nyyz = gf_mul(gf_mul(ny, nz), ny);
	nyzz = gf_mul(nzz, ny);

	/* Denominators of the 3x3 system solved for Dx, Dy, Dz. */
	xd = gf_mul(nxx, ny) ^ gf_mul(nx, nyy) ^ nyyz ^
	    gf_mul(nxx, nz) ^ gf_mul(nzz, nx) ^ nyzz;

	yd = gf_inv(ny ^ nz);

	coeff[MUL_PQR_XP] = gf_div(nyyz ^ nyzz, xd);
	coeff[MUL_PQR_XQ] = gf_div(nyy ^ nzz, xd);
	coeff[MUL_PQR_XR] = gf_div(ny ^ nz, xd);
	coeff[MUL_PQR_YU] = nx;
	coeff[MUL_PQR_YP] = gf_mul(nz, yd);
	coeff[MUL_PQR_YQ] = yd;
}

/*
 * Method for zeroing a buffer (can be implemented using SIMD).
 * This method is used by multiple gen/rec functions.
 *
 * @dc		Destination buffer
 * @dsize	Destination buffer size
 * @private	Unused
 */
static int
raidz_zero_abd_cb(void *dc, size_t dsize, void *private)
{
	v_t *dst = (v_t *)dc;
	size_t i;
	ZERO_DEFINE();

	(void) private; /* unused */

	ZERO(ZERO_D);

	/*
	 * Two strided stores per iteration; dsize is presumably always a
	 * multiple of 2 * ZERO_STRIDE * sizeof (v_t) -- TODO confirm at
	 * the call sites (raidz buffers are 512-byte aligned).
	 */
	for (i = 0; i < dsize / sizeof (v_t); i += (2 * ZERO_STRIDE)) {
		STORE(dst + i, ZERO_D);
		STORE(dst + i + ZERO_STRIDE, ZERO_D);
	}

	return (0);
}

#define	raidz_zero(dabd, size)						\
{									\
	abd_iterate_func(dabd, 0, size, raidz_zero_abd_cb, NULL);	\
}

/*
 * Method for copying two buffers (can be implemented using SIMD).
 * This method is used by multiple gen/rec functions.
* * @dc Destination buffer * @sc Source buffer * @dsize Destination buffer size * @ssize Source buffer size * @private Unused */ static int raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private) { v_t *dst = (v_t *)dc; const v_t *src = (v_t *)sc; size_t i; COPY_DEFINE(); (void) private; /* unused */ for (i = 0; i < size / sizeof (v_t); i += (2 * COPY_STRIDE)) { LOAD(src + i, COPY_D); STORE(dst + i, COPY_D); LOAD(src + i + COPY_STRIDE, COPY_D); STORE(dst + i + COPY_STRIDE, COPY_D); } return (0); } -#define raidz_copy(dabd, sabd, size) \ +#define raidz_copy(dabd, sabd, off, size) \ { \ - abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\ + abd_iterate_func2(dabd, sabd, off, off, size, raidz_copy_abd_cb, \ + NULL); \ } /* * Method for adding (XORing) two buffers. * Source and destination are XORed together and result is stored in * destination buffer. This method is used by multiple for gen/rec functions. * * @dc Destination buffer * @sc Source buffer * @dsize Destination buffer size * @ssize Source buffer size * @private Unused */ static int raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private) { v_t *dst = (v_t *)dc; const v_t *src = (v_t *)sc; size_t i; ADD_DEFINE(); (void) private; /* unused */ for (i = 0; i < size / sizeof (v_t); i += (2 * ADD_STRIDE)) { LOAD(dst + i, ADD_D); XOR_ACC(src + i, ADD_D); STORE(dst + i, ADD_D); LOAD(dst + i + ADD_STRIDE, ADD_D); XOR_ACC(src + i + ADD_STRIDE, ADD_D); STORE(dst + i + ADD_STRIDE, ADD_D); } return (0); } -#define raidz_add(dabd, sabd, size) \ +#define raidz_add(dabd, sabd, off, size) \ { \ - abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\ + abd_iterate_func2(dabd, sabd, off, off, size, raidz_add_abd_cb, \ + NULL); \ } /* * Method for multiplying a buffer with a constant in GF(2^8). * Symbols from buffer are multiplied by a constant and result is stored * back in the same buffer. * * @dc In/Out data buffer. 
* @size Size of the buffer * @private pointer to the multiplication constant (unsigned) */ static int raidz_mul_abd_cb(void *dc, size_t size, void *private) { const unsigned mul = *((unsigned *)private); v_t *d = (v_t *)dc; size_t i; MUL_DEFINE(); for (i = 0; i < size / sizeof (v_t); i += (2 * MUL_STRIDE)) { LOAD(d + i, MUL_D); MUL(mul, MUL_D); STORE(d + i, MUL_D); LOAD(d + i + MUL_STRIDE, MUL_D); MUL(mul, MUL_D); STORE(d + i + MUL_STRIDE, MUL_D); } return (0); } /* * Syndrome generation/update macros * * Require LOAD(), XOR(), STORE(), MUL2(), and MUL4() macros */ #define P_D_SYNDROME(D, T, t) \ { \ LOAD((t), T); \ XOR(D, T); \ STORE((t), T); \ } #define Q_D_SYNDROME(D, T, t) \ { \ LOAD((t), T); \ MUL2(T); \ XOR(D, T); \ STORE((t), T); \ } #define Q_SYNDROME(T, t) \ { \ LOAD((t), T); \ MUL2(T); \ STORE((t), T); \ } #define R_D_SYNDROME(D, T, t) \ { \ LOAD((t), T); \ MUL4(T); \ XOR(D, T); \ STORE((t), T); \ } #define R_SYNDROME(T, t) \ { \ LOAD((t), T); \ MUL4(T); \ STORE((t), T); \ } /* * PARITY CALCULATION * * Macros *_SYNDROME are used for parity/syndrome calculation. * *_D_SYNDROME() macros are used to calculate syndrome between 0 and * length of data column, and *_SYNDROME() macros are only for updating * the parity/syndrome if data column is shorter. * * P parity is calculated using raidz_add_abd(). + * + * For CPU L2 cache blocking we process 64KB at a time. 
*/ +#define CHUNK 65536 /* * Generate P parity (RAIDZ1) * * @rr RAIDZ row */ static raidz_inline void raidz_generate_p_impl(raidz_row_t * const rr) { size_t c; const size_t ncols = rr->rr_cols; const size_t psize = rr->rr_col[CODE_P].rc_size; abd_t *pabd = rr->rr_col[CODE_P].rc_abd; - size_t size; - abd_t *dabd; + size_t off, size; raidz_math_begin(); - /* start with first data column */ - raidz_copy(pabd, rr->rr_col[1].rc_abd, psize); + for (off = 0; off < psize; off += CHUNK) { - for (c = 2; c < ncols; c++) { - dabd = rr->rr_col[c].rc_abd; - size = rr->rr_col[c].rc_size; + /* start with first data column */ + size = MIN(CHUNK, psize - off); + raidz_copy(pabd, rr->rr_col[1].rc_abd, off, size); - /* add data column */ - raidz_add(pabd, dabd, size); + for (c = 2; c < ncols; c++) { + size = rr->rr_col[c].rc_size; + if (size <= off) + continue; + + /* add data column */ + size = MIN(CHUNK, size - off); + abd_t *dabd = rr->rr_col[c].rc_abd; + raidz_add(pabd, dabd, off, size); + } } raidz_math_end(); } /* * Generate PQ parity (RAIDZ2) * The function is called per data column. 
* * @c array of pointers to parity (code) columns * @dc pointer to data column * @csize size of parity columns * @dsize size of data column */ static void raidz_gen_pq_add(void **c, const void *dc, const size_t csize, const size_t dsize) { v_t *p = (v_t *)c[0]; v_t *q = (v_t *)c[1]; const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const qend = q + (csize / sizeof (v_t)); GEN_PQ_DEFINE(); MUL2_SETUP(); for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE, q += GEN_PQ_STRIDE) { LOAD(d, GEN_PQ_D); P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p); Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q); } for (; q < qend; q += GEN_PQ_STRIDE) { Q_SYNDROME(GEN_PQ_C, q); } } /* * Generate PQ parity (RAIDZ2) * * @rr RAIDZ row */ static raidz_inline void raidz_generate_pq_impl(raidz_row_t * const rr) { size_t c; const size_t ncols = rr->rr_cols; const size_t csize = rr->rr_col[CODE_P].rc_size; - size_t dsize; + size_t off, size, dsize; abd_t *dabd; abd_t *cabds[] = { rr->rr_col[CODE_P].rc_abd, rr->rr_col[CODE_Q].rc_abd }; raidz_math_begin(); - raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize); + for (off = 0; off < csize; off += CHUNK) { + + size = MIN(CHUNK, csize - off); + raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, off, size); + raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, off, size); - for (c = 3; c < ncols; c++) { - dabd = rr->rr_col[c].rc_abd; - dsize = rr->rr_col[c].rc_size; + for (c = 3; c < ncols; c++) { + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; + dsize = (dsize > off) ? MIN(CHUNK, dsize - off) : 0; - abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2, - raidz_gen_pq_add); + abd_raidz_gen_iterate(cabds, dabd, off, size, dsize, 2, + raidz_gen_pq_add); + } } raidz_math_end(); } /* * Generate PQR parity (RAIDZ3) * The function is called per data column. 
* * @c array of pointers to parity (code) columns * @dc pointer to data column * @csize size of parity columns * @dsize size of data column */ static void raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, const size_t dsize) { v_t *p = (v_t *)c[CODE_P]; v_t *q = (v_t *)c[CODE_Q]; v_t *r = (v_t *)c[CODE_R]; const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const qend = q + (csize / sizeof (v_t)); GEN_PQR_DEFINE(); MUL2_SETUP(); for (; d < dend; d += GEN_PQR_STRIDE, p += GEN_PQR_STRIDE, q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { LOAD(d, GEN_PQR_D); P_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, p); Q_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, q); R_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, r); } for (; q < qend; q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { Q_SYNDROME(GEN_PQR_C, q); R_SYNDROME(GEN_PQR_C, r); } } /* * Generate PQR parity (RAIDZ3) * * @rr RAIDZ row */ static raidz_inline void raidz_generate_pqr_impl(raidz_row_t * const rr) { size_t c; const size_t ncols = rr->rr_cols; const size_t csize = rr->rr_col[CODE_P].rc_size; - size_t dsize; + size_t off, size, dsize; abd_t *dabd; abd_t *cabds[] = { rr->rr_col[CODE_P].rc_abd, rr->rr_col[CODE_Q].rc_abd, rr->rr_col[CODE_R].rc_abd }; raidz_math_begin(); - raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize); + for (off = 0; off < csize; off += CHUNK) { - for (c = 4; c < ncols; c++) { - dabd = rr->rr_col[c].rc_abd; - dsize = rr->rr_col[c].rc_size; + size = MIN(CHUNK, csize - off); + raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, off, size); + raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, off, size); + raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, off, size); + + for (c = 4; c < ncols; c++) { + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; + dsize = (dsize > off) ? 
MIN(CHUNK, dsize - off) : 0; - abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3, - raidz_gen_pqr_add); + abd_raidz_gen_iterate(cabds, dabd, off, size, dsize, 3, + raidz_gen_pqr_add); + } } raidz_math_end(); } /* * DATA RECONSTRUCTION * * Data reconstruction process consists of two phases: * - Syndrome calculation * - Data reconstruction * * Syndrome is calculated by generating parity using available data columns * and zeros in places of erasure. Existing parity is added to corresponding * syndrome value to obtain the [P|Q|R]syn values from equation: * P = Psyn + Dx + Dy + Dz * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz * * For data reconstruction phase, the corresponding equations are solved * for missing data (Dx, Dy, Dz). This generally involves multiplying known * symbols by an coefficient and adding them together. The multiplication * constant coefficients are calculated ahead of the operation in * raidz_rec_[q|r|pq|pq|qr|pqr]_coeff() functions. * * IMPLEMENTATION NOTE: RAID-Z block can have complex geometry, with "big" * and "short" columns. * For this reason, reconstruction is performed in minimum of * two steps. First, from offset 0 to short_size, then from short_size to * short_size. Calculation functions REC_[*]_BLOCK() are implemented to work * over both ranges. The split also enables removal of conditional expressions * from loop bodies, improving throughput of SIMD implementations. * For the best performance, all functions marked with raidz_inline attribute * must be inlined by compiler. 
* * parity data * columns columns * <----------> <------------------> * x y <----+ missing columns (x, y) * | | * +---+---+---+---+-v-+---+-v-+---+ ^ 0 * | | | | | | | | | | * | | | | | | | | | | * | P | Q | R | D | D | D | D | D | | * | | | | 0 | 1 | 2 | 3 | 4 | | * | | | | | | | | | v * | | | | | +---+---+---+ ^ short_size * | | | | | | | * +---+---+---+---+---+ v big_size * <------------------> <----------> * big columns short columns * */ /* * Reconstruct single data column using P parity * * @syn_method raidz_add_abd() * @rec_method not applicable * * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; const size_t firstdc = rr->rr_firstdatacol; const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t xsize = rr->rr_col[x].rc_size; abd_t *xabd = rr->rr_col[x].rc_abd; - size_t size; - abd_t *dabd; + size_t off, size; if (xabd == NULL) return (1 << CODE_P); raidz_math_begin(); - /* copy P into target */ - raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize); + for (off = 0; off < xsize; off += CHUNK) { - /* generate p_syndrome */ - for (c = firstdc; c < ncols; c++) { - if (c == x) - continue; + /* copy P into target */ + size = MIN(CHUNK, xsize - off); + raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, off, size); - dabd = rr->rr_col[c].rc_abd; - size = MIN(rr->rr_col[c].rc_size, xsize); + /* generate p_syndrome */ + for (c = firstdc; c < ncols; c++) { + if (c == x) + continue; + size = rr->rr_col[c].rc_size; + if (size <= off) + continue; - raidz_add(xabd, dabd, size); + size = MIN(CHUNK, MIN(size, xsize) - off); + abd_t *dabd = rr->rr_col[c].rc_abd; + raidz_add(xabd, dabd, off, size); + } } raidz_math_end(); return (1 << CODE_P); } /* * Generate Q syndrome (Qsyn) * * @xc array of pointers to syndrome columns * @dc data column (NULL if missing) * @xsize size of syndrome columns * @dsize size of data column (0 if missing) */ static void 
raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize, const size_t dsize) { v_t *x = (v_t *)xc[TARGET_X]; const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const xend = x + (xsize / sizeof (v_t)); SYN_Q_DEFINE(); MUL2_SETUP(); for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { LOAD(d, SYN_Q_D); Q_D_SYNDROME(SYN_Q_D, SYN_Q_X, x); } for (; x < xend; x += SYN_STRIDE) { Q_SYNDROME(SYN_Q_X, x); } } /* * Reconstruct single data column using Q parity * * @syn_method raidz_add_abd() * @rec_method raidz_mul_abd_cb() * * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; const size_t firstdc = rr->rr_firstdatacol; const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; abd_t *xabd = rr->rr_col[x].rc_abd; const size_t xsize = rr->rr_col[x].rc_size; abd_t *tabds[] = { xabd }; if (xabd == NULL) return (1 << CODE_Q); unsigned coeff[MUL_CNT]; raidz_rec_q_coeff(rr, tgtidx, coeff); raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); } /* generate q_syndrome */ for (c = firstdc+1; c < ncols; c++) { if (c == x) { dabd = NULL; dsize = 0; } else { dabd = rr->rr_col[c].rc_abd; dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 1, raidz_syn_q_abd); } /* add Q to the syndrome */ - raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize); + raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, 0, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff); raidz_math_end(); return (1 << CODE_Q); } /* * Generate R syndrome (Rsyn) * * @xc array of pointers to syndrome columns * @dc data column (NULL if 
missing) * @tsize size of syndrome columns * @dsize size of data column (0 if missing) */ static void raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize, const size_t dsize) { v_t *x = (v_t *)xc[TARGET_X]; const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const xend = x + (tsize / sizeof (v_t)); SYN_R_DEFINE(); MUL2_SETUP(); for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { LOAD(d, SYN_R_D); R_D_SYNDROME(SYN_R_D, SYN_R_X, x); } for (; x < xend; x += SYN_STRIDE) { R_SYNDROME(SYN_R_X, x); } } /* * Reconstruct single data column using R parity * * @syn_method raidz_add_abd() * @rec_method raidz_mul_abd_cb() * * @rr RAIDZ rr * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; const size_t firstdc = rr->rr_firstdatacol; const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t xsize = rr->rr_col[x].rc_size; abd_t *xabd = rr->rr_col[x].rc_abd; abd_t *tabds[] = { xabd }; if (xabd == NULL) return (1 << CODE_R); unsigned coeff[MUL_CNT]; raidz_rec_r_coeff(rr, tgtidx, coeff); raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); } /* generate q_syndrome */ for (c = firstdc+1; c < ncols; c++) { if (c == x) { dabd = NULL; dsize = 0; } else { dabd = rr->rr_col[c].rc_abd; dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 1, raidz_syn_r_abd); } /* add R to the syndrome */ - raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize); + raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, 0, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff); raidz_math_end(); return (1 << CODE_R); } /* * 
Generate P and Q syndromes * * @xc array of pointers to syndrome columns * @dc data column (NULL if missing) * @tsize size of syndrome columns * @dsize size of data column (0 if missing) */ static void raidz_syn_pq_abd(void **tc, const void *dc, const size_t tsize, const size_t dsize) { v_t *x = (v_t *)tc[TARGET_X]; v_t *y = (v_t *)tc[TARGET_Y]; const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const yend = y + (tsize / sizeof (v_t)); SYN_PQ_DEFINE(); MUL2_SETUP(); for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { LOAD(d, SYN_PQ_D); P_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, x); Q_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, y); } for (; y < yend; y += SYN_STRIDE) { Q_SYNDROME(SYN_PQ_X, y); } } /* * Reconstruct data using PQ parity and PQ syndromes * * @tc syndrome/result columns * @tsize size of syndrome/result columns * @c parity columns * @mul array of multiplication constants */ static void raidz_rec_pq_abd(void **tc, const size_t tsize, void **c, const unsigned *mul) { v_t *x = (v_t *)tc[TARGET_X]; v_t *y = (v_t *)tc[TARGET_Y]; const v_t * const xend = x + (tsize / sizeof (v_t)); const v_t *p = (v_t *)c[CODE_P]; const v_t *q = (v_t *)c[CODE_Q]; REC_PQ_DEFINE(); for (; x < xend; x += REC_PQ_STRIDE, y += REC_PQ_STRIDE, p += REC_PQ_STRIDE, q += REC_PQ_STRIDE) { LOAD(x, REC_PQ_X); LOAD(y, REC_PQ_Y); XOR_ACC(p, REC_PQ_X); XOR_ACC(q, REC_PQ_Y); /* Save Pxy */ COPY(REC_PQ_X, REC_PQ_T); /* Calc X */ MUL(mul[MUL_PQ_X], REC_PQ_X); MUL(mul[MUL_PQ_Y], REC_PQ_Y); XOR(REC_PQ_Y, REC_PQ_X); STORE(x, REC_PQ_X); /* Calc Y */ XOR(REC_PQ_T, REC_PQ_X); STORE(y, REC_PQ_X); } } /* * Reconstruct two data columns using PQ parity * * @syn_method raidz_syn_pq_abd() * @rec_method raidz_rec_pq_abd() * * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; const size_t firstdc = rr->rr_firstdatacol; const size_t ncols = 
rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; const size_t xsize = rr->rr_col[x].rc_size; const size_t ysize = rr->rr_col[y].rc_size; abd_t *xabd = rr->rr_col[x].rc_abd; abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { rr->rr_col[CODE_P].rc_abd, rr->rr_col[CODE_Q].rc_abd }; if (xabd == NULL) return ((1 << CODE_P) | (1 << CODE_Q)); unsigned coeff[MUL_CNT]; raidz_rec_pq_coeff(rr, tgtidx, coeff); /* * Check if some of targets is shorter then others * In this case, shorter target needs to be replaced with * new buffer so that syndrome can be calculated. */ if (ysize < xsize) { yabd = abd_alloc(xsize, B_FALSE); tabds[1] = yabd; } raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); } /* generate q_syndrome */ for (c = firstdc+1; c < ncols; c++) { if (c == x || c == y) { dabd = NULL; dsize = 0; } else { dabd = rr->rr_col[c].rc_abd; dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2, raidz_syn_pq_abd); } abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pq_abd, coeff); /* Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize); raidz_math_end(); if (ysize < xsize) abd_free(yabd); return ((1 << CODE_P) | (1 << CODE_Q)); } /* * Generate P and R syndromes * * @xc array of pointers to syndrome columns * @dc data column (NULL if missing) * @tsize size of syndrome columns * @dsize size of data column (0 if missing) */ static void raidz_syn_pr_abd(void **c, const void *dc, const 
size_t tsize, const size_t dsize) { v_t *x = (v_t *)c[TARGET_X]; v_t *y = (v_t *)c[TARGET_Y]; const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const yend = y + (tsize / sizeof (v_t)); SYN_PR_DEFINE(); MUL2_SETUP(); for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { LOAD(d, SYN_PR_D); P_D_SYNDROME(SYN_PR_D, SYN_PR_X, x); R_D_SYNDROME(SYN_PR_D, SYN_PR_X, y); } for (; y < yend; y += SYN_STRIDE) { R_SYNDROME(SYN_PR_X, y); } } /* * Reconstruct data using PR parity and PR syndromes * * @tc syndrome/result columns * @tsize size of syndrome/result columns * @c parity columns * @mul array of multiplication constants */ static void raidz_rec_pr_abd(void **t, const size_t tsize, void **c, const unsigned *mul) { v_t *x = (v_t *)t[TARGET_X]; v_t *y = (v_t *)t[TARGET_Y]; const v_t * const xend = x + (tsize / sizeof (v_t)); const v_t *p = (v_t *)c[CODE_P]; const v_t *q = (v_t *)c[CODE_Q]; REC_PR_DEFINE(); for (; x < xend; x += REC_PR_STRIDE, y += REC_PR_STRIDE, p += REC_PR_STRIDE, q += REC_PR_STRIDE) { LOAD(x, REC_PR_X); LOAD(y, REC_PR_Y); XOR_ACC(p, REC_PR_X); XOR_ACC(q, REC_PR_Y); /* Save Pxy */ COPY(REC_PR_X, REC_PR_T); /* Calc X */ MUL(mul[MUL_PR_X], REC_PR_X); MUL(mul[MUL_PR_Y], REC_PR_Y); XOR(REC_PR_Y, REC_PR_X); STORE(x, REC_PR_X); /* Calc Y */ XOR(REC_PR_T, REC_PR_X); STORE(y, REC_PR_X); } } /* * Reconstruct two data columns using PR parity * * @syn_method raidz_syn_pr_abd() * @rec_method raidz_rec_pr_abd() * * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; const size_t firstdc = rr->rr_firstdatacol; const size_t ncols = rr->rr_cols; const size_t x = tgtidx[0]; const size_t y = tgtidx[1]; const size_t xsize = rr->rr_col[x].rc_size; const size_t ysize = rr->rr_col[y].rc_size; abd_t *xabd = rr->rr_col[x].rc_abd; abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, 
yabd }; abd_t *cabds[] = { rr->rr_col[CODE_P].rc_abd, rr->rr_col[CODE_R].rc_abd }; if (xabd == NULL) return ((1 << CODE_P) | (1 << CODE_R)); unsigned coeff[MUL_CNT]; raidz_rec_pr_coeff(rr, tgtidx, coeff); /* * Check if some of targets are shorter then others. * They need to be replaced with a new buffer so that syndrome can * be calculated on full length. */ if (ysize < xsize) { yabd = abd_alloc(xsize, B_FALSE); tabds[1] = yabd; } raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); } /* generate q_syndrome */ for (c = firstdc+1; c < ncols; c++) { if (c == x || c == y) { dabd = NULL; dsize = 0; } else { dabd = rr->rr_col[c].rc_abd; dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2, raidz_syn_pr_abd); } abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pr_abd, coeff); /* * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize); raidz_math_end(); if (ysize < xsize) abd_free(yabd); return ((1 << CODE_P) | (1 << CODE_R)); } /* * Generate Q and R syndromes * * @xc array of pointers to syndrome columns * @dc data column (NULL if missing) * @tsize size of syndrome columns * @dsize size of data column (0 if missing) */ static void raidz_syn_qr_abd(void **c, const void *dc, const size_t tsize, const size_t dsize) { v_t *x = (v_t *)c[TARGET_X]; v_t *y = (v_t *)c[TARGET_Y]; const v_t * const xend = x + (tsize / sizeof (v_t)); const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); SYN_QR_DEFINE(); MUL2_SETUP(); for (; d 
< dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { LOAD(d, SYN_PQ_D); Q_D_SYNDROME(SYN_QR_D, SYN_QR_X, x); R_D_SYNDROME(SYN_QR_D, SYN_QR_X, y); } for (; x < xend; x += SYN_STRIDE, y += SYN_STRIDE) { Q_SYNDROME(SYN_QR_X, x); R_SYNDROME(SYN_QR_X, y); } } /* * Reconstruct data using QR parity and QR syndromes * * @tc syndrome/result columns * @tsize size of syndrome/result columns * @c parity columns * @mul array of multiplication constants */ static void raidz_rec_qr_abd(void **t, const size_t tsize, void **c, const unsigned *mul) { v_t *x = (v_t *)t[TARGET_X]; v_t *y = (v_t *)t[TARGET_Y]; const v_t * const xend = x + (tsize / sizeof (v_t)); const v_t *p = (v_t *)c[CODE_P]; const v_t *q = (v_t *)c[CODE_Q]; REC_QR_DEFINE(); for (; x < xend; x += REC_QR_STRIDE, y += REC_QR_STRIDE, p += REC_QR_STRIDE, q += REC_QR_STRIDE) { LOAD(x, REC_QR_X); LOAD(y, REC_QR_Y); XOR_ACC(p, REC_QR_X); XOR_ACC(q, REC_QR_Y); /* Save Pxy */ COPY(REC_QR_X, REC_QR_T); /* Calc X */ MUL(mul[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */ XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */ MUL(mul[MUL_QR_X], REC_QR_X); /* X = X * xm */ STORE(x, REC_QR_X); /* Calc Y */ MUL(mul[MUL_QR_YQ], REC_QR_T); /* X = Q * xqm */ XOR(REC_QR_Y, REC_QR_T); /* X = R ^ X */ MUL(mul[MUL_QR_Y], REC_QR_T); /* X = X * xm */ STORE(y, REC_QR_T); } } /* * Reconstruct two data columns using QR parity * * @syn_method raidz_syn_qr_abd() * @rec_method raidz_rec_qr_abd() * * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; const size_t firstdc = rr->rr_firstdatacol; const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; const size_t xsize = rr->rr_col[x].rc_size; const size_t ysize = rr->rr_col[y].rc_size; abd_t *xabd = rr->rr_col[x].rc_abd; abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { rr->rr_col[CODE_Q].rc_abd, 
rr->rr_col[CODE_R].rc_abd }; if (xabd == NULL) return ((1 << CODE_Q) | (1 << CODE_R)); unsigned coeff[MUL_CNT]; raidz_rec_qr_coeff(rr, tgtidx, coeff); /* * Check if some of targets is shorter then others * In this case, shorter target needs to be replaced with * new buffer so that syndrome can be calculated. */ if (ysize < xsize) { yabd = abd_alloc(xsize, B_FALSE); tabds[1] = yabd; } raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); } /* generate q_syndrome */ for (c = firstdc+1; c < ncols; c++) { if (c == x || c == y) { dabd = NULL; dsize = 0; } else { dabd = rr->rr_col[c].rc_abd; dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2, raidz_syn_qr_abd); } abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_qr_abd, coeff); /* * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize); raidz_math_end(); if (ysize < xsize) abd_free(yabd); return ((1 << CODE_Q) | (1 << CODE_R)); } /* * Generate P, Q, and R syndromes * * @xc array of pointers to syndrome columns * @dc data column (NULL if missing) * @tsize size of syndrome columns * @dsize size of data column (0 if missing) */ static void raidz_syn_pqr_abd(void **c, const void *dc, const size_t tsize, const size_t dsize) { v_t *x = (v_t *)c[TARGET_X]; v_t *y = (v_t *)c[TARGET_Y]; v_t *z = (v_t *)c[TARGET_Z]; const v_t * const yend = y + (tsize / sizeof (v_t)); const v_t *d = (const v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); SYN_PQR_DEFINE(); MUL2_SETUP(); for (; d < dend; d += 
SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE, z += SYN_STRIDE) { LOAD(d, SYN_PQR_D); P_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, x) Q_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, y); R_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, z); } for (; y < yend; y += SYN_STRIDE, z += SYN_STRIDE) { Q_SYNDROME(SYN_PQR_X, y); R_SYNDROME(SYN_PQR_X, z); } } /* * Reconstruct data using PRQ parity and PQR syndromes * * @tc syndrome/result columns * @tsize size of syndrome/result columns * @c parity columns * @mul array of multiplication constants */ static void raidz_rec_pqr_abd(void **t, const size_t tsize, void **c, const unsigned * const mul) { v_t *x = (v_t *)t[TARGET_X]; v_t *y = (v_t *)t[TARGET_Y]; v_t *z = (v_t *)t[TARGET_Z]; const v_t * const xend = x + (tsize / sizeof (v_t)); const v_t *p = (v_t *)c[CODE_P]; const v_t *q = (v_t *)c[CODE_Q]; const v_t *r = (v_t *)c[CODE_R]; REC_PQR_DEFINE(); for (; x < xend; x += REC_PQR_STRIDE, y += REC_PQR_STRIDE, z += REC_PQR_STRIDE, p += REC_PQR_STRIDE, q += REC_PQR_STRIDE, r += REC_PQR_STRIDE) { LOAD(x, REC_PQR_X); LOAD(y, REC_PQR_Y); LOAD(z, REC_PQR_Z); XOR_ACC(p, REC_PQR_X); XOR_ACC(q, REC_PQR_Y); XOR_ACC(r, REC_PQR_Z); /* Save Pxyz and Qxyz */ COPY(REC_PQR_X, REC_PQR_XS); COPY(REC_PQR_Y, REC_PQR_YS); /* Calc X */ MUL(mul[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */ MUL(mul[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */ XOR(REC_PQR_Y, REC_PQR_X); MUL(mul[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */ XOR(REC_PQR_Z, REC_PQR_X); /* X = Xp + Xq + Xr */ STORE(x, REC_PQR_X); /* Calc Y */ XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */ MUL(mul[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */ XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */ COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */ MUL(mul[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */ MUL(mul[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */ XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */ STORE(y, REC_PQR_YS); /* Calc Z */ XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */ STORE(z, REC_PQR_YS); } } /* * Reconstruct three data 
columns using PQR parity * * @syn_method raidz_syn_pqr_abd() * @rec_method raidz_rec_pqr_abd() * * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; const size_t firstdc = rr->rr_firstdatacol; const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; const size_t z = tgtidx[TARGET_Z]; const size_t xsize = rr->rr_col[x].rc_size; const size_t ysize = rr->rr_col[y].rc_size; const size_t zsize = rr->rr_col[z].rc_size; abd_t *xabd = rr->rr_col[x].rc_abd; abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *zabd = rr->rr_col[z].rc_abd; abd_t *tabds[] = { xabd, yabd, zabd }; abd_t *cabds[] = { rr->rr_col[CODE_P].rc_abd, rr->rr_col[CODE_Q].rc_abd, rr->rr_col[CODE_R].rc_abd }; if (xabd == NULL) return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R)); unsigned coeff[MUL_CNT]; raidz_rec_pqr_coeff(rr, tgtidx, coeff); /* * Check if some of targets is shorter then others * In this case, shorter target needs to be replaced with * new buffer so that syndrome can be calculated. 
*/ if (ysize < xsize) { yabd = abd_alloc(xsize, B_FALSE); tabds[1] = yabd; } if (zsize < xsize) { zabd = abd_alloc(xsize, B_FALSE); tabds[2] = zabd; } raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); - raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize); + raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, 0, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); raidz_zero(zabd, xsize); } /* generate q_syndrome */ for (c = firstdc+1; c < ncols; c++) { if (c == x || c == y || c == z) { dabd = NULL; dsize = 0; } else { dabd = rr->rr_col[c].rc_abd; dsize = rr->rr_col[c].rc_size; } - abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3, + abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 3, raidz_syn_pqr_abd); } abd_raidz_rec_iterate(cabds, tabds, xsize, 3, raidz_rec_pqr_abd, coeff); /* * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize); if (zsize < xsize) - raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize); + raidz_copy(rr->rr_col[z].rc_abd, zabd, 0, zsize); raidz_math_end(); if (ysize < xsize) abd_free(yabd); if (zsize < xsize) abd_free(zabd); return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R)); } #endif /* _VDEV_RAIDZ_MATH_IMPL_H */