diff --git a/include/os/linux/spl/sys/atomic.h b/include/os/linux/spl/sys/atomic.h
index 2d21cbb3e140..8f7fa5aeda11 100644
--- a/include/os/linux/spl/sys/atomic.h
+++ b/include/os/linux/spl/sys/atomic.h
@@ -1,78 +1,82 @@
 /*
  * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  * Copyright (C) 2007 The Regents of the University of California.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  * UCRL-CODE-235197
  *
  * This file is part of the SPL, Solaris Porting Layer.
  *
  * The SPL is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
  * Free Software Foundation; either version 2 of the License, or (at your
  * option) any later version.
  *
  * The SPL is distributed in the hope that it will be useful, but WITHOUT
  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  * for more details.
  *
  * You should have received a copy of the GNU General Public License along
  * with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  */

 #ifndef _SPL_ATOMIC_H
 #define	_SPL_ATOMIC_H

 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <sys/types.h>

 /*
  * Map the atomic_* functions to the Linux counterparts.  This relies on the
  * fact that the atomic types are internally really a uint32 or uint64.  If
  * this were to change an alternate approach would be needed.
  *
  * N.B. Due to the limitations of the original API atomicity is not strictly
  * preserved when using the 64-bit functions on a 32-bit system.  In order
  * to support this all consumers would need to be updated to use the Linux
  * provided atomic_t and atomic64_t types.
  */
 #define	atomic_inc_32(v)	atomic_inc((atomic_t *)(v))
 #define	atomic_dec_32(v)	atomic_dec((atomic_t *)(v))
 #define	atomic_add_32(v, i)	atomic_add((i), (atomic_t *)(v))
 #define	atomic_sub_32(v, i)	atomic_sub((i), (atomic_t *)(v))
 #define	atomic_inc_32_nv(v)	atomic_inc_return((atomic_t *)(v))
 #define	atomic_dec_32_nv(v)	atomic_dec_return((atomic_t *)(v))
 #define	atomic_add_32_nv(v, i)	atomic_add_return((i), (atomic_t *)(v))
 #define	atomic_sub_32_nv(v, i)	atomic_sub_return((i), (atomic_t *)(v))
 #define	atomic_cas_32(v, x, y)	atomic_cmpxchg((atomic_t *)(v), x, y)
 #define	atomic_swap_32(v, x)	atomic_xchg((atomic_t *)(v), x)
+#define	atomic_load_32(v)	atomic_read((atomic_t *)(v))
+#define	atomic_store_32(v, x)	atomic_set((atomic_t *)(v), x)

 #define	atomic_inc_64(v)	atomic64_inc((atomic64_t *)(v))
 #define	atomic_dec_64(v)	atomic64_dec((atomic64_t *)(v))
 #define	atomic_add_64(v, i)	atomic64_add((i), (atomic64_t *)(v))
 #define	atomic_sub_64(v, i)	atomic64_sub((i), (atomic64_t *)(v))
 #define	atomic_inc_64_nv(v)	atomic64_inc_return((atomic64_t *)(v))
 #define	atomic_dec_64_nv(v)	atomic64_dec_return((atomic64_t *)(v))
 #define	atomic_add_64_nv(v, i)	atomic64_add_return((i), (atomic64_t *)(v))
 #define	atomic_sub_64_nv(v, i)	atomic64_sub_return((i), (atomic64_t *)(v))
 #define	atomic_cas_64(v, x, y)	atomic64_cmpxchg((atomic64_t *)(v), x, y)
 #define	atomic_swap_64(v, x)	atomic64_xchg((atomic64_t *)(v), x)
+#define	atomic_load_64(v)	atomic64_read((atomic64_t *)(v))
+#define	atomic_store_64(v, x)	atomic64_set((atomic64_t *)(v), x)

 #ifdef _LP64
 static __inline__ void *
 atomic_cas_ptr(volatile void *target, void *cmp, void *newval)
 {
 	return ((void *)atomic_cas_64((volatile uint64_t *)target,
 	    (uint64_t)cmp, (uint64_t)newval));
 }
 #else /* _LP64 */
 static __inline__ void *
 atomic_cas_ptr(volatile void *target, void *cmp, void *newval)
 {
 	return ((void *)atomic_cas_32((volatile uint32_t *)target,
 	    (uint32_t)cmp, (uint32_t)newval));
 }
 #endif /* _LP64 */

 #endif /* _SPL_ATOMIC_H */
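Because these wrappers only cast between the Solaris-style integer pointers and Linux's atomic_t/atomic64_t, the two new macros map straight onto atomic_read()/atomic_set(), which on 64-bit kernels compile down to plain but tear-free loads and stores. A minimal sketch of consumer-side usage follows; the example_stat name and helpers are illustrative, not part of the patch:

#include <sys/atomic.h>

static uint64_t example_stat;	/* hypothetical 64-bit statistic */

static uint64_t
example_stat_read(void)
{
	/* Expands to atomic64_read((atomic64_t *)&example_stat). */
	return (atomic_load_64(&example_stat));
}

static void
example_stat_reset(uint64_t v)
{
	/* Expands to atomic64_set((atomic64_t *)&example_stat, v). */
	atomic_store_64(&example_stat, v);
}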
diff --git a/include/sys/aggsum.h b/include/sys/aggsum.h
index cb43727f1df4..65800058cbf6 100644
--- a/include/sys/aggsum.h
+++ b/include/sys/aggsum.h
@@ -1,59 +1,60 @@
 /*
  * CDDL HEADER START
  *
  * This file and its contents are supplied under the terms of the
  * Common Development and Distribution License ("CDDL"), version 1.0.
  * You may only use this file in accordance with the terms of version
  * 1.0 of the CDDL.
  *
  * A full copy of the text of the CDDL should have accompanied this
  * source.  A copy of the CDDL is also available via the Internet at
  * http://www.illumos.org/license/CDDL.
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2017 by Delphix. All rights reserved.
  */

 #ifndef _SYS_AGGSUM_H
 #define	_SYS_AGGSUM_H

 #include <sys/zfs_context.h>

 #ifdef __cplusplus
 extern "C" {
 #endif

 typedef struct aggsum_bucket aggsum_bucket_t;

 struct aggsum_bucket {
 	kmutex_t asc_lock;
 	int64_t asc_delta;
 	uint64_t asc_borrowed;
 } ____cacheline_aligned;

 /*
  * Fan out over FANOUT cpus.
  */
 typedef struct aggsum {
 	kmutex_t as_lock;
 	int64_t as_lower_bound;
-	int64_t as_upper_bound;
+	uint64_t as_upper_bound;
+	aggsum_bucket_t *as_buckets ____cacheline_aligned;
 	uint_t as_numbuckets;
-	aggsum_bucket_t *as_buckets;
+	uint_t as_bucketshift;
 } aggsum_t;

 void aggsum_init(aggsum_t *, uint64_t);
 void aggsum_fini(aggsum_t *);
 int64_t aggsum_lower_bound(aggsum_t *);
-int64_t aggsum_upper_bound(aggsum_t *);
+uint64_t aggsum_upper_bound(aggsum_t *);
 int aggsum_compare(aggsum_t *, uint64_t);
 uint64_t aggsum_value(aggsum_t *);
 void aggsum_add(aggsum_t *, int64_t);

 #ifdef __cplusplus
 }
 #endif

 #endif /* _SYS_AGGSUM_H */
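For orientation, here is a sketch of how a consumer drives the interface declared above. The example_* names are hypothetical (the ARC's size accounting is a real in-tree consumer of aggsum_t):

#include <sys/aggsum.h>

static aggsum_t example_size;		/* hypothetical counter */

void
example_setup(void)
{
	aggsum_init(&example_size, 0);		/* start at zero */
}

void
example_adjust(int64_t nbytes)
{
	aggsum_add(&example_size, nbytes);	/* negative deltas shrink */
}

int
example_over_limit(uint64_t limit)
{
	/* Precise but costly: flushes buckets until it can decide. */
	return (aggsum_compare(&example_size, limit) > 0);
}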
diff --git a/lib/libspl/asm-generic/atomic.c b/lib/libspl/asm-generic/atomic.c
index 35535ea49c79..504422b8e226 100644
--- a/lib/libspl/asm-generic/atomic.c
+++ b/lib/libspl/asm-generic/atomic.c
@@ -1,450 +1,463 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License, Version 1.0 only
  * (the "License").  You may not use this file except in compliance
  * with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */

 /*
  * Copyright (c) 2009 by Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */

 #include <atomic.h>
 #include <assert.h>
 #include <pthread.h>

 /*
  * All operations are implemented by serializing them through a global
  * pthread mutex.  This provides a correct generic implementation.
  * However all supported architectures are encouraged to provide a
  * native implementation in assembly for performance reasons.
  */
 pthread_mutex_t atomic_lock = PTHREAD_MUTEX_INITIALIZER;

 /*
  * These are the void returning variants
  */
 /* BEGIN CSTYLED */
 #define	ATOMIC_INC(name, type) \
 	void atomic_inc_##name(volatile type *target) \
 	{ \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		(*target)++; \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 	}

 ATOMIC_INC(8, uint8_t)
 ATOMIC_INC(uchar, uchar_t)
 ATOMIC_INC(16, uint16_t)
 ATOMIC_INC(ushort, ushort_t)
 ATOMIC_INC(32, uint32_t)
 ATOMIC_INC(uint, uint_t)
 ATOMIC_INC(ulong, ulong_t)
 ATOMIC_INC(64, uint64_t)

 #define	ATOMIC_DEC(name, type) \
 	void atomic_dec_##name(volatile type *target) \
 	{ \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		(*target)--; \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 	}

 ATOMIC_DEC(8, uint8_t)
 ATOMIC_DEC(uchar, uchar_t)
 ATOMIC_DEC(16, uint16_t)
 ATOMIC_DEC(ushort, ushort_t)
 ATOMIC_DEC(32, uint32_t)
 ATOMIC_DEC(uint, uint_t)
 ATOMIC_DEC(ulong, ulong_t)
 ATOMIC_DEC(64, uint64_t)

 #define	ATOMIC_ADD(name, type1, type2) \
 	void atomic_add_##name(volatile type1 *target, type2 bits) \
 	{ \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		*target += bits; \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 	}

 ATOMIC_ADD(8, uint8_t, int8_t)
 ATOMIC_ADD(char, uchar_t, signed char)
 ATOMIC_ADD(16, uint16_t, int16_t)
 ATOMIC_ADD(short, ushort_t, short)
 ATOMIC_ADD(32, uint32_t, int32_t)
 ATOMIC_ADD(int, uint_t, int)
 ATOMIC_ADD(long, ulong_t, long)
 ATOMIC_ADD(64, uint64_t, int64_t)

 void
 atomic_add_ptr(volatile void *target, ssize_t bits)
 {
 	VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);
 	*(caddr_t *)target += bits;
 	VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);
 }

 #define	ATOMIC_SUB(name, type1, type2) \
 	void atomic_sub_##name(volatile type1 *target, type2 bits) \
 	{ \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		*target -= bits; \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 	}

 ATOMIC_SUB(8, uint8_t, int8_t)
 ATOMIC_SUB(char, uchar_t, signed char)
 ATOMIC_SUB(16, uint16_t, int16_t)
 ATOMIC_SUB(short, ushort_t, short)
 ATOMIC_SUB(32, uint32_t, int32_t)
 ATOMIC_SUB(int, uint_t, int)
 ATOMIC_SUB(long, ulong_t, long)
 ATOMIC_SUB(64, uint64_t, int64_t)

 void
 atomic_sub_ptr(volatile void *target, ssize_t bits)
 {
 	VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);
 	*(caddr_t *)target -= bits;
 	VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);
 }

 #define	ATOMIC_OR(name, type) \
 	void atomic_or_##name(volatile type *target, type bits) \
 	{ \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		*target |= bits; \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 	}

 ATOMIC_OR(8, uint8_t)
 ATOMIC_OR(uchar, uchar_t)
 ATOMIC_OR(16, uint16_t)
 ATOMIC_OR(ushort, ushort_t)
 ATOMIC_OR(32, uint32_t)
 ATOMIC_OR(uint, uint_t)
 ATOMIC_OR(ulong, ulong_t)
 ATOMIC_OR(64, uint64_t)

 #define	ATOMIC_AND(name, type) \
 	void atomic_and_##name(volatile type *target, type bits) \
 	{ \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		*target &= bits; \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 	}

 ATOMIC_AND(8, uint8_t)
 ATOMIC_AND(uchar, uchar_t)
 ATOMIC_AND(16, uint16_t)
 ATOMIC_AND(ushort, ushort_t)
 ATOMIC_AND(32, uint32_t)
 ATOMIC_AND(uint, uint_t)
 ATOMIC_AND(ulong, ulong_t)
 ATOMIC_AND(64, uint64_t)

 /*
  * New value returning variants
  */

 #define	ATOMIC_INC_NV(name, type) \
 	type atomic_inc_##name##_nv(volatile type *target) \
 	{ \
 		type rc; \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		rc = (++(*target)); \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 		return (rc); \
 	}

 ATOMIC_INC_NV(8, uint8_t)
 ATOMIC_INC_NV(uchar, uchar_t)
 ATOMIC_INC_NV(16, uint16_t)
 ATOMIC_INC_NV(ushort, ushort_t)
 ATOMIC_INC_NV(32, uint32_t)
 ATOMIC_INC_NV(uint, uint_t)
 ATOMIC_INC_NV(ulong, ulong_t)
 ATOMIC_INC_NV(64, uint64_t)

 #define	ATOMIC_DEC_NV(name, type) \
 	type atomic_dec_##name##_nv(volatile type *target) \
 	{ \
 		type rc; \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		rc = (--(*target)); \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 		return (rc); \
 	}

 ATOMIC_DEC_NV(8, uint8_t)
 ATOMIC_DEC_NV(uchar, uchar_t)
 ATOMIC_DEC_NV(16, uint16_t)
 ATOMIC_DEC_NV(ushort, ushort_t)
 ATOMIC_DEC_NV(32, uint32_t)
 ATOMIC_DEC_NV(uint, uint_t)
 ATOMIC_DEC_NV(ulong, ulong_t)
 ATOMIC_DEC_NV(64, uint64_t)

 #define	ATOMIC_ADD_NV(name, type1, type2) \
 	type1 atomic_add_##name##_nv(volatile type1 *target, type2 bits)\
 	{ \
 		type1 rc; \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		rc = (*target += bits); \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 		return (rc); \
 	}

 ATOMIC_ADD_NV(8, uint8_t, int8_t)
 ATOMIC_ADD_NV(char, uchar_t, signed char)
 ATOMIC_ADD_NV(16, uint16_t, int16_t)
 ATOMIC_ADD_NV(short, ushort_t, short)
 ATOMIC_ADD_NV(32, uint32_t, int32_t)
 ATOMIC_ADD_NV(int, uint_t, int)
 ATOMIC_ADD_NV(long, ulong_t, long)
 ATOMIC_ADD_NV(64, uint64_t, int64_t)

 void *
 atomic_add_ptr_nv(volatile void *target, ssize_t bits)
 {
 	void *ptr;

 	VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);
 	ptr = (*(caddr_t *)target += bits);
 	VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);

 	return (ptr);
 }

 #define	ATOMIC_SUB_NV(name, type1, type2) \
 	type1 atomic_sub_##name##_nv(volatile type1 *target, type2 bits)\
 	{ \
 		type1 rc; \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		rc = (*target -= bits); \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 		return (rc); \
 	}

 ATOMIC_SUB_NV(8, uint8_t, int8_t)
 ATOMIC_SUB_NV(char, uchar_t, signed char)
 ATOMIC_SUB_NV(16, uint16_t, int16_t)
 ATOMIC_SUB_NV(short, ushort_t, short)
 ATOMIC_SUB_NV(32, uint32_t, int32_t)
 ATOMIC_SUB_NV(int, uint_t, int)
 ATOMIC_SUB_NV(long, ulong_t, long)
 ATOMIC_SUB_NV(64, uint64_t, int64_t)

 void *
 atomic_sub_ptr_nv(volatile void *target, ssize_t bits)
 {
 	void *ptr;

 	VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);
 	ptr = (*(caddr_t *)target -= bits);
 	VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);

 	return (ptr);
 }

 #define	ATOMIC_OR_NV(name, type) \
 	type atomic_or_##name##_nv(volatile type *target, type bits) \
 	{ \
 		type rc; \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		rc = (*target |= bits); \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 		return (rc); \
 	}

 ATOMIC_OR_NV(8, uint8_t)
 ATOMIC_OR_NV(uchar, uchar_t)
 ATOMIC_OR_NV(16, uint16_t)
 ATOMIC_OR_NV(ushort, ushort_t)
 ATOMIC_OR_NV(32, uint32_t)
 ATOMIC_OR_NV(uint, uint_t)
 ATOMIC_OR_NV(ulong, ulong_t)
 ATOMIC_OR_NV(64, uint64_t)

 #define	ATOMIC_AND_NV(name, type) \
 	type atomic_and_##name##_nv(volatile type *target, type bits) \
 	{ \
 		type rc; \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		rc = (*target &= bits); \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 		return (rc); \
 	}

 ATOMIC_AND_NV(8, uint8_t)
 ATOMIC_AND_NV(uchar, uchar_t)
 ATOMIC_AND_NV(16, uint16_t)
 ATOMIC_AND_NV(ushort, ushort_t)
 ATOMIC_AND_NV(32, uint32_t)
 ATOMIC_AND_NV(uint, uint_t)
 ATOMIC_AND_NV(ulong, ulong_t)
 ATOMIC_AND_NV(64, uint64_t)

 /*
  * If *arg1 == arg2, set *arg1 = arg3; return old value
  */

 #define	ATOMIC_CAS(name, type) \
 	type atomic_cas_##name(volatile type *target, type arg1, type arg2) \
 	{ \
 		type old; \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		old = *target; \
 		if (old == arg1) \
 			*target = arg2; \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 		return (old); \
 	}

 ATOMIC_CAS(8, uint8_t)
 ATOMIC_CAS(uchar, uchar_t)
 ATOMIC_CAS(16, uint16_t)
 ATOMIC_CAS(ushort, ushort_t)
 ATOMIC_CAS(32, uint32_t)
 ATOMIC_CAS(uint, uint_t)
 ATOMIC_CAS(ulong, ulong_t)
 ATOMIC_CAS(64, uint64_t)

 void *
 atomic_cas_ptr(volatile void *target, void *arg1, void *arg2)
 {
 	void *old;

 	VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);
 	old = *(void **)target;
 	if (old == arg1)
 		*(void **)target = arg2;
 	VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);

 	return (old);
 }

 /*
  * Swap target and return old value
  */
 #define	ATOMIC_SWAP(name, type) \
 	type atomic_swap_##name(volatile type *target, type bits) \
 	{ \
 		type old; \
 		VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \
 		old = *target; \
 		*target = bits; \
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \
 		return (old); \
 	}

 ATOMIC_SWAP(8, uint8_t)
 ATOMIC_SWAP(uchar, uchar_t)
 ATOMIC_SWAP(16, uint16_t)
 ATOMIC_SWAP(ushort, ushort_t)
 ATOMIC_SWAP(32, uint32_t)
 ATOMIC_SWAP(uint, uint_t)
 ATOMIC_SWAP(ulong, ulong_t)
 ATOMIC_SWAP(64, uint64_t)
 /* END CSTYLED */

 void *
 atomic_swap_ptr(volatile void *target, void *bits)
 {
 	void *old;

 	VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);
 	old = *(void **)target;
 	*(void **)target = bits;
 	VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);

 	return (old);
 }

+#ifndef _LP64
+uint64_t
+atomic_load_64(volatile uint64_t *target)
+{
+	return (__atomic_load_n(target, __ATOMIC_RELAXED));
+}
+
+void
+atomic_store_64(volatile uint64_t *target, uint64_t bits)
+{
+	return (__atomic_store_n(target, bits, __ATOMIC_RELAXED));
+}
+#endif

 int
 atomic_set_long_excl(volatile ulong_t *target, uint_t value)
 {
 	ulong_t bit;

 	VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);
 	bit = (1UL << value);
 	if ((*target & bit) != 0) {
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);
 		return (-1);
 	}
 	*target |= bit;
 	VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);

 	return (0);
 }

 int
 atomic_clear_long_excl(volatile ulong_t *target, uint_t value)
 {
 	ulong_t bit;

 	VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0);
 	bit = (1UL << value);
 	if ((*target & bit) == 0) {
 		VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);
 		return (-1);
 	}
 	*target &= ~bit;
 	VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0);

 	return (0);
 }

 void
 membar_enter(void)
 {
 	/* XXX - Implement me */
 }

 void
 membar_exit(void)
 {
 	/* XXX - Implement me */
 }

 void
 membar_producer(void)
 {
 	/* XXX - Implement me */
 }

 void
 membar_consumer(void)
 {
 	/* XXX - Implement me */
 }
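The new #ifndef _LP64 functions above are the interesting part: instead of taking the global mutex, they lean on the compiler's __atomic builtins with relaxed ordering, which guarantee a single indivisible 64-bit access even on 32-bit targets, where a plain load or store may be split into two 32-bit halves and "tear". Below is a self-contained userland sketch of the failure mode this prevents; all names are illustrative, and on ILP32 it may need -latomic:

#include <stdint.h>
#include <pthread.h>

static volatile uint64_t val;

static void *
writer(void *unused)
{
	/* Keep both 32-bit halves equal so tearing is detectable. */
	for (uint64_t i = 0; i < 10000000; i++)
		__atomic_store_n(&val, i | (i << 32), __ATOMIC_RELAXED);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writer, NULL);
	for (int i = 0; i < 10000000; i++) {
		uint64_t v = __atomic_load_n(&val, __ATOMIC_RELAXED);
		if ((v & 0xffffffff) != (v >> 32))
			return (1);	/* torn read: must never happen */
	}
	pthread_join(t, NULL);
	return (0);
}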
diff --git a/lib/libspl/include/atomic.h b/lib/libspl/include/atomic.h
index f8c257f9696b..8dd1d654a486 100644
--- a/lib/libspl/include/atomic.h
+++ b/lib/libspl/include/atomic.h
@@ -1,296 +1,339 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */

 /*
  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */

 #ifndef _SYS_ATOMIC_H
 #define	_SYS_ATOMIC_H

 #include <sys/types.h>
 #include <sys/inttypes.h>

 #ifdef __cplusplus
 extern "C" {
 #endif

 #if defined(__STDC__)
 /*
  * Increment target.
  */
 extern void atomic_inc_8(volatile uint8_t *);
 extern void atomic_inc_uchar(volatile uchar_t *);
 extern void atomic_inc_16(volatile uint16_t *);
 extern void atomic_inc_ushort(volatile ushort_t *);
 extern void atomic_inc_32(volatile uint32_t *);
 extern void atomic_inc_uint(volatile uint_t *);
 extern void atomic_inc_ulong(volatile ulong_t *);
 #if defined(_INT64_TYPE)
 extern void atomic_inc_64(volatile uint64_t *);
 #endif

 /*
  * Decrement target
  */
 extern void atomic_dec_8(volatile uint8_t *);
 extern void atomic_dec_uchar(volatile uchar_t *);
 extern void atomic_dec_16(volatile uint16_t *);
 extern void atomic_dec_ushort(volatile ushort_t *);
 extern void atomic_dec_32(volatile uint32_t *);
 extern void atomic_dec_uint(volatile uint_t *);
 extern void atomic_dec_ulong(volatile ulong_t *);
 #if defined(_INT64_TYPE)
 extern void atomic_dec_64(volatile uint64_t *);
 #endif

 /*
  * Add delta to target
  */
 extern void atomic_add_8(volatile uint8_t *, int8_t);
 extern void atomic_add_char(volatile uchar_t *, signed char);
 extern void atomic_add_16(volatile uint16_t *, int16_t);
 extern void atomic_add_short(volatile ushort_t *, short);
 extern void atomic_add_32(volatile uint32_t *, int32_t);
 extern void atomic_add_int(volatile uint_t *, int);
 extern void atomic_add_ptr(volatile void *, ssize_t);
 extern void atomic_add_long(volatile ulong_t *, long);
 #if defined(_INT64_TYPE)
 extern void atomic_add_64(volatile uint64_t *, int64_t);
 #endif

 /*
  * Subtract delta from target
  */
 extern void atomic_sub_8(volatile uint8_t *, int8_t);
 extern void atomic_sub_char(volatile uchar_t *, signed char);
 extern void atomic_sub_16(volatile uint16_t *, int16_t);
 extern void atomic_sub_short(volatile ushort_t *, short);
 extern void atomic_sub_32(volatile uint32_t *, int32_t);
 extern void atomic_sub_int(volatile uint_t *, int);
 extern void atomic_sub_ptr(volatile void *, ssize_t);
 extern void atomic_sub_long(volatile ulong_t *, long);
 #if defined(_INT64_TYPE)
 extern void atomic_sub_64(volatile uint64_t *, int64_t);
 #endif

 /*
  * logical OR bits with target
  */
 extern void atomic_or_8(volatile uint8_t *, uint8_t);
 extern void atomic_or_uchar(volatile uchar_t *, uchar_t);
 extern void atomic_or_16(volatile uint16_t *, uint16_t);
 extern void atomic_or_ushort(volatile ushort_t *, ushort_t);
 extern void atomic_or_32(volatile uint32_t *, uint32_t);
 extern void atomic_or_uint(volatile uint_t *, uint_t);
 extern void atomic_or_ulong(volatile ulong_t *, ulong_t);
 #if defined(_INT64_TYPE)
 extern void atomic_or_64(volatile uint64_t *, uint64_t);
 #endif

 /*
  * logical AND bits with target
  */
 extern void atomic_and_8(volatile uint8_t *, uint8_t);
 extern void atomic_and_uchar(volatile uchar_t *, uchar_t);
 extern void atomic_and_16(volatile uint16_t *, uint16_t);
 extern void atomic_and_ushort(volatile ushort_t *, ushort_t);
 extern void atomic_and_32(volatile uint32_t *, uint32_t);
 extern void atomic_and_uint(volatile uint_t *, uint_t);
 extern void atomic_and_ulong(volatile ulong_t *, ulong_t);
 #if defined(_INT64_TYPE)
 extern void atomic_and_64(volatile uint64_t *, uint64_t);
 #endif

 /*
  * As above, but return the new value.  Note that these _nv() variants are
  * substantially more expensive on some platforms than the no-return-value
  * versions above, so don't use them unless you really need to know the
  * new value *atomically* (e.g. when decrementing a reference count and
  * checking whether it went to zero).
  */

 /*
  * Increment target and return new value.
  */
 extern uint8_t atomic_inc_8_nv(volatile uint8_t *);
 extern uchar_t atomic_inc_uchar_nv(volatile uchar_t *);
 extern uint16_t atomic_inc_16_nv(volatile uint16_t *);
 extern ushort_t atomic_inc_ushort_nv(volatile ushort_t *);
 extern uint32_t atomic_inc_32_nv(volatile uint32_t *);
 extern uint_t atomic_inc_uint_nv(volatile uint_t *);
 extern ulong_t atomic_inc_ulong_nv(volatile ulong_t *);
 #if defined(_INT64_TYPE)
 extern uint64_t atomic_inc_64_nv(volatile uint64_t *);
 #endif

 /*
  * Decrement target and return new value.
  */
 extern uint8_t atomic_dec_8_nv(volatile uint8_t *);
 extern uchar_t atomic_dec_uchar_nv(volatile uchar_t *);
 extern uint16_t atomic_dec_16_nv(volatile uint16_t *);
 extern ushort_t atomic_dec_ushort_nv(volatile ushort_t *);
 extern uint32_t atomic_dec_32_nv(volatile uint32_t *);
 extern uint_t atomic_dec_uint_nv(volatile uint_t *);
 extern ulong_t atomic_dec_ulong_nv(volatile ulong_t *);
 #if defined(_INT64_TYPE)
 extern uint64_t atomic_dec_64_nv(volatile uint64_t *);
 #endif

 /*
  * Add delta to target
  */
 extern uint8_t atomic_add_8_nv(volatile uint8_t *, int8_t);
 extern uchar_t atomic_add_char_nv(volatile uchar_t *, signed char);
 extern uint16_t atomic_add_16_nv(volatile uint16_t *, int16_t);
 extern ushort_t atomic_add_short_nv(volatile ushort_t *, short);
 extern uint32_t atomic_add_32_nv(volatile uint32_t *, int32_t);
 extern uint_t atomic_add_int_nv(volatile uint_t *, int);
 extern void *atomic_add_ptr_nv(volatile void *, ssize_t);
 extern ulong_t atomic_add_long_nv(volatile ulong_t *, long);
 #if defined(_INT64_TYPE)
 extern uint64_t atomic_add_64_nv(volatile uint64_t *, int64_t);
 #endif

 /*
  * Subtract delta from target
  */
 extern uint8_t atomic_sub_8_nv(volatile uint8_t *, int8_t);
 extern uchar_t atomic_sub_char_nv(volatile uchar_t *, signed char);
 extern uint16_t atomic_sub_16_nv(volatile uint16_t *, int16_t);
 extern ushort_t atomic_sub_short_nv(volatile ushort_t *, short);
 extern uint32_t atomic_sub_32_nv(volatile uint32_t *, int32_t);
 extern uint_t atomic_sub_int_nv(volatile uint_t *, int);
 extern void *atomic_sub_ptr_nv(volatile void *, ssize_t);
 extern ulong_t atomic_sub_long_nv(volatile ulong_t *, long);
 #if defined(_INT64_TYPE)
 extern uint64_t atomic_sub_64_nv(volatile uint64_t *, int64_t);
 #endif

 /*
  * logical OR bits with target and return new value.
  */
 extern uint8_t atomic_or_8_nv(volatile uint8_t *, uint8_t);
 extern uchar_t atomic_or_uchar_nv(volatile uchar_t *, uchar_t);
 extern uint16_t atomic_or_16_nv(volatile uint16_t *, uint16_t);
 extern ushort_t atomic_or_ushort_nv(volatile ushort_t *, ushort_t);
 extern uint32_t atomic_or_32_nv(volatile uint32_t *, uint32_t);
 extern uint_t atomic_or_uint_nv(volatile uint_t *, uint_t);
 extern ulong_t atomic_or_ulong_nv(volatile ulong_t *, ulong_t);
 #if defined(_INT64_TYPE)
 extern uint64_t atomic_or_64_nv(volatile uint64_t *, uint64_t);
 #endif

 /*
  * logical AND bits with target and return new value.
  */
 extern uint8_t atomic_and_8_nv(volatile uint8_t *, uint8_t);
 extern uchar_t atomic_and_uchar_nv(volatile uchar_t *, uchar_t);
 extern uint16_t atomic_and_16_nv(volatile uint16_t *, uint16_t);
 extern ushort_t atomic_and_ushort_nv(volatile ushort_t *, ushort_t);
 extern uint32_t atomic_and_32_nv(volatile uint32_t *, uint32_t);
 extern uint_t atomic_and_uint_nv(volatile uint_t *, uint_t);
 extern ulong_t atomic_and_ulong_nv(volatile ulong_t *, ulong_t);
 #if defined(_INT64_TYPE)
 extern uint64_t atomic_and_64_nv(volatile uint64_t *, uint64_t);
 #endif

 /*
  * If *arg1 == arg2, set *arg1 = arg3; return old value
  */
 extern uint8_t atomic_cas_8(volatile uint8_t *, uint8_t, uint8_t);
 extern uchar_t atomic_cas_uchar(volatile uchar_t *, uchar_t, uchar_t);
 extern uint16_t atomic_cas_16(volatile uint16_t *, uint16_t, uint16_t);
 extern ushort_t atomic_cas_ushort(volatile ushort_t *, ushort_t, ushort_t);
 extern uint32_t atomic_cas_32(volatile uint32_t *, uint32_t, uint32_t);
 extern uint_t atomic_cas_uint(volatile uint_t *, uint_t, uint_t);
 extern void *atomic_cas_ptr(volatile void *, void *, void *);
 extern ulong_t atomic_cas_ulong(volatile ulong_t *, ulong_t, ulong_t);
 #if defined(_INT64_TYPE)
 extern uint64_t atomic_cas_64(volatile uint64_t *, uint64_t, uint64_t);
 #endif

 /*
  * Swap target and return old value
  */
 extern uint8_t atomic_swap_8(volatile uint8_t *, uint8_t);
 extern uchar_t atomic_swap_uchar(volatile uchar_t *, uchar_t);
 extern uint16_t atomic_swap_16(volatile uint16_t *, uint16_t);
 extern ushort_t atomic_swap_ushort(volatile ushort_t *, ushort_t);
 extern uint32_t atomic_swap_32(volatile uint32_t *, uint32_t);
 extern uint_t atomic_swap_uint(volatile uint_t *, uint_t);
 extern void *atomic_swap_ptr(volatile void *, void *);
 extern ulong_t atomic_swap_ulong(volatile ulong_t *, ulong_t);
 #if defined(_INT64_TYPE)
 extern uint64_t atomic_swap_64(volatile uint64_t *, uint64_t);
 #endif

+/*
+ * Atomically read variable.
+ */
+#define	atomic_load_char(p)	(*(volatile uchar_t *)(p))
+#define	atomic_load_short(p)	(*(volatile ushort_t *)(p))
+#define	atomic_load_int(p)	(*(volatile uint_t *)(p))
+#define	atomic_load_long(p)	(*(volatile ulong_t *)(p))
+#define	atomic_load_ptr(p)	(*(volatile __typeof(*p) *)(p))
+#define	atomic_load_8(p)	(*(volatile uint8_t *)(p))
+#define	atomic_load_16(p)	(*(volatile uint16_t *)(p))
+#define	atomic_load_32(p)	(*(volatile uint32_t *)(p))
+#ifdef _LP64
+#define	atomic_load_64(p)	(*(volatile uint64_t *)(p))
+#elif defined(_INT64_TYPE)
+extern uint64_t atomic_load_64(volatile uint64_t *);
+#endif
+
+/*
+ * Atomically write variable.
+ */
+#define	atomic_store_char(p, v)		\
+	(*(volatile uchar_t *)(p) = (uchar_t)(v))
+#define	atomic_store_short(p, v)	\
+	(*(volatile ushort_t *)(p) = (ushort_t)(v))
+#define	atomic_store_int(p, v)		\
+	(*(volatile uint_t *)(p) = (uint_t)(v))
+#define	atomic_store_long(p, v)		\
+	(*(volatile ulong_t *)(p) = (ulong_t)(v))
+#define	atomic_store_ptr(p, v)		\
+	(*(volatile __typeof(*p) *)(p) = (v))
+#define	atomic_store_8(p, v)		\
+	(*(volatile uint8_t *)(p) = (uint8_t)(v))
+#define	atomic_store_16(p, v)		\
+	(*(volatile uint16_t *)(p) = (uint16_t)(v))
+#define	atomic_store_32(p, v)		\
+	(*(volatile uint32_t *)(p) = (uint32_t)(v))
+#ifdef _LP64
+#define	atomic_store_64(p, v)		\
+	(*(volatile uint64_t *)(p) = (uint64_t)(v))
+#elif defined(_INT64_TYPE)
+extern void atomic_store_64(volatile uint64_t *, uint64_t);
+#endif
+
 /*
  * Perform an exclusive atomic bit set/clear on a target.
  * Returns 0 if bit was successfully set/cleared, or -1
  * if the bit was already set/cleared.
  */
 extern int atomic_set_long_excl(volatile ulong_t *, uint_t);
 extern int atomic_clear_long_excl(volatile ulong_t *, uint_t);

 /*
  * Generic memory barrier used during lock entry, placed after the
  * memory operation that acquires the lock to guarantee that the lock
  * protects its data.  No stores from after the memory barrier will
  * reach visibility, and no loads from after the barrier will be
  * resolved, before the lock acquisition reaches global visibility.
  */
 extern void membar_enter(void);

 /*
  * Generic memory barrier used during lock exit, placed before the
  * memory operation that releases the lock to guarantee that the lock
  * protects its data.  All loads and stores issued before the barrier
  * will be resolved before the subsequent lock update reaches visibility.
  */
 extern void membar_exit(void);

 /*
  * Arrange that all stores issued before this point in the code reach
  * global visibility before any stores that follow; useful in producer
  * modules that update a data item, then set a flag that it is available.
  * The memory barrier guarantees that the available flag is not visible
  * earlier than the updated data, i.e. it imposes store ordering.
  */
 extern void membar_producer(void);

 /*
  * Arrange that all loads issued before this point in the code are
  * completed before any subsequent loads; useful in consumer modules
  * that check to see if data is available and read the data.
  * The memory barrier guarantees that the data is not sampled until
  * after the available flag has been seen, i.e. it imposes load ordering.
  */
 extern void membar_consumer(void);

 #endif	/* __STDC__ */

 #ifdef __cplusplus
 }
 #endif

 #endif	/* _SYS_ATOMIC_H */
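Here is a sketch of the producer/consumer pairing the membar comments above describe, combined with the new load/store macros. The publish/poll_data helpers and the data/ready variables are hypothetical, and note that the asm-generic fallback earlier in this patch still leaves the membar_*() stubs unimplemented, so this pattern assumes a native port:

#include <atomic.h>

static uint64_t data;
static volatile uint32_t ready;

void
publish(uint64_t v)
{
	data = v;
	membar_producer();		/* data becomes visible first... */
	atomic_store_32(&ready, 1);	/* ...then the flag */
}

int
poll_data(uint64_t *out)
{
	if (atomic_load_32(&ready) == 0)
		return (0);
	membar_consumer();		/* flag seen before data is read */
	*out = data;
	return (1);
}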
diff --git a/module/zfs/aggsum.c b/module/zfs/aggsum.c
index e46da95f676c..c4ea4f86fc5f 100644
--- a/module/zfs/aggsum.c
+++ b/module/zfs/aggsum.c
@@ -1,240 +1,245 @@
 /*
  * CDDL HEADER START
  *
  * This file and its contents are supplied under the terms of the
  * Common Development and Distribution License ("CDDL"), version 1.0.
  * You may only use this file in accordance with the terms of version
  * 1.0 of the CDDL.
  *
  * A full copy of the text of the CDDL should have accompanied this
  * source.  A copy of the CDDL is also available via the Internet at
  * http://www.illumos.org/license/CDDL.
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
  */

 #include <sys/zfs_context.h>
 #include <sys/aggsum.h>

 /*
  * Aggregate-sum counters are a form of fanned-out counter, used when atomic
  * instructions on a single field cause enough CPU cache line contention to
  * slow system performance.  Due to their increased overhead and the expense
  * involved with precisely reading from them, they should only be used in cases
  * where the write rate (increment/decrement) is much higher than the read rate
  * (get value).
  *
  * Aggregate sum counters are comprised of two basic parts, the core and the
  * buckets.  The core counter contains a lock for the entire counter, as well
  * as the current upper and lower bounds on the value of the counter.  The
  * aggsum_bucket structure contains a per-bucket lock to protect the contents of
  * the bucket, the current amount that this bucket has changed from the global
  * counter (called the delta), and the amount of increment and decrement we have
  * "borrowed" from the core counter.
  *
  * The basic operation of an aggsum is simple.  Threads that wish to modify the
  * counter will modify one bucket's counter (determined by their current CPU, to
  * help minimize lock and cache contention).  If the bucket already has
  * sufficient capacity borrowed from the core structure to handle their request,
  * they simply modify the delta and return.  If the bucket does not, we clear
  * the bucket's current state (to prevent the borrowed amounts from getting too
  * large), and borrow more from the core counter.  Borrowing is done by adding to
  * the upper bound (or subtracting from the lower bound) of the core counter,
  * and setting the borrow value for the bucket to the amount added (or
  * subtracted).  Clearing the bucket is the opposite; we add the current delta
  * to both the lower and upper bounds of the core counter, subtract the borrowed
  * increment from the upper bound, and add the borrowed decrement to the
  * lower bound.  Note that only borrowing and clearing require access to the
  * core counter; since all other operations access CPU-local resources,
  * performance can be much higher than a traditional counter.
  *
  * Threads that wish to read from the counter have a slightly more challenging
  * task.  It is fast to determine the upper and lower bounds of the aggsum; this
  * does not require grabbing any locks.  This suffices for cases where an
  * approximation of the aggsum's value is acceptable.  However, if one needs to
  * know whether some specific value is above or below the current value in the
  * aggsum, they invoke aggsum_compare().  This function operates by repeatedly
  * comparing the target value to the upper and lower bounds of the aggsum, and
  * then clearing a bucket.  This proceeds until the target is outside of the
  * upper and lower bounds and we return a response, or the last bucket has been
  * cleared and we know that the target is equal to the aggsum's value.  Finally,
  * the most expensive operation is determining the precise value of the aggsum.
  * To do this, we clear every bucket and then return the upper bound (which must
  * be equal to the lower bound).  What makes aggsum_compare() and aggsum_value()
  * expensive is clearing buckets.  This involves grabbing the global lock
  * (serializing against themselves and borrow operations), grabbing a bucket's
  * lock (preventing threads on those CPUs from modifying their delta), and
  * zeroing out the borrowed value (forcing that thread to borrow on its next
  * request, which will also be expensive).  This is what makes aggsums well
  * suited for write-many read-rarely operations.
  *
  * Note that the aggsums do not expand if more CPUs are hot-added.  In that
  * case, we will have less fanout than boot_ncpus, but we don't want to always
  * reserve the RAM necessary to create the extra slots for additional CPUs up
  * front, and dynamically adding them is a complex task.
  */

 /*
- * We will borrow aggsum_borrow_multiplier times the current request, so we will
- * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
- * aggsum_delta().
+ * We will borrow 2^aggsum_borrow_shift times the current request, so we will
+ * have to get the as_lock approximately every 2^aggsum_borrow_shift calls to
+ * aggsum_add().
  */
-static uint_t aggsum_borrow_multiplier = 10;
+static uint_t aggsum_borrow_shift = 4;

 void
 aggsum_init(aggsum_t *as, uint64_t value)
 {
 	bzero(as, sizeof (*as));
 	as->as_lower_bound = as->as_upper_bound = value;
 	mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
-	as->as_numbuckets = boot_ncpus;
-	as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
-	    KM_SLEEP);
+	/*
+	 * Too many buckets may hurt read performance without improving write
+	 * performance.  From 12 CPUs use one bucket per 2, from 48 one per 4, etc.
+	 */
+	as->as_bucketshift = highbit64(boot_ncpus / 6) / 2;
+	as->as_numbuckets = ((boot_ncpus - 1) >> as->as_bucketshift) + 1;
+	as->as_buckets = kmem_zalloc(as->as_numbuckets *
+	    sizeof (aggsum_bucket_t), KM_SLEEP);
 	for (int i = 0; i < as->as_numbuckets; i++) {
 		mutex_init(&as->as_buckets[i].asc_lock,
 		    NULL, MUTEX_DEFAULT, NULL);
 	}
 }
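The sizing arithmetic above is easy to sanity-check in isolation. A standalone sketch follows; highbit64() is reimplemented here to match ZFS's 1-based highest-set-bit semantics, and nothing below is kernel code:

#include <stdio.h>
#include <stdint.h>

static unsigned
highbit64(uint64_t i)
{
	unsigned h = 0;

	while (i != 0) {
		h++;
		i >>= 1;
	}
	return (h);
}

int
main(void)
{
	int cpus[] = { 4, 8, 12, 24, 48, 96 };

	for (int i = 0; i < 6; i++) {
		unsigned shift = highbit64(cpus[i] / 6) / 2;
		unsigned buckets = ((cpus[i] - 1) >> shift) + 1;

		/* e.g. 12 CPUs -> 6 buckets, 48 CPUs -> 12 buckets */
		printf("%3d CPUs -> shift %u, %2u buckets\n",
		    cpus[i], shift, buckets);
	}
	return (0);
}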

 void
 aggsum_fini(aggsum_t *as)
 {
 	for (int i = 0; i < as->as_numbuckets; i++)
 		mutex_destroy(&as->as_buckets[i].asc_lock);
 	kmem_free(as->as_buckets, as->as_numbuckets *
 	    sizeof (aggsum_bucket_t));
 	mutex_destroy(&as->as_lock);
 }

 int64_t
 aggsum_lower_bound(aggsum_t *as)
 {
-	return (as->as_lower_bound);
+	return (atomic_load_64((volatile uint64_t *)&as->as_lower_bound));
 }

-int64_t
+uint64_t
 aggsum_upper_bound(aggsum_t *as)
 {
-	return (as->as_upper_bound);
-}
-
-static void
-aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
-{
-	ASSERT(MUTEX_HELD(&as->as_lock));
-	ASSERT(MUTEX_HELD(&asb->asc_lock));
-
-	/*
-	 * We use atomic instructions for this because we read the upper and
-	 * lower bounds without the lock, so we need stores to be atomic.
-	 */
-	atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
-	    asb->asc_delta + asb->asc_borrowed);
-	atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
-	    asb->asc_delta - asb->asc_borrowed);
-	asb->asc_delta = 0;
-	asb->asc_borrowed = 0;
+	return (atomic_load_64(&as->as_upper_bound));
 }

 uint64_t
 aggsum_value(aggsum_t *as)
 {
-	int64_t rv;
+	int64_t lb;
+	uint64_t ub;

 	mutex_enter(&as->as_lock);
-	if (as->as_lower_bound == as->as_upper_bound) {
-		rv = as->as_lower_bound;
+	lb = as->as_lower_bound;
+	ub = as->as_upper_bound;
+	if (lb == ub) {
 		for (int i = 0; i < as->as_numbuckets; i++) {
 			ASSERT0(as->as_buckets[i].asc_delta);
 			ASSERT0(as->as_buckets[i].asc_borrowed);
 		}
 		mutex_exit(&as->as_lock);
-		return (rv);
+		return (lb);
 	}
 	for (int i = 0; i < as->as_numbuckets; i++) {
 		struct aggsum_bucket *asb = &as->as_buckets[i];

+		if (asb->asc_borrowed == 0)
+			continue;
 		mutex_enter(&asb->asc_lock);
-		aggsum_flush_bucket(as, asb);
+		lb += asb->asc_delta + asb->asc_borrowed;
+		ub += asb->asc_delta - asb->asc_borrowed;
+		asb->asc_delta = 0;
+		asb->asc_borrowed = 0;
 		mutex_exit(&asb->asc_lock);
 	}
-	VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
-	rv = as->as_lower_bound;
+	ASSERT3U(lb, ==, ub);
+	atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb);
+	atomic_store_64(&as->as_upper_bound, lb);
 	mutex_exit(&as->as_lock);

-	return (rv);
+	return (lb);
 }

 void
 aggsum_add(aggsum_t *as, int64_t delta)
 {
 	struct aggsum_bucket *asb;
 	int64_t borrow;

-	asb = &as->as_buckets[CPU_SEQID_UNSTABLE % as->as_numbuckets];
+	asb = &as->as_buckets[(CPU_SEQID_UNSTABLE >> as->as_bucketshift) %
+	    as->as_numbuckets];

 	/* Try fast path if we already borrowed enough before. */
 	mutex_enter(&asb->asc_lock);
 	if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed &&
 	    asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) {
 		asb->asc_delta += delta;
 		mutex_exit(&asb->asc_lock);
 		return;
 	}
 	mutex_exit(&asb->asc_lock);

 	/*
 	 * We haven't borrowed enough.  Take the global lock and borrow
 	 * considering what is requested now and what we borrowed before.
 	 */
-	borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier;
+	borrow = (delta < 0 ? -delta : delta);
+	borrow <<= aggsum_borrow_shift + as->as_bucketshift;
 	mutex_enter(&as->as_lock);
-	mutex_enter(&asb->asc_lock);
-	delta += asb->asc_delta;
-	asb->asc_delta = 0;
 	if (borrow >= asb->asc_borrowed)
 		borrow -= asb->asc_borrowed;
 	else
 		borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
+	mutex_enter(&asb->asc_lock);
+	delta += asb->asc_delta;
+	asb->asc_delta = 0;
 	asb->asc_borrowed += borrow;
-	atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
-	    delta - borrow);
-	atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
-	    delta + borrow);
 	mutex_exit(&asb->asc_lock);
+	atomic_store_64((volatile uint64_t *)&as->as_lower_bound,
+	    as->as_lower_bound + delta - borrow);
+	atomic_store_64(&as->as_upper_bound,
+	    as->as_upper_bound + delta + borrow);
 	mutex_exit(&as->as_lock);
 }
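To make the slow path concrete, here is an illustrative mirror of its borrow-sizing step (not the kernel function itself): the target borrow is the request scaled by 2^(aggsum_borrow_shift + as_bucketshift), i.e. 16x the request with the default shift of 4 and a bucketshift of 0, and when the bucket already holds more than that, the borrow shrinks by a quarter of the excess rather than being clamped. For example, borrow_delta(1000, 20000, 4, 0) yields -1000, easing asc_borrowed from 20000 down to 19000:

#include <stdint.h>

/* Illustrative copy of aggsum_add()'s borrow computation. */
static int64_t
borrow_delta(int64_t delta, uint64_t asc_borrowed,
    unsigned borrow_shift, unsigned bucket_shift)
{
	int64_t borrow = (delta < 0 ? -delta : delta);

	borrow <<= borrow_shift + bucket_shift;
	if (borrow >= (int64_t)asc_borrowed)
		borrow -= asc_borrowed;		/* top up to the target */
	else
		borrow = (borrow - (int64_t)asc_borrowed) / 4;
	/* Caller then does asc_borrowed += borrow (possibly negative). */
	return (borrow);
}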
-delta : delta); + borrow <<= aggsum_borrow_shift + as->as_bucketshift; mutex_enter(&as->as_lock); - mutex_enter(&asb->asc_lock); - delta += asb->asc_delta; - asb->asc_delta = 0; if (borrow >= asb->asc_borrowed) borrow -= asb->asc_borrowed; else borrow = (borrow - (int64_t)asb->asc_borrowed) / 4; + mutex_enter(&asb->asc_lock); + delta += asb->asc_delta; + asb->asc_delta = 0; asb->asc_borrowed += borrow; - atomic_add_64((volatile uint64_t *)&as->as_lower_bound, - delta - borrow); - atomic_add_64((volatile uint64_t *)&as->as_upper_bound, - delta + borrow); mutex_exit(&asb->asc_lock); + atomic_store_64((volatile uint64_t *)&as->as_lower_bound, + as->as_lower_bound + delta - borrow); + atomic_store_64(&as->as_upper_bound, + as->as_upper_bound + delta + borrow); mutex_exit(&as->as_lock); } /* * Compare the aggsum value to target efficiently. Returns -1 if the value * represented by the aggsum is less than target, 1 if it's greater, and 0 if * they are equal. */ int aggsum_compare(aggsum_t *as, uint64_t target) { - if (as->as_upper_bound < target) + int64_t lb; + uint64_t ub; + int i; + + if (atomic_load_64(&as->as_upper_bound) < target) return (-1); - if (as->as_lower_bound > target) + lb = atomic_load_64((volatile uint64_t *)&as->as_lower_bound); + if (lb > 0 && (uint64_t)lb > target) return (1); mutex_enter(&as->as_lock); - for (int i = 0; i < as->as_numbuckets; i++) { + lb = as->as_lower_bound; + ub = as->as_upper_bound; + for (i = 0; i < as->as_numbuckets; i++) { struct aggsum_bucket *asb = &as->as_buckets[i]; + if (asb->asc_borrowed == 0) + continue; mutex_enter(&asb->asc_lock); - aggsum_flush_bucket(as, asb); + lb += asb->asc_delta + asb->asc_borrowed; + ub += asb->asc_delta - asb->asc_borrowed; + asb->asc_delta = 0; + asb->asc_borrowed = 0; mutex_exit(&asb->asc_lock); - if (as->as_upper_bound < target) { - mutex_exit(&as->as_lock); - return (-1); - } - if (as->as_lower_bound > target) { - mutex_exit(&as->as_lock); - return (1); - } + if (ub < target || (lb > 0 && (uint64_t)lb > target)) + break; } - VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); - ASSERT3U(as->as_lower_bound, ==, target); + if (i >= as->as_numbuckets) + ASSERT3U(lb, ==, ub); + atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb); + atomic_store_64(&as->as_upper_bound, ub); mutex_exit(&as->as_lock); - return (0); + return (ub < target ? -1 : (uint64_t)lb > target ? 1 : 0); }