diff --git a/cmd/arc_summary/arc_summary.py b/cmd/arc_summary/arc_summary.py
index 15d844b52657..25238f39ecda 100755
--- a/cmd/arc_summary/arc_summary.py
+++ b/cmd/arc_summary/arc_summary.py
@@ -1,1152 +1,1148 @@
 #!/usr/bin/python
 #
 # $Id: arc_summary.pl,v 388:e27800740aa2 2011-07-08 02:53:29Z jhell $
 #
 # Copyright (c) 2008 Ben Rockwood <benr@cuddletech.com>,
 # Copyright (c) 2010 Martin Matuska <mm@FreeBSD.org>,
 # Copyright (c) 2010-2011 Jason J. Hellenthal <jhell@DataIX.net>,
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 #
 # 1. Redistributions of source code must retain the above copyright
 #    notice, this list of conditions and the following disclaimer.
 # 2. Redistributions in binary form must reproduce the above copyright
 #    notice, this list of conditions and the following disclaimer in the
 #    documentation and/or other materials provided with the distribution.
 #
 # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #
 # If you are having troubles when using this script from cron(8) please try
 # adjusting your PATH before reporting problems.
 #
 # /usr/bin & /sbin
 #
 # Binaries used are:
 #
 # dc(1), kldstat(8), sed(1), sysctl(8) & vmstat(8)
 #
 # Binaries that I am working on phasing out are:
 #
 # dc(1) & sed(1)
 
 import sys
 import time
 import getopt
 import re
 from os import listdir
 from subprocess import Popen, PIPE
 from decimal import Decimal as D
 
 
 usetunable = True
 show_tunable_descriptions = False
 alternate_tunable_layout = False
-kstat_pobj = re.compile("^([^:]+):\s+(.+)\s*$", flags=re.M)
+kstat_pobj = re.compile(r"^([^:]+):\s+(.+)\s*$", flags=re.M)
 
 
 def get_Kstat():
+    """Return a dict of the ZFS kstats read from /proc/spl/kstat/zfs."""
     def load_proc_kstats(fn, namespace):
-        kstats = [line.strip() for line in open(fn)]
-        del kstats[0:2]
-        for kstat in kstats:
-            kstat = kstat.strip()
-            name, unused, value = kstat.split()
-            Kstat[namespace + name] = D(value)
+        with open(fn) as kstat_file:
+            lines = kstat_file.readlines()
+        for line in lines[2:]:  # skip the first two lines of the file
+            name, unused, value = line.split()
+            Kstat[namespace + name] = D(value)
 
     Kstats = [
         "hw.pagesize",
         "hw.physmem",
         "kern.maxusers",
         "vm.kmem_map_free",
         "vm.kmem_map_size",
         "vm.kmem_size",
         "vm.kmem_size_max",
         "vm.kmem_size_min",
         "vm.kmem_size_scale",
         "vm.stats",
         "vm.swap_total",
         "vm.swap_reserved",
         "kstat.zfs",
         "vfs.zfs"
     ]
     Kstat = {}
     load_proc_kstats('/proc/spl/kstat/zfs/arcstats',
             'kstat.zfs.misc.arcstats.')
     load_proc_kstats('/proc/spl/kstat/zfs/zfetchstats',
             'kstat.zfs.misc.zfetchstats.')
     load_proc_kstats('/proc/spl/kstat/zfs/vdev_cache_stats',
             'kstat.zfs.misc.vdev_cache_stats.')
 
     return Kstat
 
 def div1():
     sys.stdout.write("\n")
     for i in xrange(18):
         sys.stdout.write("%s" % "----")
     sys.stdout.write("\n")
 
 
 def div2():
     sys.stdout.write("\n")
 
 
 def fBytes(Bytes=0, Decimal=2):
     kbytes = (2 ** 10)
     mbytes = (2 ** 20)
     gbytes = (2 ** 30)
     tbytes = (2 ** 40)
     pbytes = (2 ** 50)
     ebytes = (2 ** 60)
     zbytes = (2 ** 70)
     ybytes = (2 ** 80)
 
     if Bytes >= ybytes:
         return str("%0." + str(Decimal) + "f") % (Bytes / ybytes) + "\tYiB"
     elif Bytes >= zbytes:
         return str("%0." + str(Decimal) + "f") % (Bytes / zbytes) + "\tZiB"
     elif Bytes >= ebytes:
         return str("%0." + str(Decimal) + "f") % (Bytes / ebytes) + "\tEiB"
     elif Bytes >= pbytes:
         return str("%0." + str(Decimal) + "f") % (Bytes / pbytes) + "\tPiB"
     elif Bytes >= tbytes:
         return str("%0." + str(Decimal) + "f") % (Bytes / tbytes) + "\tTiB"
     elif Bytes >= gbytes:
         return str("%0." + str(Decimal) + "f") % (Bytes / gbytes) + "\tGiB"
     elif Bytes >= mbytes:
         return str("%0." + str(Decimal) + "f") % (Bytes / mbytes) + "\tMiB"
     elif Bytes >= kbytes:
         return str("%0." + str(Decimal) + "f") % (Bytes / kbytes) + "\tKiB"
     elif Bytes == 0:
         return str("%d" % 0) + "\tBytes"
     else:
         return str("%d" % Bytes) + "\tBytes"
 
 
 def fHits(Hits=0, Decimal=2):
     khits = (10 ** 3)
     mhits = (10 ** 6)
     bhits = (10 ** 9)
     thits = (10 ** 12)
     qhits = (10 ** 15)
     Qhits = (10 ** 18)
     shits = (10 ** 21)
     Shits = (10 ** 24)
 
     if Hits >= Shits:
         return str("%0." + str(Decimal) + "f") % (Hits / Shits) + "S"
     elif Hits >= shits:
         return str("%0." + str(Decimal) + "f") % (Hits / shits) + "s"
     elif Hits >= Qhits:
         return str("%0." + str(Decimal) + "f") % (Hits / Qhits) + "Q"
     elif Hits >= qhits:
         return str("%0." + str(Decimal) + "f") % (Hits / qhits) + "q"
     elif Hits >= thits:
         return str("%0." + str(Decimal) + "f") % (Hits / thits) + "t"
     elif Hits >= bhits:
         return str("%0." + str(Decimal) + "f") % (Hits / bhits) + "b"
     elif Hits >= mhits:
         return str("%0." + str(Decimal) + "f") % (Hits / mhits) + "m"
     elif Hits >= khits:
         return str("%0." + str(Decimal) + "f") % (Hits / khits) + "k"
     elif Hits == 0:
         return str("%d" % 0)
     else:
         return str("%d" % Hits)
 
 
 def fPerc(lVal=0, rVal=0, Decimal=2):
     if rVal > 0:
         return str("%0." + str(Decimal) + "f") % (100 * (lVal / rVal)) + "%"
     else:
         return str("%0." + str(Decimal) + "f") % 100 + "%"
 
 
 def get_arc_summary(Kstat):
-
+    """Build the ARC summary dict that _arc_summary() prints."""
     output = {}
     memory_throttle_count = Kstat[
         "kstat.zfs.misc.arcstats.memory_throttle_count"
         ]
 
     if memory_throttle_count > 0:
         output['health'] = 'THROTTLED'
     else:
         output['health'] = 'HEALTHY'
 
     output['memory_throttle_count'] = fHits(memory_throttle_count)
 
     ### ARC Misc. ###
     deleted = Kstat["kstat.zfs.misc.arcstats.deleted"]
     mutex_miss = Kstat["kstat.zfs.misc.arcstats.mutex_miss"]
-    recycle_miss = Kstat["kstat.zfs.misc.arcstats.recycle_miss"]
-
-    ### ARC Misc. ###
+    evict_skip = Kstat["kstat.zfs.misc.arcstats.evict_skip"]
+
     output["arc_misc"] = {}
     output["arc_misc"]["deleted"] = fHits(deleted)
-    output["arc_misc"]['recycle_miss'] = fHits(recycle_miss)
     output["arc_misc"]['mutex_miss'] = fHits(mutex_miss)
-    output["arc_misc"]['evict_skips'] = fHits(mutex_miss)
+    output["arc_misc"]['evict_skips'] = fHits(evict_skip)
 
     ### ARC Sizing ###
     arc_size = Kstat["kstat.zfs.misc.arcstats.size"]
     mru_size = Kstat["kstat.zfs.misc.arcstats.p"]
     target_max_size = Kstat["kstat.zfs.misc.arcstats.c_max"]
     target_min_size = Kstat["kstat.zfs.misc.arcstats.c_min"]
     target_size = Kstat["kstat.zfs.misc.arcstats.c"]
 
     target_size_ratio = (target_max_size / target_min_size)
 
     ### ARC Sizing ###
     output['arc_sizing'] = {}
     output['arc_sizing']['arc_size'] = {
         'per': fPerc(arc_size, target_max_size),
         'num': fBytes(arc_size),
     }
     output['arc_sizing']['target_max_size'] = {
         'ratio': target_size_ratio,
         'num': fBytes(target_max_size),
     }
     output['arc_sizing']['target_min_size'] = {
         'per': fPerc(target_min_size, target_max_size),
         'num': fBytes(target_min_size),
     }
     output['arc_sizing']['target_size'] = {
         'per': fPerc(target_size, target_max_size),
         'num': fBytes(target_size),
     }
 
     ### ARC Hash Breakdown ###
     output['arc_hash_break'] = {}
     output['arc_hash_break']['hash_chain_max'] = Kstat[
         "kstat.zfs.misc.arcstats.hash_chain_max"
         ]
     output['arc_hash_break']['hash_chains'] = Kstat[
         "kstat.zfs.misc.arcstats.hash_chains"
         ]
     output['arc_hash_break']['hash_collisions'] = Kstat[
         "kstat.zfs.misc.arcstats.hash_collisions"
         ]
     output['arc_hash_break']['hash_elements'] = Kstat[
         "kstat.zfs.misc.arcstats.hash_elements"
         ]
     output['arc_hash_break']['hash_elements_max'] = Kstat[
         "kstat.zfs.misc.arcstats.hash_elements_max"
         ]
 
     output['arc_size_break'] = {}
     if arc_size > target_size:
         mfu_size = (arc_size - mru_size)
         output['arc_size_break']['recently_used_cache_size'] = {
             'per': fPerc(mru_size, arc_size),
             'num': fBytes(mru_size),
         }
         output['arc_size_break']['frequently_used_cache_size'] = {
             'per': fPerc(mfu_size, arc_size),
             'num': fBytes(mfu_size),
         }
 
-    elif arc_size < target_size:
+    else:  # also covers arc_size == target_size, which used to be skipped
         mfu_size = (target_size - mru_size)
         output['arc_size_break']['recently_used_cache_size'] = {
             'per': fPerc(mru_size, target_size),
             'num': fBytes(mru_size),
         }
         output['arc_size_break']['frequently_used_cache_size'] = {
             'per': fPerc(mfu_size, target_size),
             'num': fBytes(mfu_size),
         }
 
     ### ARC Hash Breakdown ###
     hash_chain_max = Kstat["kstat.zfs.misc.arcstats.hash_chain_max"]
     hash_chains = Kstat["kstat.zfs.misc.arcstats.hash_chains"]
     hash_collisions = Kstat["kstat.zfs.misc.arcstats.hash_collisions"]
     hash_elements = Kstat["kstat.zfs.misc.arcstats.hash_elements"]
     hash_elements_max = Kstat["kstat.zfs.misc.arcstats.hash_elements_max"]
 
     output['arc_hash_break'] = {}
     output['arc_hash_break']['elements_max'] = fHits(hash_elements_max)
     output['arc_hash_break']['elements_current'] = {
         'per': fPerc(hash_elements, hash_elements_max),
         'num': fHits(hash_elements),
         }
     output['arc_hash_break']['collisions'] = fHits(hash_collisions)
     output['arc_hash_break']['chain_max'] = fHits(hash_chain_max)
     output['arc_hash_break']['chains'] = fHits(hash_chains)
 
     return output
 
 
 def _arc_summary(Kstat):
-    ### ARC Sizing ###
+    """Print the ARC summary built by get_arc_summary() to stdout."""
     arc = get_arc_summary(Kstat)
 
     sys.stdout.write("ARC Summary: (%s)\n" % arc['health'])
 
     sys.stdout.write("\tMemory Throttle Count:\t\t\t%s\n" %
             arc['memory_throttle_count'])
     sys.stdout.write("\n")
 
     ### ARC Misc. ###
     sys.stdout.write("ARC Misc:\n")
     sys.stdout.write("\tDeleted:\t\t\t\t%s\n" % arc['arc_misc']['deleted'])
-    sys.stdout.write("\tRecycle Misses:\t\t\t\t%s\n" %
-            arc['arc_misc']['recycle_miss'])
     sys.stdout.write("\tMutex Misses:\t\t\t\t%s\n" %
             arc['arc_misc']['mutex_miss'])
     sys.stdout.write("\tEvict Skips:\t\t\t\t%s\n" %
-            arc['arc_misc']['mutex_miss'])
+            arc['arc_misc']['evict_skips'])
     sys.stdout.write("\n")
 
     ### ARC Sizing ###
     sys.stdout.write("ARC Size:\t\t\t\t%s\t%s\n" % (
         arc['arc_sizing']['arc_size']['per'],
         arc['arc_sizing']['arc_size']['num']
         )
     )
     sys.stdout.write("\tTarget Size: (Adaptive)\t\t%s\t%s\n" % (
         arc['arc_sizing']['target_size']['per'],
         arc['arc_sizing']['target_size']['num'],
         )
     )
 
     sys.stdout.write("\tMin Size (Hard Limit):\t\t%s\t%s\n" % (
         arc['arc_sizing']['target_min_size']['per'],
         arc['arc_sizing']['target_min_size']['num'],
         )
     )
 
     sys.stdout.write("\tMax Size (High Water):\t\t%d:1\t%s\n" % (
         arc['arc_sizing']['target_max_size']['ratio'],
         arc['arc_sizing']['target_max_size']['num'],
         )
     )
 
     sys.stdout.write("\nARC Size Breakdown:\n")
     sys.stdout.write("\tRecently Used Cache Size:\t%s\t%s\n" % (
         arc['arc_size_break']['recently_used_cache_size']['per'],
         arc['arc_size_break']['recently_used_cache_size']['num'],
         )
     )
     sys.stdout.write("\tFrequently Used Cache Size:\t%s\t%s\n" % (
         arc['arc_size_break']['frequently_used_cache_size']['per'],
         arc['arc_size_break']['frequently_used_cache_size']['num'],
         )
     )
 
     sys.stdout.write("\n")
 
     ### ARC Hash Breakdown ###
     sys.stdout.write("ARC Hash Breakdown:\n")
     sys.stdout.write("\tElements Max:\t\t\t\t%s\n" %
             arc['arc_hash_break']['elements_max'])
     sys.stdout.write("\tElements Current:\t\t%s\t%s\n" % (
         arc['arc_hash_break']['elements_current']['per'],
         arc['arc_hash_break']['elements_current']['num'],
         )
     )
     sys.stdout.write("\tCollisions:\t\t\t\t%s\n" %
             arc['arc_hash_break']['collisions'])
     sys.stdout.write("\tChain Max:\t\t\t\t%s\n" %
             arc['arc_hash_break']['chain_max'])
     sys.stdout.write("\tChains:\t\t\t\t\t%s\n" %
             arc['arc_hash_break']['chains'])
 
 
 def get_arc_efficiency(Kstat):
     output = {}
 
     arc_hits = Kstat["kstat.zfs.misc.arcstats.hits"]
     arc_misses = Kstat["kstat.zfs.misc.arcstats.misses"]
     demand_data_hits = Kstat["kstat.zfs.misc.arcstats.demand_data_hits"]
     demand_data_misses = Kstat["kstat.zfs.misc.arcstats.demand_data_misses"]
     demand_metadata_hits = Kstat[
         "kstat.zfs.misc.arcstats.demand_metadata_hits"
         ]
     demand_metadata_misses = Kstat[
         "kstat.zfs.misc.arcstats.demand_metadata_misses"
         ]
     mfu_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mfu_ghost_hits"]
     mfu_hits = Kstat["kstat.zfs.misc.arcstats.mfu_hits"]
     mru_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mru_ghost_hits"]
     mru_hits = Kstat["kstat.zfs.misc.arcstats.mru_hits"]
     prefetch_data_hits = Kstat["kstat.zfs.misc.arcstats.prefetch_data_hits"]
     prefetch_data_misses = Kstat[
         "kstat.zfs.misc.arcstats.prefetch_data_misses"
         ]
     prefetch_metadata_hits = Kstat[
         "kstat.zfs.misc.arcstats.prefetch_metadata_hits"
         ]
     prefetch_metadata_misses = Kstat[
         "kstat.zfs.misc.arcstats.prefetch_metadata_misses"
         ]
 
     anon_hits = arc_hits - (
         mfu_hits + mru_hits + mfu_ghost_hits + mru_ghost_hits
         )
     arc_accesses_total = (arc_hits + arc_misses)
     demand_data_total = (demand_data_hits + demand_data_misses)
     prefetch_data_total = (prefetch_data_hits + prefetch_data_misses)
     real_hits = (mfu_hits + mru_hits)
 
     output["total_accesses"] = fHits(arc_accesses_total)
     output["cache_hit_ratio"] = {
         'per': fPerc(arc_hits, arc_accesses_total),
         'num': fHits(arc_hits),
     }
     output["cache_miss_ratio"] = {
         'per': fPerc(arc_misses, arc_accesses_total),
         'num': fHits(arc_misses),
     }
     output["actual_hit_ratio"] = {
         'per': fPerc(real_hits, arc_accesses_total),
         'num': fHits(real_hits),
     }
     output["data_demand_efficiency"] = {
         'per': fPerc(demand_data_hits, demand_data_total),
         'num': fHits(demand_data_total),
     }
 
     if prefetch_data_total > 0:
         output["data_prefetch_efficiency"] = {
             'per': fPerc(prefetch_data_hits, prefetch_data_total),
             'num': fHits(prefetch_data_total),
         }
 
     if anon_hits > 0:
         output["cache_hits_by_cache_list"] = {}
         output["cache_hits_by_cache_list"]["anonymously_used"] = {
             'per': fPerc(anon_hits, arc_hits),
             'num': fHits(anon_hits),
         }
 
     output["most_recently_used"] = {
         'per': fPerc(mru_hits, arc_hits),
         'num': fHits(mru_hits),
     }
     output["most_frequently_used"] = {
         'per': fPerc(mfu_hits, arc_hits),
         'num': fHits(mfu_hits),
     }
     output["most_recently_used_ghost"] = {
         'per': fPerc(mru_ghost_hits, arc_hits),
         'num': fHits(mru_ghost_hits),
     }
     output["most_frequently_used_ghost"] = {
         'per': fPerc(mfu_ghost_hits, arc_hits),
         'num': fHits(mfu_ghost_hits),
     }
 
     output["cache_hits_by_data_type"] = {}
     output["cache_hits_by_data_type"]["demand_data"] = {
         'per': fPerc(demand_data_hits, arc_hits),
         'num': fHits(demand_data_hits),
     }
     output["cache_hits_by_data_type"]["prefetch_data"] = {
         'per': fPerc(prefetch_data_hits, arc_hits),
         'num': fHits(prefetch_data_hits),
     }
     output["cache_hits_by_data_type"]["demand_metadata"] = {
         'per': fPerc(demand_metadata_hits, arc_hits),
         'num': fHits(demand_metadata_hits),
     }
     output["cache_hits_by_data_type"]["prefetch_metadata"] = {
         'per': fPerc(prefetch_metadata_hits, arc_hits),
         'num': fHits(prefetch_metadata_hits),
     }
 
     output["cache_misses_by_data_type"] = {}
     output["cache_misses_by_data_type"]["demand_data"] = {
         'per': fPerc(demand_data_misses, arc_misses),
         'num': fHits(demand_data_misses),
     }
     output["cache_misses_by_data_type"]["prefetch_data"] = {
         'per': fPerc(prefetch_data_misses, arc_misses),
         'num': fHits(prefetch_data_misses),
     }
     output["cache_misses_by_data_type"]["demand_metadata"] = {
         'per': fPerc(demand_metadata_misses, arc_misses),
         'num': fHits(demand_metadata_misses),
     }
     output["cache_misses_by_data_type"]["prefetch_metadata"] = {
         'per': fPerc(prefetch_metadata_misses, arc_misses),
         'num': fHits(prefetch_metadata_misses),
     }
 
     return output
 
 
 def _arc_efficiency(Kstat):
     arc = get_arc_efficiency(Kstat)
 
     sys.stdout.write("ARC Total accesses:\t\t\t\t\t%s\n" %
             arc['total_accesses'])
     sys.stdout.write("\tCache Hit Ratio:\t\t%s\t%s\n" % (
         arc['cache_hit_ratio']['per'],
         arc['cache_hit_ratio']['num'],
         )
     )
     sys.stdout.write("\tCache Miss Ratio:\t\t%s\t%s\n" % (
         arc['cache_miss_ratio']['per'],
         arc['cache_miss_ratio']['num'],
         )
     )
 
     sys.stdout.write("\tActual Hit Ratio:\t\t%s\t%s\n" % (
         arc['actual_hit_ratio']['per'],
         arc['actual_hit_ratio']['num'],
         )
     )
 
     sys.stdout.write("\n")
     sys.stdout.write("\tData Demand Efficiency:\t\t%s\t%s\n" % (
         arc['data_demand_efficiency']['per'],
         arc['data_demand_efficiency']['num'],
         )
     )
 
     if 'data_prefetch_efficiency' in arc:
         sys.stdout.write("\tData Prefetch Efficiency:\t%s\t%s\n" % (
             arc['data_prefetch_efficiency']['per'],
             arc['data_prefetch_efficiency']['num'],
             )
         )
     sys.stdout.write("\n")
 
     sys.stdout.write("\tCACHE HITS BY CACHE LIST:\n")
     if 'cache_hits_by_cache_list' in arc:
         sys.stdout.write("\t  Anonymously Used:\t\t%s\t%s\n" % (
             arc['cache_hits_by_cache_list']['anonymously_used']['per'],
             arc['cache_hits_by_cache_list']['anonymously_used']['num'],
             )
         )
     sys.stdout.write("\t  Most Recently Used:\t\t%s\t%s\n" % (
         arc['most_recently_used']['per'],
         arc['most_recently_used']['num'],
         )
     )
     sys.stdout.write("\t  Most Frequently Used:\t\t%s\t%s\n" % (
         arc['most_frequently_used']['per'],
         arc['most_frequently_used']['num'],
         )
     )
     sys.stdout.write("\t  Most Recently Used Ghost:\t%s\t%s\n" % (
         arc['most_recently_used_ghost']['per'],
         arc['most_recently_used_ghost']['num'],
         )
     )
     sys.stdout.write("\t  Most Frequently Used Ghost:\t%s\t%s\n" % (
         arc['most_frequently_used_ghost']['per'],
         arc['most_frequently_used_ghost']['num'],
         )
     )
 
     sys.stdout.write("\n\tCACHE HITS BY DATA TYPE:\n")
     sys.stdout.write("\t  Demand Data:\t\t\t%s\t%s\n" % (
         arc["cache_hits_by_data_type"]['demand_data']['per'],
         arc["cache_hits_by_data_type"]['demand_data']['num'],
         )
     )
     sys.stdout.write("\t  Prefetch Data:\t\t%s\t%s\n" % (
         arc["cache_hits_by_data_type"]['prefetch_data']['per'],
         arc["cache_hits_by_data_type"]['prefetch_data']['num'],
         )
     )
     sys.stdout.write("\t  Demand Metadata:\t\t%s\t%s\n" % (
         arc["cache_hits_by_data_type"]['demand_metadata']['per'],
         arc["cache_hits_by_data_type"]['demand_metadata']['num'],
         )
     )
     sys.stdout.write("\t  Prefetch Metadata:\t\t%s\t%s\n" % (
         arc["cache_hits_by_data_type"]['prefetch_metadata']['per'],
         arc["cache_hits_by_data_type"]['prefetch_metadata']['num'],
         )
     )
 
     sys.stdout.write("\n\tCACHE MISSES BY DATA TYPE:\n")
     sys.stdout.write("\t  Demand Data:\t\t\t%s\t%s\n" % (
         arc["cache_misses_by_data_type"]['demand_data']['per'],
         arc["cache_misses_by_data_type"]['demand_data']['num'],
         )
     )
     sys.stdout.write("\t  Prefetch Data:\t\t%s\t%s\n" % (
         arc["cache_misses_by_data_type"]['prefetch_data']['per'],
         arc["cache_misses_by_data_type"]['prefetch_data']['num'],
         )
     )
     sys.stdout.write("\t  Demand Metadata:\t\t%s\t%s\n" % (
         arc["cache_misses_by_data_type"]['demand_metadata']['per'],
         arc["cache_misses_by_data_type"]['demand_metadata']['num'],
         )
     )
     sys.stdout.write("\t  Prefetch Metadata:\t\t%s\t%s\n" % (
         arc["cache_misses_by_data_type"]['prefetch_metadata']['per'],
         arc["cache_misses_by_data_type"]['prefetch_metadata']['num'],
         )
     )
 
 
 def get_l2arc_summary(Kstat):
     output = {}
 
     l2_abort_lowmem = Kstat["kstat.zfs.misc.arcstats.l2_abort_lowmem"]
     l2_cksum_bad = Kstat["kstat.zfs.misc.arcstats.l2_cksum_bad"]
     l2_evict_lock_retry = Kstat["kstat.zfs.misc.arcstats.l2_evict_lock_retry"]
     l2_evict_reading = Kstat["kstat.zfs.misc.arcstats.l2_evict_reading"]
     l2_feeds = Kstat["kstat.zfs.misc.arcstats.l2_feeds"]
     l2_free_on_write = Kstat["kstat.zfs.misc.arcstats.l2_free_on_write"]
     l2_hdr_size = Kstat["kstat.zfs.misc.arcstats.l2_hdr_size"]
     l2_hits = Kstat["kstat.zfs.misc.arcstats.l2_hits"]
     l2_io_error = Kstat["kstat.zfs.misc.arcstats.l2_io_error"]
     l2_misses = Kstat["kstat.zfs.misc.arcstats.l2_misses"]
     l2_rw_clash = Kstat["kstat.zfs.misc.arcstats.l2_rw_clash"]
     l2_size = Kstat["kstat.zfs.misc.arcstats.l2_size"]
     l2_asize = Kstat["kstat.zfs.misc.arcstats.l2_asize"]
     l2_writes_done = Kstat["kstat.zfs.misc.arcstats.l2_writes_done"]
     l2_writes_error = Kstat["kstat.zfs.misc.arcstats.l2_writes_error"]
     l2_writes_sent = Kstat["kstat.zfs.misc.arcstats.l2_writes_sent"]
 
     l2_access_total = (l2_hits + l2_misses)
     output['l2_health_count'] = (l2_writes_error + l2_cksum_bad + l2_io_error)
 
     output['l2_access_total'] = l2_access_total
     output['l2_size'] = l2_size
     output['l2_asize'] = l2_asize
 
     if l2_size > 0 and l2_access_total > 0:
 
         if output['l2_health_count'] > 0:
             output["health"] = "DEGRADED"
         else:
             output["health"] = "HEALTHY"
 
         output["low_memory_aborts"] = fHits(l2_abort_lowmem)
         output["free_on_write"] = fHits(l2_free_on_write)
         output["rw_clashes"] = fHits(l2_rw_clash)
         output["bad_checksums"] = fHits(l2_cksum_bad)
         output["io_errors"] = fHits(l2_io_error)
 
         output["l2_arc_size"] = {}
         output["l2_arc_size"]["adative"] = fBytes(l2_size)
         output["l2_arc_size"]["actual"] = {
             'per': fPerc(l2_asize, l2_size),
             'num': fBytes(l2_asize)
             }
         output["l2_arc_size"]["head_size"] = {
             'per': fPerc(l2_hdr_size, l2_size),
             'num': fBytes(l2_hdr_size),
         }
 
         output["l2_arc_evicts"] = {}
         output["l2_arc_evicts"]['lock_retries'] = fHits(l2_evict_lock_retry)
         output["l2_arc_evicts"]['reading'] = fHits(l2_evict_reading)
 
         output['l2_arc_breakdown'] = {}
         output['l2_arc_breakdown']['value'] = fHits(l2_access_total)
         output['l2_arc_breakdown']['hit_ratio'] = {
             'per': fPerc(l2_hits, l2_access_total),
             'num': fHits(l2_hits),
         }
         output['l2_arc_breakdown']['miss_ratio'] = {
             'per': fPerc(l2_misses, l2_access_total),
             'num': fHits(l2_misses),
         }
         output['l2_arc_breakdown']['feeds'] = fHits(l2_feeds)
 
         output['l2_arc_buffer'] = {}
 
         output['l2_arc_writes'] = {}
         output['l2_writes_done'] = l2_writes_done
         output['l2_writes_sent'] = l2_writes_sent
         if l2_writes_done != l2_writes_sent:
             output['l2_arc_writes']['writes_sent'] = {
                 'value': "FAULTED",
                 'num': fHits(l2_writes_sent),
             }
             output['l2_arc_writes']['done_ratio'] = {
                 'per': fPerc(l2_writes_done, l2_writes_sent),
                 'num': fHits(l2_writes_done),
             }
             output['l2_arc_writes']['error_ratio'] = {
                 'per': fPerc(l2_writes_error, l2_writes_sent),
                 'num': fHits(l2_writes_error),
             }
         else:
             output['l2_arc_writes']['writes_sent'] = {
                 'per': fPerc(100),
                 'num': fHits(l2_writes_sent),
             }
 
     return output
 
 
 def _l2arc_summary(Kstat):
-
+    """Print the L2ARC summary built by get_l2arc_summary() to stdout."""
     arc = get_l2arc_summary(Kstat)
 
     if arc['l2_size'] > 0 and arc['l2_access_total'] > 0:
         sys.stdout.write("L2 ARC Summary: ")
         if arc['l2_health_count'] > 0:
             sys.stdout.write("(DEGRADED)\n")
         else:
             sys.stdout.write("(HEALTHY)\n")
         sys.stdout.write("\tLow Memory Aborts:\t\t\t%s\n" %
                 arc['low_memory_aborts'])
         sys.stdout.write("\tFree on Write:\t\t\t\t%s\n" % arc['free_on_write'])
         sys.stdout.write("\tR/W Clashes:\t\t\t\t%s\n" % arc['rw_clashes'])
         sys.stdout.write("\tBad Checksums:\t\t\t\t%s\n" % arc['bad_checksums'])
         sys.stdout.write("\tIO Errors:\t\t\t\t%s\n" % arc['io_errors'])
         sys.stdout.write("\n")
 
         sys.stdout.write("L2 ARC Size: (Adaptive)\t\t\t\t%s\n" %
                 arc["l2_arc_size"]["adative"])
         sys.stdout.write("\tCompressed:\t\t\t%s\t%s\n" % (
             arc["l2_arc_size"]["actual"]["per"],
             arc["l2_arc_size"]["actual"]["num"],
             )
         )
         sys.stdout.write("\tHeader Size:\t\t\t%s\t%s\n" % (
             arc["l2_arc_size"]["head_size"]["per"],
             arc["l2_arc_size"]["head_size"]["num"],
             )
         )
         sys.stdout.write("\n")
 
-        if arc["l2_arc_evicts"]['lock_retries'] + \
-                arc["l2_arc_evicts"]["reading"] > 0:
+        if arc["l2_arc_evicts"]['lock_retries'] != '0' or \
+                arc["l2_arc_evicts"]["reading"] != '0':  # fHits() strings
             sys.stdout.write("L2 ARC Evicts:\n")
             sys.stdout.write("\tLock Retries:\t\t\t\t%s\n" %
                     arc["l2_arc_evicts"]['lock_retries'])
             sys.stdout.write("\tUpon Reading:\t\t\t\t%s\n" %
                     arc["l2_arc_evicts"]["reading"])
             sys.stdout.write("\n")
 
         sys.stdout.write("L2 ARC Breakdown:\t\t\t\t%s\n" %
                 arc['l2_arc_breakdown']['value'])
         sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % (
             arc['l2_arc_breakdown']['hit_ratio']['per'],
             arc['l2_arc_breakdown']['hit_ratio']['num'],
             )
         )
 
         sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % (
             arc['l2_arc_breakdown']['miss_ratio']['per'],
             arc['l2_arc_breakdown']['miss_ratio']['num'],
             )
         )
 
         sys.stdout.write("\tFeeds:\t\t\t\t\t%s\n" %
                 arc['l2_arc_breakdown']['feeds'])
         sys.stdout.write("\n")
 
         sys.stdout.write("L2 ARC Writes:\n")
         if arc['l2_writes_done'] != arc['l2_writes_sent']:
             sys.stdout.write("\tWrites Sent: (%s)\t\t\t\t%s\n" % (
                 arc['l2_arc_writes']['writes_sent']['value'],
                 arc['l2_arc_writes']['writes_sent']['num'],
                 )
             )
             sys.stdout.write("\t  Done Ratio:\t\t\t%s\t%s\n" % (
                 arc['l2_arc_writes']['done_ratio']['per'],
                 arc['l2_arc_writes']['done_ratio']['num'],
                 )
             )
             sys.stdout.write("\t  Error Ratio:\t\t\t%s\t%s\n" % (
                 arc['l2_arc_writes']['error_ratio']['per'],
                 arc['l2_arc_writes']['error_ratio']['num'],
                 )
             )
         else:
             sys.stdout.write("\tWrites Sent:\t\t\t%s\t%s\n" % (
                 arc['l2_arc_writes']['writes_sent']['per'],
                 arc['l2_arc_writes']['writes_sent']['num'],
                 )
             )
 
 def get_dmu_summary(Kstat):
     output = {}
 
     zfetch_bogus_streams = Kstat["kstat.zfs.misc.zfetchstats.bogus_streams"]
     zfetch_colinear_hits = Kstat["kstat.zfs.misc.zfetchstats.colinear_hits"]
     zfetch_colinear_misses = \
             Kstat["kstat.zfs.misc.zfetchstats.colinear_misses"]
     zfetch_hits = Kstat["kstat.zfs.misc.zfetchstats.hits"]
     zfetch_misses = Kstat["kstat.zfs.misc.zfetchstats.misses"]
     zfetch_reclaim_failures = \
             Kstat["kstat.zfs.misc.zfetchstats.reclaim_failures"]
     zfetch_reclaim_successes = \
             Kstat["kstat.zfs.misc.zfetchstats.reclaim_successes"]
     zfetch_streams_noresets = \
             Kstat["kstat.zfs.misc.zfetchstats.streams_noresets"]
     zfetch_streams_resets = Kstat["kstat.zfs.misc.zfetchstats.streams_resets"]
     zfetch_stride_hits = Kstat["kstat.zfs.misc.zfetchstats.stride_hits"]
     zfetch_stride_misses = Kstat["kstat.zfs.misc.zfetchstats.stride_misses"]
 
     zfetch_access_total = (zfetch_hits + zfetch_misses)
     zfetch_colinear_total = (zfetch_colinear_hits + zfetch_colinear_misses)
     zfetch_health_count = (zfetch_bogus_streams)
     zfetch_reclaim_total = (zfetch_reclaim_successes + zfetch_reclaim_failures)
     zfetch_streams_total = (zfetch_streams_resets + zfetch_streams_noresets +
             zfetch_bogus_streams)
     zfetch_stride_total = (zfetch_stride_hits + zfetch_stride_misses)
     output['zfetch_access_total'] = zfetch_access_total
 
     if zfetch_access_total > 0:
 
         output['file_level_prefetch'] = {}
         if zfetch_health_count > 0:
             output['file_level_prefetch']['health'] = 'DEGRADED'
         else:
             output['file_level_prefetch']['health'] = 'HEALTHY'
 
         output['dmu'] = {}
         output['dmu']['efficiency'] = {}
         output['dmu']['efficiency']['value'] = fHits(zfetch_access_total)
         output['dmu']['efficiency']['hit_ratio'] = {
             'per': fPerc(zfetch_hits, zfetch_access_total),
             'num': fHits(zfetch_hits),
         }
         output['dmu']['efficiency']['miss_ratio'] = {
             'per': fPerc(zfetch_misses, zfetch_access_total),
             'num': fHits(zfetch_misses),
         }
 
         output['dmu']['colinear'] = {}
         output['dmu']['colinear']['value'] = fHits(zfetch_colinear_total)
         output['dmu']['colinear']['hit_ratio'] = {
             'per': fPerc(zfetch_colinear_hits, zfetch_colinear_total),
             'num': fHits(zfetch_colinear_hits),
         }
         output['dmu']['colinear']['miss_ratio'] = {
             'per': fPerc(zfetch_colinear_misses, zfetch_colinear_total),
             'num': fHits(zfetch_colinear_misses),
         }
 
         output['dmu']['stride'] = {}
         output['dmu']['stride']['value'] = fHits(zfetch_stride_total)
         output['dmu']['stride']['hit_ratio'] = {
             'per': fPerc(zfetch_stride_hits, zfetch_stride_total),
             'num': fHits(zfetch_stride_hits),
         }
         output['dmu']['stride']['miss_ratio'] = {
             'per': fPerc(zfetch_stride_misses, zfetch_stride_total),
             'num': fHits(zfetch_stride_misses),
         }
 
         output['dmu_misc'] = {}
         if zfetch_health_count > 0:
             output['dmu_misc']['status'] = "FAULTED"
         else:
             output['dmu_misc']['status'] = ""
 
         output['dmu_misc']['reclaim'] = {}
         output['dmu_misc']['reclaim']['value'] = fHits(zfetch_reclaim_total)
         output['dmu_misc']['reclaim']['successes'] = {
             'per': fPerc(zfetch_reclaim_successes, zfetch_reclaim_total),
             'num': fHits(zfetch_reclaim_successes),
         }
         output['dmu_misc']['reclaim']['failure'] = {
             'per': fPerc(zfetch_reclaim_failures, zfetch_reclaim_total),
             'num': fHits(zfetch_reclaim_failures),
         }
 
         output['dmu_misc']['streams'] = {}
         output['dmu_misc']['streams']['value'] = fHits(zfetch_streams_total)
         output['dmu_misc']['streams']['plus_resets'] = {
             'per': fPerc(zfetch_streams_resets, zfetch_streams_total),
             'num': fHits(zfetch_streams_resets),
         }
         output['dmu_misc']['streams']['neg_resets'] = {
             'per': fPerc(zfetch_streams_noresets, zfetch_streams_total),
             'num': fHits(zfetch_streams_noresets),
         }
         output['dmu_misc']['streams']['bogus'] = fHits(zfetch_bogus_streams)
 
     return output
 
 
 def _dmu_summary(Kstat):
     """Print the file-level prefetch (DMU zfetch) page of the report.
 
     The figures come from get_dmu_summary(Kstat); nothing is written when
     it reports no zfetch accesses.
     """
     arc = get_dmu_summary(Kstat)
     if not arc['zfetch_access_total'] > 0:
         return
 
     out = sys.stdout.write
 
     def pair(entry):
         # Every ratio entry carries a formatted percentage and a raw count.
         return (entry['per'], entry['num'])
 
     out("File-Level Prefetch: (%s)" % arc['file_level_prefetch']['health'])
     out("\n")
 
     efficiency = arc['dmu']['efficiency']
     out("DMU Efficiency:\t\t\t\t\t%s\n" % efficiency['value'])
     out("\tHit Ratio:\t\t\t%s\t%s\n" % pair(efficiency['hit_ratio']))
     out("\tMiss Ratio:\t\t\t%s\t%s\n" % pair(efficiency['miss_ratio']))
 
     out("\n")
 
     # Colinear and stride statistics share one layout; only the title and
     # the dictionary key differ.
     for title, key in (("\tColinear:\t\t\t\t%s\n", 'colinear'),
                        ("\tStride:\t\t\t\t\t%s\n", 'stride')):
         section = arc['dmu'][key]
         out(title % section['value'])
         out("\t  Hit Ratio:\t\t\t%s\t%s\n" % pair(section['hit_ratio']))
         out("\t  Miss Ratio:\t\t\t%s\t%s\n" % pair(section['miss_ratio']))
         out("\n")
 
     misc = arc['dmu_misc']
     out("DMU Misc: %s\n" % misc['status'])
 
     reclaim = misc['reclaim']
     out("\tReclaim:\t\t\t\t%s\n" % reclaim['value'])
     out("\t  Successes:\t\t\t%s\t%s\n" % pair(reclaim['successes']))
     out("\t  Failures:\t\t\t%s\t%s\n" % pair(reclaim['failure']))
 
     streams = misc['streams']
     out("\n\tStreams:\t\t\t\t%s\n" % streams['value'])
     out("\t  +Resets:\t\t\t%s\t%s\n" % pair(streams['plus_resets']))
     out("\t  -Resets:\t\t\t%s\t%s\n" % pair(streams['neg_resets']))
     out("\t  Bogus:\t\t\t\t%s\n" % streams['bogus'])
 
 
 def get_vdev_summary(Kstat):
     """Summarise the vdev cache counters found in Kstat.
 
     Returns a dict that always holds 'vdev_cache_total' (hits + misses +
     delegations).  When that total is positive it also holds 'summary'
     plus 'hit_ratio', 'miss_ratio' and 'delegations', each a
     {'per': ..., 'num': ...} pair produced by fPerc()/fHits().
     """
     delegations = Kstat["kstat.zfs.misc.vdev_cache_stats.delegations"]
     misses = Kstat["kstat.zfs.misc.vdev_cache_stats.misses"]
     hits = Kstat["kstat.zfs.misc.vdev_cache_stats.hits"]
     total = misses + hits + delegations
 
     output = {'vdev_cache_total': total}
     if not total > 0:
         return output
 
     output['summary'] = fHits(total)
     for key, amount in (('hit_ratio', hits),
                         ('miss_ratio', misses),
                         ('delegations', delegations)):
         output[key] = {
             'per': fPerc(amount, total),
             'num': fHits(amount),
         }
 
     return output
 
 
 def _vdev_summary(Kstat):
     """Print the VDEV cache page; silent when the cache saw no traffic."""
     arc = get_vdev_summary(Kstat)
     if not arc['vdev_cache_total'] > 0:
         return
 
     sys.stdout.write("VDEV Cache Summary:\t\t\t\t%s\n" % arc['summary'])
     rows = (
         ("\tHit Ratio:\t\t\t%s\t%s\n", 'hit_ratio'),
         ("\tMiss Ratio:\t\t\t%s\t%s\n", 'miss_ratio'),
         ("\tDelegations:\t\t\t%s\t%s\n", 'delegations'),
     )
     for layout, key in rows:
         entry = arc[key]
         sys.stdout.write(layout % (entry['per'], entry['num']))
 
 
 def _tunable_summary(Kstat):
     """Print every ZFS module parameter with its current value.
 
     Parameter names and values are read from /sys/module/zfs/parameters/.
     When show_tunable_descriptions is set, "/sbin/modinfo zfs -0" is run
     and each parameter is preceded by its description; if modinfo cannot
     be run or fails, a warning goes to stderr and values are printed
     without descriptions.  alternate_tunable_layout switches the output
     to "name=value".  Kstat is unused; it is accepted so the function has
     the same signature as the other report pages.
     """
     global show_tunable_descriptions
     global alternate_tunable_layout
 
     names = listdir("/sys/module/zfs/parameters/")
 
     values = {}
     for name in names:
         with open("/sys/module/zfs/parameters/" + name) as f:
             value = f.read()
         values[name] = value.strip()
 
     descriptions = {}
 
     if show_tunable_descriptions:
         command = ["/sbin/modinfo", "zfs", "-0"]
         try:
             # universal_newlines makes the output text on Python 3 as well.
             p = Popen(command, stdin=PIPE, stdout=PIPE,
                     stderr=PIPE, shell=False, close_fds=True,
                     universal_newlines=True)
 
             # communicate() drains both pipes while waiting for the child.
             # Calling wait() first can deadlock once a pipe buffer fills.
             description_list = p.communicate()[0].strip().split('\0')
 
             if p.returncode == 0:
                 for tunable in description_list:
                     if tunable[0:5] == 'parm:':
                         tunable = tunable[5:].strip()
                         # partition() tolerates an entry without a ':'
                         # instead of raising on tuple unpacking.
                         name, _, description = tunable.partition(':')
                         if not description:
                             description = "Description unavailable"
                         descriptions[name] = description
             else:
                 sys.stderr.write("%s: '%s' exited with code %i\n" %
                         (sys.argv[0], command[0], p.returncode))
                 sys.stderr.write("Tunable descriptions will be disabled.\n")
         except OSError as e:
             sys.stderr.write("%s: Cannot run '%s': %s\n" %
                     (sys.argv[0], command[0], e.strerror))
             sys.stderr.write("Tunable descriptions will be disabled.\n")
 
     # The layout does not depend on the parameter, so pick it once.
     if alternate_tunable_layout:
         layout = "\t%s=%s\n"
     else:
         layout = "\t%-50s%s\n"
 
     sys.stdout.write("ZFS Tunable:\n")
     for name in names:
         if not name:
             continue
 
         if show_tunable_descriptions and name in descriptions:
             sys.stdout.write("\t# %s\n" % descriptions[name])
 
         sys.stdout.write(layout % (name, values[name]))
 
 
 # Report pages in display order; main() maps "-p N" to unSub[N - 1] and
 # calls each selected entry with the Kstat dictionary.
 unSub = [
     _arc_summary,
     _arc_efficiency,
     _l2arc_summary,
     _dmu_summary,
     _vdev_summary,
     _tunable_summary
 ]
 
 
 def zfs_header():
     """Print the report banner: title plus the current local date and time.
 
     The banner is framed by div1() before and div2() after (presumably
     divider lines; both are defined elsewhere in this file).
     """
     timestamp = time.strftime("%a %b %d %H:%M:%S %Y")
     banner = "ZFS Subsystem Report\t\t\t\t%s" % timestamp
 
     div1()
     sys.stdout.write(banner)
     div2()
 
 
 def usage():
     """Write the arc_summary.py help text to stdout (does not exit)."""
     page_count = str(len(unSub))
     help_lines = (
         "Usage: arc_summary.py [-h] [-a] [-d] [-p PAGE]\n\n",
         "\t -h, --help           : "
         "Print this help message and exit\n",
         "\t -a, --alternate      : "
         "Show an alternate sysctl layout\n",
         "\t -d, --description    : "
         "Show the sysctl descriptions\n",
         "\t -p PAGE, --page=PAGE : "
         "Select a single output page to display,\n",
         "\t                        "
         "should be an integer between 1 and " + page_count + "\n\n",
         "Examples:\n",
         "\tarc_summary.py -a\n",
         "\tarc_summary.py -p 4\n",
         "\tarc_summary.py -ad\n",
         "\tarc_summary.py --page=2\n",
     )
     for text in help_lines:
         sys.stdout.write(text)
 
 def main():
     """Parse the command line and print the requested report pages.
 
     Options: -a/--alternate and -d/--description set the module-level
     flags read by _tunable_summary(); -p/--page selects one page of unSub
     (1-based); -h/--help prints usage() and exits.  An unknown option or
     an invalid page number is reported on stderr and the process exits
     with status 1.
     """
     global show_tunable_descriptions
     global alternate_tunable_layout
 
     try:
         opts, args = getopt.getopt(
             sys.argv[1:], "adp:h", ["alternate", "description", "page=", "help"]
         )
     except getopt.GetoptError as e:
         sys.stderr.write("%s: %s\n" % (sys.argv[0], e))
         usage()
         sys.exit(1)
 
     args = {}
     for opt, arg in opts:
         if opt in ('-a', '--alternate'):
             args['a'] = True
         if opt in ('-d', '--description'):
             args['d'] = True
         if opt in ('-p', '--page'):
             args['p'] = arg
         if opt in ('-h', '--help'):
             usage()
             sys.exit()
 
     Kstat = get_Kstat()
 
     alternate_tunable_layout = 'a' in args
     show_tunable_descriptions = 'd' in args
 
     if 'p' in args:
         try:
             page_number = int(args['p'])
         except ValueError:
             # Non-numeric input is reported like an out-of-range page.
             page_number = 0
 
         # Check the range explicitly: list indexing alone would accept 0
         # and negative numbers and silently pick a page from the end.
         if not 1 <= page_number <= len(unSub):
             sys.stderr.write('the argument to -p must be between 1 and ' +
                     str(len(unSub)) + '\n')
             sys.exit(1)
 
         pages = [unSub[page_number - 1]]
     else:
         pages = unSub
 
     zfs_header()
     for page in pages:
         page(Kstat)
         div2()
 
 if __name__ == '__main__':
     main()
diff --git a/cmd/arcstat/arcstat.py b/cmd/arcstat/arcstat.py
index b516cf285fdc..bbf43100aa4e 100755
--- a/cmd/arcstat/arcstat.py
+++ b/cmd/arcstat/arcstat.py
@@ -1,459 +1,457 @@
 #!/usr/bin/python
 #
 # Print out ZFS ARC Statistics exported via kstat(1)
 # For a definition of fields, or usage, use arcstat.py -v
 #
 # This script is a fork of the original arcstat.pl (0.1) by
 # Neelakanth Nadgir, originally published on his Sun blog on
 # 09/18/2007
 #     http://blogs.sun.com/realneel/entry/zfs_arc_statistics
 #
 # This version aims to improve upon the original by adding features
 # and fixing bugs as needed.  This version is maintained by
 # Mike Harsch and is hosted in a public open source repository:
 #    http://github.com/mharsch/arcstat
 #
 # Comments, Questions, or Suggestions are always welcome.
 # Contact the maintainer at ( mike at harschsystems dot com )
 #
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
 # Common Development and Distribution License, Version 1.0 only
 # (the "License").  You may not use this file except in compliance
 # with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
 # See the License for the specific language governing permissions
 # and limitations under the License.
 #
 # When distributing Covered Code, include this CDDL HEADER in each
 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 # If applicable, add the following below this CDDL HEADER, with the
 # fields enclosed by brackets "[]" replaced with your own identifying
 # information: Portions Copyright [yyyy] [name of copyright owner]
 #
 # CDDL HEADER END
 #
 #
 # Fields have a fixed width. Every interval, we fill the "v"
 # hash with its corresponding value (v[field]=value) using calculate().
 # @hdr is the array of fields that needs to be printed, so we
 # just iterate over this array and print the values using our pretty printer.
 #
 
 
 import sys
 import time
 import getopt
 import re
 import copy
 
 from decimal import Decimal
 from signal import signal, SIGINT, SIGWINCH, SIG_DFL
 
 # Column catalogue, keyed by the header name printed above each column.
 #   Size        - field width in characters (print_header(), prettynum())
 #   Scale       - divisor prettynum() uses when shortening a value with a
 #                 K/M/G... suffix; -1 marks a value printed verbatim
 #   Description - text listed by detailed_usage() ("arcstat.py -v")
 cols = {
     # HDR:        [Size, Scale, Description]
     "time":       [8, -1, "Time"],
     "hits":       [4, 1000, "ARC reads per second"],
     "miss":       [4, 1000, "ARC misses per second"],
     "read":       [4, 1000, "Total ARC accesses per second"],
     "hit%":       [4, 100, "ARC Hit percentage"],
     "miss%":      [5, 100, "ARC miss percentage"],
     "dhit":       [4, 1000, "Demand hits per second"],
     "dmis":       [4, 1000, "Demand misses per second"],
     "dh%":        [3, 100, "Demand hit percentage"],
     "dm%":        [3, 100, "Demand miss percentage"],
     "phit":       [4, 1000, "Prefetch hits per second"],
     "pmis":       [4, 1000, "Prefetch misses per second"],
     "ph%":        [3, 100, "Prefetch hits percentage"],
     "pm%":        [3, 100, "Prefetch miss percentage"],
     "mhit":       [4, 1000, "Metadata hits per second"],
     "mmis":       [4, 1000, "Metadata misses per second"],
     "mread":      [4, 1000, "Metadata accesses per second"],
     "mh%":        [3, 100, "Metadata hit percentage"],
     "mm%":        [3, 100, "Metadata miss percentage"],
     "arcsz":      [5, 1024, "ARC Size"],
     "c":          [4, 1024, "ARC Target Size"],
     "mfu":        [4, 1000, "MFU List hits per second"],
     "mru":        [4, 1000, "MRU List hits per second"],
     "mfug":       [4, 1000, "MFU Ghost List hits per second"],
     "mrug":       [4, 1000, "MRU Ghost List hits per second"],
     "eskip":      [5, 1000, "evict_skip per second"],
     "mtxmis":     [6, 1000, "mutex_miss per second"],
-    "rmis":       [4, 1000, "recycle_miss per second"],
     "dread":      [5, 1000, "Demand accesses per second"],
     "pread":      [5, 1000, "Prefetch accesses per second"],
     "l2hits":     [6, 1000, "L2ARC hits per second"],
     "l2miss":     [6, 1000, "L2ARC misses per second"],
     "l2read":     [6, 1000, "Total L2ARC accesses per second"],
     "l2hit%":     [6, 100, "L2ARC access hit percentage"],
     "l2miss%":    [7, 100, "L2ARC access miss percentage"],
     "l2asize":    [7, 1024, "Actual (compressed) size of the L2ARC"],
     "l2size":     [6, 1024, "Size of the L2ARC"],
     "l2bytes":    [7, 1024, "bytes read per second from the L2ARC"],
 }
 
 v = {}                 # Values for the current output row, keyed by column
 hdr = ["time", "read", "miss", "miss%", "dmis", "dm%", "pmis", "pm%", "mmis",
        "mm%", "arcsz", "c"]
 # Extended (-x) column set.  Every name here must exist in cols and be
 # filled in by calculate(); "rmis" no longer is, so it must not be listed.
 xhdr = ["time", "mfu", "mru", "mfug", "mrug", "eskip", "mtxmis",
         "dread", "pread", "read"]
 sint = 1               # Default interval is 1 second
 count = 1              # Default count is 1
 hdr_intr = 20          # Print header every 20 lines of output
 opfile = None
 sep = "  "              # Default separator is 2 spaces
 version = "0.4"
 l2exist = False
 cmd = ("Usage: arcstat.py [-hvx] [-f fields] [-o file] [-s string] [interval "
        "[count]]\n")
 cur = {}               # Most recent kstat snapshot
 d = {}                 # Per-key difference between the last two snapshots
 out = None
 kstat = None
 # Raw string so that "\." reaches the regex engine as written.
 float_pobj = re.compile(r"^[0-9]+(\.[0-9]+)?$")
 
 
 def detailed_usage():
     """Write the usage line and every field definition to stderr, exit 1."""
     emit = sys.stderr.write
 
     emit("%s\n" % cmd)
     emit("Field definitions are as follows:\n")
     for field, spec in cols.items():
         # spec is [Size, Scale, Description]
         emit("%11s : %s\n" % (field, spec[2]))
     emit("\n")
 
     sys.exit(1)
 
 
 def usage():
     """Write the arcstat.py option summary and examples to stderr, exit 1."""
     help_lines = (
         "%s\n" % cmd,
         "\t -h : Print this help message\n",
         "\t -v : List all possible field headers and definitions"
         "\n",
         "\t -x : Print extended stats\n",
         "\t -f : Specify specific fields to print (see -v)\n",
         "\t -o : Redirect output to the specified file\n",
         "\t -s : Override default field separator with custom "
         "character or string\n",
         "\nExamples:\n",
         "\tarcstat.py -o /tmp/a.log 2 10\n",
         "\tarcstat.py -s \",\" -o /tmp/a.log 2 10\n",
         "\tarcstat.py -v\n",
         "\tarcstat.py -f time,hit%,dh%,ph%,mh% 1\n",
         "\n",
     )
     for text in help_lines:
         sys.stderr.write(text)
 
     sys.exit(1)
 
 
 def kstat_update():
     """Reload the module-level kstat dict from /proc/spl/kstat/zfs/arcstats.
 
     The first two lines of the file are discarded; every remaining
     non-empty line must split into three whitespace-separated fields, of
     which the first is stored as the key and the third as a Decimal
     value.  Exits with status 1 when the file is empty.
     """
     global kstat
 
     # This runs once per sampling interval, so close the handle promptly
     # rather than leaving it to the garbage collector.
     with open('/proc/spl/kstat/zfs/arcstats') as f:
         k = [line.strip() for line in f]
 
     if not k:
         sys.exit(1)
 
     del k[0:2]
     kstat = {}
 
     for s in k:
         if not s:
             continue
 
         name, unused, value = s.split()
         kstat[name] = Decimal(value)
 
 
 def snap_stats():
     """Take a new kstat snapshot and record per-key deltas in d.
 
     cur becomes the fresh snapshot.  For each key, d[key] is the change
     since the previous snapshot, or the absolute value when the key was
     not present before (as on the first call).
     """
     global cur
     global kstat
 
     prev = copy.deepcopy(cur)
     kstat_update()
 
     cur = kstat
     for key in cur:
         # re.match() takes (pattern, string).  With the arguments swapped
         # the key acted as the pattern, so the key "c" matched "class"
         # and was dropped from d.
         if re.match("class", key):
             continue
         if key in prev:
             d[key] = cur[key] - prev[key]
         else:
             d[key] = cur[key]
 
 
 def prettynum(sz, scale, num=0):
     """Format num for a column that is sz characters wide.
 
     scale == -1 returns str(num) untouched (used for the time column).
     Otherwise num is divided by scale while it exceeds scale, at most
     five times, and the matching suffix (K, M, G, T, P) is appended.  One
     decimal place is shown while the scaled value is below 10.  Values
     strictly between 0 and 1 are printed as 0.
     """
     suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
 
     # Special case for date field
     if scale == -1:
         return "%s" % num
 
     # Rounding error, return 0
     if 0 < num < 1:
         num = 0
 
     steps = 0
     before = 0
     while steps < 5 and num > scale:
         before, num = num, num / scale
         steps += 1
 
     if steps == 0:
         return "%*d" % (sz, num)
 
     # One character of the field is reserved for the unit suffix.
     layout = "%*.1f%s" if (before / scale) < 10 else "%*d%s"
     return layout % (sz - 1, num, suffix[steps])
 
 
 def print_values():
     """Write one output row: each column in hdr, formatted, then sep."""
     global hdr
     global sep
     global v
 
     for col in hdr:
         width, scale = cols[col][0], cols[col][1]
         cell = prettynum(width, scale, v[col])
         sys.stdout.write("%s%s" % (cell, sep))
     sys.stdout.write("\n")
 
 
 def print_header():
     """Write the header row: each name in hdr right-aligned to its width."""
     global hdr
     global sep
 
     for col in hdr:
         width = cols[col][0]
         sys.stdout.write("%*s%s" % (width, col, sep))
     sys.stdout.write("\n")
 
 def get_terminal_lines():
     """Return the number of rows of the terminal behind sys.stdout.
 
     The size is queried with the TIOCGWINSZ ioctl.  Returns None when it
     cannot be determined, e.g. stdout is not a terminal or the
     fcntl/termios modules are unavailable.
     """
     try:
         import fcntl
         import termios
         import struct
         data = fcntl.ioctl(sys.stdout.fileno(), termios.TIOCGWINSZ, '1234')
         sz = struct.unpack('hh', data)
         return sz[0]
     except Exception:
         # Best effort only; the caller keeps its current header interval.
         # Unlike a bare "except:", KeyboardInterrupt and SystemExit still
         # propagate.
         return None
 
 def update_hdr_intr():
     """Set hdr_intr to the terminal height minus 3 when it is known.
 
     hdr_intr is left unchanged when the height is unavailable or is 3
     rows or fewer.
     """
     global hdr_intr
 
     rows = get_terminal_lines()
     if not rows:
         return
     if rows > 3:
         hdr_intr = rows - 3
 
 def resize_handler(signum, frame):
     """Signal handler: recompute the header interval via update_hdr_intr().
 
     signum and frame are supplied by the signal machinery and are unused.
     """
     update_hdr_intr()
 
 
 def init():
     """Parse the command line and set up the module-level run state.
 
     Handles -h/-v (print help and exit), picks the column list (default,
     extended with -x, or custom with -f), reads the positional
     "interval [count]" arguments into sint/count, takes a first kstat
     snapshot to detect an L2ARC, validates requested columns and finally
     redirects sys.stdout to the -o file.  Invalid input ends in usage(),
     which exits with status 1.
     """
     global sint
     global count
     global hdr
     global xhdr
     global opfile
     global sep
     global out
     global l2exist
 
     desired_cols = None
     xflag = False
     hflag = False
     vflag = False
 
     try:
         # Long options that take a value need the trailing "="; without
         # it getopt treats them as flags and never hands over the value.
         opts, args = getopt.getopt(
             sys.argv[1:],
             "xo:hvs:f:",
             [
                 "extended",
                 "outfile=",
                 "help",
                 "verbose",
                 "seperator=",
                 "columns="
             ]
         )
     except getopt.error as msg:
         # write() needs a string, not the exception object.
         sys.stderr.write("%s\n" % msg)
         usage()     # does not return
 
     for opt, arg in opts:
         if opt in ('-x', '--extended'):
             xflag = True
         if opt in ('-o', '--outfile'):
             opfile = arg
         if opt in ('-h', '--help'):
             hflag = True
         if opt in ('-v', '--verbose'):
             vflag = True
         if opt in ('-s', '--seperator'):
             sep = arg
         if opt in ('-f', '--columns'):
             desired_cols = arg
 
     if hflag or (xflag and desired_cols):
         usage()
 
     if vflag:
         detailed_usage()
 
     # getopt returns the positional arguments directly.  Re-deriving them
     # from an argv index miscounts forms such as "-o/tmp/a.log" or "-xv".
     if len(args) > 1:
         sint = Decimal(args[0])
         count = int(args[1])
 
     elif len(args) > 0:
         # An interval without a count means "run until interrupted".
         sint = Decimal(args[0])
         count = 0
 
     if xflag:
         hdr = xhdr
 
     update_hdr_intr()
 
     # check if L2ARC exists
     snap_stats()
     l2_size = cur.get("l2_size")
     if l2_size:
         l2exist = True
 
     if desired_cols:
         hdr = desired_cols.split(",")
 
         invalid = []
         incompat = []
         for ele in hdr:
             if ele not in cols:
                 invalid.append(ele)
             elif not l2exist and ele.startswith("l2"):
                 sys.stdout.write("No L2ARC Here\n%s\n" % ele)
                 incompat.append(ele)
 
         if len(invalid) > 0:
             sys.stderr.write("Invalid column definition! -- %s\n" % invalid)
             usage()
 
         if len(incompat) > 0:
             sys.stderr.write("Incompatible field specified! -- %s\n" %
                              incompat)
             usage()
 
     if opfile:
         try:
             out = open(opfile, "w")
             sys.stdout = out
 
         except IOError:
             sys.stderr.write("Cannot open %s for writing\n" % opfile)
             sys.exit(1)
 
 
 def calculate():
     """Rebuild v, the values for the next output row.
 
     Counter deltas from d are divided by sint (the sampling interval) to
     give per-second rates.  Each percentage is derived from the matching
     rates and is 0 when there were no accesses of that kind.  Sizes are
     taken from the latest snapshot in cur.  The l2* fields are only
     filled in when l2exist is set.
     """
     global d
     global v
     global l2exist
 
     v = dict()
     v["time"] = time.strftime("%H:%M:%S", time.localtime())
     v["hits"] = d["hits"] / sint
     v["miss"] = d["misses"] / sint
     v["read"] = v["hits"] + v["miss"]
     v["hit%"] = 100 * v["hits"] / v["read"] if v["read"] > 0 else 0
     v["miss%"] = 100 - v["hit%"] if v["read"] > 0 else 0
 
     # Demand = demand data + demand metadata
     v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) / sint
     v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) / sint
 
     v["dread"] = v["dhit"] + v["dmis"]
     v["dh%"] = 100 * v["dhit"] / v["dread"] if v["dread"] > 0 else 0
     v["dm%"] = 100 - v["dh%"] if v["dread"] > 0 else 0
 
     # Prefetch = prefetch data + prefetch metadata
     v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) / sint
     v["pmis"] = (d["prefetch_data_misses"] +
                  d["prefetch_metadata_misses"]) / sint
 
     v["pread"] = v["phit"] + v["pmis"]
     v["ph%"] = 100 * v["phit"] / v["pread"] if v["pread"] > 0 else 0
     v["pm%"] = 100 - v["ph%"] if v["pread"] > 0 else 0
 
     # Metadata = prefetch metadata + demand metadata
     v["mhit"] = (d["prefetch_metadata_hits"] +
                  d["demand_metadata_hits"]) / sint
     v["mmis"] = (d["prefetch_metadata_misses"] +
                  d["demand_metadata_misses"]) / sint
 
     v["mread"] = v["mhit"] + v["mmis"]
     v["mh%"] = 100 * v["mhit"] / v["mread"] if v["mread"] > 0 else 0
     v["mm%"] = 100 - v["mh%"] if v["mread"] > 0 else 0
 
     # Sizes are absolute values from the snapshot, not rates
     v["arcsz"] = cur["size"]
     v["c"] = cur["c"]
     v["mfu"] = d["mfu_hits"] / sint
     v["mru"] = d["mru_hits"] / sint
     v["mrug"] = d["mru_ghost_hits"] / sint
     v["mfug"] = d["mfu_ghost_hits"] / sint
     v["eskip"] = d["evict_skip"] / sint
-    v["rmis"] = d["recycle_miss"] / sint
     v["mtxmis"] = d["mutex_miss"] / sint
 
     if l2exist:
         v["l2hits"] = d["l2_hits"] / sint
         v["l2miss"] = d["l2_misses"] / sint
         v["l2read"] = v["l2hits"] + v["l2miss"]
         v["l2hit%"] = 100 * v["l2hits"] / v["l2read"] if v["l2read"] > 0 else 0
 
         v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0
         v["l2asize"] = cur["l2_asize"]
         v["l2size"] = cur["l2_size"]
         v["l2bytes"] = d["l2_read_bytes"] / sint
 
 
 def main():
     """Run the sampling loop: snapshot, calculate, print, sleep.
 
     A header is printed before the first row and again whenever the row
     counter wraps after reaching hdr_intr.  With a positive count the
     loop stops after that many rows; otherwise it runs until interrupted.
     The -o output file, if any, is closed on normal exit.
     """
     global sint
     global count
     global hdr_intr
 
     init()
     bounded = count > 0
 
     signal(SIGINT, SIG_DFL)
     signal(SIGWINCH, resize_handler)
 
     row = 0
     while True:
         if row == 0:
             print_header()
 
         snap_stats()
         calculate()
         print_values()
 
         if bounded:
             if count <= 1:
                 break
             count -= 1
 
         if row >= hdr_intr:
             row = 0
         else:
             row += 1
         time.sleep(sint)
 
     if out:
         out.close()
 
 
 if __name__ == '__main__':
     main()
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index 5211e656456d..fd80b34ee44d 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -1,116 +1,118 @@
 SUBDIRS = fm fs
 
 COMMON_H = \
 	$(top_srcdir)/include/sys/arc.h \
 	$(top_srcdir)/include/sys/arc_impl.h \
 	$(top_srcdir)/include/sys/avl.h \
 	$(top_srcdir)/include/sys/avl_impl.h \
 	$(top_srcdir)/include/sys/blkptr.h \
 	$(top_srcdir)/include/sys/bplist.h \
 	$(top_srcdir)/include/sys/bpobj.h \
 	$(top_srcdir)/include/sys/bptree.h \
 	$(top_srcdir)/include/sys/dbuf.h \
 	$(top_srcdir)/include/sys/ddt.h \
 	$(top_srcdir)/include/sys/dmu.h \
 	$(top_srcdir)/include/sys/dmu_impl.h \
 	$(top_srcdir)/include/sys/dmu_objset.h \
 	$(top_srcdir)/include/sys/dmu_send.h \
 	$(top_srcdir)/include/sys/dmu_traverse.h \
 	$(top_srcdir)/include/sys/dmu_tx.h \
 	$(top_srcdir)/include/sys/dmu_zfetch.h \
 	$(top_srcdir)/include/sys/dnode.h \
 	$(top_srcdir)/include/sys/dsl_bookmark.h \
 	$(top_srcdir)/include/sys/dsl_dataset.h \
 	$(top_srcdir)/include/sys/dsl_deadlist.h \
 	$(top_srcdir)/include/sys/dsl_deleg.h \
 	$(top_srcdir)/include/sys/dsl_destroy.h \
 	$(top_srcdir)/include/sys/dsl_dir.h \
 	$(top_srcdir)/include/sys/dsl_pool.h \
 	$(top_srcdir)/include/sys/dsl_prop.h \
 	$(top_srcdir)/include/sys/dsl_scan.h \
 	$(top_srcdir)/include/sys/dsl_synctask.h \
 	$(top_srcdir)/include/sys/dsl_userhold.h \
 	$(top_srcdir)/include/sys/efi_partition.h \
 	$(top_srcdir)/include/sys/metaslab.h \
 	$(top_srcdir)/include/sys/metaslab_impl.h \
+	$(top_srcdir)/include/sys/multilist.h \
 	$(top_srcdir)/include/sys/nvpair.h \
 	$(top_srcdir)/include/sys/nvpair_impl.h \
 	$(top_srcdir)/include/sys/range_tree.h \
 	$(top_srcdir)/include/sys/refcount.h \
 	$(top_srcdir)/include/sys/rrwlock.h \
 	$(top_srcdir)/include/sys/sa.h \
 	$(top_srcdir)/include/sys/sa_impl.h \
 	$(top_srcdir)/include/sys/sdt.h \
 	$(top_srcdir)/include/sys/spa_boot.h \
 	$(top_srcdir)/include/sys/space_map.h \
 	$(top_srcdir)/include/sys/space_reftree.h \
 	$(top_srcdir)/include/sys/spa.h \
 	$(top_srcdir)/include/sys/spa_impl.h \
 	$(top_srcdir)/include/sys/trace.h \
 	$(top_srcdir)/include/sys/trace_acl.h \
 	$(top_srcdir)/include/sys/trace_arc.h \
 	$(top_srcdir)/include/sys/trace_dbgmsg.h \
 	$(top_srcdir)/include/sys/trace_dbuf.h \
 	$(top_srcdir)/include/sys/trace_dmu.h \
 	$(top_srcdir)/include/sys/trace_dnode.h \
+	$(top_srcdir)/include/sys/trace_multilist.h \
 	$(top_srcdir)/include/sys/trace_txg.h \
 	$(top_srcdir)/include/sys/trace_zil.h \
 	$(top_srcdir)/include/sys/trace_zrlock.h \
 	$(top_srcdir)/include/sys/txg.h \
 	$(top_srcdir)/include/sys/txg_impl.h \
 	$(top_srcdir)/include/sys/u8_textprep_data.h \
 	$(top_srcdir)/include/sys/u8_textprep.h \
 	$(top_srcdir)/include/sys/uberblock.h \
 	$(top_srcdir)/include/sys/uberblock_impl.h \
 	$(top_srcdir)/include/sys/uio_impl.h \
 	$(top_srcdir)/include/sys/unique.h \
 	$(top_srcdir)/include/sys/uuid.h \
 	$(top_srcdir)/include/sys/vdev_disk.h \
 	$(top_srcdir)/include/sys/vdev_file.h \
 	$(top_srcdir)/include/sys/vdev.h \
 	$(top_srcdir)/include/sys/vdev_impl.h \
 	$(top_srcdir)/include/sys/xvattr.h \
 	$(top_srcdir)/include/sys/zap.h \
 	$(top_srcdir)/include/sys/zap_impl.h \
 	$(top_srcdir)/include/sys/zap_leaf.h \
 	$(top_srcdir)/include/sys/zfeature.h \
 	$(top_srcdir)/include/sys/zfs_acl.h \
 	$(top_srcdir)/include/sys/zfs_context.h \
 	$(top_srcdir)/include/sys/zfs_ctldir.h \
 	$(top_srcdir)/include/sys/zfs_debug.h \
 	$(top_srcdir)/include/sys/zfs_delay.h \
 	$(top_srcdir)/include/sys/zfs_dir.h \
 	$(top_srcdir)/include/sys/zfs_fuid.h \
 	$(top_srcdir)/include/sys/zfs_rlock.h \
 	$(top_srcdir)/include/sys/zfs_sa.h \
 	$(top_srcdir)/include/sys/zfs_stat.h \
 	$(top_srcdir)/include/sys/zfs_vfsops.h \
 	$(top_srcdir)/include/sys/zfs_vnops.h \
 	$(top_srcdir)/include/sys/zfs_znode.h \
 	$(top_srcdir)/include/sys/zil.h \
 	$(top_srcdir)/include/sys/zil_impl.h \
 	$(top_srcdir)/include/sys/zio_checksum.h \
 	$(top_srcdir)/include/sys/zio_compress.h \
 	$(top_srcdir)/include/sys/zio.h \
 	$(top_srcdir)/include/sys/zio_impl.h \
 	$(top_srcdir)/include/sys/zrlock.h
 
 KERNEL_H = \
 	$(top_srcdir)/include/sys/zfs_ioctl.h \
 	$(top_srcdir)/include/sys/zfs_onexit.h \
 	${top_srcdir}/include/sys/zpl.h \
 	$(top_srcdir)/include/sys/zvol.h
 
 USER_H =
 
 EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H)
 
 if CONFIG_USER
 libzfsdir = $(includedir)/libzfs/sys
 libzfs_HEADERS = $(COMMON_H) $(USER_H)
 endif
 
 if CONFIG_KERNEL
 kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys
 kernel_HEADERS = $(COMMON_H) $(KERNEL_H)
 endif
diff --git a/include/sys/arc.h b/include/sys/arc.h
index 903f0b413167..38f9f27fea61 100644
--- a/include/sys/arc.h
+++ b/include/sys/arc.h
@@ -1,230 +1,235 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 #ifndef	_SYS_ARC_H
 #define	_SYS_ARC_H
 
 #include <sys/zfs_context.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 #include <sys/zio.h>
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/refcount.h>
 
+/*
+ * Used by arc_flush() to inform arc_evict_state() that it should evict
+ * all available buffers from the arc state being passed in.
+ */
+#define	ARC_EVICT_ALL	(-1ULL)
+
 typedef struct arc_buf_hdr arc_buf_hdr_t;
 typedef struct arc_buf arc_buf_t;
 typedef struct arc_prune arc_prune_t;
 typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
 typedef void arc_prune_func_t(int64_t bytes, void *private);
 typedef int arc_evict_func_t(void *private);
 
 /* generic arc_done_func_t's which you can use */
 arc_done_func_t arc_bcopy_func;
 arc_done_func_t arc_getbuf_func;
 
 /* generic arc_prune_func_t wrapper for callbacks */
 struct arc_prune {
 	arc_prune_func_t	*p_pfunc;
 	void			*p_private;
 	list_node_t		p_node;
 	refcount_t		p_refcnt;
 };
 
 typedef enum arc_flags
 {
 	/*
 	 * Public flags that can be passed into the ARC by external consumers.
 	 */
 	ARC_FLAG_NONE			= 1 << 0,	/* No flags set */
 	ARC_FLAG_WAIT			= 1 << 1,	/* perform sync I/O */
 	ARC_FLAG_NOWAIT			= 1 << 2,	/* perform async I/O */
 	ARC_FLAG_PREFETCH		= 1 << 3,	/* I/O is a prefetch */
 	ARC_FLAG_CACHED			= 1 << 4,	/* I/O was in cache */
 	ARC_FLAG_L2CACHE		= 1 << 5,	/* cache in L2ARC */
 	ARC_FLAG_L2COMPRESS		= 1 << 6,	/* compress in L2ARC */
 
 	/*
 	 * Private ARC flags.  These flags are private ARC only flags that
 	 * will show up in b_flags in the arc_hdr_buf_t. These flags should
 	 * only be set by ARC code.
 	 */
 	ARC_FLAG_IN_HASH_TABLE		= 1 << 7,	/* buffer is hashed */
 	ARC_FLAG_IO_IN_PROGRESS		= 1 << 8,	/* I/O in progress */
 	ARC_FLAG_IO_ERROR		= 1 << 9,	/* I/O failed for buf */
 	ARC_FLAG_FREED_IN_READ		= 1 << 10,	/* freed during read */
 	ARC_FLAG_BUF_AVAILABLE		= 1 << 11,	/* block not in use */
 	ARC_FLAG_INDIRECT		= 1 << 12,	/* indirect block */
 	ARC_FLAG_L2_WRITING		= 1 << 13,	/* write in progress */
 	ARC_FLAG_L2_EVICTED		= 1 << 14,	/* evicted during I/O */
 	ARC_FLAG_L2_WRITE_HEAD		= 1 << 15,	/* head of write list */
 	/* indicates that the buffer contains metadata (otherwise, data) */
 	ARC_FLAG_BUFC_METADATA		= 1 << 16,
 
 	/* Flags specifying whether optional hdr struct fields are defined */
 	ARC_FLAG_HAS_L1HDR		= 1 << 17,
 	ARC_FLAG_HAS_L2HDR		= 1 << 18,
 
 	/*
 	 * The arc buffer's compression mode is stored in the top 7 bits of the
 	 * flags field, so these dummy flags are included so that MDB can
 	 * interpret the enum properly.
 	 */
 	ARC_FLAG_COMPRESS_0		= 1 << 24,
 	ARC_FLAG_COMPRESS_1		= 1 << 25,
 	ARC_FLAG_COMPRESS_2		= 1 << 26,
 	ARC_FLAG_COMPRESS_3		= 1 << 27,
 	ARC_FLAG_COMPRESS_4		= 1 << 28,
 	ARC_FLAG_COMPRESS_5		= 1 << 29,
 	ARC_FLAG_COMPRESS_6		= 1 << 30
 
 } arc_flags_t;
 
 struct arc_buf {
 	arc_buf_hdr_t		*b_hdr;
 	arc_buf_t		*b_next;
 	kmutex_t		b_evict_lock;
 	void			*b_data;
 	arc_evict_func_t	*b_efunc;
 	void			*b_private;
 };
 
 typedef enum arc_buf_contents {
 	ARC_BUFC_DATA,				/* buffer contains data */
 	ARC_BUFC_METADATA,			/* buffer contains metadata */
 	ARC_BUFC_NUMTYPES
 } arc_buf_contents_t;
 
 /*
  * The following breakdows of arc_size exist for kstat only.
  */
 typedef enum arc_space_type {
 	ARC_SPACE_DATA,
 	ARC_SPACE_META,
 	ARC_SPACE_HDRS,
 	ARC_SPACE_L2HDRS,
 	ARC_SPACE_OTHER,
 	ARC_SPACE_NUMTYPES
 } arc_space_type_t;
 
 typedef enum arc_state_type {
 	ARC_STATE_ANON,
 	ARC_STATE_MRU,
 	ARC_STATE_MRU_GHOST,
 	ARC_STATE_MFU,
 	ARC_STATE_MFU_GHOST,
 	ARC_STATE_L2C_ONLY,
 	ARC_STATE_NUMTYPES
 } arc_state_type_t;
 
 typedef struct arc_buf_info {
 	arc_state_type_t	abi_state_type;
 	arc_buf_contents_t	abi_state_contents;
-	uint64_t		abi_state_index;
 	uint32_t		abi_flags;
 	uint32_t		abi_datacnt;
 	uint64_t		abi_size;
 	uint64_t		abi_spa;
 	uint64_t		abi_access;
 	uint32_t		abi_mru_hits;
 	uint32_t		abi_mru_ghost_hits;
 	uint32_t		abi_mfu_hits;
 	uint32_t		abi_mfu_ghost_hits;
 	uint32_t		abi_l2arc_hits;
 	uint32_t		abi_holds;
 	uint64_t		abi_l2arc_dattr;
 	uint64_t		abi_l2arc_asize;
 	enum zio_compress	abi_l2arc_compress;
 } arc_buf_info_t;
 
 void arc_space_consume(uint64_t space, arc_space_type_t type);
 void arc_space_return(uint64_t space, arc_space_type_t type);
 arc_buf_t *arc_buf_alloc(spa_t *spa, uint64_t size, void *tag,
     arc_buf_contents_t type);
 arc_buf_t *arc_loan_buf(spa_t *spa, uint64_t size);
 void arc_return_buf(arc_buf_t *buf, void *tag);
 void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
 void arc_buf_add_ref(arc_buf_t *buf, void *tag);
 boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag);
 void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index);
 uint64_t arc_buf_size(arc_buf_t *buf);
 void arc_release(arc_buf_t *buf, void *tag);
 int arc_released(arc_buf_t *buf);
 void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused);
 void arc_buf_freeze(arc_buf_t *buf);
 void arc_buf_thaw(arc_buf_t *buf);
 boolean_t arc_buf_eviction_needed(arc_buf_t *buf);
 #ifdef ZFS_DEBUG
 int arc_referenced(arc_buf_t *buf);
 #endif
 
 int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_done_func_t *done, void *private, zio_priority_t priority, int flags,
     arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
 zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
     arc_done_func_t *done, void *private, zio_priority_t priority,
     int zio_flags, const zbookmark_phys_t *zb);
 
 arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *private);
 void arc_remove_prune_callback(arc_prune_t *p);
 void arc_freed(spa_t *spa, const blkptr_t *bp);
 
 void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
 boolean_t arc_clear_callback(arc_buf_t *buf);
 
-void arc_flush(spa_t *spa);
+void arc_flush(spa_t *spa, boolean_t retry);
 void arc_tempreserve_clear(uint64_t reserve);
 int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
 
 void arc_init(void);
 void arc_fini(void);
 
 /*
  * Level 2 ARC
  */
 
 void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
 void l2arc_remove_vdev(vdev_t *vd);
 boolean_t l2arc_vdev_present(vdev_t *vd);
 void l2arc_init(void);
 void l2arc_fini(void);
 void l2arc_start(void);
 void l2arc_stop(void);
 
 #ifndef _KERNEL
 extern boolean_t arc_watch;
 #endif
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_ARC_H */
diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index 556cc258330d..54f5e9f4094a 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -1,215 +1,227 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #ifndef _SYS_ARC_IMPL_H
 #define	_SYS_ARC_IMPL_H
 
 #include <sys/arc.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /*
  * Note that buffers can be in one of 6 states:
  *	ARC_anon	- anonymous (discussed below)
  *	ARC_mru		- recently used, currently cached
  *	ARC_mru_ghost	- recentely used, no longer in cache
  *	ARC_mfu		- frequently used, currently cached
  *	ARC_mfu_ghost	- frequently used, no longer in cache
  *	ARC_l2c_only	- exists in L2ARC but not other states
  * When there are no active references to the buffer, they are
  * are linked onto a list in one of these arc states.  These are
  * the only buffers that can be evicted or deleted.  Within each
  * state there are multiple lists, one for meta-data and one for
  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
  * etc.) is tracked separately so that it can be managed more
  * explicitly: favored over data, limited explicitly.
  *
  * Anonymous buffers are buffers that are not associated with
  * a DVA.  These are buffers that hold dirty block copies
  * before they are written to stable storage.  By definition,
  * they are "ref'd" and are considered part of arc_mru
  * that cannot be freed.  Generally, they will aquire a DVA
  * as they are written and migrate onto the arc_mru list.
  *
  * The ARC_l2c_only state is for buffers that are in the second
  * level ARC but no longer in any of the ARC_m* lists.  The second
  * level ARC itself may also contain buffers that are in any of
  * the ARC_m* states - meaning that a buffer can exist in two
  * places.  The reason for the ARC_l2c_only state is to keep the
  * buffer header in the hash table, so that reads that hit the
  * second level ARC benefit from these fast lookups.
  */
 
 typedef struct arc_state {
-	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
-	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
-	uint64_t arcs_size;	/* total amount of data in this state */
-	kmutex_t arcs_mtx;
+	/*
+	 * list of evictable buffers
+	 */
+	multilist_t arcs_list[ARC_BUFC_NUMTYPES];
+	/*
+	 * total amount of evictable data in this state
+	 */
+	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
+	/*
+	 * total amount of data in this state; this includes: evictable,
+	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+	 */
+	uint64_t arcs_size;
+	/*
+	 * supports the "dbufs" kstat
+	 */
 	arc_state_type_t arcs_state;
 } arc_state_t;
 
 typedef struct arc_callback arc_callback_t;
 
 struct arc_callback {
 	void			*acb_private;
 	arc_done_func_t		*acb_done;
 	arc_buf_t		*acb_buf;
 	zio_t			*acb_zio_dummy;
 	arc_callback_t		*acb_next;
 };
 
 typedef struct arc_write_callback arc_write_callback_t;
 
 struct arc_write_callback {
 	void		*awcb_private;
 	arc_done_func_t	*awcb_ready;
 	arc_done_func_t	*awcb_physdone;
 	arc_done_func_t	*awcb_done;
 	arc_buf_t	*awcb_buf;
 };
 
 /*
  * ARC buffers are separated into multiple structs as a memory saving measure:
  *   - Common fields struct, always defined, and embedded within it:
  *       - L2-only fields, always allocated but undefined when not in L2ARC
  *       - L1-only fields, only allocated when in L1ARC
  *
  *           Buffer in L1                     Buffer only in L2
  *    +------------------------+          +------------------------+
  *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
  *    |                        |          |                        |
  *    |                        |          |                        |
  *    |                        |          |                        |
  *    +------------------------+          +------------------------+
  *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
  *    | (undefined if L1-only) |          |                        |
  *    +------------------------+          +------------------------+
  *    | l1arc_buf_hdr_t        |
  *    |                        |
  *    |                        |
  *    |                        |
  *    |                        |
  *    +------------------------+
  *
  * Because it's possible for the L2ARC to become extremely large, we can wind
  * up eating a lot of memory in L2ARC buffer headers, so the size of a header
  * is minimized by only allocating the fields necessary for an L1-cached buffer
  * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
  * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
  * words in pointers. arc_hdr_realloc() is used to switch a header between
  * these two allocation states.
  */
 typedef struct l1arc_buf_hdr {
 	kmutex_t		b_freeze_lock;
 
 	arc_buf_t		*b_buf;
 	uint32_t		b_datacnt;
 	/* for waiting on writes to complete */
 	kcondvar_t		b_cv;
 
 
 	/* protected by arc state mutex */
 	arc_state_t		*b_state;
-	list_node_t		b_arc_node;
+	multilist_node_t	b_arc_node;
 
 	/* updated atomically */
 	clock_t			b_arc_access;
 	uint32_t		b_mru_hits;
 	uint32_t		b_mru_ghost_hits;
 	uint32_t		b_mfu_hits;
 	uint32_t		b_mfu_ghost_hits;
 	uint32_t		b_l2_hits;
 
 	/* self protecting */
 	refcount_t		b_refcnt;
 
 	arc_callback_t		*b_acb;
 	/* temporary buffer holder for in-flight compressed data */
 	void			*b_tmp_cdata;
 } l1arc_buf_hdr_t;
 
 typedef struct l2arc_dev {
 	vdev_t			*l2ad_vdev;	/* vdev */
 	spa_t			*l2ad_spa;	/* spa */
 	uint64_t		l2ad_hand;	/* next write location */
 	uint64_t		l2ad_start;	/* first addr on device */
 	uint64_t		l2ad_end;	/* last addr on device */
 	uint64_t		l2ad_evict;	/* last addr eviction reached */
 	boolean_t		l2ad_first;	/* first sweep through */
 	boolean_t		l2ad_writing;	/* currently writing */
 	kmutex_t		l2ad_mtx;	/* lock for buffer list */
 	list_t			l2ad_buflist;	/* buffer list */
 	list_node_t		l2ad_node;	/* device list node */
 } l2arc_dev_t;
 
 typedef struct l2arc_buf_hdr {
 	/* protected by arc_buf_hdr mutex */
 	l2arc_dev_t		*b_dev;		/* L2ARC device */
 	uint64_t		b_daddr;	/* disk address, offset byte */
 	/* real alloc'd buffer size depending on b_compress applied */
 	uint32_t		b_hits;
 	int32_t			b_asize;
 
 	list_node_t		b_l2node;
 } l2arc_buf_hdr_t;
 
 typedef struct l2arc_write_callback {
 	l2arc_dev_t	*l2wcb_dev;		/* device info */
 	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
 } l2arc_write_callback_t;
 
 struct arc_buf_hdr {
 	/* protected by hash lock */
 	dva_t			b_dva;
 	uint64_t		b_birth;
 	/*
 	 * Even though this checksum is only set/verified when a buffer is in
 	 * the L1 cache, it needs to be in the set of common fields because it
 	 * must be preserved from the time before a buffer is written out to
 	 * L2ARC until after it is read back in.
 	 */
 	zio_cksum_t		*b_freeze_cksum;
 
 	arc_buf_hdr_t		*b_hash_next;
 	arc_flags_t		b_flags;
 
 	/* immutable */
 	int32_t			b_size;
 	uint64_t		b_spa;
 
 	/* L2ARC fields. Undefined when not in L2ARC. */
 	l2arc_buf_hdr_t		b_l2hdr;
 	/* L1ARC fields. Undefined when in l2arc_only state */
 	l1arc_buf_hdr_t		b_l1hdr;
 };
 #ifdef __cplusplus
 }
 #endif
 
 #endif /* _SYS_ARC_IMPL_H */
diff --git a/include/sys/multilist.h b/include/sys/multilist.h
new file mode 100644
index 000000000000..98d707dd71ef
--- /dev/null
+++ b/include/sys/multilist.h
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef	_SYS_MULTILIST_H
+#define	_SYS_MULTILIST_H
+
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef list_node_t multilist_node_t;
+typedef struct multilist multilist_t;
+typedef struct multilist_sublist multilist_sublist_t;
+typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *);
+
+struct multilist_sublist {
+	/*
+	 * The mutex used internally to implement thread safe insertions
+	 * and removals to this individual sublist. It can also be locked
+	 * by a consumer using multilist_sublist_{lock,unlock}, which is
+	 * useful if a consumer needs to traverse the list in a thread
+	 * safe manner.
+	 */
+	kmutex_t	mls_lock;
+	/*
+	 * The actual list object containing all objects in this sublist.
+	 */
+	list_t		mls_list;
+	/*
+	 * Pad to cache line, in an effort to try and prevent cache line
+	 * contention.
+	 */
+} ____cacheline_aligned;
+
+struct multilist {
+	/*
+	 * This is used to get to the multilist_node_t structure given
+	 * the void *object contained on the list.
+	 */
+	size_t				ml_offset;
+	/*
+	 * The number of sublists used internally by this multilist.
+	 */
+	uint64_t			ml_num_sublists;
+	/*
+	 * The array of sublists (a single pointer, not an array of pointers).
+	 */
+	multilist_sublist_t		*ml_sublists;
+	/*
+	 * Pointer to function which determines the sublist to use
+	 * when inserting and removing objects from this multilist.
+	 * Please see the comment above multilist_create for details.
+	 */
+	multilist_sublist_index_func_t	*ml_index_func;
+};
+
+void multilist_destroy(multilist_t *);
+void multilist_create(multilist_t *, size_t, size_t, unsigned int,
+    multilist_sublist_index_func_t *);
+
+void multilist_insert(multilist_t *, void *);
+void multilist_remove(multilist_t *, void *);
+int  multilist_is_empty(multilist_t *);
+
+unsigned int multilist_get_num_sublists(multilist_t *);
+unsigned int multilist_get_random_index(multilist_t *);
+
+multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
+void multilist_sublist_unlock(multilist_sublist_t *);
+
+void multilist_sublist_insert_head(multilist_sublist_t *, void *);
+void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
+void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
+void multilist_sublist_remove(multilist_sublist_t *, void *);
+
+void *multilist_sublist_head(multilist_sublist_t *);
+void *multilist_sublist_tail(multilist_sublist_t *);
+void *multilist_sublist_next(multilist_sublist_t *, void *);
+void *multilist_sublist_prev(multilist_sublist_t *, void *);
+
+void multilist_link_init(multilist_node_t *);
+int  multilist_link_active(multilist_node_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_MULTILIST_H */
diff --git a/include/sys/trace_multilist.h b/include/sys/trace_multilist.h
new file mode 100644
index 000000000000..11d2f2701ac4
--- /dev/null
+++ b/include/sys/trace_multilist.h
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS)
+
+#undef TRACE_SYSTEM
+#define	TRACE_SYSTEM zfs
+
+#if !defined(_TRACE_MULTILIST_H) || defined(TRACE_HEADER_MULTI_READ)
+#define	_TRACE_MULTILIST_H
+
+#include <linux/tracepoint.h>
+#include <sys/types.h>
+
+/*
+ * Generic support for three argument tracepoints of the form:
+ *
+ * DTRACE_PROBE3(...,
+ *     multilist_t *, ...,
+ *     unsigned int, ...,
+ *     void *, ...);
+ */
+
+DECLARE_EVENT_CLASS(zfs_multilist_insert_remove_class,
+	TP_PROTO(multilist_t *ml, unsigned int sublist_idx, void *obj),
+	TP_ARGS(ml, sublist_idx, obj),
+	TP_STRUCT__entry(
+	    __field(size_t,		ml_offset)
+	    __field(uint64_t,		ml_num_sublists)
+
+	    __field(unsigned int,	sublist_idx)
+	),
+	TP_fast_assign(
+	    __entry->ml_offset		= ml->ml_offset;
+	    __entry->ml_num_sublists	= ml->ml_num_sublists;
+
+	    __entry->sublist_idx	= sublist_idx;
+	),
+	TP_printk("ml { offset %zu numsublists %llu sublistidx %u } ",
+	    __entry->ml_offset, __entry->ml_num_sublists, __entry->sublist_idx)
+);
+
+#define	DEFINE_MULTILIST_INSERT_REMOVE_EVENT(name) \
+DEFINE_EVENT(zfs_multilist_insert_remove_class, name, \
+	TP_PROTO(multilist_t *ml, unsigned int sublist_idx, void *obj), \
+	TP_ARGS(ml, sublist_idx, obj))
+DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__insert);
+DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__remove);
+
+#endif /* _TRACE_MULTILIST_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define	TRACE_INCLUDE_PATH sys
+#define	TRACE_INCLUDE_FILE trace_multilist
+#include <trace/define_trace.h>
+
+#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 3dc54f1d7d90..761b1d57a116 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -1,743 +1,744 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_ZFS_CONTEXT_H
 #define	_SYS_ZFS_CONTEXT_H
 
 #ifdef __KERNEL__
 
 #include <sys/note.h>
 #include <sys/types.h>
 #include <sys/t_lock.h>
 #include <sys/atomic.h>
 #include <sys/sysmacros.h>
 #include <sys/bitmap.h>
 #include <sys/cmn_err.h>
 #include <sys/kmem.h>
 #include <sys/kmem_cache.h>
 #include <sys/vmem.h>
 #include <sys/taskq.h>
 #include <sys/buf.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/cpuvar.h>
 #include <sys/kobj.h>
 #include <sys/conf.h>
 #include <sys/disp.h>
 #include <sys/debug.h>
 #include <sys/random.h>
 #include <sys/byteorder.h>
 #include <sys/systm.h>
 #include <sys/list.h>
 #include <sys/uio_impl.h>
 #include <sys/dirent.h>
 #include <sys/time.h>
 #include <vm/seg_kmem.h>
 #include <sys/zone.h>
 #include <sys/sdt.h>
 #include <sys/zfs_debug.h>
 #include <sys/zfs_delay.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/sunddi.h>
 #include <sys/ctype.h>
 #include <sys/disp.h>
 #include <sys/trace.h>
 #include <linux/dcache_compat.h>
 #include <linux/utsname_compat.h>
 
 #else /* _KERNEL */
 
 #define	_SYS_MUTEX_H
 #define	_SYS_RWLOCK_H
 #define	_SYS_CONDVAR_H
 #define	_SYS_SYSTM_H
 #define	_SYS_T_LOCK_H
 #define	_SYS_VNODE_H
 #define	_SYS_VFS_H
 #define	_SYS_SUNDDI_H
 #define	_SYS_CALLB_H
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
 #include <stdarg.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <errno.h>
 #include <string.h>
 #include <strings.h>
 #include <pthread.h>
 #include <synch.h>
 #include <assert.h>
 #include <alloca.h>
 #include <umem.h>
 #include <limits.h>
 #include <atomic.h>
 #include <dirent.h>
 #include <time.h>
 #include <ctype.h>
 #include <signal.h>
 #include <sys/mman.h>
 #include <sys/note.h>
 #include <sys/types.h>
 #include <sys/cred.h>
 #include <sys/sysmacros.h>
 #include <sys/bitmap.h>
 #include <sys/resource.h>
 #include <sys/byteorder.h>
 #include <sys/list.h>
 #include <sys/uio.h>
 #include <sys/zfs_debug.h>
 #include <sys/sdt.h>
 #include <sys/kstat.h>
 #include <sys/u8_textprep.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/sunddi.h>
 #include <sys/debug.h>
 #include <sys/utsname.h>
 
 /*
  * Stack
  */
 
 #define	noinline	__attribute__((noinline))
 
 /*
  * Debugging
  */
 
 /*
  * Note that we are not using the debugging levels.
  */
 
 #define	CE_CONT		0	/* continuation		*/
 #define	CE_NOTE		1	/* notice		*/
 #define	CE_WARN		2	/* warning		*/
 #define	CE_PANIC	3	/* panic		*/
 #define	CE_IGNORE	4	/* print nothing	*/
 
 /*
  * ZFS debugging
  */
 
 extern void dprintf_setup(int *argc, char **argv);
 
 extern void cmn_err(int, const char *, ...);
 extern void vcmn_err(int, const char *, va_list);
 extern void panic(const char *, ...);
 extern void vpanic(const char *, va_list);
 
 #define	fm_panic	panic
 
 extern int aok;
 
 /*
  * DTrace SDT probes have different signatures in userland than they do in
  * kernel.  If they're being used in kernel code, re-define them out of
  * existence for their counterparts in libzpool.
  */
 
 #ifdef DTRACE_PROBE
 #undef	DTRACE_PROBE
 #endif	/* DTRACE_PROBE */
 #define	DTRACE_PROBE(a) \
 	ZFS_PROBE0(#a)
 
 #ifdef DTRACE_PROBE1
 #undef	DTRACE_PROBE1
 #endif	/* DTRACE_PROBE1 */
 #define	DTRACE_PROBE1(a, b, c) \
 	ZFS_PROBE1(#a, (unsigned long)c)
 
 #ifdef DTRACE_PROBE2
 #undef	DTRACE_PROBE2
 #endif	/* DTRACE_PROBE2 */
 #define	DTRACE_PROBE2(a, b, c, d, e) \
 	ZFS_PROBE2(#a, (unsigned long)c, (unsigned long)e)
 
 #ifdef DTRACE_PROBE3
 #undef	DTRACE_PROBE3
 #endif	/* DTRACE_PROBE3 */
 #define	DTRACE_PROBE3(a, b, c, d, e, f, g) \
 	ZFS_PROBE3(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g)
 
 #ifdef DTRACE_PROBE4
 #undef	DTRACE_PROBE4
 #endif	/* DTRACE_PROBE4 */
 #define	DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) \
 	ZFS_PROBE4(#a, (unsigned long)c, (unsigned long)e, (unsigned long)g, \
 	(unsigned long)i)
 
 /*
  * We use the comma operator so that this macro can be used without much
  * additional code.  For example, "return (EINVAL);" becomes
  * "return (SET_ERROR(EINVAL));".  Note that the argument will be evaluated
  * twice, so it should not have side effects (e.g. something like:
  * "return (SET_ERROR(log_error(EINVAL, info)));" would log the error twice).
  */
 #define	SET_ERROR(err) (ZFS_SET_ERROR(err), err)
 
 /*
  * Threads.  TS_STACK_MIN is dictated by the minimum allowed pthread stack
  * size.  While TS_STACK_MAX is somewhat arbitrary, it was selected to be
  * large enough for the expected stack depth while small enough to avoid
  * exhausting address space with high thread counts.
  */
 #define	TS_MAGIC		0x72f158ab4261e538ull
 #define	TS_RUN			0x00000002
 #define	TS_STACK_MIN		PTHREAD_STACK_MIN
 #define	TS_STACK_MAX		(256 * 1024)
 
 /* in libzpool, p0 exists only to have its address taken */
 typedef struct proc {
 	uintptr_t	this_is_never_used_dont_dereference_it;
 } proc_t;
 
 extern struct proc p0;
 #define	curproc		(&p0)
 
 typedef void (*thread_func_t)(void *);
 typedef void (*thread_func_arg_t)(void *);
 typedef pthread_t kt_did_t;
 
 #define	kpreempt(x)	((void)0)
 
 typedef struct kthread {
 	kt_did_t	t_tid;
 	thread_func_t	t_func;
 	void *		t_arg;
 } kthread_t;
 
 #define	curthread			zk_thread_current()
 #define	getcomm()			"unknown"
 #define	thread_exit			zk_thread_exit
 #define	thread_create(stk, stksize, func, arg, len, pp, state, pri)	\
 	zk_thread_create(stk, stksize, (thread_func_t)func, arg,	\
 	    len, NULL, state, pri, PTHREAD_CREATE_DETACHED)
 #define	thread_join(t)			zk_thread_join(t)
 #define	newproc(f, a, cid, pri, ctp, pid)	(ENOSYS)
 
 extern kthread_t *zk_thread_current(void);
 extern void zk_thread_exit(void);
 extern kthread_t *zk_thread_create(caddr_t stk, size_t  stksize,
 	thread_func_t func, void *arg, size_t len,
 	proc_t *pp, int state, pri_t pri, int detachstate);
 extern void zk_thread_join(kt_did_t tid);
 
 #define	kpreempt_disable()	((void)0)
 #define	kpreempt_enable()	((void)0)
 
 #define	PS_NONE		-1
 
 #define	issig(why)	(FALSE)
 #define	ISSIG(thr, why)	(FALSE)
 
 /*
  * Mutexes
  */
 #define	MTX_MAGIC	0x9522f51362a6e326ull
 #define	MTX_INIT	((void *)NULL)
 #define	MTX_DEST	((void *)-1UL)
 
 typedef struct kmutex {
 	void		*m_owner;
 	uint64_t	m_magic;
 	pthread_mutex_t	m_lock;
 } kmutex_t;
 
 #define	MUTEX_DEFAULT	0
 #define	MUTEX_HELD(m)	((m)->m_owner == curthread)
 #define	MUTEX_NOT_HELD(m) (!MUTEX_HELD(m))
 
 extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie);
 extern void mutex_destroy(kmutex_t *mp);
 extern void mutex_enter(kmutex_t *mp);
 extern void mutex_exit(kmutex_t *mp);
 extern int mutex_tryenter(kmutex_t *mp);
 extern void *mutex_owner(kmutex_t *mp);
 extern int mutex_held(kmutex_t *mp);
 
 /*
  * RW locks
  */
 #define	RW_MAGIC	0x4d31fb123648e78aull
 #define	RW_INIT		((void *)NULL)
 #define	RW_DEST		((void *)-1UL)
 
 typedef struct krwlock {
 	void			*rw_owner;
 	void			*rw_wr_owner;
 	uint64_t		rw_magic;
 	pthread_rwlock_t	rw_lock;
 	uint_t			rw_readers;
 } krwlock_t;
 
 typedef int krw_t;
 
 #define	RW_READER	0
 #define	RW_WRITER	1
 #define	RW_DEFAULT	RW_READER
 
 #define	RW_READ_HELD(x)		((x)->rw_readers > 0)
 #define	RW_WRITE_HELD(x)	((x)->rw_wr_owner == curthread)
 #define	RW_LOCK_HELD(x)		(RW_READ_HELD(x) || RW_WRITE_HELD(x))
 
 #undef RW_LOCK_HELD
 #define	RW_LOCK_HELD(x)		(RW_READ_HELD(x) || RW_WRITE_HELD(x))
 
 #undef RW_LOCK_HELD
 #define	RW_LOCK_HELD(x)		(RW_READ_HELD(x) || RW_WRITE_HELD(x))
 
 extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg);
 extern void rw_destroy(krwlock_t *rwlp);
 extern void rw_enter(krwlock_t *rwlp, krw_t rw);
 extern int rw_tryenter(krwlock_t *rwlp, krw_t rw);
 extern int rw_tryupgrade(krwlock_t *rwlp);
 extern void rw_exit(krwlock_t *rwlp);
 #define	rw_downgrade(rwlp) do { } while (0)
 
 extern uid_t crgetuid(cred_t *cr);
 extern uid_t crgetruid(cred_t *cr);
 extern gid_t crgetgid(cred_t *cr);
 extern int crgetngroups(cred_t *cr);
 extern gid_t *crgetgroups(cred_t *cr);
 
 /*
  * Condition variables
  *
  * Userland representation of a condition variable, built around a
  * pthread_cond_t.
  */
 #define	CV_MAGIC	0xd31ea9a83b1b30c4ull
 
 typedef struct kcondvar {
 	/* NOTE(review): presumably holds CV_MAGIC while valid -- confirm. */
 	uint64_t		cv_magic;
 	pthread_cond_t		cv;
 } kcondvar_t;
 
 #define	CV_DEFAULT	0
 
 extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg);
 extern void cv_destroy(kcondvar_t *cv);
 extern void cv_wait(kcondvar_t *cv, kmutex_t *mp);
 extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime);
 extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
     hrtime_t res, int flag);
 extern void cv_signal(kcondvar_t *cv);
 extern void cv_broadcast(kcondvar_t *cv);
 /* The interruptible and I/O variants are plain aliases of the base waits. */
 #define	cv_timedwait_interruptible(cv, mp, at)	cv_timedwait(cv, mp, at)
 #define	cv_wait_interruptible(cv, mp)		cv_wait(cv, mp)
 #define	cv_wait_io(cv, mp)			cv_wait(cv, mp)
 
 /*
  * Thread-specific data
  *
  * Thin wrappers over the pthread key API.  tsd_create(), tsd_set() and
  * tsd_get() forward directly to pthread_key_create(),
  * pthread_setspecific() and pthread_getspecific(); tsd_destroy() expands
  * to nothing, so the key is never deleted.  This section is defined
  * exactly once; the second verbatim copy was redundant and was dropped.
  */
 #define	tsd_get(k) pthread_getspecific(k)
 #define	tsd_set(k, v) pthread_setspecific(k, v)
 #define	tsd_create(kp, d) pthread_key_create(kp, d)
 #define	tsd_destroy(kp) /* nothing */
 
 /*
  * kstat creation, installation and deletion
  *
  * Declarations only; the definitions are not part of this header.
  */
 extern kstat_t *kstat_create(const char *, int,
     const char *, const char *, uchar_t, ulong_t, uchar_t);
 extern void kstat_install(kstat_t *);
 extern void kstat_delete(kstat_t *);
 extern void kstat_waitq_enter(kstat_io_t *);
 extern void kstat_waitq_exit(kstat_io_t *);
 extern void kstat_runq_enter(kstat_io_t *);
 extern void kstat_runq_exit(kstat_io_t *);
 extern void kstat_waitq_to_runq(kstat_io_t *);
 extern void kstat_runq_back_to_waitq(kstat_io_t *);
 /* Registers three callbacks (headers, data, addr) on a kstat. */
 extern void kstat_set_raw_ops(kstat_t *ksp,
     int (*headers)(char *buf, size_t size),
     int (*data)(char *buf, size_t size, void *data),
     void *(*addr)(kstat_t *ksp, loff_t index));
 
 /*
  * Kernel memory
  *
  * Maps the kmem_ and vmem_ allocator names onto the umem library.
  */
 /* Allocation flags: KM_PUSHPAGE is the same as KM_SLEEP here. */
 #define	KM_SLEEP		UMEM_NOFAIL
 #define	KM_PUSHPAGE		KM_SLEEP
 #define	KM_NOSLEEP		UMEM_DEFAULT
 #define	KMC_NODEBUG		UMC_NODEBUG
 /* Both cache-type flags are zero, so they have no effect when OR'd in. */
 #define	KMC_KMEM		0x0
 #define	KMC_VMEM		0x0
 #define	kmem_alloc(_s, _f)	umem_alloc(_s, _f)
 #define	kmem_zalloc(_s, _f)	umem_zalloc(_s, _f)
 #define	kmem_free(_b, _s)	umem_free(_b, _s)
 /* The vmem_ names forward to the kmem_ names above. */
 #define	vmem_alloc(_s, _f)	kmem_alloc(_s, _f)
 #define	vmem_zalloc(_s, _f)	kmem_zalloc(_s, _f)
 #define	vmem_free(_b, _s)	kmem_free(_b, _s)
 #define	kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \
 	umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i)
 #define	kmem_cache_destroy(_c)	umem_cache_destroy(_c)
 #define	kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f)
 #define	kmem_cache_free(_c, _b)	umem_cache_free(_c, _b)
 /* Always evaluates to 0. */
 #define	kmem_debugging()	0
 /* The next three expand to nothing. */
 #define	kmem_cache_reap_now(_c)		/* nothing */
 #define	kmem_cache_set_move(_c, _cb)	/* nothing */
 #define	POINTER_INVALIDATE(_pp)		/* nothing */
 /* Always evaluates to 0, whatever the pointer. */
 #define	POINTER_IS_VALID(_p)	0
 
 typedef umem_cache_t kmem_cache_t;
 
 /* Result codes for a cache move callback (see kmem_cache_set_move). */
 typedef enum kmem_cbrc {
 	KMEM_CBRC_YES,
 	KMEM_CBRC_NO,
 	KMEM_CBRC_LATER,
 	KMEM_CBRC_DONT_NEED,
 	KMEM_CBRC_DONT_KNOW
 } kmem_cbrc_t;
 
 /*
  * Task queues
  *
  * taskq_t is opaque in this header; only taskq_ent_t is laid out here.
  */
 typedef struct taskq taskq_t;
 typedef uintptr_t taskqid_t;
 typedef void (task_func_t)(void *);
 
 /*
  * A single task entry: links to neighbouring entries, the function to
  * run, its argument, and TQENT_FLAG_* bits.
  */
 typedef struct taskq_ent {
 	struct taskq_ent	*tqent_next;
 	struct taskq_ent	*tqent_prev;
 	task_func_t		*tqent_func;
 	void			*tqent_arg;
 	uintptr_t		tqent_flags;
 } taskq_ent_t;
 
 #define	TQENT_FLAG_PREALLOC	0x1	/* taskq_dispatch_ent used */
 
 /* Flags for taskq_create(). */
 #define	TASKQ_PREPOPULATE	0x0001
 #define	TASKQ_CPR_SAFE		0x0002	/* Use CPR safe protocol */
 #define	TASKQ_DYNAMIC		0x0004	/* Use dynamic thread scheduling */
 #define	TASKQ_THREADS_CPU_PCT	0x0008	/* Scale # threads by # cpus */
 #define	TASKQ_DC_BATCH		0x0010	/* Mark threads as batch */
 
 /* Flags for the dispatch functions. */
 #define	TQ_SLEEP	KM_SLEEP	/* Can block for memory */
 #define	TQ_NOSLEEP	KM_NOSLEEP	/* cannot block for memory; may fail */
 #define	TQ_NOQUEUE	0x02		/* Do not enqueue if can't dispatch */
 #define	TQ_FRONT	0x08		/* Queue in front */
 
 extern taskq_t *system_taskq;
 
 extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
 /* Forwards to taskq_create(); the "p" argument is discarded. */
 #define	taskq_create_proc(a, b, c, d, e, p, f) \
 	    (taskq_create(a, b, c, d, e, f))
 /* Forwards to taskq_create() at maxclsyspri; "p" and "dc" are discarded. */
 #define	taskq_create_sysdc(a, b, d, e, p, dc, f) \
 	    (taskq_create(a, b, maxclsyspri, d, e, f))
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
 extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t,
     clock_t);
 extern void	taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
     taskq_ent_t *);
 extern int	taskq_empty_ent(taskq_ent_t *);
 extern void	taskq_init_ent(taskq_ent_t *);
 extern void	taskq_destroy(taskq_t *);
 extern void	taskq_wait(taskq_t *);
 extern void	taskq_wait_id(taskq_t *, taskqid_t);
 extern int	taskq_member(taskq_t *, kthread_t *);
 extern int	taskq_cancel_id(taskq_t *, taskqid_t);
 extern void	system_taskq_init(void);
 extern void	system_taskq_fini(void);
 
 /* XVA_MAPSIZE sizes the attribute bitmaps embedded in xvattr_t. */
 #define	XVA_MAPSIZE	3
 #define	XVA_MAGIC	0x78766174
 
 /*
  * vnodes
  *
  * Minimal userland vnode: a size, a file descriptor and a path.
  * VOP_FSYNC() passes v_fd straight to fsync().
  */
 typedef struct vnode {
 	uint64_t	v_size;
 	int		v_fd;
 	char		*v_path;
 } vnode_t;
 
 #define	AV_SCANSTAMP_SZ	32		/* length of anti-virus scanstamp */
 
 /*
  * Optional attributes carried by xvattr_t: a creation time, a set of
  * one-byte attribute fields, and an AV_SCANSTAMP_SZ-byte scanstamp.
  */
 typedef struct xoptattr {
 	timestruc_t	xoa_createtime;	/* Create time of file */
 	uint8_t		xoa_archive;
 	uint8_t		xoa_system;
 	uint8_t		xoa_readonly;
 	uint8_t		xoa_hidden;
 	uint8_t		xoa_nounlink;
 	uint8_t		xoa_immutable;
 	uint8_t		xoa_appendonly;
 	uint8_t		xoa_nodump;
 	uint8_t		xoa_settable;
 	uint8_t		xoa_opaque;
 	uint8_t		xoa_av_quarantined;
 	uint8_t		xoa_av_modified;
 	uint8_t		xoa_av_scanstamp[AV_SCANSTAMP_SZ];
 	uint8_t		xoa_reparse;
 	uint8_t		xoa_offline;
 	uint8_t		xoa_sparse;
 } xoptattr_t;
 
 /* Basic attribute set: a mask of AT_* bits plus the file size. */
 typedef struct vattr {
 	uint_t		va_mask;	/* bit-mask of attributes */
 	u_offset_t	va_size;	/* file size in bytes */
 } vattr_t;
 
 
 /*
  * Extended attribute set: embeds a vattr_t and adds request/return
  * bitmaps of XVA_MAPSIZE words each, plus the optional attributes.
  */
 typedef struct xvattr {
 	vattr_t		xva_vattr;	/* Embedded vattr structure */
 	uint32_t	xva_magic;	/* Magic Number */
 	uint32_t	xva_mapsize;	/* Size of attr bitmap (32-bit words) */
 	uint32_t	*xva_rtnattrmapp;	/* Ptr to xva_rtnattrmap[] */
 	uint32_t	xva_reqattrmap[XVA_MAPSIZE];	/* Requested attrs */
 	uint32_t	xva_rtnattrmap[XVA_MAPSIZE];	/* Returned attrs */
 	xoptattr_t	xva_xoptattrs;	/* Optional attributes */
 } xvattr_t;
 
 /* ACL container: counts and pointers for the ACL and the default ACL. */
 typedef struct vsecattr {
 	uint_t		vsa_mask;	/* See below */
 	int		vsa_aclcnt;	/* ACL entry count */
 	void		*vsa_aclentp;	/* pointer to ACL entries */
 	int		vsa_dfaclcnt;	/* default ACL entry count */
 	void		*vsa_dfaclentp;	/* pointer to default ACL entries */
 	size_t		vsa_aclentsz;	/* ACE size in bytes of vsa_aclentp */
 } vsecattr_t;
 
 /*
  * Attribute bits, one distinct bit per attribute.
  * NOTE(review): presumably these are the values for vattr_t.va_mask;
  * bit 0x04000 is not assigned in this list.
  */
 #define	AT_TYPE		0x00001
 #define	AT_MODE		0x00002
 #define	AT_UID		0x00004
 #define	AT_GID		0x00008
 #define	AT_FSID		0x00010
 #define	AT_NODEID	0x00020
 #define	AT_NLINK	0x00040
 #define	AT_SIZE		0x00080
 #define	AT_ATIME	0x00100
 #define	AT_MTIME	0x00200
 #define	AT_CTIME	0x00400
 #define	AT_RDEV		0x00800
 #define	AT_BLKSIZE	0x01000
 #define	AT_NBLOCKS	0x02000
 #define	AT_SEQ		0x08000
 #define	AT_XVATTR	0x10000
 
 #define	CRCREAT		0
 
 /* Backs the VOP_GETATTR() macro; defined outside this header. */
 extern int fop_getattr(vnode_t *vp, vattr_t *vap);
 
 /*
  * vnode operation shims.  Each macro takes the kernel-style argument list
  * and uses only the arguments the userland implementation needs:
  *   VOP_CLOSE / VN_RELE	call vn_close(vp)
  *   VOP_PUTPAGE		always evaluates to 0
  *   VOP_GETATTR		calls fop_getattr(vp, vap)
  *   VOP_FSYNC		calls fsync() on the vnode's descriptor
  *
  * VOP_GETATTR deliberately has no trailing semicolon, so it expands to a
  * plain expression that can be used inside conditions and assignments.
  */
 #define	VOP_CLOSE(vp, f, c, o, cr, ct)	vn_close(vp)
 #define	VOP_PUTPAGE(vp, of, sz, fl, cr, ct)	0
 #define	VOP_GETATTR(vp, vap, fl, cr, ct)  fop_getattr((vp), (vap))
 
 #define	VOP_FSYNC(vp, f, cr, ct)	fsync((vp)->v_fd)
 
 #define	VN_RELE(vp)	vn_close(vp)
 
 /* vnode file access; declared here, defined outside this header. */
 extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp,
     int x2, int x3);
 extern int vn_openat(char *path, int x1, int oflags, int mode, vnode_t **vpp,
     int x2, int x3, vnode_t *vp, int fd);
 extern int vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len,
     offset_t offset, int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp);
 extern void vn_close(vnode_t *vp);
 
 /* Map onto remove() and rename(); the extra arguments are discarded. */
 #define	vn_remove(path, x1, x2)		remove(path)
 #define	vn_rename(from, to, seg)	rename((from), (to))
 /* Always B_FALSE, whatever the vnode. */
 #define	vn_is_readonly(vp)		B_FALSE
 
 extern vnode_t *rootdir;
 
 #include <sys/file.h>		/* for FREAD, FWRITE, etc */
 
 /*
  * Random stuff
  *
  * The tick counter (lbolt) is derived from gethrtime() by shifting right
  * 23 bits, i.e. one tick per 2^23 ns (about 8.39 ms), which is where the
  * hz value of 119 comes from.
  */
 #define	ddi_get_lbolt()		(gethrtime() >> 23)
 #define	ddi_get_lbolt64()	(gethrtime() >> 23)
 #define	hz	119	/* frequency when using gethrtime() >> 23 for lbolt */
 
 /*
  * Tick comparison helpers.  Arguments are fully parenthesized so that
  * expressions containing lower-precedence operators (e.g. "x & 1" or
  * "c ? a : b") compare as intended.
  */
 #define	ddi_time_before(a, b)		((a) < (b))
 #define	ddi_time_after(a, b)		ddi_time_before(b, a)
 #define	ddi_time_before_eq(a, b)	(!ddi_time_after(a, b))
 #define	ddi_time_after_eq(a, b)		ddi_time_before_eq(b, a)
 
 #define	ddi_time_before64(a, b)		((a) < (b))
 #define	ddi_time_after64(a, b)		ddi_time_before64(b, a)
 #define	ddi_time_before_eq64(a, b)	(!ddi_time_after64(a, b))
 #define	ddi_time_after_eq64(a, b)	ddi_time_before_eq64(b, a)
 
 extern void delay(clock_t ticks);
 
 /*
  * Time-unit to tick conversions.  The sub-second forms divide by an
  * integer quotient (e.g. MILLISEC / hz), so results are approximate.
  */
 #define	SEC_TO_TICK(sec)	((sec) * hz)
 #define	MSEC_TO_TICK(msec)	((msec) / (MILLISEC / hz))
 #define	USEC_TO_TICK(usec)	((usec) / (MICROSEC / hz))
 #define	NSEC_TO_TICK(nsec)	((nsec) / (NANOSEC / hz))
 
 /*
  * Wall-clock time with one-second resolution: tv_nsec is always set to 0.
  * gethrestime() has no trailing semicolon after "while (0)", so that
  * "if (x) gethrestime(&ts); else ..." parses as a single statement.
  */
 #define	gethrestime_sec() time(NULL)
 #define	gethrestime(t) \
 	do {\
 		(t)->tv_sec = gethrestime_sec();\
 		(t)->tv_nsec = 0;\
 	} while (0)
 
 /* Fixed upper bound used for CPU-indexed structures in userland. */
 #define	max_ncpus	64
 /* Online CPU count as reported by sysconf(_SC_NPROCESSORS_ONLN). */
+#define	num_online_cpus() (sysconf(_SC_NPROCESSORS_ONLN))
 
 #define	minclsyspri	60
 #define	maxclsyspri	99
 
 /*
  * Pseudo CPU id: masks pthread_self() with (max_ncpus - 1).  The mask
  * yields a value in [0, max_ncpus) because max_ncpus is a power of two.
  * NOTE(review): relies on pthread_t being an integer type -- confirm for
  * every supported platform.
  */
 #define	CPU_SEQID	(pthread_self() & (max_ncpus - 1))
 
 /* There are no credentials in userland: both evaluate to NULL. */
 #define	kcred		NULL
 #define	CRED()		NULL
 
 /* Pages to bytes. */
 #define	ptob(x)		((x) * PAGESIZE)
 
 extern uint64_t physmem;
 
 extern int highbit64(uint64_t i);
 extern int random_get_bytes(uint8_t *ptr, size_t len);
 extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len);
 
 extern void kernel_init(int);
 extern void kernel_fini(void);
 
 struct spa;
 extern void nicenum(uint64_t num, char *buf);
 extern void show_pool_stats(struct spa *);
 
 /* CPR callback state: only the associated mutex is remembered. */
 typedef struct callb_cpr {
 	kmutex_t	*cc_lockp;
 } callb_cpr_t;
 
 /* Records lockp in the structure; "func" and "name" are unused. */
 #define	CALLB_CPR_INIT(cp, lockp, func, name)	{		\
 	(cp)->cc_lockp = lockp;					\
 }
 
 /* Only asserts that the recorded mutex is held. */
 #define	CALLB_CPR_SAFE_BEGIN(cp) {				\
 	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
 }
 
 /* Only asserts that the recorded mutex is held; "lockp" is unused. */
 #define	CALLB_CPR_SAFE_END(cp, lockp) {				\
 	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
 }
 
 /* Asserts that the recorded mutex is held, then releases it. */
 #define	CALLB_CPR_EXIT(cp) {					\
 	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
 	mutex_exit((cp)->cc_lockp);				\
 }
 
 /* Zone checks always succeed: both macros evaluate to 1. */
 #define	zone_dataset_visible(x, y)	(1)
 #define	INGLOBALZONE(z)			(1)
 
 extern char *kmem_vasprintf(const char *fmt, va_list adx);
 extern char *kmem_asprintf(const char *fmt, ...);
 /* Frees a NUL-terminated string, passing strlen(str) + 1 as its size. */
 #define	strfree(str) kmem_free((str), strlen(str) + 1)
 
 /*
  * Hostname information
  */
 extern char hw_serial[];	/* for userland-emulated hostid access */
 extern int ddi_strtoul(const char *str, char **nptr, int base,
     unsigned long *result);
 
 extern int ddi_strtoull(const char *str, char **nptr, int base,
     u_longlong_t *result);
 
 typedef struct utsname	utsname_t;
 extern utsname_t *utsname(void);
 
 /* ZFS Boot Related stuff. */
 
 /* File handle used by the kobj_*_file() functions below. */
 struct _buf {
 	intptr_t	_fd;
 };
 
 struct bootstat {
 	uint64_t st_size;
 };
 
 /* Object ACE: who, access mask, flags, type, and two 16-byte type ids. */
 typedef struct ace_object {
 	uid_t		a_who;
 	uint32_t	a_access_mask;
 	uint16_t	a_flags;
 	uint16_t	a_type;
 	uint8_t		a_obj_type[16];
 	uint8_t		a_inherit_obj_type[16];
 } ace_object_t;
 
 
 /* ACE type codes for object ACEs. */
 #define	ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE	0x05
 #define	ACE_ACCESS_DENIED_OBJECT_ACE_TYPE	0x06
 #define	ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE	0x07
 #define	ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE	0x08
 
 /* Declarations only; the definitions are not part of this header. */
 extern struct _buf *kobj_open_file(char *name);
 extern int kobj_read_file(struct _buf *file, char *buf, unsigned size,
     unsigned off);
 extern void kobj_close_file(struct _buf *file);
 extern int kobj_get_filesize(struct _buf *file, uint64_t *size);
 extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr);
 extern int zfs_secpolicy_rename_perms(const char *from, const char *to,
     cred_t *cr);
 extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
 extern zoneid_t getzoneid(void);
 
 /* SID stuff */
 /* SID domain record: a reference count, a length and a name string. */
 typedef struct ksiddomain {
 	uint_t	kd_ref;
 	uint_t	kd_len;
 	char	*kd_name;
 } ksiddomain_t;
 
 ksiddomain_t *ksid_lookupdomain(const char *);
 void ksiddomain_rele(ksiddomain_t *);
 
 #define	DDI_SLEEP	KM_SLEEP
 /*
  * Forwards to sysevent_post_event() with the arguments reordered and the
  * literal "libzpool" inserted; the first (_a) and last (_g) arguments are
  * discarded.
  */
 #define	ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \
 	sysevent_post_event(_c, _d, _b, "libzpool", _e, _f)
 
 /*
  * Sleep until the gethrtime() timestamp "wakeup".  The remaining interval
  * (wakeup minus the current gethrtime() value) is split into whole seconds
  * and leftover nanoseconds and handed to nanosleep(), whose return value
  * is deliberately ignored.  "wakeup" is parenthesized so that an argument
  * such as "c ? a : b" is evaluated as a unit before the subtraction.
  */
 #define	zfs_sleep_until(wakeup)						\
 	do {								\
 		hrtime_t delta = (wakeup) - gethrtime();		\
 		struct timespec ts;					\
 		ts.tv_sec = delta / NANOSEC;				\
 		ts.tv_nsec = delta % NANOSEC;				\
 		(void) nanosleep(&ts, NULL);				\
 	} while (0)
 
 /* Cookie returned by spl_fstrans_mark() and consumed by unmark. */
 typedef int fstrans_cookie_t;
 
 extern fstrans_cookie_t spl_fstrans_mark(void);
 extern void spl_fstrans_unmark(fstrans_cookie_t);
 extern int spl_fstrans_check(void);
 
 #endif /* _KERNEL */
 #endif	/* _SYS_ZFS_CONTEXT_H */
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 85bc0510a81d..e25591300ef7 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -1,131 +1,132 @@
 # Pull in the build rules shared across the tree.
 include $(top_srcdir)/config/Rules.am
 
 # Extra compiler flags for this library.
 AM_CFLAGS += $(DEBUG_STACKFLAGS) $(FRAME_LARGER_THAN)
 
 # Header search path: the top-level headers and the libspl headers.
 DEFAULT_INCLUDES += \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/lib/libspl/include
 
 # The single libtool library built from this directory.
 lib_LTLIBRARIES = libzpool.la
 
 # Sources compiled into libzpool: the local files under lib/libzpool,
 # followed by files taken from module/zcommon and module/zfs.
 libzpool_la_SOURCES = \
 	$(top_srcdir)/lib/libzpool/kernel.c \
 	$(top_srcdir)/lib/libzpool/taskq.c \
 	$(top_srcdir)/lib/libzpool/util.c \
 	$(top_srcdir)/module/zcommon/zfs_comutil.c \
 	$(top_srcdir)/module/zcommon/zfs_deleg.c \
 	$(top_srcdir)/module/zcommon/zfs_fletcher.c \
 	$(top_srcdir)/module/zcommon/zfs_namecheck.c \
 	$(top_srcdir)/module/zcommon/zfs_prop.c \
 	$(top_srcdir)/module/zcommon/zfs_uio.c \
 	$(top_srcdir)/module/zcommon/zpool_prop.c \
 	$(top_srcdir)/module/zcommon/zprop_common.c \
 	$(top_srcdir)/module/zfs/arc.c \
 	$(top_srcdir)/module/zfs/blkptr.c \
 	$(top_srcdir)/module/zfs/bplist.c \
 	$(top_srcdir)/module/zfs/bpobj.c \
 	$(top_srcdir)/module/zfs/bptree.c \
 	$(top_srcdir)/module/zfs/dbuf.c \
 	$(top_srcdir)/module/zfs/dbuf_stats.c \
 	$(top_srcdir)/module/zfs/ddt.c \
 	$(top_srcdir)/module/zfs/ddt_zap.c \
 	$(top_srcdir)/module/zfs/dmu.c \
 	$(top_srcdir)/module/zfs/dmu_diff.c \
 	$(top_srcdir)/module/zfs/dmu_object.c \
 	$(top_srcdir)/module/zfs/dmu_objset.c \
 	$(top_srcdir)/module/zfs/dmu_send.c \
 	$(top_srcdir)/module/zfs/dmu_traverse.c \
 	$(top_srcdir)/module/zfs/dmu_tx.c \
 	$(top_srcdir)/module/zfs/dmu_zfetch.c \
 	$(top_srcdir)/module/zfs/dnode.c \
 	$(top_srcdir)/module/zfs/dnode_sync.c \
 	$(top_srcdir)/module/zfs/dsl_bookmark.c \
 	$(top_srcdir)/module/zfs/dsl_dataset.c \
 	$(top_srcdir)/module/zfs/dsl_deadlist.c \
 	$(top_srcdir)/module/zfs/dsl_deleg.c \
 	$(top_srcdir)/module/zfs/dsl_dir.c \
 	$(top_srcdir)/module/zfs/dsl_pool.c \
 	$(top_srcdir)/module/zfs/dsl_prop.c \
 	$(top_srcdir)/module/zfs/dsl_scan.c \
 	$(top_srcdir)/module/zfs/dsl_synctask.c \
 	$(top_srcdir)/module/zfs/dsl_destroy.c \
 	$(top_srcdir)/module/zfs/dsl_userhold.c \
 	$(top_srcdir)/module/zfs/fm.c \
 	$(top_srcdir)/module/zfs/gzip.c \
 	$(top_srcdir)/module/zfs/lzjb.c \
 	$(top_srcdir)/module/zfs/lz4.c \
 	$(top_srcdir)/module/zfs/metaslab.c \
+	$(top_srcdir)/module/zfs/multilist.c \
 	$(top_srcdir)/module/zfs/range_tree.c \
 	$(top_srcdir)/module/zfs/refcount.c \
 	$(top_srcdir)/module/zfs/rrwlock.c \
 	$(top_srcdir)/module/zfs/sa.c \
 	$(top_srcdir)/module/zfs/sha256.c \
 	$(top_srcdir)/module/zfs/spa.c \
 	$(top_srcdir)/module/zfs/spa_boot.c \
 	$(top_srcdir)/module/zfs/spa_config.c \
 	$(top_srcdir)/module/zfs/spa_errlog.c \
 	$(top_srcdir)/module/zfs/spa_history.c \
 	$(top_srcdir)/module/zfs/spa_misc.c \
 	$(top_srcdir)/module/zfs/spa_stats.c \
 	$(top_srcdir)/module/zfs/space_map.c \
 	$(top_srcdir)/module/zfs/space_reftree.c \
 	$(top_srcdir)/module/zfs/txg.c \
 	$(top_srcdir)/module/zfs/trace.c \
 	$(top_srcdir)/module/zfs/uberblock.c \
 	$(top_srcdir)/module/zfs/unique.c \
 	$(top_srcdir)/module/zfs/vdev.c \
 	$(top_srcdir)/module/zfs/vdev_cache.c \
 	$(top_srcdir)/module/zfs/vdev_file.c \
 	$(top_srcdir)/module/zfs/vdev_label.c \
 	$(top_srcdir)/module/zfs/vdev_mirror.c \
 	$(top_srcdir)/module/zfs/vdev_missing.c \
 	$(top_srcdir)/module/zfs/vdev_queue.c \
 	$(top_srcdir)/module/zfs/vdev_raidz.c \
 	$(top_srcdir)/module/zfs/vdev_root.c \
 	$(top_srcdir)/module/zfs/zap.c \
 	$(top_srcdir)/module/zfs/zap_leaf.c \
 	$(top_srcdir)/module/zfs/zap_micro.c \
 	$(top_srcdir)/module/zfs/zfeature.c \
 	$(top_srcdir)/module/zfs/zfeature_common.c \
 	$(top_srcdir)/module/zfs/zfs_byteswap.c \
 	$(top_srcdir)/module/zfs/zfs_debug.c \
 	$(top_srcdir)/module/zfs/zfs_fm.c \
 	$(top_srcdir)/module/zfs/zfs_fuid.c \
 	$(top_srcdir)/module/zfs/zfs_sa.c \
 	$(top_srcdir)/module/zfs/zfs_znode.c \
 	$(top_srcdir)/module/zfs/zil.c \
 	$(top_srcdir)/module/zfs/zio.c \
 	$(top_srcdir)/module/zfs/zio_checksum.c \
 	$(top_srcdir)/module/zfs/zio_compress.c \
 	$(top_srcdir)/module/zfs/zio_inject.c \
 	$(top_srcdir)/module/zfs/zle.c \
 	$(top_srcdir)/module/zfs/zrlock.c
 
 # In-tree libtool libraries that libzpool links against.
 libzpool_la_LIBADD = \
 	$(top_builddir)/lib/libunicode/libunicode.la \
 	$(top_builddir)/lib/libuutil/libuutil.la \
 	$(top_builddir)/lib/libnvpair/libnvpair.la
 
 # Plus whatever $(ZLIB) expands to.
 libzpool_la_LIBADD += $(ZLIB)
 # libtool interface version, in current:revision:age form.
 libzpool_la_LDFLAGS = -version-info 2:0:0
 
 # Files shipped in the distribution but not listed in libzpool_la_SOURCES,
 # so they are not compiled into libzpool.
 EXTRA_DIST = \
 	$(top_srcdir)/module/zfs/vdev_disk.c \
 	$(top_srcdir)/module/zfs/zfs_acl.c \
 	$(top_srcdir)/module/zfs/zfs_ctldir.c \
 	$(top_srcdir)/module/zfs/zfs_dir.c \
 	$(top_srcdir)/module/zfs/zfs_ioctl.c \
 	$(top_srcdir)/module/zfs/zfs_log.c \
 	$(top_srcdir)/module/zfs/zfs_onexit.c \
 	$(top_srcdir)/module/zfs/zfs_replay.c \
 	$(top_srcdir)/module/zfs/zfs_rlock.c \
 	$(top_srcdir)/module/zfs/zfs_vfsops.c \
 	$(top_srcdir)/module/zfs/zfs_vnops.c \
 	$(top_srcdir)/module/zfs/zpl_ctldir.c \
 	$(top_srcdir)/module/zfs/zpl_export.c \
 	$(top_srcdir)/module/zfs/zpl_file.c \
 	$(top_srcdir)/module/zfs/zpl_inode.c \
 	$(top_srcdir)/module/zfs/zpl_super.c \
 	$(top_srcdir)/module/zfs/zpl_xattr.c \
 	$(top_srcdir)/module/zfs/zvol.c \
 	$(top_srcdir)/module/zpios/pios.c
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 359e9f72f35e..250adc9efa2b 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1,1671 +1,1731 @@
 '\" te
 .\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
 .\" The contents of this file are subject to the terms of the Common Development
 .\" and Distribution License (the "License").  You may not use this file except
 .\" in compliance with the License. You can obtain a copy of the license at
 .\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.
 .\"
 .\" See the License for the specific language governing permissions and
 .\" limitations under the License. When distributing Covered Code, include this
 .\" CDDL HEADER in each file and include the License file at
 .\" usr/src/OPENSOLARIS.LICENSE.  If applicable, add the following below this
 .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .TH ZFS-MODULE-PARAMETERS 5 "Nov 16, 2013"
 .SH NAME
 zfs\-module\-parameters \- ZFS module parameters
 .SH DESCRIPTION
 .sp
 .LP
 Description of the different parameters to the ZFS module.
 
 .SS "Module parameters"
 .sp
 .LP
 
 .sp
 .ne 2
 .na
 \fBl2arc_feed_again\fR (int)
 .ad
 .RS 12n
 Turbo L2ARC warmup
 .sp
 Use \fB1\fR for yes (default) and \fB0\fR to disable.
 .RE
 
 .sp
 .ne 2
 .na
 \fBl2arc_feed_min_ms\fR (ulong)
 .ad
 .RS 12n
 Min feed interval in milliseconds
 .sp
 Default value: \fB200\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBl2arc_feed_secs\fR (ulong)
 .ad
 .RS 12n
 Seconds between L2ARC writing
 .sp
 Default value: \fB1\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBl2arc_headroom\fR (ulong)
 .ad
 .RS 12n
 Number of max device writes to precache
 .sp
 Default value: \fB2\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBl2arc_headroom_boost\fR (ulong)
 .ad
 .RS 12n
 Compressed l2arc_headroom multiplier
 .sp
 Default value: \fB200\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBl2arc_nocompress\fR (int)
 .ad
 .RS 12n
 Skip compressing L2ARC buffers
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBl2arc_noprefetch\fR (int)
 .ad
 .RS 12n
 Skip caching prefetched buffers
 .sp
 Use \fB1\fR for yes (default) and \fB0\fR to disable.
 .RE
 
 .sp
 .ne 2
 .na
 \fBl2arc_norw\fR (int)
 .ad
 .RS 12n
 No reads during writes
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBl2arc_write_boost\fR (ulong)
 .ad
 .RS 12n
 Extra write bytes during device warmup
 .sp
 Default value: \fB8,388,608\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBl2arc_write_max\fR (ulong)
 .ad
 .RS 12n
 Max write bytes per interval
 .sp
 Default value: \fB8,388,608\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBmetaslab_bias_enabled\fR (int)
 .ad
 .RS 12n
 Enable metaslab group biasing based on its vdev's over- or under-utilization
 relative to the pool.
 .sp
 Use \fB1\fR for yes (default) and \fB0\fR for no.
 .RE
 
 .sp
 .ne 2
 .na
 \fBmetaslab_debug_load\fR (int)
 .ad
 .RS 12n
 Load all metaslabs during pool import.
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBmetaslab_debug_unload\fR (int)
 .ad
 .RS 12n
 Prevent metaslabs from being unloaded.
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBmetaslab_fragmentation_factor_enabled\fR (int)
 .ad
 .RS 12n
 Enable use of the fragmentation metric in computing metaslab weights.
 .sp
 Use \fB1\fR for yes (default) and \fB0\fR for no.
 .RE
 
 .sp
 .ne 2
 .na
 \fBmetaslabs_per_vdev\fR (int)
 .ad
 .RS 12n
 When a vdev is added, it will be divided into approximately (but no more than) this number of metaslabs.
 .sp
 Default value: \fB200\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBmetaslab_preload_enabled\fR (int)
 .ad
 .RS 12n
 Enable metaslab group preloading.
 .sp
 Use \fB1\fR for yes (default) and \fB0\fR for no.
 .RE
 
 .sp
 .ne 2
 .na
 \fBmetaslab_lba_weighting_enabled\fR (int)
 .ad
 .RS 12n
 Give more weight to metaslabs with lower LBAs, assuming they have
 greater bandwidth as is typically the case on a modern constant
 angular velocity disk drive.
 .sp
 Use \fB1\fR for yes (default) and \fB0\fR for no.
 .RE
 
 .sp
 .ne 2
 .na
 \fBspa_config_path\fR (charp)
 .ad
 .RS 12n
 SPA config file
 .sp
 Default value: \fB/etc/zfs/zpool.cache\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBspa_asize_inflation\fR (int)
 .ad
 .RS 12n
 Multiplication factor used to estimate actual disk consumption from the
 size of data being written. The default value is a worst case estimate,
 but lower values may be valid for a given pool depending on its
 configuration.  Pool administrators who understand the factors involved
 may wish to specify a more realistic inflation factor, particularly if
 they operate close to quota or capacity limits.
 .sp
 Default value: 24
 .RE
 
 .sp
 .ne 2
 .na
 \fBspa_load_verify_data\fR (int)
 .ad
 .RS 12n
 Whether to traverse data blocks during an "extreme rewind" (\fB-X\fR)
 import.  Use 0 to disable and 1 to enable.
 
 An extreme rewind import normally performs a full traversal of all
 blocks in the pool for verification.  If this parameter is set to 0,
 the traversal skips non-metadata blocks.  It can be toggled once the
 import has started to stop or start the traversal of non-metadata blocks.
 .sp
 Default value: 1
 .RE
 
 .sp
 .ne 2
 .na
 \fBspa_load_verify_metadata\fR (int)
 .ad
 .RS 12n
 Whether to traverse blocks during an "extreme rewind" (\fB-X\fR)
 pool import.  Use 0 to disable and 1 to enable.
 
 An extreme rewind import normally performs a full traversal of all
 blocks in the pool for verification.  If this parameter is set to 0,
 the traversal is not performed.  It can be toggled once the import has
 started to stop or start the traversal.
 .sp
 Default value: 1
 .RE
 
 .sp
 .ne 2
 .na
 \fBspa_load_verify_maxinflight\fR (int)
 .ad
 .RS 12n
 Maximum concurrent I/Os during the traversal performed during an "extreme
 rewind" (\fB-X\fR) pool import.
 .sp
 Default value: 10000
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfetch_array_rd_sz\fR (ulong)
 .ad
 .RS 12n
 If prefetching is enabled, disable prefetching for reads larger than this size.
 .sp
 Default value: \fB1,048,576\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfetch_block_cap\fR (uint)
 .ad
 .RS 12n
 Max number of blocks to prefetch at a time
 .sp
 Default value: \fB256\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfetch_max_streams\fR (uint)
 .ad
 .RS 12n
 Max number of streams per zfetch (prefetch streams per file).
 .sp
 Default value: \fB8\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfetch_min_sec_reap\fR (uint)
 .ad
 .RS 12n
 Min time before an active prefetch stream can be reclaimed
 .sp
 Default value: \fB2\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_arc_average_blocksize\fR (int)
 .ad
 .RS 12n
 The ARC's buffer hash table is sized based on the assumption of an average
 block size of \fBzfs_arc_average_blocksize\fR (default 8K).  This works out
 to roughly 1MB of hash table per 1GB of physical memory with 8-byte pointers.
 For configurations with a known larger average block size this value can be
 increased to reduce the memory footprint.
 
 .sp
 Default value: \fB8192\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_arc_evict_batch_limit\fR (int)
+.ad
+.RS 12n
+Number of ARC headers to evict per sub-list before proceeding to another sub-list.
+This batch-style operation prevents entire sub-lists from being evicted at once
+but comes at a cost of additional unlocking and locking.
+.sp
+Default value: \fB10\fR.
+.RE
+
 .sp
 .ne 2
 .na
 \fBzfs_arc_grow_retry\fR (int)
 .ad
 .RS 12n
 Seconds before growing arc size
 .sp
 Default value: \fB5\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_arc_max\fR (ulong)
 .ad
 .RS 12n
 Max arc size
 .sp
 Default value: \fB0\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_arc_memory_throttle_disable\fR (int)
 .ad
 .RS 12n
 Disable memory throttle
 .sp
 Use \fB1\fR for yes (default) and \fB0\fR to disable.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_arc_meta_limit\fR (ulong)
 .ad
 .RS 12n
 The maximum allowed size in bytes that meta data buffers are allowed to
 consume in the ARC.  When this limit is reached meta data buffers will
 be reclaimed even if the overall arc_c_max has not been reached.  This
 value defaults to 0 which indicates that 3/4 of the ARC may be used
 for meta data.
 .sp
 Default value: \fB0\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_arc_meta_min\fR (ulong)
+.ad
+.RS 12n
+The minimum allowed size in bytes that meta data buffers may consume in
+the ARC.  This value defaults to 0 which disables a floor on the amount
+of the ARC devoted to meta data.
+.sp
+Default value: \fB0\fR.
+.RE
+
 .sp
 .ne 2
 .na
 \fBzfs_arc_meta_prune\fR (int)
 .ad
 .RS 12n
 The number of dentries and inodes to be scanned looking for entries
 which can be dropped.  This may be required when the ARC reaches the
 \fBzfs_arc_meta_limit\fR because dentries and inodes can pin buffers
 in the ARC.  Increasing this value will cause the dentry and inode caches
 to be pruned more aggressively.  Setting this value to 0 will disable
 pruning the inode and dentry caches.
 .sp
 Default value: \fB10,000\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_arc_meta_adjust_restarts\fR (ulong)
 .ad
 .RS 12n
 The number of restart passes to make while scanning the ARC attempting
 to free buffers in order to stay below the \fBzfs_arc_meta_limit\fR.
 This value should not need to be tuned but is available to facilitate
 performance analysis.
 .sp
 Default value: \fB4096\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_arc_min\fR (ulong)
 .ad
 .RS 12n
 Min arc size
 .sp
 Default value: \fB100\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_arc_min_prefetch_lifespan\fR (int)
 .ad
 .RS 12n
 Min life of prefetch block
 .sp
 Default value: \fB100\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_arc_num_sublists_per_state\fR (int)
+.ad
+.RS 12n
+To allow more fine-grained locking, each ARC state contains a series
+of lists for both data and meta data objects.  Locking is performed at
+the level of these "sub-lists".  This parameter controls the number of
+sub-lists per ARC state.
+.sp
+Default value: 1 or the number of online CPUs, whichever is greater
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_overflow_shift\fR (int)
+.ad
+.RS 12n
+The ARC size is considered to be overflowing if it exceeds the current
+ARC target size (arc_c) by a threshold determined by this parameter.
+The threshold is calculated as a fraction of arc_c using the formula
+"arc_c >> \fBzfs_arc_overflow_shift\fR".
+
+The default value of 8 causes the ARC to be considered to be overflowing
+if it exceeds the target size by 1/256th (about 0.4%) of the target size.
+
+When the ARC is overflowing, new buffer allocations are stalled until
+the reclaim thread catches up and the overflow condition no longer exists.
+.sp
+Default value: \fB8\fR.
+.RE
+
 .sp
 .ne 2
 .na
 \fBzfs_arc_p_aggressive_disable\fR (int)
 .ad
 .RS 12n
 Disable aggressive arc_p growth
 .sp
 Use \fB1\fR for yes (default) and \fB0\fR to disable.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_arc_p_dampener_disable\fR (int)
 .ad
 .RS 12n
 Disable arc_p adapt dampener
 .sp
 Use \fB1\fR for yes (default) and \fB0\fR to disable.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_arc_shrink_shift\fR (int)
 .ad
 .RS 12n
 log2(fraction of arc to reclaim)
 .sp
 Default value: \fB5\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_autoimport_disable\fR (int)
 .ad
 .RS 12n
 Disable pool import at module load by ignoring the cache file (typically \fB/etc/zfs/zpool.cache\fR).
 .sp
 Use \fB1\fR for yes (default) and \fB0\fR for no.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_dbuf_state_index\fR (int)
 .ad
 .RS 12n
 Calculate arc header index
 .sp
 Default value: \fB0\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_deadman_enabled\fR (int)
 .ad
 .RS 12n
 Enable deadman timer
 .sp
 Use \fB1\fR for yes (default) and \fB0\fR to disable.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_deadman_synctime_ms\fR (ulong)
 .ad
 .RS 12n
 Expiration time in milliseconds. This value has two meanings. First it is
 used to determine when the spa_deadman() logic should fire. By default the
 spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
 Secondly, the value determines if an I/O is considered "hung". Any I/O that
 has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
 in a zevent being logged.
 .sp
 Default value: \fB1,000,000\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_dedup_prefetch\fR (int)
 .ad
 .RS 12n
 Enable prefetching dedup-ed blks
 .sp
 Use \fB1\fR for yes and \fB0\fR to disable (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_delay_min_dirty_percent\fR (int)
 .ad
 .RS 12n
 Start to delay each transaction once there is this amount of dirty data,
 expressed as a percentage of \fBzfs_dirty_data_max\fR.
 This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
 See the section "ZFS TRANSACTION DELAY".
 .sp
 Default value: \fB60\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_delay_scale\fR (int)
 .ad
 .RS 12n
 This controls how quickly the transaction delay approaches infinity.
 Larger values cause longer delays for a given amount of dirty data.
 .sp
 For the smoothest delay, this value should be about 1 billion divided
 by the maximum number of operations per second.  This will smoothly
 handle between 10x and 1/10th this number.
 .sp
 See the section "ZFS TRANSACTION DELAY".
 .sp
 Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64.
 .sp
 Default value: \fB500,000\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_dirty_data_max\fR (int)
 .ad
 .RS 12n
 Determines the dirty space limit in bytes.  Once this limit is exceeded, new
 writes are halted until space frees up. This parameter takes precedence
 over \fBzfs_dirty_data_max_percent\fR.
 See the section "ZFS TRANSACTION DELAY".
 .sp
 Default value: 10 percent of all memory, capped at \fBzfs_dirty_data_max_max\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_dirty_data_max_max\fR (int)
 .ad
 .RS 12n
 Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed in bytes.
 This limit is only enforced at module load time, and will be ignored if
 \fBzfs_dirty_data_max\fR is later changed.  This parameter takes
 precedence over \fBzfs_dirty_data_max_max_percent\fR. See the section
 "ZFS TRANSACTION DELAY".
 .sp
 Default value: 25% of physical RAM.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_dirty_data_max_max_percent\fR (int)
 .ad
 .RS 12n
 Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed as a
 percentage of physical RAM.  This limit is only enforced at module load
 time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed.
 The parameter \fBzfs_dirty_data_max_max\fR takes precedence over this
 one. See the section "ZFS TRANSACTION DELAY".
 .sp
 Default value: 25
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_dirty_data_max_percent\fR (int)
 .ad
 .RS 12n
 Determines the dirty space limit, expressed as a percentage of all
 memory.  Once this limit is exceeded, new writes are halted until space frees
 up.  The parameter \fBzfs_dirty_data_max\fR takes precedence over this
 one.  See the section "ZFS TRANSACTION DELAY".
 .sp
 Default value: 10%, subject to \fBzfs_dirty_data_max_max\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_dirty_data_sync\fR (int)
 .ad
 .RS 12n
 Start syncing out a transaction group if there is at least this much dirty data.
 .sp
 Default value: \fB67,108,864\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_free_max_blocks\fR (ulong)
 .ad
 .RS 12n
 Maximum number of blocks freed in a single txg.
 .sp
 Default value: \fB100,000\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_async_read_max_active\fR (int)
 .ad
 .RS 12n
 Maximum asynchronous read I/Os active to each device.
 See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB3\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_async_read_min_active\fR (int)
 .ad
 .RS 12n
 Minimum asynchronous read I/Os active to each device.
 See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB1\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_async_write_active_max_dirty_percent\fR (int)
 .ad
 .RS 12n
 When the pool has more than
 \fBzfs_vdev_async_write_active_max_dirty_percent\fR dirty data, use
 \fBzfs_vdev_async_write_max_active\fR to limit active async writes.  If
 the dirty data is between min and max, the active I/O limit is linearly
 interpolated. See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB60\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_async_write_active_min_dirty_percent\fR (int)
 .ad
 .RS 12n
 When the pool has less than
 \fBzfs_vdev_async_write_active_min_dirty_percent\fR dirty data, use
 \fBzfs_vdev_async_write_min_active\fR to limit active async writes.  If
 the dirty data is between min and max, the active I/O limit is linearly
 interpolated. See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB30\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_async_write_max_active\fR (int)
 .ad
 .RS 12n
 Maximum asynchronous write I/Os active to each device.
 See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB10\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_async_write_min_active\fR (int)
 .ad
 .RS 12n
 Minimum asynchronous write I/Os active to each device.
 See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB1\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_max_active\fR (int)
 .ad
 .RS 12n
 The maximum number of I/Os active to each device.  Ideally, this will be >=
 the sum of each queue's max_active.  It must be at least the sum of each
 queue's min_active.  See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB1,000\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_scrub_max_active\fR (int)
 .ad
 .RS 12n
 Maximum scrub I/Os active to each device.
 See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB2\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_scrub_min_active\fR (int)
 .ad
 .RS 12n
 Minimum scrub I/Os active to each device.
 See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB1\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_sync_read_max_active\fR (int)
 .ad
 .RS 12n
 Maximum synchronous read I/Os active to each device.
 See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB10\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_sync_read_min_active\fR (int)
 .ad
 .RS 12n
 Minimum synchronous read I/Os active to each device.
 See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB10\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_sync_write_max_active\fR (int)
 .ad
 .RS 12n
 Maximum synchronous write I/Os active to each device.
 See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB10\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_sync_write_min_active\fR (int)
 .ad
 .RS 12n
 Minimum synchronous write I/Os active to each device.
 See the section "ZFS I/O SCHEDULER".
 .sp
 Default value: \fB10\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_disable_dup_eviction\fR (int)
 .ad
 .RS 12n
 Disable duplicate buffer eviction
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_expire_snapshot\fR (int)
 .ad
 .RS 12n
 Seconds to expire .zfs/snapshot
 .sp
 Default value: \fB300\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_flags\fR (int)
 .ad
 .RS 12n
 Set additional debugging flags. The following flags may be bitwise-or'd
 together.
 .sp
 .TS
 box;
 rB lB
 lB lB
 r l.
 Value	Symbolic Name
 	Description
 _
 1	ZFS_DEBUG_DPRINTF
 	Enable dprintf entries in the debug log.
 _
 2	ZFS_DEBUG_DBUF_VERIFY *
 	Enable extra dbuf verifications.
 _
 4	ZFS_DEBUG_DNODE_VERIFY *
 	Enable extra dnode verifications.
 _
 8	ZFS_DEBUG_SNAPNAMES
 	Enable snapshot name verification.
 _
 16	ZFS_DEBUG_MODIFY
 	Check for illegally modified ARC buffers.
 _
 32	ZFS_DEBUG_SPA
 	Enable spa_dbgmsg entries in the debug log.
 _
 64	ZFS_DEBUG_ZIO_FREE
 	Enable verification of block frees.
 _
 128	ZFS_DEBUG_HISTOGRAM_VERIFY
 	Enable extra spacemap histogram verifications.
 .TE
 .sp
 * Requires debug build.
 .sp
 Default value: \fB0\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_free_leak_on_eio\fR (int)
 .ad
 .RS 12n
 If destroy encounters an EIO while reading metadata (e.g. indirect
 blocks), space referenced by the missing metadata can not be freed.
 Normally this causes the background destroy to become "stalled", as
 it is unable to make forward progress.  While in this stalled state,
 all remaining space to free from the error-encountering filesystem is
 "temporarily leaked".  Set this flag to cause it to ignore the EIO,
 permanently leak the space from indirect blocks that can not be read,
 and continue to free everything else that it can.
 
 The default, "stalling" behavior is useful if the storage partially
 fails (i.e. some but not all i/os fail), and then later recovers.  In
 this case, we will be able to continue pool operations while it is
 partially failed, and when it recovers, we can continue to free the
 space, with no leaks.  However, note that this case is actually
 fairly rare.
 
 Typically pools either (a) fail completely (but perhaps temporarily,
 e.g. a top-level vdev going offline), or (b) have localized,
 permanent errors (e.g. disk returns the wrong data due to bit flip or
 firmware bug).  In case (a), this setting does not matter because the
 pool will be suspended and the sync thread will not be able to make
 forward progress regardless.  In case (b), because the error is
 permanent, the best we can do is leak the minimum amount of space,
 which is what setting this flag will do.  Therefore, it is reasonable
 for this flag to normally be set, but we chose the more conservative
 approach of not setting it, so that there is no possibility of
 leaking space in the "partial temporary" failure case.
 .sp
 Default value: \fB0\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_free_min_time_ms\fR (int)
 .ad
 .RS 12n
 Min millisecs to free per txg
 .sp
 Default value: \fB1,000\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_immediate_write_sz\fR (long)
 .ad
 .RS 12n
 Largest data block to write to zil
 .sp
 Default value: \fB32,768\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_max_recordsize\fR (int)
 .ad
 .RS 12n
 We currently support block sizes from 512 bytes to 16MB.  The benefits of
 larger blocks, and thus larger IO, need to be weighed against the cost of
 COWing a giant block to modify one byte.  Additionally, very large blocks
 can have an impact on i/o latency, and also potentially on the memory
 allocator.  Therefore, we do not allow the recordsize to be set larger than
 zfs_max_recordsize (default 1MB).  Larger blocks can be created by changing
 this tunable, and pools with larger blocks can always be imported and used,
 regardless of this setting.
 .sp
 Default value: \fB1,048,576\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_mdcomp_disable\fR (int)
 .ad
 .RS 12n
 Disable meta data compression
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_metaslab_fragmentation_threshold\fR (int)
 .ad
 .RS 12n
 Allow metaslabs to keep their active state as long as their fragmentation
 percentage is less than or equal to this value. An active metaslab that
 exceeds this threshold will no longer keep its active status allowing
 better metaslabs to be selected.
 .sp
 Default value: \fB70\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_mg_fragmentation_threshold\fR (int)
 .ad
 .RS 12n
 Metaslab groups are considered eligible for allocations if their
 fragmentation metric (measured as a percentage) is less than or equal to
 this value. If a metaslab group exceeds this threshold then it will be
 skipped unless all metaslab groups within the metaslab class have also
 crossed this threshold.
 .sp
 Default value: \fB85\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_mg_noalloc_threshold\fR (int)
 .ad
 .RS 12n
 Defines a threshold at which metaslab groups should be eligible for
 allocations.  The value is expressed as a percentage of free space
 beyond which a metaslab group is always eligible for allocations.
 If a metaslab group's free space is less than or equal to the
 threshold, the allocator will avoid allocating to that group
 unless all groups in the pool have reached the threshold.  Once all
 groups have reached the threshold, all groups are allowed to accept
 allocations.  The default value of 0 disables the feature and causes
 all metaslab groups to be eligible for allocations.
 
 This parameter allows dealing with pools having heavily imbalanced
 vdevs such as would be the case when a new vdev has been added.
 Setting the threshold to a non-zero percentage will stop allocations
 from being made to vdevs that aren't filled to the specified percentage
 and allow lesser filled vdevs to acquire more allocations than they
 otherwise would under the old \fBzfs_mg_alloc_failures\fR facility.
 .sp
 Default value: \fB0\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_no_scrub_io\fR (int)
 .ad
 .RS 12n
 Set for no scrub I/O
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_no_scrub_prefetch\fR (int)
 .ad
 .RS 12n
 Set for no scrub prefetching
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_nocacheflush\fR (int)
 .ad
 .RS 12n
 Disable cache flushes
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_nopwrite_enabled\fR (int)
 .ad
 .RS 12n
 Enable NOP writes
 .sp
 Use \fB1\fR for yes (default) and \fB0\fR to disable.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_pd_bytes_max\fR (int)
 .ad
 .RS 12n
 The number of bytes which should be prefetched.
 .sp
 Default value: \fB52,428,800\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_prefetch_disable\fR (int)
 .ad
 .RS 12n
 Disable all ZFS prefetching
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_read_chunk_size\fR (long)
 .ad
 .RS 12n
 Bytes to read per chunk
 .sp
 Default value: \fB1,048,576\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_read_history\fR (int)
 .ad
 .RS 12n
 Historic statistics for the last N reads
 .sp
 Default value: \fB0\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_read_history_hits\fR (int)
 .ad
 .RS 12n
 Include cache hits in read history
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_recover\fR (int)
 .ad
 .RS 12n
 Set to attempt to recover from fatal errors. This should only be used as a
 last resort, as it typically results in leaked space, or worse.
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_resilver_delay\fR (int)
 .ad
 .RS 12n
 Number of ticks to delay prior to issuing a resilver I/O operation when
 a non-resilver or non-scrub I/O operation has occurred within the past
 \fBzfs_scan_idle\fR ticks.
 .sp
 Default value: \fB2\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_resilver_min_time_ms\fR (int)
 .ad
 .RS 12n
 Min millisecs to resilver per txg
 .sp
 Default value: \fB3,000\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_scan_idle\fR (int)
 .ad
 .RS 12n
 Idle window in clock ticks.  During a scrub or a resilver, if
 a non-scrub or non-resilver I/O operation has occurred during this
 window, the next scrub or resilver operation is delayed by, respectively
 \fBzfs_scrub_delay\fR or \fBzfs_resilver_delay\fR ticks.
 .sp
 Default value: \fB50\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_scan_min_time_ms\fR (int)
 .ad
 .RS 12n
 Min millisecs to scrub per txg
 .sp
 Default value: \fB1,000\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_scrub_delay\fR (int)
 .ad
 .RS 12n
 Number of ticks to delay prior to issuing a scrub I/O operation when
 a non-scrub or non-resilver I/O operation has occurred within the past
 \fBzfs_scan_idle\fR ticks.
 .sp
 Default value: \fB4\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_send_corrupt_data\fR (int)
 .ad
 .RS 12n
 Allow sending of corrupt data (ignore read/checksum errors when sending data)
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_sync_pass_deferred_free\fR (int)
 .ad
 .RS 12n
 Defer frees starting in this pass
 .sp
 Default value: \fB2\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_sync_pass_dont_compress\fR (int)
 .ad
 .RS 12n
 Don't compress starting in this pass
 .sp
 Default value: \fB5\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_sync_pass_rewrite\fR (int)
 .ad
 .RS 12n
 Rewrite new bps starting in this pass
 .sp
 Default value: \fB2\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_top_maxinflight\fR (int)
 .ad
 .RS 12n
 Max I/Os per top-level vdev during scrub or resilver operations.
 .sp
 Default value: \fB32\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_txg_history\fR (int)
 .ad
 .RS 12n
 Historic statistics for the last N txgs
 .sp
 Default value: \fB0\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_txg_timeout\fR (int)
 .ad
 .RS 12n
 Max seconds worth of delta per txg
 .sp
 Default value: \fB5\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_aggregation_limit\fR (int)
 .ad
 .RS 12n
 Max vdev I/O aggregation size
 .sp
 Default value: \fB131,072\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_cache_bshift\fR (int)
 .ad
 .RS 12n
 Shift size to inflate reads to
 .sp
 Default value: \fB16\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_cache_max\fR (int)
 .ad
 .RS 12n
 Inflate reads smaller than max
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_cache_size\fR (int)
 .ad
 .RS 12n
 Total size of the per-disk cache
 .sp
 Default value: \fB0\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_mirror_switch_us\fR (int)
 .ad
 .RS 12n
 Switch mirrors every N usecs
 .sp
 Default value: \fB10,000\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_read_gap_limit\fR (int)
 .ad
 .RS 12n
 Aggregate read I/O over gap
 .sp
 Default value: \fB32,768\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_scheduler\fR (charp)
 .ad
 .RS 12n
 I/O scheduler
 .sp
 Default value: \fBnoop\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_vdev_write_gap_limit\fR (int)
 .ad
 .RS 12n
 Aggregate write I/O over gap
 .sp
 Default value: \fB4,096\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_zevent_cols\fR (int)
 .ad
 .RS 12n
 Max event column width
 .sp
 Default value: \fB80\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_zevent_console\fR (int)
 .ad
 .RS 12n
 Log events to the console
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBzfs_zevent_len_max\fR (int)
 .ad
 .RS 12n
 Max event queue length
 .sp
 Default value: \fB0\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzil_replay_disable\fR (int)
 .ad
 .RS 12n
 Disable intent logging replay
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBzil_slog_limit\fR (ulong)
 .ad
 .RS 12n
 Max commit bytes to separate log device
 .sp
 Default value: \fB1,048,576\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzio_delay_max\fR (int)
 .ad
 .RS 12n
 Max zio millisec delay before posting event
 .sp
 Default value: \fB30,000\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzio_requeue_io_start_cut_in_line\fR (int)
 .ad
 .RS 12n
 Prioritize requeued I/O
 .sp
 Default value: \fB0\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzvol_inhibit_dev\fR (uint)
 .ad
 .RS 12n
 Do not create zvol device nodes
 .sp
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
 .sp
 .ne 2
 .na
 \fBzvol_major\fR (uint)
 .ad
 .RS 12n
 Major number for zvol device
 .sp
 Default value: \fB230\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzvol_max_discard_blocks\fR (ulong)
 .ad
 .RS 12n
 Max number of blocks to discard at once
 .sp
 Default value: \fB16,384\fR.
 .RE
 
 .sp
 .ne 2
 .na
 \fBzvol_threads\fR (uint)
 .ad
 .RS 12n
 Number of threads for zvol device
 .sp
 Default value: \fB32\fR.
 .RE
 
 .SH ZFS I/O SCHEDULER
 ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os.
 The I/O scheduler determines when and in what order those operations are
 issued.  The I/O scheduler divides operations into five I/O classes
 prioritized in the following order: sync read, sync write, async read,
 async write, and scrub/resilver.  Each queue defines the minimum and
 maximum number of concurrent operations that may be issued to the
 device.  In addition, the device has an aggregate maximum,
 \fBzfs_vdev_max_active\fR. Note that the sum of the per-queue minimums
 must not exceed the aggregate maximum.  If the sum of the per-queue
 maximums exceeds the aggregate maximum, then the number of active I/Os
 may reach \fBzfs_vdev_max_active\fR, in which case no further I/Os will
 be issued regardless of whether all per-queue minimums have been met.
 .sp
 For many physical devices, throughput increases with the number of
 concurrent operations, but latency typically suffers. Further, physical
 devices typically have a limit at which more concurrent operations have no
 effect on throughput or can actually cause it to decrease.
 .sp
 The scheduler selects the next operation to issue by first looking for an
 I/O class whose minimum has not been satisfied. Once all are satisfied and
 the aggregate maximum has not been hit, the scheduler looks for classes
 whose maximum has not been satisfied. Iteration through the I/O classes is
 done in the order specified above. No further operations are issued if the
 aggregate maximum number of concurrent operations has been hit or if there
 are no operations queued for an I/O class that has not hit its maximum.
 Every time an I/O is queued or an operation completes, the I/O scheduler
 looks for new operations to issue.
 .sp
 In general, smaller max_active's will lead to lower latency of synchronous
 operations.  Larger max_active's may lead to higher overall throughput,
 depending on underlying storage.
 .sp
 The ratio of the queues' max_actives determines the balance of performance
 between reads, writes, and scrubs.  E.g., increasing
 \fBzfs_vdev_scrub_max_active\fR will cause the scrub or resilver to complete
 more quickly, but reads and writes to have higher latency and lower throughput.
 .sp
 All I/O classes have a fixed maximum number of outstanding operations
 except for the async write class. Asynchronous writes represent the data
 that is committed to stable storage during the syncing stage for
 transaction groups. Transaction groups enter the syncing state
 periodically so the number of queued async writes will quickly burst up
 and then bleed down to zero. Rather than servicing them as quickly as
 possible, the I/O scheduler changes the maximum number of active async
 write I/Os according to the amount of dirty data in the pool.  Since
 both throughput and latency typically increase with the number of
 concurrent operations issued to physical devices, reducing the
 burstiness in the number of concurrent operations also stabilizes the
 response time of operations from other -- and in particular synchronous
 -- queues. In broad strokes, the I/O scheduler will issue more
 concurrent operations from the async write queue as there's more dirty
 data in the pool.
 .sp
 Async Writes
 .sp
 The number of concurrent operations issued for the async write I/O class
 follows a piece-wise linear function defined by a few adjustable points.
 .nf
 
        |              o---------| <-- zfs_vdev_async_write_max_active
   ^    |             /^         |
   |    |            / |         |
 active |           /  |         |
  I/O   |          /   |         |
 count  |         /    |         |
        |        /     |         |
        |-------o      |         | <-- zfs_vdev_async_write_min_active
       0|_______^______|_________|
        0%      |      |       100% of zfs_dirty_data_max
                |      |
                |      `-- zfs_vdev_async_write_active_max_dirty_percent
                `--------- zfs_vdev_async_write_active_min_dirty_percent
 
 .fi
 Until the amount of dirty data exceeds a minimum percentage of the dirty
 data allowed in the pool, the I/O scheduler will limit the number of
 concurrent operations to the minimum. As that threshold is crossed, the
 number of concurrent operations issued increases linearly to the maximum at
 the specified maximum percentage of the dirty data allowed in the pool.
 .sp
 Ideally, the amount of dirty data on a busy pool will stay in the sloped
 part of the function between \fBzfs_vdev_async_write_active_min_dirty_percent\fR
 and \fBzfs_vdev_async_write_active_max_dirty_percent\fR. If it exceeds the
 maximum percentage, this indicates that the rate of incoming data is
 greater than the rate that the backend storage can handle. In this case, we
 must further throttle incoming writes, as described in the next section.
 
 .SH ZFS TRANSACTION DELAY
 We delay transactions when we've determined that the backend storage
 isn't able to accommodate the rate of incoming writes.
 .sp
 If there is already a transaction waiting, we delay relative to when
 that transaction will finish waiting.  This way the calculated delay time
 is independent of the number of threads concurrently executing
 transactions.
 .sp
 If we are the only waiter, wait relative to when the transaction
 started, rather than the current time.  This credits the transaction for
 "time already served", e.g. reading indirect blocks.
 .sp
 The minimum time for a transaction to take is calculated as:
 .nf
     min_time = zfs_delay_scale * (dirty - min) / (max - dirty)
     min_time is then capped at 100 milliseconds.
 .fi
 .sp
 The delay has two degrees of freedom that can be adjusted via tunables.  The
 percentage of dirty data at which we start to delay is defined by
 \fBzfs_delay_min_dirty_percent\fR. This should typically be at or above
 \fBzfs_vdev_async_write_active_max_dirty_percent\fR so that we only start to
 delay after writing at full speed has failed to keep up with the incoming write
 rate. The scale of the curve is defined by \fBzfs_delay_scale\fR. Roughly speaking,
 this variable determines the amount of delay at the midpoint of the curve.
 .sp
 .nf
 delay
  10ms +-------------------------------------------------------------*+
       |                                                             *|
   9ms +                                                             *+
       |                                                             *|
   8ms +                                                             *+
       |                                                            * |
   7ms +                                                            * +
       |                                                            * |
   6ms +                                                            * +
       |                                                            * |
   5ms +                                                           *  +
       |                                                           *  |
   4ms +                                                           *  +
       |                                                           *  |
   3ms +                                                          *   +
       |                                                          *   |
   2ms +                                              (midpoint) *    +
       |                                                  |    **     |
   1ms +                                                  v ***       +
       |             zfs_delay_scale ---------->     ********         |
     0 +-------------------------------------*********----------------+
       0%                    <- zfs_dirty_data_max ->               100%
 .fi
 .sp
 Note that since the delay is added to the outstanding time remaining on the
 most recent transaction, the delay is effectively the inverse of IOPS.
 Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
 was chosen such that small changes in the amount of accumulated dirty data
 in the first 3/4 of the curve yield relatively small differences in the
 amount of delay.
 .sp
 The effects can be easier to understand when the amount of delay is
 represented on a log scale:
 .sp
 .nf
 delay
 100ms +-------------------------------------------------------------++
       +                                                              +
       |                                                              |
       +                                                             *+
  10ms +                                                             *+
       +                                                           ** +
       |                                              (midpoint)  **  |
       +                                                  |     **    +
   1ms +                                                  v ****      +
       +             zfs_delay_scale ---------->        *****         +
       |                                             ****             |
       +                                          ****                +
 100us +                                        **                    +
       +                                       *                      +
       |                                      *                       |
       +                                     *                        +
  10us +                                     *                        +
       +                                                              +
       |                                                              |
       +                                                              +
       +--------------------------------------------------------------+
       0%                    <- zfs_dirty_data_max ->               100%
 .fi
 .sp
 Note here that only as the amount of dirty data approaches its limit does
 the delay start to increase rapidly. The goal of a properly tuned system
 should be to keep the amount of dirty data out of that range by first
 ensuring that the appropriate limits are set for the I/O scheduler to reach
 optimal throughput on the backend storage, and then by changing the value
 of \fBzfs_delay_scale\fR to increase the steepness of the curve.
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index 954841f33137..e5753ae81118 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -1,104 +1,105 @@
 MODULE := zfs
 
 EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
 
 obj-$(CONFIG_ZFS) := $(MODULE).o
 
 $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/blkptr.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf_stats.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/ddt.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/ddt_zap.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dmu.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dmu_diff.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dmu_object.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dmu_objset.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dmu_send.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dmu_traverse.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dmu_tx.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dmu_zfetch.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dnode.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dnode_sync.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dsl_dataset.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dsl_deadlist.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dsl_deleg.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dsl_bookmark.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dsl_dir.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dsl_pool.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dsl_prop.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dsl_scan.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dsl_synctask.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/fm.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/gzip.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/lzjb.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/lz4.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/metaslab.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/multilist.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/range_tree.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/refcount.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/rrwlock.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/sa.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/sha256.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa_boot.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa_config.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/space_reftree.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/txg.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/trace.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/unique.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/vdev.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_cache.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_disk.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_file.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_label.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_mirror.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_missing.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_queue.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_raidz.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/vdev_root.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zap.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zap_leaf.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zap_micro.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfeature.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfeature_common.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_acl.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_byteswap.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_ctldir.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_debug.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_dir.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_fm.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_fuid.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_ioctl.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_log.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_onexit.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_replay.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_rlock.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_sa.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_vfsops.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_vnops.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zfs_znode.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zil.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zio.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zio_checksum.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zio_compress.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zio_inject.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zle.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zpl_ctldir.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zpl_export.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zpl_file.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zpl_inode.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zpl_super.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zpl_xattr.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zrlock.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/zvol.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dsl_destroy.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dsl_userhold.o
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index e69889ab5810..67ef87daf137 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -1,5898 +1,6435 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  */
 
 /*
  * DVA-based Adjustable Replacement Cache
  *
  * While much of the theory of operation used here is
  * based on the self-tuning, low overhead replacement cache
  * presented by Megiddo and Modha at FAST 2003, there are some
  * significant differences:
  *
  * 1. The Megiddo and Modha model assumes any page is evictable.
  * Pages in its cache cannot be "locked" into memory.  This makes
  * the eviction algorithm simple: evict the last page in the list.
  * This also makes the performance characteristics easy to reason
  * about.  Our cache is not so simple.  At any given moment, some
  * subset of the blocks in the cache are un-evictable because we
  * have handed out a reference to them.  Blocks are only evictable
  * when there are no external references active.  This makes
  * eviction far more problematic:  we choose to evict the evictable
  * blocks that are the "lowest" in the list.
  *
  * There are times when it is not possible to evict the requested
  * space.  In these circumstances we are unable to adjust the cache
  * size.  To prevent the cache growing unbounded at these times we
  * implement a "cache throttle" that slows the flow of new data
  * into the cache until we can make space available.
  *
  * 2. The Megiddo and Modha model assumes a fixed cache size.
  * Pages are evicted when the cache is full and there is a cache
  * miss.  Our model has a variable sized cache.  It grows with
  * high use, but also tries to react to memory pressure from the
  * operating system: decreasing its size when system memory is
  * tight.
  *
  * 3. The Megiddo and Modha model assumes a fixed page size. All
  * elements of the cache are therefore exactly the same size.  So
  * when adjusting the cache size following a cache miss, it's simply
  * a matter of choosing a single page to evict.  In our model, we
  * have variable sized cache blocks (ranging from 512 bytes to
  * 128K bytes).  We therefore choose a set of blocks to evict to make
  * space for a cache miss that approximates as closely as possible
  * the space used by the new block.
  *
  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  * by N. Megiddo & D. Modha, FAST 2003
  */
 
 /*
  * The locking model:
  *
  * A new reference to a cache buffer can be obtained in two
  * ways: 1) via a hash table lookup using the DVA as a key,
  * or 2) via one of the ARC lists.  The arc_read() interface
  * uses method 1, while the internal arc algorithms for
  * adjusting the cache use method 2.  We therefore provide two
  * types of locks: 1) the hash table lock array, and 2) the
  * arc list locks.
  *
  * Buffers do not have their own mutexes, rather they rely on the
  * hash table mutexes for the bulk of their protection (i.e. most
  * fields in the arc_buf_hdr_t are protected by these mutexes).
  *
  * buf_hash_find() returns the appropriate mutex (held) when it
  * locates the requested buffer in the hash table.  It returns
  * NULL for the mutex if the buffer was not in the table.
  *
  * buf_hash_remove() expects the appropriate hash mutex to be
  * already held before it is invoked.
  *
  * Each arc state also has a mutex which is used to protect the
  * buffer list associated with the state.  When attempting to
  * obtain a hash table lock while holding an arc list lock you
  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  * the active state mutex must be held before the ghost state mutex.
  *
  * Arc buffers may have an associated eviction callback function.
  * This function will be invoked prior to removing the buffer (e.g.
  * in arc_do_user_evicts()).  Note however that the data associated
  * with the buffer may be evicted prior to the callback.  The callback
  * must be made with *no locks held* (to prevent deadlock).  Additionally,
  * the users of callbacks must ensure that their private data is
  * protected from simultaneous callbacks from arc_clear_callback()
  * and arc_do_user_evicts().
  *
  * It is also possible to register a callback which is run when the
  * arc_meta_limit is reached and no buffers can be safely evicted.  In
  * this case the arc user should drop a reference on some arc buffers so
  * they can be reclaimed and the arc_meta_limit honored.  For example,
  * when using the ZPL each dentry holds a reference on a znode.  These
  * dentries must be pruned before the arc buffer holding the znode can
  * be safely evicted.
  *
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  *
  * The L2ARC uses the l2ad_mtx on each vdev for the following:
  *
  *	- L2ARC buflist creation
  *	- L2ARC buflist eviction
  *	- L2ARC write completion, which walks L2ARC buflists
  *	- ARC header destruction, as it removes from L2ARC buflists
  *	- ARC header release, as it removes from L2ARC buflists
  */
 
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/zio_compress.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/dsl_pool.h>
+#include <sys/multilist.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <vm/anon.h>
 #include <sys/fs/swapnode.h>
 #include <sys/zpl.h>
 #include <linux/mm_compat.h>
 #endif
 #include <sys/callb.h>
 #include <sys/kstat.h>
 #include <sys/dmu_tx.h>
 #include <zfs_fletcher.h>
 #include <sys/arc_impl.h>
 #include <sys/trace_arc.h>
 
 #ifndef _KERNEL
 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 boolean_t arc_watch = B_FALSE;
 #endif
 
-static kmutex_t		arc_reclaim_thr_lock;
-static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
-static uint8_t		arc_thread_exit;
+static kmutex_t		arc_reclaim_lock;
+static kcondvar_t	arc_reclaim_thread_cv;
+static boolean_t	arc_reclaim_thread_exit;
+static kcondvar_t	arc_reclaim_waiters_cv;
+
+static kmutex_t		arc_user_evicts_lock;
+static kcondvar_t	arc_user_evicts_cv;
+static boolean_t	arc_user_evicts_thread_exit;
 
 /* number of objects to prune from caches when arc_meta_limit is reached */
 int zfs_arc_meta_prune = 10000;
 
 typedef enum arc_reclaim_strategy {
 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
 } arc_reclaim_strategy_t;
 
 /*
- * The number of iterations through arc_evict_*() before we
- * drop & reacquire the lock.
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
+ */
+int zfs_arc_evict_batch_limit = 10;
+
+/*
+ * The number of sublists used for each of the arc state lists. If this
+ * is not set to a suitable value by the user, it will be configured to
+ * the number of CPUs on the system in arc_init().
  */
-int arc_evict_iterations = 100;
+int zfs_arc_num_sublists_per_state = 0;
 
 /* number of seconds before growing cache again */
 int zfs_arc_grow_retry = 5;
 
+/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+int zfs_arc_overflow_shift = 8;
+
 /* disable anon data aggressively growing arc_p */
 int zfs_arc_p_aggressive_disable = 1;
 
 /* disable arc_p adapt dampener in arc_adapt */
 int zfs_arc_p_dampener_disable = 1;
 
 /* log2(fraction of arc to reclaim) */
 int zfs_arc_shrink_shift = 5;
 
 /*
  * minimum lifespan of a prefetch block in clock ticks
  * (initialized in arc_init())
  */
 int zfs_arc_min_prefetch_lifespan = HZ;
 
 /* disable proactive arc throttle due to low memory */
 int zfs_arc_memory_throttle_disable = 1;
 
 /* disable duplicate buffer eviction */
 int zfs_disable_dup_eviction = 0;
 
 /* average block used to size buf_hash_table */
 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 
+/*
+ * minimum lifespan of a prefetch block in clock ticks
+ * (initialized in arc_init())
+ */
+static int arc_min_prefetch_lifespan;
+
 /*
  * If this percent of memory is free, don't throttle.
  */
 int arc_lotsfree_percent = 10;
 
 static int arc_dead;
 
 /* expiration time for arc_no_grow */
 static clock_t arc_grow_time = 0;
 
 /*
  * The arc has filled available memory and has now warmed up.
  */
 static boolean_t arc_warm;
 
 /*
  * These tunables are for performance analysis.
  */
 unsigned long zfs_arc_max = 0;
 unsigned long zfs_arc_min = 0;
 unsigned long zfs_arc_meta_limit = 0;
+unsigned long zfs_arc_meta_min = 0;
 
 /*
  * Limit the number of restarts in arc_adjust_meta()
  */
 unsigned long zfs_arc_meta_adjust_restarts = 4096;
 
 /* The 6 states: */
 static arc_state_t ARC_anon;
 static arc_state_t ARC_mru;
 static arc_state_t ARC_mru_ghost;
 static arc_state_t ARC_mfu;
 static arc_state_t ARC_mfu_ghost;
 static arc_state_t ARC_l2c_only;
 
 /*
  * Named kstat entries describing ARC and L2ARC activity.
  *
  * The single instance of this structure (arc_stats) is initialized
  * positionally, so the order of the members here must match the order
  * of the entries in that initializer.
  */
 typedef struct arc_stats {
 	kstat_named_t arcstat_hits;
 	kstat_named_t arcstat_misses;
 	kstat_named_t arcstat_demand_data_hits;
 	kstat_named_t arcstat_demand_data_misses;
 	kstat_named_t arcstat_demand_metadata_hits;
 	kstat_named_t arcstat_demand_metadata_misses;
 	kstat_named_t arcstat_prefetch_data_hits;
 	kstat_named_t arcstat_prefetch_data_misses;
 	kstat_named_t arcstat_prefetch_metadata_hits;
 	kstat_named_t arcstat_prefetch_metadata_misses;
 	kstat_named_t arcstat_mru_hits;
 	kstat_named_t arcstat_mru_ghost_hits;
 	kstat_named_t arcstat_mfu_hits;
 	kstat_named_t arcstat_mfu_ghost_hits;
 	kstat_named_t arcstat_deleted;
-	kstat_named_t arcstat_recycle_miss;
 	/*
 	 * Number of buffers that could not be evicted because the hash lock
 	 * was held by another thread.  The lock may not necessarily be held
 	 * by something using the same buffer, since hash locks are shared
 	 * by multiple buffers.
 	 */
 	kstat_named_t arcstat_mutex_miss;
 	/*
 	 * Number of buffers skipped because they have I/O in progress, are
 	 * indirect prefetch buffers that have not lived long enough, or are
 	 * not from the spa we're trying to evict from.
 	 */
 	kstat_named_t arcstat_evict_skip;
+	/*
+	 * Number of times arc_evict_state() was unable to evict enough
+	 * buffers to reach its target amount.
+	 */
+	kstat_named_t arcstat_evict_not_enough;
 	kstat_named_t arcstat_evict_l2_cached;
 	kstat_named_t arcstat_evict_l2_eligible;
 	kstat_named_t arcstat_evict_l2_ineligible;
+	kstat_named_t arcstat_evict_l2_skip;
 	kstat_named_t arcstat_hash_elements;
 	kstat_named_t arcstat_hash_elements_max;
 	kstat_named_t arcstat_hash_collisions;
 	kstat_named_t arcstat_hash_chains;
 	kstat_named_t arcstat_hash_chain_max;
 	kstat_named_t arcstat_p;
 	kstat_named_t arcstat_c;
 	kstat_named_t arcstat_c_min;
 	kstat_named_t arcstat_c_max;
 	kstat_named_t arcstat_size;
 	kstat_named_t arcstat_hdr_size;
 	kstat_named_t arcstat_data_size;
 	kstat_named_t arcstat_meta_size;
 	kstat_named_t arcstat_other_size;
 	kstat_named_t arcstat_anon_size;
 	kstat_named_t arcstat_anon_evict_data;
 	kstat_named_t arcstat_anon_evict_metadata;
 	kstat_named_t arcstat_mru_size;
 	kstat_named_t arcstat_mru_evict_data;
 	kstat_named_t arcstat_mru_evict_metadata;
 	kstat_named_t arcstat_mru_ghost_size;
 	kstat_named_t arcstat_mru_ghost_evict_data;
 	kstat_named_t arcstat_mru_ghost_evict_metadata;
 	kstat_named_t arcstat_mfu_size;
 	kstat_named_t arcstat_mfu_evict_data;
 	kstat_named_t arcstat_mfu_evict_metadata;
 	kstat_named_t arcstat_mfu_ghost_size;
 	kstat_named_t arcstat_mfu_ghost_evict_data;
 	kstat_named_t arcstat_mfu_ghost_evict_metadata;
 	kstat_named_t arcstat_l2_hits;
 	kstat_named_t arcstat_l2_misses;
 	kstat_named_t arcstat_l2_feeds;
 	kstat_named_t arcstat_l2_rw_clash;
 	kstat_named_t arcstat_l2_read_bytes;
 	kstat_named_t arcstat_l2_write_bytes;
 	kstat_named_t arcstat_l2_writes_sent;
 	kstat_named_t arcstat_l2_writes_done;
 	kstat_named_t arcstat_l2_writes_error;
-	kstat_named_t arcstat_l2_writes_hdr_miss;
+	kstat_named_t arcstat_l2_writes_lock_retry;
 	kstat_named_t arcstat_l2_evict_lock_retry;
 	kstat_named_t arcstat_l2_evict_reading;
 	kstat_named_t arcstat_l2_evict_l1cached;
 	kstat_named_t arcstat_l2_free_on_write;
+	kstat_named_t arcstat_l2_cdata_free_on_write;
 	kstat_named_t arcstat_l2_abort_lowmem;
 	kstat_named_t arcstat_l2_cksum_bad;
 	kstat_named_t arcstat_l2_io_error;
 	kstat_named_t arcstat_l2_size;
 	kstat_named_t arcstat_l2_asize;
 	kstat_named_t arcstat_l2_hdr_size;
 	kstat_named_t arcstat_l2_compress_successes;
 	kstat_named_t arcstat_l2_compress_zeros;
 	kstat_named_t arcstat_l2_compress_failures;
 	kstat_named_t arcstat_memory_throttle_count;
 	kstat_named_t arcstat_duplicate_buffers;
 	kstat_named_t arcstat_duplicate_buffers_size;
 	kstat_named_t arcstat_duplicate_reads;
 	kstat_named_t arcstat_memory_direct_count;
 	kstat_named_t arcstat_memory_indirect_count;
 	kstat_named_t arcstat_no_grow;
 	kstat_named_t arcstat_tempreserve;
 	kstat_named_t arcstat_loaned_bytes;
 	kstat_named_t arcstat_prune;
 	kstat_named_t arcstat_meta_used;
 	kstat_named_t arcstat_meta_limit;
 	kstat_named_t arcstat_meta_max;
+	kstat_named_t arcstat_meta_min;
 } arc_stats_t;
 
 /*
  * The one global instance of arc_stats_t.  Each entry supplies the
  * exported kstat name and its data type (always KSTAT_DATA_UINT64).
  *
  * The initializer is positional: entry N initializes the Nth member of
  * arc_stats_t, so entries must be added, removed and reordered in
  * lock-step with the structure definition.
  */
 static arc_stats_t arc_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
 	{ "misses",			KSTAT_DATA_UINT64 },
 	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
 	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
 	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
 	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
 	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
 	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
 	{ "mru_hits",			KSTAT_DATA_UINT64 },
 	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "mfu_hits",			KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "deleted",			KSTAT_DATA_UINT64 },
-	{ "recycle_miss",		KSTAT_DATA_UINT64 },
 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
 	{ "evict_skip",			KSTAT_DATA_UINT64 },
+	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
+	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
 	{ "hash_elements",		KSTAT_DATA_UINT64 },
 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
 	{ "hash_chains",		KSTAT_DATA_UINT64 },
 	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
 	{ "p",				KSTAT_DATA_UINT64 },
 	{ "c",				KSTAT_DATA_UINT64 },
 	{ "c_min",			KSTAT_DATA_UINT64 },
 	{ "c_max",			KSTAT_DATA_UINT64 },
 	{ "size",			KSTAT_DATA_UINT64 },
 	{ "hdr_size",			KSTAT_DATA_UINT64 },
 	{ "data_size",			KSTAT_DATA_UINT64 },
 	{ "meta_size",			KSTAT_DATA_UINT64 },
 	{ "other_size",			KSTAT_DATA_UINT64 },
 	{ "anon_size",			KSTAT_DATA_UINT64 },
 	{ "anon_evict_data",		KSTAT_DATA_UINT64 },
 	{ "anon_evict_metadata",	KSTAT_DATA_UINT64 },
 	{ "mru_size",			KSTAT_DATA_UINT64 },
 	{ "mru_evict_data",		KSTAT_DATA_UINT64 },
 	{ "mru_evict_metadata",		KSTAT_DATA_UINT64 },
 	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
 	{ "mru_ghost_evict_data",	KSTAT_DATA_UINT64 },
 	{ "mru_ghost_evict_metadata",	KSTAT_DATA_UINT64 },
 	{ "mfu_size",			KSTAT_DATA_UINT64 },
 	{ "mfu_evict_data",		KSTAT_DATA_UINT64 },
 	{ "mfu_evict_metadata",		KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_evict_data",	KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_evict_metadata",	KSTAT_DATA_UINT64 },
 	{ "l2_hits",			KSTAT_DATA_UINT64 },
 	{ "l2_misses",			KSTAT_DATA_UINT64 },
 	{ "l2_feeds",			KSTAT_DATA_UINT64 },
 	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
 	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
 	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
-	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
+	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
 	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
+	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
 	{ "l2_size",			KSTAT_DATA_UINT64 },
 	{ "l2_asize",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
 	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
 	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
 	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
 	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
 	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
 	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
 	{ "memory_direct_count",	KSTAT_DATA_UINT64 },
 	{ "memory_indirect_count",	KSTAT_DATA_UINT64 },
 	{ "arc_no_grow",		KSTAT_DATA_UINT64 },
 	{ "arc_tempreserve",		KSTAT_DATA_UINT64 },
 	{ "arc_loaned_bytes",		KSTAT_DATA_UINT64 },
 	{ "arc_prune",			KSTAT_DATA_UINT64 },
 	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
+	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
 };
 
 #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
 
 #define	ARCSTAT_INCR(stat, val) \
 	atomic_add_64(&arc_stats.stat.value.ui64, (val))
 
 #define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
 #define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
 
 #define	ARCSTAT_MAX(stat, val) {					\
 	uint64_t m;							\
 	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
 	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
 		continue;						\
 }
 
 #define	ARCSTAT_MAXSTAT(stat) \
 	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 
 /*
  * We define a macro to allow ARC hits/misses to be easily broken down by
  * two separate conditions, giving a total of four different subtypes for
  * each of hits and misses (so eight statistics total).
  */
 #define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 	if (cond1) {							\
 		if (cond2) {						\
 			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 		} else {						\
 			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 		}							\
 	} else {							\
 		if (cond2) {						\
 			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 		} else {						\
 			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 		}							\
 	}
 
 kstat_t			*arc_ksp;
 static arc_state_t	*arc_anon;
 static arc_state_t	*arc_mru;
 static arc_state_t	*arc_mru_ghost;
 static arc_state_t	*arc_mfu;
 static arc_state_t	*arc_mfu_ghost;
 static arc_state_t	*arc_l2c_only;
 
 /*
  * There are several ARC variables that are critical to export as kstats --
  * but we don't want to have to grovel around in the kstat whenever we wish to
  * manipulate them.  For these variables, we therefore define them to be in
  * terms of the statistic variable.  This assures that we are not introducing
  * the possibility of inconsistency by having shadow copies of the variables,
  * while still allowing the code to be readable.
  */
 #define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
 #define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
 #define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
 #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
 #define	arc_no_grow	ARCSTAT(arcstat_no_grow)
 #define	arc_tempreserve	ARCSTAT(arcstat_tempreserve)
 #define	arc_loaned_bytes	ARCSTAT(arcstat_loaned_bytes)
 #define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
 #define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
 #define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
 
 #define	L2ARC_IS_VALID_COMPRESS(_c_) \
 	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
 
 static list_t arc_prune_list;
 static kmutex_t arc_prune_mtx;
 static arc_buf_t *arc_eviction_list;
-static kmutex_t arc_eviction_mtx;
 static arc_buf_hdr_t arc_eviction_hdr;
 
 #define	GHOST_STATE(state)	\
 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
 	(state) == arc_l2c_only)
 
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
 #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
 #define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
 
 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
 #define	HDR_L2COMPRESS(hdr)	((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
 #define	HDR_L2_READING(hdr)	\
 	    (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
 	    ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
 
 #define	HDR_ISTYPE_METADATA(hdr)	\
 	    ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 #define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
 
 #define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 #define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 
 /* For storing compression mode in b_flags */
 #define	HDR_COMPRESS_OFFSET	24
 #define	HDR_COMPRESS_NBITS	7
 
 #define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET(hdr->b_flags, \
 	    HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
 #define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
 	    HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))
 
 /*
  * Other sizes
  */
 
 #define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 #define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 
 /*
  * Hash table routines
  */
 
 /*
  * One hash-table mutex.  In kernel builds the mutex is followed by
  * HT_LOCK_PAD bytes of padding, computed from sizeof (kmutex_t) and
  * HT_LOCK_ALIGN.
  * NOTE(review): presumably this rounds each entry up to a multiple of
  * HT_LOCK_ALIGN (64) bytes so that adjacent locks do not share a cache
  * line -- confirm against the definition of P2NPHASE().
  */
 #define	HT_LOCK_ALIGN	64
 #define	HT_LOCK_PAD	(P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
 
 struct ht_lock {
 	kmutex_t	ht_lock;
 #ifdef _KERNEL
 	unsigned char	pad[HT_LOCK_PAD];
 #endif
 };
 
 /*
  * The header hash table: an array of bucket heads (ht_table), the mask
  * used to reduce a hash value to a bucket index (ht_mask), and a fixed
  * array of BUF_LOCKS mutexes.  Buckets are mapped onto locks by
  * BUF_HASH_LOCK_NTRY(), which masks the index with (BUF_LOCKS-1) and
  * therefore relies on BUF_LOCKS being a power of two.
  */
 #define	BUF_LOCKS 8192
 typedef struct buf_hash_table {
 	uint64_t ht_mask;
 	arc_buf_hdr_t **ht_table;
 	struct ht_lock ht_locks[BUF_LOCKS];
 } buf_hash_table_t;
 
 static buf_hash_table_t buf_hash_table;
 
 /* Bucket index for the identity (spa, dva, birth). */
 #define	BUF_HASH_INDEX(spa, dva, birth) \
 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 /*
  * Lock entry / mutex pointer covering bucket idx.  Only the low bits of
  * idx select the lock, so any buckets whose indices agree modulo
  * BUF_LOCKS share one mutex.
  */
 #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 /* Mutex covering the bucket that hdr's current identity hashes to. */
 #define	HDR_LOCK(hdr) \
 	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 
 uint64_t zfs_crc64_table[256];
 
 /*
  * Level 2 ARC
  */
 
 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
 #define	L2ARC_HEADROOM		2			/* num of writes */
 /*
  * If we discover during ARC scan any buffers to be compressed, we boost
  * our headroom for the next scanning cycle by this percentage multiple.
  */
 #define	L2ARC_HEADROOM_BOOST	200
 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
 #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
 #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
 
 /* L2ARC Performance Tunables */
 unsigned long l2arc_write_max = L2ARC_WRITE_SIZE;	/* def max write size */
 unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra warmup write */
 unsigned long l2arc_headroom = L2ARC_HEADROOM;		/* # of dev writes */
 unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 unsigned long l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
 unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval msecs */
 int l2arc_noprefetch = B_TRUE;			/* don't cache prefetch bufs */
 int l2arc_nocompress = B_FALSE;			/* don't compress bufs */
 int l2arc_feed_again = B_TRUE;			/* turbo warmup */
 int l2arc_norw = B_FALSE;			/* no reads during writes */
 
 /*
  * L2ARC Internals
  */
 static list_t L2ARC_dev_list;			/* device list */
 static list_t *l2arc_dev_list;			/* device list pointer */
 static kmutex_t l2arc_dev_mtx;			/* device list mutex */
 static l2arc_dev_t *l2arc_dev_last;		/* last device used */
 static list_t L2ARC_free_on_write;		/* free after write buf list */
 static list_t *l2arc_free_on_write;		/* free after write list ptr */
 static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
 static uint64_t l2arc_ndev;			/* number of devices */
 
 typedef struct l2arc_read_callback {
 	arc_buf_t		*l2rcb_buf;		/* read buffer */
 	spa_t			*l2rcb_spa;		/* spa */
 	blkptr_t		l2rcb_bp;		/* original blkptr */
 	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
 	int			l2rcb_flags;		/* original flags */
 	enum zio_compress	l2rcb_compress;		/* applied compress */
 } l2arc_read_callback_t;
 
 typedef struct l2arc_data_free {
 	/* protected by l2arc_free_on_write_mtx */
 	void		*l2df_data;
 	size_t		l2df_size;
 	void		(*l2df_func)(void *, size_t);
 	list_node_t	l2df_list_node;
 } l2arc_data_free_t;
 
 static kmutex_t l2arc_feed_thr_lock;
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
 
 static void arc_get_data_buf(arc_buf_t *);
 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
-static int arc_evict_needed(arc_buf_contents_t);
-static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
+static boolean_t arc_is_overflowing(void);
 static void arc_buf_watch(arc_buf_t *);
 
 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
 
 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 static void l2arc_read_done(zio_t *);
 
 static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
 static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
 static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
 
 /*
  * Compute the 64-bit hash of a buffer identity.
  *
  * The bytes of *dva are folded through the table-driven CRC in
  * zfs_crc64_table, and the result is then XORed with (spa >> 8) and
  * birth.  Callers reduce the value to a bucket index with
  * buf_hash_table.ht_mask (see BUF_HASH_INDEX()).
  */
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
 	uint8_t *vdva = (uint8_t *)dva;
 	uint64_t crc = -1ULL;
 	int i;
 
 	/*
 	 * Sanity check on the CRC table contents; presumably this catches
 	 * use before the table has been populated -- confirm where
 	 * zfs_crc64_table is filled in.
 	 */
 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 
 	/* One table lookup per byte of the DVA. */
 	for (i = 0; i < sizeof (dva_t); i++)
 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 
 	/* Mix in the pool identifier and the birth value. */
 	crc ^= (spa>>8) ^ birth;
 
 	return (crc);
 }
 
 /* True when both words of the header's DVA are zero (no identity). */
 #define	BUF_EMPTY(buf)						\
 	((buf)->b_dva.dva_word[0] == 0 &&			\
 	(buf)->b_dva.dva_word[1] == 0)
 
 /*
  * True when buf's DVA words, birth and spa all match the given identity.
  *
  * NOTE(review): the expansion is not wrapped in an outer pair of
  * parentheses and the birth and spa arguments are not parenthesized, so
  * this is only safe as the entire condition of an if (as it is used in
  * this file); "!BUF_EQUAL(...)" would negate only the first comparison.
  */
 #define	BUF_EQUAL(spa, dva, birth, buf)				\
 	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
 	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
 	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 
 /*
  * Clear the identity of a header by zeroing both DVA words and the
  * birth value.  Afterwards BUF_EMPTY(hdr) evaluates to true.  b_spa is
  * left untouched.
  */
 static void
 buf_discard_identity(arc_buf_hdr_t *hdr)
 {
 	hdr->b_dva.dva_word[0] = 0;
 	hdr->b_dva.dva_word[1] = 0;
 	hdr->b_birth = 0;
 }
 
 /*
  * Look up the header for block pointer bp in pool spa.
  *
  * On a hit the header is returned and *lockp is set to its hash mutex,
  * which is still held; the caller is responsible for dropping it.
  * On a miss the mutex is released, *lockp is set to NULL and NULL is
  * returned.
  */
 static arc_buf_hdr_t *
 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 {
 	const dva_t *dva = BP_IDENTITY(bp);
 	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 	arc_buf_hdr_t *hdr;
 
 	mutex_enter(hash_lock);
 	/* Walk the bucket's chain looking for a matching identity. */
 	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
 	    hdr = hdr->b_hash_next) {
 		if (BUF_EQUAL(spa, dva, birth, hdr)) {
 			/* Found: return with hash_lock held. */
 			*lockp = hash_lock;
 			return (hdr);
 		}
 	}
 	mutex_exit(hash_lock);
 	*lockp = NULL;
 	return (NULL);
 }
 
 /*
  * Insert an entry into the hash table.  If there is already an element
  * equal to elem in the hash table, then the already existing element
  * will be returned and the new element will not be inserted.
  * Otherwise returns NULL.
  * If lockp == NULL, the caller is assumed to already hold the hash lock.
  *
  * If lockp != NULL the hash lock is acquired here and stored in *lockp.
  * In either case the lock is NOT released before returning, regardless
  * of whether the insert succeeded or an existing element was found.
  */
 static arc_buf_hdr_t *
 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
 {
 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 	arc_buf_hdr_t *fhdr;
 	uint32_t i;
 
 	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
 	ASSERT(hdr->b_birth != 0);
 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
 
 	if (lockp != NULL) {
 		*lockp = hash_lock;
 		mutex_enter(hash_lock);
 	} else {
 		ASSERT(MUTEX_HELD(hash_lock));
 	}
 
 	/*
 	 * Scan the bucket for a duplicate; i ends up as the number of
 	 * entries already present in the bucket.
 	 */
 	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
 	    fhdr = fhdr->b_hash_next, i++) {
 		if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
 			return (fhdr);
 	}
 
 	/* No duplicate: push the new header onto the front of the chain. */
 	hdr->b_hash_next = buf_hash_table.ht_table[idx];
 	buf_hash_table.ht_table[idx] = hdr;
 	hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
 
 	/* collect some hash table performance data */
 	if (i > 0) {
 		/* The bucket was not empty, so this insert collided. */
 		ARCSTAT_BUMP(arcstat_hash_collisions);
 		/* Going from one entry to two creates a new chain. */
 		if (i == 1)
 			ARCSTAT_BUMP(arcstat_hash_chains);
 
 		ARCSTAT_MAX(arcstat_hash_chain_max, i);
 	}
 
 	ARCSTAT_BUMP(arcstat_hash_elements);
 	ARCSTAT_MAXSTAT(arcstat_hash_elements);
 
 	return (NULL);
 }
 
 /*
  * Unlink hdr from its hash bucket and clear ARC_FLAG_IN_HASH_TABLE.
  * The caller must already hold the bucket's hash lock and hdr must
  * currently be in the table (both are asserted).
  */
 static void
 buf_hash_remove(arc_buf_hdr_t *hdr)
 {
 	arc_buf_hdr_t *fhdr, **hdrp;
 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 
 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 	ASSERT(HDR_IN_HASH_TABLE(hdr));
 
 	/* Find the link (bucket head or a b_hash_next) that points at hdr. */
 	hdrp = &buf_hash_table.ht_table[idx];
 	while ((fhdr = *hdrp) != hdr) {
 		ASSERT(fhdr != NULL);
 		hdrp = &fhdr->b_hash_next;
 	}
 	*hdrp = hdr->b_hash_next;
 	hdr->b_hash_next = NULL;
 	hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
 
 	/* collect some hash table performance data */
 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 
 	/* Exactly one entry left in the bucket: it is no longer a chain. */
 	if (buf_hash_table.ht_table[idx] &&
 	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 }
 
 /*
  * Global data structures and functions for the buf kmem cache.
  */
 static kmem_cache_t *hdr_full_cache;
 static kmem_cache_t *hdr_l2only_cache;
 static kmem_cache_t *buf_cache;
 
 /*
  * Tear down the buf hash table and the header/buf kmem caches: free the
  * bucket array, destroy all BUF_LOCKS hash mutexes, then destroy the
  * three kmem caches.
  */
 static void
 buf_fini(void)
 {
 	int i;
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
 	/*
 	 * Large allocations which do not require contiguous pages
 	 * should be using vmem_free() in the linux kernel
 	 */
 	vmem_free(buf_hash_table.ht_table,
 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
 #else
 	kmem_free(buf_hash_table.ht_table,
 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
 #endif
 	for (i = 0; i < BUF_LOCKS; i++)
 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 	kmem_cache_destroy(hdr_full_cache);
 	kmem_cache_destroy(hdr_l2only_cache);
 	kmem_cache_destroy(buf_cache);
 }
 
 /*
  * Constructor callback - called when the cache is empty
  * and a new buf is requested.
  *
  * Zeroes a full-size header, initializes its condition variable,
  * reference count, freeze lock and list links, and charges
  * HDR_FULL_SIZE bytes to ARC_SPACE_HDRS.  The unused and kmflag
  * arguments are ignored.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 hdr_full_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
 	bzero(hdr, HDR_FULL_SIZE);
 	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
 	refcount_create(&hdr->b_l1hdr.b_refcnt);
 	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_link_init(&hdr->b_l1hdr.b_arc_node);
 	list_link_init(&hdr->b_l2hdr.b_l2node);
 	/*
 	 * NOTE(review): b_arc_node is initialized twice, once by
 	 * list_link_init() above and once by multilist_link_init() below.
 	 * Confirm whether the list_link_init() call is now redundant.
 	 */
+	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
 	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 
 	return (0);
 }
 
 /*
  * Constructor for L2-only headers: zeroes the first HDR_L2ONLY_SIZE
  * bytes of the object and charges that size to ARC_SPACE_L2HDRS.
  * The unused and kmflag arguments are ignored.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
 	bzero(hdr, HDR_L2ONLY_SIZE);
 	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 
 	return (0);
 }
 
 /* ARGSUSED */
 static int
 buf_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_t *buf = vbuf;
 
 	bzero(buf, sizeof (arc_buf_t));
 	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 
 	return (0);
 }
 
 /*
  * Destructor callback - called when a cached buf is
  * no longer required.
  */
 /* ARGSUSED */
 static void
 hdr_full_dest(void *vbuf, void *unused)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
 	ASSERT(BUF_EMPTY(hdr));
 	cv_destroy(&hdr->b_l1hdr.b_cv);
 	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
 	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 }
 
 /* ARGSUSED */
 static void
 hdr_l2only_dest(void *vbuf, void *unused)
 {
 	ASSERTV(arc_buf_hdr_t *hdr = vbuf);
 
 	ASSERT(BUF_EMPTY(hdr));
 	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 }
 
 /* ARGSUSED */
 static void
 buf_dest(void *vbuf, void *unused)
 {
 	arc_buf_t *buf = vbuf;
 
 	mutex_destroy(&buf->b_evict_lock);
 	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
 static void
 buf_init(void)
 {
 	uint64_t *ct;
 	uint64_t hsize = 1ULL << 12;
 	int i, j;
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
 	 * with an average block size of zfs_arc_average_blocksize (default 8K).
 	 * By default, the table will take up
 	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
 	 */
 	while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
 		hsize <<= 1;
 retry:
 	buf_hash_table.ht_mask = hsize - 1;
 #if defined(_KERNEL) && defined(HAVE_SPL)
 	/*
 	 * Large allocations which do not require contiguous pages
 	 * should be using vmem_alloc() in the linux kernel
 	 */
 	buf_hash_table.ht_table =
 	    vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
 #else
 	buf_hash_table.ht_table =
 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 #endif
 	if (buf_hash_table.ht_table == NULL) {
 		ASSERT(hsize > (1ULL << 8));
 		hsize >>= 1;
 		goto retry;
 	}
 
 	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
 	    0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
 	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
 	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
 	    NULL, NULL, 0);
 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 
 	for (i = 0; i < 256; i++)
 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 
 	for (i = 0; i < BUF_LOCKS; i++) {
 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
 		    NULL, MUTEX_DEFAULT, NULL);
 	}
 }
 
 /*
  * Transition between the two allocation states for the arc_buf_hdr struct.
  * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
  * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
  * version is used when a cache buffer is only in the L2ARC in order to reduce
  * memory usage.
  */
 static arc_buf_hdr_t *
 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
 {
 	arc_buf_hdr_t *nhdr;
 	l2arc_dev_t *dev;
 
 	ASSERT(HDR_HAS_L2HDR(hdr));
 	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
 	    (old == hdr_l2only_cache && new == hdr_full_cache));
 
 	dev = hdr->b_l2hdr.b_dev;
 	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
 
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	buf_hash_remove(hdr);
 
 	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
 	if (new == hdr_full_cache) {
 		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
 		/*
 		 * arc_access and arc_change_state need to be aware that a
 		 * header has just come out of L2ARC, so we set its state to
 		 * l2c_only even though it's about to change.
 		 */
 		nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+		/* Verify b_tmp_cdata was set to NULL before hdr was freed */
+		ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
 	} else {
 		ASSERT(hdr->b_l1hdr.b_buf == NULL);
 		ASSERT0(hdr->b_l1hdr.b_datacnt);
-		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
+		/*
+		 * If we've reached here, we must have been called from
+		 * arc_evict_hdr(); as such, we should have already been
+		 * removed from any ghost list we were previously on
+		 * (which protects us from racing with arc_evict_state),
+		 * so no locking is needed during this check.
+		 */
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
 		/*
-		 * We might be removing the L1hdr of a buffer which was just
-		 * written out to L2ARC. If such a buffer is compressed then we
-		 * need to free its b_tmp_cdata before destroying the header.
+		 * A buffer must not be moved into the arc_l2c_only
+		 * state if it's not finished being written out to the
+		 * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
+		 * might be accessed even though it has been removed.
 		 */
-		if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
-		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
-			l2arc_release_cdata_buf(hdr);
+		VERIFY(!HDR_L2_WRITING(hdr));
+		VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+
 		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
 	}
 	/*
 	 * The header has been reallocated so we need to re-insert it into any
 	 * lists it was on.
 	 */
 	(void) buf_hash_insert(nhdr, NULL);
 
 	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
 
 	mutex_enter(&dev->l2ad_mtx);
 
 	/*
 	 * We must place the realloc'ed header back into the list at
 	 * the same spot. Otherwise, if it's placed earlier in the list,
 	 * l2arc_write_buffers() could find it during the function's
 	 * write phase, and try to write it out to the l2arc.
 	 */
 	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
 	list_remove(&dev->l2ad_buflist, hdr);
 
 	mutex_exit(&dev->l2ad_mtx);
 
 	buf_discard_identity(hdr);
 	hdr->b_freeze_cksum = NULL;
 	kmem_cache_free(old, hdr);
 
 	return (nhdr);
 }
 
 
 #define	ARC_MINTIME	(hz>>4) /* 62 ms */
 
 static void
 arc_cksum_verify(arc_buf_t *buf)
 {
 	zio_cksum_t zc;
 
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
 	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
 	if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
 		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
 		return;
 	}
 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
 		panic("buffer modified while frozen!");
 	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
 }
 
 static int
 arc_cksum_equal(arc_buf_t *buf)
 {
 	zio_cksum_t zc;
 	int equal;
 
 	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
 	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
 
 	return (equal);
 }
 
 static void
 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 {
 	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
 	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
 	if (buf->b_hdr->b_freeze_cksum != NULL) {
 		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
 		return;
 	}
 	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
 	    KM_SLEEP);
 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
 	    buf->b_hdr->b_freeze_cksum);
 	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
 	arc_buf_watch(buf);
 }
 
 #ifndef _KERNEL
 void
 arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
 {
 	panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr);
 }
 #endif
 
 /* ARGSUSED */
 static void
 arc_buf_unwatch(arc_buf_t *buf)
 {
 #ifndef _KERNEL
 	if (arc_watch) {
 		ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size,
 		    PROT_READ | PROT_WRITE));
 	}
 #endif
 }
 
 /* ARGSUSED */
 static void
 arc_buf_watch(arc_buf_t *buf)
 {
 #ifndef _KERNEL
 	if (arc_watch)
 		ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ));
 #endif
 }
 
 static arc_buf_contents_t
 arc_buf_type(arc_buf_hdr_t *hdr)
 {
 	if (HDR_ISTYPE_METADATA(hdr)) {
 		return (ARC_BUFC_METADATA);
 	} else {
 		return (ARC_BUFC_DATA);
 	}
 }
 
 static uint32_t
 arc_bufc_to_flags(arc_buf_contents_t type)
 {
 	switch (type) {
 	case ARC_BUFC_DATA:
 		/* metadata field is 0 if buffer contains normal data */
 		return (0);
 	case ARC_BUFC_METADATA:
 		return (ARC_FLAG_BUFC_METADATA);
 	default:
 		break;
 	}
 	panic("undefined ARC buffer type!");
 	return ((uint32_t)-1);
 }
 
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
 		if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
 			panic("modifying non-anon buffer!");
 		if (HDR_IO_IN_PROGRESS(buf->b_hdr))
 			panic("modifying buffer while i/o in progress!");
 		arc_cksum_verify(buf);
 	}
 
 	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
 	if (buf->b_hdr->b_freeze_cksum != NULL) {
 		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
 		buf->b_hdr->b_freeze_cksum = NULL;
 	}
 
 	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
 
 	arc_buf_unwatch(buf);
 }
 
 void
 arc_buf_freeze(arc_buf_t *buf)
 {
 	kmutex_t *hash_lock;
 
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
 	hash_lock = HDR_LOCK(buf->b_hdr);
 	mutex_enter(hash_lock);
 
 	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
 	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
 	arc_cksum_compute(buf, B_FALSE);
 	mutex_exit(hash_lock);
 
 }
 
 static void
 add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
 {
 	arc_state_t *state;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(MUTEX_HELD(hash_lock));
 
 	state = hdr->b_l1hdr.b_state;
 
 	if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
 	    (state != arc_anon)) {
 		/* We don't use the L2-only state list. */
 		if (state != arc_l2c_only) {
+			arc_buf_contents_t type = arc_buf_type(hdr);
 			uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
-			list_t *list = &state->arcs_list[arc_buf_type(hdr)];
-			uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
+			multilist_t *list = &state->arcs_list[type];
+			uint64_t *size = &state->arcs_lsize[type];
+
+			multilist_remove(list, hdr);
 
-			ASSERT(!MUTEX_HELD(&state->arcs_mtx));
-			mutex_enter(&state->arcs_mtx);
-			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
-			list_remove(list, hdr);
 			if (GHOST_STATE(state)) {
 				ASSERT0(hdr->b_l1hdr.b_datacnt);
 				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 				delta = hdr->b_size;
 			}
 			ASSERT(delta > 0);
 			ASSERT3U(*size, >=, delta);
 			atomic_add_64(size, -delta);
-			mutex_exit(&state->arcs_mtx);
 		}
 		/* remove the prefetch flag if we get a reference */
 		hdr->b_flags &= ~ARC_FLAG_PREFETCH;
 	}
 }
 
 static int
 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
 {
 	int cnt;
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
 	ASSERT(!GHOST_STATE(state));
 
 	/*
 	 * arc_l2c_only counts as a ghost state so we don't need to explicitly
 	 * check to prevent usage of the arc_l2c_only list.
 	 */
 	if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
 	    (state != arc_anon)) {
-		uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
+		arc_buf_contents_t type = arc_buf_type(hdr);
+		multilist_t *list = &state->arcs_list[type];
+		uint64_t *size = &state->arcs_lsize[type];
+
+		multilist_insert(list, hdr);
 
-		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
-		mutex_enter(&state->arcs_mtx);
-		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
-		list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr);
 		ASSERT(hdr->b_l1hdr.b_datacnt > 0);
 		atomic_add_64(size, hdr->b_size *
 		    hdr->b_l1hdr.b_datacnt);
-		mutex_exit(&state->arcs_mtx);
 	}
 	return (cnt);
 }
 
 /*
  * Returns detailed information about a specific arc buffer.  When the
  * state_index argument is set the function will calculate the arc header
  * list position for its arc state.  Since this requires a linear traversal
  * callers are strongly encourage not to do this.  However, it can be helpful
  * for targeted analysis so the functionality is provided.
  */
 void
 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
 {
 	arc_buf_hdr_t *hdr = ab->b_hdr;
 	l1arc_buf_hdr_t *l1hdr = NULL;
 	l2arc_buf_hdr_t *l2hdr = NULL;
 	arc_state_t *state = NULL;
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		l1hdr = &hdr->b_l1hdr;
 		state = l1hdr->b_state;
 	}
 	if (HDR_HAS_L2HDR(hdr))
 		l2hdr = &hdr->b_l2hdr;
 
 	memset(abi, 0, sizeof (arc_buf_info_t));
 	abi->abi_flags = hdr->b_flags;
 
 	if (l1hdr) {
 		abi->abi_datacnt = l1hdr->b_datacnt;
 		abi->abi_access = l1hdr->b_arc_access;
 		abi->abi_mru_hits = l1hdr->b_mru_hits;
 		abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
 		abi->abi_mfu_hits = l1hdr->b_mfu_hits;
 		abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
 		abi->abi_holds = refcount_count(&l1hdr->b_refcnt);
 	}
 
 	if (l2hdr) {
 		abi->abi_l2arc_dattr = l2hdr->b_daddr;
 		abi->abi_l2arc_asize = l2hdr->b_asize;
 		abi->abi_l2arc_compress = HDR_GET_COMPRESS(hdr);
 		abi->abi_l2arc_hits = l2hdr->b_hits;
 	}
 
 	abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
 	abi->abi_state_contents = arc_buf_type(hdr);
-	abi->abi_state_index = -1;
 	abi->abi_size = hdr->b_size;
-
-	if (l1hdr && state && state_index &&
-	    list_link_active(&l1hdr->b_arc_node)) {
-		list_t *list = &state->arcs_list[arc_buf_type(hdr)];
-		arc_buf_hdr_t *h;
-
-		mutex_enter(&state->arcs_mtx);
-		for (h = list_head(list); h != NULL; h = list_next(list, h)) {
-			abi->abi_state_index++;
-			if (h == hdr)
-				break;
-		}
-		mutex_exit(&state->arcs_mtx);
-	}
 }
 
 /*
- * Move the supplied buffer to the indicated state.  The mutex
+ * Move the supplied buffer to the indicated state. The hash lock
  * for the buffer must be held by the caller.
  */
 static void
 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
     kmutex_t *hash_lock)
 {
 	arc_state_t *old_state;
 	int64_t refcnt;
 	uint32_t datacnt;
 	uint64_t from_delta, to_delta;
 	arc_buf_contents_t buftype = arc_buf_type(hdr);
 
 	/*
 	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
 	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
 	 * L1 hdr doesn't always exist when we change state to arc_anon before
 	 * destroying a header, in which case reallocating to add the L1 hdr is
 	 * pointless.
 	 */
 	if (HDR_HAS_L1HDR(hdr)) {
 		old_state = hdr->b_l1hdr.b_state;
 		refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
 		datacnt = hdr->b_l1hdr.b_datacnt;
 	} else {
 		old_state = arc_l2c_only;
 		refcnt = 0;
 		datacnt = 0;
 	}
 
 	ASSERT(MUTEX_HELD(hash_lock));
 	ASSERT3P(new_state, !=, old_state);
 	ASSERT(refcnt == 0 || datacnt > 0);
 	ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
 	ASSERT(old_state != arc_anon || datacnt <= 1);
 
 	from_delta = to_delta = datacnt * hdr->b_size;
 
 	/*
 	 * If this buffer is evictable, transfer it from the
 	 * old state list to the new state list.
 	 */
 	if (refcnt == 0) {
 		if (old_state != arc_anon && old_state != arc_l2c_only) {
-			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
 			uint64_t *size = &old_state->arcs_lsize[buftype];
 
-			if (use_mutex)
-				mutex_enter(&old_state->arcs_mtx);
-
 			ASSERT(HDR_HAS_L1HDR(hdr));
-			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
-			list_remove(&old_state->arcs_list[buftype], hdr);
+			multilist_remove(&old_state->arcs_list[buftype], hdr);
 
 			/*
 			 * If prefetching out of the ghost cache,
 			 * we will have a non-zero datacnt.
 			 */
 			if (GHOST_STATE(old_state) && datacnt == 0) {
 				/* ghost elements have a ghost size */
 				ASSERT(hdr->b_l1hdr.b_buf == NULL);
 				from_delta = hdr->b_size;
 			}
 			ASSERT3U(*size, >=, from_delta);
 			atomic_add_64(size, -from_delta);
-
-			if (use_mutex)
-				mutex_exit(&old_state->arcs_mtx);
 		}
 		if (new_state != arc_anon && new_state != arc_l2c_only) {
-			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
 			uint64_t *size = &new_state->arcs_lsize[buftype];
 
 			/*
 			 * An L1 header always exists here, since if we're
 			 * moving to some L1-cached state (i.e. not l2c_only or
 			 * anonymous), we realloc the header to add an L1hdr
 			 * beforehand.
 			 */
 			ASSERT(HDR_HAS_L1HDR(hdr));
-			if (use_mutex)
-				mutex_enter(&new_state->arcs_mtx);
-
-			list_insert_head(&new_state->arcs_list[buftype], hdr);
+			multilist_insert(&new_state->arcs_list[buftype], hdr);
 
 			/* ghost elements have a ghost size */
 			if (GHOST_STATE(new_state)) {
 				ASSERT0(datacnt);
 				ASSERT(hdr->b_l1hdr.b_buf == NULL);
 				to_delta = hdr->b_size;
 			}
 			atomic_add_64(size, to_delta);
-
-			if (use_mutex)
-				mutex_exit(&new_state->arcs_mtx);
 		}
 	}
 
 	ASSERT(!BUF_EMPTY(hdr));
 	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
 		buf_hash_remove(hdr);
 
 	/* adjust state sizes (ignore arc_l2c_only) */
 	if (to_delta && new_state != arc_l2c_only)
 		atomic_add_64(&new_state->arcs_size, to_delta);
 	if (from_delta && old_state != arc_l2c_only) {
 		ASSERT3U(old_state->arcs_size, >=, from_delta);
 		atomic_add_64(&old_state->arcs_size, -from_delta);
 	}
 	if (HDR_HAS_L1HDR(hdr))
 		hdr->b_l1hdr.b_state = new_state;
 
 	/*
 	 * L2 headers should never be on the L2 state list since they don't
 	 * have L1 headers allocated.
 	 */
-	ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
-	    list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
+	ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+	    multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
 }
 
 void
 arc_space_consume(uint64_t space, arc_space_type_t type)
 {
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	switch (type) {
 	default:
 		break;
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, space);
 		break;
 	case ARC_SPACE_META:
 		ARCSTAT_INCR(arcstat_meta_size, space);
 		break;
 	case ARC_SPACE_OTHER:
 		ARCSTAT_INCR(arcstat_other_size, space);
 		break;
 	case ARC_SPACE_HDRS:
 		ARCSTAT_INCR(arcstat_hdr_size, space);
 		break;
 	case ARC_SPACE_L2HDRS:
 		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
 		break;
 	}
 
 	if (type != ARC_SPACE_DATA) {
 		ARCSTAT_INCR(arcstat_meta_used, space);
 		if (arc_meta_max < arc_meta_used)
 			arc_meta_max = arc_meta_used;
 	}
 
 	atomic_add_64(&arc_size, space);
 }
 
 void
 arc_space_return(uint64_t space, arc_space_type_t type)
 {
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	switch (type) {
 	default:
 		break;
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, -space);
 		break;
 	case ARC_SPACE_META:
 		ARCSTAT_INCR(arcstat_meta_size, -space);
 		break;
 	case ARC_SPACE_OTHER:
 		ARCSTAT_INCR(arcstat_other_size, -space);
 		break;
 	case ARC_SPACE_HDRS:
 		ARCSTAT_INCR(arcstat_hdr_size, -space);
 		break;
 	case ARC_SPACE_L2HDRS:
 		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
 		break;
 	}
 
 	if (type != ARC_SPACE_DATA) {
 		ASSERT(arc_meta_used >= space);
 		ARCSTAT_INCR(arcstat_meta_used, -space);
 	}
 
 	ASSERT(arc_size >= space);
 	atomic_add_64(&arc_size, -space);
 }
 
 arc_buf_t *
 arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type)
 {
 	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf;
 
 	VERIFY3U(size, <=, spa_maxblocksize(spa));
 	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
 	ASSERT(BUF_EMPTY(hdr));
 	ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
 	hdr->b_size = size;
 	hdr->b_spa = spa_load_guid(spa);
 	hdr->b_l1hdr.b_mru_hits = 0;
 	hdr->b_l1hdr.b_mru_ghost_hits = 0;
 	hdr->b_l1hdr.b_mfu_hits = 0;
 	hdr->b_l1hdr.b_mfu_ghost_hits = 0;
 	hdr->b_l1hdr.b_l2_hits = 0;
 
 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 	buf->b_hdr = hdr;
 	buf->b_data = NULL;
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
 	buf->b_next = NULL;
 
 	hdr->b_flags = arc_bufc_to_flags(type);
 	hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
 
 	hdr->b_l1hdr.b_buf = buf;
 	hdr->b_l1hdr.b_state = arc_anon;
 	hdr->b_l1hdr.b_arc_access = 0;
 	hdr->b_l1hdr.b_datacnt = 1;
+	hdr->b_l1hdr.b_tmp_cdata = NULL;
 
 	arc_get_data_buf(buf);
 
 	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
 
 	return (buf);
 }
 
 static char *arc_onloan_tag = "onloan";
 
 /*
  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
  * flight data by arc_tempreserve_space() until they are "returned". Loaned
  * buffers must be returned to the arc before they can be used by the DMU or
  * freed.
  */
 arc_buf_t *
 arc_loan_buf(spa_t *spa, uint64_t size)
 {
 	arc_buf_t *buf;
 
 	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
 
 	atomic_add_64(&arc_loaned_bytes, size);
 	return (buf);
 }
 
 /*
  * Return a loaned arc buffer to the arc.
  */
 void
 arc_return_buf(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(buf->b_data != NULL);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
 	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
 
 	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
 }
 
 /* Detach an arc_buf from a dbuf (tag) */
 void
 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(buf->b_data != NULL);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
 	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
 
 	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
 }
 
 static arc_buf_t *
 arc_buf_clone(arc_buf_t *from)
 {
 	arc_buf_t *buf;
 	arc_buf_hdr_t *hdr = from->b_hdr;
 	uint64_t size = hdr->b_size;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
 
 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 	buf->b_hdr = hdr;
 	buf->b_data = NULL;
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
 	buf->b_next = hdr->b_l1hdr.b_buf;
 	hdr->b_l1hdr.b_buf = buf;
 	arc_get_data_buf(buf);
 	bcopy(from->b_data, buf->b_data, size);
 
 	/*
 	 * This buffer already exists in the arc so create a duplicate
 	 * copy for the caller.  If the buffer is associated with user data
 	 * then track the size and number of duplicates.  These stats will be
 	 * updated as duplicate buffers are created and destroyed.
 	 */
 	if (HDR_ISTYPE_DATA(hdr)) {
 		ARCSTAT_BUMP(arcstat_duplicate_buffers);
 		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
 	}
 	hdr->b_l1hdr.b_datacnt += 1;
 	return (buf);
 }
 
 void
 arc_buf_add_ref(arc_buf_t *buf, void* tag)
 {
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 
 	/*
 	 * Check to see if this buffer is evicted.  Callers
 	 * must verify b_data != NULL to know if the add_ref
 	 * was successful.
 	 */
 	mutex_enter(&buf->b_evict_lock);
 	if (buf->b_data == NULL) {
 		mutex_exit(&buf->b_evict_lock);
 		return;
 	}
 	hash_lock = HDR_LOCK(buf->b_hdr);
 	mutex_enter(hash_lock);
 	hdr = buf->b_hdr;
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	mutex_exit(&buf->b_evict_lock);
 
 	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
 	    hdr->b_l1hdr.b_state == arc_mfu);
 
 	add_reference(hdr, hash_lock, tag);
 	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 	arc_access(hdr, hash_lock);
 	mutex_exit(hash_lock);
 	ARCSTAT_BUMP(arcstat_hits);
 	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
 	    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
 	    data, metadata, hits);
 }
 
+static void
+arc_buf_free_on_write(void *data, size_t size,
+    void (*free_func)(void *, size_t))
+{
+	l2arc_data_free_t *df;
+
+	df = kmem_alloc(sizeof (*df), KM_SLEEP);
+	df->l2df_data = data;
+	df->l2df_size = size;
+	df->l2df_func = free_func;
+	mutex_enter(&l2arc_free_on_write_mtx);
+	list_insert_head(l2arc_free_on_write, df);
+	mutex_exit(&l2arc_free_on_write_mtx);
+}
+
 /*
  * Free the arc data buffer.  If it is an l2arc write in progress,
  * the buffer is placed on l2arc_free_on_write to be freed later.
  */
 static void
 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	if (HDR_L2_WRITING(hdr)) {
-		l2arc_data_free_t *df;
-		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
-		df->l2df_data = buf->b_data;
-		df->l2df_size = hdr->b_size;
-		df->l2df_func = free_func;
-		mutex_enter(&l2arc_free_on_write_mtx);
-		list_insert_head(l2arc_free_on_write, df);
-		mutex_exit(&l2arc_free_on_write_mtx);
+		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
 	} else {
 		free_func(buf->b_data, hdr->b_size);
 	}
 }
 
+static void
+arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
+{
+	ASSERT(HDR_HAS_L2HDR(hdr));
+	ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
+
+	/*
+	 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
+	 * that doesn't exist, the header is in the arc_l2c_only state,
+	 * and there isn't anything to free (it's already been freed).
+	 */
+	if (!HDR_HAS_L1HDR(hdr))
+		return;
+
+	/*
+	 * The header isn't being written to the l2arc device, thus it
+	 * shouldn't have a b_tmp_cdata to free.
+	 */
+	if (!HDR_L2_WRITING(hdr)) {
+		ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+		return;
+	}
+
+	/*
+	 * The header does not have compression enabled. This can be due
+	 * to the buffer not being compressible, or because we're
+	 * freeing the buffer before the second phase of
+	 * l2arc_write_buffers() has started (which does the compression
+	 * step). In either case, b_tmp_cdata does not point to a
+	 * separately compressed buffer, so there's nothing to free (it
+	 * points to the same buffer as the arc_buf_t's b_data field).
+	 */
+	if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
+		hdr->b_l1hdr.b_tmp_cdata = NULL;
+		return;
+	}
+
+	/*
+	 * There's nothing to free since the buffer was all zeros and
+	 * compressed to a zero length buffer.
+	 */
+	if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) {
+		ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+		return;
+	}
+
+	ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)));
+
+	arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
+	    hdr->b_size, zio_data_buf_free);
+
+	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
+	hdr->b_l1hdr.b_tmp_cdata = NULL;
+}
+
 /*
  * Free up buf->b_data and if 'remove' is set, then pull the
  * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
  */
 static void
-arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
+arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
 {
 	arc_buf_t **bufp;
 
 	/* free up data associated with the buf */
 	if (buf->b_data != NULL) {
 		arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
 		uint64_t size = buf->b_hdr->b_size;
 		arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
 
 		arc_cksum_verify(buf);
 		arc_buf_unwatch(buf);
 
-		if (!recycle) {
-			if (type == ARC_BUFC_METADATA) {
-				arc_buf_data_free(buf, zio_buf_free);
-				arc_space_return(size, ARC_SPACE_META);
-			} else {
-				ASSERT(type == ARC_BUFC_DATA);
-				arc_buf_data_free(buf, zio_data_buf_free);
-				arc_space_return(size, ARC_SPACE_DATA);
-			}
+		if (type == ARC_BUFC_METADATA) {
+			arc_buf_data_free(buf, zio_buf_free);
+			arc_space_return(size, ARC_SPACE_META);
+		} else {
+			ASSERT(type == ARC_BUFC_DATA);
+			arc_buf_data_free(buf, zio_data_buf_free);
+			arc_space_return(size, ARC_SPACE_DATA);
 		}
-		if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
+
+		/* protected by hash lock, if in the hash table */
+		if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
 			uint64_t *cnt = &state->arcs_lsize[type];
 
 			ASSERT(refcount_is_zero(
 			    &buf->b_hdr->b_l1hdr.b_refcnt));
 			ASSERT(state != arc_anon && state != arc_l2c_only);
 
 			ASSERT3U(*cnt, >=, size);
 			atomic_add_64(cnt, -size);
 		}
 		ASSERT3U(state->arcs_size, >=, size);
 		atomic_add_64(&state->arcs_size, -size);
 		buf->b_data = NULL;
 
 		/*
 		 * If we're destroying a duplicate buffer make sure
 		 * that the appropriate statistics are updated.
 		 */
 		if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
 		    HDR_ISTYPE_DATA(buf->b_hdr)) {
 			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
 			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
 		}
 		ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
 		buf->b_hdr->b_l1hdr.b_datacnt -= 1;
 	}
 
 	/* only remove the buf if requested */
 	if (!remove)
 		return;
 
 	/* remove the buf from the hdr list */
 	for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
 	    bufp = &(*bufp)->b_next)
 		continue;
 	*bufp = buf->b_next;
 	buf->b_next = NULL;
 
 	ASSERT(buf->b_efunc == NULL);
 
 	/* clean up the buf */
 	buf->b_hdr = NULL;
 	kmem_cache_free(buf_cache, buf);
 }
 
 static void
 arc_hdr_destroy(arc_buf_hdr_t *hdr)
 {
 	if (HDR_HAS_L1HDR(hdr)) {
 		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
 		    hdr->b_l1hdr.b_datacnt > 0);
 		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 	}
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
 
 	if (HDR_HAS_L2HDR(hdr)) {
 		l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
 		boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx);
 
 		if (!buflist_held) {
 			mutex_enter(&l2hdr->b_dev->l2ad_mtx);
 			l2hdr = &hdr->b_l2hdr;
 		}
 
 		list_remove(&l2hdr->b_dev->l2ad_buflist, hdr);
 
+		/*
+		 * We don't want to leak the b_tmp_cdata buffer that was
+		 * allocated in l2arc_write_buffers()
+		 */
+		arc_buf_l2_cdata_free(hdr);
+
 		arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
 		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
 
 		if (!buflist_held)
 			mutex_exit(&l2hdr->b_dev->l2ad_mtx);
 
 		hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
 	}
 
 	if (!BUF_EMPTY(hdr))
 		buf_discard_identity(hdr);
 
 	if (hdr->b_freeze_cksum != NULL) {
 		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
 		hdr->b_freeze_cksum = NULL;
 	}
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		while (hdr->b_l1hdr.b_buf) {
 			arc_buf_t *buf = hdr->b_l1hdr.b_buf;
 
 			if (buf->b_efunc != NULL) {
-				mutex_enter(&arc_eviction_mtx);
+				mutex_enter(&arc_user_evicts_lock);
 				mutex_enter(&buf->b_evict_lock);
 				ASSERT(buf->b_hdr != NULL);
-				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
-				    FALSE);
+				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
 				hdr->b_l1hdr.b_buf = buf->b_next;
 				buf->b_hdr = &arc_eviction_hdr;
 				buf->b_next = arc_eviction_list;
 				arc_eviction_list = buf;
 				mutex_exit(&buf->b_evict_lock);
-				mutex_exit(&arc_eviction_mtx);
+				cv_signal(&arc_user_evicts_cv);
+				mutex_exit(&arc_user_evicts_lock);
 			} else {
-				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
-				    TRUE);
+				arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
 			}
 		}
 	}
 
 	ASSERT3P(hdr->b_hash_next, ==, NULL);
 	if (HDR_HAS_L1HDR(hdr)) {
-		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 		kmem_cache_free(hdr_full_cache, hdr);
 	} else {
 		kmem_cache_free(hdr_l2only_cache, hdr);
 	}
 }
 
 void
 arc_buf_free(arc_buf_t *buf, void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	int hashed = hdr->b_l1hdr.b_state != arc_anon;
 
 	ASSERT(buf->b_efunc == NULL);
 	ASSERT(buf->b_data != NULL);
 
 	if (hashed) {
 		kmutex_t *hash_lock = HDR_LOCK(hdr);
 
 		mutex_enter(hash_lock);
 		hdr = buf->b_hdr;
 		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 		(void) remove_reference(hdr, hash_lock, tag);
 		if (hdr->b_l1hdr.b_datacnt > 1) {
-			arc_buf_destroy(buf, FALSE, TRUE);
+			arc_buf_destroy(buf, TRUE);
 		} else {
 			ASSERT(buf == hdr->b_l1hdr.b_buf);
 			ASSERT(buf->b_efunc == NULL);
 			hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
 		}
 		mutex_exit(hash_lock);
 	} else if (HDR_IO_IN_PROGRESS(hdr)) {
 		int destroy_hdr;
 		/*
 		 * We are in the middle of an async write.  Don't destroy
 		 * this buffer unless the write completes before we finish
 		 * decrementing the reference count.
 		 */
-		mutex_enter(&arc_eviction_mtx);
+		mutex_enter(&arc_user_evicts_lock);
 		(void) remove_reference(hdr, NULL, tag);
 		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
-		mutex_exit(&arc_eviction_mtx);
+		mutex_exit(&arc_user_evicts_lock);
 		if (destroy_hdr)
 			arc_hdr_destroy(hdr);
 	} else {
 		if (remove_reference(hdr, NULL, tag) > 0)
-			arc_buf_destroy(buf, FALSE, TRUE);
+			arc_buf_destroy(buf, TRUE);
 		else
 			arc_hdr_destroy(hdr);
 	}
 }
 
 boolean_t
 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	kmutex_t *hash_lock = NULL;
 	boolean_t no_callback = (buf->b_efunc == NULL);
 
 	if (hdr->b_l1hdr.b_state == arc_anon) {
 		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
 		arc_buf_free(buf, tag);
 		return (no_callback);
 	}
 
 	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 	hdr = buf->b_hdr;
 	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
 	ASSERT(buf->b_data != NULL);
 
 	(void) remove_reference(hdr, hash_lock, tag);
 	if (hdr->b_l1hdr.b_datacnt > 1) {
 		if (no_callback)
-			arc_buf_destroy(buf, FALSE, TRUE);
+			arc_buf_destroy(buf, TRUE);
 	} else if (no_callback) {
 		ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
 		ASSERT(buf->b_efunc == NULL);
 		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
 	}
 	ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
 	    refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 	mutex_exit(hash_lock);
 	return (no_callback);
 }
 
 uint64_t
 arc_buf_size(arc_buf_t *buf)
 {
 	return (buf->b_hdr->b_size);
 }
 
 /*
  * Called from the DMU to determine if the current buffer should be
  * evicted. In order to ensure proper locking, the eviction must be initiated
  * from the DMU. Return true if the buffer is associated with user data and
  * duplicate buffers still exist.
  */
 boolean_t
 arc_buf_eviction_needed(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr;
 	boolean_t evict_needed = B_FALSE;
 
 	if (zfs_disable_dup_eviction)
 		return (B_FALSE);
 
 	mutex_enter(&buf->b_evict_lock);
 	hdr = buf->b_hdr;
 	if (hdr == NULL) {
 		/*
 		 * We are in arc_do_user_evicts(); let that function
 		 * perform the eviction.
 		 */
 		ASSERT(buf->b_data == NULL);
 		mutex_exit(&buf->b_evict_lock);
 		return (B_FALSE);
 	} else if (buf->b_data == NULL) {
 		/*
 		 * We have already been added to the arc eviction list;
 		 * recommend eviction.
 		 */
 		ASSERT3P(hdr, ==, &arc_eviction_hdr);
 		mutex_exit(&buf->b_evict_lock);
 		return (B_TRUE);
 	}
 
 	if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
 		evict_needed = B_TRUE;
 
 	mutex_exit(&buf->b_evict_lock);
 	return (evict_needed);
 }
 
 /*
- * Evict buffers from list until we've removed the specified number of
- * bytes.  Move the removed buffers to the appropriate evict state.
- * If the recycle flag is set, then attempt to "recycle" a buffer:
- * - look for a buffer to evict that is `bytes' long.
- * - return the data block from this buffer rather than freeing it.
- * This flag is used by callers that are trying to make space for a
- * new buffer in a full arc cache.
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
  *
- * This function makes a "best effort".  It skips over any buffers
- * it can't get a hash_lock on, and so may not catch all candidates.
- * It may also return without evicting as much space as requested.
+ *    - arc_mru -> arc_mru_ghost
+ *    - arc_mfu -> arc_mfu_ghost
+ *    - arc_mru_ghost -> arc_l2c_only
+ *    - arc_mru_ghost -> deleted
+ *    - arc_mfu_ghost -> arc_l2c_only
+ *    - arc_mfu_ghost -> deleted
  */
-static void *
-arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
-    arc_buf_contents_t type)
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
-	arc_state_t *evicted_state;
-	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
-	arc_buf_hdr_t *hdr, *hdr_prev = NULL;
-	list_t *list = &state->arcs_list[type];
-	kmutex_t *hash_lock;
-	boolean_t have_lock;
-	void *stolen = NULL;
-	arc_buf_hdr_t marker = {{{ 0 }}};
-	int count = 0;
-
-	ASSERT(state == arc_mru || state == arc_mfu);
-
-	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+	arc_state_t *evicted_state, *state;
+	int64_t bytes_evicted = 0;
 
-top:
-	/*
-	 * The ghost list lock must be acquired first in order to prevent
-	 * a 3 party deadlock:
-	 *
-	 *  - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
-	 *    l2ad_mtx in arc_hdr_realloc
-	 *  - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
-	 *  - arc_evict acquires arc_*_ghost->arcs_mtx, followed by
-	 *    arc_*_ghost->arcs_mtx and forms a deadlock cycle.
-	 *
-	 * This situation is avoided by acquiring the ghost list lock first.
-	 */
-	mutex_enter(&evicted_state->arcs_mtx);
-	mutex_enter(&state->arcs_mtx);
-
-	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
-		hdr_prev = list_prev(list, hdr);
-		/* prefetch buffers have a minimum lifespan */
-		if (HDR_IO_IN_PROGRESS(hdr) ||
-		    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
-		    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
-		    zfs_arc_min_prefetch_lifespan)) {
-			skipped++;
-			continue;
-		}
-		/* "lookahead" for better eviction candidate */
-		if (recycle && hdr->b_size != bytes &&
-		    hdr_prev && hdr_prev->b_size == bytes)
-			continue;
+	ASSERT(MUTEX_HELD(hash_lock));
+	ASSERT(HDR_HAS_L1HDR(hdr));
 
-		/* ignore markers */
-		if (hdr->b_spa == 0)
-			continue;
+	state = hdr->b_l1hdr.b_state;
+	if (GHOST_STATE(state)) {
+		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+		ASSERT(hdr->b_l1hdr.b_buf == NULL);
 
 		/*
-		 * It may take a long time to evict all the bufs requested.
-		 * To avoid blocking all arc activity, periodically drop
-		 * the arcs_mtx and give other threads a chance to run
-		 * before reacquiring the lock.
-		 *
-		 * If we are looking for a buffer to recycle, we are in
-		 * the hot code path, so don't sleep.
+		 * l2arc_write_buffers() relies on a header's L1 portion
+		 * (i.e. its b_tmp_cdata field) during its write phase.
+		 * Thus, we cannot push a header onto the arc_l2c_only
+		 * state (removing its L1 piece) until the header is
+		 * done being written to the l2arc.
 		 */
-		if (!recycle && count++ > arc_evict_iterations) {
-			list_insert_after(list, hdr, &marker);
-			mutex_exit(&state->arcs_mtx);
-			mutex_exit(&evicted_state->arcs_mtx);
-			kpreempt(KPREEMPT_SYNC);
-			mutex_enter(&evicted_state->arcs_mtx);
-			mutex_enter(&state->arcs_mtx);
-			hdr_prev = list_prev(list, &marker);
-			list_remove(list, &marker);
-			count = 0;
-			continue;
+		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+			ARCSTAT_BUMP(arcstat_evict_l2_skip);
+			return (bytes_evicted);
 		}
 
-		hash_lock = HDR_LOCK(hdr);
-		have_lock = MUTEX_HELD(hash_lock);
-		if (have_lock || mutex_tryenter(hash_lock)) {
-			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
-			ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
-			while (hdr->b_l1hdr.b_buf) {
-				arc_buf_t *buf = hdr->b_l1hdr.b_buf;
-				if (!mutex_tryenter(&buf->b_evict_lock)) {
-					missed += 1;
-					break;
-				}
-				if (buf->b_data != NULL) {
-					bytes_evicted += hdr->b_size;
-					if (recycle &&
-					    arc_buf_type(hdr) == type &&
-					    hdr->b_size == bytes &&
-					    !HDR_L2_WRITING(hdr)) {
-						stolen = buf->b_data;
-						recycle = FALSE;
-					}
-				}
-				if (buf->b_efunc != NULL) {
-					mutex_enter(&arc_eviction_mtx);
-					arc_buf_destroy(buf,
-					    buf->b_data == stolen, FALSE);
-					hdr->b_l1hdr.b_buf = buf->b_next;
-					buf->b_hdr = &arc_eviction_hdr;
-					buf->b_next = arc_eviction_list;
-					arc_eviction_list = buf;
-					mutex_exit(&arc_eviction_mtx);
-					mutex_exit(&buf->b_evict_lock);
-				} else {
-					mutex_exit(&buf->b_evict_lock);
-					arc_buf_destroy(buf,
-					    buf->b_data == stolen, TRUE);
-				}
-			}
+		ARCSTAT_BUMP(arcstat_deleted);
+		bytes_evicted += hdr->b_size;
 
-			if (HDR_HAS_L2HDR(hdr)) {
-				ARCSTAT_INCR(arcstat_evict_l2_cached,
-				    hdr->b_size);
-			} else {
-				if (l2arc_write_eligible(hdr->b_spa, hdr)) {
-					ARCSTAT_INCR(arcstat_evict_l2_eligible,
-					    hdr->b_size);
-				} else {
-					ARCSTAT_INCR(
-					    arcstat_evict_l2_ineligible,
-					    hdr->b_size);
-				}
-			}
+		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
 
-			if (hdr->b_l1hdr.b_datacnt == 0) {
-				arc_change_state(evicted_state, hdr, hash_lock);
-				ASSERT(HDR_IN_HASH_TABLE(hdr));
-				hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
-				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
-				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
-			}
-			if (!have_lock)
-				mutex_exit(hash_lock);
-			if (bytes >= 0 && bytes_evicted >= bytes)
-				break;
+		if (HDR_HAS_L2HDR(hdr)) {
+			/*
+			 * This buffer is cached on the 2nd Level ARC;
+			 * don't destroy the header.
+			 */
+			arc_change_state(arc_l2c_only, hdr, hash_lock);
+			/*
+			 * dropping from L1+L2 cached to L2-only,
+			 * realloc to remove the L1 header.
+			 */
+			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+			    hdr_l2only_cache);
 		} else {
-			missed += 1;
+			arc_change_state(arc_anon, hdr, hash_lock);
+			arc_hdr_destroy(hdr);
 		}
+		return (bytes_evicted);
 	}
 
-	mutex_exit(&state->arcs_mtx);
-	mutex_exit(&evicted_state->arcs_mtx);
+	ASSERT(state == arc_mru || state == arc_mfu);
+	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
 
-	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
-	    (bytes < 0 || bytes_evicted < bytes)) {
-		/* Prevent second pass from recycling metadata into data */
-		recycle = FALSE;
-		type = ARC_BUFC_METADATA;
-		list = &state->arcs_list[type];
-		goto top;
+	/* prefetch buffers have a minimum lifespan */
+	if (HDR_IO_IN_PROGRESS(hdr) ||
+	    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
+	    arc_min_prefetch_lifespan)) {
+		ARCSTAT_BUMP(arcstat_evict_skip);
+		return (bytes_evicted);
 	}
 
-	if (bytes_evicted < bytes)
-		dprintf("only evicted %lld bytes from %x\n",
-		    (longlong_t)bytes_evicted, state->arcs_state);
+	ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+	ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
+	while (hdr->b_l1hdr.b_buf) {
+		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+		if (!mutex_tryenter(&buf->b_evict_lock)) {
+			ARCSTAT_BUMP(arcstat_mutex_miss);
+			break;
+		}
+		if (buf->b_data != NULL)
+			bytes_evicted += hdr->b_size;
+		if (buf->b_efunc != NULL) {
+			mutex_enter(&arc_user_evicts_lock);
+			arc_buf_destroy(buf, FALSE);
+			hdr->b_l1hdr.b_buf = buf->b_next;
+			buf->b_hdr = &arc_eviction_hdr;
+			buf->b_next = arc_eviction_list;
+			arc_eviction_list = buf;
+			cv_signal(&arc_user_evicts_cv);
+			mutex_exit(&arc_user_evicts_lock);
+			mutex_exit(&buf->b_evict_lock);
+		} else {
+			mutex_exit(&buf->b_evict_lock);
+			arc_buf_destroy(buf, TRUE);
+		}
+	}
 
-	if (skipped)
-		ARCSTAT_INCR(arcstat_evict_skip, skipped);
+	if (HDR_HAS_L2HDR(hdr)) {
+		ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
+	} else {
+		if (l2arc_write_eligible(hdr->b_spa, hdr))
+			ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
+		else
+			ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
+	}
 
-	if (missed)
-		ARCSTAT_INCR(arcstat_mutex_miss, missed);
+	if (hdr->b_l1hdr.b_datacnt == 0) {
+		arc_change_state(evicted_state, hdr, hash_lock);
+		ASSERT(HDR_IN_HASH_TABLE(hdr));
+		hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
+		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
+		DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
+	}
 
-	/*
-	 * Note: we have just evicted some data into the ghost state,
-	 * potentially putting the ghost size over the desired size.  Rather
-	 * that evicting from the ghost list in this hot code path, leave
-	 * this chore to the arc_reclaim_thread().
-	 */
-	return (stolen);
+	return (bytes_evicted);
 }
 
-/*
- * Remove buffers from list until we've removed the specified number of
- * bytes.  Destroy the buffers that are removed.
- */
-static void
-arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
+static uint64_t
+arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
+    uint64_t spa, int64_t bytes)
 {
-	arc_buf_hdr_t *hdr, *hdr_prev;
-	arc_buf_hdr_t marker;
-	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
+	multilist_sublist_t *mls;
+	uint64_t bytes_evicted = 0;
+	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
-	uint64_t bytes_deleted = 0;
-	uint64_t bufs_skipped = 0;
-	int count = 0;
+	int evict_count = 0;
 
-	ASSERT(GHOST_STATE(state));
-	bzero(&marker, sizeof (marker));
-top:
-	mutex_enter(&state->arcs_mtx);
-	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
-		hdr_prev = list_prev(list, hdr);
-		if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES)
-			panic("invalid hdr=%p", (void *)hdr);
-		if (spa && hdr->b_spa != spa)
-			continue;
+	ASSERT3P(marker, !=, NULL);
+	ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL));
+
+	mls = multilist_sublist_lock(ml, idx);
 
-		/* ignore markers */
+	for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+	    hdr = multilist_sublist_prev(mls, marker)) {
+		if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
+		    (evict_count >= zfs_arc_evict_batch_limit))
+			break;
+
+		/*
+		 * To keep our iteration location, move the marker
+		 * forward. Since we're not holding hdr's hash lock, we
+		 * must be very careful and not remove 'hdr' from the
+		 * sublist. Otherwise, other consumers might mistake the
+		 * 'hdr' as not being on a sublist when they call the
+		 * multilist_link_active() function (they all rely on
+		 * the hash lock protecting concurrent insertions and
+		 * removals). multilist_sublist_move_forward() was
+		 * specifically implemented to ensure this is the case
+		 * (only 'marker' will be removed and re-inserted).
+		 */
+		multilist_sublist_move_forward(mls, marker);
+
+		/*
+		 * The only case where the b_spa field should ever be
+		 * zero is for the marker headers inserted by
+		 * arc_evict_state(). It's possible for multiple threads
+		 * to be calling arc_evict_state() concurrently (e.g.
+		 * dsl_pool_close() and zio_inject_fault()), so we must
+		 * skip any markers we see from these other threads.
+		 */
 		if (hdr->b_spa == 0)
 			continue;
 
-		hash_lock = HDR_LOCK(hdr);
-		/* caller may be trying to modify this buffer, skip it */
-		if (MUTEX_HELD(hash_lock))
+		/* we're only interested in evicting buffers of a certain spa */
+		if (spa != 0 && hdr->b_spa != spa) {
+			ARCSTAT_BUMP(arcstat_evict_skip);
 			continue;
+		}
+
+		hash_lock = HDR_LOCK(hdr);
 
 		/*
-		 * It may take a long time to evict all the bufs requested.
-		 * To avoid blocking all arc activity, periodically drop
-		 * the arcs_mtx and give other threads a chance to run
-		 * before reacquiring the lock.
+		 * We aren't calling this function from any code path
+		 * that would already be holding a hash lock, so we're
+		 * asserting on this assumption to be defensive in case
+		 * this ever changes. Without this check, it would be
+		 * possible to incorrectly increment arcstat_mutex_miss
+		 * below (e.g. if the code changed such that we called
+		 * this function with a hash lock held).
 		 */
-		if (count++ > arc_evict_iterations) {
-			list_insert_after(list, hdr, &marker);
-			mutex_exit(&state->arcs_mtx);
-			kpreempt(KPREEMPT_SYNC);
-			mutex_enter(&state->arcs_mtx);
-			hdr_prev = list_prev(list, &marker);
-			list_remove(list, &marker);
-			count = 0;
-			continue;
-		}
+		ASSERT(!MUTEX_HELD(hash_lock));
+
 		if (mutex_tryenter(hash_lock)) {
-			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-			ASSERT(!HDR_HAS_L1HDR(hdr) ||
-			    hdr->b_l1hdr.b_buf == NULL);
-			ARCSTAT_BUMP(arcstat_deleted);
-			bytes_deleted += hdr->b_size;
+			uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
+			mutex_exit(hash_lock);
 
-			if (HDR_HAS_L2HDR(hdr)) {
-				/*
-				 * This buffer is cached on the 2nd Level ARC;
-				 * don't destroy the header.
-				 */
-				arc_change_state(arc_l2c_only, hdr, hash_lock);
-				/*
-				 * dropping from L1+L2 cached to L2-only,
-				 * realloc to remove the L1 header.
-				 */
-				hdr = arc_hdr_realloc(hdr, hdr_full_cache,
-				    hdr_l2only_cache);
-				mutex_exit(hash_lock);
-			} else {
-				arc_change_state(arc_anon, hdr, hash_lock);
-				mutex_exit(hash_lock);
-				arc_hdr_destroy(hdr);
-			}
+			bytes_evicted += evicted;
 
-			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
-			if (bytes >= 0 && bytes_deleted >= bytes)
-				break;
-		} else if (bytes < 0) {
 			/*
-			 * Insert a list marker and then wait for the
-			 * hash lock to become available. Once its
-			 * available, restart from where we left off.
+			 * If evicted is zero, arc_evict_hdr() must have
+			 * decided to skip this header, don't increment
+			 * evict_count in this case.
 			 */
-			list_insert_after(list, hdr, &marker);
-			mutex_exit(&state->arcs_mtx);
-			mutex_enter(hash_lock);
-			mutex_exit(hash_lock);
-			mutex_enter(&state->arcs_mtx);
-			hdr_prev = list_prev(list, &marker);
-			list_remove(list, &marker);
+			if (evicted != 0)
+				evict_count++;
+
+			/*
+			 * If arc_size isn't overflowing, signal any
+			 * threads that might happen to be waiting.
+			 *
+			 * For each header evicted, we wake up a single
+			 * thread. If we used cv_broadcast, we could
+			 * wake up "too many" threads causing arc_size
+			 * to significantly overflow arc_c; since
+			 * arc_get_data_buf() doesn't check for overflow
+			 * when it's woken up (it doesn't because it's
+			 * possible for the ARC to be overflowing while
+			 * full of un-evictable buffers, and the
+			 * function should proceed in this case).
+			 *
+			 * If threads are left sleeping, due to not
+			 * using cv_broadcast, they will be woken up
+			 * just before arc_reclaim_thread() sleeps.
+			 */
+			mutex_enter(&arc_reclaim_lock);
+			if (!arc_is_overflowing())
+				cv_signal(&arc_reclaim_waiters_cv);
+			mutex_exit(&arc_reclaim_lock);
 		} else {
-			bufs_skipped += 1;
+			ARCSTAT_BUMP(arcstat_mutex_miss);
 		}
 	}
-	mutex_exit(&state->arcs_mtx);
 
-	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
-	    (bytes < 0 || bytes_deleted < bytes)) {
-		list = &state->arcs_list[ARC_BUFC_METADATA];
-		goto top;
-	}
-
-	if (bufs_skipped) {
-		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
-		ASSERT(bytes >= 0);
-	}
+	multilist_sublist_unlock(mls);
 
-	if (bytes_deleted < bytes)
-		dprintf("only deleted %lld bytes from %p\n",
-		    (longlong_t)bytes_deleted, state);
+	return (bytes_evicted);
 }
 
-static void
-arc_adjust(void)
+/*
+ * Evict buffers from the given arc state, until we've removed the
+ * specified number of bytes. Move the removed buffers to the
+ * appropriate evict state.
+ *
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so, may not catch all candidates.
+ * It may also return without evicting as much space as requested.
+ *
+ * If bytes is specified using the special value ARC_EVICT_ALL, this
+ * will evict all available (i.e. unlocked and evictable) buffers from
+ * the given arc state; which is used by arc_flush().
+ */
+static uint64_t
+arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type)
 {
-	int64_t adjustment, delta;
+	uint64_t total_evicted = 0;
+	multilist_t *ml = &state->arcs_list[type];
+	int num_sublists;
+	arc_buf_hdr_t **markers;
+	int i;
+
+	ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL));
+
+	num_sublists = multilist_get_num_sublists(ml);
 
 	/*
-	 * Adjust MRU size
+	 * If we've tried to evict from each sublist, made some
+	 * progress, but still have not hit the target number of bytes
+	 * to evict, we want to keep trying. The markers allow us to
+	 * pick up where we left off for each individual sublist, rather
+	 * than starting from the tail each time.
 	 */
+	markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
+	for (i = 0; i < num_sublists; i++) {
+		multilist_sublist_t *mls;
 
-	adjustment = MIN((int64_t)(arc_size - arc_c),
-	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p));
+		markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+		/*
+		 * A b_spa of 0 is used to indicate that this header is
+		 * a marker. This fact is used in arc_adjust_type() and
+		 * arc_evict_state_impl().
+		 */
+		markers[i]->b_spa = 0;
 
-	if (adjustment > 0 && arc_mru->arcs_size > 0) {
-		delta = MIN(arc_mru->arcs_size, adjustment);
-		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
+		mls = multilist_sublist_lock(ml, i);
+		multilist_sublist_insert_tail(mls, markers[i]);
+		multilist_sublist_unlock(mls);
 	}
 
 	/*
-	 * Adjust MFU size
+	 * While we haven't hit our target number of bytes to evict, or
+	 * we're evicting all available buffers.
 	 */
+	while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
+		/*
+		 * Start eviction using a randomly selected sublist,
+		 * this is to try and evenly balance eviction across all
+		 * sublists. Always starting at the same sublist
+		 * (e.g. index 0) would cause evictions to favor certain
+		 * sublists over others.
+		 */
+		int sublist_idx = multilist_get_random_index(ml);
+		uint64_t scan_evicted = 0;
 
-	adjustment = arc_size - arc_c;
+		for (i = 0; i < num_sublists; i++) {
+			uint64_t bytes_remaining;
+			uint64_t bytes_evicted;
 
-	if (adjustment > 0 && arc_mfu->arcs_size > 0) {
-		delta = MIN(arc_mfu->arcs_size, adjustment);
-		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
-	}
+			if (bytes == ARC_EVICT_ALL)
+				bytes_remaining = ARC_EVICT_ALL;
+			else if (total_evicted < bytes)
+				bytes_remaining = bytes - total_evicted;
+			else
+				break;
 
-	/*
-	 * Adjust ghost lists
-	 */
+			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
+			    markers[sublist_idx], spa, bytes_remaining);
+
+			scan_evicted += bytes_evicted;
+			total_evicted += bytes_evicted;
+
+			/* we've reached the end, wrap to the beginning */
+			if (++sublist_idx >= num_sublists)
+				sublist_idx = 0;
+		}
+
+		/*
+		 * If we didn't evict anything during this scan, we have
+		 * no reason to believe we'll evict more during another
+		 * scan, so break the loop.
+		 */
+		if (scan_evicted == 0) {
+			/* This isn't possible, let's make that obvious */
+			ASSERT3S(bytes, !=, 0);
 
-	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
+			/*
+			 * When bytes is ARC_EVICT_ALL, the only way to
+			 * break the loop is when scan_evicted is zero.
+			 * In that case, we actually have evicted enough,
+			 * so we don't want to increment the kstat.
+			 */
+			if (bytes != ARC_EVICT_ALL) {
+				ASSERT3S(total_evicted, <, bytes);
+				ARCSTAT_BUMP(arcstat_evict_not_enough);
+			}
 
-	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
-		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
-		arc_evict_ghost(arc_mru_ghost, 0, delta);
+			break;
+		}
 	}
 
-	adjustment =
-	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
+	for (i = 0; i < num_sublists; i++) {
+		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		multilist_sublist_remove(mls, markers[i]);
+		multilist_sublist_unlock(mls);
 
-	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
-		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
-		arc_evict_ghost(arc_mfu_ghost, 0, delta);
+		kmem_cache_free(hdr_full_cache, markers[i]);
 	}
+	kmem_free(markers, sizeof (*markers) * num_sublists);
+
+	return (total_evicted);
+}
+
+/*
+ * Flush all "evictable" data of the given type from the arc state
+ * specified. This will not evict any "active" buffers (i.e. referenced).
+ *
+ * When 'retry' is set to FALSE, the function will make a single pass
+ * over the state and evict any buffers that it can. Since it doesn't
+ * continually retry the eviction, it might end up leaving some buffers
+ * in the ARC due to lock misses.
+ *
+ * When 'retry' is set to TRUE, the function will continually retry the
+ * eviction until *all* evictable buffers have been removed from the
+ * state. As a result, if concurrent insertions into the state are
+ * allowed (e.g. if the ARC isn't shutting down), this function might
+ * wind up in an infinite loop, continually trying to evict buffers.
+ */
+static uint64_t
+arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
+    boolean_t retry)
+{
+	uint64_t evicted = 0;
+
+	while (state->arcs_lsize[type] != 0) {
+		evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
+
+		if (!retry)
+			break;
+	}
+
+	return (evicted);
 }
 
 /*
  * Request that arc user drop references so that N bytes can be released
  * from the cache.  This provides a mechanism to ensure the arc can honor
  * the arc_meta_limit and reclaim buffers which are pinned in the cache
  * by higher layers.  (i.e. the zpl)
  */
 static void
 arc_do_user_prune(int64_t adjustment)
 {
 	arc_prune_func_t *func;
 	void *private;
 	arc_prune_t *cp, *np;
 
 	mutex_enter(&arc_prune_mtx);
 
 	cp = list_head(&arc_prune_list);
 	while (cp != NULL) {
 		func = cp->p_pfunc;
 		private = cp->p_private;
 		np = list_next(&arc_prune_list, cp);
 		refcount_add(&cp->p_refcnt, func);
 		mutex_exit(&arc_prune_mtx);
 
 		if (func != NULL)
 			func(adjustment, private);
 
 		mutex_enter(&arc_prune_mtx);
 
 		/* User removed prune callback concurrently with execution */
 		if (refcount_remove(&cp->p_refcnt, func) == 0) {
 			ASSERT(!list_link_active(&cp->p_node));
 			refcount_destroy(&cp->p_refcnt);
 			kmem_free(cp, sizeof (*cp));
 		}
 
 		cp = np;
 	}
 
 	ARCSTAT_BUMP(arcstat_prune);
 	mutex_exit(&arc_prune_mtx);
 }
 
+/*
+ * Evict the specified number of bytes from the state specified,
+ * restricting eviction to the spa and type given. This function
+ * prevents us from trying to evict more from a state's list than
+ * is "evictable", and skips evicting altogether when passed a
+ * negative value for "bytes". In contrast, arc_evict_state() will
+ * evict everything it can, when passed a negative value for "bytes".
+ */
+static uint64_t
+arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type)
+{
+	int64_t delta;
+
+	if (bytes > 0 && state->arcs_lsize[type] > 0) {
+		delta = MIN(state->arcs_lsize[type], bytes);
+		return (arc_evict_state(state, spa, delta, type));
+	}
+
+	return (0);
+}
+
+/*
+ * The goal of this function is to evict enough meta data buffers from the
+ * ARC in order to enforce the arc_meta_limit.  Achieving this is slightly
+ * more complicated than it appears because it is common for data buffers
+ * to have holds on meta data buffers.  In addition, dnode meta data buffers
+ * will be held by the dnodes in the block preventing them from being freed.
+ * This means we can't simply traverse the ARC and expect to always find
+ * enough unheld meta data buffer to release.
+ *
+ * Therefore, this function has been updated to make alternating passes
+ * over the ARC releasing data buffers and then newly unheld meta data
+ * buffers.  This ensures forward progress is maintained and arc_meta_used
+ * will decrease.  Normally this is sufficient, but if required the ARC
+ * will call the registered prune callbacks causing dentry and inodes to
+ * be dropped from the VFS cache.  This will make dnode meta data buffers
+ * available for reclaim.
+ */
+static uint64_t
+arc_adjust_meta(void)
+{
+	int64_t adjustmnt, delta, prune = 0;
+	uint64_t total_evicted = 0;
+	arc_buf_contents_t type = ARC_BUFC_DATA;
+	unsigned long restarts = zfs_arc_meta_adjust_restarts;
+
+restart:
+	/*
+	 * This differs slightly from the way we evict from the mru in
+	 * arc_adjust because we don't have a "target" value (i.e. no
+	 * "meta" arc_p). As a result, I think we can completely
+	 * cannibalize the metadata in the MRU before we evict the
+	 * metadata from the MFU. I think we probably need to implement a
+	 * "metadata arc_p" value to do this properly.
+	 */
+	adjustmnt = arc_meta_used - arc_meta_limit;
+
+	if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
+		delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
+		total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
+		adjustmnt -= delta;
+	}
+
+	/*
+	 * We can't afford to recalculate adjustmnt here. If we do,
+	 * new metadata buffers can sneak into the MRU or ANON lists,
+	 * thus penalizing the MFU metadata. Although the fudge factor is
+	 * small, it has been empirically shown to be significant for
+	 * certain workloads (e.g. creating many empty directories). As
+	 * such, we use the original calculation for adjustmnt, and
+	 * simply decrement it by the amount we asked to evict from the MRU.
+	 */
+
+	if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
+		delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
+		total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
+	}
+
+	adjustmnt = arc_meta_used - arc_meta_limit;
+
+	if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
+		delta = MIN(adjustmnt,
+		    arc_mru_ghost->arcs_lsize[type]);
+		total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
+		adjustmnt -= delta;
+	}
+
+	if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
+		delta = MIN(adjustmnt,
+		    arc_mfu_ghost->arcs_lsize[type]);
+		total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
+	}
+
+	/*
+	 * If after attempting to make the requested adjustment to the ARC
+	 * the meta limit is still being exceeded then request that the
+	 * higher layers drop some cached objects which have holds on ARC
+	 * meta buffers.  Requests to the upper layers will be made with
+	 * increasingly large scan sizes until the ARC is below the limit.
+	 */
+	if (arc_meta_used > arc_meta_limit) {
+		if (type == ARC_BUFC_DATA) {
+			type = ARC_BUFC_METADATA;
+		} else {
+			type = ARC_BUFC_DATA;
+
+			if (zfs_arc_meta_prune) {
+				prune += zfs_arc_meta_prune;
+				arc_do_user_prune(prune);
+			}
+		}
+
+		if (restarts > 0) {
+			restarts--;
+			goto restart;
+		}
+	}
+	return (total_evicted);
+}
+
+/*
+ * Return the type of the oldest buffer in the given arc state.
+ *
+ * This function will select a random sublist of type ARC_BUFC_DATA and
+ * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
+ * is compared, and the type which contains the "older" buffer will be
+ * returned.
+ */
+static arc_buf_contents_t
+arc_adjust_type(arc_state_t *state)
+{
+	multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
+	multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
+	int data_idx = multilist_get_random_index(data_ml);
+	int meta_idx = multilist_get_random_index(meta_ml);
+	multilist_sublist_t *data_mls;
+	multilist_sublist_t *meta_mls;
+	arc_buf_contents_t type;
+	arc_buf_hdr_t *data_hdr;
+	arc_buf_hdr_t *meta_hdr;
+
+	/*
+	 * We keep the sublist lock until we're finished, to prevent
+	 * the headers from being destroyed via arc_evict_state().
+	 */
+	data_mls = multilist_sublist_lock(data_ml, data_idx);
+	meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
+
+	/*
+	 * These two loops are to ensure we skip any markers that
+	 * might be at the tail of the lists due to arc_evict_state().
+	 */
+
+	for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
+	    data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
+		if (data_hdr->b_spa != 0)
+			break;
+	}
+
+	for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
+	    meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
+		if (meta_hdr->b_spa != 0)
+			break;
+	}
+
+	if (data_hdr == NULL && meta_hdr == NULL) {
+		type = ARC_BUFC_DATA;
+	} else if (data_hdr == NULL) {
+		ASSERT3P(meta_hdr, !=, NULL);
+		type = ARC_BUFC_METADATA;
+	} else if (meta_hdr == NULL) {
+		ASSERT3P(data_hdr, !=, NULL);
+		type = ARC_BUFC_DATA;
+	} else {
+		ASSERT3P(data_hdr, !=, NULL);
+		ASSERT3P(meta_hdr, !=, NULL);
+
+		/* The headers can't be on the sublist without an L1 header */
+		ASSERT(HDR_HAS_L1HDR(data_hdr));
+		ASSERT(HDR_HAS_L1HDR(meta_hdr));
+
+		if (data_hdr->b_l1hdr.b_arc_access <
+		    meta_hdr->b_l1hdr.b_arc_access) {
+			type = ARC_BUFC_DATA;
+		} else {
+			type = ARC_BUFC_METADATA;
+		}
+	}
+
+	multilist_sublist_unlock(meta_mls);
+	multilist_sublist_unlock(data_mls);
+
+	return (type);
+}
+
+/*
+ * Evict buffers so arc_size is capped by arc_c; returns the bytes evicted.
+ */
+static uint64_t
+arc_adjust(void)
+{
+	uint64_t total_evicted = 0;
+	uint64_t bytes;
+	int64_t target;
+
+	/*
+	 * If we're over arc_meta_limit, we want to correct that before
+	 * potentially evicting data buffers below.
+	 */
+	total_evicted += arc_adjust_meta();
+
+	/*
+	 * Adjust MRU size
+	 *
+	 * If we're over the target cache size, we want to evict enough
+	 * from the list to get back to our target size. We don't want
+	 * to evict too much from the MRU, such that it drops below
+	 * arc_p. So, if we're over our target cache size more than
+	 * the MRU is over arc_p, we'll evict enough to get back to
+	 * arc_p here, and then evict more from the MFU below.
+	 */
+	target = MIN((int64_t)(arc_size - arc_c),
+	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
+	    arc_p));
+
+	/*
+	 * If we're below arc_meta_min, always prefer to evict data.
+	 * Otherwise, try to satisfy the requested number of bytes to
+	 * evict from the type which contains older buffers; in an
+	 * effort to keep newer buffers in the cache regardless of their
+	 * type. If we cannot satisfy the number of bytes from this
+	 * type, spill over into the next type.
+	 */
+	if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
+	    arc_meta_used > arc_meta_min) {
+		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+		total_evicted += bytes;
+
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * metadata, we try to get the rest from data.
+		 */
+		target -= bytes;
+
+		total_evicted +=
+		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+	} else {
+		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+		total_evicted += bytes;
+
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * data, we try to get the rest from metadata.
+		 */
+		target -= bytes;
+
+		total_evicted +=
+		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+	}
+
+	/*
+	 * Adjust MFU size
+	 *
+	 * Now that we've tried to evict enough from the MRU to get its
+	 * size back to arc_p, if we're still above the target cache
+	 * size, we evict the rest from the MFU.
+	 */
+	target = arc_size - arc_c;
+
+	if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
+	    arc_meta_used > arc_meta_min) {
+		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+		total_evicted += bytes;
+
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * metadata, we try to get the rest from data.
+		 */
+		target -= bytes;
+
+		total_evicted +=
+		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+	} else {
+		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+		total_evicted += bytes;
+
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * data, we try to get the rest from metadata.
+		 */
+		target -= bytes;
+
+		total_evicted +=
+		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+	}
+
+	/*
+	 * Adjust ghost lists
+	 *
+	 * In addition to the above, the ARC also defines target values
+	 * for the ghost lists. The sum of the mru list and mru ghost
+	 * list should never exceed the target size of the cache, and
+	 * the sum of the mru list, mfu list, mru ghost list, and mfu
+	 * ghost list should never exceed twice the target size of the
+	 * cache. The following logic enforces these limits on the ghost
+	 * caches, and evicts from them as needed.
+	 */
+	target = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
+
+	bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
+	total_evicted += bytes;
+
+	target -= bytes;
+
+	total_evicted +=
+	    arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
+
+	/*
+	 * We assume the sum of the mru list and mfu list is less than
+	 * or equal to arc_c (we enforced this above), which means we
+	 * can use the simpler of the two equations below:
+	 *
+	 *	mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
+	 *		    mru ghost + mfu ghost <= arc_c
+	 */
+	target = arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
+
+	bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
+	total_evicted += bytes;
+
+	target -= bytes;
+
+	total_evicted +=
+	    arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
+
+	return (total_evicted);
+}
+
 static void
 arc_do_user_evicts(void)
 {
-	mutex_enter(&arc_eviction_mtx);
+	mutex_enter(&arc_user_evicts_lock);
 	while (arc_eviction_list != NULL) {
 		arc_buf_t *buf = arc_eviction_list;
 		arc_eviction_list = buf->b_next;
 		mutex_enter(&buf->b_evict_lock);
 		buf->b_hdr = NULL;
 		mutex_exit(&buf->b_evict_lock);
-		mutex_exit(&arc_eviction_mtx);
+		mutex_exit(&arc_user_evicts_lock);
 
 		if (buf->b_efunc != NULL)
 			VERIFY0(buf->b_efunc(buf->b_private));
 
 		buf->b_efunc = NULL;
 		buf->b_private = NULL;
 		kmem_cache_free(buf_cache, buf);
-		mutex_enter(&arc_eviction_mtx);
+		mutex_enter(&arc_user_evicts_lock);
 	}
-	mutex_exit(&arc_eviction_mtx);
+	mutex_exit(&arc_user_evicts_lock);
 }
 
-/*
- * The goal of this function is to evict enough meta data buffers from the
- * ARC in order to enforce the arc_meta_limit.  Achieving this is slightly
- * more complicated than it appears because it is common for data buffers
- * to have holds on meta data buffers.  In addition, dnode meta data buffers
- * will be held by the dnodes in the block preventing them from being freed.
- * This means we can't simply traverse the ARC and expect to always find
- * enough unheld meta data buffer to release.
- *
- * Therefore, this function has been updated to make alternating passes
- * over the ARC releasing data buffers and then newly unheld meta data
- * buffers.  This ensures forward progress is maintained and arc_meta_used
- * will decrease.  Normally this is sufficient, but if required the ARC
- * will call the registered prune callbacks causing dentry and inodes to
- * be dropped from the VFS cache.  This will make dnode meta data buffers
- * available for reclaim.
- */
-static void
-arc_adjust_meta(void)
+void
+arc_flush(spa_t *spa, boolean_t retry)
 {
-	int64_t adjustmnt, delta, prune = 0;
-	arc_buf_contents_t type = ARC_BUFC_DATA;
-	unsigned long restarts = zfs_arc_meta_adjust_restarts;
-
-restart:
-	/*
-	 * This slightly differs than the way we evict from the mru in
-	 * arc_adjust because we don't have a "target" value (i.e. no
-	 * "meta" arc_p). As a result, I think we can completely
-	 * cannibalize the metadata in the MRU before we evict the
-	 * metadata from the MFU. I think we probably need to implement a
-	 * "metadata arc_p" value to do this properly.
-	 */
-	adjustmnt = arc_meta_used - arc_meta_limit;
-
-	if (adjustmnt > 0 && arc_mru->arcs_lsize[type] > 0) {
-		delta = MIN(arc_mru->arcs_lsize[type], adjustmnt);
-		arc_evict(arc_mru, 0, delta, FALSE, type);
-		adjustmnt -= delta;
-	}
-
-	/*
-	 * We can't afford to recalculate adjustmnt here. If we do,
-	 * new metadata buffers can sneak into the MRU or ANON lists,
-	 * thus penalize the MFU metadata. Although the fudge factor is
-	 * small, it has been empirically shown to be significant for
-	 * certain workloads (e.g. creating many empty directories). As
-	 * such, we use the original calculation for adjustmnt, and
-	 * simply decrement the amount of data evicted from the MRU.
-	 */
-
-	if (adjustmnt > 0 && arc_mfu->arcs_lsize[type] > 0) {
-		delta = MIN(arc_mfu->arcs_lsize[type], adjustmnt);
-		arc_evict(arc_mfu, 0, delta, FALSE, type);
-	}
-
-	adjustmnt = arc_meta_used - arc_meta_limit;
-
-	if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
-		delta = MIN(adjustmnt,
-		    arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]);
-		arc_evict_ghost(arc_mru_ghost, 0, delta);
-	}
-
-	if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[type] > 0) {
-		delta = MIN(adjustmnt,
-		    arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]);
-		arc_evict_ghost(arc_mfu_ghost, 0, delta);
-	}
+	uint64_t guid = 0;
 
 	/*
-	 * If after attempting to make the requested adjustment to the ARC
-	 * the meta limit is still being exceeded then request that the
-	 * higher layers drop some cached objects which have holds on ARC
-	 * meta buffers.  Requests to the upper layers will be made with
-	 * increasingly large scan sizes until the ARC is below the limit.
+	 * If retry is TRUE, a spa must not be specified since we have
+	 * no good way to determine if all of a spa's buffers have been
+	 * evicted from an arc state.
 	 */
-	if (arc_meta_used > arc_meta_limit) {
-		if (type == ARC_BUFC_DATA) {
-			type = ARC_BUFC_METADATA;
-		} else {
-			type = ARC_BUFC_DATA;
-
-			if (zfs_arc_meta_prune) {
-				prune += zfs_arc_meta_prune;
-				arc_do_user_prune(prune);
-			}
-		}
-
-		if (restarts > 0) {
-			restarts--;
-			goto restart;
-		}
-	}
-}
-
-/*
- * Flush all *evictable* data from the cache for the given spa.
- * NOTE: this will not touch "active" (i.e. referenced) data.
- */
-void
-arc_flush(spa_t *spa)
-{
-	uint64_t guid = 0;
+	ASSERT(!retry || spa == 0);
 
 	if (spa != NULL)
 		guid = spa_load_guid(spa);
 
-	while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
-		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
-		if (spa != NULL)
-			break;
-	}
-	while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
-		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
-		if (spa != NULL)
-			break;
-	}
-	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
-		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
-		if (spa != NULL)
-			break;
-	}
-	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
-		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
-		if (spa != NULL)
-			break;
-	}
+	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
+	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
+
+	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
+	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
+
+	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
+	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
 
-	arc_evict_ghost(arc_mru_ghost, guid, -1);
-	arc_evict_ghost(arc_mfu_ghost, guid, -1);
+	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
+	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
 
-	mutex_enter(&arc_reclaim_thr_lock);
 	arc_do_user_evicts();
-	mutex_exit(&arc_reclaim_thr_lock);
 	ASSERT(spa || arc_eviction_list == NULL);
 }
 
 void
 arc_shrink(uint64_t bytes)
 {
 	if (arc_c > arc_c_min) {
 		uint64_t to_free;
 
 		to_free = bytes ? bytes : arc_c >> zfs_arc_shrink_shift;
 
 		if (arc_c > arc_c_min + to_free)
 			atomic_add_64(&arc_c, -to_free);
 		else
 			arc_c = arc_c_min;
 
 		to_free = bytes ? bytes : arc_p >> zfs_arc_shrink_shift;
 
 		if (arc_p > to_free)
 			atomic_add_64(&arc_p, -to_free);
 		else
 			arc_p = 0;
 
 		if (arc_c > arc_size)
 			arc_c = MAX(arc_size, arc_c_min);
 		if (arc_p > arc_c)
 			arc_p = (arc_c >> 1);
 		ASSERT(arc_c >= arc_c_min);
 		ASSERT((int64_t)arc_p >= 0);
 	}
 
 	if (arc_size > arc_c)
-		arc_adjust();
+		(void) arc_adjust();
 }
 
 static void
 arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes)
 {
 	size_t			i;
 	kmem_cache_t		*prev_cache = NULL;
 	kmem_cache_t		*prev_data_cache = NULL;
 	extern kmem_cache_t	*zio_buf_cache[];
 	extern kmem_cache_t	*zio_data_buf_cache[];
 
 	/*
 	 * An aggressive reclamation will shrink the cache size as well as
 	 * reap free buffers from the arc kmem caches.
 	 */
 	if (strat == ARC_RECLAIM_AGGR)
 		arc_shrink(bytes);
 
 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
 		if (zio_buf_cache[i] != prev_cache) {
 			prev_cache = zio_buf_cache[i];
 			kmem_cache_reap_now(zio_buf_cache[i]);
 		}
 		if (zio_data_buf_cache[i] != prev_data_cache) {
 			prev_data_cache = zio_data_buf_cache[i];
 			kmem_cache_reap_now(zio_data_buf_cache[i]);
 		}
 	}
 
+	kmem_cache_reap_now(buf_cache);
 	kmem_cache_reap_now(hdr_full_cache);
 	kmem_cache_reap_now(hdr_l2only_cache);
 }
 
 /*
  * Unlike other ZFS implementations this thread is only responsible for
  * adapting the target ARC size on Linux.  The responsibility for memory
  * reclamation has been entirely delegated to the arc_shrinker_func()
  * which is registered with the VM.  To reflect this change in behavior
  * the arc_reclaim thread has been renamed to arc_adapt.
+ *
+ * The following comment from arc_reclaim_thread() in illumos is still
+ * applicable:
+ *
+ * Threads can block in arc_get_data_buf() waiting for this thread to evict
+ * enough data and signal them to proceed. When this happens, the threads in
+ * arc_get_data_buf() are sleeping while holding the hash lock for their
+ * particular arc header. Thus, we must be careful to never sleep on a
+ * hash lock in this thread. This is to prevent the following deadlock:
+ *
+ *  - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
+ *    waiting for the reclaim thread to signal it.
+ *
+ *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
+ *    fails, and goes to sleep forever.
+ *
+ * This possible deadlock is avoided by always acquiring a hash lock
+ * using mutex_tryenter() from arc_reclaim_thread().
  */
 static void
 arc_adapt_thread(void)
 {
 	callb_cpr_t		cpr;
 	fstrans_cookie_t	cookie;
+	uint64_t		arc_evicted;
 
-	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+	CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
 
 	cookie = spl_fstrans_mark();
-	mutex_enter(&arc_reclaim_thr_lock);
-	while (arc_thread_exit == 0) {
+	mutex_enter(&arc_reclaim_lock);
+	while (arc_reclaim_thread_exit == 0) {
 #ifndef _KERNEL
 		arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
 
+		mutex_exit(&arc_reclaim_lock);
 		if (spa_get_random(100) == 0) {
 
 			if (arc_no_grow) {
 				if (last_reclaim == ARC_RECLAIM_CONS) {
 					last_reclaim = ARC_RECLAIM_AGGR;
 				} else {
 					last_reclaim = ARC_RECLAIM_CONS;
 				}
 			} else {
 				arc_no_grow = TRUE;
 				last_reclaim = ARC_RECLAIM_AGGR;
 				membar_producer();
 			}
 
 			/* reset the growth delay for every reclaim */
 			arc_grow_time = ddi_get_lbolt() +
 			    (zfs_arc_grow_retry * hz);
 
 			arc_kmem_reap_now(last_reclaim, 0);
 			arc_warm = B_TRUE;
 		}
+#else /* _KERNEL */
+		mutex_exit(&arc_reclaim_lock);
 #endif /* !_KERNEL */
 
 		/* No recent memory pressure allow the ARC to grow. */
 		if (arc_no_grow &&
 		    ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time))
 			arc_no_grow = FALSE;
 
-		arc_adjust_meta();
+		arc_evicted = arc_adjust();
 
-		arc_adjust();
+		/*
+		 * We're either no longer overflowing, or we
+		 * can't evict anything more, so we should wake
+		 * up any threads before we go to sleep.
+		 */
+		if (arc_size <= arc_c || arc_evicted == 0)
+			cv_broadcast(&arc_reclaim_waiters_cv);
 
-		if (arc_eviction_list != NULL)
-			arc_do_user_evicts();
+		mutex_enter(&arc_reclaim_lock);
 
 		/* block until needed, or one second, whichever is shorter */
 		CALLB_CPR_SAFE_BEGIN(&cpr);
-		(void) cv_timedwait_interruptible(&arc_reclaim_thr_cv,
-		    &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
-		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+		(void) cv_timedwait_interruptible(&arc_reclaim_thread_cv,
+		    &arc_reclaim_lock, (ddi_get_lbolt() + hz));
+		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
 
 
 		/* Allow the module options to be changed */
 		if (zfs_arc_max > 64 << 20 &&
 		    zfs_arc_max < physmem * PAGESIZE &&
 		    zfs_arc_max != arc_c_max)
 			arc_c_max = zfs_arc_max;
 
 		if (zfs_arc_min > 0 &&
 		    zfs_arc_min < arc_c_max &&
 		    zfs_arc_min != arc_c_min)
 			arc_c_min = zfs_arc_min;
 
 		if (zfs_arc_meta_limit > 0 &&
 		    zfs_arc_meta_limit <= arc_c_max &&
 		    zfs_arc_meta_limit != arc_meta_limit)
 			arc_meta_limit = zfs_arc_meta_limit;
+	}
 
+	arc_reclaim_thread_exit = 0;
+	cv_broadcast(&arc_reclaim_thread_cv);
+	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_lock */
+	spl_fstrans_unmark(cookie);
+	thread_exit();
+}
+
+static void
+arc_user_evicts_thread(void)
+{
+	callb_cpr_t cpr;
+	fstrans_cookie_t	cookie;
 
+	CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
 
+	cookie = spl_fstrans_mark();
+	mutex_enter(&arc_user_evicts_lock);
+	while (!arc_user_evicts_thread_exit) {
+		mutex_exit(&arc_user_evicts_lock);
+
+		arc_do_user_evicts();
+
+		/*
+		 * This is necessary in order for the mdb ::arc dcmd to
+		 * show up to date information. Since the ::arc command
+		 * does not call the kstat's update function, without
+		 * this call, the command may show stale stats for the
+		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+		 * with this change, the data might be up to 1 second
+		 * out of date; but that should suffice. The arc_state_t
+		 * structures can be queried directly if more accurate
+		 * information is needed.
+		 */
+		if (arc_ksp != NULL)
+			arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+
+		mutex_enter(&arc_user_evicts_lock);
+
+		/*
+		 * Block until signaled, or after one second (we need to
+		 * call the arc's kstat update function regularly).
+		 */
+		CALLB_CPR_SAFE_BEGIN(&cpr);
+		(void) cv_timedwait_interruptible(&arc_user_evicts_cv,
+		    &arc_user_evicts_lock, ddi_get_lbolt() + hz);
+		CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
 	}
 
-	arc_thread_exit = 0;
-	cv_broadcast(&arc_reclaim_thr_cv);
-	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
+	arc_user_evicts_thread_exit = FALSE;
+	cv_broadcast(&arc_user_evicts_cv);
+	CALLB_CPR_EXIT(&cpr);		/* drops arc_user_evicts_lock */
 	spl_fstrans_unmark(cookie);
 	thread_exit();
 }
 
 #ifdef _KERNEL
 /*
  * Determine the amount of memory eligible for eviction contained in the
  * ARC. All clean data reported by the ghost lists can always be safely
  * evicted. Due to arc_c_min, the same does not hold for all clean data
  * contained by the regular mru and mfu lists.
  *
  * In the case of the regular mru and mfu lists, we need to report as
  * much clean data as possible, such that evicting that same reported
  * data will not bring arc_size below arc_c_min. Thus, in certain
  * circumstances, the total amount of clean data in the mru and mfu
  * lists might not actually be evictable.
  *
  * The following two distinct cases are accounted for:
  *
  * 1. The sum of the amount of dirty data contained by both the mru and
  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
  *    is greater than or equal to arc_c_min.
  *    (i.e. amount of dirty data >= arc_c_min)
  *
  *    This is the easy case; all clean data contained by the mru and mfu
  *    lists is evictable. Evicting all clean data can only drop arc_size
  *    to the amount of dirty data, which is greater than arc_c_min.
  *
  * 2. The sum of the amount of dirty data contained by both the mru and
  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
  *    is less than arc_c_min.
  *    (i.e. arc_c_min > amount of dirty data)
  *
  *    2.1. arc_size is greater than or equal arc_c_min.
  *         (i.e. arc_size >= arc_c_min > amount of dirty data)
  *
  *         In this case, not all clean data from the regular mru and mfu
  *         lists is actually evictable; we must leave enough clean data
  *         to keep arc_size above arc_c_min. Thus, the maximum amount of
  *         evictable data from the two lists combined, is exactly the
  *         difference between arc_size and arc_c_min.
  *
  *    2.2. arc_size is less than arc_c_min
  *         (i.e. arc_c_min > arc_size > amount of dirty data)
  *
  *         In this case, none of the data contained in the mru and mfu
  *         lists is evictable, even if it's clean. Since arc_size is
  *         already below arc_c_min, evicting any more would only
  *         increase this negative difference.
  */
 static uint64_t
 arc_evictable_memory(void) {
 	uint64_t arc_clean =
 	    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
 	    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
 	    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
 	    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
 	uint64_t ghost_clean =
 	    arc_mru_ghost->arcs_lsize[ARC_BUFC_DATA] +
 	    arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
 	    arc_mfu_ghost->arcs_lsize[ARC_BUFC_DATA] +
 	    arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA];
 	uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
 
 	if (arc_dirty >= arc_c_min)
 		return (ghost_clean + arc_clean);
 
 	return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0));
 }
 
 /*
  * If sc->nr_to_scan is zero, the caller is requesting a query of the
  * number of objects which can potentially be freed.  If it is nonzero,
  * the request is to free that many objects.
  *
  * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
  * in struct shrinker and also require the shrinker to return the number
  * of objects freed.
  *
  * Older kernels require the shrinker to return the number of freeable
  * objects following the freeing of nr_to_free.
  */
 static spl_shrinker_t
 __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
 {
 	int64_t pages;
 
 	/* The arc is considered warm once reclaim has occurred */
 	if (unlikely(arc_warm == B_FALSE))
 		arc_warm = B_TRUE;
 
 	/* Return the potential number of reclaimable pages */
 	pages = btop((int64_t)arc_evictable_memory());
 	if (sc->nr_to_scan == 0)
 		return (pages);
 
 	/* Not allowed to perform filesystem reclaim */
 	if (!(sc->gfp_mask & __GFP_FS))
 		return (SHRINK_STOP);
 
 	/* Reclaim in progress */
-	if (mutex_tryenter(&arc_reclaim_thr_lock) == 0)
+	if (mutex_tryenter(&arc_reclaim_lock) == 0)
 		return (SHRINK_STOP);
 
+	mutex_exit(&arc_reclaim_lock);
+
 	/*
 	 * Evict the requested number of pages by shrinking arc_c the
 	 * requested amount.  If there is nothing left to evict just
 	 * reap whatever we can from the various arc slabs.
 	 */
 	if (pages > 0) {
 		arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
 
 #ifdef HAVE_SPLIT_SHRINKER_CALLBACK
 		pages = MAX(pages - btop(arc_evictable_memory()), 0);
 #else
 		pages = btop(arc_evictable_memory());
 #endif
 	} else {
 		arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
 		pages = SHRINK_STOP;
 	}
 
+	/*
+	 * We've reaped what we can, wake up threads.
+	 */
+	cv_broadcast(&arc_reclaim_waiters_cv);
+
 	/*
 	 * When direct reclaim is observed it usually indicates a rapid
 	 * increase in memory pressure.  This occurs because the kswapd
 	 * threads were unable to asynchronously keep enough free memory
 	 * available.  In this case set arc_no_grow to briefly pause arc
 	 * growth to avoid compounding the memory pressure.
 	 */
 	if (current_is_kswapd()) {
 		ARCSTAT_BUMP(arcstat_memory_indirect_count);
 	} else {
 		arc_no_grow = B_TRUE;
 		arc_grow_time = ddi_get_lbolt() + (zfs_arc_grow_retry * hz);
 		ARCSTAT_BUMP(arcstat_memory_direct_count);
 	}
 
-	mutex_exit(&arc_reclaim_thr_lock);
-
 	return (pages);
 }
 SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
 
 SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS);
 #endif /* _KERNEL */
 
 /*
  * Adapt arc info given the number of bytes we are trying to add and
  * the state that we are comming from.  This function is only called
  * when we are adding new content to the cache.
  */
 static void
 arc_adapt(int bytes, arc_state_t *state)
 {
 	int mult;
 
 	if (state == arc_l2c_only)
 		return;
 
 	ASSERT(bytes > 0);
 	/*
 	 * Adapt the target size of the MRU list:
 	 *	- if we just hit in the MRU ghost list, then increase
 	 *	  the target size of the MRU list.
 	 *	- if we just hit in the MFU ghost list, then increase
 	 *	  the target size of the MFU list by decreasing the
 	 *	  target size of the MRU list.
 	 */
 	if (state == arc_mru_ghost) {
 		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
 		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
 
 		if (!zfs_arc_p_dampener_disable)
 			mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
 
 		arc_p = MIN(arc_c, arc_p + bytes * mult);
 	} else if (state == arc_mfu_ghost) {
 		uint64_t delta;
 
 		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
 		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
 
 		if (!zfs_arc_p_dampener_disable)
 			mult = MIN(mult, 10);
 
 		delta = MIN(bytes * mult, arc_p);
 		arc_p = MAX(0, arc_p - delta);
 	}
 	ASSERT((int64_t)arc_p >= 0);
 
 	if (arc_no_grow)
 		return;
 
 	if (arc_c >= arc_c_max)
 		return;
 
 	/*
 	 * If we're within (2 * maxblocksize) bytes of the target
 	 * cache size, increment the target cache size
 	 */
 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
 		atomic_add_64(&arc_c, (int64_t)bytes);
 		if (arc_c > arc_c_max)
 			arc_c = arc_c_max;
 		else if (state == arc_anon)
 			atomic_add_64(&arc_p, (int64_t)bytes);
 		if (arc_p > arc_c)
 			arc_p = arc_c;
 	}
 	ASSERT((int64_t)arc_p >= 0);
 }
 
 /*
- * Check if the cache has reached its limits and eviction is required
- * prior to insert.
+ * Check if arc_size has grown past our upper threshold, determined by
+ * zfs_arc_overflow_shift.
  */
-static int
-arc_evict_needed(arc_buf_contents_t type)
+static boolean_t
+arc_is_overflowing(void)
 {
-	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
-		return (1);
-
-	if (arc_no_grow)
-		return (1);
+	/* Always allow at least one block of overflow */
+	uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
+	    arc_c >> zfs_arc_overflow_shift);
 
-	return (arc_size > arc_c);
+	return (arc_size >= arc_c + overflow);
 }
 
 /*
- * The buffer, supplied as the first argument, needs a data block.
- * So, if we are at cache max, determine which cache should be victimized.
- * We have the following cases:
- *
- * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
- * In this situation if we're out of space, but the resident size of the MFU is
- * under the limit, victimize the MFU cache to satisfy this insertion request.
- *
- * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
- * Here, we've used up all of the available space for the MRU, so we need to
- * evict from our own cache instead.  Evict from the set of resident MRU
- * entries.
- *
- * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
- * c minus p represents the MFU space in the cache, since p is the size of the
- * cache that is dedicated to the MRU.  In this situation there's still space on
- * the MFU side, so the MRU side needs to be victimized.
- *
- * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
- * MFU's resident set is consuming more space than it has been allotted.  In
- * this situation, we must victimize our own cache, the MFU, for this insertion.
+ * The buffer, supplied as the first argument, needs a data block. If we
+ * are hitting the hard limit for the cache size, we must sleep, waiting
+ * for the eviction thread to catch up. If we're past the target size
+ * but below the hard limit, we'll only signal the reclaim thread and
+ * continue on.
  */
 static void
 arc_get_data_buf(arc_buf_t *buf)
 {
 	arc_state_t		*state = buf->b_hdr->b_l1hdr.b_state;
 	uint64_t		size = buf->b_hdr->b_size;
 	arc_buf_contents_t	type = arc_buf_type(buf->b_hdr);
-	arc_buf_contents_t	evict = ARC_BUFC_DATA;
-	boolean_t		recycle = TRUE;
 
 	arc_adapt(size, state);
 
 	/*
-	 * We have not yet reached cache maximum size,
-	 * just allocate a new buffer.
+	 * If arc_size is currently overflowing, and has grown past our
+	 * upper limit, we must be adding data faster than the evict
+	 * thread can evict. Thus, to ensure we don't compound the
+	 * problem by adding more data and forcing arc_size to grow even
+	 * further past its target size, we halt and wait for the
+	 * eviction thread to catch up.
+	 *
+	 * It's also possible that the reclaim thread is unable to evict
+	 * enough buffers to get arc_size below the overflow limit (e.g.
+	 * due to buffers being un-evictable, or hash lock collisions).
+	 * In this case, we want to proceed regardless if we're
+	 * overflowing; thus we don't use a while loop here.
 	 */
-	if (!arc_evict_needed(type)) {
-		if (type == ARC_BUFC_METADATA) {
-			buf->b_data = zio_buf_alloc(size);
-			arc_space_consume(size, ARC_SPACE_META);
-		} else {
-			ASSERT(type == ARC_BUFC_DATA);
-			buf->b_data = zio_data_buf_alloc(size);
-			arc_space_consume(size, ARC_SPACE_DATA);
+	if (arc_is_overflowing()) {
+		mutex_enter(&arc_reclaim_lock);
+
+		/*
+		 * Now that we've acquired the lock, we may no longer be
+		 * over the overflow limit, let's check.
+		 *
+		 * We're ignoring the case of spurious wake ups. If that
+		 * were to happen, it'd let this thread consume an ARC
+		 * buffer before it should have (i.e. before we're under
+		 * the overflow limit and were signalled by the reclaim
+		 * thread). As long as that is a rare occurrence, it
+		 * shouldn't cause any harm.
+		 */
+		if (arc_is_overflowing()) {
+			cv_signal(&arc_reclaim_thread_cv);
+			cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
 		}
-		goto out;
-	}
 
-	/*
-	 * If we are prefetching from the mfu ghost list, this buffer
-	 * will end up on the mru list; so steal space from there.
-	 */
-	if (state == arc_mfu_ghost)
-		state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu;
-	else if (state == arc_mru_ghost)
-		state = arc_mru;
-
-	if (state == arc_mru || state == arc_anon) {
-		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
-		state = (arc_mfu->arcs_lsize[type] >= size &&
-		    arc_p > mru_used) ? arc_mfu : arc_mru;
-	} else {
-		/* MFU cases */
-		uint64_t mfu_space = arc_c - arc_p;
-		state =  (arc_mru->arcs_lsize[type] >= size &&
-		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
+		mutex_exit(&arc_reclaim_lock);
 	}
 
-	/*
-	 * Evict data buffers prior to metadata buffers, unless we're
-	 * over the metadata limit and adding a metadata buffer.
-	 */
 	if (type == ARC_BUFC_METADATA) {
-		if (arc_meta_used >= arc_meta_limit)
-			evict = ARC_BUFC_METADATA;
-		else
-			/*
-			 * In this case, we're evicting data while
-			 * adding metadata. Thus, to prevent recycling a
-			 * data buffer into a metadata buffer, recycling
-			 * is disabled in the following arc_evict call.
-			 */
-			recycle = FALSE;
+		buf->b_data = zio_buf_alloc(size);
+		arc_space_consume(size, ARC_SPACE_META);
+	} else {
+		ASSERT(type == ARC_BUFC_DATA);
+		buf->b_data = zio_data_buf_alloc(size);
+		arc_space_consume(size, ARC_SPACE_DATA);
 	}
 
-	if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) {
-		if (type == ARC_BUFC_METADATA) {
-			buf->b_data = zio_buf_alloc(size);
-			arc_space_consume(size, ARC_SPACE_META);
-
-			/*
-			 * If we are unable to recycle an existing meta buffer
-			 * signal the reclaim thread.  It will notify users
-			 * via the prune callback to drop references.  The
-			 * prune callback in run in the context of the reclaim
-			 * thread to avoid deadlocking on the hash_lock.
-			 * Of course, only do this when recycle is true.
-			 */
-			if (recycle)
-				cv_signal(&arc_reclaim_thr_cv);
-		} else {
-			ASSERT(type == ARC_BUFC_DATA);
-			buf->b_data = zio_data_buf_alloc(size);
-			arc_space_consume(size, ARC_SPACE_DATA);
-		}
-
-		/* Only bump this if we tried to recycle and failed */
-		if (recycle)
-			ARCSTAT_BUMP(arcstat_recycle_miss);
-	}
-	ASSERT(buf->b_data != NULL);
-out:
 	/*
 	 * Update the state size.  Note that ghost states have a
 	 * "ghost size" and so don't need to be updated.
 	 */
 	if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
 		arc_buf_hdr_t *hdr = buf->b_hdr;
 
 		atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size);
-		if (list_link_active(&hdr->b_l1hdr.b_arc_node)) {
+
+		/*
+		 * If this is reached via arc_read, the link is
+		 * protected by the hash lock. If reached via
+		 * arc_buf_alloc, the header should not be accessed by
+		 * any other thread. And, if reached via arc_read_done,
+		 * the hash lock will protect it if it's found in the
+		 * hash table; otherwise no other thread should be
+		 * calling add_reference() or remove_reference() on it.
+		 */
+		if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 			atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
 			    size);
 		}
 		/*
 		 * If we are growing the cache, and we are adding anonymous
 		 * data, and we have outgrown arc_p, update arc_p
 		 */
-		if (!zfs_arc_p_aggressive_disable &&
-		    arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
+		if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
 		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
 			arc_p = MIN(arc_c, arc_p + size);
 	}
 }
 
 /*
  * This routine is called whenever a buffer is accessed.
  * NOTE: the hash lock is dropped in this function.
  */
 static void
 arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
 	clock_t now;
 
 	ASSERT(MUTEX_HELD(hash_lock));
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	if (hdr->b_l1hdr.b_state == arc_anon) {
 		/*
 		 * This buffer is not in the cache, and does not
 		 * appear in our "ghost" list.  Add the new buffer
 		 * to the MRU state.
 		 */
 
 		ASSERT0(hdr->b_l1hdr.b_arc_access);
 		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		arc_change_state(arc_mru, hdr, hash_lock);
 
 	} else if (hdr->b_l1hdr.b_state == arc_mru) {
 		now = ddi_get_lbolt();
 
 		/*
 		 * If this buffer is here because of a prefetch, then either:
 		 * - clear the flag if this is a "referencing" read
 		 *   (any subsequent access will bump this into the MFU state).
 		 * or
 		 * - move the buffer to the head of the list if this is
 		 *   another prefetch (to make it less likely to be evicted).
 		 */
 		if (HDR_PREFETCH(hdr)) {
 			if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
-				ASSERT(list_link_active(
+				/* link protected by hash lock */
+				ASSERT(multilist_link_active(
 				    &hdr->b_l1hdr.b_arc_node));
 			} else {
 				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
 				atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
 				ARCSTAT_BUMP(arcstat_mru_hits);
 			}
 			hdr->b_l1hdr.b_arc_access = now;
 			return;
 		}
 
 		/*
 		 * This buffer has been "accessed" only once so far,
 		 * but it is still in the cache. Move it to the MFU
 		 * state.
 		 */
 		if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
 		    ARC_MINTIME)) {
 			/*
 			 * More than 125ms have passed since we
 			 * instantiated this buffer.  Move it to the
 			 * most frequently used state.
 			 */
 			hdr->b_l1hdr.b_arc_access = now;
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 			arc_change_state(arc_mfu, hdr, hash_lock);
 		}
 		atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
 		ARCSTAT_BUMP(arcstat_mru_hits);
 	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
 		arc_state_t	*new_state;
 		/*
 		 * This buffer has been "accessed" recently, but
 		 * was evicted from the cache.  Move it to the
 		 * MFU state.
 		 */
 
 		if (HDR_PREFETCH(hdr)) {
 			new_state = arc_mru;
 			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
 				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		} else {
 			new_state = arc_mfu;
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		}
 
 		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
 		arc_change_state(new_state, hdr, hash_lock);
 
 		atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
 	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
 		/*
 		 * This buffer has been accessed more than once and is
 		 * still in the cache.  Keep it in the MFU state.
 		 *
 		 * NOTE: an add_reference() that occurred when we did
 		 * the arc_read() will have kicked this off the list.
 		 * If it was a prefetch, we will explicitly move it to
 		 * the head of the list now.
 		 */
 		if ((HDR_PREFETCH(hdr)) != 0) {
 			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
-			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
+			/* link protected by hash_lock */
+			ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		}
 		atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
 		ARCSTAT_BUMP(arcstat_mfu_hits);
 		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
 	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
 		arc_state_t	*new_state = arc_mfu;
 		/*
 		 * This buffer has been accessed more than once but has
 		 * been evicted from the cache.  Move it back to the
 		 * MFU state.
 		 */
 
 		if (HDR_PREFETCH(hdr)) {
 			/*
 			 * This is a prefetch access...
 			 * move this block back to the MRU state.
 			 */
 			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
 			new_state = arc_mru;
 		}
 
 		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		arc_change_state(new_state, hdr, hash_lock);
 
 		atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
 		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
 	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
 		/*
 		 * This buffer is on the 2nd Level ARC.
 		 */
 
 		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		arc_change_state(arc_mfu, hdr, hash_lock);
 	} else {
 		cmn_err(CE_PANIC, "invalid arc state 0x%p",
 		    hdr->b_l1hdr.b_state);
 	}
 }
 
 /* a generic arc_done_func_t which you can use */
 /* ARGSUSED */
 void
 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
 	if (zio == NULL || zio->io_error == 0)
 		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
 	VERIFY(arc_buf_remove_ref(buf, arg));
 }
 
 /* a generic arc_done_func_t */
 void
 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
 	arc_buf_t **bufp = arg;
 	if (zio && zio->io_error) {
 		VERIFY(arc_buf_remove_ref(buf, arg));
 		*bufp = NULL;
 	} else {
 		*bufp = buf;
 		ASSERT(buf->b_data);
 	}
 }
 
 static void
 arc_read_done(zio_t *zio)
 {
 	arc_buf_hdr_t	*hdr;
 	arc_buf_t	*buf;
 	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
 	kmutex_t	*hash_lock = NULL;
 	arc_callback_t	*callback_list, *acb;
 	int		freeable = FALSE;
 
 	buf = zio->io_private;
 	hdr = buf->b_hdr;
 
 	/*
 	 * The hdr was inserted into hash-table and removed from lists
 	 * prior to starting I/O.  We should find this header, since
 	 * it's in the hash table, and it should be legit since it's
 	 * not possible to evict it during the I/O.  The only possible
 	 * reason for it not to be found is if we were freed during the
 	 * read.
 	 */
 	if (HDR_IN_HASH_TABLE(hdr)) {
 		arc_buf_hdr_t *found;
 
 		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
 		ASSERT3U(hdr->b_dva.dva_word[0], ==,
 		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
 		ASSERT3U(hdr->b_dva.dva_word[1], ==,
 		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
 
 		found = buf_hash_find(hdr->b_spa, zio->io_bp,
 		    &hash_lock);
 
 		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
 		    hash_lock == NULL) ||
 		    (found == hdr &&
 		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
 		    (found == hdr && HDR_L2_READING(hdr)));
 	}
 
 	hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
 	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
 		hdr->b_flags &= ~ARC_FLAG_L2CACHE;
 
 	/* byteswap if necessary */
 	callback_list = hdr->b_l1hdr.b_acb;
 	ASSERT(callback_list != NULL);
 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
 		dmu_object_byteswap_t bswap =
 		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
 		if (BP_GET_LEVEL(zio->io_bp) > 0)
 		    byteswap_uint64_array(buf->b_data, hdr->b_size);
 		else
 		    dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size);
 	}
 
 	arc_cksum_compute(buf, B_FALSE);
 	arc_buf_watch(buf);
 
 	if (hash_lock && zio->io_error == 0 &&
 	    hdr->b_l1hdr.b_state == arc_anon) {
 		/*
 		 * Only call arc_access on anonymous buffers.  This is because
 		 * if we've issued an I/O for an evicted buffer, we've already
 		 * called arc_access (to prevent any simultaneous readers from
 		 * getting confused).
 		 */
 		arc_access(hdr, hash_lock);
 	}
 
 	/* create copies of the data buffer for the callers */
 	abuf = buf;
 	for (acb = callback_list; acb; acb = acb->acb_next) {
 		if (acb->acb_done) {
 			if (abuf == NULL) {
 				ARCSTAT_BUMP(arcstat_duplicate_reads);
 				abuf = arc_buf_clone(buf);
 			}
 			acb->acb_buf = abuf;
 			abuf = NULL;
 		}
 	}
 	hdr->b_l1hdr.b_acb = NULL;
 	hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
 	ASSERT(!HDR_BUF_AVAILABLE(hdr));
 	if (abuf == buf) {
 		ASSERT(buf->b_efunc == NULL);
 		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
 		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
 	}
 
 	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
 	    callback_list != NULL);
 
 	if (zio->io_error != 0) {
 		hdr->b_flags |= ARC_FLAG_IO_ERROR;
 		if (hdr->b_l1hdr.b_state != arc_anon)
 			arc_change_state(arc_anon, hdr, hash_lock);
 		if (HDR_IN_HASH_TABLE(hdr))
 			buf_hash_remove(hdr);
 		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
 	}
 
 	/*
 	 * Broadcast before we drop the hash_lock to avoid the possibility
 	 * that the hdr (and hence the cv) might be freed before we get to
 	 * the cv_broadcast().
 	 */
 	cv_broadcast(&hdr->b_l1hdr.b_cv);
 
 	if (hash_lock != NULL) {
 		mutex_exit(hash_lock);
 	} else {
 		/*
 		 * This block was freed while we waited for the read to
 		 * complete.  It has been removed from the hash table and
 		 * moved to the anonymous state (so that it won't show up
 		 * in the cache).
 		 */
 		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
 	}
 
 	/* execute each callback and free its structure */
 	while ((acb = callback_list) != NULL) {
 		if (acb->acb_done)
 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
 
 		if (acb->acb_zio_dummy != NULL) {
 			acb->acb_zio_dummy->io_error = zio->io_error;
 			zio_nowait(acb->acb_zio_dummy);
 		}
 
 		callback_list = acb->acb_next;
 		kmem_free(acb, sizeof (arc_callback_t));
 	}
 
 	if (freeable)
 		arc_hdr_destroy(hdr);
 }
 
 /*
  * "Read" the block at the specified DVA (in bp) via the
  * cache.  If the block is found in the cache, invoke the provided
  * callback immediately and return.  Note that the `zio' parameter
  * in the callback will be NULL in this case, since no IO was
  * required.  If the block is not in the cache pass the read request
  * on to the spa with a substitute callback function, so that the
  * requested block will be added to the cache.
  *
  * If a read request arrives for a block that has a read in-progress,
  * either wait for the in-progress read to complete (and return the
  * results); or, if this is a read with a "done" func, add a record
  * to the read to invoke the "done" func when the read completes,
  * and return; or just return.
  *
  * arc_read_done() will invoke all the requested "done" functions
  * for readers of this block.
  */
 int
 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
     void *private, zio_priority_t priority, int zio_flags,
     arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
 {
 	arc_buf_hdr_t *hdr = NULL;
 	arc_buf_t *buf = NULL;
 	kmutex_t *hash_lock = NULL;
 	zio_t *rzio;
 	uint64_t guid = spa_load_guid(spa);
 	int rc = 0;
 
 	ASSERT(!BP_IS_EMBEDDED(bp) ||
 	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
 
 top:
 	if (!BP_IS_EMBEDDED(bp)) {
 		/*
 		 * Embedded BP's have no DVA and require no I/O to "read".
 		 * Create an anonymous arc buf to back it.
 		 */
 		hdr = buf_hash_find(guid, bp, &hash_lock);
 	}
 
 	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
 
 		*arc_flags |= ARC_FLAG_CACHED;
 
 		if (HDR_IO_IN_PROGRESS(hdr)) {
 
 			if (*arc_flags & ARC_FLAG_WAIT) {
 				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
 				mutex_exit(hash_lock);
 				goto top;
 			}
 			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
 
 			if (done) {
 				arc_callback_t	*acb = NULL;
 
 				acb = kmem_zalloc(sizeof (arc_callback_t),
 				    KM_SLEEP);
 				acb->acb_done = done;
 				acb->acb_private = private;
 				if (pio != NULL)
 					acb->acb_zio_dummy = zio_null(pio,
 					    spa, NULL, NULL, NULL, zio_flags);
 
 				ASSERT(acb->acb_done != NULL);
 				acb->acb_next = hdr->b_l1hdr.b_acb;
 				hdr->b_l1hdr.b_acb = acb;
 				add_reference(hdr, hash_lock, private);
 				mutex_exit(hash_lock);
 				goto out;
 			}
 			mutex_exit(hash_lock);
 			goto out;
 		}
 
 		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
 		    hdr->b_l1hdr.b_state == arc_mfu);
 
 		if (done) {
 			add_reference(hdr, hash_lock, private);
 			/*
 			 * If this block is already in use, create a new
 			 * copy of the data so that we will be guaranteed
 			 * that arc_release() will always succeed.
 			 */
 			buf = hdr->b_l1hdr.b_buf;
 			ASSERT(buf);
 			ASSERT(buf->b_data);
 			if (HDR_BUF_AVAILABLE(hdr)) {
 				ASSERT(buf->b_efunc == NULL);
 				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
 			} else {
 				buf = arc_buf_clone(buf);
 			}
 
 		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
 		    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
 			hdr->b_flags |= ARC_FLAG_PREFETCH;
 		}
 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 		arc_access(hdr, hash_lock);
 		if (*arc_flags & ARC_FLAG_L2CACHE)
 			hdr->b_flags |= ARC_FLAG_L2CACHE;
 		if (*arc_flags & ARC_FLAG_L2COMPRESS)
 			hdr->b_flags |= ARC_FLAG_L2COMPRESS;
 		mutex_exit(hash_lock);
 		ARCSTAT_BUMP(arcstat_hits);
 		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
 		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
 		    data, metadata, hits);
 
 		if (done)
 			done(NULL, buf, private);
 	} else {
 		uint64_t size = BP_GET_LSIZE(bp);
 		arc_callback_t *acb;
 		vdev_t *vd = NULL;
 		uint64_t addr = 0;
 		boolean_t devw = B_FALSE;
 		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
 		int32_t b_asize = 0;
 
 		/*
 		 * Gracefully handle a damaged logical block size as a
 		 * checksum error by passing a dummy zio to the done callback.
 		 */
 		if (size > spa_maxblocksize(spa)) {
 			if (done) {
 				rzio = zio_null(pio, spa, NULL,
 				    NULL, NULL, zio_flags);
 				rzio->io_error = ECKSUM;
 				done(rzio, buf, private);
 				zio_nowait(rzio);
 			}
 			rc = ECKSUM;
 			goto out;
 		}
 
 		if (hdr == NULL) {
 			/* this block is not in the cache */
 			arc_buf_hdr_t *exists = NULL;
 			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
 			buf = arc_buf_alloc(spa, size, private, type);
 			hdr = buf->b_hdr;
 			if (!BP_IS_EMBEDDED(bp)) {
 				hdr->b_dva = *BP_IDENTITY(bp);
 				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
 				exists = buf_hash_insert(hdr, &hash_lock);
 			}
 			if (exists != NULL) {
 				/* somebody beat us to the hash insert */
 				mutex_exit(hash_lock);
 				buf_discard_identity(hdr);
 				(void) arc_buf_remove_ref(buf, private);
 				goto top; /* restart the IO request */
 			}
 
 			/* if this is a prefetch, we don't have a reference */
 			if (*arc_flags & ARC_FLAG_PREFETCH) {
 				(void) remove_reference(hdr, hash_lock,
 				    private);
 				hdr->b_flags |= ARC_FLAG_PREFETCH;
 			}
 			if (*arc_flags & ARC_FLAG_L2CACHE)
 				hdr->b_flags |= ARC_FLAG_L2CACHE;
 			if (*arc_flags & ARC_FLAG_L2COMPRESS)
 				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
 			if (BP_GET_LEVEL(bp) > 0)
 				hdr->b_flags |= ARC_FLAG_INDIRECT;
 		} else {
 			/*
 			 * This block is in the ghost cache. If it was L2-only
 			 * (and thus didn't have an L1 hdr), we realloc the
 			 * header to add an L1 hdr.
 			 */
 			if (!HDR_HAS_L1HDR(hdr)) {
 				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
 				    hdr_full_cache);
 			}
 
 			ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
-			ASSERT(hdr->b_l1hdr.b_buf == NULL);
+			ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 
 			/* if this is a prefetch, we don't have a reference */
 			if (*arc_flags & ARC_FLAG_PREFETCH)
 				hdr->b_flags |= ARC_FLAG_PREFETCH;
 			else
 				add_reference(hdr, hash_lock, private);
 			if (*arc_flags & ARC_FLAG_L2CACHE)
 				hdr->b_flags |= ARC_FLAG_L2CACHE;
 			if (*arc_flags & ARC_FLAG_L2COMPRESS)
 				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
 			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 			buf->b_hdr = hdr;
 			buf->b_data = NULL;
 			buf->b_efunc = NULL;
 			buf->b_private = NULL;
 			buf->b_next = NULL;
 			hdr->b_l1hdr.b_buf = buf;
 			ASSERT0(hdr->b_l1hdr.b_datacnt);
 			hdr->b_l1hdr.b_datacnt = 1;
 			arc_get_data_buf(buf);
 			arc_access(hdr, hash_lock);
 		}
 
 		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
 
 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
 		acb->acb_done = done;
 		acb->acb_private = private;
 
 		ASSERT(hdr->b_l1hdr.b_acb == NULL);
 		hdr->b_l1hdr.b_acb = acb;
 		hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
 
 		if (HDR_HAS_L2HDR(hdr) &&
 		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
 			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
 			addr = hdr->b_l2hdr.b_daddr;
 			b_compress = HDR_GET_COMPRESS(hdr);
 			b_asize = hdr->b_l2hdr.b_asize;
 			/*
 			 * Lock out device removal.
 			 */
 			if (vdev_is_dead(vd) ||
 			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
 				vd = NULL;
 		}
 
 		if (hash_lock != NULL)
 			mutex_exit(hash_lock);
 
 		/*
 		 * At this point, we have a level 1 cache miss.  Try again in
 		 * L2ARC if possible.
 		 */
 		ASSERT3U(hdr->b_size, ==, size);
 		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
 		    uint64_t, size, zbookmark_phys_t *, zb);
 		ARCSTAT_BUMP(arcstat_misses);
 		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
 		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
 		    data, metadata, misses);
 
 		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
 			/*
 			 * Read from the L2ARC if the following are true:
 			 * 1. The L2ARC vdev was previously cached.
 			 * 2. This buffer still has L2ARC metadata.
 			 * 3. This buffer isn't currently writing to the L2ARC.
 			 * 4. The L2ARC entry wasn't evicted, which may
 			 *    also have invalidated the vdev.
 			 * 5. This isn't prefetch and l2arc_noprefetch is set.
 			 */
 			if (HDR_HAS_L2HDR(hdr) &&
 			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
 			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
 				l2arc_read_callback_t *cb;
 
 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_hits);
 				atomic_inc_32(&hdr->b_l2hdr.b_hits);
 
 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
 				    KM_SLEEP);
 				cb->l2rcb_buf = buf;
 				cb->l2rcb_spa = spa;
 				cb->l2rcb_bp = *bp;
 				cb->l2rcb_zb = *zb;
 				cb->l2rcb_flags = zio_flags;
 				cb->l2rcb_compress = b_compress;
 
 				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
 				    addr + size < vd->vdev_psize -
 				    VDEV_LABEL_END_SIZE);
 
 				/*
 				 * l2arc read.  The SCL_L2ARC lock will be
 				 * released by l2arc_read_done().
 				 * Issue a null zio if the underlying buffer
 				 * was squashed to zero size by compression.
 				 */
 				if (b_compress == ZIO_COMPRESS_EMPTY) {
 					rzio = zio_null(pio, spa, vd,
 					    l2arc_read_done, cb,
 					    zio_flags | ZIO_FLAG_DONT_CACHE |
 					    ZIO_FLAG_CANFAIL |
 					    ZIO_FLAG_DONT_PROPAGATE |
 					    ZIO_FLAG_DONT_RETRY);
 				} else {
 					rzio = zio_read_phys(pio, vd, addr,
 					    b_asize, buf->b_data,
 					    ZIO_CHECKSUM_OFF,
 					    l2arc_read_done, cb, priority,
 					    zio_flags | ZIO_FLAG_DONT_CACHE |
 					    ZIO_FLAG_CANFAIL |
 					    ZIO_FLAG_DONT_PROPAGATE |
 					    ZIO_FLAG_DONT_RETRY, B_FALSE);
 				}
 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
 				    zio_t *, rzio);
 				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
 
 				if (*arc_flags & ARC_FLAG_NOWAIT) {
 					zio_nowait(rzio);
 					goto out;
 				}
 
 				ASSERT(*arc_flags & ARC_FLAG_WAIT);
 				if (zio_wait(rzio) == 0)
 					goto out;
 
 				/* l2arc read error; goto zio_read() */
 			} else {
 				DTRACE_PROBE1(l2arc__miss,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_misses);
 				if (HDR_L2_WRITING(hdr))
 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
 				spa_config_exit(spa, SCL_L2ARC, vd);
 			}
 		} else {
 			if (vd != NULL)
 				spa_config_exit(spa, SCL_L2ARC, vd);
 			if (l2arc_ndev != 0) {
 				DTRACE_PROBE1(l2arc__miss,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_misses);
 			}
 		}
 
 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
 		    arc_read_done, buf, priority, zio_flags, zb);
 
 		if (*arc_flags & ARC_FLAG_WAIT) {
 			rc = zio_wait(rzio);
 			goto out;
 		}
 
 		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
 		zio_nowait(rzio);
 	}
 
 out:
 	spa_read_history_add(spa, zb, *arc_flags);
 	return (rc);
 }
 
 arc_prune_t *
 arc_add_prune_callback(arc_prune_func_t *func, void *private)
 {
 	arc_prune_t *p;
 
 	p = kmem_alloc(sizeof (*p), KM_SLEEP);
 	p->p_pfunc = func;
 	p->p_private = private;
 	list_link_init(&p->p_node);
 	refcount_create(&p->p_refcnt);
 
 	mutex_enter(&arc_prune_mtx);
 	refcount_add(&p->p_refcnt, &arc_prune_list);
 	list_insert_head(&arc_prune_list, p);
 	mutex_exit(&arc_prune_mtx);
 
 	return (p);
 }
 
 void
 arc_remove_prune_callback(arc_prune_t *p)
 {
 	mutex_enter(&arc_prune_mtx);
 	list_remove(&arc_prune_list, p);
 	if (refcount_remove(&p->p_refcnt, &arc_prune_list) == 0) {
 		refcount_destroy(&p->p_refcnt);
 		kmem_free(p, sizeof (*p));
 	}
 	mutex_exit(&arc_prune_mtx);
 }
 
 void
 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
 {
 	ASSERT(buf->b_hdr != NULL);
 	ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
 	    func == NULL);
 	ASSERT(buf->b_efunc == NULL);
 	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
 
 	buf->b_efunc = func;
 	buf->b_private = private;
 }
 
 /*
  * Notify the arc that a block was freed, and thus will never be used again.
  */
 void
 arc_freed(spa_t *spa, const blkptr_t *bp)
 {
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 	uint64_t guid = spa_load_guid(spa);
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 
 	hdr = buf_hash_find(guid, bp, &hash_lock);
 	if (hdr == NULL)
 		return;
 	if (HDR_BUF_AVAILABLE(hdr)) {
 		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
 		add_reference(hdr, hash_lock, FTAG);
 		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
 		mutex_exit(hash_lock);
 
 		arc_release(buf, FTAG);
 		(void) arc_buf_remove_ref(buf, FTAG);
 	} else {
 		mutex_exit(hash_lock);
 	}
 
 }
 
 /*
  * Clear the user eviction callback set by arc_set_callback(), first calling
  * it if it exists.  Because the presence of a callback keeps an arc_buf cached
  * clearing the callback may result in the arc_buf being destroyed.  However,
  * it will not result in the *last* arc_buf being destroyed, hence the data
  * will remain cached in the ARC. We make a copy of the arc buffer here so
  * that we can process the callback without holding any locks.
  *
  * It's possible that the callback is already in the process of being cleared
  * by another thread.  In this case we can not clear the callback.
  *
  * Returns B_TRUE if the callback was successfully called and cleared.
  */
 boolean_t
 arc_clear_callback(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 	arc_evict_func_t *efunc = buf->b_efunc;
 	void *private = buf->b_private;
 
 	mutex_enter(&buf->b_evict_lock);
 	hdr = buf->b_hdr;
 	if (hdr == NULL) {
 		/*
 		 * We are in arc_do_user_evicts().
 		 */
 		ASSERT(buf->b_data == NULL);
 		mutex_exit(&buf->b_evict_lock);
 		return (B_FALSE);
 	} else if (buf->b_data == NULL) {
 		/*
 		 * We are on the eviction list; process this buffer now
 		 * but let arc_do_user_evicts() do the reaping.
 		 */
 		buf->b_efunc = NULL;
 		mutex_exit(&buf->b_evict_lock);
 		VERIFY0(efunc(private));
 		return (B_TRUE);
 	}
 	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 	hdr = buf->b_hdr;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 	ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
 	    hdr->b_l1hdr.b_datacnt);
 	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
 	    hdr->b_l1hdr.b_state == arc_mfu);
 
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
 
 	if (hdr->b_l1hdr.b_datacnt > 1) {
 		mutex_exit(&buf->b_evict_lock);
-		arc_buf_destroy(buf, FALSE, TRUE);
+		arc_buf_destroy(buf, TRUE);
 	} else {
 		ASSERT(buf == hdr->b_l1hdr.b_buf);
 		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
 		mutex_exit(&buf->b_evict_lock);
 	}
 
 	mutex_exit(hash_lock);
 	VERIFY0(efunc(private));
 	return (B_TRUE);
 }
 
 /*
  * Release this buffer from the cache, making it an anonymous buffer.  This
  * must be done after a read and prior to modifying the buffer contents.
  * If the buffer has more than one reference, we must make
  * a new hdr for the buffer.
  */
 void
 arc_release(arc_buf_t *buf, void *tag)
 {
 	kmutex_t *hash_lock;
 	arc_state_t *state;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
-	 * It would be nice to assert that if it's DMU metadata (level >
+	 * It would be nice to assert that if it's DMU metadata (level >
 	 * 0 || it's the dnode file), then it must be syncing context.
 	 * But we don't know that information at this level.
 	 */
 
 	mutex_enter(&buf->b_evict_lock);
 
+	ASSERT(HDR_HAS_L1HDR(hdr));
+
 	/*
 	 * We don't grab the hash lock prior to this check, because if
 	 * the buffer's header is in the arc_anon state, it won't be
 	 * linked into the hash table.
 	 */
 	if (hdr->b_l1hdr.b_state == arc_anon) {
 		mutex_exit(&buf->b_evict_lock);
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
 		ASSERT(!HDR_HAS_L2HDR(hdr));
 		ASSERT(BUF_EMPTY(hdr));
 
 		ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
 		ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
 		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
 
 		ASSERT3P(buf->b_efunc, ==, NULL);
 		ASSERT3P(buf->b_private, ==, NULL);
 
 		hdr->b_l1hdr.b_arc_access = 0;
 		arc_buf_thaw(buf);
 
 		return;
 	}
 
 	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 
 	/*
 	 * This assignment is only valid as long as the hash_lock is
 	 * held, we must be careful not to reference state or the
 	 * b_state field after dropping the lock.
 	 */
 	state = hdr->b_l1hdr.b_state;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	ASSERT3P(state, !=, arc_anon);
 
 	/* this buffer is not on any list */
 	ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
 
 	if (HDR_HAS_L2HDR(hdr)) {
 		ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
 		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
 
 		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
 		list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr);
+
+		/*
+		 * We don't want to leak the b_tmp_cdata buffer that was
+		 * allocated in l2arc_write_buffers().
+		 */
+		arc_buf_l2_cdata_free(hdr);
+
 		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
 
 		hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
 	}
 
 	/*
 	 * Do we have more than one buf?
 	 */
 	if (hdr->b_l1hdr.b_datacnt > 1) {
 		arc_buf_hdr_t *nhdr;
 		arc_buf_t **bufp;
 		uint64_t blksz = hdr->b_size;
 		uint64_t spa = hdr->b_spa;
 		arc_buf_contents_t type = arc_buf_type(hdr);
 		uint32_t flags = hdr->b_flags;
 
 		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
 		/*
 		 * Pull the data off of this hdr and attach it to
 		 * a new anonymous hdr.
 		 */
 		(void) remove_reference(hdr, hash_lock, tag);
 		bufp = &hdr->b_l1hdr.b_buf;
 		while (*bufp != buf)
 			bufp = &(*bufp)->b_next;
 		*bufp = buf->b_next;
 		buf->b_next = NULL;
 
 		ASSERT3P(state, !=, arc_l2c_only);
 		ASSERT3U(state->arcs_size, >=, hdr->b_size);
 		atomic_add_64(&state->arcs_size, -hdr->b_size);
 		if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
 			uint64_t *size;
 
 			ASSERT3P(state, !=, arc_l2c_only);
 			size = &state->arcs_lsize[type];
 			ASSERT3U(*size, >=, hdr->b_size);
 			atomic_add_64(size, -hdr->b_size);
 		}
 
 		/*
 		 * We're releasing a duplicate user data buffer, update
 		 * our statistics accordingly.
 		 */
 		if (HDR_ISTYPE_DATA(hdr)) {
 			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
 			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
 			    -hdr->b_size);
 		}
 		hdr->b_l1hdr.b_datacnt -= 1;
 		arc_cksum_verify(buf);
 		arc_buf_unwatch(buf);
 
 		mutex_exit(hash_lock);
 
 		nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
 		nhdr->b_size = blksz;
 		nhdr->b_spa = spa;
 
 		nhdr->b_l1hdr.b_mru_hits = 0;
 		nhdr->b_l1hdr.b_mru_ghost_hits = 0;
 		nhdr->b_l1hdr.b_mfu_hits = 0;
 		nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
 		nhdr->b_l1hdr.b_l2_hits = 0;
 		nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
 		nhdr->b_flags |= arc_bufc_to_flags(type);
 		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
 
 		nhdr->b_l1hdr.b_buf = buf;
 		nhdr->b_l1hdr.b_datacnt = 1;
 		nhdr->b_l1hdr.b_state = arc_anon;
 		nhdr->b_l1hdr.b_arc_access = 0;
+		nhdr->b_l1hdr.b_tmp_cdata = NULL;
 		nhdr->b_freeze_cksum = NULL;
 
 		(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
 		buf->b_hdr = nhdr;
 		mutex_exit(&buf->b_evict_lock);
 		atomic_add_64(&arc_anon->arcs_size, blksz);
 	} else {
 		mutex_exit(&buf->b_evict_lock);
 		ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
-		/* protected by hash lock */
-		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+		/* protected by hash lock, or hdr is on arc_anon */
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		hdr->b_l1hdr.b_mru_hits = 0;
 		hdr->b_l1hdr.b_mru_ghost_hits = 0;
 		hdr->b_l1hdr.b_mfu_hits = 0;
 		hdr->b_l1hdr.b_mfu_ghost_hits = 0;
 		hdr->b_l1hdr.b_l2_hits = 0;
 		arc_change_state(arc_anon, hdr, hash_lock);
 		hdr->b_l1hdr.b_arc_access = 0;
 		mutex_exit(hash_lock);
 
 		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 	}
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
 }
 
 int
 arc_released(arc_buf_t *buf)
 {
 	int released;
 
 	mutex_enter(&buf->b_evict_lock);
 	released = (buf->b_data != NULL &&
 	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
 	mutex_exit(&buf->b_evict_lock);
 	return (released);
 }
 
 #ifdef ZFS_DEBUG
 int
 arc_referenced(arc_buf_t *buf)
 {
 	int referenced;
 
 	mutex_enter(&buf->b_evict_lock);
 	referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
 	mutex_exit(&buf->b_evict_lock);
 	return (referenced);
 }
 #endif
 
 static void
 arc_write_ready(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
 	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
 	callback->awcb_ready(zio, buf, callback->awcb_private);
 
 	/*
 	 * If the IO is already in progress, then this is a re-write
 	 * attempt, so we need to thaw and re-compute the cksum.
 	 * It is the responsibility of the callback to handle the
 	 * accounting for any re-write attempt.
 	 */
 	if (HDR_IO_IN_PROGRESS(hdr)) {
 		mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
 		if (hdr->b_freeze_cksum != NULL) {
 			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
 			hdr->b_freeze_cksum = NULL;
 		}
 		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 	}
 	arc_cksum_compute(buf, B_FALSE);
 	hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
 }
 
 /*
  * The SPA calls this callback for each physical write that happens on behalf
  * of a logical write.  See the comment in dbuf_write_physdone() for details.
  */
 static void
 arc_write_physdone(zio_t *zio)
 {
 	arc_write_callback_t *cb = zio->io_private;
 	if (cb->awcb_physdone != NULL)
 		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
 }
 
 static void
 arc_write_done(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(hdr->b_l1hdr.b_acb == NULL);
 
 	if (zio->io_error == 0) {
 		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
 			buf_discard_identity(hdr);
 		} else {
 			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
 			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
 		}
 	} else {
 		ASSERT(BUF_EMPTY(hdr));
 	}
 
 	/*
 	 * If the block to be written was all-zero or compressed enough to be
 	 * embedded in the BP, no write was performed so there will be no
 	 * dva/birth/checksum.  The buffer must therefore remain anonymous
 	 * (and uncached).
 	 */
 	if (!BUF_EMPTY(hdr)) {
 		arc_buf_hdr_t *exists;
 		kmutex_t *hash_lock;
 
 		ASSERT(zio->io_error == 0);
 
 		arc_cksum_verify(buf);
 
 		exists = buf_hash_insert(hdr, &hash_lock);
 		if (exists != NULL) {
 			/*
 			 * This can only happen if we overwrite for
 			 * sync-to-convergence, because we remove
 			 * buffers from the hash table when we arc_free().
 			 */
 			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad overwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
 				ASSERT(refcount_is_zero(
 				    &exists->b_l1hdr.b_refcnt));
 				arc_change_state(arc_anon, exists, hash_lock);
 				mutex_exit(hash_lock);
 				arc_hdr_destroy(exists);
 				exists = buf_hash_insert(hdr, &hash_lock);
 				ASSERT3P(exists, ==, NULL);
 			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
 				/* nopwrite */
 				ASSERT(zio->io_prop.zp_nopwrite);
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad nopwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
 			} else {
 				/* Dedup */
 				ASSERT(hdr->b_l1hdr.b_datacnt == 1);
 				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
 				ASSERT(BP_GET_DEDUP(zio->io_bp));
 				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
 			}
 		}
 		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
 		/* if it's not anon, we are doing a scrub */
 		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
 			arc_access(hdr, hash_lock);
 		mutex_exit(hash_lock);
 	} else {
 		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
 	}
 
 	ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 	callback->awcb_done(zio, buf, callback->awcb_private);
 
 	kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
 zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
     arc_done_func_t *done, void *private, zio_priority_t priority,
     int zio_flags, const zbookmark_phys_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
 	zio_t *zio;
 
 	ASSERT(ready != NULL);
 	ASSERT(done != NULL);
 	ASSERT(!HDR_IO_ERROR(hdr));
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	ASSERT(hdr->b_l1hdr.b_acb == NULL);
 	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
 	if (l2arc)
 		hdr->b_flags |= ARC_FLAG_L2CACHE;
 	if (l2arc_compress)
 		hdr->b_flags |= ARC_FLAG_L2COMPRESS;
 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
 	callback->awcb_ready = ready;
 	callback->awcb_physdone = physdone;
 	callback->awcb_done = done;
 	callback->awcb_private = private;
 	callback->awcb_buf = buf;
 
 	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
 	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
 	    priority, zio_flags, zb);
 
 	return (zio);
 }
 
 static int
 arc_memory_throttle(uint64_t reserve, uint64_t txg)
 {
 #ifdef _KERNEL
 	if (zfs_arc_memory_throttle_disable)
 		return (0);
 
 	if (freemem <= physmem * arc_lotsfree_percent / 100) {
 		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
 		DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
 		return (SET_ERROR(EAGAIN));
 	}
 #endif
 	return (0);
 }
 
 void
 arc_tempreserve_clear(uint64_t reserve)
 {
 	atomic_add_64(&arc_tempreserve, -reserve);
 	ASSERT((int64_t)arc_tempreserve >= 0);
 }
 
 int
 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
 {
 	int error;
 	uint64_t anon_size;
 
 	if (reserve > arc_c/4 && !arc_no_grow)
 		arc_c = MIN(arc_c_max, reserve * 4);
 
 	/*
 	 * Throttle when the calculated memory footprint for the TXG
 	 * exceeds the target ARC size.
 	 */
 	if (reserve > arc_c) {
 		DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
 		return (SET_ERROR(ERESTART));
 	}
 
 	/*
 	 * Don't count loaned bufs as in flight dirty data to prevent long
 	 * network delays from blocking transactions that are ready to be
 	 * assigned to a txg.
 	 */
 	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
 
 	/*
 	 * Writes will, almost always, require additional memory allocations
 	 * in order to compress/encrypt/etc the data.  We therefore need to
 	 * make sure that there is sufficient available memory for this.
 	 */
 	error = arc_memory_throttle(reserve, txg);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Throttle writes when the amount of dirty data in the cache
 	 * gets too large.  We try to keep the cache less than half full
 	 * of dirty blocks so that our sync times don't grow too large.
 	 * Note: if two requests come in concurrently, we might let them
 	 * both succeed, when one of them should fail.  Not a huge deal.
 	 */
 
 	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
 	    anon_size > arc_c / 4) {
 		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
 		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
 		    arc_tempreserve>>10,
 		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
 		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
 		    reserve>>10, arc_c>>10);
 		DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
 		return (SET_ERROR(ERESTART));
 	}
 	atomic_add_64(&arc_tempreserve, reserve);
 	return (0);
 }
 
 static void
 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
 {
 	size->value.ui64 = state->arcs_size;
 	evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
 	evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
 }
 
 static int
 arc_kstat_update(kstat_t *ksp, int rw)
 {
 	arc_stats_t *as = ksp->ks_data;
 
 	if (rw == KSTAT_WRITE) {
 		return (SET_ERROR(EACCES));
 	} else {
 		arc_kstat_update_state(arc_anon,
 		    &as->arcstat_anon_size,
 		    &as->arcstat_anon_evict_data,
 		    &as->arcstat_anon_evict_metadata);
 		arc_kstat_update_state(arc_mru,
 		    &as->arcstat_mru_size,
 		    &as->arcstat_mru_evict_data,
 		    &as->arcstat_mru_evict_metadata);
 		arc_kstat_update_state(arc_mru_ghost,
 		    &as->arcstat_mru_ghost_size,
 		    &as->arcstat_mru_ghost_evict_data,
 		    &as->arcstat_mru_ghost_evict_metadata);
 		arc_kstat_update_state(arc_mfu,
 		    &as->arcstat_mfu_size,
 		    &as->arcstat_mfu_evict_data,
 		    &as->arcstat_mfu_evict_metadata);
 		arc_kstat_update_state(arc_mfu_ghost,
 		    &as->arcstat_mfu_ghost_size,
 		    &as->arcstat_mfu_ghost_evict_data,
 		    &as->arcstat_mfu_ghost_evict_metadata);
 	}
 
 	return (0);
 }
 
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the ARC eviction
+ * code is laid out; arc_evict_state() assumes ARC buffers are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+arc_state_multilist_index_func(multilist_t *ml, void *obj)
+{
+	arc_buf_hdr_t *hdr = obj;
+
+	/*
+	 * We rely on b_dva to generate evenly distributed index
+	 * numbers using buf_hash below. So, as an added precaution,
+	 * let's make sure we never add empty buffers to the arc lists.
+	 */
+	ASSERT(!BUF_EMPTY(hdr));
+
+	/*
+	 * The assumption here is that the hash value for a given
+	 * arc_buf_hdr_t will remain constant throughout its lifetime
+	 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
+	 * Thus, we don't need to store the header's sublist index
+	 * on insertion, as this index can be recalculated on removal.
+	 *
+	 * Also, the low order bits of the hash value are thought to be
+	 * distributed evenly. Otherwise, in the case that the multilist
+	 * has a power of two number of sublists, each sublist's usage
+	 * would not be evenly distributed.
+	 */
+	return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
+	    multilist_get_num_sublists(ml));
+}
+
 void
 arc_init(void)
 {
-	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
+
+	mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Convert seconds to clock ticks */
 	zfs_arc_min_prefetch_lifespan = 1 * hz;
 
 	/* Start out with 1/8 of all memory */
 	arc_c = physmem * PAGESIZE / 8;
 
 #ifdef _KERNEL
 	/*
 	 * On architectures where the physical memory can be larger
 	 * than the addressable space (intel in 32-bit mode), we may
 	 * need to limit the cache to 1/8 of VM size.
 	 */
 	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
 	/*
 	 * Register a shrinker to support synchronous (direct) memory
 	 * reclaim from the arc.  This is done to prevent kswapd from
 	 * swapping out pages when it is preferable to shrink the arc.
 	 */
 	spl_register_shrinker(&arc_shrinker);
 #endif
 
 	/* set min cache to zero */
 	arc_c_min = 4<<20;
 	/* set max to 1/2 of all memory */
 	arc_c_max = arc_c * 4;
 
 	/*
 	 * Allow the tunables to override our calculations if they are
 	 * reasonable (ie. over 64MB)
 	 */
 	if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
 		arc_c_max = zfs_arc_max;
 	if (zfs_arc_min > 0 && zfs_arc_min <= arc_c_max)
 		arc_c_min = zfs_arc_min;
 
 	arc_c = arc_c_max;
 	arc_p = (arc_c >> 1);
 
 	/* limit meta-data to 3/4 of the arc capacity */
 	arc_meta_limit = (3 * arc_c_max) / 4;
 	arc_meta_max = 0;
 
 	/* Allow the tunable to override if it is reasonable */
 	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
 		arc_meta_limit = zfs_arc_meta_limit;
 
+	if (zfs_arc_num_sublists_per_state < 1)
+		zfs_arc_num_sublists_per_state = num_online_cpus();
+
 	/* if kmem_flags are set, lets try to use less memory */
 	if (kmem_debugging())
 		arc_c = arc_c / 2;
 	if (arc_c < arc_c_min)
 		arc_c = arc_c_min;
 
 	arc_anon = &ARC_anon;
 	arc_mru = &ARC_mru;
 	arc_mru_ghost = &ARC_mru_ghost;
 	arc_mfu = &ARC_mfu;
 	arc_mfu_ghost = &ARC_mfu_ghost;
 	arc_l2c_only = &ARC_l2c_only;
 	arc_size = 0;
 
-	mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-
-	list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
+	multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
 	    sizeof (arc_buf_hdr_t),
-	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
-	list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
 	    sizeof (arc_buf_hdr_t),
-	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
-	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
 	    sizeof (arc_buf_hdr_t),
-	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
-	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
 	    sizeof (arc_buf_hdr_t),
-	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
-	list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
 	    sizeof (arc_buf_hdr_t),
-	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
-	list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
 	    sizeof (arc_buf_hdr_t),
-	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
-	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
 	    sizeof (arc_buf_hdr_t),
-	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
-	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
 	    sizeof (arc_buf_hdr_t),
-	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
-	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
 	    sizeof (arc_buf_hdr_t),
-	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
-	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
 	    sizeof (arc_buf_hdr_t),
-	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
 
 	arc_anon->arcs_state = ARC_STATE_ANON;
 	arc_mru->arcs_state = ARC_STATE_MRU;
 	arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
 	arc_mfu->arcs_state = ARC_STATE_MFU;
 	arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
 	arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
 
 	buf_init();
 
-	arc_thread_exit = 0;
+	arc_reclaim_thread_exit = FALSE;
+	arc_user_evicts_thread_exit = FALSE;
 	list_create(&arc_prune_list, sizeof (arc_prune_t),
 	    offsetof(arc_prune_t, p_node));
 	arc_eviction_list = NULL;
 	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
 	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
 
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 
 	if (arc_ksp != NULL) {
 		arc_ksp->ks_data = &arc_stats;
 		arc_ksp->ks_update = arc_kstat_update;
 		kstat_install(arc_ksp);
 	}
 
 	(void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0,
 	    TS_RUN, minclsyspri);
 
+	(void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
+	    TS_RUN, minclsyspri);
+
 	arc_dead = FALSE;
 	arc_warm = B_FALSE;
 
 	/*
 	 * Calculate maximum amount of dirty data per pool.
 	 *
 	 * If it has been set by a module parameter, take that.
 	 * Otherwise, use a percentage of physical memory defined by
 	 * zfs_dirty_data_max_percent (default 10%) with a cap at
 	 * zfs_dirty_data_max_max (default 25% of physical memory).
 	 */
 	if (zfs_dirty_data_max_max == 0)
 		zfs_dirty_data_max_max = physmem * PAGESIZE *
 		    zfs_dirty_data_max_max_percent / 100;
 
 	if (zfs_dirty_data_max == 0) {
 		zfs_dirty_data_max = physmem * PAGESIZE *
 		    zfs_dirty_data_max_percent / 100;
 		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
 		    zfs_dirty_data_max_max);
 	}
 }
 
 void
 arc_fini(void)
 {
 	arc_prune_t *p;
 
-	mutex_enter(&arc_reclaim_thr_lock);
 #ifdef _KERNEL
 	spl_unregister_shrinker(&arc_shrinker);
 #endif /* _KERNEL */
 
-	arc_thread_exit = 1;
-	while (arc_thread_exit != 0)
-		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
-	mutex_exit(&arc_reclaim_thr_lock);
+	mutex_enter(&arc_reclaim_lock);
+	arc_reclaim_thread_exit = TRUE;
+	/*
+	 * The reclaim thread will set arc_reclaim_thread_exit back to
+	 * FALSE when it is finished exiting; we're waiting for that.
+	 */
+	while (arc_reclaim_thread_exit) {
+		cv_signal(&arc_reclaim_thread_cv);
+		cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
+	}
+	mutex_exit(&arc_reclaim_lock);
+
+	mutex_enter(&arc_user_evicts_lock);
+	arc_user_evicts_thread_exit = TRUE;
+	/*
+	 * The user evicts thread will set arc_user_evicts_thread_exit
+	 * to FALSE when it is finished exiting; we're waiting for that.
+	 */
+	while (arc_user_evicts_thread_exit) {
+		cv_signal(&arc_user_evicts_cv);
+		cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
+	}
+	mutex_exit(&arc_user_evicts_lock);
 
-	arc_flush(NULL);
+	/* Use TRUE to ensure *all* buffers are evicted */
+	arc_flush(NULL, TRUE);
 
 	arc_dead = TRUE;
 
 	if (arc_ksp != NULL) {
 		kstat_delete(arc_ksp);
 		arc_ksp = NULL;
 	}
 
 	mutex_enter(&arc_prune_mtx);
 	while ((p = list_head(&arc_prune_list)) != NULL) {
 		list_remove(&arc_prune_list, p);
 		refcount_remove(&p->p_refcnt, &arc_prune_list);
 		refcount_destroy(&p->p_refcnt);
 		kmem_free(p, sizeof (*p));
 	}
 	mutex_exit(&arc_prune_mtx);
 
 	list_destroy(&arc_prune_list);
 	mutex_destroy(&arc_prune_mtx);
-	mutex_destroy(&arc_eviction_mtx);
-	mutex_destroy(&arc_reclaim_thr_lock);
-	cv_destroy(&arc_reclaim_thr_cv);
-
-	list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
-	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
-	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
-	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
-	list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
-	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
-	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
-	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
-
-	mutex_destroy(&arc_anon->arcs_mtx);
-	mutex_destroy(&arc_mru->arcs_mtx);
-	mutex_destroy(&arc_mru_ghost->arcs_mtx);
-	mutex_destroy(&arc_mfu->arcs_mtx);
-	mutex_destroy(&arc_mfu_ghost->arcs_mtx);
-	mutex_destroy(&arc_l2c_only->arcs_mtx);
+	mutex_destroy(&arc_reclaim_lock);
+	cv_destroy(&arc_reclaim_thread_cv);
+	cv_destroy(&arc_reclaim_waiters_cv);
+
+	mutex_destroy(&arc_user_evicts_lock);
+	cv_destroy(&arc_user_evicts_cv);
+
+	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
+	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
+	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
+	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
+	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
+	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
 
 	buf_fini();
 
 	ASSERT0(arc_loaned_bytes);
 }
 
 /*
  * Level 2 ARC
  *
  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
  * It uses dedicated storage devices to hold cached data, which are populated
  * using large infrequent writes.  The main role of this cache is to boost
  * the performance of random read workloads.  The intended L2ARC devices
  * include short-stroked disks, solid state disks, and other media with
  * substantially faster read latency than disk.
  *
  *                 +-----------------------+
  *                 |         ARC           |
  *                 +-----------------------+
  *                    |         ^     ^
  *                    |         |     |
  *      l2arc_feed_thread()    arc_read()
  *                    |         |     |
  *                    |  l2arc read   |
  *                    V         |     |
  *               +---------------+    |
  *               |     L2ARC     |    |
  *               +---------------+    |
  *                   |    ^           |
  *          l2arc_write() |           |
  *                   |    |           |
  *                   V    |           |
  *                 +-------+      +-------+
  *                 | vdev  |      | vdev  |
  *                 | cache |      | cache |
  *                 +-------+      +-------+
  *                 +=========+     .-----.
  *                 :  L2ARC  :    |-_____-|
  *                 : devices :    | Disks |
  *                 +=========+    `-_____-'
  *
  * Read requests are satisfied from the following sources, in order:
  *
  *	1) ARC
  *	2) vdev cache of L2ARC devices
  *	3) L2ARC devices
  *	4) vdev cache of disks
  *	5) disks
  *
  * Some L2ARC device types exhibit extremely slow write performance.
  * To accommodate for this there are some significant differences between
  * the L2ARC and traditional cache design:
  *
  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
  * the ARC behave as usual, freeing buffers and placing headers on ghost
  * lists.  The ARC does not send buffers to the L2ARC during eviction as
  * this would add inflated write latencies for all ARC memory pressure.
  *
  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
  * It does this by periodically scanning buffers from the eviction-end of
  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
  * not already there. It scans until a headroom of buffers is satisfied,
  * which itself is a buffer for ARC eviction. If a compressible buffer is
  * found during scanning and selected for writing to an L2ARC device, we
  * temporarily boost scanning headroom during the next scan cycle to make
  * sure we adapt to compression effects (which might significantly reduce
  * the data volume we write to L2ARC). The thread that does this is
  * l2arc_feed_thread(), illustrated below; example sizes are included to
  * provide a better sense of ratio than this diagram:
  *
  *	       head -->                        tail
  *	        +---------------------+----------+
  *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
  *	        +---------------------+----------+   |   o L2ARC eligible
  *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
  *	        +---------------------+----------+   |
  *	             15.9 Gbytes      ^ 32 Mbytes    |
  *	                           headroom          |
  *	                                      l2arc_feed_thread()
  *	                                             |
  *	                 l2arc write hand <--[oooo]--'
  *	                         |           8 Mbyte
  *	                         |          write max
  *	                         V
  *		  +==============================+
  *	L2ARC dev |####|#|###|###|    |####| ... |
  *	          +==============================+
  *	                     32 Gbytes
  *
  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
  * evicted, then the L2ARC has cached a buffer much sooner than it probably
  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
  * safe to say that this is an uncommon case, since buffers at the end of
  * the ARC lists have moved there due to inactivity.
  *
  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
  * then the L2ARC simply misses copying some buffers.  This serves as a
  * pressure valve to prevent heavy read workloads from both stalling the ARC
  * with waits and clogging the L2ARC with writes.  This also helps prevent
  * the potential for the L2ARC to churn if it attempts to cache content too
  * quickly, such as during backups of the entire pool.
  *
  * 5. After system boot and before the ARC has filled main memory, there are
  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
  * lists can remain mostly static.  Instead of searching from tail of these
  * lists as pictured, the l2arc_feed_thread() will search from the list heads
  * for eligible buffers, greatly increasing its chance of finding them.
  *
  * The L2ARC device write speed is also boosted during this time so that
  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
  * there are no L2ARC reads, and no fear of degrading read performance
  * through increased writes.
  *
  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
  * the vdev queue can aggregate them into larger and fewer writes.  Each
  * device is written to in a rotor fashion, sweeping writes through
  * available space then repeating.
  *
  * 7. The L2ARC does not store dirty content.  It never needs to flush
  * write buffers back to disk based storage.
  *
  * 8. If an ARC buffer is written (and dirtied) which also exists in the
  * L2ARC, the now stale L2ARC buffer is immediately dropped.
  *
  * The performance of the L2ARC can be tweaked by a number of tunables, which
  * may be necessary for different workloads:
  *
  *	l2arc_write_max		max write bytes per interval
  *	l2arc_write_boost	extra write bytes during device warmup
  *	l2arc_noprefetch	skip caching prefetched buffers
  *	l2arc_nocompress	skip compressing buffers
  *	l2arc_headroom		number of max device writes to precache
  *	l2arc_headroom_boost	when we find compressed buffers during ARC
  *				scanning, we multiply headroom by this
  *				percentage factor for the next scan cycle,
  *				since more compressed buffers are likely to
  *				be present
  *	l2arc_feed_secs		seconds between L2ARC writing
  *
  * Tunables may be removed or added as future performance improvements are
  * integrated, and also may become zpool properties.
  *
  * There are three key functions that control how the L2ARC warms up:
  *
  *	l2arc_write_eligible()	check if a buffer is eligible to cache
  *	l2arc_write_size()	calculate how much to write
  *	l2arc_write_interval()	calculate sleep delay between writes
  *
  * These three functions determine what to write, how much, and how quickly
  * to send writes.
  */
 
 static boolean_t
 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
 {
 	/*
 	 * A buffer is *not* eligible for the L2ARC if it:
 	 * 1. belongs to a different spa.
 	 * 2. is already cached on the L2ARC.
 	 * 3. has an I/O in progress (it may be an incomplete read).
 	 * 4. is flagged not eligible (zfs property).
 	 */
 	if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
 	    HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 static uint64_t
 l2arc_write_size(void)
 {
 	uint64_t size;
 
 	/*
 	 * Make sure our globals have meaningful values in case the user
 	 * altered them.
 	 */
 	size = l2arc_write_max;
 	if (size == 0) {
 		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
 		    "be greater than zero, resetting it to the default (%d)",
 		    L2ARC_WRITE_SIZE);
 		size = l2arc_write_max = L2ARC_WRITE_SIZE;
 	}
 
 	if (arc_warm == B_FALSE)
 		size += l2arc_write_boost;
 
 	return (size);
 
 }
 
 static clock_t
 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
 {
 	clock_t interval, next, now;
 
 	/*
 	 * If the ARC lists are busy, increase our write rate; if the
 	 * lists are stale, idle back.  This is achieved by checking
 	 * how much we previously wrote - if it was more than half of
 	 * what we wanted, schedule the next write much sooner.
 	 */
 	if (l2arc_feed_again && wrote > (wanted / 2))
 		interval = (hz * l2arc_feed_min_ms) / 1000;
 	else
 		interval = hz * l2arc_feed_secs;
 
 	now = ddi_get_lbolt();
 	next = MAX(now, MIN(now + interval, began + interval));
 
 	return (next);
 }
 
 /*
  * Cycle through L2ARC devices.  This is how L2ARC load balances.
  * If a device is returned, this also returns holding the spa config lock.
  */
 static l2arc_dev_t *
 l2arc_dev_get_next(void)
 {
 	l2arc_dev_t *first, *next = NULL;
 
 	/*
 	 * Lock out the removal of spas (spa_namespace_lock), then removal
 	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
 	 * both locks will be dropped and a spa config lock held instead.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	mutex_enter(&l2arc_dev_mtx);
 
 	/* if there are no vdevs, there is nothing to do */
 	if (l2arc_ndev == 0)
 		goto out;
 
 	first = NULL;
 	next = l2arc_dev_last;
 	do {
 		/* loop around the list looking for a non-faulted vdev */
 		if (next == NULL) {
 			next = list_head(l2arc_dev_list);
 		} else {
 			next = list_next(l2arc_dev_list, next);
 			if (next == NULL)
 				next = list_head(l2arc_dev_list);
 		}
 
 		/* if we have come back to the start, bail out */
 		if (first == NULL)
 			first = next;
 		else if (next == first)
 			break;
 
 	} while (vdev_is_dead(next->l2ad_vdev));
 
 	/* if we were unable to find any usable vdevs, return NULL */
 	if (vdev_is_dead(next->l2ad_vdev))
 		next = NULL;
 
 	l2arc_dev_last = next;
 
 out:
 	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * Grab the config lock to prevent the 'next' device from being
 	 * removed while we are writing to it.
 	 */
 	if (next != NULL)
 		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
 	mutex_exit(&spa_namespace_lock);
 
 	return (next);
 }
 
 /*
  * Free buffers that were tagged for destruction.
  */
 static void
 l2arc_do_free_on_write(void)
 {
 	list_t *buflist;
 	l2arc_data_free_t *df, *df_prev;
 
 	mutex_enter(&l2arc_free_on_write_mtx);
 	buflist = l2arc_free_on_write;
 
 	for (df = list_tail(buflist); df; df = df_prev) {
 		df_prev = list_prev(buflist, df);
 		ASSERT(df->l2df_data != NULL);
 		ASSERT(df->l2df_func != NULL);
 		df->l2df_func(df->l2df_data, df->l2df_size);
 		list_remove(buflist, df);
 		kmem_free(df, sizeof (l2arc_data_free_t));
 	}
 
 	mutex_exit(&l2arc_free_on_write_mtx);
 }
 
 /*
  * A write to a cache device has completed.  Update all headers to allow
  * reads from these buffers to begin.
  */
 static void
 l2arc_write_done(zio_t *zio)
 {
 	l2arc_write_callback_t *cb;
 	l2arc_dev_t *dev;
 	list_t *buflist;
 	arc_buf_hdr_t *head, *hdr, *hdr_prev;
 	kmutex_t *hash_lock;
 	int64_t bytes_dropped = 0;
 
 	cb = zio->io_private;
 	ASSERT(cb != NULL);
 	dev = cb->l2wcb_dev;
 	ASSERT(dev != NULL);
 	head = cb->l2wcb_head;
 	ASSERT(head != NULL);
 	buflist = &dev->l2ad_buflist;
 	ASSERT(buflist != NULL);
 	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
 	    l2arc_write_callback_t *, cb);
 
 	if (zio->io_error != 0)
 		ARCSTAT_BUMP(arcstat_l2_writes_error);
 
-	mutex_enter(&dev->l2ad_mtx);
-
 	/*
 	 * All writes completed, or an error was hit.
 	 */
+top:
+	mutex_enter(&dev->l2ad_mtx);
 	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
 		hdr_prev = list_prev(buflist, hdr);
 
 		hash_lock = HDR_LOCK(hdr);
+
+		/*
+		 * We cannot use mutex_enter here (while holding l2ad_mtx)
+		 * or else we can deadlock with l2arc_write_buffers, which
+		 * takes the hash lock and l2ad_mtx in the opposite order.
+		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
-			 * This buffer misses out.  It may be in a stage
-			 * of eviction.  Its ARC_FLAG_L2_WRITING flag will be
-			 * left set, denying reads to this buffer.
+			 * Missed the hash lock. We must retry so we
+			 * don't leave the ARC_FLAG_L2_WRITING bit set.
 			 */
-			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
-			continue;
+			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
+
+			/*
+			 * We don't want to rescan the headers we've
+			 * already marked as having been written out, so
+			 * we reinsert the head node so we can pick up
+			 * where we left off.
+			 */
+			list_remove(buflist, head);
+			list_insert_after(buflist, hdr, head);
+
+			mutex_exit(&dev->l2ad_mtx);
+
+			/*
+			 * We wait for the hash lock to become available
+			 * to try and prevent busy waiting, and increase
+			 * the chance we'll be able to acquire the lock
+			 * the next time around.
+			 */
+			mutex_enter(hash_lock);
+			mutex_exit(hash_lock);
+			goto top;
 		}
 
 		/*
-		 * It's possible that this buffer got evicted from the L1 cache
-		 * before we grabbed the vdev + hash locks, in which case
-		 * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated.
-		 * Only free the buffer if we still have an L1 hdr.
+		 * We could not have been moved into the arc_l2c_only
+		 * state while in-flight due to our ARC_FLAG_L2_WRITING
+		 * bit being set. Let's just ensure that's being enforced.
+		 */
+		ASSERT(HDR_HAS_L1HDR(hdr));
+
+		/*
+		 * We may have allocated a buffer for L2ARC compression,
+		 * we must release it to avoid leaking this data.
 		 */
-		if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL &&
-		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
-			l2arc_release_cdata_buf(hdr);
+		l2arc_release_cdata_buf(hdr);
 
 		if (zio->io_error != 0) {
 			/*
 			 * Error - drop L2ARC entry.
 			 */
 			list_remove(buflist, hdr);
 			hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
 
 			ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
 			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
 		}
 
 		/*
-		 * Allow ARC to begin reads to this L2ARC entry.
+		 * Allow ARC to begin reads and ghost list evictions to
+		 * this L2ARC entry.
 		 */
 		hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
 
 		mutex_exit(hash_lock);
 	}
 
 	atomic_inc_64(&l2arc_writes_done);
 	list_remove(buflist, head);
 	ASSERT(!HDR_HAS_L1HDR(head));
 	kmem_cache_free(hdr_l2only_cache, head);
 	mutex_exit(&dev->l2ad_mtx);
 
 	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
 
 	l2arc_do_free_on_write();
 
 	kmem_free(cb, sizeof (l2arc_write_callback_t));
 }
 
 /*
  * A read to a cache device completed.  Validate buffer contents before
  * handing over to the regular ARC routines.
  */
 static void
 l2arc_read_done(zio_t *zio)
 {
 	l2arc_read_callback_t *cb;
 	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf;
 	kmutex_t *hash_lock;
 	int equal;
 
 	ASSERT(zio->io_vd != NULL);
 	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
 
 	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
 
 	cb = zio->io_private;
 	ASSERT(cb != NULL);
 	buf = cb->l2rcb_buf;
 	ASSERT(buf != NULL);
 
 	hash_lock = HDR_LOCK(buf->b_hdr);
 	mutex_enter(hash_lock);
 	hdr = buf->b_hdr;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 	/*
 	 * If the buffer was compressed, decompress it first.
 	 */
 	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
 		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
 	ASSERT(zio->io_data != NULL);
 
 	/*
 	 * Check this survived the L2ARC journey.
 	 */
 	equal = arc_cksum_equal(buf);
 	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
 		mutex_exit(hash_lock);
 		zio->io_private = buf;
 		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
 		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
 		arc_read_done(zio);
 	} else {
 		mutex_exit(hash_lock);
 		/*
 		 * Buffer didn't survive caching.  Increment stats and
 		 * reissue to the original storage device.
 		 */
 		if (zio->io_error != 0) {
 			ARCSTAT_BUMP(arcstat_l2_io_error);
 		} else {
 			zio->io_error = SET_ERROR(EIO);
 		}
 		if (!equal)
 			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
 
 		/*
 		 * If there's no waiter, issue an async i/o to the primary
 		 * storage now.  If there *is* a waiter, the caller must
 		 * issue the i/o in a context where it's OK to block.
 		 */
 		if (zio->io_waiter == NULL) {
 			zio_t *pio = zio_unique_parent(zio);
 
 			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
 
 			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
 			    buf->b_data, zio->io_size, arc_read_done, buf,
 			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
 		}
 	}
 
 	kmem_free(cb, sizeof (l2arc_read_callback_t));
 }
 
 /*
  * This is the list priority from which the L2ARC will search for pages to
  * cache.  This is used within loops (0..3) to cycle through lists in the
  * desired order.  This order can have a significant effect on cache
  * performance.
  *
  * Currently the metadata lists are hit first, MFU then MRU, followed by
  * the data lists.  This function returns a locked list, and also returns
  * the lock pointer.
  */
-static list_t *
-l2arc_list_locked(int list_num, kmutex_t **lock)
+static multilist_sublist_t *
+l2arc_sublist_lock(int list_num)
 {
-	list_t *list = NULL;
+	multilist_t *ml = NULL;
+	unsigned int idx;
 
 	ASSERT(list_num >= 0 && list_num <= 3);
 
 	switch (list_num) {
 	case 0:
-		list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
-		*lock = &arc_mfu->arcs_mtx;
+		ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
 		break;
 	case 1:
-		list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
-		*lock = &arc_mru->arcs_mtx;
+		ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
 		break;
 	case 2:
-		list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
-		*lock = &arc_mfu->arcs_mtx;
+		ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
 		break;
 	case 3:
-		list = &arc_mru->arcs_list[ARC_BUFC_DATA];
-		*lock = &arc_mru->arcs_mtx;
+		ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
 		break;
 	}
 
-	ASSERT(!(MUTEX_HELD(*lock)));
-	mutex_enter(*lock);
-	return (list);
+	/*
+	 * Return a randomly-selected sublist. This is acceptable
+	 * because the caller feeds only a little bit of data for each
+	 * call (8MB). Subsequent calls will result in different
+	 * sublists being selected.
+	 */
+	idx = multilist_get_random_index(ml);
+	return (multilist_sublist_lock(ml, idx));
 }
 
 /*
  * Evict buffers from the device write hand to the distance specified in
  * bytes.  This distance may span populated buffers, it may span nothing.
  * This is clearing a region on the L2ARC device ready for writing.
  * If the 'all' boolean is set, every buffer is evicted.
  */
 static void
 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
 {
 	list_t *buflist;
 	arc_buf_hdr_t *hdr, *hdr_prev;
 	kmutex_t *hash_lock;
 	uint64_t taddr;
 	int64_t bytes_evicted = 0;
 
 	buflist = &dev->l2ad_buflist;
 
 	if (!all && dev->l2ad_first) {
 		/*
 		 * This is the first sweep through the device.  There is
 		 * nothing to evict.
 		 */
 		return;
 	}
 
 	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
 		/*
 		 * When nearing the end of the device, evict to the end
 		 * before the device write hand jumps to the start.
 		 */
 		taddr = dev->l2ad_end;
 	} else {
 		taddr = dev->l2ad_hand + distance;
 	}
 	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
 	    uint64_t, taddr, boolean_t, all);
 
 top:
 	mutex_enter(&dev->l2ad_mtx);
 	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
 		hdr_prev = list_prev(buflist, hdr);
 
 		hash_lock = HDR_LOCK(hdr);
+
+		/*
+		 * We cannot use mutex_enter or else we can deadlock
+		 * with l2arc_write_buffers (due to swapping the order
+		 * the hash lock and l2ad_mtx are taken).
+		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
 			 * Missed the hash lock.  Retry.
 			 */
 			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
 			mutex_exit(&dev->l2ad_mtx);
 			mutex_enter(hash_lock);
 			mutex_exit(hash_lock);
 			goto top;
 		}
 
 		if (HDR_L2_WRITE_HEAD(hdr)) {
 			/*
 			 * We hit a write head node.  Leave it for
 			 * l2arc_write_done().
 			 */
 			list_remove(buflist, hdr);
 			mutex_exit(hash_lock);
 			continue;
 		}
 
 		if (!all && HDR_HAS_L2HDR(hdr) &&
 		    (hdr->b_l2hdr.b_daddr > taddr ||
 		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
 			/*
 			 * We've evicted to the target address,
 			 * or the end of the device.
 			 */
 			mutex_exit(hash_lock);
 			break;
 		}
 
 		ASSERT(HDR_HAS_L2HDR(hdr));
 		if (!HDR_HAS_L1HDR(hdr)) {
 			ASSERT(!HDR_L2_READING(hdr));
 			/*
 			 * This doesn't exist in the ARC.  Destroy.
 			 * arc_hdr_destroy() will call list_remove()
 			 * and decrement arcstat_l2_size.
 			 */
 			arc_change_state(arc_anon, hdr, hash_lock);
 			arc_hdr_destroy(hdr);
 		} else {
 			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
 			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
 			/*
 			 * Invalidate issued or about to be issued
 			 * reads, since we may be about to write
 			 * over this location.
 			 */
 			if (HDR_L2_READING(hdr)) {
 				ARCSTAT_BUMP(arcstat_l2_evict_reading);
 				hdr->b_flags |= ARC_FLAG_L2_EVICTED;
 			}
 
 			/*
 			 * Tell ARC this no longer exists in L2ARC.
 			 */
 			/* Tell ARC this no longer exists in L2ARC. */
 			ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
 			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
 			hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
 			list_remove(buflist, hdr);
 
-			/* This may have been leftover after a failed write. */
-			hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
+			/* Ensure this header has finished being written */
+			ASSERT(!HDR_L2_WRITING(hdr));
+			ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
 		}
 		mutex_exit(hash_lock);
 	}
 	mutex_exit(&dev->l2ad_mtx);
 
 	vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
 	dev->l2ad_evict = taddr;
 }
 
 /*
  * Find and write ARC buffers to the L2ARC device.
  *
  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
  * for reading until they have completed writing.
  * The headroom_boost is an in-out parameter used to maintain headroom boost
  * state between calls to this function.
  *
  * Returns the number of bytes actually written (which may be smaller than
  * the delta by which the device hand has changed due to alignment).
  */
 static uint64_t
 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
     boolean_t *headroom_boost)
 {
 	arc_buf_hdr_t *hdr, *hdr_prev, *head;
-	list_t *list;
 	uint64_t write_asize, write_psize, write_sz, headroom,
 	    buf_compress_minsz;
 	void *buf_data;
-	kmutex_t *list_lock = NULL;
 	boolean_t full;
 	l2arc_write_callback_t *cb;
 	zio_t *pio, *wzio;
 	uint64_t guid = spa_load_guid(spa);
 	int try;
 	const boolean_t do_headroom_boost = *headroom_boost;
 
 	ASSERT(dev->l2ad_vdev != NULL);
 
 	/* Lower the flag now, we might want to raise it again later. */
 	*headroom_boost = B_FALSE;
 
 	pio = NULL;
 	write_sz = write_asize = write_psize = 0;
 	full = B_FALSE;
 	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
 	head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
 	head->b_flags |= ARC_FLAG_HAS_L2HDR;
 
 	/*
 	 * We will want to try to compress buffers that are at least 2x the
 	 * device sector size.
 	 */
 	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
 
 	/*
 	 * Copy buffers for L2ARC writing.
 	 */
-	mutex_enter(&dev->l2ad_mtx);
 	for (try = 0; try <= 3; try++) {
+		multilist_sublist_t *mls = l2arc_sublist_lock(try);
 		uint64_t passed_sz = 0;
 
-		list = l2arc_list_locked(try, &list_lock);
-
 		/*
 		 * L2ARC fast warmup.
 		 *
 		 * Until the ARC is warm and starts to evict, read from the
 		 * head of the ARC lists rather than the tail.
 		 */
 		if (arc_warm == B_FALSE)
-			hdr = list_head(list);
+			hdr = multilist_sublist_head(mls);
 		else
-			hdr = list_tail(list);
+			hdr = multilist_sublist_tail(mls);
 
 		headroom = target_sz * l2arc_headroom;
 		if (do_headroom_boost)
 			headroom = (headroom * l2arc_headroom_boost) / 100;
 
 		for (; hdr; hdr = hdr_prev) {
 			kmutex_t *hash_lock;
 			uint64_t buf_sz;
 
 			if (arc_warm == B_FALSE)
-				hdr_prev = list_next(list, hdr);
+				hdr_prev = multilist_sublist_next(mls, hdr);
 			else
-				hdr_prev = list_prev(list, hdr);
+				hdr_prev = multilist_sublist_prev(mls, hdr);
 
 			hash_lock = HDR_LOCK(hdr);
 			if (!mutex_tryenter(hash_lock)) {
 				/*
 				 * Skip this buffer rather than waiting.
 				 */
 				continue;
 			}
 
 			passed_sz += hdr->b_size;
 			if (passed_sz > headroom) {
 				/*
 				 * Searched too far.
 				 */
 				mutex_exit(hash_lock);
 				break;
 			}
 
 			if (!l2arc_write_eligible(guid, hdr)) {
 				mutex_exit(hash_lock);
 				continue;
 			}
 
 			if ((write_sz + hdr->b_size) > target_sz) {
 				full = B_TRUE;
 				mutex_exit(hash_lock);
 				break;
 			}
 
 			if (pio == NULL) {
 				/*
 				 * Insert a dummy header on the buflist so
 				 * l2arc_write_done() can find where the
 				 * write buffers begin without searching.
 				 */
+				mutex_enter(&dev->l2ad_mtx);
 				list_insert_head(&dev->l2ad_buflist, head);
+				mutex_exit(&dev->l2ad_mtx);
 
 				cb = kmem_alloc(sizeof (l2arc_write_callback_t),
 				    KM_SLEEP);
 				cb->l2wcb_dev = dev;
 				cb->l2wcb_head = head;
 				pio = zio_root(spa, l2arc_write_done, cb,
 				    ZIO_FLAG_CANFAIL);
 			}
 
 			/*
 			 * Create and add a new L2ARC header.
 			 */
 			hdr->b_l2hdr.b_dev = dev;
 			arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 			hdr->b_flags |= ARC_FLAG_L2_WRITING;
 			/*
 			 * Temporarily stash the data buffer in b_tmp_cdata.
 			 * The subsequent write step will pick it up from
 			 * there. This is because can't access b_l1hdr.b_buf
 			 * without holding the hash_lock, which we in turn
 			 * can't access without holding the ARC list locks
 			 * (which we want to avoid during compression/writing)
 			 */
 			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
 			hdr->b_l2hdr.b_asize = hdr->b_size;
 			hdr->b_l2hdr.b_hits = 0;
 			hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
 
 			buf_sz = hdr->b_size;
 			hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
 
+			mutex_enter(&dev->l2ad_mtx);
 			list_insert_head(&dev->l2ad_buflist, hdr);
+			mutex_exit(&dev->l2ad_mtx);
 
 			/*
 			 * Compute and store the buffer cksum before
 			 * writing.  On debug the cksum is verified first.
 			 */
 			arc_cksum_verify(hdr->b_l1hdr.b_buf);
 			arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
 
 			mutex_exit(hash_lock);
 
 			write_sz += buf_sz;
 		}
 
-		mutex_exit(list_lock);
+		multilist_sublist_unlock(mls);
 
 		if (full == B_TRUE)
 			break;
 	}
 
 	/* No buffers selected for writing? */
 	if (pio == NULL) {
 		ASSERT0(write_sz);
-		mutex_exit(&dev->l2ad_mtx);
 		ASSERT(!HDR_HAS_L1HDR(head));
 		kmem_cache_free(hdr_l2only_cache, head);
 		return (0);
 	}
 
+	mutex_enter(&dev->l2ad_mtx);
+
 	/*
 	 * Now start writing the buffers. We're starting at the write head
 	 * and work backwards, retracing the course of the buffer selector
 	 * loop above.
 	 */
 	for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
 	    hdr = list_prev(&dev->l2ad_buflist, hdr)) {
 		uint64_t buf_sz;
 
+		/*
+		 * We rely on the L1 portion of the header below, so
+		 * it's invalid for this header to have been evicted out
+		 * of the ghost cache, prior to being written out. The
+		 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+		 */
+		ASSERT(HDR_HAS_L1HDR(hdr));
+
 		/*
 		 * We shouldn't need to lock the buffer here, since we flagged
 		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
 		 * take care to only access its L2 cache parameters. In
 		 * particular, hdr->l1hdr.b_buf may be invalid by now due to
 		 * ARC eviction.
 		 */
 		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
 
 		if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) &&
 		    hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
 			if (l2arc_compress_buf(hdr)) {
 				/*
 				 * If compression succeeded, enable headroom
 				 * boost on the next scan cycle.
 				 */
 				*headroom_boost = B_TRUE;
 			}
 		}
 
 		/*
 		 * Pick up the buffer data we had previously stashed away
 		 * (and now potentially also compressed).
 		 */
 		buf_data = hdr->b_l1hdr.b_tmp_cdata;
 		buf_sz = hdr->b_l2hdr.b_asize;
 
 		/* Compression may have squashed the buffer to zero length. */
 		if (buf_sz != 0) {
 			uint64_t buf_p_sz;
 
 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
 			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
 			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_CANFAIL, B_FALSE);
 
 			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
 			    zio_t *, wzio);
 			(void) zio_nowait(wzio);
 
 			write_asize += buf_sz;
 			/*
 			 * Keep the clock hand suitably device-aligned.
 			 */
 			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
 			write_psize += buf_p_sz;
 			dev->l2ad_hand += buf_p_sz;
 		}
 	}
 
 	mutex_exit(&dev->l2ad_mtx);
 
 	ASSERT3U(write_asize, <=, target_sz);
 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
 	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
 	ARCSTAT_INCR(arcstat_l2_size, write_sz);
 	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
 	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
 
 	/*
 	 * Bump device hand to the device start if it is approaching the end.
 	 * l2arc_evict() will already have evicted ahead for this case.
 	 */
 	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
 		dev->l2ad_hand = dev->l2ad_start;
 		dev->l2ad_evict = dev->l2ad_start;
 		dev->l2ad_first = B_FALSE;
 	}
 
 	dev->l2ad_writing = B_TRUE;
 	(void) zio_wait(pio);
 	dev->l2ad_writing = B_FALSE;
 
 	return (write_asize);
 }
 
 /*
  * Compresses an L2ARC buffer.
  * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
  * size in l2hdr->b_asize. This routine tries to compress the data and
  * depending on the compression result there are three possible outcomes:
  * *) The buffer was incompressible. The original l2hdr contents were left
  *    untouched and are ready for writing to an L2 device.
  * *) The buffer was all-zeros, so there is no need to write it to an L2
  *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
  *    data buffer which holds the compressed data to be written, and b_asize
  *    tells us how much data there is. b_compress is set to the appropriate
  *    compression algorithm. Once writing is done, invoke
  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
  *
  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
  * buffer was incompressible).
  */
 static boolean_t
 l2arc_compress_buf(arc_buf_hdr_t *hdr)
 {
 	void *cdata;
 	size_t csize, len, rounded;
 	l2arc_buf_hdr_t *l2hdr;
 
 	ASSERT(HDR_HAS_L2HDR(hdr));
 
 	l2hdr = &hdr->b_l2hdr;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
 	ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
 
 	len = l2hdr->b_asize;
 	cdata = zio_data_buf_alloc(len);
 	ASSERT3P(cdata, !=, NULL);
 	csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
 	    cdata, l2hdr->b_asize);
 
 	rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
 	if (rounded > csize) {
 		bzero((char *)cdata + csize, rounded - csize);
 		csize = rounded;
 	}
 
 	if (csize == 0) {
 		/* zero block, indicate that there's nothing to write */
 		zio_data_buf_free(cdata, len);
 		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
 		l2hdr->b_asize = 0;
 		hdr->b_l1hdr.b_tmp_cdata = NULL;
 		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
 		return (B_TRUE);
 	} else if (csize > 0 && csize < len) {
 		/*
 		 * Compression succeeded, we'll keep the cdata around for
 		 * writing and release it afterwards.
 		 */
 		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
 		l2hdr->b_asize = csize;
 		hdr->b_l1hdr.b_tmp_cdata = cdata;
 		ARCSTAT_BUMP(arcstat_l2_compress_successes);
 		return (B_TRUE);
 	} else {
 		/*
 		 * Compression failed, release the compressed buffer.
 		 * l2hdr will be left unmodified.
 		 */
 		zio_data_buf_free(cdata, len);
 		ARCSTAT_BUMP(arcstat_l2_compress_failures);
 		return (B_FALSE);
 	}
 }
 
 /*
  * Decompresses a zio read back from an l2arc device. On success, the
  * underlying zio's io_data buffer is overwritten by the uncompressed
  * version. On decompression error (corrupt compressed stream), the
  * zio->io_error value is set to signal an I/O error.
  *
  * Please note that the compressed data stream is not checksummed, so
  * if the underlying device is experiencing data corruption, we may feed
  * corrupt data to the decompressor, so the decompressor needs to be
  * able to handle this situation (LZ4 does).
  */
 static void
 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
 {
 	uint64_t csize;
 	void *cdata;
 
 	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
 
 	if (zio->io_error != 0) {
 		/*
 		 * An io error has occured, just restore the original io
 		 * size in preparation for a main pool read.
 		 */
 		zio->io_orig_size = zio->io_size = hdr->b_size;
 		return;
 	}
 
 	if (c == ZIO_COMPRESS_EMPTY) {
 		/*
 		 * An empty buffer results in a null zio, which means we
 		 * need to fill its io_data after we're done restoring the
 		 * buffer's contents.
 		 */
 		ASSERT(hdr->b_l1hdr.b_buf != NULL);
 		bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
 		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
 	} else {
 		ASSERT(zio->io_data != NULL);
 		/*
 		 * We copy the compressed data from the start of the arc buffer
 		 * (the zio_read will have pulled in only what we need, the
 		 * rest is garbage which we will overwrite at decompression)
 		 * and then decompress back to the ARC data buffer. This way we
 		 * can minimize copying by simply decompressing back over the
 		 * original compressed data (rather than decompressing to an
 		 * aux buffer and then copying back the uncompressed buffer,
 		 * which is likely to be much larger).
 		 */
 		csize = zio->io_size;
 		cdata = zio_data_buf_alloc(csize);
 		bcopy(zio->io_data, cdata, csize);
 		if (zio_decompress_data(c, cdata, zio->io_data, csize,
 		    hdr->b_size) != 0)
 			zio->io_error = SET_ERROR(EIO);
 		zio_data_buf_free(cdata, csize);
 	}
 
 	/* Restore the expected uncompressed IO size. */
 	zio->io_orig_size = zio->io_size = hdr->b_size;
 }
 
 /*
  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
  * This buffer serves as a temporary holder of compressed data while
  * the buffer entry is being written to an l2arc device. Once that is
  * done, we can dispose of it.
  */
 static void
 l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
 {
+	enum zio_compress comp = HDR_GET_COMPRESS(hdr);
+
 	ASSERT(HDR_HAS_L1HDR(hdr));
-	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
+	ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
+
+	if (comp == ZIO_COMPRESS_OFF) {
+		/*
+		 * In this case, b_tmp_cdata points to the same buffer
+		 * as the arc_buf_t's b_data field. We don't want to
+		 * free it, since the arc_buf_t will handle that.
+		 */
+		hdr->b_l1hdr.b_tmp_cdata = NULL;
+	} else if (comp == ZIO_COMPRESS_EMPTY) {
+		/*
+		 * In this case, b_tmp_cdata was compressed to an empty
+		 * buffer, thus there's nothing to free and b_tmp_cdata
+		 * should have been set to NULL by l2arc_compress_buf().
+		 */
+		ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+	} else {
 		/*
 		 * If the data was compressed, then we've allocated a
 		 * temporary buffer for it, so now we need to release it.
 		 */
 		ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
 		zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
 		    hdr->b_size);
+		hdr->b_l1hdr.b_tmp_cdata = NULL;
 	}
-	hdr->b_l1hdr.b_tmp_cdata = NULL;
+
 }
 
 /*
  * This thread feeds the L2ARC at regular intervals.  This is the beating
  * heart of the L2ARC.
  */
 static void
 l2arc_feed_thread(void)
 {
 	callb_cpr_t cpr;
 	l2arc_dev_t *dev;
 	spa_t *spa;
 	uint64_t size, wrote;
 	clock_t begin, next = ddi_get_lbolt();
 	boolean_t headroom_boost = B_FALSE;
 	fstrans_cookie_t cookie;
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&l2arc_feed_thr_lock);
 
 	cookie = spl_fstrans_mark();
 	while (l2arc_thread_exit == 0) {
 		CALLB_CPR_SAFE_BEGIN(&cpr);
 		(void) cv_timedwait_interruptible(&l2arc_feed_thr_cv,
 		    &l2arc_feed_thr_lock, next);
 		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
 		next = ddi_get_lbolt() + hz;
 
 		/*
 		 * Quick check for L2ARC devices.
 		 */
 		mutex_enter(&l2arc_dev_mtx);
 		if (l2arc_ndev == 0) {
 			mutex_exit(&l2arc_dev_mtx);
 			continue;
 		}
 		mutex_exit(&l2arc_dev_mtx);
 		begin = ddi_get_lbolt();
 
 		/*
 		 * This selects the next l2arc device to write to, and in
 		 * doing so the next spa to feed from: dev->l2ad_spa.   This
 		 * will return NULL if there are now no l2arc devices or if
 		 * they are all faulted.
 		 *
 		 * If a device is returned, its spa's config lock is also
 		 * held to prevent device removal.  l2arc_dev_get_next()
 		 * will grab and release l2arc_dev_mtx.
 		 */
 		if ((dev = l2arc_dev_get_next()) == NULL)
 			continue;
 
 		spa = dev->l2ad_spa;
 		ASSERT(spa != NULL);
 
 		/*
 		 * If the pool is read-only then force the feed thread to
 		 * sleep a little longer.
 		 */
 		if (!spa_writeable(spa)) {
 			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
 			spa_config_exit(spa, SCL_L2ARC, dev);
 			continue;
 		}
 
 		/*
 		 * Avoid contributing to memory pressure.
 		 */
 		if (arc_no_grow) {
 			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
 			spa_config_exit(spa, SCL_L2ARC, dev);
 			continue;
 		}
 
 		ARCSTAT_BUMP(arcstat_l2_feeds);
 
 		size = l2arc_write_size();
 
 		/*
 		 * Evict L2ARC buffers that will be overwritten.
 		 */
 		l2arc_evict(dev, size, B_FALSE);
 
 		/*
 		 * Write ARC buffers.
 		 */
 		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
 
 		/*
 		 * Calculate interval between writes.
 		 */
 		next = l2arc_write_interval(begin, size, wrote);
 		spa_config_exit(spa, SCL_L2ARC, dev);
 	}
 	spl_fstrans_unmark(cookie);
 
 	l2arc_thread_exit = 0;
 	cv_broadcast(&l2arc_feed_thr_cv);
 	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
 	thread_exit();
 }
 
 boolean_t
 l2arc_vdev_present(vdev_t *vd)
 {
 	l2arc_dev_t *dev;
 
 	mutex_enter(&l2arc_dev_mtx);
 	for (dev = list_head(l2arc_dev_list); dev != NULL;
 	    dev = list_next(l2arc_dev_list, dev)) {
 		if (dev->l2ad_vdev == vd)
 			break;
 	}
 	mutex_exit(&l2arc_dev_mtx);
 
 	return (dev != NULL);
 }
 
 /*
  * Add a vdev for use by the L2ARC.  By this point the spa has already
  * validated the vdev and opened it.
  */
 void
 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 {
 	l2arc_dev_t *adddev;
 
 	ASSERT(!l2arc_vdev_present(vd));
 
 	/*
 	 * Create a new l2arc device entry.
 	 */
 	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
 	adddev->l2ad_spa = spa;
 	adddev->l2ad_vdev = vd;
 	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
 	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
 	adddev->l2ad_hand = adddev->l2ad_start;
 	adddev->l2ad_evict = adddev->l2ad_start;
 	adddev->l2ad_first = B_TRUE;
 	adddev->l2ad_writing = B_FALSE;
 	list_link_init(&adddev->l2ad_node);
 
 	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
 	/*
 	 * This is a list of all ARC buffers that are still valid on the
 	 * device.
 	 */
 	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
 
 	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
 
 	/*
 	 * Add device to global list
 	 */
 	mutex_enter(&l2arc_dev_mtx);
 	list_insert_head(l2arc_dev_list, adddev);
 	atomic_inc_64(&l2arc_ndev);
 	mutex_exit(&l2arc_dev_mtx);
 }
 
 /*
  * Remove a vdev from the L2ARC.
  */
 void
 l2arc_remove_vdev(vdev_t *vd)
 {
 	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
 
 	/*
 	 * Find the device by vdev
 	 */
 	mutex_enter(&l2arc_dev_mtx);
 	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
 		nextdev = list_next(l2arc_dev_list, dev);
 		if (vd == dev->l2ad_vdev) {
 			remdev = dev;
 			break;
 		}
 	}
 	ASSERT(remdev != NULL);
 
 	/*
 	 * Remove device from global list
 	 */
 	list_remove(l2arc_dev_list, remdev);
 	l2arc_dev_last = NULL;		/* may have been invalidated */
 	atomic_dec_64(&l2arc_ndev);
 	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * Clear all buflists and ARC references.  L2ARC device flush.
 	 */
 	l2arc_evict(remdev, 0, B_TRUE);
 	list_destroy(&remdev->l2ad_buflist);
 	mutex_destroy(&remdev->l2ad_mtx);
 	kmem_free(remdev, sizeof (l2arc_dev_t));
 }
 
 void
 l2arc_init(void)
 {
 	l2arc_thread_exit = 0;
 	l2arc_ndev = 0;
 	l2arc_writes_sent = 0;
 	l2arc_writes_done = 0;
 
 	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	l2arc_dev_list = &L2ARC_dev_list;
 	l2arc_free_on_write = &L2ARC_free_on_write;
 	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
 	    offsetof(l2arc_dev_t, l2ad_node));
 	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
 	    offsetof(l2arc_data_free_t, l2df_list_node));
 }
 
 void
 l2arc_fini(void)
 {
 	/*
 	 * This is called from dmu_fini(), which is called from spa_fini();
 	 * Because of this, we can assume that all l2arc devices have
 	 * already been removed when the pools themselves were removed.
 	 */
 
 	l2arc_do_free_on_write();
 
 	mutex_destroy(&l2arc_feed_thr_lock);
 	cv_destroy(&l2arc_feed_thr_cv);
 	mutex_destroy(&l2arc_dev_mtx);
 	mutex_destroy(&l2arc_free_on_write_mtx);
 
 	list_destroy(l2arc_dev_list);
 	list_destroy(l2arc_free_on_write);
 }
 
 void
 l2arc_start(void)
 {
 	if (!(spa_mode_global & FWRITE))
 		return;
 
 	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
 	    TS_RUN, minclsyspri);
 }
 
 void
 l2arc_stop(void)
 {
 	if (!(spa_mode_global & FWRITE))
 		return;
 
 	mutex_enter(&l2arc_feed_thr_lock);
 	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
 	l2arc_thread_exit = 1;
 	while (l2arc_thread_exit != 0)
 		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
 	mutex_exit(&l2arc_feed_thr_lock);
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(arc_buf_size);
 EXPORT_SYMBOL(arc_write);
 EXPORT_SYMBOL(arc_read);
 EXPORT_SYMBOL(arc_buf_remove_ref);
 EXPORT_SYMBOL(arc_buf_info);
 EXPORT_SYMBOL(arc_getbuf_func);
 EXPORT_SYMBOL(arc_add_prune_callback);
 EXPORT_SYMBOL(arc_remove_prune_callback);
 
 module_param(zfs_arc_min, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_min, "Min arc size");
 
 module_param(zfs_arc_max, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_max, "Max arc size");
 
 module_param(zfs_arc_meta_limit, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size");
 
+module_param(zfs_arc_meta_min, ulong, 0644);
+MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata");
+
 module_param(zfs_arc_meta_prune, int, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune");
 
 module_param(zfs_arc_meta_adjust_restarts, ulong, 0644);
 MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts,
 	"Limit number of restarts in arc_adjust_meta");
 
 module_param(zfs_arc_grow_retry, int, 0644);
 MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
 
 module_param(zfs_arc_p_aggressive_disable, int, 0644);
 MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow");
 
 module_param(zfs_arc_p_dampener_disable, int, 0644);
 MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener");
 
 module_param(zfs_arc_shrink_shift, int, 0644);
 MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
 
 module_param(zfs_disable_dup_eviction, int, 0644);
 MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
 
 module_param(zfs_arc_average_blocksize, int, 0444);
 MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");
 
 module_param(zfs_arc_memory_throttle_disable, int, 0644);
 MODULE_PARM_DESC(zfs_arc_memory_throttle_disable, "disable memory throttle");
 
 module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
 MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
 
+module_param(zfs_arc_num_sublists_per_state, int, 0644);
+MODULE_PARM_DESC(zfs_arc_num_sublists_per_state,
+	"Number of sublists used in each of the ARC state lists");
+
 module_param(l2arc_write_max, ulong, 0644);
 MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
 
 module_param(l2arc_write_boost, ulong, 0644);
 MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup");
 
 module_param(l2arc_headroom, ulong, 0644);
 MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache");
 
 module_param(l2arc_headroom_boost, ulong, 0644);
 MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom multiplier");
 
 module_param(l2arc_feed_secs, ulong, 0644);
 MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing");
 
 module_param(l2arc_feed_min_ms, ulong, 0644);
 MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds");
 
 module_param(l2arc_noprefetch, int, 0644);
 MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers");
 
 module_param(l2arc_nocompress, int, 0644);
 MODULE_PARM_DESC(l2arc_nocompress, "Skip compressing L2ARC buffers");
 
 module_param(l2arc_feed_again, int, 0644);
 MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup");
 
 module_param(l2arc_norw, int, 0644);
 MODULE_PARM_DESC(l2arc_norw, "No reads during writes");
 
 #endif
diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c
index 5e7eaf1acf33..afdf828ed542 100644
--- a/module/zfs/dbuf_stats.c
+++ b/module/zfs/dbuf_stats.c
@@ -1,234 +1,233 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_objset.h>
 
 /*
  * Calculate the index of the arc header for the state, disabled by default.
  */
 int zfs_dbuf_state_index = 0;
 
 /*
  * ==========================================================================
  * Dbuf Hash Read Routines
  * ==========================================================================
  */
 typedef struct dbuf_stats_t {
 	kmutex_t		lock;
 	kstat_t			*kstat;
 	dbuf_hash_table_t	*hash;
 	int			idx;
 } dbuf_stats_t;
 
 static dbuf_stats_t dbuf_stats_hash_table;
 
 static int
 dbuf_stats_hash_table_headers(char *buf, size_t size)
 {
 	(void) snprintf(buf, size,
-	    "%-88s | %-124s | %s\n"
+	    "%-88s | %-117s | %s\n"	/* dbuf/arcbuf group widths */
 	    "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | "
-	    "%-5s %-5s %-6s %-8s %-6s %-8s %-12s "
+	    "%-5s %-5s %-8s %-6s %-8s %-12s "
 	    "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | "
 	    "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n",
 	    "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
 	    "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list",
-	    "atype", "index", "flags", "count", "asize", "access",
+	    "atype", "flags", "count", "asize", "access",
 	    "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
 	    "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
 	    "bsize", "lvls", "dholds", "blocks", "dsize");
 
 	return (0);
 }
 
 int
 __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
 {
 	arc_buf_info_t abi = { 0 };
 	dmu_object_info_t doi = { 0 };
 	dnode_t *dn = DB_DNODE(db);
 	size_t nwritten;
 
 	if (db->db_buf)
 		arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index);
 
 	if (dn)
 		__dmu_object_info_from_dnode(dn, &doi);
 
 	nwritten = snprintf(buf, size,
 	    "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | "
-	    "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu "
+	    "%-5d %-5d 0x%-6x %-6lu %-8llu %-12llu "
 	    "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | "
 	    "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n",
 	    /* dmu_buf_impl_t */
 	    spa_name(dn->dn_objset->os_spa),
 	    (u_longlong_t)dmu_objset_id(db->db_objset),
 	    (longlong_t)db->db.db_object,
 	    (longlong_t)db->db_level,
 	    (longlong_t)db->db_blkid,
 	    (u_longlong_t)db->db.db_offset,
 	    (u_longlong_t)db->db.db_size,
 	    !!dbuf_is_metadata(db),
 	    db->db_state,
 	    (ulong_t)refcount_count(&db->db_holds),
 	    /* arc_buf_info_t */
 	    abi.abi_state_type,
 	    abi.abi_state_contents,
-	    (longlong_t)abi.abi_state_index,
 	    abi.abi_flags,
 	    (ulong_t)abi.abi_datacnt,
 	    (u_longlong_t)abi.abi_size,
 	    (u_longlong_t)abi.abi_access,
 	    (ulong_t)abi.abi_mru_hits,
 	    (ulong_t)abi.abi_mru_ghost_hits,
 	    (ulong_t)abi.abi_mfu_hits,
 	    (ulong_t)abi.abi_mfu_ghost_hits,
 	    (ulong_t)abi.abi_l2arc_hits,
 	    (u_longlong_t)abi.abi_l2arc_dattr,
 	    (u_longlong_t)abi.abi_l2arc_asize,
 	    abi.abi_l2arc_compress,
 	    (ulong_t)abi.abi_holds,
 	    /* dmu_object_info_t */
 	    doi.doi_type,
 	    doi.doi_bonus_type,
 	    (ulong_t)doi.doi_data_block_size,
 	    (ulong_t)doi.doi_metadata_block_size,
 	    (u_longlong_t)doi.doi_bonus_size,
 	    (ulong_t)doi.doi_indirection,
 	    (ulong_t)refcount_count(&dn->dn_holds),
 	    (u_longlong_t)doi.doi_fill_count,
 	    (u_longlong_t)doi.doi_max_offset);
 
 	if (nwritten >= size)
 		return (size);
 
 	return (nwritten + 1);
 }
 
 static int
 dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
 {
 	dbuf_stats_t *dsh = (dbuf_stats_t *)data;
 	dbuf_hash_table_t *h = dsh->hash;
 	dmu_buf_impl_t *db;
 	int length, error = 0;
 
 	ASSERT3S(dsh->idx, >=, 0);
 	ASSERT3S(dsh->idx, <=, h->hash_table_mask);
 	memset(buf, 0, size);
 
 	mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
 	for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) {
 		/*
 		 * Returning ENOMEM will cause the data and header functions
-		 * to be called with a larger scratch buffers.
+		 * to be called with larger scratch buffers.
 		 */
 		if (size < 512) {
 			error = ENOMEM;
 			break;
 		}
 
 		mutex_enter(&db->db_mtx);
 		mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
 
 		if (db->db_state != DB_EVICTING) {
 			length = __dbuf_stats_hash_table_data(buf, size, db);
 			buf += length;
 			size -= length;
 		}
 
 		mutex_exit(&db->db_mtx);
 		mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
 	}
 	mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
 
 	return (error);
 }
 
 static void *
 dbuf_stats_hash_table_addr(kstat_t *ksp, loff_t n)
 {
 	dbuf_stats_t *dsh = ksp->ks_private;
 
 	ASSERT(MUTEX_HELD(&dsh->lock));
 
 	if (n <= dsh->hash->hash_table_mask) {
 		dsh->idx = n;
 		return (dsh);
 	}
 
 	return (NULL);
 }
 
 static void
 dbuf_stats_hash_table_init(dbuf_hash_table_t *hash)
 {
 	dbuf_stats_t *dsh = &dbuf_stats_hash_table;
 	kstat_t *ksp;
 
 	mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL);
 	dsh->hash = hash;
 
 	ksp = kstat_create("zfs", 0, "dbufs", "misc",
 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
 	dsh->kstat = ksp;
 
 	if (ksp) {
 		ksp->ks_lock = &dsh->lock;
 		ksp->ks_ndata = UINT32_MAX;
 		ksp->ks_private = dsh;
 		kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers,
 		    dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr);
 		kstat_install(ksp);
 	}
 }
 
 static void
 dbuf_stats_hash_table_destroy(void)
 {
 	dbuf_stats_t *dsh = &dbuf_stats_hash_table;
 	kstat_t *ksp;
 
 	ksp = dsh->kstat;
 	if (ksp)
 		kstat_delete(ksp);
 
 	mutex_destroy(&dsh->lock);
 }
 
 void
 dbuf_stats_init(dbuf_hash_table_t *hash)
 {
 	dbuf_stats_hash_table_init(hash);
 }
 
 void
 dbuf_stats_destroy(void)
 {
 	dbuf_stats_hash_table_destroy();
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
 module_param(zfs_dbuf_state_index, int, 0644);
 MODULE_PARM_DESC(zfs_dbuf_state_index, "Calculate arc header index");
 #endif
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c
index 7d84d8bf4e81..108fc5299f41 100644
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@@ -1,1093 +1,1100 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_scan.h>
 #include <sys/dnode.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/zfs_context.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_deadlist.h>
 #include <sys/bptree.h>
 #include <sys/zfeature.h>
 #include <sys/zil_impl.h>
 #include <sys/dsl_userhold.h>
 #include <sys/trace_txg.h>
 
 /*
  * ZFS Write Throttle
  * ------------------
  *
  * ZFS must limit the rate of incoming writes to the rate at which it is able
  * to sync data modifications to the backend storage. Throttling by too much
  * creates an artificial limit; throttling by too little can only be sustained
  * for short periods and would lead to highly lumpy performance. On a per-pool
  * basis, ZFS tracks the amount of modified (dirty) data. As operations change
  * data, the amount of dirty data increases; as ZFS syncs out data, the amount
  * of dirty data decreases. When the amount of dirty data exceeds a
  * predetermined threshold further modifications are blocked until the amount
  * of dirty data decreases (as data is synced out).
  *
  * The limit on dirty data is tunable, and should be adjusted according to
  * both the IO capacity and available memory of the system. The larger the
  * window, the more ZFS is able to aggregate and amortize metadata (and data)
  * changes. However, memory is a limited resource, and allowing for more dirty
  * data comes at the cost of keeping other useful data in memory (for example
  * ZFS data cached by the ARC).
  *
  * Implementation
  *
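- * As buffers are modified dsl_pool_willuse_space() increments both the per-
+ * As buffers are modified dsl_pool_dirty_space() increments both the per-
  * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
- * dirty space used; dsl_pool_dirty_space() decrements those values as data
+ * dirty space used; dsl_pool_undirty_space() decrements those values as data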
  * is synced out from dsl_pool_sync(). While only the poolwide value is
  * relevant, the per-txg value is useful for debugging. The tunable
  * zfs_dirty_data_max determines the dirty space limit. Once that value is
  * exceeded, new writes are halted until space frees up.
  *
  * The zfs_dirty_data_sync tunable dictates the threshold at which we
  * ensure that there is a txg syncing (see the comment in txg.c for a full
  * description of transaction group stages).
  *
  * The IO scheduler uses both the dirty space limit and current amount of
  * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
  * issues. See the comment in vdev_queue.c for details of the IO scheduler.
  *
  * The delay is also calculated based on the amount of dirty data.  See the
  * comment above dmu_tx_delay() for details.
  */
 
 /*
  * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
  * capped at zfs_dirty_data_max_max.  It can also be overridden with a module
  * parameter.
  */
 unsigned long zfs_dirty_data_max = 0;
 unsigned long zfs_dirty_data_max_max = 0;
 int zfs_dirty_data_max_percent = 10;
 int zfs_dirty_data_max_max_percent = 25;
 
 /*
  * If there is at least this much dirty data, push out a txg.
  */
 unsigned long zfs_dirty_data_sync = 64 * 1024 * 1024;
 
 /*
  * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
  * and delay each transaction.
  * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
  */
 int zfs_delay_min_dirty_percent = 60;
 
 /*
  * This controls how quickly the delay approaches infinity.
  * Larger values cause it to delay more for a given amount of dirty data.
  * Therefore larger values will cause there to be less dirty data for a
  * given throughput.
  *
  * For the smoothest delay, this value should be about 1 billion divided
  * by the maximum number of operations per second.  This will smoothly
  * handle between 10x and 1/10th this number.
  *
  * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
  * multiply in dmu_tx_delay().
  */
 unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
 
 hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
 hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
 
 int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
 	uint64_t obj;
 	int err;
 
 	err = zap_lookup(dp->dp_meta_objset,
 	    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
 	    name, sizeof (obj), 1, &obj);
 	if (err)
 		return (err);
 
 	return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
 }
 
 static dsl_pool_t *
 dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 {
 	dsl_pool_t *dp;
 	blkptr_t *bp = spa_get_rootblkptr(spa);
 
 	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
 	dp->dp_spa = spa;
 	dp->dp_meta_rootbp = *bp;
 	rrw_init(&dp->dp_config_rwlock, B_TRUE);
 	txg_init(dp, txg);
 
 	txg_list_create(&dp->dp_dirty_datasets,
 	    offsetof(dsl_dataset_t, ds_dirty_link));
 	txg_list_create(&dp->dp_dirty_zilogs,
 	    offsetof(zilog_t, zl_dirty_link));
 	txg_list_create(&dp->dp_dirty_dirs,
 	    offsetof(dsl_dir_t, dd_dirty_link));
 	txg_list_create(&dp->dp_sync_tasks,
 	    offsetof(dsl_sync_task_t, dst_node));
 
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
 	dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri,
 	    1, 4, 0);
 
 	return (dp);
 }
 
 int
 dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 {
 	int err;
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 
 	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 	    &dp->dp_meta_objset);
 	if (err != 0)
 		dsl_pool_close(dp);
 	else
 		*dpp = dp;
 
 	return (err);
 }
 
 int
 dsl_pool_open(dsl_pool_t *dp)
 {
 	int err;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	uint64_t obj;
 
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 	    &dp->dp_root_dir_obj);
 	if (err)
 		goto out;
 
 	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 	    NULL, dp, &dp->dp_root_dir);
 	if (err)
 		goto out;
 
 	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 	if (err)
 		goto out;
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 		if (err)
 			goto out;
 		err = dsl_dataset_hold_obj(dp,
 		    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
 		if (err == 0) {
 			err = dsl_dataset_hold_obj(dp,
 			    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
 			    &dp->dp_origin_snap);
 			dsl_dataset_rele(ds, FTAG);
 		}
 		dsl_dir_rele(dd, dp);
 		if (err)
 			goto out;
 	}
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 		    &dp->dp_free_dir);
 		if (err)
 			goto out;
 
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 		if (err)
 			goto out;
 		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 		    dp->dp_meta_objset, obj));
 	}
 
 	/*
 	 * Note: errors ignored, because the leak dir will not exist if we
 	 * have not encountered a leak yet.
 	 */
 	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
 	    &dp->dp_leak_dir);
 
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 		    &dp->dp_bptree_obj);
 		if (err != 0)
 			goto out;
 	}
 
 	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 		    &dp->dp_empty_bpobj);
 		if (err != 0)
 			goto out;
 	}
 
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 	    &dp->dp_tmp_userrefs_obj);
 	if (err == ENOENT)
 		err = 0;
 	if (err)
 		goto out;
 
 	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 
 out:
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 	return (err);
 }
 
 void
 dsl_pool_close(dsl_pool_t *dp)
 {
 	/*
 	 * Drop our references from dsl_pool_open().
 	 *
 	 * Since we held the origin_snap from "syncing" context (which
 	 * includes pool-opening context), it actually only got a "ref"
 	 * and not a hold, so just drop that here.
 	 */
 	if (dp->dp_origin_snap)
 		dsl_dataset_rele(dp->dp_origin_snap, dp);
 	if (dp->dp_mos_dir)
 		dsl_dir_rele(dp->dp_mos_dir, dp);
 	if (dp->dp_free_dir)
 		dsl_dir_rele(dp->dp_free_dir, dp);
 	if (dp->dp_leak_dir)
 		dsl_dir_rele(dp->dp_leak_dir, dp);
 	if (dp->dp_root_dir)
 		dsl_dir_rele(dp->dp_root_dir, dp);
 
 	bpobj_close(&dp->dp_free_bpobj);
 
-	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
+	/* undo the dmu_objset_open_impl(mos) from dsl_pool_init() */
 	if (dp->dp_meta_objset)
 		dmu_objset_evict(dp->dp_meta_objset);
 
 	txg_list_destroy(&dp->dp_dirty_datasets);
 	txg_list_destroy(&dp->dp_dirty_zilogs);
 	txg_list_destroy(&dp->dp_sync_tasks);
 	txg_list_destroy(&dp->dp_dirty_dirs);
 
-	arc_flush(dp->dp_spa);
+	/*
+	 * We can't set retry to TRUE since we're explicitly specifying
+	 * a spa to flush. This is good enough; any missed buffers for
+	 * this spa won't cause trouble, and they'll eventually fall
+	 * out of the ARC just like any other unused buffer.
+	 */
+	arc_flush(dp->dp_spa, FALSE);
+
 	txg_fini(dp);
 	dsl_scan_fini(dp);
 	dmu_buf_user_evict_wait();
 
 	rrw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	taskq_destroy(dp->dp_iput_taskq);
 	if (dp->dp_blkstats)
 		vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 	kmem_free(dp, sizeof (dsl_pool_t));
 }
 
 dsl_pool_t *
 dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 {
 	int err;
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 	objset_t *os;
 	dsl_dataset_t *ds;
 	uint64_t obj;
 
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 
 	/* create and open the MOS (meta-objset) */
 	dp->dp_meta_objset = dmu_objset_create_impl(spa,
 	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
 
 	/* create the pool directory */
 	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 	ASSERT0(err);
 
 	/* Initialize scan structures */
 	VERIFY0(dsl_scan_init(dp, txg));
 
 	/* create and open the root dir */
 	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 	    NULL, dp, &dp->dp_root_dir));
 
 	/* create and open the meta-objset dir */
 	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
 	VERIFY0(dsl_pool_open_special_dir(dp,
 	    MOS_DIR_NAME, &dp->dp_mos_dir));
 
 	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		/* create and open the free dir */
 		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 		    FREE_DIR_NAME, tx);
 		VERIFY0(dsl_pool_open_special_dir(dp,
 		    FREE_DIR_NAME, &dp->dp_free_dir));
 
 		/* create and open the free_bplist */
 		obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
 		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
 		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 		    dp->dp_meta_objset, obj));
 	}
 
 	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 		dsl_pool_create_origin(dp, tx);
 
 	/* create the root dataset */
 	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 
 	/* create the root objset */
 	VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
 	VERIFY(NULL != (os = dmu_objset_create_impl(dp->dp_spa, ds,
 	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx)));
 #ifdef _KERNEL
 	zfs_create_fs(os, kcred, zplprops, tx);
 #endif
 	dsl_dataset_rele(ds, FTAG);
 
 	dmu_tx_commit(tx);
 
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 
 	return (dp);
 }
 
 /*
  * Account for the meta-objset space in its placeholder dsl_dir.
  */
 void
 dsl_pool_mos_diduse_space(dsl_pool_t *dp,
     int64_t used, int64_t comp, int64_t uncomp)
 {
 	ASSERT3U(comp, ==, uncomp); /* it's all metadata */
 	mutex_enter(&dp->dp_lock);
 	dp->dp_mos_used_delta += used;
 	dp->dp_mos_compressed_delta += comp;
 	dp->dp_mos_uncompressed_delta += uncomp;
 	mutex_exit(&dp->dp_lock);
 }
 
 static int
 deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_deadlist_t *dl = arg;
 	dsl_deadlist_insert(dl, bp, tx);
 	return (0);
 }
 
 static void
 dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
 	VERIFY0(zio_wait(zio));
 	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
 	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 }
 
 static void
 dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
 {
 	ASSERT(MUTEX_HELD(&dp->dp_lock));
 
 	if (delta < 0)
 		ASSERT3U(-delta, <=, dp->dp_dirty_total);
 
 	dp->dp_dirty_total += delta;
 
 	/*
 	 * Note: we signal even when increasing dp_dirty_total.
 	 * This ensures forward progress -- each thread wakes the next waiter.
 	 */
 	if (dp->dp_dirty_total <= zfs_dirty_data_max)
 		cv_signal(&dp->dp_spaceavail_cv);
 }
 
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
 	zio_t *zio;
 	dmu_tx_t *tx;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	objset_t *mos = dp->dp_meta_objset;
 	list_t synced_datasets;
 
 	list_create(&synced_datasets, sizeof (dsl_dataset_t),
 	    offsetof(dsl_dataset_t, ds_synced_link));
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Write out all dirty blocks of dirty datasets.
 	 */
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		/*
 		 * We must not sync any non-MOS datasets twice, because
 		 * we may have taken a snapshot of them.  However, we
 		 * may sync newly-created datasets on pass 2.
 		 */
 		ASSERT(!list_link_active(&ds->ds_synced_link));
 		list_insert_tail(&synced_datasets, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
 	VERIFY0(zio_wait(zio));
 
 	/*
 	 * We have written all of the accounted dirty data, so our
 	 * dp_space_towrite should now be zero.  However, some seldom-used
 	 * code paths do not adhere to this (e.g. dbuf_undirty(), also
 	 * rounding error in dbuf_write_physdone).
 	 * Shore up the accounting of any dirtied space now.
 	 */
 	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
 
 	/*
 	 * After the data blocks have been written (ensured by the zio_wait()
 	 * above), update the user/group space accounting.
 	 */
 	for (ds = list_head(&synced_datasets); ds != NULL;
 	    ds = list_next(&synced_datasets, ds)) {
 		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 	}
 
 	/*
 	 * Sync the datasets again to push out the changes due to
 	 * userspace updates.  This must be done before we process the
 	 * sync tasks, so that any snapshots will have the correct
 	 * user accounting information (and we won't get confused
 	 * about which blocks are part of the snapshot).
 	 */
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		ASSERT(list_link_active(&ds->ds_synced_link));
 		dmu_buf_rele(ds->ds_dbuf, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
 	VERIFY0(zio_wait(zio));
 
 	/*
 	 * Now that the datasets have been completely synced, we can
 	 * clean up our in-memory structures accumulated while syncing:
 	 *
 	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
 	 *  - release hold from dsl_dataset_dirty()
 	 */
 	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
 		ASSERTV(objset_t *os = ds->ds_objset);
 		bplist_iterate(&ds->ds_pending_deadlist,
 		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
 		ASSERT(!dmu_objset_is_dirty(os, txg));
 		dmu_buf_rele(ds->ds_dbuf, ds);
 	}
 
 	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
 		dsl_dir_sync(dd, tx);
 	}
 
 	/*
 	 * The MOS's space is accounted for in the pool/$MOS
 	 * (dp_mos_dir).  We can't modify the mos while we're syncing
 	 * it, so we remember the deltas and apply them here.
 	 */
 	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
 	    dp->dp_mos_uncompressed_delta != 0) {
 		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
 		    dp->dp_mos_used_delta,
 		    dp->dp_mos_compressed_delta,
 		    dp->dp_mos_uncompressed_delta, tx);
 		dp->dp_mos_used_delta = 0;
 		dp->dp_mos_compressed_delta = 0;
 		dp->dp_mos_uncompressed_delta = 0;
 	}
 
 	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
 		dsl_pool_sync_mos(dp, tx);
 	}
 
 	/*
 	 * If we modify a dataset in the same txg that we want to destroy it,
 	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
 	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
 	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
 	 * and clearing the hold on it) before we process the sync_tasks.
 	 * The MOS data dirtied by the sync_tasks will be synced on the next
 	 * pass.
 	 */
 	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
 		dsl_sync_task_t *dst;
 		/*
 		 * No more sync tasks should have been added while we
 		 * were syncing.
 		 */
 		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
 		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
 			dsl_sync_task_sync(dst, tx);
 	}
 
 	dmu_tx_commit(tx);
 
 	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
 }
 
 void
 dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 {
 	zilog_t *zilog;
 
 	while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg))) {
 		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 		zil_clean(zilog, txg);
 		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 		dmu_buf_rele(ds->ds_dbuf, zilog);
 	}
 	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }
 
 /*
  * TRUE if the current thread is the tx_sync_thread or if we
  * are being called from SPA context during pool initialization.
  */
 int
 dsl_pool_sync_context(dsl_pool_t *dp)
 {
 	return (curthread == dp->dp_tx.tx_sync_thread ||
 	    spa_is_initializing(dp->dp_spa));
 }
 
 uint64_t
 dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
 {
 	uint64_t space, resv;
 
 	/*
 	 * If we're trying to assess whether it's OK to do a free,
 	 * cut the reservation in half to allow forward progress
 	 * (e.g. make it possible to rm(1) files from a full pool).
 	 */
 	space = spa_get_dspace(dp->dp_spa);
 	resv = spa_get_slop_space(dp->dp_spa);
 	if (netfree)
 		resv >>= 1;
 
 	return (space - resv);
 }
 
 boolean_t
 dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
 	uint64_t delay_min_bytes =
 	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
 	boolean_t rv;
 
 	mutex_enter(&dp->dp_lock);
 	if (dp->dp_dirty_total > zfs_dirty_data_sync)
 		txg_kick(dp);
 	rv = (dp->dp_dirty_total > delay_min_bytes);
 	mutex_exit(&dp->dp_lock);
 	return (rv);
 }
 
 void
 dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 {
 	if (space > 0) {
 		mutex_enter(&dp->dp_lock);
 		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
 		dsl_pool_dirty_delta(dp, space);
 		mutex_exit(&dp->dp_lock);
 	}
 }
 
 void
 dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
 {
 	ASSERT3S(space, >=, 0);
 	if (space == 0)
 		return;
 
 	mutex_enter(&dp->dp_lock);
 	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
 		/* XXX writing something we didn't dirty? */
 		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
 	}
 	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
 	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
 	ASSERT3U(dp->dp_dirty_total, >=, space);
 	dsl_pool_dirty_delta(dp, -space);
 	mutex_exit(&dp->dp_lock);
 }
 
 /* ARGSUSED */
 static int
 upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
 	dmu_tx_t *tx = arg;
 	dsl_dataset_t *ds, *prev = NULL;
 	int err;
 
 	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 	if (err)
 		return (err);
 
 	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 		err = dsl_dataset_hold_obj(dp,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 		if (err) {
 			dsl_dataset_rele(ds, FTAG);
 			return (err);
 		}
 
 		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
 			break;
 		dsl_dataset_rele(ds, FTAG);
 		ds = prev;
 		prev = NULL;
 	}
 
 	if (prev == NULL) {
 		prev = dp->dp_origin_snap;
 
 		/*
 		 * The $ORIGIN can't have any data, or the accounting
 		 * will be wrong.
 		 */
 		ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
 
 		/* The origin doesn't get attached to itself */
 		if (ds->ds_object == prev->ds_object) {
 			dsl_dataset_rele(ds, FTAG);
 			return (0);
 		}
 
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
 		dsl_dataset_phys(ds)->ds_prev_snap_txg =
 		    dsl_dataset_phys(prev)->ds_creation_txg;
 
 		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 		dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
 
 		dmu_buf_will_dirty(prev->ds_dbuf, tx);
 		dsl_dataset_phys(prev)->ds_num_children++;
 
 		if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
 			ASSERT(ds->ds_prev == NULL);
 			VERIFY0(dsl_dataset_hold_obj(dp,
 			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
 			    ds, &ds->ds_prev));
 		}
 	}
 
 	ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
 	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
 
 	if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
 		dmu_buf_will_dirty(prev->ds_dbuf, tx);
 		dsl_dataset_phys(prev)->ds_next_clones_obj =
 		    zap_create(dp->dp_meta_objset,
 		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 	}
 	VERIFY0(zap_add_int(dp->dp_meta_objset,
 	    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
 
 	dsl_dataset_rele(ds, FTAG);
 	if (prev != dp->dp_origin_snap)
 		dsl_dataset_rele(prev, FTAG);
 	return (0);
 }
 
 void
 dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dp->dp_origin_snap != NULL);
 
 	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
 	    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 }
 
 /* ARGSUSED */
 static int
 upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	dmu_tx_t *tx = arg;
 	objset_t *mos = dp->dp_meta_objset;
 
 	if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
 		dsl_dataset_t *origin;
 
 		VERIFY0(dsl_dataset_hold_obj(dp,
 		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
 
 		if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
 			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 			dsl_dir_phys(origin->ds_dir)->dd_clones =
 			    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
 			    0, tx);
 		}
 
 		VERIFY0(zap_add_int(dp->dp_meta_objset,
 		    dsl_dir_phys(origin->ds_dir)->dd_clones,
 		    ds->ds_object, tx));
 
 		dsl_dataset_rele(origin, FTAG);
 	}
 	return (0);
 }
 
 /*
  * Upgrade step run in syncing context: create and open the FREE_DIR_NAME
  * special dir, allocate and open the pool's free bpobj (recorded in the
  * pool directory object under DMU_POOL_FREE_BPOBJ), then invoke
  * upgrade_dir_clones_cb() through dmu_objset_find_dp() starting at the
  * root dir object.  Any failure along the way is fatal (VERIFY0).
  */
 void
 dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	uint64_t obj;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	/* Return value deliberately ignored; the dir is opened just below. */
 	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
 	VERIFY0(dsl_pool_open_special_dir(dp,
 	    FREE_DIR_NAME, &dp->dp_free_dir));
 
 	/*
 	 * We can't use bpobj_alloc(), because spa_version() still
 	 * returns the old version, and we need a new-version bpobj with
 	 * subobj support.  So call dmu_object_alloc() directly.
 	 */
 	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
 	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
 	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 
 	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 }
 
 /*
  * Create the ORIGIN_DIR_NAME dataset under the root dir, snapshot it, and
  * hold that snapshot in dp->dp_origin_snap.
  *
  * Must be called in syncing context with dp_config_rwlock held as writer,
  * and only while dp_origin_snap is still NULL.
  */
 void
 dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	uint64_t dsobj;
 	dsl_dataset_t *ds;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dp->dp_origin_snap == NULL);
 	ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
 
 	/* create the origin dir, ds, & snap-ds */
 	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 	    NULL, 0, kcred, tx);
 	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
 	/*
 	 * The snapshot hold is tagged with the pool itself ('dp') rather
 	 * than FTAG and is not released in this function.
 	 */
 	VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
 	    dp, &dp->dp_origin_snap));
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /* Accessor: return the pool's dp_iput_taskq. */
 taskq_t *
 dsl_pool_iput_taskq(dsl_pool_t *dp)
 {
 	return (dp->dp_iput_taskq);
 }
 
 /*
  * Walk through the pool-wide zap object of temporary snapshot user holds
  * and release them.
  *
  * Each zap entry is named "<dsobj-hex>-<tag>" (the format written by
  * dsl_pool_user_hold_rele_impl()).  The entries are grouped into an nvlist
  * that maps every "<dsobj-hex>" to an nvlist of its tags, and that nvlist
  * is handed to dsl_dataset_user_release_tmp().  Nothing is done when the
  * pool has no temporary-holds zap object.
  */
 void
 dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
 {
 	zap_attribute_t za;
 	zap_cursor_t zc;
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 	nvlist_t *holds;
 
 	if (zapobj == 0)
 		return;
 	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 
 	holds = fnvlist_alloc();
 
 	for (zap_cursor_init(&zc, mos, zapobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		char *htag;
 		nvlist_t *tags;
 
 		/*
 		 * Split "<dsobj-hex>-<tag>" in place at the first '-'.  An
 		 * entry without a separator is malformed; skip it rather
 		 * than dereference the NULL returned by strchr().
 		 */
 		htag = strchr(za.za_name, '-');
 		if (htag == NULL)
 			continue;
 		*htag = '\0';
 		++htag;
 		if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
 			/* First tag for this dataset: start a new tag list. */
 			tags = fnvlist_alloc();
 			fnvlist_add_boolean(tags, htag);
 			fnvlist_add_nvlist(holds, za.za_name, tags);
 			fnvlist_free(tags);
 		} else {
 			fnvlist_add_boolean(tags, htag);
 		}
 	}
 	dsl_dataset_user_release_tmp(dp, holds);
 	fnvlist_free(holds);
 	zap_cursor_fini(&zc);
 }
 
 /*
  * Create the pool-wide zap object for storing temporary snapshot holds.
  *
  * Must be called in syncing context, and only while dp_tmp_userrefs_obj is
  * still 0.  The new object is linked into the pool directory object under
  * DMU_POOL_TMP_USERREFS and its number is stored in dp_tmp_userrefs_obj.
  */
 void
 dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	objset_t *mos = dp->dp_meta_objset;
 
 	ASSERT(dp->dp_tmp_userrefs_obj == 0);
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 }
 
 /*
  * Common implementation of dsl_pool_user_hold() and dsl_pool_user_release().
  *
  * The hold is keyed in the temporary-holds zap as "<dsobj-hex>-<tag>".
  * When 'holding' is B_TRUE the entry is added with 'now' as its value;
  * otherwise the entry is removed and 'now' is not used.  Returns the zap
  * result, or ENOENT when releasing and the zap object does not exist.
  */
 static int
 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
     const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 	char *entry;
 	int error;
 
 	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	/*
 	 * Pools created before SPA_VERSION_USERREFS may not have the
 	 * temporary-holds zap object yet: create it on the first hold, and
 	 * report ENOENT for a release since there is nothing to remove.
 	 */
 	if (zapobj == 0) {
 		if (!holding)
 			return (SET_ERROR(ENOENT));
 
 		dsl_pool_user_hold_create_obj(dp, tx);
 		zapobj = dp->dp_tmp_userrefs_obj;
 	}
 
 	entry = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
 	error = holding ?
 	    zap_add(mos, zapobj, entry, 8, 1, &now, tx) :
 	    zap_remove(mos, zapobj, entry, tx);
 	strfree(entry);
 
 	return (error);
 }
 
 /*
  * Add a temporary hold for the given dataset object and tag.
  *
  * 'now' is stored as the value of the new zap entry.  Returns whatever
  * dsl_pool_user_hold_rele_impl() returns for the add.
  */
 int
 dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
     uint64_t now, dmu_tx_t *tx)
 {
 	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
 }
 
 /*
  * Release a temporary hold for the given dataset object and tag.
  *
  * Returns whatever dsl_pool_user_hold_rele_impl() returns for the removal
  * (ENOENT when the pool has no temporary-holds zap object).
  */
 int
 dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
     dmu_tx_t *tx)
 {
 	/* The 'now' argument is only consumed when adding a hold; pass 0. */
 	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
 	    tx, B_FALSE));
 }
 
 /*
  * DSL Pool Configuration Lock
  *
  * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
  * creation / destruction / rename / property setting).  It must be held for
  * read to hold a dataset or dsl_dir.  I.e. you must call
  * dsl_pool_config_enter() or dsl_pool_hold() before calling
  * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
  * must be held continuously until all datasets and dsl_dirs are released.
  *
  * The only exception to this rule is that if a "long hold" is placed on
  * a dataset, then the dp_config_rwlock may be dropped while the dataset
  * is still held.  The long hold will prevent the dataset from being
  * destroyed -- the destroy will fail with EBUSY.  A long hold can be
  * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
  * (by calling dsl_{dataset,objset}_{try}own{_obj}).
  *
  * Legitimate long-holders (including owners) should be long-running, cancelable
  * tasks that should cause "zfs destroy" to fail.  This includes DMU
  * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
  * "zfs send", and "zfs diff".  There are several other long-holders whose
  * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
  *
  * The usual formula for long-holding would be:
  * dsl_pool_hold()
  * dsl_dataset_hold()
  * ... perform checks ...
  * dsl_dataset_long_hold()
  * dsl_pool_rele()
  * ... perform long-running task ...
  * dsl_dataset_long_rele()
  * dsl_dataset_rele()
  *
  * Note that when the long hold is released, the dataset is still held but
  * the pool is not held.  The dataset may change arbitrarily during this time
  * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
  * dataset except release it.
  *
  * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
  * or modifying operations.
  *
  * Modifying operations should generally use dsl_sync_task().  The synctask
  * infrastructure enforces proper locking strategy with respect to the
  * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
  *
  * Read-only operations will manually hold the pool, then the dataset, obtain
  * information from the dataset, then release the pool and dataset.
  * dmu_objset_{hold,rele}() are convenience routines that also do the pool
  * hold/rele.
  */
 
 /*
  * Open the pool called 'name' and enter its config lock, both on behalf
  * of 'tag'.  On success *dp is set and 0 is returned; on failure the
  * spa_open() error is returned and *dp is left untouched.
  */
 int
 dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
 {
 	spa_t *spa;
 	int error = spa_open(name, &spa, tag);
 
 	if (error != 0)
 		return (error);
 
 	*dp = spa_get_dsl(spa);
 	dsl_pool_config_enter(*dp, tag);
 
 	return (0);
 }
 
 /*
  * Leave the pool's config lock and then close its spa, passing 'tag' to
  * both.  NOTE(review): presumably 'tag' must be the one given to
  * dsl_pool_hold() -- confirm against rrw_exit() / spa_close().
  */
 void
 dsl_pool_rele(dsl_pool_t *dp, void *tag)
 {
 	dsl_pool_config_exit(dp, tag);
 	spa_close(dp->dp_spa, tag);
 }
 
 /*
  * Take the pool's dp_config_rwlock as reader on behalf of 'tag'.  The
  * calling thread must not already hold it as reader (ASSERTed).
  */
 void
 dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
 {
 	/*
 	 * We use a "reentrant" reader-writer lock, but not reentrantly.
 	 *
 	 * The rrwlock can (with the track_all flag) track all reading threads,
 	 * which is very useful for debugging which code path failed to release
 	 * the lock, and for verifying that the *current* thread does hold
 	 * the lock.
 	 *
 	 * (Unlike a rwlock, which knows that N threads hold it for
 	 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
 	 * if any thread holds it for read, even if this thread doesn't).
 	 */
 	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
 	rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
 }
 
 /* Release the pool's dp_config_rwlock on behalf of 'tag'. */
 void
 dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
 {
 	rrw_exit(&dp->dp_config_rwlock, tag);
 }
 
 /* Report whether dp_config_rwlock is held, as evaluated by RRW_LOCK_HELD(). */
 boolean_t
 dsl_pool_config_held(dsl_pool_t *dp)
 {
 	return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
 }
 
 /*
  * Report whether dp_config_rwlock is held for write, as evaluated by
  * RRW_WRITE_HELD().
  */
 boolean_t
 dsl_pool_config_held_writer(dsl_pool_t *dp)
 {
 	return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
 }
 
 /*
  * Symbol exports and module parameters; only compiled when both _KERNEL
  * and HAVE_SPL are defined.  The third module_param() argument is the
  * permission mode of the parameter: the 0444 ones below are read-only,
  * the 0644 ones are also writable by their owner.
  */
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(dsl_pool_config_enter);
 EXPORT_SYMBOL(dsl_pool_config_exit);
 
 /* zfs_dirty_data_max_percent only applied at module load in arc_init(). */
 module_param(zfs_dirty_data_max_percent, int, 0444);
 MODULE_PARM_DESC(zfs_dirty_data_max_percent, "percent of ram can be dirty");
 
 /* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */
 module_param(zfs_dirty_data_max_max_percent, int, 0444);
 MODULE_PARM_DESC(zfs_dirty_data_max_max_percent,
 	"zfs_dirty_data_max upper bound as % of RAM");
 
 module_param(zfs_delay_min_dirty_percent, int, 0644);
 MODULE_PARM_DESC(zfs_delay_min_dirty_percent, "transaction delay threshold");
 
 module_param(zfs_dirty_data_max, ulong, 0644);
 MODULE_PARM_DESC(zfs_dirty_data_max, "determines the dirty space limit");
 
 /* zfs_dirty_data_max_max only applied at module load in arc_init(). */
 module_param(zfs_dirty_data_max_max, ulong, 0444);
 MODULE_PARM_DESC(zfs_dirty_data_max_max,
 	"zfs_dirty_data_max upper bound in bytes");
 
 module_param(zfs_dirty_data_sync, ulong, 0644);
 MODULE_PARM_DESC(zfs_dirty_data_sync, "sync txg when this much dirty data");
 
 module_param(zfs_delay_scale, ulong, 0644);
 MODULE_PARM_DESC(zfs_delay_scale, "how quickly delay approaches infinity");
 #endif
diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c
new file mode 100644
index 000000000000..e4446ded2208
--- /dev/null
+++ b/module/zfs/multilist.c
@@ -0,0 +1,375 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/multilist.h>
+#include <sys/trace_multilist.h>
+
+/* needed for spa_get_random() */
+#include <sys/spa.h>
+
+/*
+ * Given the object contained on the list, return a pointer to the
+ * object's multilist_node_t structure it contains.
+ *
+ * The node lives ml->ml_offset bytes into the object.  This helper is
+ * compiled only for DEBUG builds; within this file it is referenced
+ * solely from ASSERT() expressions.
+ */
+#ifdef DEBUG
+static multilist_node_t *
+multilist_d2l(multilist_t *ml, void *obj)
+{
+	return ((multilist_node_t *)((char *)obj + ml->ml_offset));
+}
+#endif
+
+/*
+ * Initialize a new multilist using the parameters specified.
+ *
+ *  - 'size' denotes the size of the structure containing the
+ *     multilist_node_t.
+ *  - 'offset' denotes the byte offset of the multilist_node_t within
+ *     the structure that contains it.
+ *  - 'num' specifies the number of internal sublists to create.
+ *  - 'index_func' is used to determine which sublist to insert into
+ *     when the multilist_insert() function is called; as well as which
+ *     sublist to remove from when multilist_remove() is called. The
+ *     requirements this function must meet, are the following:
+ *
+ *      - It must always return the same value when called on the same
+ *        object (to ensure the object is removed from the list it was
+ *        inserted into).
+ *
+ *      - It must return a value in the range [0, number of sublists).
+ *        The multilist_get_num_sublists() function may be used to
+ *        determine the number of sublists in the multilist.
+ *
+ *     Also, in order to reduce internal contention between the sublists
+ *     during insertion and removal, this function should choose evenly
+ *     between all available sublists when inserting. This isn't a hard
+ *     requirement, but a general rule of thumb in order to garner the
+ *     best multi-threaded performance out of the data structure.
+ *
+ * The sublist array is allocated here with KM_SLEEP and is released by
+ * multilist_destroy().
+ */
+void
+multilist_create(multilist_t *ml, size_t size, size_t offset, unsigned int num,
+    multilist_sublist_index_func_t *index_func)
+{
+	/* Unsigned, to match the unsigned sublist count it is compared to. */
+	unsigned int i;
+
+	ASSERT3P(ml, !=, NULL);
+	ASSERT3U(size, >, 0);
+	ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
+	ASSERT3U(num, >, 0);
+	ASSERT3P(index_func, !=, NULL);
+
+	ml->ml_offset = offset;
+	ml->ml_num_sublists = num;
+	ml->ml_index_func = index_func;
+
+	ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
+	    ml->ml_num_sublists, KM_SLEEP);
+
+	ASSERT3P(ml->ml_sublists, !=, NULL);
+
+	/* Every sublist gets its own lock and its own list head. */
+	for (i = 0; i < ml->ml_num_sublists; i++) {
+		multilist_sublist_t *mls = &ml->ml_sublists[i];
+		mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL);
+		list_create(&mls->mls_list, size, offset);
+	}
+}
+
+/*
+ * Destroy the given multilist object, and free up any memory it holds.
+ *
+ * Every sublist must already be empty (ASSERTed).  On return the
+ * multilist's sublist pointer, sublist count and offset are all reset,
+ * so no stale pointer to the freed sublist array is left behind.
+ */
+void
+multilist_destroy(multilist_t *ml)
+{
+	/* Unsigned, to match the unsigned sublist count it is compared to. */
+	unsigned int i;
+
+	ASSERT(multilist_is_empty(ml));
+
+	for (i = 0; i < ml->ml_num_sublists; i++) {
+		multilist_sublist_t *mls = &ml->ml_sublists[i];
+
+		ASSERT(list_is_empty(&mls->mls_list));
+
+		list_destroy(&mls->mls_list);
+		mutex_destroy(&mls->mls_lock);
+	}
+
+	ASSERT3P(ml->ml_sublists, !=, NULL);
+	kmem_free(ml->ml_sublists,
+	    sizeof (multilist_sublist_t) * ml->ml_num_sublists);
+
+	/* Don't leave a dangling pointer to the array that was just freed. */
+	ml->ml_sublists = NULL;
+	ml->ml_num_sublists = 0;
+	ml->ml_offset = 0;
+}
+
+/*
+ * Insert the given object into the multilist.
+ *
+ * This function will insert the object specified into the sublist
+ * determined using the function given at multilist creation time.
+ * The object is placed at the head of that sublist and must not
+ * already be linked onto a list (ASSERTed).
+ *
+ * The sublist locks are automatically acquired if not already held, to
+ * ensure consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_insert(multilist_t *ml, void *obj)
+{
+	unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+	multilist_sublist_t *mls;
+	boolean_t need_lock;
+
+	DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
+	    unsigned int, sublist_idx, void *, obj);
+
+	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+	mls = &ml->ml_sublists[sublist_idx];
+
+	/*
+	 * Note: Callers may already hold the sublist lock by calling
+	 * multilist_sublist_lock().  Here we rely on MUTEX_HELD()
+	 * returning TRUE if and only if the current thread holds the
+	 * lock.  While it's a little ugly to make the lock recursive in
+	 * this way, it works and allows the calling code to be much
+	 * simpler -- otherwise it would have to pass around a flag
+	 * indicating that it already has the lock.
+	 */
+	need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+	if (need_lock)
+		mutex_enter(&mls->mls_lock);
+
+	ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
+
+	multilist_sublist_insert_head(mls, obj);
+
+	if (need_lock)
+		mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * Remove the given object from the multilist.
+ *
+ * This function will remove the object specified from the sublist
+ * determined using the function given at multilist creation time.
+ * The object must currently be linked onto a list (ASSERTed).
+ *
+ * The necessary sublist lock is automatically acquired if not already
+ * held, to ensure consistency when inserting and removing from
+ * multiple threads.
+ */
+void
+multilist_remove(multilist_t *ml, void *obj)
+{
+	unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+	multilist_sublist_t *mls;
+	boolean_t need_lock;
+
+	DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
+	    unsigned int, sublist_idx, void *, obj);
+
+	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+	mls = &ml->ml_sublists[sublist_idx];
+	/* See comment in multilist_insert(). */
+	need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+	if (need_lock)
+		mutex_enter(&mls->mls_lock);
+
+	ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
+
+	multilist_sublist_remove(mls, obj);
+
+	if (need_lock)
+		mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * Check to see if this multilist object is empty.
+ *
+ * This will return TRUE if it finds all of the sublists of this
+ * multilist to be empty, and FALSE otherwise. Each sublist lock will be
+ * automatically acquired as necessary.
+ *
+ * If concurrent insertions and removals are occurring, the semantics
+ * of this function become a little fuzzy. Instead of locking all
+ * sublists for the entire call time of the function, each sublist is
+ * only locked as it is individually checked for emptiness. Thus, it's
+ * possible for this function to return TRUE with non-empty sublists at
+ * the time the function returns. This would be due to another thread
+ * inserting into a given sublist, after that specific sublist was
+ * checked and deemed empty, but before all sublists have been checked.
+ */
+int
+multilist_is_empty(multilist_t *ml)
+{
+	/* Unsigned, to match the unsigned sublist count it is compared to. */
+	unsigned int i;
+
+	for (i = 0; i < ml->ml_num_sublists; i++) {
+		multilist_sublist_t *mls = &ml->ml_sublists[i];
+		/* See comment in multilist_insert(). */
+		boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
+		int empty;
+
+		if (need_lock)
+			mutex_enter(&mls->mls_lock);
+
+		/* Sample under the lock, then unlock on a single path. */
+		empty = list_is_empty(&mls->mls_list);
+
+		if (need_lock)
+			mutex_exit(&mls->mls_lock);
+
+		if (!empty)
+			return (FALSE);
+	}
+
+	return (TRUE);
+}
+
+/*
+ * Return the number of sublists composing this multilist, i.e. the
+ * value stored in ml_num_sublists by multilist_create().
+ */
+unsigned int
+multilist_get_num_sublists(multilist_t *ml)
+{
+	return (ml->ml_num_sublists);
+}
+
+/*
+ * Return a randomly selected, valid sublist index for this multilist.
+ * The value comes from spa_get_random(ml_num_sublists); NOTE(review):
+ * presumably that yields a value in [0, ml_num_sublists) -- confirm.
+ */
+unsigned int
+multilist_get_random_index(multilist_t *ml)
+{
+	return (spa_get_random(ml->ml_num_sublists));
+}
+
+/*
+ * Lock and return the sublist specified at the given index.
+ *
+ * 'sublist_idx' must be less than the number of sublists (ASSERTed).
+ * The sublist's mls_lock is entered here and stays held on return; it
+ * is released again by multilist_sublist_unlock().
+ */
+multilist_sublist_t *
+multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+{
+	multilist_sublist_t *mls;
+
+	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+	mls = &ml->ml_sublists[sublist_idx];
+	mutex_enter(&mls->mls_lock);
+
+	return (mls);
+}
+
+void
+multilist_sublist_unlock(multilist_sublist_t *mls)
+{
+	mutex_exit(&mls->mls_lock);	/* undo multilist_sublist_lock() */
+}
+
+/*
+ * Insert 'obj' at the head of this sublist.  The caller must hold the
+ * sublist's lock (ASSERTed).
+ *
+ * We're allowing any object to be inserted into this specific sublist,
+ * but this can lead to trouble if multilist_remove() is called to
+ * remove this object. Specifically, if calling ml_index_func on this
+ * object returns an index for sublist different than what is passed as
+ * a parameter here, any call to multilist_remove() with this newly
+ * inserted object is undefined! (the call to multilist_remove() will
+ * remove the object from a list that it isn't contained in)
+ */
+void
+multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	list_insert_head(&mls->mls_list, obj);
+}
+
+/*
+ * Insert 'obj' at the tail of this sublist.  The caller must hold the
+ * sublist's lock (ASSERTed).  Please see the comment above
+ * multilist_sublist_insert_head() for the caveat about index functions.
+ */
+void
+multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	list_insert_tail(&mls->mls_list, obj);
+}
+
+/*
+ * Move the object one element forward in the list.
+ *
+ * This function will move the given object forward in the list (towards
+ * the head) by one object. So, in essence, it will swap its position in
+ * the list with its "prev" pointer. If the given object is already at the
+ * head of the list, it cannot be moved forward any more than it already
+ * is, so no action is taken.
+ *
+ * The caller must hold the sublist's lock, and the sublist must not be
+ * empty (both ASSERTed).
+ *
+ * NOTE: This function **must not** remove any object from the list other
+ *       than the object given as the parameter. This is relied upon in
+ *       arc_evict_state_impl().
+ */
+void
+multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
+{
+	void *prev;
+
+	/* Check the locking contract before the list is read at all. */
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	ASSERT(!list_is_empty(&mls->mls_list));
+
+	prev = list_prev(&mls->mls_list, obj);
+
+	/* 'obj' must be at the head of the list, nothing to do */
+	if (prev == NULL)
+		return;
+
+	/* Re-link 'obj' directly in front of its former predecessor. */
+	list_remove(&mls->mls_list, obj);
+	list_insert_before(&mls->mls_list, prev, obj);
+}
+
+/*
+ * Unlink 'obj' from this sublist.  The caller must already hold the
+ * sublist's lock (ASSERTed).
+ */
+void
+multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
+{
+	list_t *sublist = &mls->mls_list;
+
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	list_remove(sublist, obj);
+}
+
+/*
+ * Return the object at the head of this sublist, as reported by
+ * list_head().  The caller must hold the sublist's lock (ASSERTed).
+ */
+void *
+multilist_sublist_head(multilist_sublist_t *mls)
+{
+	void *first;
+
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	first = list_head(&mls->mls_list);
+
+	return (first);
+}
+
+/*
+ * Return the object at the tail of this sublist, as reported by
+ * list_tail().  The caller must hold the sublist's lock (ASSERTed).
+ */
+void *
+multilist_sublist_tail(multilist_sublist_t *mls)
+{
+	void *last;
+
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	last = list_tail(&mls->mls_list);
+
+	return (last);
+}
+
+/*
+ * Return the object that follows 'obj' on this sublist, as reported by
+ * list_next().  The caller must hold the sublist's lock (ASSERTed).
+ */
+void *
+multilist_sublist_next(multilist_sublist_t *mls, void *obj)
+{
+	void *successor;
+
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	successor = list_next(&mls->mls_list, obj);
+
+	return (successor);
+}
+
+/*
+ * Return the object that precedes 'obj' on this sublist, as reported by
+ * list_prev().  The caller must hold the sublist's lock (ASSERTed).
+ */
+void *
+multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
+{
+	void *predecessor;
+
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	predecessor = list_prev(&mls->mls_list, obj);
+
+	return (predecessor);
+}
+
+void
+multilist_link_init(multilist_node_t *link)
+{
+	list_link_init(link);	/* node is handled as a plain list node */
+}
+
+/* Report whether 'link' is active, as determined by list_link_active(). */
+int
+multilist_link_active(multilist_node_t *link)
+{
+	int active = list_link_active(link);
+
+	return (active);
+}
diff --git a/module/zfs/trace.c b/module/zfs/trace.c
index 470cf18bff30..0c9990e8547b 100644
--- a/module/zfs/trace.c
+++ b/module/zfs/trace.c
@@ -1,47 +1,50 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Each Linux tracepoints subsystem must define CREATE_TRACE_POINTS in one
  * (and only one) C file, so this dummy file exists for that purpose.
  */
 
+#include <sys/multilist.h>
 #include <sys/arc_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dmu_tx.h>
 #include <sys/dnode.h>
+#include <sys/multilist.h>
 #include <sys/zfs_znode.h>
 #include <sys/zil_impl.h>
 #include <sys/zrlock.h>
 
 #define	CREATE_TRACE_POINTS
 #include <sys/trace.h>
 #include <sys/trace_acl.h>
 #include <sys/trace_arc.h>
 #include <sys/trace_dbuf.h>
 #include <sys/trace_dmu.h>
 #include <sys/trace_dnode.h>
+#include <sys/trace_multilist.h>
 #include <sys/trace_txg.h>
 #include <sys/trace_zil.h>
 #include <sys/trace_zrlock.h>
diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c
index 5afb23c595ae..40b507a0b6d8 100644
--- a/module/zfs/zio_inject.c
+++ b/module/zfs/zio_inject.c
@@ -1,535 +1,539 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 /*
  * ZFS fault injection
  *
  * To handle fault injection, we keep track of a series of zinject_record_t
  * structures which describe which logical block(s) should be injected with a
  * fault.  These are kept in a global list.  Each record corresponds to a given
  * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
  * or exported while the injection record exists.
  *
  * Device level injection is done using the 'zi_guid' field.  If this is set, it
  * means that the error is destined for a particular device, not a piece of
  * data.
  *
  * This is a rather poor data structure and algorithm, but we don't expect more
  * than a few faults at any one time, so it should be sufficient for our needs.
  */
 
 #include <sys/arc.h>
 #include <sys/zio_impl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/vdev_impl.h>
 #include <sys/dmu_objset.h>
 #include <sys/fs/zfs.h>
 
 /*
  * Incremented by zio_inject_fault() for every handler it registers; some
  * entry points in this file return early while it is zero.
  */
 uint32_t zio_injection_enabled = 0;
 
 /* One registered fault-injection record. */
 typedef struct inject_handler {
 	int			zi_id;		/* id from inject_next_id */
 	spa_t			*zi_spa;	/* pool the record is for */
 	zinject_record_t	zi_record;	/* copy of caller's record */
 	list_node_t		zi_link;	/* presumably list linkage */
 } inject_handler_t;
 
 /* All registered handlers; walked and modified under inject_lock. */
 static list_t inject_handlers;
 static krwlock_t inject_lock;
 /* Next handler id; consumed with inject_lock held as writer. */
 static int inject_next_id = 1;
 
 /*
  * Returns true if the given record matches the I/O in progress.
  *
  * 'zb' is the bookmark of the I/O, 'type' its object type and 'error' the
  * error being considered for injection.  Once a record matches, it fires
  * unconditionally when zi_freq is 0, and otherwise only when
  * spa_get_random(100) < zi_freq.
  */
 static boolean_t
 zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
     zinject_record_t *record, int error)
 {
 	/*
 	 * MOS records are matched on object type alone, with DMU_OT_NONE in
 	 * the record accepting any type.  'error' is not consulted here.
 	 */
 	if (zb->zb_objset == DMU_META_OBJSET &&
 	    record->zi_objset == DMU_META_OBJSET &&
 	    record->zi_object == DMU_META_DNODE_OBJECT) {
 		if (record->zi_type != DMU_OT_NONE &&
 		    type != record->zi_type)
 			return (B_FALSE);
 
 		return (record->zi_freq == 0 ||
 		    spa_get_random(100) < record->zi_freq);
 	}
 
 	/*
 	 * Anything else needs an exact match: same objset, object and
 	 * level, a block id inside [zi_start, zi_end], and the same error.
 	 */
 	if (zb->zb_objset != record->zi_objset ||
 	    zb->zb_object != record->zi_object ||
 	    zb->zb_level != record->zi_level ||
 	    zb->zb_blkid < record->zi_start ||
 	    zb->zb_blkid > record->zi_end ||
 	    error != record->zi_error)
 		return (B_FALSE);
 
 	return (record->zi_freq == 0 ||
 	    spa_get_random(100) < record->zi_freq);
 }
 
 /*
  * Panic the system when a config change happens in the function
  * specified by tag.
  *
  * The panic fires for the first handler registered against 'spa' whose
  * record has zi_type equal to 'type' and zi_func equal to 'tag'.  If no
  * handler matches, the function simply returns.
  */
 void
 zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
 {
 	inject_handler_t *handler;
 
 	rw_enter(&inject_lock, RW_READER);
 
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
 		/* Skip handlers registered for other pools. */
 		if (spa != handler->zi_spa)
 			continue;
 
 		if (handler->zi_record.zi_type == type &&
 		    strcmp(tag, handler->zi_record.zi_func) == 0)
 			panic("Panic requested in function %s\n", tag);
 	}
 
 	rw_exit(&inject_lock);
 }
 
 /*
  * Determine if the I/O in question should return failure.  Returns the errno
  * to be returned to the caller.
  *
  * 'error' is the errno being considered for injection: it is returned
  * when a ZINJECT_DATA_FAULT handler registered for the zio's pool matches
  * (see zio_match_handler()); otherwise 0 is returned.
  */
 int
 zio_handle_fault_injection(zio_t *zio, int error)
 {
 	int ret = 0;
 	inject_handler_t *handler;
 
 	/*
 	 * Ignore I/O not associated with any logical data.
 	 */
 	if (zio->io_logical == NULL)
 		return (0);
 
 	/*
 	 * Currently, we only support fault injection on reads.
 	 */
 	if (zio->io_type != ZIO_TYPE_READ)
 		return (0);
 
 	rw_enter(&inject_lock, RW_READER);
 
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
 		if (zio->io_spa != handler->zi_spa ||
 		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
 			continue;
 
 		/* If this handler matches, return the caller's 'error' */
 		if (zio_match_handler(&zio->io_logical->io_bookmark,
 		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
 		    &handler->zi_record, error)) {
 			ret = error;
 			break;
 		}
 	}
 
 	rw_exit(&inject_lock);
 
 	return (ret);
 }
 
 /*
  * Determine if the zio is part of a label update and has an injection
  * handler associated with that portion of the label. Currently, we
  * allow error injection in either the nvlist or the uberblock region of
  * the vdev label.
  *
  * Returns 'error' when a ZINJECT_LABEL_FAULT handler for this vdev covers
  * the zio's offset, and 0 otherwise (including for any offset that lies
  * outside the label areas at the start and end of the vdev).
  */
 int
 zio_handle_label_injection(zio_t *zio, int error)
 {
 	inject_handler_t *handler;
 	vdev_t *vd = zio->io_vd;
 	uint64_t offset = zio->io_offset;
 	int label;
 	int ret = 0;
 
 	/* Offsets between the leading and trailing labels are not labels. */
 	if (offset >= VDEV_LABEL_START_SIZE &&
 	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
 		return (0);
 
 	rw_enter(&inject_lock, RW_READER);
 
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 		uint64_t start = handler->zi_record.zi_start;
 		uint64_t end = handler->zi_record.zi_end;
 
 		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
 			continue;
 
 		/*
 		 * The injection region is the relative offsets within a
 		 * vdev label. We must determine the label which is being
 		 * updated and adjust our region accordingly.
 		 */
 		label = vdev_label_number(vd->vdev_psize, offset);
 		start = vdev_label_offset(vd->vdev_psize, label, start);
 		end = vdev_label_offset(vd->vdev_psize, label, end);
 
 		/* The range check is inclusive at both ends. */
 		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
 		    (offset >= start && offset <= end)) {
 			ret = error;
 			break;
 		}
 	}
 	rw_exit(&inject_lock);
 	return (ret);
 }
 
 
 /*
  * Determine if a ZINJECT_DEVICE_FAULT handler applies to vdev 'vd'.
  *
  * 'zio' may be NULL, which denotes a device open rather than an I/O.
  * Returns 'error' when a handler for this vdev has the same zi_error,
  * EIO when the handler instead injects ENXIO, and 0 when nothing applies.
  */
 int
 zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
 {
 	inject_handler_t *handler;
 	int ret = 0;
 
 	/*
 	 * We skip over faults in the labels unless it's during
 	 * device open (i.e. zio == NULL).
 	 */
 	if (zio != NULL) {
 		uint64_t offset = zio->io_offset;
 
 		if (offset < VDEV_LABEL_START_SIZE ||
 		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
 			return (0);
 	}
 
 	rw_enter(&inject_lock, RW_READER);
 
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
 		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
 			continue;
 
 		if (vd->vdev_guid == handler->zi_record.zi_guid) {
 			/*
 			 * Failfast handlers do not apply to device opens or
 			 * to I/Os flagged as a retry / try-hard attempt.
 			 */
 			if (handler->zi_record.zi_failfast &&
 			    (zio == NULL || (zio->io_flags &
 			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
 				continue;
 			}
 
 			/* Handle type specific I/O failures */
 			if (zio != NULL &&
 			    handler->zi_record.zi_iotype != ZIO_TYPES &&
 			    handler->zi_record.zi_iotype != zio->io_type)
 				continue;
 
 			if (handler->zi_record.zi_error == error) {
 				/*
 				 * For a failed open, pretend like the device
 				 * has gone away.
 				 */
 				if (error == ENXIO)
 					vd->vdev_stat.vs_aux =
 					    VDEV_AUX_OPEN_FAILED;
 
 				/*
 				 * Treat these errors as if they had been
 				 * retried so that all the appropriate stats
 				 * and FMA events are generated.
 				 */
 				if (!handler->zi_record.zi_failfast &&
 				    zio != NULL)
 					zio->io_flags |= ZIO_FLAG_IO_RETRY;
 
 				ret = error;
 				break;
 			}
 			/* An ENXIO handler turns any other error into EIO. */
 			if (handler->zi_record.zi_error == ENXIO) {
 				ret = SET_ERROR(EIO);
 				break;
 			}
 		}
 	}
 
 	rw_exit(&inject_lock);
 
 	return (ret);
 }
 
 /*
  * Simulate hardware that ignores cache flushes.  For requested number
  * of seconds nix the actual writing to disk.
  *
  * Only the first ZINJECT_IGNORED_WRITES handler registered for the zio's
  * pool is considered.
  */
 void
 zio_handle_ignored_writes(zio_t *zio)
 {
 	inject_handler_t *handler;
 
 	rw_enter(&inject_lock, RW_READER);
 
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
 		/* Ignore errors not destined for this pool */
 		if (zio->io_spa != handler->zi_spa ||
 		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
 			continue;
 
 		/*
 		 * Positive duration implies # of seconds, negative
 		 * a number of txgs
 		 *
 		 * NOTE(review): zi_timer is written here while inject_lock
 		 * is only held as reader -- confirm this is safe against
 		 * concurrent callers.
 		 */
 		if (handler->zi_record.zi_timer == 0) {
 			if (handler->zi_record.zi_duration > 0)
 				handler->zi_record.zi_timer = ddi_get_lbolt64();
 			else
 				handler->zi_record.zi_timer = zio->io_txg;
 		}
 
 		/* Have a "problem" writing 60% of the time */
 		if (spa_get_random(100) < 60)
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 		break;
 	}
 
 	rw_exit(&inject_lock);
 }
 
 /*
  * Sanity-check the ZINJECT_IGNORED_WRITES handlers registered for 'spa'.
  *
  * For every such handler whose timer has been started (zi_timer != 0),
  * VERIFY that its duration has not been exceeded yet: the deadline
  * zi_timer + zi_duration * hz must still lie after the current lbolt for
  * a positive duration, and zi_timer - zi_duration must not be less than
  * the syncing txg otherwise.  Does nothing while injection is disabled.
  */
 void
 spa_handle_ignored_writes(spa_t *spa)
 {
 	inject_handler_t *handler;
 
 	if (zio_injection_enabled == 0)
 		return;
 
 	rw_enter(&inject_lock, RW_READER);
 
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
 		if (spa != handler->zi_spa ||
 		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
 			continue;
 
 		if (handler->zi_record.zi_duration > 0) {
 			VERIFY(handler->zi_record.zi_timer == 0 ||
 			    ddi_time_after64(
 			    (int64_t)handler->zi_record.zi_timer +
 			    handler->zi_record.zi_duration * hz,
 			    ddi_get_lbolt64()));
 		} else {
 			/* duration is negative so the subtraction here adds */
 			VERIFY(handler->zi_record.zi_timer == 0 ||
 			    handler->zi_record.zi_timer -
 			    handler->zi_record.zi_duration >=
 			    spa_syncing_txg(spa));
 		}
 	}
 
 	rw_exit(&inject_lock);
 }
 
 /*
  * Return the zi_timer value of the first ZINJECT_DELAY_IO handler whose
  * zi_guid matches the zio's vdev, or 0 when there is no such handler or
  * injection is disabled.  NOTE(review): the local is named 'seconds', so
  * the value is presumably a delay in seconds -- confirm with the caller.
  */
 uint64_t
 zio_handle_io_delay(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	inject_handler_t *handler;
 	uint64_t seconds = 0;
 
 	if (zio_injection_enabled == 0)
 		return (0);
 
 	rw_enter(&inject_lock, RW_READER);
 
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
 		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
 			continue;
 
 		if (vd->vdev_guid == handler->zi_record.zi_guid) {
 			seconds = handler->zi_record.zi_timer;
 			break;
 		}
 
 	}
 	rw_exit(&inject_lock);
 	return (seconds);
 }
 
 /*
  * Create a new handler for the given record.  We add it to the list, adding
  * a reference to the spa_t in the process.  We increment zio_injection_enabled,
  * which is the switch to trigger all fault injection.
  *
  *	name	pool name; passed to spa_reset() and spa_inject_addref()
  *	flags	ZINJECT_UNLOAD_SPA, ZINJECT_NULL and ZINJECT_FLUSH_ARC are
  *		the bits examined here
  *	id	out: receives the new handler's id; left untouched when
  *		ZINJECT_NULL is set
  *	record	copied by value into the new handler
  *
  * Returns 0 on success, the error from spa_reset() if the requested unload
  * fails, or ENOENT if spa_inject_addref() returns NULL.
  */
 int
 zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
 {
 	inject_handler_t *handler;
 	int error;
 	spa_t *spa;
 
 	/*
 	 * If this is pool-wide metadata, make sure we unload the corresponding
 	 * spa_t, so that the next attempt to load it will trigger the fault.
 	 * We call spa_reset() to unload the pool appropriately.
 	 */
 	if (flags & ZINJECT_UNLOAD_SPA)
 		if ((error = spa_reset(name)) != 0)
 			return (error);
 
 	/*
 	 * With ZINJECT_NULL no handler is registered; only the unload above
 	 * and the ARC flush below are performed.
 	 */
 	if (!(flags & ZINJECT_NULL)) {
 		/*
 		 * spa_inject_addref() will add an injection reference, which
 		 * will prevent the pool from being removed from the namespace
 		 * while still allowing it to be unloaded.
 		 */
 		if ((spa = spa_inject_addref(name)) == NULL)
 			return (SET_ERROR(ENOENT));
 
 		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
 
 		/* The id counter and the handler list are updated as writer. */
 		rw_enter(&inject_lock, RW_WRITER);
 
 		*id = handler->zi_id = inject_next_id++;
 		handler->zi_spa = spa;
 		handler->zi_record = *record;
 		list_insert_tail(&inject_handlers, handler);
 		atomic_add_32(&zio_injection_enabled, 1);
 
 		rw_exit(&inject_lock);
 	}
 
 	/*
 	 * Flush the ARC, so that any attempts to read this data will end up
 	 * going to the ZIO layer.  Note that this is a little overkill, but
 	 * we don't have the necessary ARC interfaces to do anything else, and
 	 * fault injection isn't a performance critical path.
 	 */
 	if (flags & ZINJECT_FLUSH_ARC)
-		arc_flush(NULL);
+		/*
+		 * We must use FALSE to ensure arc_flush returns, since
+		 * we're not preventing concurrent ARC insertions.
+		 */
+		arc_flush(NULL, FALSE);
 
 	return (0);
 }
 
 /*
  * Returns the next record with an ID greater than that supplied to the
  * function.  Used to iterate over all handlers in the system.
  *
  *	id	in: last ID seen; out: ID of the handler found
  *	name	out: name of the pool the handler belongs to
  *	buflen	size of the 'name' buffer in bytes
  *	record	out: copy of the handler's record
  *
  * Returns 0 on success, or ENOENT (outputs untouched) when no handler has
  * an ID greater than *id.  The pool name is truncated to fit if necessary
  * and is always NUL-terminated when buflen > 0.
  */
 int
 zio_inject_list_next(int *id, char *name, size_t buflen,
     zinject_record_t *record)
 {
 	inject_handler_t *handler;
 	int ret;
 
 	/*
 	 * spa_namespace_lock is taken before inject_lock; presumably it
 	 * keeps spa_name() stable while we copy it -- confirm.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	rw_enter(&inject_lock, RW_READER);
 
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler))
 		if (handler->zi_id > *id)
 			break;
 
 	if (handler != NULL) {
 		*record = handler->zi_record;
 		*id = handler->zi_id;
 		/*
 		 * strncpy() zero-fills a short copy but leaves the buffer
 		 * unterminated when the pool name is buflen bytes or
 		 * longer, so terminate it explicitly.
 		 */
 		(void) strncpy(name, spa_name(handler->zi_spa), buflen);
 		if (buflen > 0)
 			name[buflen - 1] = '\0';
 		ret = 0;
 	} else {
 		ret = SET_ERROR(ENOENT);
 	}
 
 	rw_exit(&inject_lock);
 	mutex_exit(&spa_namespace_lock);
 
 	return (ret);
 }
 
 /*
  * Remove the fault handler whose ID is 'id'.  The handler is unlinked from
  * the list under the writer lock; its pool injection reference and memory
  * are released after the lock is dropped, and zio_injection_enabled is
  * decremented.  Returns 0 on success or ENOENT if no such handler exists.
  */
 int
 zio_clear_fault(int id)
 {
 	inject_handler_t *victim;
 
 	rw_enter(&inject_lock, RW_WRITER);
 
 	/* Walk the list until the ID matches or the list is exhausted. */
 	victim = list_head(&inject_handlers);
 	while (victim != NULL && victim->zi_id != id)
 		victim = list_next(&inject_handlers, victim);
 
 	if (victim != NULL)
 		list_remove(&inject_handlers, victim);
 
 	rw_exit(&inject_lock);
 
 	if (victim == NULL)
 		return (SET_ERROR(ENOENT));
 
 	/* The handler is now private to us; release what it holds. */
 	spa_inject_delref(victim->zi_spa);
 	kmem_free(victim, sizeof (inject_handler_t));
 	atomic_add_32(&zio_injection_enabled, -1);
 
 	return (0);
 }
 
 /*
  * Set up the global fault-injection state: the reader/writer lock that is
  * held around every walk or update of the handler list, and the handler
  * list itself, whose nodes are linked through inject_handler_t's zi_link.
  */
 void
 zio_inject_init(void)
 {
 	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
 	list_create(&inject_handlers, sizeof (inject_handler_t),
 	    offsetof(inject_handler_t, zi_link));
 }
 
 /*
  * Tear down the global fault-injection state in the reverse order of
  * zio_inject_init(): the handler list first, then the lock.
  * NOTE(review): list_destroy() presumably expects every handler to have
  * been cleared already -- confirm.
  */
 void
 zio_inject_fini(void)
 {
 	list_destroy(&inject_handlers);
 	rw_destroy(&inject_lock);
 }
 
 /*
  * Injection entry points exported for use outside this module.  Only
  * compiled when both _KERNEL and HAVE_SPL are defined.
  */
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(zio_injection_enabled);
 EXPORT_SYMBOL(zio_inject_fault);
 EXPORT_SYMBOL(zio_inject_list_next);
 EXPORT_SYMBOL(zio_clear_fault);
 EXPORT_SYMBOL(zio_handle_fault_injection);
 EXPORT_SYMBOL(zio_handle_device_injection);
 EXPORT_SYMBOL(zio_handle_label_injection);
 #endif