Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 337024) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 337025) @@ -1,8029 +1,8057 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (rangeing from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * ARC list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an ARC list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ /* * ARC operation: * * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. * This structure can point either to a block that is still in the cache or to * one that is only accessible in an L2 ARC device, or it can provide * information about a block that was recently evicted. If a block is * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough * information to retrieve it from the L2ARC device. This information is * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block * that is in this state cannot access the data directly. * * Blocks that are actively being referenced or have not been evicted * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the * ability to store the physical data (b_pabd) associated with the DVA of the * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. Each * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC * consumer. The ARC will provide references to this data and will keep it * cached until it is no longer in use. The ARC caches only the L1ARC's physical * data block and will evict any arc_buf_t that is no longer referenced. The * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the * "overhead_size" kstat. * * Depending on the consumer, an arc_buf_t can be requested in uncompressed or * compressed form. The typical case is that consumers will want uncompressed * data, and when that happens a new data buffer is allocated where the data is * decompressed for them to use. Currently the only consumer who wants * compressed arc_buf_t's is "zfs send", when it streams data exactly as it * exists on disk. When this happens, the arc_buf_t's data buffer is shared * with the arc_buf_hdr_t. * * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The * first one is owned by a compressed send consumer (and therefore references * the same compressed data buffer as the arc_buf_hdr_t) and the second could be * used by any other consumer (and has its own uncompressed copy of the data * buffer). * * arc_buf_hdr_t * +-----------+ * | fields | * | common to | * | L1- and | * | L2ARC | * +-----------+ * | l2arc_buf_hdr_t * | | * +-----------+ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | * | +-----------+ | |b_data +-+ * +->+------+ | +-----------+ | * compressed | | | | * data | |<--------------+ | uncompressed * +------+ compressed, | data * shared +-->+------+ * data | | * | | * +------+ * * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be * the last buffer in the hdr's b_buf list, however a shared compressed buf can * be anywhere in the hdr's list. * * The diagram below shows an example of an uncompressed ARC hdr that is * sharing its data with an arc_buf_t (note that the shared uncompressed buf is * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ * | | * | | * | | * +-----------+ * l2arc_buf_hdr_t| | * | | * +-----------+ * l1arc_buf_hdr_t| | * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ * +->+------+ | +---------+ | * | | | | * uncompressed | | | | * data +------+ | | * ^ +->+------+ | * | uncompressed | | | * | data | | | * | +------+ | * +---------------------------------+ * * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will bcopy the transformed on-disk block into * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pabd. The * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif #include #include #include #include #include #include #include #include #ifdef illumos #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; int arc_procfd; #endif #endif /* illumos */ static kmutex_t arc_reclaim_lock; static kcondvar_t arc_reclaim_thread_cv; static boolean_t arc_reclaim_thread_exit; static kcondvar_t arc_reclaim_waiters_cv; static kmutex_t arc_dnlc_evicts_lock; static kcondvar_t arc_dnlc_evicts_cv; static boolean_t arc_dnlc_evicts_thread_exit; uint_t arc_reduce_dnlc_percent = 3; /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower * value means we're more likely to evict the "correct" header (i.e. the * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ int zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ static int arc_grow_retry = 60; /* number of milliseconds before attempting a kmem-cache-reap */ static int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ int zfs_arc_overflow_shift = 8; /* shift of arc_c for calculating both min and max arc_p */ static int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ static int arc_shrink_shift = 7; /* * log2(fraction of ARC which must be free to allow growing). * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ int arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static int zfs_arc_min_prefetch_ms = 1; static int zfs_arc_min_prescient_prefetch_ms = 6; /* * If this percent of memory is free, don't throttle. */ int arc_lotsfree_percent = 10; static int arc_dead; extern boolean_t zfs_prefetch_disable; /* * The arc has filled available memory and has now warmed up. */ static boolean_t arc_warm; /* * log2 fraction of the zio arena to keep free. */ int arc_zio_arena_free_shift = 2; /* * These tunables are for performance analysis. */ uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; uint64_t zfs_arc_meta_min = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_no_grow_shift = 0; int zfs_arc_p_min_shift = 0; uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ u_int zfs_arc_free_target = 0; /* Absolute min for arc min / max is 16MB. */ static uint64_t arc_abs_min = 16 << 20; boolean_t zfs_compressed_arc_enabled = B_TRUE; static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS); #if defined(__FreeBSD__) && defined(_KERNEL) static void arc_free_target_init(void *unused __unused) { zfs_arc_free_target = vm_cnt.v_free_target; } SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, arc_free_target_init, NULL); TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry); TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift); SYSCTL_DECL(_vfs_zfs); SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN, 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size"); SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN, 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size"); SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN, 0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U", "log2(fraction of ARC which must be free to allow growing)"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, &zfs_arc_average_blocksize, 0, "ARC average blocksize"); SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, &arc_shrink_shift, 0, "log2(fraction of arc to reclaim)"); SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW, &arc_grow_retry, 0, "Wait in seconds before considering growing ARC"); SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN, &zfs_compressed_arc_enabled, 0, "Enable compressed ARC"); /* * We don't have a tunable for arc_free_target due to the dependency on * pagedaemon initialisation. */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), sysctl_vfs_zfs_arc_free_target, "IU", "Desired number of free pages below which ARC triggers reclaim"); static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) { u_int val; int err; val = zfs_arc_free_target; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < minfree) return (EINVAL); if (val > vm_cnt.v_page_count) return (EINVAL); zfs_arc_free_target = val; return (0); } /* * Must be declared here, before the definition of corresponding kstat * macro which uses the same names will confuse the compiler. */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_meta_limit, "QU", "ARC metadata limit"); #endif /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recentely used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache * ARC_l2c_only - exists in L2ARC but not other states * When there are no active references to the buffer, they are * are linked onto a list in one of these arc states. These are * the only buffers that can be evicted or deleted. Within each * state there are multiple lists, one for meta-data and one for * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, * etc.) is tracked separately so that it can be managed more * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies * before they are written to stable storage. By definition, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will aquire a DVA * as they are written and migrate onto the arc_mru list. * * The ARC_l2c_only state is for buffers that are in the second * level ARC but no longer in any of the ARC_m* lists. The second * level ARC itself may also contain buffers that are in any of * the ARC_m* states - meaning that a buffer can exist in two * places. The reason for the ARC_l2c_only state is to keep the * buffer header in the hash table, so that reads that hit the * second level ARC benefit from these fast lookups. */ typedef struct arc_state { /* * list of evictable buffers */ multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; /* * total amount of evictable data in this state */ refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* * total amount of data in this state; this includes: evictable, * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. */ refcount_t arcs_size; } arc_state_t; /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; static arc_state_t ARC_mru_ghost; static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; static arc_state_t ARC_l2c_only; typedef struct arc_stats { kstat_named_t arcstat_hits; kstat_named_t arcstat_misses; kstat_named_t arcstat_demand_data_hits; kstat_named_t arcstat_demand_data_misses; kstat_named_t arcstat_demand_metadata_hits; kstat_named_t arcstat_demand_metadata_misses; kstat_named_t arcstat_prefetch_data_hits; kstat_named_t arcstat_prefetch_data_misses; kstat_named_t arcstat_prefetch_metadata_hits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_allocated; kstat_named_t arcstat_deleted; /* * Number of buffers that could not be evicted because the hash lock * was held by another thread. The lock may not necessarily be held * by something using the same buffer, since hash locks are shared * by multiple buffers. */ kstat_named_t arcstat_mutex_miss; /* * Number of buffers skipped when updating the access state due to the * header having already been released after acquiring the hash lock. */ kstat_named_t arcstat_access_skip; /* * Number of buffers skipped because they have I/O in progress, are * indirect prefetch buffers that have not lived long enough, or are * not from the spa we're trying to evict from. */ kstat_named_t arcstat_evict_skip; /* * Number of times arc_evict_state() was unable to evict enough * buffers to reach it's target amount. */ kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; kstat_named_t arcstat_p; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; /* Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_size; /* * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. * Note that the compressed bytes may match the uncompressed bytes * if the block is either not compressed or compressed arc is disabled. */ kstat_named_t arcstat_compressed_size; /* * Uncompressed size of the data stored in b_pabd. If compressed * arc is disabled then this value will be identical to the stat * above. */ kstat_named_t arcstat_uncompressed_size; /* * Number of bytes stored in all the arc_buf_t's. This is classified * as "overhead" since this data is typically short-lived and will * be evicted from the arc when it becomes unreferenced unless the * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level * values have been set (see comment in dbuf.c for more information). */ kstat_named_t arcstat_overhead_size; /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually * backed by ARC buffers. This includes arc_buf_hdr_t structures * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only * caches), and arc_buf_t structures (allocated via arc_buf_t * cache). * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_hdr_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_DATA. This is generally consumed by buffers backing * on disk user data (e.g. plain file contents). * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_data_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_METADATA. This is generally consumed by buffers * backing on disk data that is used for internal ZFS * structures (e.g. ZAP, dnode, indirect blocks, etc). * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_metadata_size; /* * Number of bytes consumed by various buffers and structures * not actually backed with ARC buffers. This includes bonus * buffers (allocated directly via zio_buf_* functions), * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t * cache), and dnode_t structures (allocated via dnode_t cache). * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_other_size; /* * Total number of bytes consumed by ARC buffers residing in the * arc_anon state. This includes *all* buffers in the arc_anon * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_anon_size; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_anon_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_anon_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mru state. This includes *all* buffers in the arc_mru * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_mru_size; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_mru_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_mru_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mru_ghost state. The key thing to note * here, is the fact that this size doesn't actually indicate * RAM consumption. The ghost lists only consist of headers and * don't actually have ARC buffers linked off of these headers. * Thus, *if* the headers had associated ARC buffers, these * buffers *would have* consumed this number of bytes. * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_mru_ghost_size; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_mru_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_mru_ghost_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mfu state. This includes *all* buffers in the arc_mfu * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_mfu_size; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu * state. * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_mfu_evictable_data; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_METADATA, and reside in the * arc_mfu state. * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_mfu_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mfu_ghost state. See the comment above * arcstat_mru_ghost_size for more details. * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_mfu_ghost_size; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_mfu_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. * Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_mfu_ghost_evictable_metadata; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; kstat_named_t arcstat_l2_read_bytes; kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_lock_retry; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_lsize; kstat_named_t arcstat_l2_psize; /* Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_l2_write_trylock_fail; kstat_named_t arcstat_l2_write_passed_headroom; kstat_named_t arcstat_l2_write_spa_mismatch; kstat_named_t arcstat_l2_write_in_l2; kstat_named_t arcstat_l2_write_hdr_io_in_progress; kstat_named_t arcstat_l2_write_not_cacheable; kstat_named_t arcstat_l2_write_full; kstat_named_t arcstat_l2_write_buffer_iter; kstat_named_t arcstat_l2_write_pios; kstat_named_t arcstat_l2_write_buffer_bytes_scanned; kstat_named_t arcstat_l2_write_buffer_list_iter; kstat_named_t arcstat_l2_write_buffer_list_null_iter; kstat_named_t arcstat_memory_throttle_count; /* Not updated directly; only synced in arc_kstat_update. */ kstat_named_t arcstat_meta_used; kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_min; kstat_named_t arcstat_async_upgrade_sync; kstat_named_t arcstat_demand_hit_predictive_prefetch; kstat_named_t arcstat_demand_hit_prescient_prefetch; } arc_stats_t; static arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "allocated", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "access_skip", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "p", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "compressed_size", KSTAT_DATA_UINT64 }, { "uncompressed_size", KSTAT_DATA_UINT64 }, { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, { "other_size", KSTAT_DATA_UINT64 }, { "anon_size", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, { "l2_write_in_l2", KSTAT_DATA_UINT64 }, { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, { "l2_write_full", KSTAT_DATA_UINT64 }, { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, { "l2_write_pios", KSTAT_DATA_UINT64 }, { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define ARCSTAT_INCR(stat, val) \ atomic_add_64(&arc_stats.stat.value.ui64, (val)) #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } #define ARCSTAT_MAXSTAT(stat) \ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; static arc_state_t *arc_mfu_ghost; static arc_state_t *arc_l2c_only; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ /* compressed size of entire arc */ #define arc_compressed_size ARCSTAT(arcstat_compressed_size) /* uncompressed size of entire arc */ #define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) /* number of bytes in the arc from arc_buf_t's */ #define arc_overhead_size ARCSTAT(arcstat_overhead_size) /* * There are also some ARC variables that we want to export, but that are * updated so often that having the canonical representation be the statistic * variable causes a performance bottleneck. We want to use aggsum_t's for these * instead, but still be able to export the kstat in the same way as before. * The solution is to always use the aggsum version, except in the kstat update * callback. */ aggsum_t arc_size; aggsum_t arc_meta_used; aggsum_t astat_data_size; aggsum_t astat_metadata_size; aggsum_t astat_hdr_size; aggsum_t astat_other_size; aggsum_t astat_l2_hdr_size; static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_read_done_func_t *acb_done; arc_buf_t *acb_buf; boolean_t acb_compressed; zio_t *acb_zio_dummy; zio_t *acb_zio_head; arc_callback_t *acb_next; }; typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { void *awcb_private; arc_write_done_func_t *awcb_ready; arc_write_done_func_t *awcb_children_ready; arc_write_done_func_t *awcb_physdone; arc_write_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; /* * ARC buffers are separated into multiple structs as a memory saving measure: * - Common fields struct, always defined, and embedded within it: * - L2-only fields, always allocated but undefined when not in L2ARC * - L1-only fields, only allocated when in L1ARC * * Buffer in L1 Buffer only in L2 * +------------------------+ +------------------------+ * | arc_buf_hdr_t | | arc_buf_hdr_t | * | | | | * | | | | * | | | | * +------------------------+ +------------------------+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | * | (undefined if L1-only) | | | * +------------------------+ +------------------------+ * | l1arc_buf_hdr_t | * | | * | | * | | * | | * +------------------------+ * * Because it's possible for the L2ARC to become extremely large, we can wind * up eating a lot of memory in L2ARC buffer headers, so the size of a header * is minimized by only allocating the fields necessary for an L1-cached buffer * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple * words in pointers. arc_hdr_realloc() is used to switch a header between * these two allocation states. */ typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; zio_cksum_t *b_freeze_cksum; #ifdef ZFS_DEBUG /* * Used for debugging with kmem_flags - by allocating and freeing * b_thawed when the buffer is thawed, we get a record of the stack * trace that thawed it. */ void *b_thawed; #endif arc_buf_t *b_buf; uint32_t b_bufcnt; /* for waiting on writes to complete */ kcondvar_t b_cv; uint8_t b_byteswap; /* protected by arc state mutex */ arc_state_t *b_state; multilist_node_t b_arc_node; /* updated atomically */ clock_t b_arc_access; /* self protecting */ refcount_t b_refcnt; arc_callback_t *b_acb; abd_t *b_pabd; } l1arc_buf_hdr_t; typedef struct l2arc_dev l2arc_dev_t; typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ list_node_t b_l2node; } l2arc_buf_hdr_t; struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; arc_buf_contents_t b_type; arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; /* * This field stores the size of the data buffer after * compression, and is set in the arc's zio completion handlers. * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). * * While the block pointers can store up to 32MB in their psize * field, we can only store up to 32MB minus 512B. This is due * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. * a field of zeros represents 512B in the bp). We can't use a * bias of 1 since we need to reserve a psize of zero, here, to * represent holes and embedded blocks. * * This isn't a problem in practice, since the maximum size of a * buffer is limited to 16MB, so we never need to store 32MB in * this field. Even in the upstream illumos code base, the * maximum size of a buffer is limited to 16MB. */ uint16_t b_psize; /* * This field stores the size of the data buffer before * compression, and cannot change once set. It is in units * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) */ uint16_t b_lsize; /* immutable */ uint64_t b_spa; /* immutable */ /* L2ARC fields. Undefined when not in L2ARC. */ l2arc_buf_hdr_t b_l2hdr; /* L1ARC fields. Undefined when in l2arc_only state */ l1arc_buf_hdr_t b_l1hdr; }; #if defined(__FreeBSD__) && defined(_KERNEL) static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = arc_meta_limit; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val <= 0 || val > arc_c_max) return (EINVAL); arc_meta_limit = val; return (0); } static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { uint32_t val; int err; val = arc_no_grow_shift; err = sysctl_handle_32(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val >= arc_shrink_shift) return (EINVAL); arc_no_grow_shift = val; return (0); } static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = zfs_arc_max; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (zfs_arc_max == 0) { /* Loader tunable so blindly set */ zfs_arc_max = val; return (0); } if (val < arc_abs_min || val > kmem_size()) return (EINVAL); if (val < arc_c_min) return (EINVAL); if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit) return (EINVAL); arc_c_max = val; arc_c = arc_c_max; arc_p = (arc_c >> 1); if (zfs_arc_meta_limit == 0) { /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; } /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; zfs_arc_max = arc_c; return (0); } static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = zfs_arc_min; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (zfs_arc_min == 0) { /* Loader tunable so blindly set */ zfs_arc_min = val; return (0); } if (val < arc_abs_min || val > arc_c_max) return (EINVAL); arc_c_min = val; if (zfs_arc_meta_min == 0) arc_meta_min = arc_c_min / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; zfs_arc_min = arc_c_min; return (0); } #endif #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_PRESCIENT_PREFETCH(hdr) \ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) /* * Other sizes */ #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ #define HT_LOCK_PAD CACHE_LINE_SIZE struct ht_lock { kmutex_t ht_lock; #ifdef _KERNEL unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; #endif }; #define BUF_LOCKS 256 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. */ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) /* L2ARC Performance Tunables */ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, &l2arc_write_max, 0, "max write size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, &l2arc_write_boost, 0, "extra write during warmup"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, &l2arc_headroom, 0, "number of dev writes"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, &l2arc_feed_secs, 0, "interval seconds"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, &l2arc_feed_min_ms, 0, "min interval milliseconds"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, &l2arc_noprefetch, 0, "don't cache prefetch bufs"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, &l2arc_feed_again, 0, "turbo warmup"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, &l2arc_norw, 0, "no reads during writes"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW, &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW, &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms"); /* * L2ARC Internals */ struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ spa_t *l2ad_spa; /* spa */ uint64_t l2ad_hand; /* next write location */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ kmutex_t l2ad_mtx; /* lock for buffer list */ list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ refcount_t l2ad_alloc; /* allocated bytes */ }; static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ abd_t *l2rcb_abd; /* temporary buffer */ } l2arc_read_callback_t; typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); static void arc_hdr_free_pabd(arc_buf_hdr_t *); static void arc_hdr_alloc_pabd(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_trim(const arc_buf_hdr_t *hdr) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT(HDR_HAS_L2HDR(hdr)); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); if (HDR_GET_PSIZE(hdr) != 0) { trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, HDR_GET_PSIZE(hdr), 0); } } /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); } #define HDR_EMPTY(hdr) \ ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fhdr; uint32_t i; ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (lockp != NULL) { *lockp = hash_lock; mutex_enter(hash_lock); } else { ASSERT(MUTEX_HELD(hash_lock)); } for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } ARCSTAT_BUMP(arcstat_hash_elements); ARCSTAT_MAXSTAT(arcstat_hash_elements); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *hdr) { arc_buf_hdr_t *fhdr, **hdrp; uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_full_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { int i; kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ /* ARGSUSED */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_FULL_SIZE); cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); multilist_link_init(&hdr->b_l1hdr.b_arc_node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } /* ARGSUSED */ static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } /* ARGSUSED */ static int buf_cons(void *vbuf, void *unused, int kmflag) { arc_buf_t *buf = vbuf; bzero(buf, sizeof (arc_buf_t)); mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. */ /* ARGSUSED */ static void hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } /* ARGSUSED */ static void hdr_l2only_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ static void buf_dest(void *vbuf, void *unused) { arc_buf_t *buf = vbuf; mutex_destroy(&buf->b_evict_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } /* * Reclaim callback -- invoked when memory is low. */ /* ARGSUSED */ static void hdr_recl(void *unused) { dprintf("hdr_recl called\n"); /* * umem calls the reclaim func when we destroy the buf cache, * which is after we do arc_fini(). */ if (!arc_dead) cv_signal(&arc_reclaim_thread_cv); } static void buf_init(void) { uint64_t *ct; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) { mutex_init(&buf_hash_table.ht_locks[i].ht_lock, NULL, MUTEX_DEFAULT, NULL); } } /* * This is the size that the buf occupies in memory. If the buf is compressed, * it will correspond to the compressed size. You should use this method of * getting the buf size unless you explicitly need the logical size. */ int32_t arc_buf_size(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); } int32_t arc_buf_lsize(arc_buf_t *buf) { return (HDR_GET_LSIZE(buf->b_hdr)); } enum zio_compress arc_get_compression(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); } #define ARC_MINTIME (hz>>4) /* 62 ms */ static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* * It would be nice to assert arc_can_share() too, but the "hdr isn't * already being shared" requirement prevents us from doing that. */ return (shared); } /* * Free the checksum associated with this header. If there is no checksum, this * is a no-op. */ static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } /* * Return true iff at least one of the bufs on hdr is not compressed. */ static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) { for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { if (!ARC_BUF_COMPRESSED(b)) { return (B_TRUE); } } return (B_FALSE); } /* * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data * matches the checksum that is stored in the hdr. If there is no checksum, * or if the buf is compressed, this is a no-op. */ static void arc_cksum_verify(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) { ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || arc_hdr_has_uncompressed_buf(hdr)); return; } ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); boolean_t valid_cksum; ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); /* * We rely on the blkptr's checksum to determine if the block * is valid or not. When compressed arc is enabled, the l2arc * writes the block to the l2arc just as it appears in the pool. * This allows us to use the blkptr's checksum to validate the * data that we just read off of the l2arc without having to store * a separate checksum in the arc_buf_hdr_t. However, if compressed * arc is disabled, then the data written to the l2arc is always * uncompressed and won't match the block as it exists in the main * pool. When this is the case, we must first compress it if it is * compressed on the main pool before we can validate the checksum. */ if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t csize; abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE); csize = zio_compress_data(compress, zio->io_abd, abd_to_buf(cdata), lsize); ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); if (csize < HDR_GET_PSIZE(hdr)) { /* * Compressed blocks are always a multiple of the * smallest ashift in the pool. Ideally, we would * like to round up the csize to the next * spa_min_ashift but that value may have changed * since the block was last written. Instead, * we rely on the fact that the hdr's psize * was set to the psize of the block when it was * last written. We set the csize to that value * and zero out any part that should not contain * data. */ abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize); csize = HDR_GET_PSIZE(hdr); } zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL); } /* * Block pointers always store the checksum for the logical data. * If the block pointer has the gang bit set, then the checksum * it represents is for the reconstituted data and not for an * individual gang member. The zio pipeline, however, must be able to * determine the checksum of each of the gang constituents so it * treats the checksum comparison differently than what we need * for l2arc blocks. This prevents us from using the * zio_checksum_error() interface directly. Instead we must call the * zio_checksum_error_impl() so that we can ensure the checksum is * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); zio_pop_transforms(zio); return (valid_cksum); } /* * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a * checksum and attaches it to the buf's hdr so that we can ensure that the buf * isn't modified later on. If buf is compressed or there is already a checksum * on the hdr, this is a no-op (we only checksum uncompressed bufs). */ static void arc_cksum_compute(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { ASSERT(arc_hdr_has_uncompressed_buf(hdr)); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } else if (ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_watch(buf); #endif } #ifdef illumos #ifndef _KERNEL typedef struct procctl { long cmd; prwatch_t prwatch; } procctl_t; #endif /* ARGSUSED */ static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { int result; procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; ctl.prwatch.pr_size = 0; ctl.prwatch.pr_wflags = 0; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); } #endif } /* ARGSUSED */ static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { int result; procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; ctl.prwatch.pr_size = arc_buf_size(buf); ctl.prwatch.pr_wflags = WA_WRITE; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); } #endif } #endif /* illumos */ static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; } VERIFY3U(hdr->b_type, ==, type); return (type); } boolean_t arc_is_metadata(arc_buf_t *buf) { return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); } static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { switch (type) { case ARC_BUFC_DATA: /* metadata field is 0 if buffer contains normal data */ return (0); case ARC_BUFC_METADATA: return (ARC_FLAG_BUFC_METADATA); default: break; } panic("undefined ARC buffer type!"); return ((uint32_t)-1); } void arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_cksum_verify(buf); /* * Compressed buffers do not manipulate the b_freeze_cksum or * allocate b_thawed. */ if (ARC_BUF_COMPRESSED(buf)) { ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || arc_hdr_has_uncompressed_buf(hdr)); return; } ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); #ifdef ZFS_DEBUG if (zfs_flags & ZFS_DEBUG_MODIFY) { if (hdr->b_l1hdr.b_thawed != NULL) kmem_free(hdr->b_l1hdr.b_thawed, 1); hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); } #endif mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_unwatch(buf); #endif } void arc_buf_freeze(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) { ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || arc_hdr_has_uncompressed_buf(hdr)); return; } hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || hdr->b_l1hdr.b_state == arc_anon); arc_cksum_compute(buf); mutex_exit(hash_lock); } /* * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, * the following functions should be used to ensure that the flags are * updated in a thread-safe way. When manipulating the flags either * the hash_lock must be held or the hdr must be undiscoverable. This * ensures that we're not racing with any other threads when updating * the flags. */ static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); hdr->b_flags &= ~flags; } /* * Setting the compression bits in the arc_buf_hdr_t's b_flags is * done in a special way since we have to clear and set bits * at the same time. Consumers that wish to set the compression bits * must use this function to ensure that the flags are updated in * thread-safe manner. */ static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so * we ignore the compression of the blkptr and set the * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. * Holes and embedded blocks remain anonymous so we don't * want to uncompress them. Mark them as uncompressed. */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); HDR_SET_COMPRESS(hdr, cmp); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } } /* * Looks for another buf on the same hdr which has the data decompressed, copies * from it, and returns true. If no such buf exists, returns false. */ static boolean_t arc_buf_try_copy_decompressed_data(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t copied = B_FALSE; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(buf->b_data, !=, NULL); ASSERT(!ARC_BUF_COMPRESSED(buf)); for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; from = from->b_next) { /* can't use our own data buffer */ if (from == buf) { continue; } if (!ARC_BUF_COMPRESSED(from)) { bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); copied = B_TRUE; break; } } /* * There were no decompressed bufs, so there should not be a * checksum on the hdr either. */ EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); return (copied); } /* * Given a buf that has a data buffer attached to it, this function will * efficiently fill the buf with data of the specified compression setting from * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr * are already sharing a data buf, no copy is performed. * * If the buf is marked as compressed but uncompressed data was requested, this * will allocate a new data buffer for the buf, remove that flag, and fill the * buf with uncompressed data. You can't request a compressed buf on a hdr with * uncompressed data, and (since we haven't added support for it yet) if you * want compressed data your buf must already be marked as compressed and have * the correct-sized data buffer. */ static int arc_buf_fill(arc_buf_t *buf, boolean_t compressed) { arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; ASSERT3P(buf->b_data, !=, NULL); IMPLY(compressed, hdr_compressed); IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { ASSERT(hdr_compressed); ASSERT(!compressed); ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); /* * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. */ if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); /* We need to give the buf it's own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); /* We increased the size of b_data; update overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); } /* * Regardless of the buf's previous compression settings, it * should not be compressed at the end of this function. */ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; /* * Try copying the data from another buf which already has a * decompressed version. If that's not possible, it's time to * bite the bullet and decompress the data from the hdr. */ if (arc_buf_try_copy_decompressed_data(buf)) { /* Skip byteswapping and checksumming (already done) */ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); return (0); } else { int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); /* * Absent hardware errors or software bugs, this should * be impossible, but log it anyway so we can debug it. */ if (error != 0) { zfs_dbgmsg( "hdr %p, compress %d, psize %d, lsize %d", hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); return (SET_ERROR(EIO)); } } } /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); return (0); } int arc_decompress(arc_buf_t *buf) { return (arc_buf_fill(buf, B_FALSE)); } /* * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. */ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) { uint64_t size; if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && HDR_GET_PSIZE(hdr) > 0) { size = HDR_GET_PSIZE(hdr); } else { ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); size = HDR_GET_LSIZE(hdr); } return (size); } /* * Increment the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); (void) refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Decrement the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); (void) refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Add a reference to this hdr indicating that someone is actively * referencing that memory. When the refcount transitions from 0 to 1, * we remove it from the respective arc_state_t list to indicate that * it is not evictable. */ static void add_reference(arc_buf_hdr_t *hdr, void *tag) { ASSERT(HDR_HAS_L1HDR(hdr)); if (!MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } arc_state_t *state = hdr->b_l1hdr.b_state; if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && (state != arc_anon)) { /* We don't use the L2-only state list. */ if (state != arc_l2c_only) { multilist_remove(state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); } } /* * Remove a reference from this hdr. When the reference transitions from * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's * list making it eligible for eviction. */ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); /* * arc_l2c_only counts as a ghost state so we don't need to explicitly * check to prevent usage of the arc_l2c_only list. */ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); arc_evictable_space_increment(hdr, state); } return (cnt); } /* * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *old_state; int64_t refcnt; uint32_t bufcnt; boolean_t update_old, update_new; arc_buf_contents_t buftype = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() * in arc_read() when bringing a buffer out of the L2ARC. However, the * L1 hdr doesn't always exist when we change state to arc_anon before * destroying a header, in which case reallocating to add the L1 hdr is * pointless. */ if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); } else { old_state = arc_l2c_only; refcnt = 0; bufcnt = 0; update_old = B_FALSE; } update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(old_state->arcs_list[buftype], hdr); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_old = B_TRUE; } arc_evictable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or * anonymous), we realloc the header to add an L1hdr * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(new_state->arcs_list[buftype], hdr); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_new = B_TRUE; } arc_evictable_space_increment(hdr, new_state); } } ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); /* * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. */ (void) refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) refcount_add_many(&new_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_add_many(&new_state->arcs_size, arc_hdr_size(hdr), hdr); } else { ASSERT(GHOST_STATE(old_state)); } } } if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); /* * When moving a header off of a ghost state, * the header will not contain any arc buffers. * We use the arc header pointer for the reference * which is exactly what we did when we put the * header on the ghost state. */ (void) refcount_remove_many(&old_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) refcount_remove_many( &old_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); (void) refcount_remove_many( &old_state->arcs_size, arc_hdr_size(hdr), hdr); } } if (HDR_HAS_L1HDR(hdr)) hdr->b_l1hdr.b_state = new_state; /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. */ ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { case ARC_SPACE_DATA: aggsum_add(&astat_data_size, space); break; case ARC_SPACE_META: aggsum_add(&astat_metadata_size, space); break; case ARC_SPACE_OTHER: aggsum_add(&astat_other_size, space); break; case ARC_SPACE_HDRS: aggsum_add(&astat_hdr_size, space); break; case ARC_SPACE_L2HDRS: aggsum_add(&astat_l2_hdr_size, space); break; } if (type != ARC_SPACE_DATA) aggsum_add(&arc_meta_used, space); aggsum_add(&arc_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { case ARC_SPACE_DATA: aggsum_add(&astat_data_size, -space); break; case ARC_SPACE_META: aggsum_add(&astat_metadata_size, -space); break; case ARC_SPACE_OTHER: aggsum_add(&astat_other_size, -space); break; case ARC_SPACE_HDRS: aggsum_add(&astat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: aggsum_add(&astat_l2_hdr_size, -space); break; } if (type != ARC_SPACE_DATA) { ASSERT(aggsum_compare(&arc_meta_used, space) >= 0); /* * We use the upper bound here rather than the precise value * because the arc_meta_max value doesn't need to be * precise. It's only consumed by humans via arcstats. */ if (arc_meta_max < aggsum_upper_bound(&arc_meta_used)) arc_meta_max = aggsum_upper_bound(&arc_meta_used); aggsum_add(&arc_meta_used, -space); } ASSERT(aggsum_compare(&arc_size, space) >= 0); aggsum_add(&arc_size, -space); } /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) { /* * The criteria for sharing a hdr's data are: * 1. the hdr's compression matches the buf's compression * 2. the hdr doesn't need to be byteswapped * 3. the hdr isn't already being shared * 4. the buf is either compressed or it is the last buf in the hdr list * * Criterion #4 maintains the invariant that shared uncompressed * bufs must be the final buf in the hdr's b_buf list. Reading this, you * might ask, "if a compressed buf is allocated first, won't that be the * last thing in the list?", but in that case it's impossible to create * a shared uncompressed buf anyway (because the hdr must be compressed * to have the compressed buf). You might also think that #3 is * sufficient to make this guarantee, however it's possible * (specifically in the rare L2ARC write race mentioned in * arc_buf_alloc_impl()) there will be an existing uncompressed buf that * is sharable, but wasn't at the time of its allocation. Rather than * allow a new shared uncompressed buf to be created and then shuffle * the list around to make it the last element, this simply disallows * sharing if the new buf isn't the first to be added. */ ASSERT3P(buf->b_hdr, ==, hdr); boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF; boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; return (buf_compressed == hdr_compressed && hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && !HDR_SHARED_DATA(hdr) && (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); } /* * Allocate a buf for this hdr. If you care about the data that's in the hdr, * or if you want a compressed buffer, pass those flags in. Returns 0 if the * copy was made successfully, or an error code otherwise. */ static int arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); ASSERT3P(*ret, ==, NULL); buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = hdr->b_l1hdr.b_buf; buf->b_flags = 0; add_reference(hdr, tag); /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually * compressed. */ if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new * buffer to store the buf's data. * * There are two additional restrictions here because we're sharing * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be * actively involved in an L2ARC write, because if this buf is used by * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. * Second, the hdr's ABD must be linear so that the buf's user doesn't * need to be ABD-aware. */ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && abd_is_linear(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { buf->b_data = arc_get_data_buf(hdr, arc_buf_size(buf), buf); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; hdr->b_l1hdr.b_bufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or * decompress the data. */ if (fill) { return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); } return (0); } static char *arc_onloan_tag = "onloan"; static inline void arc_loaned_bytes_update(int64_t delta) { atomic_add_64(&arc_loaned_bytes, delta); /* assert that it did not wrap around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); } /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type) { arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize, compression_type); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); arc_loaned_bytes_update(-arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); arc_loaned_bytes_update(arc_buf_size(buf)); } static void l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } static void arc_hdr_free_on_write(arc_buf_hdr_t *hdr) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); uint64_t size = arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) refcount_remove_many(&state->arcs_esize[type], size, hdr); } (void) refcount_remove_many(&state->arcs_size, size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } /* * Share the arc_buf_t's data with the hdr. Whenever we are sharing the * data buffer, we transfer the refcount ownership to the hdr and update * the appropriate kstats. */ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* * Start sharing the data buffer. We transfer the * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ refcount_transfer_ownership(&state->arcs_size, buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need * to increment its compressed and uncompressed kstats and * decrement the overhead size. */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ refcount_transfer_ownership(&state->arcs_size, hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); abd_put(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between * the arc buf and the hdr, count it as overhead. */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* * Remove an arc_buf_t from the hdr's buf list and return the last * arc_buf_t on the list. If no buffers remain on the list then return * NULL. */ static arc_buf_t * arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; arc_buf_t *lastbuf = NULL; /* * Remove the buf from the hdr list and locate the last * remaining buffer on the list. */ while (*bufp != NULL) { if (*bufp == buf) *bufp = buf->b_next; /* * If we've removed a buffer in the middle of * the list then update the lastbuf and update * bufp. */ if (*bufp != NULL) { lastbuf = *bufp; bufp = &(*bufp)->b_next; } } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); } /* * Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's * list and free it. */ static void arc_buf_destroy_impl(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Free up the data associated with the buf but only if we're not * sharing this with the hdr. If we are sharing it with the hdr, the * hdr is responsible for doing the free. */ if (buf->b_data != NULL) { /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); #endif if (arc_buf_is_shared(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; ASSERT(hdr->b_l1hdr.b_bufcnt > 0); hdr->b_l1hdr.b_bufcnt -= 1; } arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. * * There is an equivalent case for compressed bufs, but since * they aren't guaranteed to be the last buf in the list and * that is an exceedingly rare case, we just allow that space be * wasted temporarily. */ if (lastbuf != NULL) { /* Only one buf can be shared at once */ VERIFY(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_pabd(hdr); /* * We must setup a new shared block between the * last buffer and the hdr. The data would have * been allocated by the arc buf so we need to transfer * ownership to the hdr since it's now being shared. */ arc_share_buf(hdr, lastbuf); } } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT3P(lastbuf, !=, NULL); ASSERT(arc_buf_is_shared(lastbuf) || HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); } /* * Free the checksum if we're removing the last uncompressed buf from * this hdr. */ if (!arc_hdr_has_uncompressed_buf(hdr)) { arc_cksum_free(hdr); } /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr) { ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void arc_hdr_free_pabd(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the hdr is currently being written to the l2arc then * we defer freeing the data by adding it to the l2arc_free_on_write * list. The l2arc will free the data once it's finished * writing it to the l2arc device. */ if (HDR_L2_WRITING(hdr)) { arc_hdr_free_on_write(hdr); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); } hdr->b_l1hdr.b_pabd = NULL; hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, enum zio_compress compression_type, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(HDR_EMPTY(hdr)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; /* * Allocate the hdr's buffer. This will contain either * the compressed or uncompressed data depending on the block * it references and compressed arc enablement. */ arc_hdr_alloc_pabd(hdr); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); } /* * Transition between the two allocation states for the arc_buf_hdr struct. * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller * version is used when a cache buffer is only in the L2ARC in order to reduce * memory usage. */ static arc_buf_hdr_t * arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { ASSERT(HDR_HAS_L2HDR(hdr)); arc_buf_hdr_t *nhdr; l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); if (new == hdr_full_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a * header has just come out of L2ARC, so we set its state to * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * If we've reached here, We must have been called from * arc_evict_hdr(), as such we should have already been * removed from any ghost list we were previously on * (which protects us from racing with arc_evict_state), * thus no locking is needed during this check. */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); #ifdef ZFS_DEBUG if (hdr->b_l1hdr.b_thawed != NULL) { kmem_free(hdr->b_l1hdr.b_thawed, 1); hdr->b_l1hdr.b_thawed = NULL; } #endif arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } /* * The header has been reallocated so we need to re-insert it into any * lists it was on. */ (void) buf_hash_insert(nhdr, NULL); ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); mutex_enter(&dev->l2ad_mtx); /* * We must place the realloc'ed header back into the list at * the same spot. Otherwise, if it's placed earlier in the list, * l2arc_write_buffers() could find it during the function's * write phase, and try to write it out to the l2arc. */ list_insert_after(&dev->l2ad_buflist, hdr, nhdr); list_remove(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Since we're using the pointer address as the tag when * incrementing and decrementing the l2ad_alloc refcount, we * must remove the old pointer (that we're about to destroy) and * add the new pointer to the refcount. Otherwise we'd remove * the wrong pointer address when calling arc_hdr_destroy() later. */ (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); buf_discard_identity(hdr); kmem_cache_free(old, hdr); return (nhdr); } /* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, ZIO_COMPRESS_OFF, type); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } /* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. */ arc_buf_t * arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT(compression_type > ZIO_COMPRESS_OFF); ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, compression_type, ARC_BUFC_DATA); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); if (!arc_buf_is_shared(buf)) { /* * To ensure that the hdr has the correct data in it if we call * arc_decompress() on this buf before it's been written to * disk, it's easiest if we just set up sharing between the * buf and the hdr. */ ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); arc_hdr_free_pabd(hdr); arc_share_buf(hdr, buf); } return (buf); } static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t psize = arc_hdr_size(hdr); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); ARCSTAT_INCR(arcstat_l2_psize, -psize); ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); vdev_space_update(dev->l2ad_vdev, -psize, 0, 0); (void) refcount_remove_many(&dev->l2ad_alloc, psize, hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || hdr->b_l1hdr.b_bufcnt > 0); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); if (!buflist_held) mutex_enter(&dev->l2ad_mtx); /* * Even though we checked this conditional above, we * need to check this again now that we have the * l2ad_mtx. This is because we could be racing with * another thread calling l2arc_evict() which might have * destroyed this header's L2 portion as we were waiting * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) { l2arc_trim(hdr); arc_hdr_l2hdr_destroy(hdr); } if (!buflist_held) mutex_exit(&dev->l2ad_mtx); } if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); #ifdef ZFS_DEBUG if (hdr->b_l1hdr.b_thawed != NULL) { kmem_free(hdr->b_l1hdr.b_thawed, 1); hdr->b_l1hdr.b_thawed = NULL; } #endif if (hdr->b_l1hdr.b_pabd != NULL) { arc_hdr_free_pabd(hdr); } } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, NULL, tag)); arc_hdr_destroy(hdr); return; } mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); arc_buf_destroy_impl(buf); mutex_exit(hash_lock); } /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant * state of the header is dependent on its state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost * - arc_mfu -> arc_mfu_ghost * - arc_mru_ghost -> arc_l2c_only * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted */ static int64_t arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion * (i.e. its b_pabd field) during it's write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing it's L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { ARCSTAT_BUMP(arcstat_evict_l2_skip); return (bytes_evicted); } ARCSTAT_BUMP(arcstat_deleted); bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); if (HDR_HAS_L2HDR(hdr)) { /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr, hash_lock); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } return (bytes_evicted); } ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { ARCSTAT_BUMP(arcstat_mutex_miss); break; } if (buf->b_data != NULL) bytes_evicted += HDR_GET_LSIZE(hdr); mutex_exit(&buf->b_evict_lock); arc_buf_destroy_impl(buf); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); } } if (hdr->b_l1hdr.b_bufcnt == 0) { arc_cksum_free(hdr); bytes_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed * buffer then we discard it here before we change states. * This ensures that the accounting is updated correctly * in arc_free_data_impl(). */ arc_hdr_free_pabd(hdr); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } return (bytes_evicted); } static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, int64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; int evict_count = 0; ASSERT3P(marker, !=, NULL); IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); mls = multilist_sublist_lock(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; hdr = multilist_sublist_prev(mls, marker)) { if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || (evict_count >= zfs_arc_evict_batch_limit)) break; /* * To keep our iteration location, move the marker * forward. Since we're not holding hdr's hash lock, we * must be very careful and not remove 'hdr' from the * sublist. Otherwise, other consumers might mistake the * 'hdr' as not being on a sublist when they call the * multilist_link_active() function (they all rely on * the hash lock protecting concurrent insertions and * removals). multilist_sublist_move_forward() was * specifically implemented to ensure this is the case * (only 'marker' will be removed and re-inserted). */ multilist_sublist_move_forward(mls, marker); /* * The only case where the b_spa field should ever be * zero, is the marker headers inserted by * arc_evict_state(). It's possible for multiple threads * to be calling arc_evict_state() concurrently (e.g. * dsl_pool_close() and zio_inject_fault()), so we must * skip any markers we see from these other threads. */ if (hdr->b_spa == 0) continue; /* we're only interested in evicting buffers of a certain spa */ if (spa != 0 && hdr->b_spa != spa) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); /* * We aren't calling this function from any code path * that would already be holding a hash lock, so we're * asserting on this assumption to be defensive in case * this ever changes. Without this check, it would be * possible to incorrectly increment arcstat_mutex_miss * below (e.g. if the code changed such that we called * this function with a hash lock held). */ ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { uint64_t evicted = arc_evict_hdr(hdr, hash_lock); mutex_exit(hash_lock); bytes_evicted += evicted; /* * If evicted is zero, arc_evict_hdr() must have * decided to skip this header, don't increment * evict_count in this case. */ if (evicted != 0) evict_count++; /* * If arc_size isn't overflowing, signal any * threads that might happen to be waiting. * * For each header evicted, we wake up a single * thread. If we used cv_broadcast, we could * wake up "too many" threads causing arc_size * to significantly overflow arc_c; since * arc_get_data_impl() doesn't check for overflow * when it's woken up (it doesn't because it's * possible for the ARC to be overflowing while * full of un-evictable buffers, and the * function should proceed in this case). * * If threads are left sleeping, due to not * using cv_broadcast, they will be woken up * just before arc_reclaim_thread() sleeps. */ mutex_enter(&arc_reclaim_lock); if (!arc_is_overflowing()) cv_signal(&arc_reclaim_waiters_cv); mutex_exit(&arc_reclaim_lock); } else { ARCSTAT_BUMP(arcstat_mutex_miss); } } multilist_sublist_unlock(mls); return (bytes_evicted); } /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the * appropriate evict state. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so, may not catch all candidates. * It may also return without evicting as much space as requested. * * If bytes is specified using the special value ARC_EVICT_ALL, this * will evict all available (i.e. unlocked and evictable) buffers from * the given arc state; which is used by arc_flush(). */ static uint64_t arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { uint64_t total_evicted = 0; multilist_t *ml = state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); num_sublists = multilist_get_num_sublists(ml); /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes * to evict, we want to keep trying. The markers allow us to * pick up where we left off for each individual sublist, rather * than starting from the tail each time. */ markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); for (int i = 0; i < num_sublists; i++) { markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); /* * A b_spa of 0 is used to indicate that this header is * a marker. This fact is used in arc_adjust_type() and * arc_evict_state_impl(). */ markers[i]->b_spa = 0; multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist * (e.g. index 0) would cause evictions to favor certain * sublists over others. */ int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; for (int i = 0; i < num_sublists; i++) { uint64_t bytes_remaining; uint64_t bytes_evicted; if (bytes == ARC_EVICT_ALL) bytes_remaining = ARC_EVICT_ALL; else if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, markers[sublist_idx], spa, bytes_remaining); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; /* we've reached the end, wrap to the beginning */ if (++sublist_idx >= num_sublists) sublist_idx = 0; } /* * If we didn't evict anything during this scan, we have * no reason to believe we'll evict more during another * scan, so break the loop. */ if (scan_evicted == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); /* * When bytes is ARC_EVICT_ALL, the only way to * break the loop is when scan_evicted is zero. * In that case, we actually have evicted enough, * so we don't want to increment the kstat. */ if (bytes != ARC_EVICT_ALL) { ASSERT3S(total_evicted, <, bytes); ARCSTAT_BUMP(arcstat_evict_not_enough); } break; } } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); kmem_cache_free(hdr_full_cache, markers[i]); } kmem_free(markers, sizeof (*markers) * num_sublists); return (total_evicted); } /* * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might * wind up in an infinite loop, continually trying to evict buffers. */ static uint64_t arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; while (refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) break; } return (evicted); } /* * Evict the specified number of bytes from the state specified, * restricting eviction to the spa and type given. This function * prevents us from trying to evict more from a state's list than * is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". */ static uint64_t arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { int64_t delta; if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, spa, delta, type)); } return (0); } /* * Evict metadata buffers from the cache, such that arc_meta_used is * capped by the arc_meta_limit tunable. */ static uint64_t arc_adjust_meta(uint64_t meta_used) { uint64_t total_evicted = 0; int64_t target; /* * If we're over the meta limit, we want to evict enough * metadata to get back under the meta limit. We don't want to * evict so much that we drop the MRU below arc_p, though. If * we're over the meta limit more than we're over arc_p, we * evict some from the MRU here, and some from the MFU below. */ target = MIN((int64_t)(meta_used - arc_meta_limit), (int64_t)(refcount_count(&arc_anon->arcs_size) + refcount_count(&arc_mru->arcs_size) - arc_p)); total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); /* * Similar to the above, we want to evict enough bytes to get us * below the meta limit, but not so much as to drop us below the * space allotted to the MFU (which is defined as arc_c - arc_p). */ target = MIN((int64_t)(meta_used - arc_meta_limit), (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); return (total_evicted); } /* * Return the type of the oldest buffer in the given arc state * * This function will select a random sublist of type ARC_BUFC_DATA and * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist * is compared, and the type which contains the "older" buffer will be * returned. */ static arc_buf_contents_t arc_adjust_type(arc_state_t *state) { multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; int data_idx = multilist_get_random_index(data_ml); int meta_idx = multilist_get_random_index(meta_ml); multilist_sublist_t *data_mls; multilist_sublist_t *meta_mls; arc_buf_contents_t type; arc_buf_hdr_t *data_hdr; arc_buf_hdr_t *meta_hdr; /* * We keep the sublist lock until we're finished, to prevent * the headers from being destroyed via arc_evict_state(). */ data_mls = multilist_sublist_lock(data_ml, data_idx); meta_mls = multilist_sublist_lock(meta_ml, meta_idx); /* * These two loops are to ensure we skip any markers that * might be at the tail of the lists due to arc_evict_state(). */ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { if (data_hdr->b_spa != 0) break; } for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { if (meta_hdr->b_spa != 0) break; } if (data_hdr == NULL && meta_hdr == NULL) { type = ARC_BUFC_DATA; } else if (data_hdr == NULL) { ASSERT3P(meta_hdr, !=, NULL); type = ARC_BUFC_METADATA; } else if (meta_hdr == NULL) { ASSERT3P(data_hdr, !=, NULL); type = ARC_BUFC_DATA; } else { ASSERT3P(data_hdr, !=, NULL); ASSERT3P(meta_hdr, !=, NULL); /* The headers can't be on the sublist without an L1 header */ ASSERT(HDR_HAS_L1HDR(data_hdr)); ASSERT(HDR_HAS_L1HDR(meta_hdr)); if (data_hdr->b_l1hdr.b_arc_access < meta_hdr->b_l1hdr.b_arc_access) { type = ARC_BUFC_DATA; } else { type = ARC_BUFC_METADATA; } } multilist_sublist_unlock(meta_mls); multilist_sublist_unlock(data_mls); return (type); } /* * Evict buffers from the cache, such that arc_size is capped by arc_c. */ static uint64_t arc_adjust(void) { uint64_t total_evicted = 0; uint64_t bytes; int64_t target; uint64_t asize = aggsum_value(&arc_size); uint64_t ameta = aggsum_value(&arc_meta_used); /* * If we're over arc_meta_limit, we want to correct that before * potentially evicting data buffers below. */ total_evicted += arc_adjust_meta(ameta); /* * Adjust MRU size * * If we're over the target cache size, we want to evict enough * from the list to get back to our target size. We don't want * to evict too much from the MRU, such that it drops below * arc_p. So, if we're over our target cache size more than * the MRU is over arc_p, we'll evict enough to get back to * arc_p here, and then evict more from the MFU below. */ target = MIN((int64_t)(asize - arc_c), (int64_t)(refcount_count(&arc_anon->arcs_size) + refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); /* * If we're below arc_meta_min, always prefer to evict data. * Otherwise, try to satisfy the requested number of bytes to * evict from the type which contains older buffers; in an * effort to keep newer buffers in the cache regardless of their * type. If we cannot satisfy the number of bytes from this * type, spill over into the next type. */ if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && ameta > arc_meta_min) { bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); } else { bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from metadata. */ target -= bytes; total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); } /* * Adjust MFU size * * Now that we've tried to evict enough from the MRU to get its * size back to arc_p, if we're still above the target cache * size, we evict the rest from the MFU. */ target = asize - arc_c; if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && ameta > arc_meta_min) { bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); } else { bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from data. */ target -= bytes; total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); } /* * Adjust ghost lists * * In addition to the above, the ARC also defines target values * for the ghost lists. The sum of the mru list and mru ghost * list should never exceed the target size of the cache, and * the sum of the mru list, mfu list, mru ghost list, and mfu * ghost list should never exceed twice the target size of the * cache. The following logic enforces these limits on the ghost * caches, and evicts from them as needed. */ target = refcount_count(&arc_mru->arcs_size) + refcount_count(&arc_mru_ghost->arcs_size) - arc_c; bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); /* * We assume the sum of the mru list and mfu list is less than * or equal to arc_c (we enforced this above), which means we * can use the simpler of the two equations below: * * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c * mru ghost + mfu ghost <= arc_c */ target = refcount_count(&arc_mru_ghost->arcs_size) + refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); return (total_evicted); } void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ ASSERT(!retry || spa == 0); if (spa != NULL) guid = spa_load_guid(spa); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); } void arc_shrink(int64_t to_free) { uint64_t asize = aggsum_value(&arc_size); if (arc_c > arc_c_min) { DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, arc_c_min, uint64_t, arc_p, uint64_t, to_free); if (arc_c > arc_c_min + to_free) atomic_add_64(&arc_c, -to_free); else arc_c = arc_c_min; atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); if (asize < arc_c) arc_c = MAX(asize, arc_c_min); if (arc_p > arc_c) arc_p = (arc_c >> 1); DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, arc_p); ASSERT(arc_c >= arc_c_min); ASSERT((int64_t)arc_p >= 0); } if (asize > arc_c) { DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize, uint64_t, arc_c); (void) arc_adjust(); } } typedef enum free_memory_reason_t { FMR_UNKNOWN, FMR_NEEDFREE, FMR_LOTSFREE, FMR_SWAPFS_MINFREE, FMR_PAGES_PP_MAXIMUM, FMR_HEAP_ARENA, FMR_ZIO_ARENA, } free_memory_reason_t; int64_t last_free_memory; free_memory_reason_t last_free_reason; /* * Additional reserve of pages for pp_reserve. */ int64_t arc_pages_pp_reserve = 64; /* * Additional reserve of pages for swapfs. */ int64_t arc_swapfs_reserve = 64; /* * Return the amount of memory that can be consumed before reclaim will be * needed. Positive if there is sufficient free memory, negative indicates * the amount of memory that needs to be freed up. */ static int64_t arc_available_memory(void) { int64_t lowest = INT64_MAX; int64_t n; free_memory_reason_t r = FMR_UNKNOWN; #ifdef _KERNEL #ifdef __FreeBSD__ /* * Cooperate with pagedaemon when it's time for it to scan * and reclaim some pages. */ n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); if (n < lowest) { lowest = n; r = FMR_LOTSFREE; } #else if (needfree > 0) { n = PAGESIZE * (-needfree); if (n < lowest) { lowest = n; r = FMR_NEEDFREE; } } /* * check that we're out of range of the pageout scanner. It starts to * schedule paging if freemem is less than lotsfree and needfree. * lotsfree is the high-water mark for pageout, and needfree is the * number of needed free pages. We add extra pages here to make sure * the scanner doesn't start up while we're freeing memory. */ n = PAGESIZE * (freemem - lotsfree - needfree - desfree); if (n < lowest) { lowest = n; r = FMR_LOTSFREE; } /* * check to make sure that swapfs has enough space so that anon * reservations can still succeed. anon_resvmem() checks that the * availrmem is greater than swapfs_minfree, and the number of reserved * swap pages. We also add a bit of extra here just to prevent * circumstances from getting really dire. */ n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - desfree - arc_swapfs_reserve); if (n < lowest) { lowest = n; r = FMR_SWAPFS_MINFREE; } /* * Check that we have enough availrmem that memory locking (e.g., via * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum * stores the number of pages that cannot be locked; when availrmem * drops below pages_pp_maximum, page locking mechanisms such as * page_pp_lock() will fail.) */ n = PAGESIZE * (availrmem - pages_pp_maximum - arc_pages_pp_reserve); if (n < lowest) { lowest = n; r = FMR_PAGES_PP_MAXIMUM; } #endif /* __FreeBSD__ */ #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) /* * If we're on an i386 platform, it's possible that we'll exhaust the * kernel heap space before we ever run out of available physical * memory. Most checks of the size of the heap_area compare against * tune.t_minarmem, which is the minimum available real memory that we * can have in the system. However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to * calculate the total heap-size, and reclaim if more than 3/4ths of the * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ n = uma_avail() - (long)(uma_limit() / 4); if (n < lowest) { lowest = n; r = FMR_HEAP_ARENA; } #endif /* * If zio data pages are being allocated out of a separate heap segment, * then enforce that the size of available vmem for this arena remains * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. * * Note that reducing the arc_zio_arena_free_shift keeps more virtual * memory (in the zio_arena) free, which can avoid memory * fragmentation issues. */ if (zio_arena != NULL) { n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - (vmem_size(zio_arena, VMEM_ALLOC) >> arc_zio_arena_free_shift); if (n < lowest) { lowest = n; r = FMR_ZIO_ARENA; } } #else /* _KERNEL */ /* Every 100 calls, free a small amount */ if (spa_get_random(100) == 0) lowest = -1024; #endif /* _KERNEL */ last_free_memory = lowest; last_free_reason = r; DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); return (lowest); } /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ static boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; extern kmem_cache_t *range_seg_cache; extern kmem_cache_t *abd_chunk_cache; static __noinline void arc_kmem_reap_now(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; DTRACE_PROBE(arc__kmem_reap_start); #ifdef _KERNEL if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) { /* * We are exceeding our meta-data cache limit. * Purge some DNLC entries to release holds on meta-data. */ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); } #if defined(__i386) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif /* * If a kmem reap is already active, don't schedule more. We must * check for this because kmem_cache_reap_soon() won't actually * block on the cache being reaped (this is to prevent callers from * becoming implicitly blocked by a system-wide kmem reap -- which, * on a system with many, many full magazines, can take minutes). */ if (kmem_cache_reap_active()) return; for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_soon(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_soon(zio_data_buf_cache[i]); } } kmem_cache_reap_soon(abd_chunk_cache); kmem_cache_reap_soon(buf_cache); kmem_cache_reap_soon(hdr_full_cache); kmem_cache_reap_soon(hdr_l2only_cache); kmem_cache_reap_soon(range_seg_cache); #ifdef illumos if (zio_arena != NULL) { /* * Ask the vmem arena to reclaim unused memory from its * quantum caches. */ vmem_qcache_reap(zio_arena); } #endif DTRACE_PROBE(arc__kmem_reap_end); } /* * Threads can block in arc_get_data_impl() waiting for this thread to evict * enough data and signal them to proceed. When this happens, the threads in * arc_get_data_impl() are sleeping while holding the hash lock for their * particular arc header. Thus, we must be careful to never sleep on a * hash lock in this thread. This is to prevent the following deadlock: * * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", * waiting for the reclaim thread to signal it. * * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, * fails, and goes to sleep forever. * * This possible deadlock is avoided by always acquiring a hash lock * using mutex_tryenter() from arc_reclaim_thread(). */ /* ARGSUSED */ static void arc_reclaim_thread(void *unused __unused) { hrtime_t growtime = 0; hrtime_t kmem_reap_time = 0; callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); mutex_enter(&arc_reclaim_lock); while (!arc_reclaim_thread_exit) { uint64_t evicted = 0; /* * This is necessary in order for the mdb ::arc dcmd to * show up to date information. Since the ::arc command * does not call the kstat's update function, without * this call, the command may show stale stats for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this change, the data might be up to 1 second * out of date; but that should suffice. The arc_state_t * structures can be queried directly if more accurate * information is needed. */ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); mutex_exit(&arc_reclaim_lock); /* * We call arc_adjust() before (possibly) calling * arc_kmem_reap_now(), so that we can wake up * arc_get_data_impl() sooner. */ evicted = arc_adjust(); int64_t free_memory = arc_available_memory(); if (free_memory < 0) { hrtime_t curtime = gethrtime(); arc_no_grow = B_TRUE; arc_warm = B_TRUE; /* * Wait at least zfs_grow_retry (default 60) seconds * before considering growing. */ growtime = curtime + SEC2NSEC(arc_grow_retry); /* * Wait at least arc_kmem_cache_reap_retry_ms * between arc_kmem_reap_now() calls. Without * this check it is possible to end up in a * situation where we spend lots of time * reaping caches, while we're near arc_c_min. */ if (curtime >= kmem_reap_time) { arc_kmem_reap_now(); kmem_reap_time = gethrtime() + MSEC2NSEC(arc_kmem_cache_reap_retry_ms); } /* * If we are still low on memory, shrink the ARC * so that we have arc_shrink_min free space. */ free_memory = arc_available_memory(); int64_t to_free = (arc_c >> arc_shrink_shift) - free_memory; if (to_free > 0) { #ifdef _KERNEL #ifdef illumos to_free = MAX(to_free, ptob(needfree)); #endif #endif arc_shrink(to_free); } } else if (free_memory < arc_c >> arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (gethrtime() >= growtime) { arc_no_grow = B_FALSE; } mutex_enter(&arc_reclaim_lock); /* * If evicted is zero, we couldn't evict anything via * arc_adjust(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. */ if (aggsum_compare(&arc_size, arc_c) <= 0|| evicted == 0) { /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * up any threads before we go to sleep. */ cv_broadcast(&arc_reclaim_waiters_cv); /* * Block until signaled, or after one second (we * might need to perform arc_kmem_reap_now() * even if we aren't being signalled) */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_hires(&arc_reclaim_thread_cv, &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); } } arc_reclaim_thread_exit = B_FALSE; cv_broadcast(&arc_reclaim_thread_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ thread_exit(); } static u_int arc_dnlc_evicts_arg; extern struct vfsops zfs_vfsops; static void arc_dnlc_evicts_thread(void *dummy __unused) { callb_cpr_t cpr; u_int percent; CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG); mutex_enter(&arc_dnlc_evicts_lock); while (!arc_dnlc_evicts_thread_exit) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock); if (arc_dnlc_evicts_arg != 0) { percent = arc_dnlc_evicts_arg; mutex_exit(&arc_dnlc_evicts_lock); #ifdef _KERNEL vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops); #endif mutex_enter(&arc_dnlc_evicts_lock); /* * Clear our token only after vnlru_free() * pass is done, to avoid false queueing of * the requests. */ arc_dnlc_evicts_arg = 0; } } arc_dnlc_evicts_thread_exit = FALSE; cv_broadcast(&arc_dnlc_evicts_cv); CALLB_CPR_EXIT(&cpr); thread_exit(); } void dnlc_reduce_cache(void *arg) { u_int percent; percent = (u_int)(uintptr_t)arg; mutex_enter(&arc_dnlc_evicts_lock); if (arc_dnlc_evicts_arg == 0) { arc_dnlc_evicts_arg = percent; cv_broadcast(&arc_dnlc_evicts_cv); } mutex_exit(&arc_dnlc_evicts_lock); } /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are comming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(int bytes, arc_state_t *state) { int mult; uint64_t arc_p_min = (arc_c >> arc_p_min_shift); int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); if (state == arc_l2c_only) return; ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: * - if we just hit in the MRU ghost list, then increase * the target size of the MRU list. * - if we just hit in the MFU ghost list, then increase * the target size of the MFU list by decreasing the * target size of the MRU list. */ if (state == arc_mru_ghost) { mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { uint64_t delta; mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); arc_p = MAX(arc_p_min, arc_p - delta); } ASSERT((int64_t)arc_p >= 0); if (arc_reclaim_needed()) { cv_signal(&arc_reclaim_thread_cv); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) > 0) { DTRACE_PROBE1(arc__inc_adapt, int, bytes); atomic_add_64(&arc_c, (int64_t)bytes); if (arc_c > arc_c_max) arc_c = arc_c_max; else if (state == arc_anon) atomic_add_64(&arc_p, (int64_t)bytes); if (arc_p > arc_c) arc_p = arc_c; } ASSERT((int64_t)arc_p >= 0); } /* * Check if arc_size has grown past our upper threshold, determined by * zfs_arc_overflow_shift. */ static boolean_t arc_is_overflowing(void) { /* Always allow at least one block of overflow */ uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift); /* * We just compare the lower bound here for performance reasons. Our * primary goals are to make sure that the arc never grows without * bound, and that it can reach its maximum size. This check * accomplishes both goals. The maximum amount we could run over by is * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ return (aggsum_lower_bound(&arc_size) >= arc_c + overflow); } static abd_t * arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag); if (type == ARC_BUFC_METADATA) { return (abd_alloc(size, B_TRUE)); } else { ASSERT(type == ARC_BUFC_DATA); return (abd_alloc(size, B_FALSE)); } } static void * arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { ASSERT(type == ARC_BUFC_DATA); return (zio_data_buf_alloc(size)); } } /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. */ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); arc_adapt(size, state); /* * If arc_size is currently overflowing, and has grown past our * upper limit, we must be adding data faster than the evict * thread can evict. Thus, to ensure we don't compound the * problem by adding more data and forcing arc_size to grow even * further past it's target size, we halt and wait for the * eviction thread to catch up. * * It's also possible that the reclaim thread is unable to evict * enough buffers to get arc_size below the overflow limit (e.g. * due to buffers being un-evictable, or hash lock collisions). * In this case, we want to proceed regardless if we're * overflowing; thus we don't use a while loop here. */ if (arc_is_overflowing()) { mutex_enter(&arc_reclaim_lock); /* * Now that we've acquired the lock, we may no longer be * over the overflow limit, lets check. * * We're ignoring the case of spurious wake ups. If that * were to happen, it'd let this thread consume an ARC * buffer before it should have (i.e. before we're under * the overflow limit and were signalled by the reclaim * thread). As long as that is a rare occurrence, it * shouldn't cause any harm. */ if (arc_is_overflowing()) { cv_signal(&arc_reclaim_thread_cv); cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); } mutex_exit(&arc_reclaim_lock); } VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { arc_space_consume(size, ARC_SPACE_DATA); } /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ if (!GHOST_STATE(state)) { (void) refcount_add_many(&state->arcs_size, size, tag); /* * If this is reached via arc_read, the link is * protected by the hash lock. If reached via * arc_buf_alloc, the header should not be accessed by * any other thread. And, if reached via arc_read_done, * the hash lock will protect it if it's found in the * hash table; otherwise no other thread should be * trying to [add|remove]_reference it. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); (void) refcount_add_many(&state->arcs_esize[type], size, tag); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ if (aggsum_compare(&arc_size, arc_c) < 0 && hdr->b_l1hdr.b_state == arc_anon && (refcount_count(&arc_anon->arcs_size) + refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } ARCSTAT_BUMP(arcstat_allocated); } static void arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) { arc_free_data_impl(hdr, size, tag); abd_free(abd); } static void arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_free_data_impl(hdr, size, tag); if (type == ARC_BUFC_METADATA) { zio_buf_free(buf, size); } else { ASSERT(type == ARC_BUFC_DATA); zio_data_buf_free(buf, size); } } /* * Free the arc data buffer. */ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) refcount_remove_many(&state->arcs_esize[type], size, tag); } (void) refcount_remove_many(&state->arcs_size, size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } } /* * This routine is called whenever a buffer is accessed. * NOTE: the hash lock is dropped in this function. */ static void arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ ASSERT0(hdr->b_l1hdr.b_arc_access); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr, hash_lock); } else if (hdr->b_l1hdr.b_state == arc_mru) { now = ddi_get_lbolt(); /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read * (any subsequent access will bump this into the MFU state). * or * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { /* link protected by hash lock */ ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); ARCSTAT_BUMP(arcstat_mru_hits); } hdr->b_l1hdr.b_arc_access = now; return; } /* * This buffer has been "accessed" only once so far, * but it is still in the cache. Move it to the MFU * state. */ if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } ARCSTAT_BUMP(arcstat_mru_hits); } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but * was evicted from the cache. Move it to the * MFU state. */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { new_state = arc_mru; if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, hdr, hash_lock); ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. * * NOTE: an add_reference() that occurred when we did * the arc_read() will have kicked this off the list. * If it was a prefetch, we will explicitly move it to * the head of the list now. */ ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has * been evicted from the cache. Move it back to the * MFU state. */ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. */ new_state = arc_mru; } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(new_state, hdr, hash_lock); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } else { ASSERT(!"invalid arc state"); } } /* * This routine is called by dbuf_hold() to update the arc_access() state * which otherwise would be skipped for entries in the dbuf cache. */ void arc_buf_access(arc_buf_t *buf) { mutex_enter(&buf->b_evict_lock); arc_buf_hdr_t *hdr = buf->b_hdr; /* * Avoid taking the hash_lock when possible as an optimization. * The header must be checked again under the hash_lock in order * to handle the case where it is concurrently being released. */ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(&buf->b_evict_lock); ARCSTAT_BUMP(arcstat_access_skip); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(hash_lock); mutex_exit(&buf->b_evict_lock); ARCSTAT_BUMP(arcstat_access_skip); return; } mutex_exit(&buf->b_evict_lock); ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ /* ARGSUSED */ void arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { if (buf == NULL) return; bcopy(buf->b_data, arg, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_read_done_func_t */ /* ARGSUSED */ void arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; - if (buf == NULL) { + ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; } else { + ASSERT(zio == NULL || zio->io_error == 0); *bufp = buf; - ASSERT(buf->b_data); + ASSERT(buf->b_data != NULL); } } static void arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); } } static void arc_read_done(zio_t *zio) { arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; arc_callback_t *acb; boolean_t freeable = B_FALSE; + boolean_t no_zio_error = (zio->io_error == 0); /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. */ if (HDR_IN_HASH_TABLE(hdr)) { ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); ASSERT3P(hash_lock, !=, NULL); } - if (zio->io_error == 0) { + if (no_zio_error) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); - if (hash_lock && zio->io_error == 0 && - hdr->b_l1hdr.b_state == arc_anon) { + if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already * called arc_access (to prevent any simultaneous readers from * getting confused). */ arc_access(hdr, hash_lock); } /* * If a read request has a callback (i.e. acb_done is not NULL), then we * make a buf containing the data according to the parameters which were * passed in. The implementation of arc_buf_alloc_impl() ensures that we * aren't needlessly decompressing the data multiple times. */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { if (!acb->acb_done) continue; callback_cnt++; - if (zio->io_error != 0) - continue; - - int error = arc_buf_alloc_impl(hdr, acb->acb_private, - acb->acb_compressed, - B_TRUE, &acb->acb_buf); - if (error != 0) { - arc_buf_destroy(acb->acb_buf, acb->acb_private); - acb->acb_buf = NULL; + if (no_zio_error) { + int error = arc_buf_alloc_impl(hdr, acb->acb_private, + acb->acb_compressed, zio->io_error == 0, + &acb->acb_buf); + if (error != 0) { + /* + * Decompression failed. Set io_error + * so that when we call acb_done (below), + * we will indicate that the read failed. + * Note that in the unusual case where one + * callback is compressed and another + * uncompressed, we will mark all of them + * as failed, even though the uncompressed + * one can't actually fail. In this case, + * the hdr will not be anonymous, because + * if there are multiple callbacks, it's + * because multiple threads found the same + * arc buf in the hash table. + */ + zio->io_error = error; + } } - - if (zio->io_error == 0) - zio->io_error = error; } + /* + * If there are multiple callbacks, we must have the hash lock, + * because the only way for multiple threads to find this hdr is + * in the hash table. This ensures that if there are multiple + * callbacks, the hdr is not anonymous. If it were anonymous, + * we couldn't use arc_buf_destroy() in the error case below. + */ + ASSERT(callback_cnt < 2 || hash_lock != NULL); + hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (callback_cnt == 0) { ASSERT(HDR_PREFETCH(hdr)); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (zio->io_error == 0) { + if (no_zio_error) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* * Broadcast before we drop the hash_lock to avoid the possibility * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ cv_broadcast(&hdr->b_l1hdr.b_cv); if (hash_lock != NULL) { mutex_exit(hash_lock); } else { /* * This block was freed while we waited for the read to * complete. It has been removed from the hash table and * moved to the anonymous state (so that it won't show up * in the cache). */ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { - if (acb->acb_done) { + if (acb->acb_done != NULL) { + if (zio->io_error != 0 && acb->acb_buf != NULL) { + /* + * If arc_buf_alloc_impl() fails during + * decompression, the buf will still be + * allocated, and needs to be freed here. + */ + arc_buf_destroy(acb->acb_buf, acb->acb_private); + acb->acb_buf = NULL; + } acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_buf, acb->acb_private); } if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_next; kmem_free(acb, sizeof (arc_callback_t)); } if (freeable) arc_hdr_destroy(hdr); } /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; int rc = 0; ASSERT(!BP_IS_EMBEDDED(bp) || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); top: if (!BP_IS_EMBEDDED(bp)) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. */ hdr = buf_hash_find(guid, bp, &hash_lock); } if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* * This is a sync read that needs to wait for * an in-flight async read. Request that the * zio have its priority upgraded. */ zio_change_priority(head_zio, priority); DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } if (*arc_flags & ARC_FLAG_WAIT) { cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); if (done) { arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); ASSERT3P(acb->acb_done, !=, NULL); acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); return (0); } mutex_exit(hash_lock); return (0); } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); if (done) { if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { /* * This is a demand read which does not have to * wait for i/o because we did a predictive * prefetch i/o for it, which has completed. */ DTRACE_PROBE1( arc__demand__hit__predictive__prefetch, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP( arcstat_demand_hit_predictive_prefetch); arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { ARCSTAT_BUMP( arcstat_demand_hit_prescient_prefetch); arc_hdr_clear_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); } ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ rc = arc_buf_alloc_impl(hdr, private, compressed_read, B_TRUE, &buf); if (rc != 0) { arc_buf_destroy(buf, private); buf = NULL; } ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc == 0 || rc != ENOENT); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); if (done) done(NULL, zb, bp, buf, private); } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, BP_GET_COMPRESS(bp), type); if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } } else { /* * This block is in the ghost cache. If it was L2-only * (and thus didn't have an L1 hdr), we realloc the * header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * This is a delicate dance that we play here. * This hdr is in the ghost list so we access it * to move it out of the ghost list before we * initiate the read. If it's a prefetch then * it won't have a callback so we'll remove the * reference that arc_buf_alloc_impl() created. We * do this after we've called arc_access() to * avoid hitting an assert in remove_reference(). */ arc_access(hdr, hash_lock); arc_hdr_alloc_pabd(hdr); } ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); /* * If compression is enabled on the hdr, then will do * RAW I/O and will store the compressed data in the hdr's * data block. Otherwise, the hdr's data block will contain * the uncompressed data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { zio_flags |= ZIO_FLAG_RAW; } if (*arc_flags & ARC_FLAG_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* * Lock out L2ARC device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } /* * We count both async reads and scrub IOs as asynchronous so * that both can be upgraded in the event of a cache hit while * the read IO is still in-flight. */ if (priority == ZIO_PRIORITY_ASYNC_READ || priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); /* * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. */ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); #ifdef _KERNEL #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_force(curproc, RACCT_READBPS, size); racct_add_force(curproc, RACCT_READIOPS, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; #endif if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. * 5. This isn't prefetch and l2arc_noprefetch is set. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; asize = vdev_psize_to_asize(vd, size); if (asize != size) { abd = abd_alloc_for_io(asize, HDR_ISTYPE_METADATA(hdr)); cb->l2rcb_abd = abd; } else { abd = hdr->b_l1hdr.b_pabd; } ASSERT(addr >= VDEV_LABEL_START_SIZE && addr + asize <= vd->vdev_psize - VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ ASSERT3U(HDR_GET_COMPRESS(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); return (0); } ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) return (0); /* l2arc read error; goto zio_read() */ if (hash_lock != NULL) mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); if (l2arc_ndev != 0) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, arc_read_done, hdr, priority, zio_flags, zb); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } return (0); } /* * Notify the arc that a block was freed, and thus will never be used again. */ void arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp)); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; /* * We might be trying to free a block that is still doing I/O * (i.e. prefetch) or has a reference (i.e. a dedup-ed, * dmu_sync-ed block). If this block is being prefetched, then it * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr * until the I/O completes. A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU * state and discoverable. When the txg finally syncs it detects that * the block was overridden in open context and issues an override I/O. * Since this is a dedup block, the override I/O will determine if the * block is already in the DDT. If so, then it will replace the io_bp * with the bp from the DDT and allow the I/O to finish. When the I/O * reaches the done callback, dbuf_write_override_done, it will * check to see if the io_bp and io_bp_override are identical. * If they are not, then it indicates that the bp was replaced with * the bp in the DDT and the override bp is freed. This allows * us to arrive here with a reference on a block that is being * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. */ if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if it's DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ mutex_enter(&buf->b_evict_lock); ASSERT(HDR_HAS_L1HDR(hdr)); /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { mutex_exit(&buf->b_evict_lock); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT(HDR_EMPTY(hdr)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; /* * If the buf is being overridden then it may already * have a hdr that is not empty. */ buf_discard_identity(hdr); arc_buf_thaw(buf); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); /* * This assignment is only valid as long as the hash_lock is * held, we must be careful not to reference state or the * b_state field after dropping the lock. */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); /* * We have to recheck this conditional again now that * we're holding the l2ad_mtx to prevent a race with * another thread which might be concurrently calling * l2arc_evict(). In that case, l2arc_evict() might have * destroyed the header's L2 portion as we were waiting * to acquire the l2ad_mtx. */ if (HDR_HAS_L2HDR(hdr)) { l2arc_trim(hdr); arc_hdr_l2hdr_destroy(hdr); } mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); } /* * Do we have more than one buf? */ if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); enum zio_compress compress = HDR_GET_COMPRESS(hdr); arc_buf_contents_t type = arc_buf_type(hdr); VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); (void) remove_reference(hdr, hash_lock, tag); if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. Also find the last buffer * in the hdr's buffer list. */ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ if (arc_buf_is_shared(buf)) { VERIFY(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between * buf and the arc_buf_hdr_t. */ arc_unshare_buf(hdr, buf); /* * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { arc_hdr_alloc_pabd(hdr); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT(arc_buf_is_shared(lastbuf) || HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_size, arc_buf_size(buf), buf); if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } hdr->b_l1hdr.b_bufcnt -= 1; arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); #endif mutex_exit(hash_lock); /* * Allocate a new hdr. The new hdr will contain a b_pabd * buffer which will be freed in arc_write(). */ nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; nhdr->b_l1hdr.b_bufcnt = 1; (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); (void) refcount_add_many(&arc_anon->arcs_size, arc_buf_size(buf), buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_change_state(arc_anon, hdr, hash_lock); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } } int arc_released(arc_buf_t *buf) { int released; mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { int referenced; mutex_enter(&buf->b_evict_lock); referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the * callback was invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); #ifdef illumos arc_buf_unwatch(buf); #endif if (hdr->b_l1hdr.b_pabd != NULL) { if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_pabd(hdr); } } } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); callback->awcb_ready(zio, buf, callback->awcb_private); if (HDR_IO_IN_PROGRESS(hdr)) ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); arc_cksum_compute(buf); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); enum zio_compress compress; if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); compress = BP_GET_COMPRESS(zio->io_bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); /* * Fill the hdr with data. If the hdr is compressed, the data we want * is available from the zio, otherwise we can take it from the buf. * * We might be able to share the buf's data with the hdr here. However, * doing so would cause the ARC to be full of linear ABDs if we write a * lot of shareable data. As a compromise, we check whether scattered * ABDs are allowed, and assume that if they are then the user wants * the ARC to be primarily filled with them regardless of the data being * written. Therefore, if they're allowed then we allocate one and copy * the data into it; otherwise, we share the data directly if we can. */ if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { arc_hdr_alloc_pabd(hdr); /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the * hdr's compression setting rather than the io_bp's. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF); ASSERT3U(psize, >, 0); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); arc_share_buf(hdr, buf); } arc_hdr_verify(hdr, zio->io_bp); } static void arc_write_children_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; callback->awcb_children_ready(zio, buf, callback->awcb_private); } /* * The SPA calls this callback for each physical write that happens on behalf * of a logical write. See the comment in dbuf_write_physdone() for details. */ static void arc_write_physdone(zio_t *zio) { arc_write_callback_t *cb = zio->io_private; if (cb->awcb_physdone != NULL) cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT3U(zio->io_error, ==, 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); mutex_exit(hash_lock); arc_hdr_destroy(exists); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); abd_put(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; zio_prop_t localprop = *zp; ASSERT3P(ready, !=, NULL); ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (ARC_BUF_COMPRESSED(buf)) { /* * We're writing a pre-compressed buffer. Make the * compression algorithm requested by the zio_prop_t match * the pre-compressed buffer's compression algorithm. */ localprop.zp_compress = HDR_GET_COMPRESS(hdr); ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); zio_flags |= ZIO_FLAG_RAW; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; /* * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). */ if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_pabd(hdr); } VERIFY3P(buf->b_data, !=, NULL); arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); } ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); zio = zio_write(pio, spa, txg, bp, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, priority, zio_flags, zb); return (zio); } static int arc_memory_throttle(uint64_t reserve, uint64_t txg) { #ifdef _KERNEL uint64_t available_memory = ptob(freemem); static uint64_t page_load = 0; static uint64_t last_txg = 0; #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) available_memory = MIN(available_memory, uma_avail()); #endif if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) return (0); if (txg > last_txg) { last_txg = txg; page_load = 0; } /* * If we are in pageout, we know that memory is already tight, * the arc is already going to be evicting, so we just want to * continue to let page writes occur as quickly as possible. */ if (curproc == pageproc) { if (page_load > MAX(ptob(minfree), available_memory) / 4) return (SET_ERROR(ERESTART)); /* Note: reserve is inflated, so we deflate */ page_load += reserve / 8; return (0); } else if (page_load > 0 && arc_reclaim_needed()) { /* memory is low, delay before restarting */ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); return (SET_ERROR(EAGAIN)); } page_load = 0; #endif return (0); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (reserve > arc_c/4 && !arc_no_grow) { arc_c = MIN(arc_c_max, reserve * 4); DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); } if (reserve > arc_c) return (SET_ERROR(ENOMEM)); /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ if (reserve + arc_tempreserve + anon_size > arc_c / 2 && anon_size > arc_c / 4) { uint64_t meta_esize = refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", arc_tempreserve >> 10, meta_esize >> 10, data_esize >> 10, reserve >> 10, arc_c >> 10); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = refcount_count(&state->arcs_size); evict_data->value.ui64 = refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) { return (EACCES); } else { arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); ARCSTAT(arcstat_size) = aggsum_value(&arc_size); ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used); ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size); ARCSTAT(arcstat_metadata_size) = aggsum_value(&astat_metadata_size); ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size); ARCSTAT(arcstat_other_size) = aggsum_value(&astat_other_size); ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size); } return (0); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the ARC eviction * code is laid out; arc_evict_state() assumes ARC buffers are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; /* * We rely on b_dva to generate evenly distributed index * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given * arc_buf_hdr_t will remain constant throughout it's lifetime * (i.e. it's b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. */ return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } #ifdef _KERNEL static eventhandler_tag arc_event_lowmem = NULL; static void arc_lowmem(void *arg __unused, int howto __unused) { mutex_enter(&arc_reclaim_lock); DTRACE_PROBE1(arc__needfree, int64_t, ((int64_t)freemem - zfs_arc_free_target) * PAGESIZE); cv_signal(&arc_reclaim_thread_cv); /* * It is unsafe to block here in arbitrary threads, because we can come * here from ARC itself and may hold ARC locks and thus risk a deadlock * with ARC reclaim thread. */ if (curproc == pageproc) (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); mutex_exit(&arc_reclaim_lock); } #endif static void arc_state_init(void) { arc_anon = &ARC_anon; arc_mru = &ARC_mru; arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; arc_l2c_only = &ARC_l2c_only; arc_mru->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mru->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mfu->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mfu->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); arc_l2c_only->arcs_list[ARC_BUFC_DATA] = multilist_create(sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), arc_state_multilist_index_func); refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_anon->arcs_size); refcount_create(&arc_mru->arcs_size); refcount_create(&arc_mru_ghost->arcs_size); refcount_create(&arc_mfu->arcs_size); refcount_create(&arc_mfu_ghost->arcs_size); refcount_create(&arc_l2c_only->arcs_size); aggsum_init(&arc_meta_used, 0); aggsum_init(&arc_size, 0); aggsum_init(&astat_data_size, 0); aggsum_init(&astat_metadata_size, 0); aggsum_init(&astat_hdr_size, 0); aggsum_init(&astat_other_size, 0); aggsum_init(&astat_l2_hdr_size, 0); } static void arc_state_fini(void) { refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_anon->arcs_size); refcount_destroy(&arc_mru->arcs_size); refcount_destroy(&arc_mru_ghost->arcs_size); refcount_destroy(&arc_mfu->arcs_size); refcount_destroy(&arc_mfu_ghost->arcs_size); refcount_destroy(&arc_l2c_only->arcs_size); multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); } uint64_t arc_max_bytes(void) { return (arc_c_max); } void arc_init(void) { int i, prefetch_tunable_set = 0; /* * allmem is "all memory that we could possibly use". */ #ifdef illumos #ifdef _KERNEL uint64_t allmem = ptob(physmem - swapfs_minfree); #else uint64_t allmem = (physmem * PAGESIZE) / 2; #endif #else uint64_t allmem = kmem_size(); #endif mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ arc_c_min = MAX(allmem / 32, arc_abs_min); /* set max to 5/8 of all memory, or all but 1GB, whichever is more */ if (allmem >= 1 << 30) arc_c_max = allmem - (1 << 30); else arc_c_max = arc_c_min; arc_c_max = MAX(allmem * 5 / 8, arc_c_max); /* * In userland, there's only the memory pressure that we artificially * create (see arc_available_memory()). Don't let arc_c get too * small, because it can cause transactions to be larger than * arc_c, causing arc_tempreserve_space() to fail. */ #ifndef _KERNEL arc_c_min = arc_c_max / 2; #endif #ifdef _KERNEL /* * Allow the tunables to override our calculations if they are * reasonable. */ if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) { arc_c_max = zfs_arc_max; arc_c_min = MIN(arc_c_min, arc_c_max); } if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) arc_c_min = zfs_arc_min; #endif arc_c = arc_c_max; arc_p = (arc_c >> 1); /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; #ifdef _KERNEL /* * Metadata is stored in the kernel's heap. Don't let us * use more than half the heap for the ARC. */ #ifdef __FreeBSD__ arc_meta_limit = MIN(arc_meta_limit, uma_limit() / 2); #else arc_meta_limit = MIN(arc_meta_limit, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2); #endif #endif /* Allow the tunable to override if it is reasonable */ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) arc_meta_limit = zfs_arc_meta_limit; if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; if (zfs_arc_meta_min > 0) { arc_meta_min = zfs_arc_meta_min; } else { arc_meta_min = arc_c_min / 2; } if (zfs_arc_grow_retry > 0) arc_grow_retry = zfs_arc_grow_retry; if (zfs_arc_shrink_shift > 0) arc_shrink_shift = zfs_arc_shrink_shift; if (zfs_arc_no_grow_shift > 0) arc_no_grow_shift = zfs_arc_no_grow_shift; /* * Ensure that arc_no_grow_shift is less than arc_shrink_shift. */ if (arc_no_grow_shift >= arc_shrink_shift) arc_no_grow_shift = arc_shrink_shift - 1; if (zfs_arc_p_min_shift > 0) arc_p_min_shift = zfs_arc_p_min_shift; /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; zfs_arc_min = arc_c_min; zfs_arc_max = arc_c_max; arc_state_init(); buf_init(); arc_reclaim_thread_exit = B_FALSE; arc_dnlc_evicts_thread_exit = FALSE; arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, TS_RUN, minclsyspri); #ifdef _KERNEL arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, EVENTHANDLER_PRI_FIRST); #endif (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, TS_RUN, minclsyspri); arc_dead = B_FALSE; arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by /etc/system, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4GB). */ if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = ptob(physmem) * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } #ifdef _KERNEL if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) prefetch_tunable_set = 1; #ifdef __i386__ if (prefetch_tunable_set == 0) { printf("ZFS NOTICE: Prefetch is disabled by default on i386 " "-- to enable,\n"); printf(" add \"vfs.zfs.prefetch_disable=0\" " "to /boot/loader.conf.\n"); zfs_prefetch_disable = 1; } #else if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && prefetch_tunable_set == 0) { printf("ZFS NOTICE: Prefetch is disabled by default if less " "than 4GB of RAM is present;\n" " to enable, add \"vfs.zfs.prefetch_disable=0\" " "to /boot/loader.conf.\n"); zfs_prefetch_disable = 1; } #endif /* Warn about ZFS memory and address space requirements. */ if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " "expect unstable behavior.\n"); } if (allmem < 512 * (1 << 20)) { printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " "expect unstable behavior.\n"); printf(" Consider tuning vm.kmem_size and " "vm.kmem_size_max\n"); printf(" in /boot/loader.conf.\n"); } #endif } void arc_fini(void) { #ifdef _KERNEL if (arc_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); #endif mutex_enter(&arc_reclaim_lock); arc_reclaim_thread_exit = B_TRUE; /* * The reclaim thread will set arc_reclaim_thread_exit back to * B_FALSE when it is finished exiting; we're waiting for that. */ while (arc_reclaim_thread_exit) { cv_signal(&arc_reclaim_thread_cv); cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); } mutex_exit(&arc_reclaim_lock); /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); mutex_enter(&arc_dnlc_evicts_lock); arc_dnlc_evicts_thread_exit = TRUE; /* * The user evicts thread will set arc_user_evicts_thread_exit * to FALSE when it is finished exiting; we're waiting for that. */ while (arc_dnlc_evicts_thread_exit) { cv_signal(&arc_dnlc_evicts_cv); cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); } mutex_exit(&arc_dnlc_evicts_lock); arc_dead = B_TRUE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } mutex_destroy(&arc_reclaim_lock); cv_destroy(&arc_reclaim_thread_cv); cv_destroy(&arc_reclaim_waiters_cv); mutex_destroy(&arc_dnlc_evicts_lock); cv_destroy(&arc_dnlc_evicts_cv); arc_state_fini(); buf_fini(); ASSERT0(arc_loaned_bytes); } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. * * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (hdr->b_spa != spa_guid) { ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); return (B_FALSE); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_BUMP(arcstat_l2_write_in_l2); return (B_FALSE); } if (HDR_IO_IN_PROGRESS(hdr)) { ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); return (B_FALSE); } if (!HDR_L2CACHE(hdr)) { ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); return (B_FALSE); } return (B_TRUE); } static uint64_t l2arc_write_size(void) { uint64_t size; /* * Make sure our globals have meaningful values in case the user * altered them. */ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " "be greater than zero, resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; } while (vdev_is_dead(next->l2ad_vdev)); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev)) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write() { list_t *buflist; l2arc_data_free_t *df, *df_prev; mutex_enter(&l2arc_free_on_write_mtx); buflist = l2arc_free_on_write; for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. */ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_dev_t *dev; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); /* * All writes completed, or an error was hit. */ top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. We must retry so we * don't leave the ARC_FLAG_L2_WRITING bit set. */ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); /* * We don't want to rescan the headers we've * already marked as having been written out, so * we reinsert the head node so we can pick up * where we left off. */ list_remove(buflist, head); list_insert_after(buflist, hdr, head); mutex_exit(&dev->l2ad_mtx); /* * We wait for the hash lock to become available * to try and prevent busy waiting, and increase * the chance we'll be able to acquire the lock * the next time around. */ mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } /* * We could not have been moved into the arc_l2c_only * state while in-flight due to our ARC_FLAG_L2_WRITING * bit being set. Let's just ensure that's being enforced. */ ASSERT(HDR_HAS_L1HDR(hdr)); if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); l2arc_trim(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); bytes_dropped += arc_hdr_size(hdr); (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. */ static void l2arc_read_done(zio_t *zio) { l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; boolean_t valid_cksum; ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); cb = zio->io_private; ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * If the data was read into a temporary buffer, * move it and free the buffer. */ if (cb->l2rcb_abd != NULL) { ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); if (zio->io_error == 0) { abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } /* * The following must be done regardless of whether * there was an error: * - free the temporary buffer * - point zio to the real ARC buffer * - set zio size accordingly * These are required because zio is either re-used for * an I/O of the block in the case of the error * or the zio is passed to arc_read_done() and it * needs real data. */ abd_free(cb->l2rcb_abd); zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; } ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. */ ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ valid_cksum = arc_cksum_is_equal(hdr, zio); if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); } else { mutex_exit(hash_lock); /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!valid_cksum) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static multilist_sublist_t * l2arc_sublist_lock(int list_num) { multilist_t *ml = NULL; unsigned int idx; ASSERT(list_num >= 0 && list_num <= 3); switch (list_num) { case 0: ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: ml = arc_mru->arcs_list[ARC_BUFC_DATA]; break; } /* * Return a randomly-selected sublist. This is acceptable * because the caller feeds only a little bit of data for each * call (8MB). Subsequent calls will result in different * sublists being selected. */ idx = multilist_get_random_index(ml); return (multilist_sublist_lock(ml, idx)); } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; buflist = &dev->l2ad_buflist; if (!all && dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. */ return; } if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { /* * When nearing the end of the device, evict to the end * before the device write hand jumps to the start. */ taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } /* * A header can't be on this list if it doesn't have L2 header. */ ASSERT(HDR_HAS_L2HDR(hdr)); /* Ensure this header has finished being written. */ ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); if (!all && (hdr->b_l2hdr.b_daddr >= taddr || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. */ arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; uint64_t write_asize, write_psize, write_lsize, headroom; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); int try; ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; write_lsize = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); /* * Copy buffers for L2ARC writing. */ for (try = 0; try <= 3; try++) { multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); /* * L2ARC fast warmup. * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); if (hdr == NULL) ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); else hdr_prev = multilist_sublist_prev(mls, hdr); ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, HDR_GET_LSIZE(hdr)); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); /* * Skip this buffer rather than waiting. */ continue; } passed_sz += HDR_GET_LSIZE(hdr); if (passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); break; } if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3U(arc_hdr_size(hdr), >, 0); uint64_t psize = arc_hdr_size(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); if ((write_asize + asize) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_l2_write_full); break; } if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); ARCSTAT_BUMP(arcstat_l2_write_pios); } hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); (void) refcount_add_many(&dev->l2ad_alloc, psize, hdr); /* * Normally the L2ARC can use the hdr's data, but if * we're sharing data between the hdr and one of its * bufs, L2ARC needs its own copy of the data so that * the ZIO below can't race with the buf consumer. * Another case where we need to create a copy of the * data is when the buffer size is not device-aligned * and we need to pad the block to make it such. * That also keeps the clock hand suitably aligned. * * To ensure that the copy will be available for the * lifetime of the ZIO and be cleaned up afterwards, we * add it to the l2arc_free_on_write queue. */ abd_t *to_write; if (!HDR_SHARED_DATA(hdr) && psize == asize) { to_write = hdr->b_l1hdr.b_pabd; } else { to_write = abd_alloc_for_io(asize, HDR_ISTYPE_METADATA(hdr)); abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); if (asize != psize) { abd_zero_off(to_write, psize, asize - psize); } l2arc_free_abd_on_write(to_write, asize, arc_buf_type(hdr)); } wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); write_lsize += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; mutex_exit(hash_lock); (void) zio_nowait(wzio); } multilist_sublist_unlock(mls); if (full == B_TRUE) break; } /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_lsize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); return (0); } ASSERT3U(write_psize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); ARCSTAT_INCR(arcstat_l2_psize, write_psize); vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { dev->l2ad_hand = dev->l2ad_start; dev->l2ad_first = B_FALSE; } dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; return (write_asize); } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ /* ARGSUSED */ static void l2arc_feed_thread(void *unused __unused) { callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next - ddi_get_lbolt()); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ if (arc_reclaim_needed()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev != NULL); } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; ASSERT(!l2arc_vdev_present(vd)); vdev_ashift_optimize(vd); /* * Create a new l2arc device entry. */ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; adddev->l2ad_start = VDEV_LABEL_START_SIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); refcount_create(&adddev->l2ad_alloc); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { l2arc_dev_t *dev, *nextdev, *remdev = NULL; /* * Find the device by vdev */ mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { nextdev = list_next(l2arc_dev_list, dev); if (vd == dev->l2ad_vdev) { remdev = dev; break; } } ASSERT3P(remdev, !=, NULL); /* * Remove device from global list */ list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); mutex_destroy(&remdev->l2ad_mtx); refcount_destroy(&remdev->l2ad_alloc); kmem_free(remdev, sizeof (l2arc_dev_t)); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; l2arc_writes_sent = 0; l2arc_writes_done = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { /* * This is called from dmu_fini(), which is called from spa_fini(); * Because of this, we can assume that all l2arc devices have * already been removed when the pools themselves were removed. */ l2arc_do_free_on_write(); mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & FWRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, minclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & FWRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 337024) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 337025) @@ -1,3879 +1,3891 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include uint_t zfs_dbuf_evict_key; static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); #ifndef __lint extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp); #endif /* ! __lint */ /* * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_kmem_cache; static taskq_t *dbu_evict_taskq; static kthread_t *dbuf_cache_evict_thread; static kmutex_t dbuf_evict_lock; static kcondvar_t dbuf_evict_cv; static boolean_t dbuf_evict_thread_exit; /* * There are two dbuf caches; each dbuf can only be in one of them at a time. * * 1. Cache of metadata dbufs, to help make read-heavy administrative commands * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs * that represent the metadata that describes filesystems/snapshots/ * bookmarks/properties/etc. We only evict from this cache when we export a * pool, to short-circuit as much I/O as possible for all administrative * commands that need the metadata. There is no eviction policy for this * cache, because we try to only include types in it which would occupy a * very small amount of space per object but create a large impact on the * performance of these commands. Instead, after it reaches a maximum size * (which should only happen on very small memory systems with a very large * number of filesystem objects), we stop taking new dbufs into the * metadata cache, instead putting them in the normal dbuf cache. * * 2. LRU cache of dbufs. The "dbuf cache" maintains a list of dbufs that * are not currently held but have been recently released. These dbufs * are not eligible for arc eviction until they are aged out of the cache. * Dbufs that are aged out of the cache will be immediately destroyed and * become eligible for arc eviction. * * Dbufs are added to these caches once the last hold is released. If a dbuf is * later accessed and still exists in the dbuf cache, then it will be removed * from the cache and later re-added to the head of the cache. * * If a given dbuf meets the requirements for the metadata cache, it will go * there, otherwise it will be considered for the generic LRU dbuf cache. The * caches and the refcounts tracking their sizes are stored in an array indexed * by those caches' matching enum values (from dbuf_cached_state_t). */ typedef struct dbuf_cache { multilist_t *cache; refcount_t size; } dbuf_cache_t; dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; /* Size limits for the caches */ uint64_t dbuf_cache_max_bytes = 0; uint64_t dbuf_metadata_cache_max_bytes = 0; /* Set the default sizes of the caches to log2 fraction of arc size */ int dbuf_cache_shift = 5; int dbuf_metadata_cache_shift = 6; /* * For diagnostic purposes, this is incremented whenever we can't add * something to the metadata cache because it's full, and instead put * the data in the regular dbuf cache. */ uint64_t dbuf_metadata_cache_overflow; /* * The LRU dbuf cache uses a three-stage eviction policy: * - A low water marker designates when the dbuf eviction thread * should stop evicting from the dbuf cache. * - When we reach the maximum size (aka mid water mark), we * signal the eviction thread to run. * - The high water mark indicates when the eviction thread * is unable to keep up with the incoming load and eviction must * happen in the context of the calling thread. * * The dbuf cache: * (max size) * low water mid water hi water * +----------------------------------------+----------+----------+ * | | | | * | | | | * | | | | * | | | | * +----------------------------------------+----------+----------+ * stop signal evict * evicting eviction directly * thread * * The high and low water marks indicate the operating range for the eviction * thread. The low water mark is, by default, 90% of the total size of the * cache and the high water mark is at 110% (both of these percentages can be * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, * respectively). The eviction thread will try to ensure that the cache remains * within this range by waking up every second and checking if the cache is * above the low water mark. The thread can also be woken up by callers adding * elements into the cache if the cache is larger than the mid water (i.e max * cache size). Once the eviction thread is woken up and eviction is required, * it will continue evicting buffers until it's able to reduce the cache size * to the low water mark. If the cache size continues to grow and hits the high * water mark, then callers adding elments to the cache will begin to evict * directly from the cache until the cache is no longer above the high water * mark. */ /* * The percentage above and below the maximum cache size. */ uint_t dbuf_cache_hiwater_pct = 10; uint_t dbuf_cache_lowater_pct = 10; SYSCTL_DECL(_vfs_zfs); SYSCTL_QUAD(_vfs_zfs, OID_AUTO, dbuf_cache_max_bytes, CTLFLAG_RWTUN, &dbuf_cache_max_bytes, 0, "dbuf cache size in bytes"); SYSCTL_INT(_vfs_zfs, OID_AUTO, dbuf_cache_shift, CTLFLAG_RDTUN, &dbuf_cache_shift, 0, "dbuf cache size as log2 fraction of ARC"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_hiwater_pct, CTLFLAG_RWTUN, &dbuf_cache_hiwater_pct, 0, "max percents above the dbuf cache size"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, dbuf_cache_lowater_pct, CTLFLAG_RWTUN, &dbuf_cache_lowater_pct, 0, "max percents below the dbuf cache size"); /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) { dmu_buf_impl_t *db = vdb; bzero(db, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); refcount_create(&db->db_holds); return (0); } /* ARGSUSED */ static void dbuf_dest(void *vdb, void *unused) { dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); ASSERT(!multilist_link_active(&db->db_cache_link)); refcount_destroy(&db->db_holds); } /* * dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; static uint64_t dbuf_hash_count; /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) { return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid)); } #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ (dbuf)->db_level == (level) && \ (dbuf)->db_blkid == (blkid)) dmu_buf_impl_t * dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv = dbuf_hash(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *db; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (NULL); } static dmu_buf_impl_t * dbuf_find_bonus(objset_t *os, uint64_t object) { dnode_t *dn; dmu_buf_impl_t *db = NULL; if (dnode_hold(os, object, FTAG, &dn) == 0) { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus != NULL) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } return (db); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. */ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid = db->db_blkid; uint64_t hv = dbuf_hash(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); } } mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_inc_64(&dbuf_hash_count); return (NULL); } /* * Remove an entry from the hash table. It must be in the EVICTING state. */ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf, **dbp; /* * We musn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; ASSERT(dbf != NULL); } *dbp = db->db_hash_next; db->db_hash_next = NULL; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_dec_64(&dbuf_hash_count); } typedef enum { DBVU_EVICTING, DBVU_NOT_EVICTING } dbvu_verify_type_t; static void dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) { #ifdef ZFS_DEBUG int64_t holds; if (db->db_user == NULL) return; /* Only data blocks support the attachment of user data. */ ASSERT(db->db_level == 0); /* Clients must resolve a dbuf before attaching user data. */ ASSERT(db->db.db_data != NULL); ASSERT3U(db->db_state, ==, DB_CACHED); holds = refcount_count(&db->db_holds); if (verify_type == DBVU_EVICTING) { /* * Immediate eviction occurs when holds == dirtycnt. * For normal eviction buffers, holds is zero on * eviction, except when dbuf_fix_old_data() calls * dbuf_clear_data(). However, the hold count can grow * during eviction even though db_mtx is held (see * dmu_bonus_hold() for an example), so we can only * test the generic invariant that holds >= dirtycnt. */ ASSERT3U(holds, >=, db->db_dirtycnt); } else { if (db->db_user_immediate_evict == TRUE) ASSERT3U(holds, >=, db->db_dirtycnt); else ASSERT3U(holds, >, 0); } #endif } static void dbuf_evict_user(dmu_buf_impl_t *db) { dmu_buf_user_t *dbu = db->db_user; ASSERT(MUTEX_HELD(&db->db_mtx)); if (dbu == NULL) return; dbuf_verify_user(db, DBVU_EVICTING); db->db_user = NULL; #ifdef ZFS_DEBUG if (dbu->dbu_clear_on_evict_dbufp != NULL) *dbu->dbu_clear_on_evict_dbufp = NULL; #endif /* * There are two eviction callbacks - one that we call synchronously * and one that we invoke via a taskq. The async one is useful for * avoiding lock order reversals and limiting stack depth. * * Note that if we have a sync callback but no async callback, * it's likely that the sync callback will free the structure * containing the dbu. In that case we need to take care to not * dereference dbu after calling the sync evict func. */ boolean_t has_async = (dbu->dbu_evict_func_async != NULL); if (dbu->dbu_evict_func_sync != NULL) dbu->dbu_evict_func_sync(dbu); if (has_async) { taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, dbu, 0, &dbu->dbu_tqent); } } boolean_t dbuf_is_metadata(dmu_buf_impl_t *db) { if (db->db_level > 0) { return (B_TRUE); } else { boolean_t is_metadata; DB_DNODE_ENTER(db); is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); } } /* * This returns whether this dbuf should be stored in the metadata cache, which * is based on whether it's from one of the dnode types that store data related * to traversing dataset hierarchies. */ static boolean_t dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) { DB_DNODE_ENTER(db); dmu_object_type_t type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); /* Check if this dbuf is one of the types we care about */ if (DMU_OT_IS_METADATA_CACHED(type)) { /* If we hit this, then we set something up wrong in dmu_ot */ ASSERT(DMU_OT_IS_METADATA(type)); /* * Sanity check for small-memory systems: don't allocate too * much memory for this purpose. */ if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) > dbuf_metadata_cache_max_bytes) { dbuf_metadata_cache_overflow++; DTRACE_PROBE1(dbuf__metadata__cache__overflow, dmu_buf_impl_t *, db); return (B_FALSE); } return (B_TRUE); } return (B_FALSE); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the dbuf eviction * code is laid out; dbuf_evict_thread() assumes dbufs are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ unsigned int dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) { dmu_buf_impl_t *db = obj; /* * The assumption here, is the hash value for a given * dmu_buf_impl_t will remain constant throughout it's lifetime * (i.e. it's objset, object, level and blkid fields don't change). * Thus, we don't need to store the dbuf's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. */ return (dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid) % multilist_get_num_sublists(ml)); } static inline boolean_t dbuf_cache_above_hiwater(void) { uint64_t dbuf_cache_hiwater_bytes = (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); } static inline boolean_t dbuf_cache_above_lowater(void) { uint64_t dbuf_cache_lowater_bytes = (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); } /* * Evict the oldest eligible dbuf from the dbuf cache. */ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); multilist_sublist_t *mls = multilist_sublist_lock( dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); /* * Set the thread's tsd to indicate that it's processing evictions. * Once a thread stops evicting from the dbuf cache it will * reset its tsd to NULL. */ ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); dmu_buf_impl_t *db = multilist_sublist_tail(mls); while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { db = multilist_sublist_prev(mls, db); } DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, multilist_sublist_t *, mls); if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); (void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); } else { multilist_sublist_unlock(mls); } (void) tsd_set(zfs_dbuf_evict_key, NULL); } /* * The dbuf evict thread is responsible for aging out dbufs from the * cache. Once the cache has reached it's maximum size, dbufs are removed * and destroyed. The eviction thread will continue running until the size * of the dbuf cache is at or below the maximum size. Once the dbuf is aged * out of the cache it is destroyed and becomes eligible for arc eviction. */ /* ARGSUSED */ static void dbuf_evict_thread(void *unused __unused) { callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); mutex_enter(&dbuf_evict_lock); while (!dbuf_evict_thread_exit) { while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_hires(&dbuf_evict_cv, &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); /* * Keep evicting as long as we're above the low water mark * for the cache. We do this without holding the locks to * minimize lock contention. */ while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { dbuf_evict_one(); } mutex_enter(&dbuf_evict_lock); } dbuf_evict_thread_exit = B_FALSE; cv_broadcast(&dbuf_evict_cv); CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ thread_exit(); } /* * Wake up the dbuf eviction thread if the dbuf cache is at its max size. * If the dbuf cache is at its high water mark, then evict a dbuf from the * dbuf cache using the callers context. */ static void dbuf_evict_notify(void) { /* * We use thread specific data to track when a thread has * started processing evictions. This allows us to avoid deeply * nested stacks that would have a call flow similar to this: * * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() * ^ | * | | * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ * * The dbuf_eviction_thread will always have its tsd set until * that thread exits. All other threads will only set their tsd * if they are participating in the eviction process. This only * happens if the eviction thread is unable to process evictions * fast enough. To keep the dbuf cache size in check, other threads * can evict from the dbuf cache directly. Those threads will set * their tsd values so that we ensure that they only evict one dbuf * from the dbuf cache. */ if (tsd_get(zfs_dbuf_evict_key) != NULL) return; /* * We check if we should evict without holding the dbuf_evict_lock, * because it's OK to occasionally make the wrong decision here, * and grabbing the lock results in massive lock contention. */ if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_max_bytes) { if (dbuf_cache_above_hiwater()) dbuf_evict_one(); cv_signal(&dbuf_evict_cv); } } void dbuf_init(void) { uint64_t hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; int i; /* * The hash table is big enough to fill all of physical memory * with an average 4K block size. The table will take up * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). */ while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) hsize <<= 1; retry: h->hash_table_mask = hsize - 1; h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); if (h->hash_table == NULL) { /* XXX - we should really return an error instead of assert */ ASSERT(hsize > (1ULL << 10)); hsize >>= 1; goto retry; } dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); /* * Setup the parameters for the dbuf caches. We set the sizes of the * dbuf cache and the metadata cache to 1/32nd and 1/16th (default) * of the size of the ARC, respectively. If the values are set in * /etc/system and they're not greater than the size of the ARC, then * we honor that value. */ if (dbuf_cache_max_bytes == 0 || dbuf_cache_max_bytes >= arc_max_bytes()) { dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift; } if (dbuf_metadata_cache_max_bytes == 0 || dbuf_metadata_cache_max_bytes >= arc_max_bytes()) { dbuf_metadata_cache_max_bytes = arc_max_bytes() >> dbuf_metadata_cache_shift; } /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. */ dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { dbuf_caches[dcs].cache = multilist_create(sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_cache_link), dbuf_cache_multilist_index_func); refcount_create(&dbuf_caches[dcs].size); } tsd_create(&zfs_dbuf_evict_key, NULL); dbuf_evict_thread_exit = B_FALSE; mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, NULL, 0, &p0, TS_RUN, minclsyspri); } void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; int i; for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); mutex_enter(&dbuf_evict_lock); dbuf_evict_thread_exit = B_TRUE; while (dbuf_evict_thread_exit) { cv_signal(&dbuf_evict_cv); cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); tsd_destroy(&zfs_dbuf_evict_key); mutex_destroy(&dbuf_evict_lock); cv_destroy(&dbuf_evict_cv); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { refcount_destroy(&dbuf_caches[dcs].size); multilist_destroy(dbuf_caches[dcs].cache); } } /* * Other stuff. */ #ifdef ZFS_DEBUG static void dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn; dbuf_dirty_record_t *dr; ASSERT(MUTEX_HELD(&db->db_mtx)); if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) return; ASSERT(db->db_objset != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); } else { ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT0(db->db.db_offset); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. */ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } /* verify db->db_blkptr */ if (db->db_blkptr) { if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't * have the struct_rwlock. XXX indblksz no longer * grows. safe to do this now? */ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); } } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that * data when we evict this buffer. * * There is an exception to this rule for indirect blocks; in * this case, if the indirect block is a hole, we fill in a few * fields on each of the child blocks (importantly, birth time) * to prevent hole birth times from being lost when you * partially fill in a hole. */ if (db->db_dirtycnt == 0) { if (db->db_level == 0) { uint64_t *buf = db->db.db_data; int i; for (i = 0; i < db->db.db_size >> 3; i++) { ASSERT(buf[i] == 0); } } else { blkptr_t *bps = db->db.db_data; ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, db->db.db_size); /* * We want to verify that all the blkptrs in the * indirect block are holes, but we may have * automatically set up a few fields for them. * We iterate through each blkptr and verify * they only have those fields set. */ for (int i = 0; i < db->db.db_size / sizeof (blkptr_t); i++) { blkptr_t *bp = &bps[i]; ASSERT(ZIO_CHECKSUM_IS_ZERO( &bp->blk_cksum)); ASSERT( DVA_IS_EMPTY(&bp->blk_dva[0]) && DVA_IS_EMPTY(&bp->blk_dva[1]) && DVA_IS_EMPTY(&bp->blk_dva[2])); ASSERT0(bp->blk_fill); ASSERT0(bp->blk_pad[0]); ASSERT0(bp->blk_pad[1]); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(BP_IS_HOLE(bp)); ASSERT0(bp->blk_phys_birth); } } } } DB_DNODE_EXIT(db); } #endif static void dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) db->db_state = DB_UNCACHED; } static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(buf != NULL); db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; } /* * Loan out an arc_buf for read. Return the loaned arc_buf. */ arc_buf_t * dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, B_FALSE, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); db->db_buf = NULL; dbuf_clear_data(db); mutex_exit(&db->db_mtx); } return (abuf); } /* * Calculate which level n block references the data at the level 0 offset * provided. */ uint64_t dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) { if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { /* * The level n blkid is equal to the level 0 blkid divided by * the number of level 0s in a level n block. * * The level 0 blkid is offset >> datablkshift = * offset / 2^datablkshift. * * The number of level 0s in a level n is the number of block * pointers in an indirect block, raised to the power of level. * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). * * Thus, the level n blkid is: offset / * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) * = offset / 2^(datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) * = offset >> (datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) */ return (offset >> (dn->dn_datablkshift + level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); } } static void dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); /* * All reads are synchronous, so we must have a hold on the dbuf */ ASSERT(refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - if (db->db_level == 0 && db->db_freed_in_flight) { - /* we were freed in flight; disregard any error */ + if (buf == NULL) { + /* i/o error */ + ASSERT(zio == NULL || zio->io_error != 0); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT3P(db->db_buf, ==, NULL); + db->db_state = DB_UNCACHED; + } else if (db->db_level == 0 && db->db_freed_in_flight) { + /* freed in flight */ + ASSERT(zio == NULL || zio->io_error == 0); if (buf == NULL) { buf = arc_alloc_buf(db->db_objset->os_spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size); } arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; - } else if (buf != NULL) { + } else { + /* success */ + ASSERT(zio == NULL || zio->io_error == 0); dbuf_set_data(db, buf); db->db_state = DB_CACHED; - } else { - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT3P(db->db_buf, ==, NULL); - db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL); } static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { dnode_t *dn; zbookmark_phys_t zb; arc_flags_t aflags = ARC_FLAG_NOWAIT; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. */ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); if (db->db_blkid == DMU_BONUS_BLKID) { int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; } /* * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). */ if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type, db->db.db_size)); bzero(db->db.db_data, db->db.db_size); if (db->db_blkptr != NULL && db->db_level > 0 && BP_IS_HOLE(db->db_blkptr) && db->db_blkptr->blk_birth != 0) { blkptr_t *bps = db->db.db_data; for (int i = 0; i < ((1 << DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t)); i++) { blkptr_t *bp = &bps[i]; ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 1 << dn->dn_indblkshift); BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ? dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr)); BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1); BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); } } DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; } DB_DNODE_EXIT(db); db->db_state = DB_READ; mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); dbuf_add_ref(db, NULL); (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); } /* * This is our just-in-time copy function. It makes a copy of buffers that * have been modified in a previous transaction group before we access them in * the current active group. * * This function is used in three places: when we are dirtying a buffer for the * first time in a txg, when we are freeing a range in a dnode that includes * this buffer, and when we are accessing a buffer which was received compressed * and later referenced in a WRITE_BYREF record. * * Note that when we are called from dbuf_free_range() we do not put a hold on * the buffer, we just traverse the active dbuf list for the dnode. */ static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { dbuf_dirty_record_t *dr = db->db_last_dirty; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); ASSERT(db->db_level == 0); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); if (dr == NULL || (dr->dt.dl.dr_data != ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) return; /* * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: * reset the reference to point to a new copy, * or (if there a no active holders) * just null out the current db_data pointer. */ ASSERT(dr->dr_txg >= txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = arc_buf_size(db->db_buf); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; enum zio_compress compress_type = arc_get_compression(db->db_buf); if (compress_type == ZIO_COMPRESS_OFF) { dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); } else { ASSERT3U(type, ==, ARC_BUFC_DATA); dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, size, arc_buf_lsize(db->db_buf), compress_type); } bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { db->db_buf = NULL; dbuf_clear_data(db); } } int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; boolean_t prefetch; dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it * can't be freed while we have a hold on the buffer. */ ASSERT(!refcount_is_zero(&db->db_holds)); if (db->db_state == DB_NOFILL) return (SET_ERROR(EIO)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { /* * If the arc buf is compressed, we need to decompress it to * read the data. This could happen during the "zfs receive" of * a stream which is compressed and deduplicated. */ if (db->db_buf != NULL && arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) { dbuf_fix_old_data(db, spa_syncing_txg(dmu_objset_spa(db->db_objset))); err = arc_decompress(db->db_buf); dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; boolean_t need_wait = B_FALSE; if (zio == NULL && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } dbuf_read_impl(db, zio, flags); /* dbuf_read_impl has dropped db_mtx for us */ if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); if (need_wait) err = zio_wait(zio); } else { /* * Another reader came in while the dbuf was in flight * between UNCACHED and CACHED. Either a writer will finish * writing the buffer (sending the dbuf to CACHED) or the * first reader's request will reach the read_done callback * and send the dbuf to CACHED. Otherwise, a failure * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); /* Skip the wait per the caller's request. */ mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { while (db->db_state == DB_READ || db->db_state == DB_FILL) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, zio_t *, zio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); } mutex_exit(&db->db_mtx); } return (err); } static void dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } mutex_exit(&db->db_mtx); } void dbuf_unoverride(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); /* * This assert is valid because dmu_sync() expects to be called by * a zilog's get_data while holding a range lock. This call only * comes from dbuf_dirty() callers who must also hold a range lock. */ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ASSERT(db->db_level == 0); if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; ASSERT(db->db_data_pending != dr); /* free this block */ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are * modifying the buffer, so they will immediately do * another (redundant) arc_release(). Therefore, leave * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ arc_release(dr->dt.dl.dr_data, db); } /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find * empty blocks. */ void dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { dmu_buf_impl_t db_search; dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; avl_index_t where; if (end_blkid > dn->dn_maxblkid && !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID)) end_blkid = dn->dn_maxblkid; dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); db_search.db_level = 0; db_search.db_blkid = start_blkid; db_search.db_state = DB_SEARCH; mutex_enter(&dn->dn_dbufs_mtx); db = avl_find(&dn->dn_dbufs, &db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = db_next) { db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level != 0 || db->db_blkid > end_blkid) { break; } ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); if (dbuf_undirty(db, tx)) { /* mutex has been dropped and dbuf destroyed */ continue; } if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { ASSERT(db->db.db_data == NULL); mutex_exit(&db->db_mtx); continue; } if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ db->db_freed_in_flight = TRUE; mutex_exit(&db->db_mtx); continue; } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); dbuf_destroy(db); continue; } /* The dbuf is referenced */ if (db->db_last_dirty != NULL) { dbuf_dirty_record_t *dr = db->db_last_dirty; if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file * size to reflect that this buffer may * contain new data when we sync. */ if (db->db_blkid != DMU_SPILL_BLKID && db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); } else { /* * This dbuf is not dirty in the open context. * Either uncache it (if its not referenced in * the open context) or reset its contents to * empty. */ dbuf_fix_old_data(db, txg); } } /* clear the contents if its cached */ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); bzero(db->db.db_data, db->db.db_size); arc_buf_freeze(db->db_buf); } mutex_exit(&db->db_mtx); } mutex_exit(&dn->dn_dbufs_mtx); } void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { arc_buf_t *buf, *obuf; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; ASSERT(db->db_blkid != DMU_BONUS_BLKID); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* XXX does *this* func really need the lock? */ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held * is OK, because there can be no other references to the db * when we are changing its size, so no concurrent DB_FILL can * be happening. */ /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); /* copy old block data to the new block */ obuf = db->db_buf; bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) bzero((uint8_t *)buf->b_data + osize, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); arc_buf_destroy(obuf, db); db->db.db_size = size; if (db->db_level == 0) { ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); db->db_last_dirty->dt.dl.dr_data = buf; } mutex_exit(&db->db_mtx); dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); DB_DNODE_EXIT(db); } void dbuf_release_bp(dmu_buf_impl_t *db) { objset_t *os = db->db_objset; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); (void) arc_release(db->db_buf, db); } /* * We already have a dirty record for this TXG, and we are being * dirtied again. */ static void dbuf_redirty(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && db->db_state != DB_NOFILL) { /* Already released on initial dirty, so just thaw. */ ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } } } dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; objset_t *os; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. */ #ifdef DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) { rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); } ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_objset->os_dsl_dataset == NULL); if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty * in syncing context. */ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* * XXX make this true for indirects too? The problem is that * transactions created with dmu_tx_create_assigned() from * syncing context don't bother holding ahead. */ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); mutex_enter(&dn->dn_mtx); /* * Don't set dirtyctx to SYNC if we're just modifying this as we * initialize the objset. */ if (dn->dn_dirtyctx == DN_UNDIRTIED) { if (dn->dn_objset->os_dsl_dataset != NULL) { rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); } if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); ASSERT(dn->dn_dirtyctx_firstset == NULL); dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); } if (dn->dn_objset->os_dsl_dataset != NULL) { rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); } } mutex_exit(&dn->dn_mtx); if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; /* * If this buffer is already dirty, we're done. */ drp = &db->db_last_dirty; ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) drp = &dr->dr_next; if (dr && dr->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); dbuf_redirty(dr); mutex_exit(&db->db_mtx); return (dr); } /* * Only valid if not already dirty. */ ASSERT(dn->dn_object == 0 || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); /* * We should only be dirtying in syncing context if it's the * mos or we're initializing the os or it's a special object. * However, we are allowed to dirty in syncing context provided * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ os = dn->dn_objset; VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); #ifdef DEBUG if (dn->dn_objset->os_dsl_dataset != NULL) rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); if (db->db_blkid != DMU_BONUS_BLKID) { dmu_objset_willuse_space(os, db->db.db_size, tx); } /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); if (db->db_level == 0) { void *data_old = db->db_buf; if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so * that we can modify it without impacting * possible other users of this cached data * block. Note that indirect blocks and * private objects are not released until the * syncing state (since they are only modified * then). */ arc_release(db->db_buf, db); dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db_buf; } ASSERT(data_old != NULL); } dr->dt.dl.dr_data = data_old; } else { mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&dr->dt.di.dr_children, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) dr->dr_accounted = db->db.db_size; dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; dr->dr_next = *drp; *drp = dr; /* * We could have been freed_in_flight between the dbuf_noread * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_free_ranges[txgoff] != NULL) { range_tree_clear(dn->dn_free_ranges[txgoff], db->db_blkid, 1); } mutex_exit(&dn->dn_mtx); db->db_freed_in_flight = FALSE; } /* * This buffer is now part of this txg */ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); db->db_dirtycnt += 1; ASSERT3U(db->db_dirtycnt, <=, 3); mutex_exit(&db->db_mtx); if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } /* * The dn_struct_rwlock prevents db_blkptr from changing * due to a write from syncing context completing * while we are running, so we want to acquire it before * looking at db_blkptr. */ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } /* * We need to hold the dn_struct_rwlock to make this assertion, * because it protects dn_phys / dn_next_nlevels from changing. */ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || dn->dn_phys->dn_nlevels > db->db_level || dn->dn_next_nlevels[txgoff] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); /* * If we are overwriting a dedup BP, then unless it is snapshotted, * when we get to syncing context we will need to decrement its * refcount in the DDT. Prefetch the relevant DDT block so that * syncing context won't have to wait for the i/o. */ ddt_prefetch(os->os_spa, db->db_blkptr); if (db->db_level == 0) { dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); ASSERT(dn->dn_maxblkid >= db->db_blkid); } if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; int parent_held = FALSE; if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; parent = dbuf_hold_level(dn, db->db_level+1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level+1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); /* * Since we've dropped the mutex, it's possible that * dbuf_undirty() might have changed this out from under us. */ if (db->db_last_dirty == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&di->dt.di.dr_children, dr); mutex_exit(&di->dt.di.dr_mtx); dr->dr_parent = di; } mutex_exit(&db->db_mtx); } else { ASSERT(db->db_level+1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } /* * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. */ static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; uint64_t txg = tx->tx_txg; dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); /* * Due to our use of dn_nlevels below, this can only be called * in open context, unless we are operating on the MOS. * From syncing context, dn_nlevels may be different from the * dn_nlevels used when dbuf was dirtied. */ ASSERT(db->db_objset == dmu_objset_pool(db->db_objset)->dp_meta_objset || txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); /* * If this buffer is not dirty, we're done. */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; if (dr == NULL || dr->dr_txg < txg) return (B_FALSE); ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), dr->dr_accounted, txg); *drp = dr->dr_next; /* * Note that there are three places in dbuf_dirty() * where this dirty record may be put on a list. * Make sure to do a list_remove corresponding to * every one of those list_insert calls. */ if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_blkid == DMU_SPILL_BLKID || db->db_level + 1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } DB_DNODE_EXIT(db); if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(dr->dt.dl.dr_data, db); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } return (B_FALSE); } void dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); /* * Quick check for dirtyness. For already dirty blocks, this * reduces runtime of this function by >90%, and overall performance * by 50% for some workloads (e.g. file deletion with indirect blocks * cached). */ mutex_enter(&db->db_mtx); dbuf_dirty_record_t *dr; for (dr = db->db_last_dirty; dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { /* * It's possible that it is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { /* This dbuf is already dirty and cached. */ dbuf_redirty(dr); mutex_exit(&db->db_mtx); return; } } mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); (void) dbuf_dirty(db, tx); } void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_state = DB_NOFILL; dmu_buf_will_fill(db_fake, tx); } void dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); dbuf_noread(db); (void) dbuf_dirty(db, tx); } #pragma weak dmu_buf_fill_done = dbuf_fill_done /* ARGSUSED */ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) { mutex_enter(&db->db_mtx); DBUF_VERIFY(db); if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; } db->db_state = DB_CACHED; cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); } void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; struct dirty_leaf *dl; dmu_object_type_t type; if (etype == BP_EMBEDDED_TYPE_DATA) { ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), SPA_FEATURE_EMBEDDED_DATA)); } DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); dmu_buf_will_not_fill(dbuf, tx); ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); dl = &db->db_last_dirty->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); BP_SET_TYPE(&dl->dr_overridden_by, type); BP_SET_LEVEL(&dl->dr_overridden_by, 0); BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; } /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. */ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); ASSERT(buf != NULL); ASSERT(arc_buf_lsize(buf) == db->db.db_size); ASSERT(tx->tx_txg != 0); arc_return_buf(buf, db); ASSERT(arc_released(buf)); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); if (db->db_state == DB_CACHED && refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); arc_buf_destroy(buf, db); xuio_stat_wbuf_copied(); return; } xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = db->db_last_dirty; ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { ASSERT(dr->dt.dl.dr_data == db->db_buf); if (!arc_released(db->db_buf)) { ASSERT(dr->dt.dl.dr_override_state == DR_OVERRIDDEN); arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); db->db_state = DB_FILL; mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); dmu_buf_fill_done(&db->db, tx); } void dbuf_destroy(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(db->db.db_data != NULL); zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); db->db_state = DB_UNCACHED; } dbuf_clear_data(db); if (multilist_link_active(&db->db_cache_link)) { ASSERT(db->db_caching_status == DB_DBUF_CACHE || db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(dbuf_caches[db->db_caching_status].cache, db); (void) refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); db->db_caching_status = DB_NO_CACHE; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; /* * Now that db_state is DB_EVICTING, nobody else can find this via * the hash table. We can now drop db_mtx, which allows us to * acquire the dn_dbufs_mtx. */ mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID) { boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); if (needlock) mutex_enter(&dn->dn_dbufs_mtx); avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); if (needlock) mutex_exit(&dn->dn_dbufs_mtx); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), * so the dnode cannot be moved until after we release the hold. * The membar_producer() ensures visibility of the decremented * value in dnode_move(), since DB_DNODE_EXIT doesn't actually * release any lock. */ dnode_rele(dn, db); db->db_dnode_handle = NULL; dbuf_hash_remove(db); } else { DB_DNODE_EXIT(db); } ASSERT(refcount_is_zero(&db->db_holds)); db->db_parent = NULL; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); kmem_cache_free(dbuf_kmem_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); /* * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) dbuf_rele(parent, db); } /* * Note: While bpp will always be updated if the function returns success, * parentp will not be updated if the dnode does not have dn_dbuf filled in; * this happens when the dnode is the meta-dnode, or a userused or groupused * object. */ static int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t **parentp, blkptr_t **bpp) { *parentp = NULL; *bpp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); if (blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_have_spill && (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) *bpp = &dn->dn_phys->dn_spill; else *bpp = NULL; dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; mutex_exit(&dn->dn_mtx); return (0); } int nlevels = (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(level * epbs, <, 64); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); /* * This assertion shouldn't trip as long as the max indirect block size * is less than 1M. The reason for this is that up to that point, * the number of levels required to address an entire object with blocks * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55 * (i.e. we can address the entire object), objects will all use at most * N-1 levels and the assertion won't overflow. However, once epbs is * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be * enough to address an entire object, so objects will have 5 levels, * but then this assertion will overflow. * * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we * need to redo this logic to handle overflows. */ ASSERT(level >= nlevels || ((nlevels - level - 1) * epbs) + highbit64(dn->dn_phys->dn_nblkptr) <= 64); if (level >= nlevels || blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr << ((nlevels - level - 1) * epbs)) || (fail_sparse && blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { /* the buffer has no parent yet */ return (SET_ERROR(ENOENT)); } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err = dbuf_hold_impl(dn, level+1, blkid >> epbs, fail_sparse, FALSE, NULL, parentp); if (err) return (err); err = dbuf_read(*parentp, NULL, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err) { dbuf_rele(*parentp, NULL); *parentp = NULL; return (err); } *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) ASSERT(BP_IS_HOLE(*bpp)); return (0); } else { /* the block is referenced from the dnode */ ASSERT3U(level, ==, nlevels-1); ASSERT(dn->dn_phys->dn_nblkptr == 0 || blkid < dn->dn_phys->dn_nblkptr); if (dn->dn_dbuf) { dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; } *bpp = &dn->dn_phys->dn_blkptr[blkid]; return (0); } } static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr) { objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; db->db_last_dirty = NULL; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; db->db_user = NULL; db->db_user_immediate_evict = FALSE; db->db_freed_in_flight = FALSE; db->db_pending_evict = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; db->db_caching_status = DB_NO_CACHE; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; db->db.db_offset = 0; } else { int blocksize = db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } /* * Hold the dn_dbufs_mtx while we get the new dbuf * in the hash table *and* added to the dbufs list. * This prevents a possible deadlock with someone * trying to look up this dbuf before its added to the * dn_dbufs list. */ mutex_enter(&dn->dn_dbufs_mtx); db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ kmem_cache_free(dbuf_kmem_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } avl_add(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; db->db_caching_status = DB_NO_CACHE; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || refcount_count(&dn->dn_holds) > 0); (void) refcount_add(&dn->dn_holds, db); atomic_inc_32(&dn->dn_dbufs_count); dprintf_dbuf(db, "db=%p\n", db); return (db); } typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ } dbuf_prefetch_arg_t; /* * Actually issue the prefetch read for the block given. */ static void dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return; arc_flags_t aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); ASSERT(dpa->dpa_zio != NULL); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &dpa->dpa_zb); } /* * Called when an indirect block above our prefetch target is read in. This * will either read in the next indirect block down the tree or issue the actual * prefetch if the next block down is our target. */ static void dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) { dbuf_prefetch_arg_t *dpa = private; ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); + + if (abuf == NULL) { + ASSERT(zio == NULL || zio->io_error != 0); + kmem_free(dpa, sizeof (*dpa)); + return; + } + ASSERT(zio == NULL || zio->io_error == 0); /* * The dpa_dnode is only valid if we are called with a NULL * zio. This indicates that the arc_read() returned without * first calling zio_read() to issue a physical read. Once * a physical read is made the dpa_dnode must be invalidated * as the locks guarding it may have been dropped. If the * dpa_dnode is still valid, then we want to add it to the dbuf * cache. To do so, we must hold the dbuf associated with the block * we just prefetched, read its contents so that we associate it * with an arc_buf_t, and then release it. */ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); if (zio->io_flags & ZIO_FLAG_RAW) { ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); } else { ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); } ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); dpa->dpa_dnode = NULL; } else if (dpa->dpa_dnode != NULL) { uint64_t curblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, dpa->dpa_curlevel, curblkid, FTAG); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); dbuf_rele(db, FTAG); } if (abuf == NULL) { kmem_free(dpa, sizeof(*dpa)); return; } dpa->dpa_curlevel--; uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); if (BP_IS_HOLE(bp)) { kmem_free(dpa, sizeof (*dpa)); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) iter_aflags |= ARC_FLAG_L2CACHE; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } arc_buf_destroy(abuf, private); } /* * Issue prefetch reads for the given block on the given level. If the indirect * blocks above that block are not in memory, we will read them in * asynchronously. As a result, this call never blocks waiting for a read to * complete. */ void dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags) { blkptr_t bp; int epbs, nlevels, curlevel; uint64_t curblkid; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (blkid > dn->dn_maxblkid) return; if (dnode_block_freed(dn, blkid)) return; /* * This dnode hasn't been written to disk yet, so there's nothing to * prefetch. */ nlevels = dn->dn_phys->dn_nlevels; if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) return; epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) return; dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); if (db != NULL) { mutex_exit(&db->db_mtx); /* * This dbuf already exists. It is either CACHED, or * (we assume) about to be read or filled. */ return; } /* * Find the closest ancestor (indirect block) of the target block * that is present in the cache. In this indirect block, we will * find the bp that is at curlevel, curblkid. */ curlevel = level; curblkid = blkid; while (curlevel < nlevels - 1) { int parent_level = curlevel + 1; uint64_t parent_blkid = curblkid >> epbs; dmu_buf_impl_t *db; if (dbuf_hold_impl(dn, parent_level, parent_blkid, FALSE, TRUE, FTAG, &db) == 0) { blkptr_t *bpp = db->db_buf->b_data; bp = bpp[P2PHASE(curblkid, 1 << epbs)]; dbuf_rele(db, FTAG); break; } curlevel = parent_level; curblkid = parent_blkid; } if (curlevel == nlevels - 1) { /* No cached indirect blocks found. */ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); bp = dn->dn_phys->dn_blkptr[curblkid]; } if (BP_IS_HOLE(&bp)) return; ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, ZIO_FLAG_CANFAIL); dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, level, blkid); dpa->dpa_curlevel = curlevel; dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) dpa->dpa_aflags |= ARC_FLAG_L2CACHE; /* * If we have the indirect just above us, no need to do the asynchronous * prefetch chain; we'll just run the last step ourselves. If we're at * a higher level, though, we want to issue the prefetches for all the * indirect blocks asynchronously, so we can go on with whatever we were * doing. */ if (curlevel == level) { ASSERT3U(curblkid, ==, blkid); dbuf_issue_final_prefetch(dpa, &bp); kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) iter_aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, curlevel, curblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, &bp, dbuf_prefetch_indirect_done, dpa, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } /* * We use pio here instead of dpa_zio since it's possible that * dpa may have already been freed. */ zio_nowait(pio); } /* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. */ int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp) { dmu_buf_impl_t *db, *parent = NULL; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT3U(dn->dn_nlevels, >, level); *dbp = NULL; top: /* dbuf_find() returns with db_mtx held */ db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); if (db == NULL) { blkptr_t *bp = NULL; int err; if (fail_uncached) return (SET_ERROR(ENOENT)); ASSERT3P(parent, ==, NULL); err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); if (fail_sparse) { if (err == 0 && bp && BP_IS_HOLE(bp)) err = SET_ERROR(ENOENT); if (err) { if (parent) dbuf_rele(parent, NULL); return (err); } } if (err && err != ENOENT) return (err); db = dbuf_create(dn, level, blkid, parent, bp); } if (fail_uncached && db->db_state != DB_CACHED) { mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } if (db->db_buf != NULL) { arc_buf_access(db->db_buf); ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); } ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); /* * If this buffer is currently syncing out, and we are are * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; if (dr->dt.dl.dr_data == db->db_buf) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db, type, db->db.db_size)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); } } if (multilist_link_active(&db->db_cache_link)) { ASSERT(refcount_is_zero(&db->db_holds)); ASSERT(db->db_caching_status == DB_DBUF_CACHE || db->db_caching_status == DB_DBUF_METADATA_CACHE); multilist_remove(dbuf_caches[db->db_caching_status].cache, db); (void) refcount_remove_many( &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); db->db_caching_status = DB_NO_CACHE; } (void) refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); /* NOTE: we can't rele the parent until after we drop the db_mtx */ if (parent) dbuf_rele(parent, NULL); ASSERT3P(DB_DNODE(db), ==, dn); ASSERT3U(db->db_blkid, ==, blkid); ASSERT3U(db->db_level, ==, level); *dbp = db; return (0); } dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) { return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); return (err ? NULL : db); } void dbuf_create_bonus(dnode_t *dn) { ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); } int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; if (db->db_blkid != DMU_SPILL_BLKID) return (SET_ERROR(ENOTSUP)); if (blksz == 0) blksz = SPA_MINBLOCKSIZE; ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); DB_DNODE_ENTER(db); dn = DB_DNODE(db); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dbuf_new_size(db, blksz, tx); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); return (0); } void dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) { dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); } #pragma weak dmu_buf_add_ref = dbuf_add_ref void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { int64_t holds = refcount_add(&db->db_holds, tag); ASSERT3S(holds, >, 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref boolean_t dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, void *tag) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *found_db; boolean_t result = B_FALSE; if (db->db_blkid == DMU_BONUS_BLKID) found_db = dbuf_find_bonus(os, obj); else found_db = dbuf_find(os, obj, 0, blkid); if (found_db != NULL) { if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { (void) refcount_add(&db->db_holds, tag); result = B_TRUE; } mutex_exit(&db->db_mtx); } return (result); } /* * If you call dbuf_rele() you had better not be referencing the dnode handle * unless you have some other direct or indirect hold on the dnode. (An indirect * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the * dnode's parent dbuf evicting its dnode handles. */ void dbuf_rele(dmu_buf_impl_t *db, void *tag) { mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, tag); } void dmu_buf_rele(dmu_buf_t *db, void *tag) { dbuf_rele((dmu_buf_impl_t *)db, tag); } /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. */ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) { int64_t holds; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); /* * Remove the reference to the dbuf before removing its hold on the * dnode so we can guarantee in dnode_move() that a referenced bonus * buffer has a corresponding dnode hold. */ holds = refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); /* * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ if (db->db_buf != NULL && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); } if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_user_immediate_evict) dbuf_evict_user(db); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn; boolean_t evict_dbuf = db->db_pending_evict; /* * If the dnode moves here, we cannot cross this * barrier until the move completes. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); atomic_dec_32(&dn->dn_dbufs_count); /* * Decrementing the dbuf count means that the bonus * buffer's dnode hold is no longer discounted in * dnode_move(). The dnode cannot move until after * the dnode_rele() below. */ DB_DNODE_EXIT(db); /* * Do not reference db after its lock is dropped. * Another thread may evict it. */ mutex_exit(&db->db_mtx); if (evict_dbuf) dnode_evict_bonus(dn); dnode_rele(dn, db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); dbuf_destroy(db); } else if (arc_released(db->db_buf)) { /* * This dbuf has anonymous data associated with it. */ dbuf_destroy(db); } else { boolean_t do_arc_evict = B_FALSE; blkptr_t bp; spa_t *spa = dmu_objset_spa(db->db_objset); if (!DBUF_IS_CACHEABLE(db) && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr) && !BP_IS_EMBEDDED(db->db_blkptr)) { do_arc_evict = B_TRUE; bp = *db->db_blkptr; } if (!DBUF_IS_CACHEABLE(db) || db->db_pending_evict) { dbuf_destroy(db); } else if (!multilist_link_active(&db->db_cache_link)) { ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); dbuf_cached_state_t dcs = dbuf_include_in_metadata_cache(db) ? DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; db->db_caching_status = dcs; multilist_insert(dbuf_caches[dcs].cache, db); (void) refcount_add_many(&dbuf_caches[dcs].size, db->db.db_size, db); mutex_exit(&db->db_mtx); if (db->db_caching_status == DB_DBUF_CACHE) { dbuf_evict_notify(); } } if (do_arc_evict) arc_freed(spa, &bp); } } else { mutex_exit(&db->db_mtx); } } #pragma weak dmu_buf_refcount = dbuf_refcount uint64_t dbuf_refcount(dmu_buf_impl_t *db) { return (refcount_count(&db->db_holds)); } void * dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); dbuf_verify_user(db, DBVU_NOT_EVICTING); if (db->db_user == old_user) db->db_user = new_user; else old_user = db->db_user; dbuf_verify_user(db, DBVU_NOT_EVICTING); mutex_exit(&db->db_mtx); return (old_user); } void * dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, NULL, user)); } void * dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_user_immediate_evict = TRUE; return (dmu_buf_set_user(db_fake, user)); } void * dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, user, NULL)); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_verify_user(db, DBVU_NOT_EVICTING); return (db->db_user); } void dmu_buf_user_evict_wait() { taskq_wait(dbu_evict_taskq); } blkptr_t * dmu_buf_get_blkptr(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_blkptr); } objset_t * dmu_buf_get_objset(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_objset); } dnode_t * dmu_buf_dnode_enter(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; DB_DNODE_ENTER(dbi); return (DB_DNODE(dbi)); } void dmu_buf_dnode_exit(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; DB_DNODE_EXIT(dbi); } static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { /* ASSERT(dmu_tx_is_syncing(tx) */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_blkptr != NULL) return; if (db->db_blkid == DMU_SPILL_BLKID) { db->db_blkptr = &dn->dn_phys->dn_spill; BP_ZERO(db->db_blkptr); return; } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was * no available blkptrs from the dnode, or it was * inappropriate to hook it in (i.e., nlevels mis-match). */ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); ASSERT(db->db_parent == NULL); db->db_parent = dn->dn_dbuf; db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; DBUF_VERIFY(db); } else { dmu_buf_impl_t *parent = db->db_parent; int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT(dn->dn_phys->dn_nlevels > 1); if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, db); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; } db->db_blkptr = (blkptr_t *)parent->db.db_data + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } } static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; zio_t *zio; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); ASSERT(db->db_level > 0); DBUF_VERIFY(db); /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* Indirect block size must match what the dnode thinks it is. */ ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); DB_DNODE_EXIT(db); /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); zio_nowait(zio); } static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); /* * To be synced, we must be dirtied. But we * might have been freed after the dirty. */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } DBUF_VERIFY(db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_dirty_record_t **drp; ASSERT(*datap != NULL); ASSERT0(db->db_level); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); DB_DNODE_EXIT(db); if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr) drp = &(*drp)->dr_next; ASSERT(dr->dr_next == NULL); ASSERT(dr->dr_dbuf == db); *drp = dr->dr_next; if (dr->dr_dbuf->db_level != 0) { list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); return; } os = dn->dn_objset; /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we * don't check the dr_override_state until we have returned from * dbuf_check_blkptr. */ dbuf_check_blkptr(dn, db); /* * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); cv_wait(&db->db_changed, &db->db_mtx); ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } if (db->db_state != DB_NOFILL && dn->dn_object != DMU_META_DNODE_OBJECT && refcount_count(&db->db_holds) > 1 && dr->dt.dl.dr_override_state != DR_OVERRIDDEN && *datap == db->db_buf) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), * then make a copy before we start the write so that * any modifications from the open txg will not leak * into this write. * * NOTE: this copy does not need to be made for * objects only modified in the syncing context (e.g. * DNONE_DNODE blocks). */ int psize = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); enum zio_compress compress_type = arc_get_compression(*datap); if (compress_type == ZIO_COMPRESS_OFF) { *datap = arc_alloc_buf(os->os_spa, db, type, psize); } else { ASSERT3U(type, ==, ARC_BUFC_DATA); int lsize = arc_buf_lsize(*datap); *datap = arc_alloc_compressed_buf(os->os_spa, db, psize, lsize, compress_type); } bcopy(db->db.db_data, (*datap)->b_data, psize); } db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); DB_DNODE_EXIT(db); } else { /* * Although zio_nowait() does not "wait for an IO", it does * initiate the IO. If this is an empty write it seems plausible * that the IO could actually be completed before the nowait * returns. We need to DB_DNODE_EXIT() first in case * zio_nowait() invalidates the dbuf. */ DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); } } void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; while (dr = list_head(list)) { if (dr->dr_zio != NULL) { /* * If we find an already initialized zio then we * are processing the meta-dnode, and we have finished. * The dbufs for all dnodes are put back on the list * during processing, so that we can zio_wait() * these IOs after initiating all child IOs. */ ASSERT3U(dr->dr_dbuf->db.db_object, ==, DMU_META_DNODE_OBJECT); break; } if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { VERIFY3U(dr->dr_dbuf->db_level, ==, level); } list_remove(list, dr); if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); else dbuf_sync_leaf(dr, tx); } } /* ARGSUSED */ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; int i; ASSERT3P(db->db_blkptr, !=, NULL); ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); DB_DNODE_ENTER(db); dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; if (bp->blk_birth != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_bonustype) || BP_IS_EMBEDDED(bp)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); } mutex_enter(&db->db_mtx); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(bp)) && db->db_blkptr == &dn->dn_phys->dn_spill); } #endif if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); if (db->db_blkid > dn->dn_phys->dn_maxblkid && db->db_blkid != DMU_SPILL_BLKID) dn->dn_phys->dn_maxblkid = db->db_blkid; mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { dnode_phys_t *dnp = db->db.db_data; for (i = db->db.db_size >> DNODE_SHIFT; i > 0; i--, dnp++) { if (dnp->dn_type != DMU_OT_NONE) fill++; } } else { if (BP_IS_HOLE(bp)) { fill = 0; } else { fill = 1; } } } else { blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; fill += BP_GET_FILL(ibp); } } DB_DNODE_EXIT(db); if (!BP_IS_EMBEDDED(bp)) bp->blk_fill = fill; mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); *db->db_blkptr = *bp; rw_exit(&dn->dn_struct_rwlock); } /* ARGSUSED */ /* * This function gets called just prior to running through the compression * stage of the zio pipeline. If we're an indirect block comprised of only * holes, then we want this indirect to be compressed away to a hole. In * order to do that we must zero out any information about the holes that * this indirect points to prior to before we try to compress it. */ static void dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp; unsigned int epbs, i; ASSERT3U(db->db_level, >, 0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(epbs, <, 31); /* Determine if all our children are holes */ for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { if (!BP_IS_HOLE(bp)) break; } /* * If all the children are holes, then zero them all out so that * we may get compressed away. */ if (i == 1 << epbs) { /* * We only found holes. Grab the rwlock to prevent * anybody from reading the blocks we're about to * zero out. */ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); bzero(db->db.db_data, db->db.db_size); rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); } /* * The SPA will call this callback several times for each zio - once * for every physical child i/o (zio->io_phys_children times). This * allows the DMU to monitor the progress of each logical i/o. For example, * there may be 2 copies of an indirect block, or many fragments of a RAID-Z * block. There may be a long delay before all copies/fragments are completed, * so this callback allows us to retire dirty space gradually, as the physical * i/os complete. */ /* ARGSUSED */ static void dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) { dmu_buf_impl_t *db = arg; objset_t *os = db->db_objset; dsl_pool_t *dp = dmu_objset_pool(os); dbuf_dirty_record_t *dr; int delta = 0; dr = db->db_data_pending; ASSERT3U(dr->dr_txg, ==, zio->io_txg); /* * The callback will be called io_phys_children times. Retire one * portion of our dirty space each time we are called. Any rounding * error will be cleaned up by dsl_pool_sync()'s call to * dsl_pool_undirty_space(). */ delta = dr->dr_accounted / zio->io_phys_children; dsl_pool_undirty_space(dp, delta, zio->io_txg); } /* ARGSUSED */ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; blkptr_t *bp_orig = &zio->io_bp_orig; blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; dbuf_dirty_record_t **drp, *dr; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); /* * For nopwrites and rewrites we ensure that the bp matches our * original and bypass all the accounting. */ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } mutex_enter(&db->db_mtx); DBUF_VERIFY(db); drp = &db->db_last_dirty; while ((dr = *drp) != db->db_data_pending) drp = &dr->dr_next; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == &dn->dn_phys->dn_spill); DB_DNODE_EXIT(db); } #endif if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(db->db_blkid, <=, dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); } static void dbuf_write_nofill_ready(zio_t *zio) { dbuf_write_ready(zio, NULL, zio->io_private); } static void dbuf_write_nofill_done(zio_t *zio) { dbuf_write_done(zio, NULL, zio->io_private); } static void dbuf_write_override_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; dbuf_write_ready(zio, NULL, db); } static void dbuf_write_override_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *obp = &dr->dt.dl.dr_overridden_by; mutex_enter(&db->db_mtx); if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); if (zio->io_abd != NULL) abd_put(zio->io_abd); } typedef struct dbuf_remap_impl_callback_arg { objset_t *drica_os; uint64_t drica_blk_birth; dmu_tx_t *drica_tx; } dbuf_remap_impl_callback_arg_t; static void dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, void *arg) { dbuf_remap_impl_callback_arg_t *drica = arg; objset_t *os = drica->drica_os; spa_t *spa = dmu_objset_spa(os); dmu_tx_t *tx = drica->drica_tx; ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); if (os == spa_meta_objset(spa)) { spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); } else { dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset, size, drica->drica_blk_birth, tx); } } static void dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) { blkptr_t bp_copy = *bp; spa_t *spa = dmu_objset_spa(dn->dn_objset); dbuf_remap_impl_callback_arg_t drica; ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); drica.drica_os = dn->dn_objset; drica.drica_blk_birth = bp->blk_birth; drica.drica_tx = tx; if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { /* * The struct_rwlock prevents dbuf_read_impl() from * dereferencing the BP while we are changing it. To * avoid lock contention, only grab it when we are actually * changing the BP. */ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); *bp = bp_copy; rw_exit(&dn->dn_struct_rwlock); } } /* * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting * to remap a copy of every bp in the dbuf. */ boolean_t dbuf_can_remap(const dmu_buf_impl_t *db) { spa_t *spa = dmu_objset_spa(db->db_objset); blkptr_t *bp = db->db.db_data; boolean_t ret = B_FALSE; ASSERT3U(db->db_level, >, 0); ASSERT3S(db->db_state, ==, DB_CACHED); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { blkptr_t bp_copy = bp[i]; if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { ret = B_TRUE; break; } } spa_config_exit(spa, SCL_VDEV, FTAG); return (ret); } boolean_t dnode_needs_remap(const dnode_t *dn) { spa_t *spa = dmu_objset_spa(dn->dn_objset); boolean_t ret = B_FALSE; if (dn->dn_phys->dn_nlevels == 0) { return (B_FALSE); } ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) { blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j]; if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) { ret = B_TRUE; break; } } spa_config_exit(spa, SCL_VDEV, FTAG); return (ret); } /* * Remap any existing BP's to concrete vdevs, if possible. */ static void dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(db->db_objset); ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) return; if (db->db_level > 0) { blkptr_t *bp = db->db.db_data; for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { dbuf_remap_impl(dn, &bp[i], tx); } } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { dnode_phys_t *dnp = db->db.db_data; ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, DMU_OT_DNODE); for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i++) { for (int j = 0; j < dnp[i].dn_nblkptr; j++) { dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx); } } } } /* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; int wp_flag = 0; ASSERT(dmu_tx_is_syncing(tx)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { /* * Private object buffers are released here rather * than in dbuf_dirty() since they are only modified * in the syncing context and we don't want the * overhead of making multiple copies of the data. */ if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { dbuf_release_bp(db); } dbuf_remap(dn, db, tx); } } if (parent != dn->dn_dbuf) { /* Our parent is an indirect block. */ /* We have a dirty parent that has been scheduled for write. */ ASSERT(parent && parent->db_data_pending); /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); /* * We're about to modify our parent's db_data by modifying * our block pointer, so the parent must be released. */ ASSERT(arc_released(parent->db_buf)); zio = parent->db_data_pending->dr_zio; } else { /* Our parent is the dnode itself. */ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); zio = dn->dn_zio; } ASSERT(db->db_level == 0 || data == db->db_buf); ASSERT3U(db->db_blkptr->blk_birth, <=, txg); ASSERT(zio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); DB_DNODE_EXIT(db); /* * We copy the blkptr now (rather than when we instantiate the dirty * record), because its value can change between open context and * syncing context. We do not need to hold dn_struct_rwlock to read * db_blkptr because we are in syncing context. */ dr->dr_bp_copy = *db->db_blkptr; if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ abd_t *contents = (data != NULL) ? abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, dbuf_write_override_ready, NULL, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, dbuf_write_nofill_ready, NULL, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); /* * For indirect blocks, we want to setup the children * ready callback so that we can properly handle an indirect * block that only contains holes. */ arc_write_done_func_t *children_ready_cb = NULL; if (db->db_level != 0) children_ready_cb = dbuf_write_children_ready; dr->dr_zio = arc_write(zio, os->os_spa, txg, &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c (revision 337024) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c (revision 337025) @@ -1,199 +1,215 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include typedef struct zcomp_stats { kstat_named_t zcompstat_attempts; kstat_named_t zcompstat_empty; kstat_named_t zcompstat_skipped_insufficient_gain; } zcomp_stats_t; static zcomp_stats_t zcomp_stats = { { "attempts", KSTAT_DATA_UINT64 }, { "empty", KSTAT_DATA_UINT64 }, { "skipped_insufficient_gain", KSTAT_DATA_UINT64 } }; #define ZCOMPSTAT_INCR(stat, val) \ atomic_add_64(&zcomp_stats.stat.value.ui64, (val)); #define ZCOMPSTAT_BUMP(stat) ZCOMPSTAT_INCR(stat, 1); kstat_t *zcomp_ksp; /* + * If nonzero, every 1/X decompression attempts will fail, simulating + * an undetected memory error. + */ +uint64_t zio_decompress_fail_fraction = 0; + +/* * Compression vectors. */ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {"inherit", 0, NULL, NULL}, {"on", 0, NULL, NULL}, {"uncompressed", 0, NULL, NULL}, {"lzjb", 0, lzjb_compress, lzjb_decompress}, {"empty", 0, NULL, NULL}, {"gzip-1", 1, gzip_compress, gzip_decompress}, {"gzip-2", 2, gzip_compress, gzip_decompress}, {"gzip-3", 3, gzip_compress, gzip_decompress}, {"gzip-4", 4, gzip_compress, gzip_decompress}, {"gzip-5", 5, gzip_compress, gzip_decompress}, {"gzip-6", 6, gzip_compress, gzip_decompress}, {"gzip-7", 7, gzip_compress, gzip_decompress}, {"gzip-8", 8, gzip_compress, gzip_decompress}, {"gzip-9", 9, gzip_compress, gzip_decompress}, {"zle", 64, zle_compress, zle_decompress}, {"lz4", 0, lz4_compress, lz4_decompress} }; enum zio_compress zio_compress_select(spa_t *spa, enum zio_compress child, enum zio_compress parent) { enum zio_compress result; ASSERT(child < ZIO_COMPRESS_FUNCTIONS); ASSERT(parent < ZIO_COMPRESS_FUNCTIONS); ASSERT(parent != ZIO_COMPRESS_INHERIT); result = child; if (result == ZIO_COMPRESS_INHERIT) result = parent; if (result == ZIO_COMPRESS_ON) { if (spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS)) result = ZIO_COMPRESS_LZ4_ON_VALUE; else result = ZIO_COMPRESS_LEGACY_ON_VALUE; } return (result); } /*ARGSUSED*/ static int zio_compress_zeroed_cb(void *data, size_t len, void *private) { uint64_t *end = (uint64_t *)((char *)data + len); for (uint64_t *word = (uint64_t *)data; word < end; word++) if (*word != 0) return (1); return (0); } size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len) { size_t c_len, d_len; zio_compress_info_t *ci = &zio_compress_table[c]; ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); ZCOMPSTAT_BUMP(zcompstat_attempts); /* * If the data is all zeroes, we don't even need to allocate * a block for it. We indicate this by returning zero size. */ if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0) { ZCOMPSTAT_BUMP(zcompstat_empty); return (0); } if (c == ZIO_COMPRESS_EMPTY) return (s_len); /* Compress at least 12.5% */ d_len = s_len - (s_len >> 3); /* No compression algorithms can read from ABDs directly */ void *tmp = abd_borrow_buf_copy(src, s_len); c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level); abd_return_buf(src, tmp, s_len); if (c_len > d_len) { ZCOMPSTAT_BUMP(zcompstat_skipped_insufficient_gain); return (s_len); } ASSERT3U(c_len, <=, d_len); return (c_len); } int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, size_t s_len, size_t d_len) { zio_compress_info_t *ci = &zio_compress_table[c]; if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) return (SET_ERROR(EINVAL)); return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); } int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len, size_t d_len) { void *tmp = abd_borrow_buf_copy(src, s_len); int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); abd_return_buf(src, tmp, s_len); + + /* + * Decompression shouldn't fail, because we've already verifyied + * the checksum. However, for extra protection (e.g. against bitflips + * in non-ECC RAM), we handle this error (and test it). + */ + ASSERT0(ret); + if (zio_decompress_fail_fraction != 0 && + spa_get_random(zio_decompress_fail_fraction) == 0) + ret = SET_ERROR(EINVAL); return (ret); } void zio_compress_init(void) { zcomp_ksp = kstat_create("zfs", 0, "zcompstats", "misc", KSTAT_TYPE_NAMED, sizeof (zcomp_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zcomp_ksp != NULL) { zcomp_ksp->ks_data = &zcomp_stats; kstat_install(zcomp_ksp); } } void zio_compress_fini(void) { if (zcomp_ksp != NULL) { kstat_delete(zcomp_ksp); zcomp_ksp = NULL; } } Index: head/sys/cddl/contrib/opensolaris =================================================================== --- head/sys/cddl/contrib/opensolaris (revision 337024) +++ head/sys/cddl/contrib/opensolaris (revision 337025) Property changes on: head/sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor-sys/illumos/dist:r337022