Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2019 Datto Inc.
  */
 /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
 /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
@@ -62,14 +63,15 @@
     &zfs_nopwrite_enabled, 0, "Enable nopwrite feature");
 
 /*
- * Tunable to control percentage of dirtied blocks from frees in one TXG.
- * After this threshold is crossed, additional dirty blocks from frees
- * wait until the next TXG.
+ * Tunable to control percentage of dirtied L1 blocks from frees allowed into
+ * one TXG. After this threshold is crossed, additional dirty blocks from frees
+ * will wait until the next TXG.
  * A value of zero will disable this throttle.
  */
-uint32_t zfs_per_txg_dirty_frees_percent = 30;
+uint32_t zfs_per_txg_dirty_frees_percent = 5;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
-    &zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg");
+    &zfs_per_txg_dirty_frees_percent, 0,
+    "Percentage of dirtied indirect blocks from frees allowed in one txg");
 
 /*
  * This can be used for testing, to ensure that certain actions happen
@@ -683,11 +685,13 @@
  *
  * On input, *start should be the first offset that does not need to be
  * freed (e.g. "offset + length"). On return, *start will be the first
- * offset that should be freed.
+ * offset that should be freed and l1blks is set to the number of level 1
+ * indirect blocks found within the chunk.
  */
 static int
-get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
 {
+	uint64_t blks;
 	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
 	/* bytes of data covered by a level-1 indirect block */
 	uint64_t iblkrange =
@@ -695,13 +699,23 @@
 
 	ASSERT3U(minimum, <=, *start);
 
-	if (*start - minimum <= iblkrange * maxblks) {
+	/*
+	 * Check if we can free the entire range assuming that all of the
+	 * L1 blocks in this range have data. If we can, we use this
+	 * worst case value as an estimate so we can avoid having to look
+	 * at the object's actual data.
+	 */
+	uint64_t total_l1blks =
+	    (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) /
+	    iblkrange;
+	if (total_l1blks <= maxblks) {
+		*l1blks = total_l1blks;
 		*start = minimum;
 		return (0);
 	}
 	ASSERT(ISP2(iblkrange));
 
-	for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
+	for (blks = 0; *start > minimum && blks < maxblks; blks++) {
 		int err;
 
 		/*
@@ -711,6 +725,7 @@
 		 * to search.
 		 */
 		(*start)--;
+
 		err = dnode_next_offset(dn,
 		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 
@@ -719,6 +734,7 @@
 			*start = minimum;
 			break;
 		} else if (err != 0) {
+			*l1blks = blks;
 			return (err);
 		}
 
@@ -727,6 +743,8 @@
 	}
 	if (*start < minimum)
 		*start = minimum;
+	*l1blks = blks;
+
 	return (0);
 }
 
@@ -762,14 +780,14 @@
 		dirty_frees_threshold =
 		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
 	else
-		dirty_frees_threshold = zfs_dirty_data_max / 4;
+		dirty_frees_threshold = zfs_dirty_data_max / 20;
 
 	if (length == DMU_OBJECT_END || offset + length > object_size)
 		length = object_size - offset;
 
 	while (length != 0) {
 		uint64_t chunk_end, chunk_begin, chunk_len;
-		uint64_t long_free_dirty_all_txgs = 0;
+		uint64_t l1blks;
 		dmu_tx_t *tx;
 
 		if (dmu_objset_zfs_unmounting(dn->dn_objset))
@@ -778,7 +796,7 @@
 		chunk_end = chunk_begin = offset + length;
 
 		/* move chunk_begin backwards to the beginning of this chunk */
-		err = get_next_chunk(dn, &chunk_begin, offset);
+		err = get_next_chunk(dn, &chunk_begin, offset, &l1blks);
 		if (err)
 			return (err);
 		ASSERT3U(chunk_begin, >=, offset);
@@ -786,24 +804,6 @@
 
 		chunk_len = chunk_end - chunk_begin;
 
-		mutex_enter(&dp->dp_lock);
-		for (int t = 0; t < TXG_SIZE; t++) {
-			long_free_dirty_all_txgs +=
-			    dp->dp_long_free_dirty_pertxg[t];
-		}
-		mutex_exit(&dp->dp_lock);
-
-		/*
-		 * To avoid filling up a TXG with just frees wait for
-		 * the next TXG to open before freeing more chunks if
-		 * we have reached the threshold of frees
-		 */
-		if (dirty_frees_threshold != 0 &&
-		    long_free_dirty_all_txgs >= dirty_frees_threshold) {
-			txg_wait_open(dp, 0);
-			continue;
-		}
-
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
 
@@ -818,13 +818,42 @@
 			return (err);
 		}
 
+		uint64_t txg = dmu_tx_get_txg(tx);
+
 		mutex_enter(&dp->dp_lock);
-		dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
-		    chunk_len;
+		uint64_t long_free_dirty =
+		    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK];
 		mutex_exit(&dp->dp_lock);
+
+		/*
+		 * To avoid filling up a TXG with just frees, wait for
+		 * the next TXG to open before freeing more chunks if
+		 * we have reached the threshold of frees.
+		 */
+		if (dirty_frees_threshold != 0 &&
+		    long_free_dirty >= dirty_frees_threshold) {
+			dmu_tx_commit(tx);
+			txg_wait_open(dp, 0);
+			continue;
+		}
+
+		/*
+		 * In order to prevent unnecessary write throttling, for each
+		 * TXG, we track the cumulative size of L1 blocks being dirtied
+		 * in dnode_free_range() below. We compare this number to a
+		 * tunable threshold, past which we prevent new L1 dirty freeing
+		 * blocks from being added into the open TXG. See
+		 * dmu_free_long_range_impl() for details. The threshold
+		 * prevents write throttle activation due to dirty freeing L1
+		 * blocks taking up a large percentage of zfs_dirty_data_max.
+		 */
+		mutex_enter(&dp->dp_lock);
+		dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] +=
+		    l1blks << dn->dn_indblkshift;
+		mutex_exit(&dp->dp_lock);
 		DTRACE_PROBE3(free__long__range,
-		    uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
-		    uint64_t, dmu_tx_get_txg(tx));
+		    uint64_t, long_free_dirty, uint64_t, chunk_len,
+		    uint64_t, txg);
 		dnode_free_range(dn, chunk_begin, chunk_len, tx);
 
 		dmu_tx_commit(tx);
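
For context, here is a minimal userland sketch of the accounting this patch introduces: it computes the same worst-case L1 block count that get_next_chunk() now estimates, and the dirty bytes that dmu_free_long_range_impl() now charges against dp_long_free_dirty_pertxg[] (one indirect block per L1, rather than the full chunk_len). The block geometry constants and the program itself are illustrative assumptions, not part of the patch.

#include <stdint.h>
#include <stdio.h>

#define	DATABLKSZ	(128ULL * 1024)		/* assumed 128K data blocks */
#define	INDBLKSHIFT	17			/* assumed 128K indirect blocks */
#define	BLKPTRSZ	128			/* assumed block pointer size */

int
main(void)
{
	/* Bytes of file data covered by one level-1 indirect block. */
	uint64_t epb = (1ULL << INDBLKSHIFT) / BLKPTRSZ;	/* 1024 bps */
	uint64_t iblkrange = DATABLKSZ * epb;			/* 128M */

	/* Free the whole range [minimum, start), as get_next_chunk() sees it. */
	uint64_t minimum = 0;
	uint64_t start = 1ULL << 40;				/* 1T of data */

	/*
	 * Worst-case number of L1 blocks in the range, assuming every one
	 * of them is allocated (the estimate the patch uses to skip the
	 * backwards dnode_next_offset() walk).
	 */
	uint64_t total_l1blks =
	    ((start + iblkrange - 1) / iblkrange * iblkrange -
	    minimum / iblkrange * iblkrange) / iblkrange;

	/*
	 * Dirty bytes charged to dp_long_free_dirty_pertxg[]: one indirect
	 * block per L1, not the bytes of file data being freed.
	 */
	uint64_t dirty_bytes = total_l1blks << INDBLKSHIFT;

	printf("L1 blocks: %llu, dirty bytes charged: %llu (%.1f MiB)\n",
	    (unsigned long long)total_l1blks,
	    (unsigned long long)dirty_bytes,
	    (double)dirty_bytes / (1024.0 * 1024.0));
	return (0);
}

With these assumed values the sketch reports 8192 L1 blocks and 1 GiB of charged dirty data; per the patch, that charged figure (not the bytes freed) is what gets compared against zfs_per_txg_dirty_frees_percent (now defaulting to 5%) of zfs_dirty_data_max before more free chunks are admitted into the open TXG.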