D21895.diff

Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2019 Datto Inc.
*/
/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
@@ -62,14 +63,15 @@
&zfs_nopwrite_enabled, 0, "Enable nopwrite feature");
/*
- * Tunable to control percentage of dirtied blocks from frees in one TXG.
- * After this threshold is crossed, additional dirty blocks from frees
- * wait until the next TXG.
+ * Tunable to control percentage of dirtied L1 blocks from frees allowed into
+ * one TXG. After this threshold is crossed, additional dirty blocks from frees
+ * will wait until the next TXG.
* A value of zero will disable this throttle.
*/
-uint32_t zfs_per_txg_dirty_frees_percent = 30;
+uint32_t zfs_per_txg_dirty_frees_percent = 5;
SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
- &zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg");
+ &zfs_per_txg_dirty_frees_percent, 0,
+ "Percentage of dirtied indirect blocks from frees allowed in one txg");
/*
* This can be used for testing, to ensure that certain actions happen
@@ -683,11 +685,13 @@
*
* On input, *start should be the first offset that does not need to be
* freed (e.g. "offset + length"). On return, *start will be the first
- * offset that should be freed.
+ * offset that should be freed and l1blks is set to the number of level 1
+ * indirect blocks found within the chunk.
*/
static int
-get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
{
+ uint64_t blks;
uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
/* bytes of data covered by a level-1 indirect block */
uint64_t iblkrange =
@@ -695,13 +699,23 @@
ASSERT3U(minimum, <=, *start);
- if (*start - minimum <= iblkrange * maxblks) {
+ /*
+ * Check if we can free the entire range assuming that all of the
+ * L1 blocks in this range have data. If we can, we use this
+ * worst case value as an estimate so we can avoid having to look
+ * at the object's actual data.
+ */
+ uint64_t total_l1blks =
+ (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) /
+ iblkrange;
+ if (total_l1blks <= maxblks) {
+ *l1blks = total_l1blks;
*start = minimum;
return (0);
}
ASSERT(ISP2(iblkrange));
- for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
+ for (blks = 0; *start > minimum && blks < maxblks; blks++) {
int err;
/*
@@ -711,6 +725,7 @@
* to search.
*/
(*start)--;
+
err = dnode_next_offset(dn,
DNODE_FIND_BACKWARDS, start, 2, 1, 0);
@@ -719,6 +734,7 @@
*start = minimum;
break;
} else if (err != 0) {
+ *l1blks = blks;
return (err);
}
@@ -727,6 +743,8 @@
}
if (*start < minimum)
*start = minimum;
+ *l1blks = blks;
+
return (0);
}
@@ -762,14 +780,14 @@
dirty_frees_threshold =
zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
else
- dirty_frees_threshold = zfs_dirty_data_max / 4;
+ dirty_frees_threshold = zfs_dirty_data_max / 20;
if (length == DMU_OBJECT_END || offset + length > object_size)
length = object_size - offset;
while (length != 0) {
uint64_t chunk_end, chunk_begin, chunk_len;
- uint64_t long_free_dirty_all_txgs = 0;
+ uint64_t l1blks;
dmu_tx_t *tx;
if (dmu_objset_zfs_unmounting(dn->dn_objset))
@@ -778,7 +796,7 @@
chunk_end = chunk_begin = offset + length;
/* move chunk_begin backwards to the beginning of this chunk */
- err = get_next_chunk(dn, &chunk_begin, offset);
+ err = get_next_chunk(dn, &chunk_begin, offset, &l1blks);
if (err)
return (err);
ASSERT3U(chunk_begin, >=, offset);
@@ -786,24 +804,6 @@
chunk_len = chunk_end - chunk_begin;
- mutex_enter(&dp->dp_lock);
- for (int t = 0; t < TXG_SIZE; t++) {
- long_free_dirty_all_txgs +=
- dp->dp_long_free_dirty_pertxg[t];
- }
- mutex_exit(&dp->dp_lock);
-
- /*
- * To avoid filling up a TXG with just frees wait for
- * the next TXG to open before freeing more chunks if
- * we have reached the threshold of frees
- */
- if (dirty_frees_threshold != 0 &&
- long_free_dirty_all_txgs >= dirty_frees_threshold) {
- txg_wait_open(dp, 0);
- continue;
- }
-
tx = dmu_tx_create(os);
dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
@@ -818,13 +818,42 @@
return (err);
}
+ uint64_t txg = dmu_tx_get_txg(tx);
+
mutex_enter(&dp->dp_lock);
- dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
- chunk_len;
+ uint64_t long_free_dirty =
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK];
mutex_exit(&dp->dp_lock);
+
+ /*
+ * To avoid filling up a TXG with just frees, wait for
+ * the next TXG to open before freeing more chunks if
+ * we have reached the threshold of frees.
+ */
+ if (dirty_frees_threshold != 0 &&
+ long_free_dirty >= dirty_frees_threshold) {
+ dmu_tx_commit(tx);
+ txg_wait_open(dp, 0);
+ continue;
+ }
+
+ /*
+ * In order to prevent unnecessary write throttling, for each
+ * TXG, we track the cumulative size of L1 blocks being dirtied
+ * in dnode_free_range() below. We compare this number to a
+ * tunable threshold, past which we prevent new L1 dirty freeing
+ * blocks from being added into the open TXG. See
+ * dmu_free_long_range_impl() for details. The threshold
+ * prevents write throttle activation due to dirty freeing L1
+ * blocks taking up a large percentage of zfs_dirty_data_max.
+ */
+ mutex_enter(&dp->dp_lock);
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] +=
+ l1blks << dn->dn_indblkshift;
+ mutex_exit(&dp->dp_lock);
DTRACE_PROBE3(free__long__range,
- uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
- uint64_t, dmu_tx_get_txg(tx));
+ uint64_t, long_free_dirty, uint64_t, chunk_len,
+ uint64_t, txg);
dnode_free_range(dn, chunk_begin, chunk_len, tx);
dmu_tx_commit(tx);

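The following standalone sketch is not part of the patch; it simply works through the arithmetic the diff relies on: the worst-case count of level-1 indirect blocks spanned by a free range (the `total_l1blks` estimate in get_next_chunk()) and the per-TXG dirty-frees budget derived from `zfs_per_txg_dirty_frees_percent` and `zfs_dirty_data_max`. All concrete values (128K data and indirect blocks, a 4 GB zfs_dirty_data_max, a 10 GB free range) are assumptions for illustration only.

/*
 * Illustrative sketch only -- not part of D21895.  Mirrors the worst-case
 * L1-block estimate and the dirty-frees threshold arithmetic under assumed
 * pool geometry.
 */
#include <stdio.h>
#include <stdint.h>

#define	ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

int
main(void)
{
	/* Assumed geometry: 128K data blocks, 128K indirect blocks. */
	uint64_t datablksz = 1ULL << 17;
	uint64_t indblkshift = 17;
	/* 128-byte block pointers, so 1024 blkptrs per L1 indirect block. */
	uint64_t blkptrs_per_l1 = 1ULL << (indblkshift - 7);
	/* Bytes of file data covered by a single L1 indirect block. */
	uint64_t iblkrange = datablksz * blkptrs_per_l1;

	/* Worst-case L1 blocks touched when freeing [minimum, start). */
	uint64_t minimum = 0;
	uint64_t start = 10ULL << 30;		/* assumed 10 GB free range */
	uint64_t total_l1blks =
	    (ROUNDUP(start, iblkrange) - (minimum / iblkrange * iblkrange)) /
	    iblkrange;

	/* Per-TXG budget for dirty frees, as in dmu_free_long_range_impl(). */
	uint64_t zfs_dirty_data_max = 4ULL << 30;	/* assumed 4 GB */
	uint32_t zfs_per_txg_dirty_frees_percent = 5;	/* new default */
	uint64_t dirty_frees_threshold =
	    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;

	printf("iblkrange            = %ju bytes\n", (uintmax_t)iblkrange);
	printf("worst-case L1 blocks = %ju\n", (uintmax_t)total_l1blks);
	printf("L1 dirty data        = %ju bytes\n",
	    (uintmax_t)(total_l1blks << indblkshift));
	printf("per-TXG threshold    = %ju bytes\n",
	    (uintmax_t)dirty_frees_threshold);
	return (0);
}

With these assumed numbers, freeing 10 GB dirties at most 80 L1 blocks (10 MB of indirect-block data), well under the roughly 205 MB per-TXG threshold, which is the point of charging `l1blks << dn->dn_indblkshift` instead of the full chunk length.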