Changeset View
Changeset View
Standalone View
Standalone View
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
Show First 20 Lines • Show All 6,176 Lines • ▼ Show 20 Lines | |||||
* Returns the number of bytes actually written (which may be smaller than | * Returns the number of bytes actually written (which may be smaller than | ||||
* the delta by which the device hand has changed due to alignment). | * the delta by which the device hand has changed due to alignment). | ||||
*/ | */ | ||||
static uint64_t | static uint64_t | ||||
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, | l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, | ||||
boolean_t *headroom_boost) | boolean_t *headroom_boost) | ||||
{ | { | ||||
arc_buf_hdr_t *hdr, *hdr_prev, *head; | arc_buf_hdr_t *hdr, *hdr_prev, *head; | ||||
uint64_t write_asize, write_psize, write_sz, headroom, | uint64_t write_asize, write_sz, headroom, buf_compress_minsz; | ||||
buf_compress_minsz; | |||||
void *buf_data; | void *buf_data; | ||||
boolean_t full; | boolean_t full; | ||||
l2arc_write_callback_t *cb; | l2arc_write_callback_t *cb; | ||||
zio_t *pio, *wzio; | zio_t *pio, *wzio; | ||||
uint64_t guid = spa_load_guid(spa); | uint64_t guid = spa_load_guid(spa); | ||||
const boolean_t do_headroom_boost = *headroom_boost; | const boolean_t do_headroom_boost = *headroom_boost; | ||||
int try; | int try; | ||||
ASSERT(dev->l2ad_vdev != NULL); | ASSERT(dev->l2ad_vdev != NULL); | ||||
/* Lower the flag now, we might want to raise it again later. */ | /* Lower the flag now, we might want to raise it again later. */ | ||||
*headroom_boost = B_FALSE; | *headroom_boost = B_FALSE; | ||||
pio = NULL; | pio = NULL; | ||||
write_sz = write_asize = write_psize = 0; | write_sz = write_asize = 0; | ||||
full = B_FALSE; | full = B_FALSE; | ||||
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); | head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); | ||||
head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; | head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; | ||||
head->b_flags |= ARC_FLAG_HAS_L2HDR; | head->b_flags |= ARC_FLAG_HAS_L2HDR; | ||||
ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); | ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); | ||||
/* | /* | ||||
* We will want to try to compress buffers that are at least 2x the | * We will want to try to compress buffers that are at least 2x the | ||||
Show All 25 Lines | for (try = 0; try <= 3; try++) { | ||||
headroom = target_sz * l2arc_headroom; | headroom = target_sz * l2arc_headroom; | ||||
if (do_headroom_boost) | if (do_headroom_boost) | ||||
headroom = (headroom * l2arc_headroom_boost) / 100; | headroom = (headroom * l2arc_headroom_boost) / 100; | ||||
for (; hdr; hdr = hdr_prev) { | for (; hdr; hdr = hdr_prev) { | ||||
kmutex_t *hash_lock; | kmutex_t *hash_lock; | ||||
uint64_t buf_sz; | uint64_t buf_sz; | ||||
uint64_t buf_a_sz; | |||||
if (arc_warm == B_FALSE) | if (arc_warm == B_FALSE) | ||||
hdr_prev = multilist_sublist_next(mls, hdr); | hdr_prev = multilist_sublist_next(mls, hdr); | ||||
else | else | ||||
hdr_prev = multilist_sublist_prev(mls, hdr); | hdr_prev = multilist_sublist_prev(mls, hdr); | ||||
ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size); | ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size); | ||||
hash_lock = HDR_LOCK(hdr); | hash_lock = HDR_LOCK(hdr); | ||||
Show All 15 Lines | for (; hdr; hdr = hdr_prev) { | ||||
break; | break; | ||||
} | } | ||||
if (!l2arc_write_eligible(guid, hdr)) { | if (!l2arc_write_eligible(guid, hdr)) { | ||||
mutex_exit(hash_lock); | mutex_exit(hash_lock); | ||||
continue; | continue; | ||||
} | } | ||||
if ((write_sz + hdr->b_size) > target_sz) { | /* | ||||
* Assume that the buffer is not going to be compressed | |||||
* and could take more space on disk because of a larger | |||||
* disk block size. | |||||
*/ | |||||
buf_sz = hdr->b_size; | |||||
buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); | |||||
if ((write_asize + buf_a_sz) > target_sz) { | |||||
full = B_TRUE; | full = B_TRUE; | ||||
mutex_exit(hash_lock); | mutex_exit(hash_lock); | ||||
ARCSTAT_BUMP(arcstat_l2_write_full); | ARCSTAT_BUMP(arcstat_l2_write_full); | ||||
break; | break; | ||||
} | } | ||||
if (pio == NULL) { | if (pio == NULL) { | ||||
/* | /* | ||||
▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines | for (; hdr; hdr = hdr_prev) { | ||||
* of this function. Thus, we can't simply | * of this function. Thus, we can't simply | ||||
* change the b_flags field to denote that the | * change the b_flags field to denote that the | ||||
* IO has been sent. We can change the b_daddr | * IO has been sent. We can change the b_daddr | ||||
* field of the L2 portion, though, since we'll | * field of the L2 portion, though, since we'll | ||||
* be holding the l2ad_mtx; which is why we're | * be holding the l2ad_mtx; which is why we're | ||||
* using it to denote the header's state change. | * using it to denote the header's state change. | ||||
*/ | */ | ||||
hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; | hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; | ||||
buf_sz = hdr->b_size; | |||||
hdr->b_flags |= ARC_FLAG_HAS_L2HDR; | hdr->b_flags |= ARC_FLAG_HAS_L2HDR; | ||||
mutex_enter(&dev->l2ad_mtx); | mutex_enter(&dev->l2ad_mtx); | ||||
list_insert_head(&dev->l2ad_buflist, hdr); | list_insert_head(&dev->l2ad_buflist, hdr); | ||||
mutex_exit(&dev->l2ad_mtx); | mutex_exit(&dev->l2ad_mtx); | ||||
/* | /* | ||||
* Compute and store the buffer cksum before | * Compute and store the buffer cksum before | ||||
* writing. On debug the cksum is verified first. | * writing. On debug the cksum is verified first. | ||||
*/ | */ | ||||
arc_cksum_verify(hdr->b_l1hdr.b_buf); | arc_cksum_verify(hdr->b_l1hdr.b_buf); | ||||
arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); | arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); | ||||
mutex_exit(hash_lock); | mutex_exit(hash_lock); | ||||
write_sz += buf_sz; | write_sz += buf_sz; | ||||
write_asize += buf_a_sz; | |||||
} | } | ||||
multilist_sublist_unlock(mls); | multilist_sublist_unlock(mls); | ||||
if (full == B_TRUE) | if (full == B_TRUE) | ||||
break; | break; | ||||
} | } | ||||
/* No buffers selected for writing? */ | /* No buffers selected for writing? */ | ||||
if (pio == NULL) { | if (pio == NULL) { | ||||
ASSERT0(write_sz); | ASSERT0(write_sz); | ||||
ASSERT(!HDR_HAS_L1HDR(head)); | ASSERT(!HDR_HAS_L1HDR(head)); | ||||
kmem_cache_free(hdr_l2only_cache, head); | kmem_cache_free(hdr_l2only_cache, head); | ||||
return (0); | return (0); | ||||
} | } | ||||
mutex_enter(&dev->l2ad_mtx); | mutex_enter(&dev->l2ad_mtx); | ||||
/* | /* | ||||
* Note that elsewhere in this file arcstat_l2_asize | |||||
smh: I don't really like the fact that arcstat_l2_asize isn't calculating asize tbh. Would it not be… | |||||
Not Done Inline Actions: Two notes:
I will submit the patch for a review soon. avg: Two notes:
- it's been like that for years, so no rush
- I do have a patch that ensures… | |||||
* and the used space on l2ad_vdev are updated using b_asize, | |||||
* which is not necessarily rounded up to the device block size. | |||||
* To keep accounting consistent we do the same here as well: | |||||
* stats_size accumulates the sum of b_asize of the written buffers, | |||||
* while write_asize accumulates the sum of b_asize rounded up | |||||
* to the device block size. | |||||
* The latter sum is used only to validate the correctness of the code. | |||||
*/ | |||||
uint64_t stats_size = 0; | |||||
write_asize = 0; | |||||
/* | |||||
* Now start writing the buffers. We're starting at the write head | * Now start writing the buffers. We're starting at the write head | ||||
* and work backwards, retracing the course of the buffer selector | * and work backwards, retracing the course of the buffer selector | ||||
* loop above. | * loop above. | ||||
*/ | */ | ||||
for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; | for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; | ||||
hdr = list_prev(&dev->l2ad_buflist, hdr)) { | hdr = list_prev(&dev->l2ad_buflist, hdr)) { | ||||
uint64_t buf_sz; | uint64_t buf_sz; | ||||
Show All 36 Lines | for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; | ||||
* We need to do this regardless if buf_sz is zero or | * We need to do this regardless if buf_sz is zero or | ||||
* not, otherwise, when this l2hdr is evicted we'll | * not, otherwise, when this l2hdr is evicted we'll | ||||
* remove a reference that was never added. | * remove a reference that was never added. | ||||
*/ | */ | ||||
(void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); | (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); | ||||
/* Compression may have squashed the buffer to zero length. */ | /* Compression may have squashed the buffer to zero length. */ | ||||
if (buf_sz != 0) { | if (buf_sz != 0) { | ||||
uint64_t buf_p_sz; | uint64_t buf_a_sz; | ||||
wzio = zio_write_phys(pio, dev->l2ad_vdev, | wzio = zio_write_phys(pio, dev->l2ad_vdev, | ||||
dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, | dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, | ||||
NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, | NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, | ||||
ZIO_FLAG_CANFAIL, B_FALSE); | ZIO_FLAG_CANFAIL, B_FALSE); | ||||
DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, | DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, | ||||
zio_t *, wzio); | zio_t *, wzio); | ||||
(void) zio_nowait(wzio); | (void) zio_nowait(wzio); | ||||
write_asize += buf_sz; | stats_size += buf_sz; | ||||
/* | /* | ||||
* Keep the clock hand suitably device-aligned. | * Keep the clock hand suitably device-aligned. | ||||
*/ | */ | ||||
buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); | buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); | ||||
write_psize += buf_p_sz; | write_asize += buf_a_sz; | ||||
dev->l2ad_hand += buf_p_sz; | dev->l2ad_hand += buf_a_sz; | ||||
} | } | ||||
} | } | ||||
mutex_exit(&dev->l2ad_mtx); | mutex_exit(&dev->l2ad_mtx); | ||||
ASSERT3U(write_asize, <=, target_sz); | ASSERT3U(write_asize, <=, target_sz); | ||||
ARCSTAT_BUMP(arcstat_l2_writes_sent); | ARCSTAT_BUMP(arcstat_l2_writes_sent); | ||||
ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); | ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); | ||||
ARCSTAT_INCR(arcstat_l2_size, write_sz); | ARCSTAT_INCR(arcstat_l2_size, write_sz); | ||||
ARCSTAT_INCR(arcstat_l2_asize, write_asize); | ARCSTAT_INCR(arcstat_l2_asize, stats_size); | ||||
vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); | vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0); | ||||
Not Done Inline Actions: As this doesn't include the uplift to match the block size, the space allocation will be incorrect. I've not checked, but this could cause issues later when the device is full but we still think there is space available? smh: As this doesn't include the uplift to match block size then the space allocation will be… | |||||
Not Done Inline Actions: First, nothing changes from the previous code in this respect: you can easily see that stats_size is exactly the same as what write_asize used to be. avg: First, nothing changes from the previous code in this respect: you can easily see that… | |||||
/* | /* | ||||
* Bump device hand to the device start if it is approaching the end. | * Bump device hand to the device start if it is approaching the end. | ||||
* l2arc_evict() will already have evicted ahead for this case. | * l2arc_evict() will already have evicted ahead for this case. | ||||
*/ | */ | ||||
if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { | if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { | ||||
dev->l2ad_hand = dev->l2ad_start; | dev->l2ad_hand = dev->l2ad_start; | ||||
dev->l2ad_first = B_FALSE; | dev->l2ad_first = B_FALSE; | ||||
▲ Show 20 Lines • Show All 447 Lines • Show Last 20 Lines |
I don't really like the fact that arcstat_l2_asize isn't calculating asize tbh. Would it not be better to fix this properly in this patch too?