diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -295,6 +295,7 @@
 static void bucket_cache_reclaim(uma_zone_t zone, bool, int);
 static int keg_ctor(void *, int, void *, int);
 static void keg_dtor(void *, int, void *);
+static void keg_drain(uma_keg_t keg, int domain);
 static int zone_ctor(void *, int, void *, int);
 static void zone_dtor(void *, int, void *);
 static inline void item_dtor(uma_zone_t zone, void *item, int size,
@@ -700,24 +701,6 @@
 	return (domain);
 }
 
-/*
- * Safely subtract cnt from imax.
- */
-static void
-zone_domain_imax_sub(uma_zone_domain_t zdom, int cnt)
-{
-	long new;
-	long old;
-
-	old = zdom->uzd_imax;
-	do {
-		if (old <= cnt)
-			new = 0;
-		else
-			new = old - cnt;
-	} while (atomic_fcmpset_long(&zdom->uzd_imax, &old, new) == 0);
-}
-
 /*
  * Set the maximum imax value.
  */
@@ -729,8 +712,16 @@
 	old = zdom->uzd_imax;
 	do {
 		if (old >= nitems)
-			break;
+			return;
 	} while (atomic_fcmpset_long(&zdom->uzd_imax, &old, nitems) == 0);
+
+	/*
+	 * We are at a new maximum, so do the last WSS update for the old
+	 * bimin and prepare to measure the next allocation batch.
+	 */
+	if (zdom->uzd_wss < old - zdom->uzd_bimin)
+		zdom->uzd_wss = old - zdom->uzd_bimin;
+	zdom->uzd_bimin = nitems;
 }
 
 /*
@@ -741,6 +732,7 @@
 zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, bool reclaim)
 {
 	uma_bucket_t bucket;
+	long cnt;
 	int i;
 	bool dtor = false;
 
@@ -768,15 +760,30 @@
 	    ("%s: empty bucket in bucket cache", __func__));
 	zdom->uzd_nitems -= bucket->ub_cnt;
 
-	/*
-	 * Shift the bounds of the current WSS interval to avoid
-	 * perturbing the estimate.
-	 */
 	if (reclaim) {
+		/*
+		 * Shift the bounds of the current WSS interval to avoid
+		 * perturbing the estimates.
+		 */
+		cnt = lmin(zdom->uzd_bimin, bucket->ub_cnt);
+		atomic_subtract_long(&zdom->uzd_imax, cnt);
+		zdom->uzd_bimin -= cnt;
 		zdom->uzd_imin -= lmin(zdom->uzd_imin, bucket->ub_cnt);
-		zone_domain_imax_sub(zdom, bucket->ub_cnt);
-	} else if (zdom->uzd_imin > zdom->uzd_nitems)
-		zdom->uzd_imin = zdom->uzd_nitems;
+		zdom->uzd_limin -= lmin(zdom->uzd_limin, bucket->ub_cnt);
+		if (zdom->uzd_limin == 0)
+			zdom->uzd_timin = 0;
+	} else if (zdom->uzd_bimin > zdom->uzd_nitems) {
+		/* nitems >= bimin >= imin >= limin */
+		zdom->uzd_bimin = zdom->uzd_nitems;
+		if (zdom->uzd_imin > zdom->uzd_nitems) {
+			zdom->uzd_imin = zdom->uzd_nitems;
+			if (zdom->uzd_limin > zdom->uzd_nitems) {
+				zdom->uzd_limin = zdom->uzd_nitems;
+				if (zdom->uzd_limin == 0)
+					zdom->uzd_timin = 0;
+			}
+		}
+	}
 	ZDOM_UNLOCK(zdom);
 
 	if (dtor)
@@ -808,8 +815,18 @@
 	 */
 	zdom->uzd_nitems += bucket->ub_cnt;
 	if (__predict_true(zdom->uzd_nitems < zone->uz_bucket_max)) {
-		if (ws)
+		if (ws) {
 			zone_domain_imax_set(zdom, zdom->uzd_nitems);
+		} else {
+			/*
+			 * Shift the bounds of the current WSS interval to
+			 * avoid perturbing the estimates.
+			 */
+			atomic_add_long(&zdom->uzd_imax, bucket->ub_cnt);
+			zdom->uzd_imin += bucket->ub_cnt;
+			zdom->uzd_bimin += bucket->ub_cnt;
+			zdom->uzd_limin += bucket->ub_cnt;
+		}
 		if (STAILQ_EMPTY(&zdom->uzd_buckets))
 			zdom->uzd_seq = bucket->ub_seq;
 
@@ -1041,22 +1058,40 @@
 }
 
 /*
- * Update the working set size estimate for the zone's bucket cache.
- * The constants chosen here are somewhat arbitrary.  With an update period of
- * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the
- * last 100s.
+ * Update the working set size estimates for the zone's bucket cache.
+ * The constants chosen here are somewhat arbitrary.
  */
 static void
 zone_domain_update_wss(uma_zone_domain_t zdom)
 {
-	long wss;
 
-	ZDOM_LOCK(zdom);
-	MPASS(zdom->uzd_imax >= zdom->uzd_imin);
-	wss = zdom->uzd_imax - zdom->uzd_imin;
-	zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
-	zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5;
-	ZDOM_UNLOCK(zdom);
+	ZDOM_LOCK_ASSERT(zdom);
+	MPASS(zdom->uzd_imax >= zdom->uzd_nitems);
+	MPASS(zdom->uzd_nitems >= zdom->uzd_bimin);
+	MPASS(zdom->uzd_bimin >= zdom->uzd_imin);
+	MPASS(zdom->uzd_imin >= zdom->uzd_limin);
+
+	/*
+	 * Estimate WSS as a modified moving average of the biggest allocation
+	 * batches for each period over a few minutes (UMA_TIMEOUT of 20s).
+	 */
+	zdom->uzd_wss = lmax(zdom->uzd_wss * 3 / 4,
+	    zdom->uzd_imax - zdom->uzd_bimin);
+
+	/*
+	 * Estimate the long-time minimum item count as a combination of the
+	 * minimum and a modified moving average of the per-period minimum item
+	 * counts over a few hours (UMA_TIMEOUT of 20s).  uzd_timin measures
+	 * time since uzd_limin dropped to zero, i.e. the cache ran empty.
+	 */
+	if (zdom->uzd_limin > 0)
+		zdom->uzd_timin++;
+	zdom->uzd_limin = (zdom->uzd_imin + zdom->uzd_limin * 255) / 256;
+
+	/* To reduce period edge effects on WSS, keep half of imax - nitems. */
+	atomic_subtract_long(&zdom->uzd_imax,
+	    (zdom->uzd_imax - zdom->uzd_nitems + 1) / 2);
+	zdom->uzd_imin = zdom->uzd_bimin = zdom->uzd_nitems;
 }
 
 /*
@@ -1069,10 +1104,13 @@
 zone_timeout(uma_zone_t zone, void *unused)
 {
 	uma_keg_t keg;
+	uma_zone_domain_t zdom;
+	uma_bucket_t bucket;
+	long target;
 	u_int slabs, pages;
 
 	if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0)
-		goto update_wss;
+		goto trim;
 
 	keg = zone->uz_keg;
 
@@ -1113,14 +1151,38 @@
 
 			KEG_UNLOCK(keg, 0);
 			hash_free(&oldhash);
-			goto update_wss;
+			goto trim;
 		}
 	}
 	KEG_UNLOCK(keg, 0);
 
-update_wss:
-	for (int i = 0; i < vm_ndomains; i++)
-		zone_domain_update_wss(ZDOM_GET(zone, i));
+trim:
+	/*
+	 * Every 30 minutes (90 periods) drop ~30% of caches not used during
+	 * that time, keeping a safety distance of WSS just in case.
+	 */
+	for (int i = 0; i < vm_ndomains; i++) {
+		zdom = ZDOM_GET(zone, i);
+		ZDOM_LOCK(zdom);
+		zone_domain_update_wss(zdom);
+		if (zdom->uzd_timin < 90) {
+			ZDOM_UNLOCK(zdom);
+			continue;
+		}
+		zdom->uzd_timin = 0;
+		target = zdom->uzd_nitems - zdom->uzd_limin + zdom->uzd_wss;
+		while ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) != NULL &&
+		    zdom->uzd_nitems >= target + bucket->ub_cnt) {
+			bucket = zone_fetch_bucket(zone, zdom, true);
+			if (bucket == NULL)
+				break;
+			bucket_free(zone, bucket, NULL);
+			ZDOM_LOCK(zdom);
+		}
+		ZDOM_UNLOCK(zdom);
+		if ((zone->uz_flags & UMA_ZFLAG_CACHE) == 0)
+			keg_drain(zone->uz_keg, i);
+	}
 }
 
 /*
@@ -1429,15 +1491,14 @@
 		/*
 		 * If we were asked to drain the zone, we are done only once
 		 * this bucket cache is empty.  Otherwise, we reclaim items in
-		 * excess of the zone's estimated working set size.  If the
-		 * difference nitems - imin is larger than the WSS estimate,
-		 * then the estimate will grow at the end of this interval and
-		 * we ignore the historical average.
+		 * excess of the zone's estimated working set size.  Multiple
+		 * consecutive calls will shrink the WSS and so reclaim more.
 		 */
 		ZDOM_LOCK(zdom);
-		target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
-		    zdom->uzd_imin);
-		while (zdom->uzd_nitems > target) {
+		zone_domain_update_wss(zdom);
+		target = drain ? 0 : zdom->uzd_wss;
+		while ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) != NULL &&
+		    zdom->uzd_nitems >= target + bucket->ub_cnt) {
 			bucket = zone_fetch_bucket(zone, zdom, true);
 			if (bucket == NULL)
 				break;
@@ -2611,9 +2672,18 @@
 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "imin", CTLFLAG_RD, &zdom->uzd_imin,
 		    "minimum item count in this period");
+		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+		    "bimin", CTLFLAG_RD, &zdom->uzd_bimin,
+		    "Minimum item count in this batch");
 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "wss", CTLFLAG_RD, &zdom->uzd_wss,
 		    "Working set size");
+		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+		    "limin", CTLFLAG_RD, &zdom->uzd_limin,
+		    "Long time minimum item count");
+		SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+		    "timin", CTLFLAG_RD, &zdom->uzd_timin, 0,
+		    "Time since zero long time minimum item count");
 	}
 
 	/*
@@ -3642,7 +3712,7 @@
 	 * We lost the race, release this bucket and start over.
 	 */
 	critical_exit();
-	zone_put_bucket(zone, domain, bucket, udata, false);
+	zone_put_bucket(zone, domain, bucket, udata, !new);
 	critical_enter();
 
 	return (true);
diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h
--- a/sys/vm/uma_int.h
+++ b/sys/vm/uma_int.h
@@ -445,7 +445,10 @@
 	long		uzd_nitems;	/* total item count */
 	long		uzd_imax;	/* maximum item count this period */
 	long		uzd_imin;	/* minimum item count this period */
+	long		uzd_bimin;	/* Minimum item count this batch. */
 	long		uzd_wss;	/* working set size estimate */
+	long		uzd_limin;	/* Longtime minimum item count. */
+	u_int		uzd_timin;	/* Time since uzd_limin == 0. */
 	smr_seq_t	uzd_seq;	/* Lowest queued seq. */
 	struct mtx	uzd_lock;	/* Lock for the domain */
 } __aligned(CACHE_LINE_SIZE);