Index: head/lib/libmemstat/memstat_uma.c
===================================================================
--- head/lib/libmemstat/memstat_uma.c
+++ head/lib/libmemstat/memstat_uma.c
@@ -474,9 +474,9 @@
                         ret = kread(kvm, &uz.uz_domain[i], &uzd,
                             sizeof(uzd), 0);
                         for (ubp =
-                            LIST_FIRST(&uzd.uzd_buckets);
+                            TAILQ_FIRST(&uzd.uzd_buckets);
                             ubp != NULL;
-                            ubp = LIST_NEXT(&ub, ub_link)) {
+                            ubp = TAILQ_NEXT(&ub, ub_link)) {
                                 ret = kread(kvm, ubp, &ub,
                                     sizeof(ub), 0);
                                 mtp->mt_zonefree += ub.ub_cnt;
Index: head/share/man/man9/Makefile
===================================================================
--- head/share/man/man9/Makefile
+++ head/share/man/man9/Makefile
@@ -2281,6 +2281,8 @@
         vrele.9 vunref.9
 MLINKS+=vslock.9 vsunlock.9
 MLINKS+=zone.9 uma.9 \
+        zone.9 uma_prealloc.9 \
+        zone.9 uma_reclaim.9 \
         zone.9 uma_zalloc.9 \
         zone.9 uma_zalloc_arg.9 \
         zone.9 uma_zalloc_domain.9 \
@@ -2296,7 +2298,7 @@
         zone.9 uma_zfree_pcpu_arg.9 \
         zone.9 uma_zone_get_cur.9 \
         zone.9 uma_zone_get_max.9 \
-        zone.9 uma_zone_prealloc.9 \
+        zone.9 uma_zone_reclaim.9 \
         zone.9 uma_zone_reserve.9 \
         zone.9 uma_zone_reserve_kva.9 \
         zone.9 uma_zone_set_allocf.9 \
Index: head/share/man/man9/zone.9
===================================================================
--- head/share/man/man9/zone.9
+++ head/share/man/man9/zone.9
@@ -25,7 +25,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd August 30, 2019
+.Dd September 1, 2019
 .Dt UMA 9
 .Os
 .Sh NAME
@@ -98,6 +98,10 @@
 .Ft void
 .Fn uma_zone_reserve_kva "uma_zone_t zone" "int nitems"
 .Ft void
+.Fn uma_reclaim "int req"
+.Ft void
+.Fn uma_zone_reclaim "uma_zone_t zone" "int req"
+.Ft void
 .Fn uma_zone_set_allocf "uma_zone_t zone" "uma_alloc allocf"
 .Ft void
 .Fn uma_zone_set_freef "uma_zone_t zone" "uma_free freef"
@@ -436,6 +440,32 @@
 does not restrict the use of the pre-allocation to
 .Dv M_USE_RESERVE
 requests.
+.Pp
+The
+.Fn uma_reclaim
+and
+.Fn uma_zone_reclaim
+functions reclaim cached items from UMA zones, releasing unused memory.
+The
+.Fn uma_reclaim
+function reclaims items from all regular zones, while
+.Fn uma_zone_reclaim
+reclaims items only from the specified zone.
+The
+.Fa req
+parameter must be one of three values which specify how aggressively
+items are to be reclaimed:
+.Bl -tag -width indent
+.It Dv UMA_RECLAIM_TRIM
+Reclaim items only in excess of the zone's estimated working set size.
+The working set size is periodically updated and tracks the recent history
+of the zone's usage.
+.It Dv UMA_RECLAIM_DRAIN
+Reclaim all items from the unbounded cache.
+Free items in the per-CPU caches are left alone.
+.It Dv UMA_RECLAIM_DRAIN_CPU
+Reclaim all cached items.
+.El
 .Pp
 The
 .Fn uma_zone_set_allocf
Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
@@ -238,14 +238,14 @@
 kmem_cache_reap_soon(kmem_cache_t *cache)
 {
 #ifndef KMEM_DEBUG
-        zone_drain(cache->kc_zone);
+        uma_zone_reclaim(cache->kc_zone, UMA_RECLAIM_DRAIN);
 #endif
 }
 
 void
 kmem_reap(void)
 {
-        uma_reclaim();
+        uma_reclaim(UMA_RECLAIM_TRIM);
 }
 #else
 void
Index: head/sys/kern/kern_mbuf.c
===================================================================
--- head/sys/kern/kern_mbuf.c
+++ head/sys/kern/kern_mbuf.c
@@ -711,14 +711,14 @@
 #endif
 
         /*
          * If there are processes blocked on zone_clust, waiting for pages
-         * to be freed up, * cause them to be woken up by draining the
-         * packet zone. We are exposed to a race here * (in the check for
+         * to be freed up, cause them to be woken up by draining the
+         * packet zone. We are exposed to a race here (in the check for
          * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
          * is deliberate. We don't want to acquire the zone lock for every
          * mbuf free.
          */
         if (uma_zone_exhausted_nolock(zone_clust))
-                zone_drain(zone_pack);
+                uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
 }
 
@@ -1362,7 +1362,7 @@
          * we might be able to loosen a few clusters up on the drain.
          */
         if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) {
-                zone_drain(zone_pack);
+                uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
                 uma_zalloc_arg(zone_clust, m, how);
         }
         MBUF_PROBE2(m__clget, m, how);
Index: head/sys/kern/subr_vmem.c
===================================================================
--- head/sys/kern/subr_vmem.c
+++ head/sys/kern/subr_vmem.c
@@ -588,7 +588,7 @@
 
         qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
         for (i = 0; i < qcache_idx_max; i++)
-                zone_drain(vm->vm_qcache[i].qc_cache);
+                uma_zone_reclaim(vm->vm_qcache[i].qc_cache, UMA_RECLAIM_DRAIN);
 }
 
 #ifndef UMA_MD_SMALL_ALLOC
Index: head/sys/kern/vfs_subr.c
===================================================================
--- head/sys/kern/vfs_subr.c
+++ head/sys/kern/vfs_subr.c
@@ -1321,7 +1321,7 @@
         }
         mtx_unlock(&mountlist_mtx);
         if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
-                uma_reclaim();
+                uma_reclaim(UMA_RECLAIM_DRAIN);
         if (done == 0) {
                 if (force == 0 || force == 1) {
                         force = 2;
Index: head/sys/vm/uma.h
===================================================================
--- head/sys/vm/uma.h
+++ head/sys/vm/uma.h
@@ -50,8 +50,6 @@
 /* Opaque type used as a handle to the zone */
 typedef struct uma_zone * uma_zone_t;
 
-void zone_drain(uma_zone_t);
-
 /*
  * Item constructor
@@ -438,17 +436,18 @@
 typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
 
 /*
- * Reclaims unused memory for all zones
+ * Reclaims unused memory
  *
  * Arguments:
- *      None
+ *      req    Reclamation request type.
  * Returns:
  *      None
- *
- * This should only be called by the page out daemon.
  */
-
-void uma_reclaim(void);
+#define UMA_RECLAIM_DRAIN       1       /* release bucket cache */
+#define UMA_RECLAIM_DRAIN_CPU   2       /* release bucket and per-CPU caches */
+#define UMA_RECLAIM_TRIM        3       /* trim bucket cache to WSS */
+void uma_reclaim(int req);
+void uma_zone_reclaim(uma_zone_t, int req);
 
 /*
  * Sets the alignment mask to be used for all zones requesting cache
Index: head/sys/vm/uma_core.c
===================================================================
--- head/sys/vm/uma_core.c
+++ head/sys/vm/uma_core.c
@@ -142,7 +142,7 @@
 static char *bootmem;
 static int boot_pages;
 
-static struct sx uma_drain_lock;
+static struct sx uma_reclaim_lock;
 
 /*
  * kmem soft limit, initialized by uma_set_limit().  Ensure that early
@@ -250,7 +250,7 @@
 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
-static void bucket_cache_drain(uma_zone_t zone);
+static void bucket_cache_reclaim(uma_zone_t zone, bool);
 static int keg_ctor(void *, int, void *, int);
 static void keg_dtor(void *, int, void *);
 static int zone_ctor(void *, int, void *, int);
@@ -467,27 +467,36 @@
         struct uma_bucket_zone *ubz;
 
         for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
-                zone_drain(ubz->ubz_zone);
+                uma_zone_reclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN);
 }
 
+/*
+ * Attempt to satisfy an allocation by retrieving a full bucket from one of the
+ * zone's caches.
+ */
 static uma_bucket_t
-zone_try_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, const bool ws)
+zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom)
 {
         uma_bucket_t bucket;
 
         ZONE_LOCK_ASSERT(zone);
 
-        if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
+        if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) != NULL) {
                 MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
-                LIST_REMOVE(bucket, ub_link);
+                TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
                 zdom->uzd_nitems -= bucket->ub_cnt;
-                if (ws && zdom->uzd_imin > zdom->uzd_nitems)
+                if (zdom->uzd_imin > zdom->uzd_nitems)
                         zdom->uzd_imin = zdom->uzd_nitems;
                 zone->uz_bkt_count -= bucket->ub_cnt;
         }
         return (bucket);
 }
 
+/*
+ * Insert a full bucket into the specified cache.  The "ws" parameter indicates
+ * whether the bucket's contents should be counted as part of the zone's working
+ * set.
+ */
 static void
 zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket,
     const bool ws)
@@ -497,7 +506,10 @@
         KASSERT(zone->uz_bkt_count < zone->uz_bkt_max,
             ("%s: zone %p overflow", __func__, zone));
 
-        LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
+        if (ws)
+                TAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
+        else
+                TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
         zdom->uzd_nitems += bucket->ub_cnt;
         if (ws && zdom->uzd_imax < zdom->uzd_nitems)
                 zdom->uzd_imax = zdom->uzd_nitems;
@@ -558,7 +570,7 @@
         MPASS(zdom->uzd_imax >= zdom->uzd_imin);
         wss = zdom->uzd_imax - zdom->uzd_imin;
         zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
-        zdom->uzd_wss = (3 * wss + 2 * zdom->uzd_wss) / 5;
+        zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5;
 }
 
@@ -609,11 +621,12 @@
                         return;
                 }
         }
+        KEG_UNLOCK(keg);
 
+        ZONE_LOCK(zone);
         for (int i = 0; i < vm_ndomains; i++)
                 zone_domain_update_wss(&zone->uz_domain[i]);
-
-        KEG_UNLOCK(keg);
+        ZONE_UNLOCK(zone);
 }
 
 /*
@@ -777,7 +790,7 @@
          * XXX: It would good to be able to assert that the zone is being
          * torn down to prevent improper use of cache_drain().
          *
-         * XXX: We lock the zone before passing into bucket_cache_drain() as
+         * XXX: We lock the zone before passing into bucket_cache_reclaim() as
          * it is used elsewhere. Should the tear-down path be made special
          * there in some form?
          */
@@ -797,7 +810,7 @@
                 cache->uc_crossbucket = NULL;
         }
         ZONE_LOCK(zone);
-        bucket_cache_drain(zone);
+        bucket_cache_reclaim(zone, true);
         ZONE_UNLOCK(zone);
 }
 
@@ -869,7 +882,7 @@
  * Zone lock must not be held on call this function.
  */
 static void
-cache_drain_safe(uma_zone_t zone)
+pcpu_cache_drain_safe(uma_zone_t zone)
 {
         int cpu;
 
@@ -897,22 +910,46 @@
 }
 
 /*
- * Drain the cached buckets from a zone. Expects a locked zone on entry.
+ * Reclaim cached buckets from a zone. All buckets are reclaimed if the caller
+ * requested a drain, otherwise the per-domain caches are trimmed to either
+ * estimated working set size.
 */
 static void
-bucket_cache_drain(uma_zone_t zone)
+bucket_cache_reclaim(uma_zone_t zone, bool drain)
 {
         uma_zone_domain_t zdom;
         uma_bucket_t bucket;
+        long target, tofree;
         int i;
 
-        /*
-         * Drain the bucket queues and free the buckets.
-         */
         for (i = 0; i < vm_ndomains; i++) {
                 zdom = &zone->uz_domain[i];
-                while ((bucket = zone_try_fetch_bucket(zone, zdom, false)) !=
-                    NULL) {
+
+                /*
+                 * If we were asked to drain the zone, we are done only once
+                 * this bucket cache is empty. Otherwise, we reclaim items in
+                 * excess of the zone's estimated working set size. If the
+                 * difference nitems - imin is larger than the WSS estimate,
+                 * then the estimate will grow at the end of this interval and
+                 * we ignore the historical average.
+                 */
+                target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
+                    zdom->uzd_imin);
+                while (zdom->uzd_nitems > target) {
+                        bucket = TAILQ_LAST(&zdom->uzd_buckets, uma_bucketlist);
+                        if (bucket == NULL)
+                                break;
+                        tofree = bucket->ub_cnt;
+                        TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
+                        zdom->uzd_nitems -= tofree;
+
+                        /*
+                         * Shift the bounds of the current WSS interval to avoid
+                         * perturbing the estimate.
+                         */
+                        zdom->uzd_imax -= lmin(zdom->uzd_imax, tofree);
+                        zdom->uzd_imin -= lmin(zdom->uzd_imin, tofree);
+
                         ZONE_UNLOCK(zone);
                         bucket_drain(zone, bucket);
                         bucket_free(zone, bucket, NULL);
@@ -921,8 +958,8 @@
         }
 
         /*
-         * Shrink further bucket sizes. Price of single zone lock collision
-         * is probably lower then price of global cache drain.
+         * Shrink the zone bucket size to ensure that the per-CPU caches
+         * don't grow too large.
          */
         if (zone->uz_count > zone->uz_count_min)
                 zone->uz_count--;
@@ -1020,7 +1057,7 @@
 }
 
 static void
-zone_drain_wait(uma_zone_t zone, int waitok)
+zone_reclaim(uma_zone_t zone, int waitok, bool drain)
 {
 
         /*
@@ -1030,14 +1067,15 @@
          * when it wakes up.
          */
         ZONE_LOCK(zone);
-        while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
+        while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) {
                 if (waitok == M_NOWAIT)
                         goto out;
                 msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
         }
-        zone->uz_flags |= UMA_ZFLAG_DRAINING;
-        bucket_cache_drain(zone);
+        zone->uz_flags |= UMA_ZFLAG_RECLAIMING;
+        bucket_cache_reclaim(zone, drain);
         ZONE_UNLOCK(zone);
+
         /*
          * The DRAINING flag protects us from being freed while
          * we're running. Normally the uma_rwlock would protect us but we
@@ -1045,19 +1083,26 @@
          */
         keg_drain(zone->uz_keg);
         ZONE_LOCK(zone);
-        zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
+        zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING;
         wakeup(zone);
 out:
         ZONE_UNLOCK(zone);
 }
 
-void
+static void
 zone_drain(uma_zone_t zone)
 {
 
-        zone_drain_wait(zone, M_NOWAIT);
+        zone_reclaim(zone, M_NOWAIT, true);
 }
 
+static void
+zone_trim(uma_zone_t zone)
+{
+
+        zone_reclaim(zone, M_NOWAIT, false);
+}
+
 /*
  * Allocate a new slab for a keg. This does not insert the slab onto a list.
  * If the allocation was successful, the keg lock will be held upon return,
@@ -1756,6 +1801,7 @@
         uma_zone_t zone = mem;
         uma_zone_t z;
         uma_keg_t keg;
+        int i;
 
         bzero(zone, size);
         zone->uz_name = arg->name;
@@ -1783,6 +1829,9 @@
                 zone->uz_fails = EARLY_COUNTER;
         }
 
+        for (i = 0; i < vm_ndomains; i++)
+                TAILQ_INIT(&zone->uz_domain[i].uzd_buckets);
+
         /*
          * This is a pure cache zone, no kegs.
          */
@@ -1933,7 +1982,7 @@
          * released and then refilled before we
          * remove it... we dont care for now
          */
-        zone_drain_wait(zone, M_WAITOK);
+        zone_reclaim(zone, M_WAITOK, true);
         /*
          * We only destroy kegs from non secondary/non cache zones.
         */
@@ -2138,7 +2187,7 @@
         printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
 #endif
         booted = BOOT_BUCKETS;
-        sx_init(&uma_drain_lock, "umadrain");
+        sx_init(&uma_reclaim_lock, "umareclaim");
         bucket_enable();
 }
 
@@ -2233,12 +2282,12 @@
         if (booted < BOOT_BUCKETS) {
                 locked = false;
         } else {
-                sx_slock(&uma_drain_lock);
+                sx_slock(&uma_reclaim_lock);
                 locked = true;
         }
         res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
         if (locked)
-                sx_sunlock(&uma_drain_lock);
+                sx_sunlock(&uma_reclaim_lock);
         return (res);
 }
 
@@ -2267,13 +2316,13 @@
         if (booted < BOOT_BUCKETS) {
                 locked = false;
         } else {
-                sx_slock(&uma_drain_lock);
+                sx_slock(&uma_reclaim_lock);
                 locked = true;
         }
         /* XXX Attaches only one keg of potentially many. */
         res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
         if (locked)
-                sx_sunlock(&uma_drain_lock);
+                sx_sunlock(&uma_reclaim_lock);
         return (res);
 }
 
@@ -2306,9 +2355,9 @@
 uma_zdestroy(uma_zone_t zone)
 {
 
-        sx_slock(&uma_drain_lock);
+        sx_slock(&uma_reclaim_lock);
         zone_free_item(zones, zone, NULL, SKIP_NONE);
-        sx_sunlock(&uma_drain_lock);
+        sx_sunlock(&uma_reclaim_lock);
 }
 
 void
@@ -2521,7 +2570,7 @@
                 zdom = &zone->uz_domain[0];
         }
 
-        if ((bucket = zone_try_fetch_bucket(zone, zdom, true)) != NULL) {
+        if ((bucket = zone_fetch_bucket(zone, zdom)) != NULL) {
                 KASSERT(bucket->ub_cnt != 0,
                     ("uma_zalloc_arg: Returning an empty bucket."));
                 cache->uc_allocbucket = bucket;
@@ -3672,17 +3721,28 @@
 }
 
 /* See uma.h */
-static void
-uma_reclaim_locked(bool kmem_danger)
+void
+uma_reclaim(int req)
 {
 
         CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
-        sx_assert(&uma_drain_lock, SA_XLOCKED);
+        sx_xlock(&uma_reclaim_lock);
         bucket_enable();
-        zone_foreach(zone_drain);
-        if (vm_page_count_min() || kmem_danger) {
-                cache_drain_safe(NULL);
+
+        switch (req) {
+        case UMA_RECLAIM_TRIM:
+                zone_foreach(zone_trim);
+                break;
+        case UMA_RECLAIM_DRAIN:
+        case UMA_RECLAIM_DRAIN_CPU:
                 zone_foreach(zone_drain);
+                if (req == UMA_RECLAIM_DRAIN_CPU) {
+                        pcpu_cache_drain_safe(NULL);
+                        zone_foreach(zone_drain);
+                }
+                break;
+        default:
+                panic("unhandled reclamation request %d", req);
         }
 
         /*
@@ -3692,17 +3752,9 @@
          */
         zone_drain(slabzone);
         bucket_zone_drain();
+        sx_xunlock(&uma_reclaim_lock);
 }
 
-void
-uma_reclaim(void)
-{
-
-        sx_xlock(&uma_drain_lock);
-        uma_reclaim_locked(false);
-        sx_xunlock(&uma_drain_lock);
-}
-
 static volatile int uma_reclaim_needed;
 
 void
@@ -3718,18 +3770,37 @@
 {
 
         for (;;) {
-                sx_xlock(&uma_drain_lock);
+                sx_xlock(&uma_reclaim_lock);
                 while (atomic_load_int(&uma_reclaim_needed) == 0)
-                        sx_sleep(uma_reclaim, &uma_drain_lock, PVM, "umarcl",
+                        sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
                             hz);
-                sx_xunlock(&uma_drain_lock);
+                sx_xunlock(&uma_reclaim_lock);
                 EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
-                sx_xlock(&uma_drain_lock);
-                uma_reclaim_locked(true);
+                uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
                 atomic_store_int(&uma_reclaim_needed, 0);
-                sx_xunlock(&uma_drain_lock);
 
                 /* Don't fire more than once per-second. */
                 pause("umarclslp", hz);
+        }
+}
+
+/* See uma.h */
+void
+uma_zone_reclaim(uma_zone_t zone, int req)
+{
+
+        switch (req) {
+        case UMA_RECLAIM_TRIM:
+                zone_trim(zone);
+                break;
+        case UMA_RECLAIM_DRAIN:
+                zone_drain(zone);
+                break;
+        case UMA_RECLAIM_DRAIN_CPU:
+                pcpu_cache_drain_safe(zone);
+                zone_drain(zone);
+                break;
+        default:
+                panic("unhandled reclamation request %d", req);
         }
 }
Index: head/sys/vm/uma_int.h
===================================================================
--- head/sys/vm/uma_int.h
+++ head/sys/vm/uma_int.h
@@ -197,7 +197,7 @@
  */
 
 struct uma_bucket {
-        LIST_ENTRY(uma_bucket)  ub_link;        /* Link into the zone */
+        TAILQ_ENTRY(uma_bucket) ub_link;        /* Link into the zone */
         int16_t ub_cnt;                         /* Count of items in bucket. */
         int16_t ub_entries;                     /* Max items. */
         void    *ub_bucket[];                   /* actual allocation storage */
@@ -306,8 +306,10 @@
 
 typedef struct uma_slab * uma_slab_t;
 
+TAILQ_HEAD(uma_bucketlist, uma_bucket);
+
 struct uma_zone_domain {
-        LIST_HEAD(,uma_bucket)  uzd_buckets;    /* full buckets */
+        struct uma_bucketlist uzd_buckets;      /* full buckets */
         long    uzd_nitems;     /* total item count */
         long    uzd_imax;       /* maximum item count this period */
         long    uzd_imin;       /* minimum item count this period */
@@ -384,7 +386,7 @@
  * These flags must not overlap with the UMA_ZONE flags specified in uma.h.
 */
 #define UMA_ZFLAG_CACHE         0x04000000      /* uma_zcache_create()d it */
-#define UMA_ZFLAG_DRAINING      0x08000000      /* Running zone_drain. */
+#define UMA_ZFLAG_RECLAIMING    0x08000000      /* Running zone_reclaim(). */
 #define UMA_ZFLAG_BUCKET        0x10000000      /* Bucket zone. */
 #define UMA_ZFLAG_INTERNAL      0x20000000      /* No offpage no PCPU. */
 #define UMA_ZFLAG_CACHEONLY     0x80000000      /* Don't ask VM for buckets. */
Index: head/sys/vm/vm_pageout.c
===================================================================
--- head/sys/vm/vm_pageout.c
+++ head/sys/vm/vm_pageout.c
@@ -1871,9 +1871,12 @@
 
                 /*
                  * We do this explicitly after the caches have been
-                 * drained above.
+                 * drained above. If we have a severe page shortage on
+                 * our hands, completely drain all UMA zones. Otherwise,
+                 * just prune the caches.
                  */
-                uma_reclaim();
+                uma_reclaim(vm_page_count_min() ? UMA_RECLAIM_DRAIN_CPU :
+                    UMA_RECLAIM_TRIM);
                 return (true);
         }
         return (false);