Index: sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
===================================================================
--- sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
+++ sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
@@ -238,14 +238,14 @@
 kmem_cache_reap_soon(kmem_cache_t *cache)
 {
 #ifndef KMEM_DEBUG
-	zone_drain(cache->kc_zone);
+	uma_zreclaim(cache->kc_zone, UMA_RECLAIM_DRAIN);
 #endif
 }
 
 void
 kmem_reap(void)
 {
-	uma_reclaim();
+	uma_reclaim(UMA_RECLAIM_TRIM);
 }
 #else
 void
Index: sys/kern/kern_mbuf.c
===================================================================
--- sys/kern/kern_mbuf.c
+++ sys/kern/kern_mbuf.c
@@ -679,14 +679,14 @@
 #endif
 	/*
 	 * If there are processes blocked on zone_clust, waiting for pages
-	 * to be freed up, * cause them to be woken up by draining the
-	 * packet zone.  We are exposed to a race here * (in the check for
+	 * to be freed up, cause them to be woken up by draining the
+	 * packet zone.  We are exposed to a race here (in the check for
 	 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
 	 * is deliberate. We don't want to acquire the zone lock for every
 	 * mbuf free.
 	 */
 	if (uma_zone_exhausted_nolock(zone_clust))
-		zone_drain(zone_pack);
+		uma_zreclaim(zone_pack, UMA_RECLAIM_DRAIN);
 }
 
 /*
@@ -930,7 +930,7 @@
 	 * we might be able to loosen a few clusters up on the drain.
 	 */
 	if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) {
-		zone_drain(zone_pack);
+		uma_zreclaim(zone_pack, UMA_RECLAIM_DRAIN);
 		uma_zalloc_arg(zone_clust, m, how);
 	}
 	MBUF_PROBE2(m__clget, m, how);
Index: sys/kern/subr_vmem.c
===================================================================
--- sys/kern/subr_vmem.c
+++ sys/kern/subr_vmem.c
@@ -586,7 +586,7 @@
 
 	qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
 	for (i = 0; i < qcache_idx_max; i++)
-		zone_drain(vm->vm_qcache[i].qc_cache);
+		uma_zreclaim(vm->vm_qcache[i].qc_cache, UMA_RECLAIM_DRAIN);
 }
 
 #ifndef UMA_MD_SMALL_ALLOC
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -1222,7 +1222,7 @@
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
-			uma_reclaim();
+			uma_reclaim(UMA_RECLAIM_DRAIN);
 		if (done == 0) {
 			if (force == 0 || force == 1) {
 				force = 2;
Index: sys/vm/uma.h
===================================================================
--- sys/vm/uma.h
+++ sys/vm/uma.h
@@ -50,8 +50,6 @@
 /* Opaque type used as a handle to the zone */
 typedef struct uma_zone * uma_zone_t;
 
-void zone_drain(uma_zone_t);
-
 /*
  * Item constructor
  *
@@ -450,17 +448,18 @@
 typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
 
 /*
- * Reclaims unused memory for all zones
+ * Reclaims unused memory
  *
  * Arguments:
- *	None
+ *	req    Reclamation request type.
  * Returns:
  *	None
- *
- * This should only be called by the page out daemon.
  */
-
-void uma_reclaim(void);
+#define	UMA_RECLAIM_DRAIN	1	/* release bucket cache */
+#define	UMA_RECLAIM_DRAIN_CPU	2	/* release bucket and per-CPU caches */
+#define	UMA_RECLAIM_TRIM	3	/* trim bucket cache to WSS */
+void uma_reclaim(int req);
+void uma_zreclaim(uma_zone_t, int req);
 
 /*
  * Sets the alignment mask to be used for all zones requesting cache
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -142,7 +142,7 @@
 static char *bootmem;
 static int boot_pages;
 
-static struct sx uma_drain_lock;
+static struct sx uma_reclaim_lock;
 
 /* kmem soft limit. */
 static unsigned long uma_kmem_limit = LONG_MAX;
@@ -239,7 +239,7 @@
 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
-static void bucket_cache_drain(uma_zone_t zone);
+static void bucket_cache_reclaim(uma_zone_t zone, bool);
 static int keg_ctor(void *, int, void *, int);
 static void keg_dtor(void *, int, void *);
 static int zone_ctor(void *, int, void *, int);
@@ -456,26 +456,35 @@
 	struct uma_bucket_zone *ubz;
 
 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
-		zone_drain(ubz->ubz_zone);
+		uma_zreclaim(ubz->ubz_zone, UMA_RECLAIM_DRAIN);
 }
 
+/*
+ * Attempt to satisfy an allocation by retrieving a full bucket from one of the
+ * zone's caches.
+ */
 static uma_bucket_t
-zone_try_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, const bool ws)
+zone_try_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom)
 {
 	uma_bucket_t bucket;
 
 	ZONE_LOCK_ASSERT(zone);
 
-	if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
+	if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) != NULL) {
 		MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
-		LIST_REMOVE(bucket, ub_link);
+		TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
 		zdom->uzd_nitems -= bucket->ub_cnt;
-		if (ws && zdom->uzd_imin > zdom->uzd_nitems)
+		if (zdom->uzd_imin > zdom->uzd_nitems)
 			zdom->uzd_imin = zdom->uzd_nitems;
 	}
 	return (bucket);
 }
 
+/*
+ * Insert a full bucket into the specified cache.  The "ws" parameter indicates
+ * whether the bucket's contents should be counted as part of the zone's working
+ * set.
+ */
 static void
 zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket,
     const bool ws)
@@ -483,7 +492,10 @@
 
 	ZONE_LOCK_ASSERT(zone);
 
-	LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
+	if (ws)
+		TAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
+	else
+		TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
 	zdom->uzd_nitems += bucket->ub_cnt;
 	if (ws && zdom->uzd_imax < zdom->uzd_nitems)
 		zdom->uzd_imax = zdom->uzd_nitems;
@@ -552,7 +564,7 @@
 	MPASS(zdom->uzd_imax >= zdom->uzd_imin);
 	wss = zdom->uzd_imax - zdom->uzd_imin;
 	zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
-	zdom->uzd_wss = (3 * wss + 2 * zdom->uzd_wss) / 5;
+	zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5;
 }
 
 /*
@@ -774,7 +786,7 @@
 	 * XXX: It would good to be able to assert that the zone is being
 	 * torn down to prevent improper use of cache_drain().
 	 *
-	 * XXX: We lock the zone before passing into bucket_cache_drain() as
+	 * XXX: We lock the zone before passing into bucket_cache_reclaim() as
 	 * it is used elsewhere.  Should the tear-down path be made special
 	 * there in some form?
 	 */
@@ -789,7 +801,7 @@
 		cache->uc_allocbucket = cache->uc_freebucket = NULL;
 	}
 	ZONE_LOCK(zone);
-	bucket_cache_drain(zone);
+	bucket_cache_reclaim(zone, true);
 	ZONE_UNLOCK(zone);
 }
 
@@ -855,7 +867,7 @@
  * Zone lock must not be held on call this function.
  */
 static void
-cache_drain_safe(uma_zone_t zone)
+pcpu_cache_drain_safe(uma_zone_t zone)
 {
 	int cpu;
 
@@ -883,32 +895,56 @@
 }
 
 /*
- * Drain the cached buckets from a zone.  Expects a locked zone on entry.
+ * Reclaim cached buckets from a zone.  All buckets are reclaimed if the caller
+ * requested a drain, otherwise the per-domain caches are trimmed to their
+ * estimated working set size.
 */
 static void
-bucket_cache_drain(uma_zone_t zone)
+bucket_cache_reclaim(uma_zone_t zone, bool drain)
 {
 	uma_zone_domain_t zdom;
 	uma_bucket_t bucket;
+	long freed, target;
 	int i;
 
-	/*
-	 * Drain the bucket queues and free the buckets.
-	 */
 	for (i = 0; i < vm_ndomains; i++) {
 		zdom = &zone->uz_domain[i];
-		while ((bucket = zone_try_fetch_bucket(zone, zdom, false)) !=
-		    NULL) {
+
+		/*
+		 * If we were asked to drain the zone, we are done only once
+		 * this bucket cache is empty.  Otherwise, we reclaim items in
+		 * excess of the zone's estimated working set size.  If the
+		 * difference nitems - imin is larger than the WSS estimate,
+		 * then the estimate will grow at the end of this interval and
+		 * we ignore the historical average.
		 */
+		target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
+		    zdom->uzd_imin);
+		while (zdom->uzd_nitems > target) {
+			bucket = TAILQ_LAST(&zdom->uzd_buckets, uma_bucketlist);
+			if (bucket == NULL)
+				break;
+			TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
+			zdom->uzd_nitems -= bucket->ub_cnt;
 			ZONE_UNLOCK(zone);
+
+			freed = bucket->ub_cnt;
 			bucket_drain(zone, bucket);
 			bucket_free(zone, bucket, NULL);
+
+			/*
+			 * Shift the bounds of the current WSS interval to avoid
+			 * perturbing the estimate.
+			 */
 			ZONE_LOCK(zone);
+			zdom->uzd_imax -= lmin(zdom->uzd_imax, freed);
+			zdom->uzd_imin -= lmin(zdom->uzd_imin, freed);
 		}
 	}
 
 	/*
-	 * Shrink further bucket sizes.  Price of single zone lock collision
-	 * is probably lower then price of global cache drain.
+	 * Shrink the zone bucket size to ensure that the per-CPU caches
+	 * don't grow too large.
	 */
 	if (zone->uz_count > zone->uz_count_min)
 		zone->uz_count--;
@@ -1006,7 +1042,7 @@
 }
 
 static void
-zone_drain_wait(uma_zone_t zone, int waitok)
+zone_reclaim(uma_zone_t zone, int waitok, bool drain)
 {
 
 	/*
@@ -1016,13 +1052,13 @@
 	 * when it wakes up.
 	 */
 	ZONE_LOCK(zone);
-	while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
+	while (zone->uz_flags & UMA_ZFLAG_RECLAIMING) {
 		if (waitok == M_NOWAIT)
 			goto out;
 		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
 	}
-	zone->uz_flags |= UMA_ZFLAG_DRAINING;
-	bucket_cache_drain(zone);
+	zone->uz_flags |= UMA_ZFLAG_RECLAIMING;
+	bucket_cache_reclaim(zone, drain);
 	ZONE_UNLOCK(zone);
 	/*
 	 * The DRAINING flag protects us from being freed while
@@ -1031,17 +1067,24 @@
 	 */
 	zone_foreach_keg(zone, &keg_drain);
 	ZONE_LOCK(zone);
-	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
+	zone->uz_flags &= ~UMA_ZFLAG_RECLAIMING;
 	wakeup(zone);
 out:
 	ZONE_UNLOCK(zone);
 }
 
-void
+static void
 zone_drain(uma_zone_t zone)
 {
 
-	zone_drain_wait(zone, M_NOWAIT);
+	zone_reclaim(zone, M_NOWAIT, true);
+}
+
+static void
+zone_trim(uma_zone_t zone)
+{
+
+	zone_reclaim(zone, M_NOWAIT, false);
 }
 
 /*
@@ -1908,7 +1951,7 @@
 		 * released and then refilled before we
 		 * remove it... we dont care for now
 		 */
-		zone_drain_wait(zone, M_WAITOK);
+		zone_reclaim(zone, M_WAITOK, true);
 		/*
 		 * Unlink all of our kegs.
 		 */
@@ -2111,7 +2154,7 @@
 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
 #endif
 	booted = BOOT_BUCKETS;
-	sx_init(&uma_drain_lock, "umadrain");
+	sx_init(&uma_reclaim_lock, "umareclaim");
 	bucket_enable();
 }
 
@@ -2200,12 +2243,12 @@
 	if (booted < BOOT_BUCKETS) {
 		locked = false;
 	} else {
-		sx_slock(&uma_drain_lock);
+		sx_slock(&uma_reclaim_lock);
 		locked = true;
 	}
 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
 	if (locked)
-		sx_sunlock(&uma_drain_lock);
+		sx_sunlock(&uma_reclaim_lock);
 	return (res);
 }
 
@@ -2234,13 +2277,13 @@
 	if (booted < BOOT_BUCKETS) {
 		locked = false;
 	} else {
-		sx_slock(&uma_drain_lock);
+		sx_slock(&uma_reclaim_lock);
 		locked = true;
 	}
 	/* XXX Attaches only one keg of potentially many. */
 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
 	if (locked)
-		sx_sunlock(&uma_drain_lock);
+		sx_sunlock(&uma_reclaim_lock);
 	return (res);
 }
 
@@ -2352,9 +2395,9 @@
 uma_zdestroy(uma_zone_t zone)
 {
 
-	sx_slock(&uma_drain_lock);
+	sx_slock(&uma_reclaim_lock);
 	zone_free_item(zones, zone, NULL, SKIP_NONE);
-	sx_sunlock(&uma_drain_lock);
+	sx_sunlock(&uma_reclaim_lock);
 }
 
 void
@@ -2564,7 +2607,7 @@
 		zdom = &zone->uz_domain[0];
 	else
 		zdom = &zone->uz_domain[domain];
-	if ((bucket = zone_try_fetch_bucket(zone, zdom, true)) != NULL) {
+	if ((bucket = zone_try_fetch_bucket(zone, zdom)) != NULL) {
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zalloc_arg: Returning an empty bucket."));
 		cache->uc_allocbucket = bucket;
@@ -3678,17 +3721,28 @@
 }
 
 /* See uma.h */
-static void
-uma_reclaim_locked(bool kmem_danger)
+void
+uma_reclaim(int req)
 {
 
 	CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
-	sx_assert(&uma_drain_lock, SA_XLOCKED);
+	sx_xlock(&uma_reclaim_lock);
 	bucket_enable();
-	zone_foreach(zone_drain);
-	if (vm_page_count_min() || kmem_danger) {
-		cache_drain_safe(NULL);
+
+	switch (req) {
+	case UMA_RECLAIM_TRIM:
+		zone_foreach(zone_trim);
+		break;
+	case UMA_RECLAIM_DRAIN:
+	case UMA_RECLAIM_DRAIN_CPU:
 		zone_foreach(zone_drain);
+		if (req == UMA_RECLAIM_DRAIN_CPU) {
+			pcpu_cache_drain_safe(NULL);
+			zone_foreach(zone_drain);
+		}
+		break;
+	default:
+		panic("unhandled reclamation request %d", req);
 	}
 
 	/*
@@ -3698,15 +3752,7 @@
 	 */
 	zone_drain(slabzone);
 	bucket_zone_drain();
-}
-
-void
-uma_reclaim(void)
-{
-
-	sx_xlock(&uma_drain_lock);
-	uma_reclaim_locked(false);
-	sx_xunlock(&uma_drain_lock);
+	sx_xunlock(&uma_reclaim_lock);
 }
 
 static volatile int uma_reclaim_needed;
@@ -3724,21 +3770,40 @@
 {
 
 	for (;;) {
-		sx_xlock(&uma_drain_lock);
+		sx_xlock(&uma_reclaim_lock);
 		while (atomic_load_int(&uma_reclaim_needed) == 0)
-			sx_sleep(uma_reclaim, &uma_drain_lock, PVM, "umarcl",
+			sx_sleep(uma_reclaim, &uma_reclaim_lock, PVM, "umarcl",
 			    hz);
-		sx_xunlock(&uma_drain_lock);
+		sx_xunlock(&uma_reclaim_lock);
 		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
-		sx_xlock(&uma_drain_lock);
-		uma_reclaim_locked(true);
+		uma_reclaim(UMA_RECLAIM_DRAIN_CPU);
 		atomic_store_int(&uma_reclaim_needed, 0);
-		sx_xunlock(&uma_drain_lock);
 		/* Don't fire more than once per-second. */
 		pause("umarclslp", hz);
 	}
 }
 
+/* See uma.h */
+void
+uma_zreclaim(uma_zone_t zone, int req)
+{
+
+	switch (req) {
+	case UMA_RECLAIM_TRIM:
+		zone_trim(zone);
+		break;
+	case UMA_RECLAIM_DRAIN:
+		zone_drain(zone);
+		break;
+	case UMA_RECLAIM_DRAIN_CPU:
+		pcpu_cache_drain_safe(zone);
+		zone_drain(zone);
+		break;
+	default:
+		panic("unhandled reclamation request %d", req);
+	}
+}
+
 /* See uma.h */
 int
 uma_zone_exhausted(uma_zone_t zone)
Index: sys/vm/uma_int.h
===================================================================
--- sys/vm/uma_int.h
+++ sys/vm/uma_int.h
@@ -196,7 +196,7 @@
  */
 
 struct uma_bucket {
-	LIST_ENTRY(uma_bucket)	ub_link;	/* Link into the zone */
+	TAILQ_ENTRY(uma_bucket)	ub_link;	/* Link into the zone */
 	int16_t	ub_cnt;				/* Count of items in bucket. */
 	int16_t	ub_entries;			/* Max items. */
 	void	*ub_bucket[];			/* actual allocation storage */
@@ -310,8 +310,10 @@
 };
 typedef struct uma_klink *uma_klink_t;
 
+TAILQ_HEAD(uma_bucketlist, uma_bucket);
+
 struct uma_zone_domain {
-	LIST_HEAD(,uma_bucket)	uzd_buckets;	/* full buckets */
+	struct uma_bucketlist	uzd_buckets;	/* full buckets */
 	long		uzd_nitems;	/* total item count */
 	long		uzd_imax;	/* maximum item count this period */
 	long		uzd_imin;	/* minimum item count this period */
@@ -383,7 +385,7 @@
  * These flags must not overlap with the UMA_ZONE flags specified in uma.h.
 */
 #define	UMA_ZFLAG_MULTI		0x04000000	/* Multiple kegs in the zone. */
-#define	UMA_ZFLAG_DRAINING	0x08000000	/* Running zone_drain. */
+#define	UMA_ZFLAG_RECLAIMING	0x08000000	/* Running zone_reclaim(). */
 #define	UMA_ZFLAG_BUCKET	0x10000000	/* Bucket zone. */
 #define	UMA_ZFLAG_INTERNAL	0x20000000	/* No offpage no PCPU. */
 #define	UMA_ZFLAG_FULL		0x40000000	/* Reached uz_maxpages */
Index: sys/vm/vm_pageout.c
===================================================================
--- sys/vm/vm_pageout.c
+++ sys/vm/vm_pageout.c
@@ -1874,9 +1874,12 @@
 
 		/*
 		 * We do this explicitly after the caches have been
-		 * drained above.
+		 * drained above.  If we have a severe page shortage on
+		 * our hands, completely drain all UMA zones.  Otherwise,
+		 * just prune the caches.
 		 */
-		uma_reclaim();
+		uma_reclaim(vm_page_count_min() ? UMA_RECLAIM_DRAIN_CPU :
+		    UMA_RECLAIM_TRIM);
 		return (true);
 	}
 	return (false);
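
For reference, a minimal sketch (not part of the patch) of how a caller might use the reclamation interface introduced above.  The zone variable example_zone and the function example_lowmem() are hypothetical and exist only to illustrate the request types; uma_reclaim(), uma_zreclaim(), UMA_RECLAIM_TRIM, UMA_RECLAIM_DRAIN, and UMA_RECLAIM_DRAIN_CPU are the interfaces added by this change.

#include <sys/param.h>
#include <sys/systm.h>
#include <vm/uma.h>

/* Hypothetical zone, for illustration only. */
static uma_zone_t example_zone;

static void
example_lowmem(bool severe)
{

	/*
	 * Trim every zone's bucket cache back to its estimated working set
	 * size; under severe pressure, drain everything, including the
	 * per-CPU caches.
	 */
	uma_reclaim(severe ? UMA_RECLAIM_DRAIN_CPU : UMA_RECLAIM_TRIM);

	/* A single zone's bucket cache can also be released in isolation. */
	uma_zreclaim(example_zone, UMA_RECLAIM_DRAIN);
}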