Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -75,6 +75,7 @@
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
+#include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <sys/taskqueue.h>
 #include <sys/vmmeter.h>
@@ -267,8 +268,9 @@
 static void uma_timeout(void *);
 static void uma_startup3(void);
 static void *zone_alloc_item(uma_zone_t, void *, int, int);
-static void *zone_alloc_item_locked(uma_zone_t, void *, int, int);
 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
+static int zone_alloc_limit(uma_zone_t zone, int count, int flags);
+static void zone_free_limit(uma_zone_t zone, int count);
 static void bucket_enable(void);
 static void bucket_init(void);
 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
@@ -290,6 +292,7 @@
 static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS);
+static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS);
 
 #ifdef INVARIANTS
 static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg);
@@ -903,13 +906,8 @@
 	for (i = 0; i < bucket->ub_cnt; i++)
 		zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
 	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
-	if (zone->uz_max_items > 0) {
-		ZONE_LOCK(zone);
-		zone->uz_items -= bucket->ub_cnt;
-		if (zone->uz_sleepers && zone->uz_items < zone->uz_max_items)
-			wakeup_one(zone);
-		ZONE_UNLOCK(zone);
-	}
+	if (zone->uz_max_items > 0)
+		zone_free_limit(zone, bucket->ub_cnt);
 	bucket->ub_cnt = 0;
 }
 
@@ -2088,9 +2086,10 @@
 	 */
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
 	    "limit", CTLFLAG_RD, NULL, "");
-	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
-	    "items", CTLFLAG_RD, &zone->uz_items, 0,
-	    "current number of cached items");
+	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "items", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
+	    zone, 0, sysctl_handle_uma_zone_items, "QU",
+	    "current number of allocated items if limit is set");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "max_items", CTLFLAG_RD, &zone->uz_max_items, 0,
 	    "Maximum number of cached items");
@@ -2100,6 +2099,12 @@
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0,
 	    "Total zone limit sleeps");
+	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "bucket_max", CTLFLAG_RD, &zone->uz_bkt_max, 0,
+	    "Maximum number of items in the bucket cache");
+	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "bucket_cnt", CTLFLAG_RD, &zone->uz_bkt_count, 0,
+	    "Number of items in the bucket cache");
 
 	/*
 	 * Per-domain information.
@@ -2952,15 +2957,15 @@
 		domain = PCPU_GET(domain);
 	else
 		domain = UMA_ANYDOMAIN;
-	return (zone_alloc_item_locked(zone, udata, domain, flags));
+	return (zone_alloc_item(zone, udata, domain, flags));
 }
 
 /*
  * Replenish an alloc bucket and possibly restore an old one.  Called in
  * a critical section.  Returns in a critical section.
  *
- * A false return value indicates failure and returns with the zone lock
- * held.  A true return value indicates success and the caller should retry.
+ * A false return value indicates an allocation failure.
+ * A true return value indicates success and the caller should retry.
  */
 static __noinline bool
 cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
@@ -2989,6 +2994,12 @@
 	if (bucket != NULL)
 		bucket_free(zone, bucket, udata);
 
+	/* Short-circuit for zones without buckets and low memory. */
+	if (zone->uz_bucket_size == 0 || bucketdisable) {
+		critical_enter();
+		return (false);
+	}
+
 	/*
 	 * Attempt to retrieve the item from the per-CPU cache has failed, so
 	 * we must go back to the zone.  This requires the zone lock, so we
@@ -3005,14 +3016,9 @@
 		lockfail = 1;
 	}
 
+	/* See if we lost the race to fill the cache. */
 	critical_enter();
-	/* Short-circuit for zones without buckets and low memory. */
-	if (zone->uz_bucket_size == 0 || bucketdisable)
-		return (false);
-
 	cache = &zone->uz_cpu[curcpu];
-
-	/* See if we lost the race to fill the cache. */
 	if (cache->uc_allocbucket.ucb_bucket != NULL) {
 		ZONE_UNLOCK(zone);
 		return (true);
@@ -3045,6 +3051,7 @@
 	 */
 	if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max)
 		zone->uz_bucket_size++;
+	ZONE_UNLOCK(zone);
 
 	/*
 	 * Fill a bucket and attempt to use it as the alloc bucket.
@@ -3052,15 +3059,18 @@
 	bucket = zone_alloc_bucket(zone, udata, domain, flags);
 	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
 	    zone->uz_name, zone, bucket);
-	critical_enter();
-	if (bucket == NULL)
+	if (bucket == NULL) {
+		critical_enter();
 		return (false);
+	}
 
 	/*
 	 * See if we lost the race or were migrated.  Cache the
 	 * initialized bucket to make this less likely or claim
 	 * the memory directly.
 	 */
+	ZONE_LOCK(zone);
+	critical_enter();
 	cache = &zone->uz_cpu[curcpu];
 	if (cache->uc_allocbucket.ucb_bucket == NULL &&
 	    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
@@ -3305,6 +3315,148 @@
 	return i;
 }
 
+static int
+zone_alloc_limit_hard(uma_zone_t zone, int count, int flags)
+{
+	uint64_t old, new, cnt;
+
+	/*
+	 * The hard case.  We're going to sleep because there were existing
+	 * sleepers or because we ran out of items.  This routine enforces
+	 * fairness by keeping fifo order.
+	 *
+	 * First release our ill-gotten gains and make some noise.
+	 */
+	for (;;) {
+		zone_free_limit(zone, count);
+		zone_log_warning(zone);
+		zone_maxaction(zone);
+		if (flags & M_NOWAIT)
+			return (0);
+
+		/*
+		 * We need to allocate an item or set ourselves as a sleeper
+		 * while the sleepq lock is held to avoid wakeup races.  This
+		 * is essentially a home-rolled semaphore.
+		 */
+		sleepq_lock(&zone->uz_max_items);
+		old = zone->uz_items;
+		do {
+			if (UZ_ITEMS_SLEEPERS(old) != 0 ||
+			    UZ_ITEMS_COUNT(old) >= zone->uz_max_items)
+				new = old + UZ_ITEMS_SLEEPER;
+			else
+				new = old + MIN(count,
+				    zone->uz_max_items - old);
+		} while (atomic_fcmpset_64(&zone->uz_items, &old, new) == 0);
+
+		/* We may have successfully allocated under the sleepq lock. */
+		if (new <= zone->uz_max_items) {
+			sleepq_release(&zone->uz_max_items);
+			return (new - old);
+		}
+
+		/*
+		 * This is in a different cacheline from uz_items so that we
+		 * don't constantly invalidate the fastpath cacheline when we
+		 * adjust item counts.  This could be limited to toggling on
+		 * transitions.
+		 */
+		atomic_fetchadd_int(&zone->uz_sleepers, 1);
+
+		/*
+		 * We have added ourselves as a sleeper.  The sleepq lock
+		 * protects us from wakeup races.  Sleep now and then retry.
+		 */
+		sleepq_add(&zone->uz_max_items, NULL, "zonelimit", 0, 0);
+		sleepq_wait(&zone->uz_max_items, PVM);
+
+		/*
+		 * After wakeup, remove ourselves as a sleeper and try
+		 * again.  We no longer have the sleepq lock for protection.
+		 */
+		atomic_fetchadd_int(&zone->uz_sleepers, -1);
+		old = atomic_fetchadd_64(&zone->uz_items,
+		    (uint64_t)count - UZ_ITEMS_SLEEPER);
+		/* We're no longer a sleeper. */
+		old -= UZ_ITEMS_SLEEPER;
+
+		/*
+		 * If we're still at the limit, restart.  Notably do not
+		 * block on other sleepers.
+		 */
+		cnt = UZ_ITEMS_COUNT(old);
+		if (cnt >= zone->uz_max_items)
+			continue;
+		/* Truncate if necessary, otherwise wake other sleepers. */
+		if (cnt + count > zone->uz_max_items) {
+			zone_free_limit(zone,
+			    (cnt + count) - zone->uz_max_items);
+			count = zone->uz_max_items - cnt;
+		} else if (cnt + count < zone->uz_max_items &&
+		    UZ_ITEMS_SLEEPERS(old) != 0)
+			wakeup_one(&zone->uz_max_items);
+		return (count);
+	}
+}
+
+/*
+ * Allocate 'count' items from our max_items limit.  Returns the number
+ * available.  If M_NOWAIT is not specified it will sleep until at least
+ * one item can be allocated.
+ */
+static int
+zone_alloc_limit(uma_zone_t zone, int count, int flags)
+{
+	uint64_t old;
+
+	MPASS(zone->uz_max_items > 0);
+
+	/*
+	 * We expect normal allocations to succeed with a simple
+	 * fetchadd.
+	 */
+	old = atomic_fetchadd_64(&zone->uz_items, count);
+	if (__predict_true(old + count <= zone->uz_max_items))
+		return (count);
+
+	/*
+	 * If we had some items and no sleepers just return the
+	 * truncated value.  We have to release the excess space
+	 * though because that may wake sleepers who weren't woken
+	 * because we were temporarily over the limit.
+	 */
+	if (old < zone->uz_max_items) {
+		zone_free_limit(zone, (old + count) - zone->uz_max_items);
+		return (zone->uz_max_items - old);
+	}
+	return (zone_alloc_limit_hard(zone, count, flags));
+}
+
+/*
+ * Free a number of items back to the limit.
+ */
+static void
+zone_free_limit(uma_zone_t zone, int count)
+{
+	uint64_t old;
+
+	/*
+	 * In the common case we either have no sleepers or
+	 * are still over the limit and can just return.
+	 */
+	old = atomic_fetchadd_64(&zone->uz_items, -count);
+	if (__predict_true(UZ_ITEMS_SLEEPERS(old) == 0 ||
+	    UZ_ITEMS_COUNT(old) - count >= zone->uz_max_items))
+		return;
+
+	/*
+	 * Moderate the rate of wakeups.  Sleepers will continue
+	 * to generate wakeups if necessary.
+	 */
+	wakeup_one(&zone->uz_max_items);
+}
+
 static uma_bucket_t
 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
 {
@@ -3317,15 +3469,13 @@
 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
 		domain = UMA_ANYDOMAIN;
 
-	if (zone->uz_max_items > 0) {
-		if (zone->uz_items >= zone->uz_max_items)
-			return (false);
-		maxbucket = MIN(zone->uz_bucket_size,
-		    zone->uz_max_items - zone->uz_items);
-		zone->uz_items += maxbucket;
-	} else
+	if (zone->uz_max_items > 0)
+		maxbucket = zone_alloc_limit(zone, zone->uz_bucket_size,
+		    M_NOWAIT);
+	else
 		maxbucket = zone->uz_bucket_size;
-	ZONE_UNLOCK(zone);
+	if (maxbucket == 0)
+		return (false);
 
 	/* Don't wait for buckets, preserve caller's NOVM setting. */
 	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
@@ -3369,15 +3519,8 @@
 		bucket = NULL;
 	}
 out:
-	ZONE_LOCK(zone);
-	if (zone->uz_max_items > 0 && cnt < maxbucket) {
-		MPASS(zone->uz_items >= maxbucket - cnt);
-		zone->uz_items -= maxbucket - cnt;
-		if (zone->uz_sleepers > 0 &&
-		    (cnt == 0 ? zone->uz_items + 1 : zone->uz_items) <
-		    zone->uz_max_items)
-			wakeup_one(zone);
-	}
+	if (zone->uz_max_items > 0 && cnt < maxbucket)
+		zone_free_limit(zone, maxbucket - cnt);
 
 	return (bucket);
 }
@@ -3398,43 +3541,11 @@
 static void *
 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
-{
-
-	ZONE_LOCK(zone);
-	return (zone_alloc_item_locked(zone, udata, domain, flags));
-}
-
-/*
- * Returns with zone unlocked.
- */
-static void *
-zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags)
 {
 	void *item;
 
-	ZONE_LOCK_ASSERT(zone);
-
-	if (zone->uz_max_items > 0) {
-		if (zone->uz_items >= zone->uz_max_items) {
-			zone_log_warning(zone);
-			zone_maxaction(zone);
-			if (flags & M_NOWAIT) {
-				ZONE_UNLOCK(zone);
-				return (NULL);
-			}
-			zone->uz_sleeps++;
-			zone->uz_sleepers++;
-			while (zone->uz_items >= zone->uz_max_items)
-				mtx_sleep(zone, zone->uz_lockptr, PVM,
-				    "zonelimit", 0);
-			zone->uz_sleepers--;
-			if (zone->uz_sleepers > 0 &&
-			    zone->uz_items + 1 < zone->uz_max_items)
-				wakeup_one(zone);
-		}
-		zone->uz_items++;
-	}
-	ZONE_UNLOCK(zone);
+	if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0)
+		return (NULL);
 
 	/* Avoid allocs targeting empty domains. */
 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
@@ -3468,14 +3579,11 @@
 fail_cnt:
 	counter_u64_add(zone->uz_fails, 1);
 fail:
-	if (zone->uz_max_items > 0) {
-		ZONE_LOCK(zone);
-		/* XXX Decrement without wakeup */
-		zone->uz_items--;
-		ZONE_UNLOCK(zone);
-	}
+	if (zone->uz_max_items > 0)
+		zone_free_limit(zone, 1);
 	CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
 	    zone->uz_name, zone);
+
 	return (NULL);
 }
 
@@ -3821,14 +3929,8 @@
 
 	counter_u64_add(zone->uz_frees, 1);
 
-	if (zone->uz_max_items > 0) {
-		ZONE_LOCK(zone);
-		zone->uz_items--;
-		if (zone->uz_sleepers > 0 &&
-		    zone->uz_items < zone->uz_max_items)
-			wakeup_one(zone);
-		ZONE_UNLOCK(zone);
-	}
+	if (zone->uz_max_items > 0)
+		zone_free_limit(zone, 1);
 }
 
 /* See uma.h */
@@ -3844,6 +3946,9 @@
 	zone->uz_bucket_size_max = zone->uz_bucket_size = count;
 	if (zone->uz_bucket_size_min > zone->uz_bucket_size_max)
 		zone->uz_bucket_size_min = zone->uz_bucket_size_max;
+	/* We may need to wake waiters. */
+	if (zone->uz_max_items)
+		wakeup(&zone->uz_max_items);
 	zone->uz_max_items = nitems;
 	zone->uz_flags |= UMA_ZFLAG_LIMIT;
 	zone_update_caches(zone);
@@ -4578,6 +4683,16 @@
 	return (sysctl_handle_int(oidp, &effpct, 0, req));
 }
 
+static int
+sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS)
+{
+	uma_zone_t zone = arg1;
+	uint64_t cur;
+
+	cur = UZ_ITEMS_COUNT(atomic_load_64(&zone->uz_items));
+	return (sysctl_handle_64(oidp, &cur, 0, req));
+}
+
 #ifdef INVARIANTS
 static uma_slab_t
 uma_dbg_getslab(uma_zone_t zone, void *item)
Index: sys/vm/uma_int.h
===================================================================
--- sys/vm/uma_int.h
+++ sys/vm/uma_int.h
@@ -407,10 +407,7 @@
 typedef struct uma_zone_domain * uma_zone_domain_t;
 
 /*
- * Zone management structure
- *
- * TODO: Optimize for cache line size
- *
+ * Zone structure - per memory type.
  */
 struct uma_zone {
 	/* Offset 0, used in alloc/free fast/medium fast path and const. */
@@ -423,9 +420,9 @@
 	uint32_t	uz_size;	/* Size inherited from kegs */
 	uma_ctor	uz_ctor;	/* Constructor for each allocation */
 	uma_dtor	uz_dtor;	/* Destructor */
-	uint64_t	uz_items;	/* Total items count */
+	uint64_t	uz_spare0;
 	uint64_t	uz_max_items;	/* Maximum number of items to alloc */
-	uint32_t	uz_sleepers;	/* Number of sleepers on memory */
+	uint32_t	uz_sleepers;	/* Threads sleeping on limit */
 	uint16_t	uz_bucket_size;	/* Number of items in full bucket */
 	uint16_t	uz_bucket_size_max; /* Maximum number of bucket items */
 
@@ -435,7 +432,7 @@
 	void		*uz_arg;	/* Import/release argument. */
 	uma_init	uz_init;	/* Initializer for each item */
 	uma_fini	uz_fini;	/* Finalizer for each item. */
-	void		*uz_spare;
+	void		*uz_spare1;
 	uint64_t	uz_bkt_count;	/* Items in bucket cache */
 	uint64_t	uz_bkt_max;	/* Maximum bucket cache size */
 
@@ -460,6 +457,8 @@
 	counter_u64_t	uz_fails;	/* Total number of alloc failures */
 	uint64_t	uz_sleeps;	/* Total number of alloc sleeps */
 	uint64_t	uz_xdomain;	/* Total number of cross-domain frees */
+	volatile uint64_t uz_items;	/* Total items count & sleepers */
+
 	char		*uz_ctlname;	/* sysctl safe name string. */
 	struct sysctl_oid *uz_oid;	/* sysctl oid pointer. */
 	int		uz_namecnt;	/* duplicate name count. */
@@ -514,6 +513,16 @@
 	"\2ZINIT"	\
 	"\1PAGEABLE"
 
+/*
+ * Macros for interpreting the uz_items field.  20 bits of sleeper count
+ * and 44 bits of item count.
+ */
+#define	UZ_ITEMS_SLEEPER_SHIFT	44LL
+#define	UZ_ITEMS_COUNT_MASK	((1LL << UZ_ITEMS_SLEEPER_SHIFT) - 1)
+#define	UZ_ITEMS_COUNT(x)	((x) & UZ_ITEMS_COUNT_MASK)
+#define	UZ_ITEMS_SLEEPERS(x)	((x) >> UZ_ITEMS_SLEEPER_SHIFT)
+#define	UZ_ITEMS_SLEEPER	(1LL << UZ_ITEMS_SLEEPER_SHIFT)
+
 #undef UMA_ALIGN
 
 #ifdef _KERNEL
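
Note on the technique (illustration only, not part of the patch): the heart of this change
is the packed 64-bit word interpreted by the UZ_ITEMS_* macros above. The low 44 bits hold
the allocated-item count and the high 20 bits hold the sleeper count, so the common alloc
and free paths are a single lock-free fetchadd, and sleepers are handled only on the slow
path under the sleepq(9) interlock. The standalone C sketch below mirrors that structure
under assumed names: struct limit, limit_alloc, and limit_free are hypothetical, a pthread
mutex/condvar stands in for the kernel sleepq, and pthread error checking is omitted.

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdint.h>

	#define ITEMS_SLEEPER_SHIFT	44
	#define ITEMS_COUNT_MASK	((1ULL << ITEMS_SLEEPER_SHIFT) - 1)
	#define ITEMS_COUNT(x)		((x) & ITEMS_COUNT_MASK)
	#define ITEMS_SLEEPERS(x)	((x) >> ITEMS_SLEEPER_SHIFT)
	#define ITEMS_SLEEPER		(1ULL << ITEMS_SLEEPER_SHIFT)

	struct limit {
		_Atomic uint64_t items;	/* packed: sleepers << 44 | count */
		uint64_t max;		/* analogue of uz_max_items */
		pthread_mutex_t mtx;	/* stands in for the sleepq lock */
		pthread_cond_t cv;	/* stands in for sleepq wait/wakeup */
	};

	/* Example: a limit of 1024 items, statically initialized. */
	static struct limit zlimit = {
		.max = 1024,
		.mtx = PTHREAD_MUTEX_INITIALIZER,
		.cv = PTHREAD_COND_INITIALIZER,
	};

	/* Return 'count' items; wake one waiter if we crossed below the limit. */
	static void
	limit_free(struct limit *l, uint64_t count)
	{
		uint64_t old;

		old = atomic_fetch_sub(&l->items, count);
		if (ITEMS_SLEEPERS(old) == 0 ||
		    ITEMS_COUNT(old) - count >= l->max)
			return;
		pthread_mutex_lock(&l->mtx);
		pthread_cond_signal(&l->cv);	/* moderate wakeups: one at a time */
		pthread_mutex_unlock(&l->mtx);
	}

	/* Take up to 'count' items, sleeping until at least one is available. */
	static uint64_t
	limit_alloc(struct limit *l, uint64_t count)
	{
		uint64_t old, new, cnt;

		/* Fast path: one lock-free fetch-add, no sleepers involved. */
		old = atomic_fetch_add(&l->items, count);
		if (old + count <= l->max)
			return (count);
		if (old < l->max) {	/* Partial success: trim the overshoot. */
			limit_free(l, old + count - l->max);
			return (l->max - old);
		}

		/* Hard path: release the optimistic add, queue as a sleeper. */
		for (;;) {
			limit_free(l, count);
			pthread_mutex_lock(&l->mtx);
			old = atomic_load(&l->items);
			do {
				if (ITEMS_SLEEPERS(old) != 0 ||
				    ITEMS_COUNT(old) >= l->max)
					new = old + ITEMS_SLEEPER;
				else
					new = old + (count < l->max - old ?
					    count : l->max - old);
			} while (!atomic_compare_exchange_weak(&l->items,
			    &old, new));
			if (new <= l->max) {	/* Allocated under the interlock. */
				pthread_mutex_unlock(&l->mtx);
				return (new - old);
			}
			pthread_cond_wait(&l->cv, &l->mtx);
			pthread_mutex_unlock(&l->mtx);

			/* Drop our sleeper bit; optimistically re-take 'count'. */
			old = atomic_fetch_add(&l->items, count - ITEMS_SLEEPER);
			old -= ITEMS_SLEEPER;
			cnt = ITEMS_COUNT(old);
			if (cnt >= l->max)	/* Still over the limit; retry. */
				continue;
			if (cnt + count > l->max) {	/* Truncate, as the patch does. */
				limit_free(l, cnt + count - l->max);
				return (l->max - cnt);
			}
			return (count);
		}
	}

As in the patch, a thread registers as a sleeper and the waker signals while holding the
interlock, so a waiter cannot miss a wakeup: a freeing thread that observes the sleeper
bits must acquire the interlock, which it cannot do until the sleeper has actually
blocked. Keeping the sleeper count in the same word as the item count is what lets
zone_free_limit decide with a single atomic whether any wakeup is needed at all; the
separate uz_sleepers field exists, per the patch's comment, only to keep the fast-path
cacheline from being invalidated.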