Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -75,6 +75,7 @@
 #include
 #include
 #include
+#include <sys/sleepqueue.h>
 #include
 #include
 #include
@@ -267,8 +268,9 @@
 static void uma_timeout(void *);
 static void uma_startup3(void);
 static void *zone_alloc_item(uma_zone_t, void *, int, int);
-static void *zone_alloc_item_locked(uma_zone_t, void *, int, int);
 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
+static int zone_alloc_limit(uma_zone_t zone, int count, int flags);
+static void zone_free_limit(uma_zone_t zone, int count);
 static void bucket_enable(void);
 static void bucket_init(void);
 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
@@ -290,6 +292,7 @@
 static int sysctl_handle_uma_zone_frees(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_uma_zone_flags(SYSCTL_HANDLER_ARGS);
 static int sysctl_handle_uma_slab_efficiency(SYSCTL_HANDLER_ARGS);
+static int sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS);
 
 #ifdef INVARIANTS
 static inline struct noslabbits *slab_dbg_bits(uma_slab_t slab, uma_keg_t keg);
@@ -911,13 +914,8 @@
 	for (i = 0; i < bucket->ub_cnt; i++)
 		zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
 	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
-	if (zone->uz_max_items > 0) {
-		ZONE_LOCK(zone);
-		zone->uz_items -= bucket->ub_cnt;
-		if (zone->uz_sleepers && zone->uz_items < zone->uz_max_items)
-			wakeup_one(zone);
-		ZONE_UNLOCK(zone);
-	}
+	if (zone->uz_max_items > 0)
+		zone_free_limit(zone, bucket->ub_cnt);
 	bucket->ub_cnt = 0;
 }
 
@@ -2096,9 +2094,10 @@
 	 */
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(zone->uz_oid), OID_AUTO,
 	    "limit", CTLFLAG_RD, NULL, "");
-	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
-	    "items", CTLFLAG_RD, &zone->uz_items, 0,
-	    "current number of cached items");
+	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "items", CTLFLAG_RD | CTLTYPE_U64 | CTLFLAG_MPSAFE,
+	    zone, 0, sysctl_handle_uma_zone_items, "QU",
+	    "current number of allocated items if limit is set");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "max_items", CTLFLAG_RD, &zone->uz_max_items, 0,
 	    "Maximum number of cached items");
@@ -2108,6 +2107,12 @@
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "sleeps", CTLFLAG_RD, &zone->uz_sleeps, 0,
 	    "Total zone limit sleeps");
+	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "bucket_max", CTLFLAG_RD, &zone->uz_bkt_max, 0,
+	    "Maximum number of items in the bucket cache");
+	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "bucket_cnt", CTLFLAG_RD, &zone->uz_bkt_count, 0,
+	    "Number of items in the bucket cache");
 
 	/*
 	 * Per-domain information.
@@ -2961,15 +2966,15 @@
 		domain = PCPU_GET(domain);
 	else
 		domain = UMA_ANYDOMAIN;
-	return (zone_alloc_item_locked(zone, udata, domain, flags));
+	return (zone_alloc_item(zone, udata, domain, flags));
 }
 
 /*
  * Replenish an alloc bucket and possibly restore an old one.  Called in
  * a critical section.  Returns in a critical section.
  *
- * A false return value indicates failure and returns with the zone lock
- * held.  A true return value indicates success and the caller should retry.
+ * A false return value indicates an allocation failure.
+ * A true return value indicates success and the caller should retry.
  */
 static __noinline bool
 cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
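For orientation before the hunks that follow: the patch replaces the zone-locked uz_items counter with a single 64-bit word that packs the allocated-item count into the low 44 bits and the sleeper count into the high 20 bits; the UZ_ITEMS_* macros appear in the uma_int.h hunk at the end. A minimal userland sketch of that packing, assuming C11 atomics in place of the kernel's atomic(9) primitives (the names here are illustrative, not UMA API):

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the UZ_ITEMS_* layout added in uma_int.h below. */
#define	ITEMS_SLEEPER_SHIFT	44
#define	ITEMS_COUNT_MASK	((UINT64_C(1) << ITEMS_SLEEPER_SHIFT) - 1)
#define	ITEMS_COUNT(x)		((x) & ITEMS_COUNT_MASK)
#define	ITEMS_SLEEPERS(x)	((x) >> ITEMS_SLEEPER_SHIFT)
#define	ITEMS_SLEEPER		(UINT64_C(1) << ITEMS_SLEEPER_SHIFT)

int
main(void)
{
	_Atomic uint64_t items = 0;
	uint64_t snap;

	/* Allocating three items is one fetchadd on the low bits. */
	atomic_fetch_add(&items, 3);
	/* Registering one sleeper is one fetchadd on the high bits. */
	atomic_fetch_add(&items, ITEMS_SLEEPER);

	snap = atomic_load(&items);
	printf("count=%ju sleepers=%ju\n",
	    (uintmax_t)ITEMS_COUNT(snap), (uintmax_t)ITEMS_SLEEPERS(snap));
	assert(ITEMS_COUNT(snap) == 3 && ITEMS_SLEEPERS(snap) == 1);
	return (0);
}

Because both fields live in one word, a single atomic read observes a consistent pair, and the common alloc and free paths never take a lock.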
@@ -2998,6 +3003,12 @@
 	if (bucket != NULL)
 		bucket_free(zone, bucket, udata);
 
+	/* Short-circuit for zones without buckets and low memory. */
+	if (zone->uz_bucket_size == 0 || bucketdisable) {
+		critical_enter();
+		return (false);
+	}
+
 	/*
 	 * Attempt to retrieve the item from the per-CPU cache has failed, so
 	 * we must go back to the zone.  This requires the zone lock, so we
@@ -3014,14 +3025,9 @@
 		lockfail = 1;
 	}
+	/* See if we lost the race to fill the cache. */
 	critical_enter();
-	/* Short-circuit for zones without buckets and low memory. */
-	if (zone->uz_bucket_size == 0 || bucketdisable)
-		return (false);
-
 	cache = &zone->uz_cpu[curcpu];
-
-	/* See if we lost the race to fill the cache. */
 	if (cache->uc_allocbucket.ucb_bucket != NULL) {
 		ZONE_UNLOCK(zone);
 		return (true);
 	}
@@ -3054,6 +3060,7 @@
 	 */
 	if (lockfail && zone->uz_bucket_size < zone->uz_bucket_size_max)
 		zone->uz_bucket_size++;
+	ZONE_UNLOCK(zone);
 
 	/*
 	 * Fill a bucket and attempt to use it as the alloc bucket.
@@ -3061,15 +3068,18 @@
 	bucket = zone_alloc_bucket(zone, udata, domain, flags);
 	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
 	    zone->uz_name, zone, bucket);
-	critical_enter();
-	if (bucket == NULL)
+	if (bucket == NULL) {
+		critical_enter();
 		return (false);
+	}
 
 	/*
 	 * See if we lost the race or were migrated.  Cache the
 	 * initialized bucket to make this less likely or claim
 	 * the memory directly.
	 */
+	ZONE_LOCK(zone);
+	critical_enter();
 	cache = &zone->uz_cpu[curcpu];
 	if (cache->uc_allocbucket.ucb_bucket == NULL &&
 	    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
@@ -3202,10 +3212,6 @@
 		if (flags & M_NOVM)
 			break;
 
-		KASSERT(zone->uz_max_items == 0 ||
-		    zone->uz_items <= zone->uz_max_items,
-		    ("%s: zone %p overflow", __func__, zone));
-
 		slab = keg_alloc_slab(keg, zone, domain, flags, aflags);
 		/*
 		 * If we got a slab here it's safe to mark it partially used
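The cache_alloc() hunks above move the bucket fill out from under the zone lock: the lock is dropped before zone_alloc_bucket(), then retaken together with the critical section so the per-CPU cache can be rechecked, since another thread may have installed a bucket in the meantime. A reduced userland analogue of that drop, allocate, retake, recheck pattern, assuming a pthread mutex in place of the zone lock; bucket_store and cache_refill are hypothetical names:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t store_lock = PTHREAD_MUTEX_INITIALIZER;
static void *bucket_store;		/* Shared cache slot. */

/* Stand-in for the expensive allocation done without the lock held. */
static void *
bucket_alloc(void)
{
	return (malloc(64));
}

/*
 * Illustrative refill: drop the lock for the allocation, then recheck
 * whether another thread installed a bucket while we were unlocked.
 */
static void *
cache_refill(void)
{
	void *b;

	pthread_mutex_lock(&store_lock);
	if (bucket_store != NULL) {		/* Fast path. */
		b = bucket_store;
		bucket_store = NULL;
		pthread_mutex_unlock(&store_lock);
		return (b);
	}
	pthread_mutex_unlock(&store_lock);

	b = bucket_alloc();			/* Slow path, unlocked. */
	if (b == NULL)
		return (NULL);

	pthread_mutex_lock(&store_lock);
	if (bucket_store != NULL) {
		/* Lost the race: take the winner's bucket, stash ours. */
		void *tmp = bucket_store;

		bucket_store = b;
		b = tmp;
	}
	pthread_mutex_unlock(&store_lock);
	return (b);
}

int
main(void)
{
	void *b = cache_refill();

	printf("got bucket %p\n", b);
	free(b);
	return (0);
}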
@@ -3314,6 +3320,158 @@
 	return i;
 }
 
+static int
+zone_alloc_limit_hard(uma_zone_t zone, int count, int flags)
+{
+	uint64_t old, new, total, max;
+
+	/*
+	 * The hard case.  We're going to sleep because there were existing
+	 * sleepers or because we ran out of items.  This routine enforces
+	 * fairness by keeping FIFO order.
+	 *
+	 * First release our ill-gotten gains and make some noise.
+	 */
+	for (;;) {
+		zone_free_limit(zone, count);
+		zone_log_warning(zone);
+		zone_maxaction(zone);
+		if (flags & M_NOWAIT)
+			return (0);
+
+		/*
+		 * We need to allocate an item or set ourselves as a sleeper
+		 * while the sleepq lock is held to avoid wakeup races.  This
+		 * is essentially a home-rolled semaphore.
+		 */
+		sleepq_lock(&zone->uz_max_items);
+		old = zone->uz_items;
+		do {
+			MPASS(UZ_ITEMS_SLEEPERS(old) < UZ_ITEMS_SLEEPERS_MAX);
+			/* Cache the max since we will evaluate twice. */
+			max = zone->uz_max_items;
+			if (UZ_ITEMS_SLEEPERS(old) != 0 ||
+			    UZ_ITEMS_COUNT(old) >= max)
+				new = old + UZ_ITEMS_SLEEPER;
+			else
+				new = old + MIN(count, max - old);
+		} while (atomic_fcmpset_64(&zone->uz_items, &old, new) == 0);
+
+		/* We may have successfully allocated under the sleepq lock. */
+		if (new <= zone->uz_max_items) {
+			sleepq_release(&zone->uz_max_items);
+			return (new - old);
+		}
+
+		/*
+		 * This is in a different cacheline from uz_items so that we
+		 * don't constantly invalidate the fastpath cacheline when we
+		 * adjust item counts.  This could be limited to toggling on
+		 * transitions.
+		 */
+		atomic_add_32(&zone->uz_sleepers, 1);
+
+		/*
+		 * We have added ourselves as a sleeper.  The sleepq lock
+		 * protects us from wakeup races.  Sleep now and then retry.
+		 */
+		sleepq_add(&zone->uz_max_items, NULL, "zonelimit", 0, 0);
+		sleepq_wait(&zone->uz_max_items, PVM);
+
+		/*
+		 * After wakeup, remove ourselves as a sleeper and try
+		 * again.  We no longer have the sleepq lock for protection.
+		 *
+		 * Subtract ourselves as a sleeper while attempting to add
+		 * our count.
+		 */
+		atomic_subtract_32(&zone->uz_sleepers, 1);
+		old = atomic_fetchadd_64(&zone->uz_items,
+		    (uint64_t)count - UZ_ITEMS_SLEEPER);
+		/* We're no longer a sleeper. */
+		old -= UZ_ITEMS_SLEEPER;
+
+		/*
+		 * If we're still at the limit, restart.  Notably do not
+		 * block on other sleepers.  Cache the max value to protect
+		 * against changes via sysctl.
+		 */
+		total = UZ_ITEMS_COUNT(old);
+		max = zone->uz_max_items;
+		if (total >= max)
+			continue;
+		/* Truncate if necessary, otherwise wake other sleepers. */
+		if (total + count > max) {
+			zone_free_limit(zone, total + count - max);
+			count = max - total;
+		} else if (total + count < max && UZ_ITEMS_SLEEPERS(old) != 0)
+			wakeup_one(&zone->uz_max_items);
+
+		return (count);
+	}
+}
+
+/*
+ * Allocate 'count' items from our max_items limit.  Returns the number
+ * available.  If M_NOWAIT is not specified it will sleep until at least
+ * one item can be allocated.
+ */
+static int
+zone_alloc_limit(uma_zone_t zone, int count, int flags)
+{
+	uint64_t old;
+	uint64_t max;
+
+	max = zone->uz_max_items;
+	MPASS(max > 0);
+
+	/*
+	 * We expect normal allocations to succeed with a simple
+	 * fetchadd.
+	 */
+	old = atomic_fetchadd_64(&zone->uz_items, count);
+	if (__predict_true(old + count <= max))
+		return (count);
+
+	/*
+	 * If we had some items and no sleepers just return the
+	 * truncated value.  We have to release the excess space
+	 * though because that may wake sleepers who weren't woken
+	 * because we were temporarily over the limit.
+	 */
+	if (old < max) {
+		zone_free_limit(zone, (old + count) - max);
+		return (max - old);
+	}
+	return (zone_alloc_limit_hard(zone, count, flags));
+}
+
+/*
+ * Free a number of items back to the limit.
+ */
+static void
+zone_free_limit(uma_zone_t zone, int count)
+{
+	uint64_t old;

+	MPASS(count > 0);
+
+	/*
+	 * In the common case we either have no sleepers or
+	 * are still over the limit and can just return.
+	 */
+	old = atomic_fetchadd_64(&zone->uz_items, -count);
+	if (__predict_true(UZ_ITEMS_SLEEPERS(old) == 0 ||
+	    UZ_ITEMS_COUNT(old) - count >= zone->uz_max_items))
+		return;
+
+	/*
+	 * Moderate the rate of wakeups.  Sleepers will continue
+	 * to generate wakeups if necessary.
+	 */
+	wakeup_one(&zone->uz_max_items);
+}
+
 static uma_bucket_t
 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
 {
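zone_alloc_limit_hard() is, per its own comment, a home-rolled semaphore: the fast path in zone_alloc_limit() is a single fetchadd, and only threads that actually hit the limit fall back to the sleepqueue. As a rough userland analogue, here is a blocking limit built from a mutex and a condition variable standing in for sleepq_add()/sleepq_wait() and wakeup_one(); it deliberately omits the packed sleeper count and the FIFO hand-off, and all names are hypothetical:

#include <pthread.h>
#include <stdio.h>

/* Hypothetical blocking limit: a counting semaphore built by hand. */
struct limit {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
	unsigned	items;		/* Currently allocated. */
	unsigned	max;		/* Configured limit. */
};

/* Take up to 'count' items, sleeping until at least one is free. */
static int
limit_alloc(struct limit *l, int count)
{
	int granted;

	pthread_mutex_lock(&l->lock);
	while (l->items >= l->max)
		pthread_cond_wait(&l->cv, &l->lock);	/* sleepq_wait() */
	granted = l->max - l->items;
	if (granted > count)
		granted = count;			/* Truncate. */
	l->items += granted;
	pthread_mutex_unlock(&l->lock);
	return (granted);
}

/* Return 'count' items and wake one sleeper, as zone_free_limit() does. */
static void
limit_free(struct limit *l, int count)
{
	pthread_mutex_lock(&l->lock);
	l->items -= count;
	pthread_cond_signal(&l->cv);			/* wakeup_one() */
	pthread_mutex_unlock(&l->lock);
}

int
main(void)
{
	struct limit l = { PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER, 0, 4 };
	int got = limit_alloc(&l, 8);

	printf("granted %d of 8\n", got);	/* Truncated to 4. */
	limit_free(&l, got);
	return (0);
}

The real code improves on this in two ways: uncontended callers never touch the "lock" at all, and the woken thread re-adds its count with the same fetchadd that removes its sleeper bit, so the hand-off is a single atomic.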
@@ -3326,15 +3484,13 @@
 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
 		domain = UMA_ANYDOMAIN;
 
-	if (zone->uz_max_items > 0) {
-		if (zone->uz_items >= zone->uz_max_items)
-			return (false);
-		maxbucket = MIN(zone->uz_bucket_size,
-		    zone->uz_max_items - zone->uz_items);
-		zone->uz_items += maxbucket;
-	} else
+	if (zone->uz_max_items > 0)
+		maxbucket = zone_alloc_limit(zone, zone->uz_bucket_size,
+		    M_NOWAIT);
+	else
 		maxbucket = zone->uz_bucket_size;
-	ZONE_UNLOCK(zone);
+	if (maxbucket == 0)
+		return (false);
 
 	/* Don't wait for buckets, preserve caller's NOVM setting. */
 	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
@@ -3378,15 +3534,8 @@
 		bucket = NULL;
 	}
 out:
-	ZONE_LOCK(zone);
-	if (zone->uz_max_items > 0 && cnt < maxbucket) {
-		MPASS(zone->uz_items >= maxbucket - cnt);
-		zone->uz_items -= maxbucket - cnt;
-		if (zone->uz_sleepers > 0 &&
-		    (cnt == 0 ? zone->uz_items + 1 : zone->uz_items) <
-		    zone->uz_max_items)
-			wakeup_one(zone);
-	}
+	if (zone->uz_max_items > 0 && cnt < maxbucket)
+		zone_free_limit(zone, maxbucket - cnt);
 
 	return (bucket);
 }
@@ -3407,43 +3556,11 @@
 static void *
 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
-{
-
-	ZONE_LOCK(zone);
-	return (zone_alloc_item_locked(zone, udata, domain, flags));
-}
-
-/*
- * Returns with zone unlocked.
- */
-static void *
-zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags)
 {
 	void *item;
 
-	ZONE_LOCK_ASSERT(zone);
-
-	if (zone->uz_max_items > 0) {
-		if (zone->uz_items >= zone->uz_max_items) {
-			zone_log_warning(zone);
-			zone_maxaction(zone);
-			if (flags & M_NOWAIT) {
-				ZONE_UNLOCK(zone);
-				return (NULL);
-			}
-			zone->uz_sleeps++;
-			zone->uz_sleepers++;
-			while (zone->uz_items >= zone->uz_max_items)
-				mtx_sleep(zone, zone->uz_lockptr, PVM,
-				    "zonelimit", 0);
-			zone->uz_sleepers--;
-			if (zone->uz_sleepers > 0 &&
-			    zone->uz_items + 1 < zone->uz_max_items)
-				wakeup_one(zone);
-		}
-		zone->uz_items++;
-	}
-	ZONE_UNLOCK(zone);
+	if (zone->uz_max_items > 0 && zone_alloc_limit(zone, 1, flags) == 0)
+		return (NULL);
 
 	/* Avoid allocs targeting empty domains. */
 	if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
 		domain = UMA_ANYDOMAIN;
@@ -3477,14 +3594,11 @@
 fail_cnt:
 	counter_u64_add(zone->uz_fails, 1);
 fail:
-	if (zone->uz_max_items > 0) {
-		ZONE_LOCK(zone);
-		/* XXX Decrement without wakeup */
-		zone->uz_items--;
-		ZONE_UNLOCK(zone);
-	}
+	if (zone->uz_max_items > 0)
+		zone_free_limit(zone, 1);
 	CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
 	    zone->uz_name, zone);
+
 	return (NULL);
 }
 
@@ -3830,14 +3944,8 @@
 
 	counter_u64_add(zone->uz_frees, 1);
 
-	if (zone->uz_max_items > 0) {
-		ZONE_LOCK(zone);
-		zone->uz_items--;
-		if (zone->uz_sleepers > 0 &&
-		    zone->uz_items < zone->uz_max_items)
-			wakeup_one(zone);
-		ZONE_UNLOCK(zone);
-	}
+	if (zone->uz_max_items > 0)
+		zone_free_limit(zone, 1);
 }
 
 /* See uma.h */
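zone_alloc_bucket() above now reserves a full bucket's worth of items optimistically and refunds whatever the import fails to deliver, mirroring what zone_alloc_limit()/zone_free_limit() do in the patch. A small sketch of that reserve, truncate, refund accounting, again assuming C11 atomics and a hypothetical import_items() in place of zone_import(); the sleeper-wakeup side is omitted:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t items;			/* Allocated-item count. */
static const uint64_t max_items = 100;

/* Stand-in for zone_import(): pretend only half the request is available. */
static int
import_items(int n)
{
	return (n / 2);
}

static int
fill_bucket(int want)
{
	uint64_t old;
	int cnt;

	/* Reserve the whole bucket up front, as zone_alloc_limit() does. */
	old = atomic_fetch_add(&items, want);
	if (old >= max_items) {
		/* Already at the limit: back out the whole reservation. */
		atomic_fetch_sub(&items, want);
		return (0);
	}
	if (old + want > max_items) {
		/* Partially available: keep only what fits. */
		atomic_fetch_sub(&items, old + want - max_items);
		want = (int)(max_items - old);
	}

	cnt = import_items(want);
	if (cnt < want)				/* Refund the shortfall. */
		atomic_fetch_sub(&items, want - cnt);
	return (cnt);
}

int
main(void)
{
	int cnt = fill_bucket(16);

	printf("filled %d items, reserved count now %ju\n",
	    cnt, (uintmax_t)atomic_load(&items));
	return (0);
}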
@@ -3853,6 +3961,9 @@
 	zone->uz_bucket_size_max = zone->uz_bucket_size = count;
 	if (zone->uz_bucket_size_min > zone->uz_bucket_size_max)
 		zone->uz_bucket_size_min = zone->uz_bucket_size_max;
+	/* We may need to wake waiters. */
+	if (zone->uz_max_items)
+		wakeup(&zone->uz_max_items);
 	zone->uz_max_items = nitems;
 	zone->uz_flags |= UMA_ZFLAG_LIMIT;
 	zone_update_caches(zone);
@@ -4414,6 +4525,7 @@
 	struct sbuf sbuf;
 	uma_keg_t kz;
 	uma_zone_t z;
+	uint64_t items;
 	int count, error, i;
 
 	error = sysctl_wire_old_buffer(req, 0);
@@ -4450,10 +4562,11 @@
 			uth.uth_align = kz->uk_align;
 			uth.uth_size = kz->uk_size;
 			uth.uth_rsize = kz->uk_rsize;
-			if (z->uz_max_items > 0)
-				uth.uth_pages = (z->uz_items / kz->uk_ipers) *
+			if (z->uz_max_items > 0) {
+				items = UZ_ITEMS_COUNT(z->uz_items);
+				uth.uth_pages = (items / kz->uk_ipers) *
					kz->uk_ppera;
-			else
+			} else
				uth.uth_pages = kz->uk_pages;
			uth.uth_maxpages = (z->uz_max_items / kz->uk_ipers) *
			    kz->uk_ppera;
@@ -4587,6 +4700,16 @@
 	return (sysctl_handle_int(oidp, &effpct, 0, req));
 }
 
+static int
+sysctl_handle_uma_zone_items(SYSCTL_HANDLER_ARGS)
+{
+	uma_zone_t zone = arg1;
+	uint64_t cur;
+
+	cur = UZ_ITEMS_COUNT(atomic_load_64(&zone->uz_items));
+	return (sysctl_handle_64(oidp, &cur, 0, req));
+}
+
 #ifdef INVARIANTS
 static uma_slab_t
 uma_dbg_getslab(uma_zone_t zone, void *item)
Index: sys/vm/uma_int.h
===================================================================
--- sys/vm/uma_int.h
+++ sys/vm/uma_int.h
@@ -406,10 +406,7 @@
 typedef struct uma_zone_domain * uma_zone_domain_t;
 
 /*
- * Zone management structure
- *
- * TODO: Optimize for cache line size
- *
+ * Zone structure - per memory type.
  */
 struct uma_zone {
 	/* Offset 0, used in alloc/free fast/medium fast path and const. */
@@ -422,9 +419,9 @@
 	uint32_t	uz_size;	/* Size inherited from kegs */
 	uma_ctor	uz_ctor;	/* Constructor for each allocation */
 	uma_dtor	uz_dtor;	/* Destructor */
-	uint64_t	uz_items;	/* Total items count */
+	uint64_t	uz_spare0;
 	uint64_t	uz_max_items;	/* Maximum number of items to alloc */
-	uint32_t	uz_sleepers;	/* Number of sleepers on memory */
+	uint32_t	uz_sleepers;	/* Threads sleeping on limit */
 	uint16_t	uz_bucket_size;	/* Number of items in full bucket */
 	uint16_t	uz_bucket_size_max; /* Maximum number of bucket items */
 
@@ -434,7 +431,7 @@
 	void		*uz_arg;	/* Import/release argument. */
 	uma_init	uz_init;	/* Initializer for each item */
 	uma_fini	uz_fini;	/* Finalizer for each item. */
-	void		*uz_spare;
+	void		*uz_spare1;
 	uint64_t	uz_bkt_count;	/* Items in bucket cache */
 	uint64_t	uz_bkt_max;	/* Maximum bucket cache size */
 
@@ -459,6 +456,8 @@
 	counter_u64_t	uz_fails;	/* Total number of alloc failures */
 	uint64_t	uz_sleeps;	/* Total number of alloc sleeps */
 	uint64_t	uz_xdomain;	/* Total number of cross-domain frees */
+	volatile uint64_t uz_items;	/* Total items count & sleepers */
+
 	char		*uz_ctlname;	/* sysctl safe name string. */
 	struct sysctl_oid *uz_oid;	/* sysctl oid pointer. */
 	int		uz_namecnt;	/* duplicate name count. */
@@ -515,6 +514,17 @@
     "\2ZINIT"		\
     "\1PAGEABLE"
 
+/*
+ * Macros for interpreting the uz_items field.  20 bits of sleeper count
+ * and 44 bits of item count.
+ */
+#define	UZ_ITEMS_SLEEPER_SHIFT	44LL
+#define	UZ_ITEMS_SLEEPERS_MAX	((1 << (64 - UZ_ITEMS_SLEEPER_SHIFT)) - 1)
+#define	UZ_ITEMS_COUNT_MASK	((1LL << UZ_ITEMS_SLEEPER_SHIFT) - 1)
+#define	UZ_ITEMS_COUNT(x)	((x) & UZ_ITEMS_COUNT_MASK)
+#define	UZ_ITEMS_SLEEPERS(x)	((x) >> UZ_ITEMS_SLEEPER_SHIFT)
+#define	UZ_ITEMS_SLEEPER	(1LL << UZ_ITEMS_SLEEPER_SHIFT)
+
 #undef UMA_ALIGN
 
 #ifdef _KERNEL
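The 44/20 split can also be checked at compile time. A self-contained sanity check of the layout, with the macros copied verbatim from the hunk above (userland C11):

#include <stdint.h>

#define	UZ_ITEMS_SLEEPER_SHIFT	44LL
#define	UZ_ITEMS_SLEEPERS_MAX	((1 << (64 - UZ_ITEMS_SLEEPER_SHIFT)) - 1)
#define	UZ_ITEMS_COUNT_MASK	((1LL << UZ_ITEMS_SLEEPER_SHIFT) - 1)
#define	UZ_ITEMS_COUNT(x)	((x) & UZ_ITEMS_COUNT_MASK)
#define	UZ_ITEMS_SLEEPERS(x)	((x) >> UZ_ITEMS_SLEEPER_SHIFT)
#define	UZ_ITEMS_SLEEPER	(1LL << UZ_ITEMS_SLEEPER_SHIFT)

/* 20 sleeper bits and 44 count bits together fill the 64-bit word. */
_Static_assert(UZ_ITEMS_SLEEPERS_MAX == (1 << 20) - 1, "sleeper width");
_Static_assert(UZ_ITEMS_SLEEPER == UZ_ITEMS_COUNT_MASK + 1,
    "sleeper increment sits just above the count field");
_Static_assert(UZ_ITEMS_COUNT(~UINT64_C(0)) == UZ_ITEMS_COUNT_MASK,
    "count extraction masks the low 44 bits");

int
main(void)
{
	return (0);
}

Checks like these would catch, for example, a shift change that left UZ_ITEMS_SLEEPER overlapping the count field.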