D22586.id67270.diff
D22586: Implement safe memory reclamation in UMA.
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -4942,6 +4942,7 @@
vm/swap_pager.c standard
vm/uma_core.c standard
vm/uma_dbg.c standard
+vm/uma_smr.c standard
vm/memguard.c optional DEBUG_MEMGUARD
vm/vm_domainset.c standard
vm/vm_fault.c standard
Index: sys/tools/smrstress/Makefile
===================================================================
--- /dev/null
+++ sys/tools/smrstress/Makefile
@@ -0,0 +1,4 @@
+KMOD= smrstress
+SRCS= smrstress.c
+
+.include <bsd.kmod.mk>
Index: sys/tools/smrstress/smrstress.c
===================================================================
--- /dev/null
+++ sys/tools/smrstress/smrstress.c
@@ -0,0 +1,193 @@
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/kthread.h>
+#include <sys/conf.h>
+#include <sys/mbuf.h>
+#include <sys/smp.h>
+
+#include <vm/uma.h>
+#include <vm/uma_smr.h>
+
+#include <machine/stdarg.h>
+
+static uma_zone_t smrs_zone;
+static uma_smr_t smrs_smr;
+
+static int smrs_cpus = 32;
+static int smrs_started;
+static int smrs_iterations = 100000;
+static int smrs_failures = 0;
+static volatile int smrs_completed;
+
+struct smrs {
+ int generation;
+ volatile u_int count;
+};
+
+uintptr_t smrs_current;
+
+static void
+smrs_error(struct smrs *smrs, const char *fmt, ...)
+{
+ va_list ap;
+
+ atomic_add_int(&smrs_failures, 1);
+ printf("SMR ERROR: epoch %d, min_epoch %d, cpu epoch %d, generation %d, count %d ",
+ smrs_smr->smr_global->smrg_epoch, smrs_smr->smr_global->smrg_epoch_min,
+ zpcpu_get(smrs_smr)->smr_epoch, smrs->generation, smrs->count);
+ va_start(ap, fmt);
+ (void)vprintf(fmt, ap);
+ va_end(ap);
+}
+
+static void
+smrs_read(void)
+{
+ struct smrs *cur;
+ int cnt;
+
+ /* Wait for the writer to exit. */
+ while (smrs_completed == 0) {
+ uma_smr_enter(smrs_smr);
+ cur = (void *)atomic_load_ptr(&smrs_current);
+ if (cur->generation == -1)
+ smrs_error(cur, "read early: Use after free!\n");
+ atomic_add_int(&cur->count, 1);
+ DELAY(100);
+ cnt = atomic_fetchadd_int(&cur->count, -1);
+ if (cur->generation == -1)
+ smrs_error(cur, "read late: Use after free!\n");
+ else if (cnt <= 0)
+ smrs_error(cur, "Invalid ref\n");
+ uma_smr_exit(smrs_smr);
+ maybe_yield();
+ }
+}
+
+static void
+smrs_write(void)
+{
+ struct smrs *cur;
+ int i;
+
+ for (i = 0; i < smrs_iterations; i++) {
+ cur = (void *)smrs_current;
+ atomic_store_rel_ptr(&smrs_current,
+ (uintptr_t)uma_zalloc(smrs_zone, M_WAITOK));
+ uma_zfree(smrs_zone, cur);
+ }
+}
+
+static void
+smrs_thread(void *arg)
+{
+ int rw = (intptr_t)arg;
+
+ if (rw == 0)
+ smrs_write();
+ else
+ smrs_read();
+ atomic_add_int(&smrs_completed, 1);
+}
+
+static void
+smrs_start(void)
+{
+ struct smrs *cur;
+ int i;
+
+ smrs_started = smrs_cpus;
+ smrs_completed = 0;
+ atomic_store_rel_ptr(&smrs_current,
+ (uintptr_t)uma_zalloc(smrs_zone, M_WAITOK));
+ for (i = 0; i < smrs_started; i++)
+ kthread_add((void (*)(void *))smrs_thread,
+ (void *)(intptr_t)i, curproc, NULL, 0, 0, "smrs-%d", i);
+
+ while (smrs_completed != smrs_started)
+ pause("prf", hz/2);
+
+ cur = (void *)smrs_current;
+ smrs_current = (uintptr_t)NULL;
+ uma_zfree(smrs_zone, cur);
+
+ printf("Completed %d loops with %d failures\n",
+ smrs_iterations, smrs_failures);
+}
+
+static int
+smrs_ctor(void *mem, int size, void *arg, int flags)
+{
+ static int smr_generation = 0;
+ struct smrs *smrs = mem;
+
+ if (smrs->generation != -1 && smrs->generation != 0)
+ smrs_error(smrs, "ctor: Invalid smr generation on ctor\n");
+ else if (smrs->count != 0)
+ smrs_error(smrs, "ctor: Invalid count\n");
+ smrs->generation = ++smr_generation;
+
+ return (0);
+}
+
+
+static void
+smrs_dtor(void *mem, int size, void *arg)
+{
+ struct smrs *smrs = mem;
+
+ if (smrs->generation == -1)
+ smrs_error(smrs, "dtor: Invalid generation");
+ smrs->generation = -1;
+ if (smrs->count != 0)
+ smrs_error(smrs, "dtor: Invalid count\n");
+}
+
+
+static void
+smrs_init(void)
+{
+
+ smrs_zone = uma_zcreate("smrs", sizeof(struct smrs),
+ smrs_ctor, smrs_dtor, NULL, NULL, UMA_ALIGN_PTR,
+ UMA_ZONE_SMR | UMA_ZONE_ZINIT);
+ smrs_smr = uma_zone_get_smr(smrs_zone);
+}
+
+static void
+smrs_fini(void)
+{
+
+ uma_zdestroy(smrs_zone);
+}
+
+static int
+smrs_modevent(module_t mod, int what, void *arg)
+{
+
+ switch (what) {
+ case MOD_LOAD:
+ smrs_init();
+ smrs_start();
+ break;
+ case MOD_UNLOAD:
+ smrs_fini();
+ break;
+ default:
+ break;
+ }
+ return (0);
+}
+
+moduledata_t smrs_meta = {
+ "smrstress",
+ smrs_modevent,
+ NULL
+};
+DECLARE_MODULE(smrstress, smrs_meta, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
+MODULE_VERSION(smrstress, 1);
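
The stress module above exercises the basic publish/retire pattern that SMR zones are meant to support: readers pin an epoch around a pointer dereference while a writer swaps the pointer and frees the old object to the SMR zone. A minimal sketch of that pattern distilled from smrstress.c follows; struct my_obj, my_zone, my_smr and my_current are hypothetical names, and only the uma_* calls come from this patch.

/*
 * Minimal sketch of the publish/retire pattern exercised by
 * smrstress.c above.  struct my_obj, my_zone, my_smr and my_current
 * are hypothetical; only the uma_* calls are added by this patch.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>

#include <vm/uma.h>
#include <vm/uma_smr.h>

struct my_obj {
        int     data;
};

static uma_zone_t my_zone;      /* created with UMA_ZONE_SMR */
static uma_smr_t my_smr;        /* from uma_zone_get_smr(my_zone) */
static uintptr_t my_current;    /* currently published object */

static int
my_read(void)
{
        struct my_obj *obj;
        int data;

        /* Assumes an object has already been published. */
        uma_smr_enter(my_smr);          /* pin the current epoch */
        obj = (void *)atomic_load_ptr(&my_current);
        data = obj->data;               /* obj is not recycled while here */
        uma_smr_exit(my_smr);
        return (data);
}

static void
my_replace(void)
{
        struct my_obj *old, *new;

        new = uma_zalloc(my_zone, M_WAITOK);
        new->data = 1;                  /* initialize before publishing */
        old = (void *)my_current;
        atomic_store_rel_ptr(&my_current, (uintptr_t)new);
        /* Reuse of 'old' is deferred until all readers have exited. */
        uma_zfree(my_zone, old);
}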
Index: sys/vm/uma.h
===================================================================
--- sys/vm/uma.h
+++ sys/vm/uma.h
@@ -263,6 +263,17 @@
*/
#define UMA_ZONE_FIRSTTOUCH 0x10000 /* First touch NUMA policy */
#define UMA_ZONE_ROUNDROBIN 0x20000 /* Round-robin NUMA policy. */
+#define UMA_ZONE_SMR 0x40000 /*
+ * Safe memory reclamation defers
+ * frees until all read sections
+ * have exited. This flag creates
+ * a unique SMR context for this
+ * zone. To share contexts see
+ * uma_zone_set_smr() below.
+ *
+ * See sys/vm/uma_smr.h for more
+ * details.
+ */
/* In use by UMA_ZFLAGs: 0xffe00000 */
/*
@@ -273,7 +284,7 @@
#define UMA_ZONE_INHERIT \
(UMA_ZONE_NOTOUCH | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE | \
UMA_ZONE_NOTPAGE | UMA_ZONE_PCPU | UMA_ZONE_FIRSTTOUCH | \
- UMA_ZONE_ROUNDROBIN)
+ UMA_ZONE_ROUNDROBIN | UMA_ZONE_SMR)
/* Definitions for align */
#define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */
@@ -600,6 +611,17 @@
void uma_zone_set_freef(uma_zone_t zone, uma_free freef);
+/*
+ * Associate a zone with an SMR context that is allocated after creation
+ * so that multiple zones may share the same context.
+ */
+
+struct uma_smr;
+typedef struct uma_smr * uma_smr_t;
+
+void uma_zone_set_smr(uma_zone_t zone, uma_smr_t smr);
+uma_smr_t uma_zone_get_smr(uma_zone_t zone);
+
/*
* These flags are setable in the allocf and visible in the freef.
*/
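
The header additions above support two arrangements: UMA_ZONE_SMR creates a private SMR context at zone-creation time, while uma_zone_set_smr() attaches an existing context so several zones share one epoch clock. A minimal sketch of both, assuming hypothetical object types obj_a and obj_b; the zone must still be cold when the context is attached, which the patch asserts.

/*
 * Sketch only: struct obj_a/obj_b and the zone names are hypothetical.
 * Zone A gets a private SMR context via the flag; zone B is attached to
 * the same context before it is used, so frees from both zones are
 * ordered by one epoch clock.
 */
#include <sys/param.h>
#include <sys/systm.h>

#include <vm/uma.h>
#include <vm/uma_smr.h>

struct obj_a { int a; };
struct obj_b { int b; };

static uma_zone_t zone_a, zone_b;

static void
example_zone_init(void)
{
        uma_smr_t smr;

        zone_a = uma_zcreate("obj_a", sizeof(struct obj_a),
            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
        smr = uma_zone_get_smr(zone_a);

        zone_b = uma_zcreate("obj_b", sizeof(struct obj_b),
            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
        uma_zone_set_smr(zone_b, smr);
}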
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -92,6 +92,7 @@
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
+#include <vm/uma_smr.h>
#include <vm/uma_int.h>
#include <vm/uma_dbg.h>
@@ -273,6 +274,8 @@
static void keg_dtor(void *, int, void *);
static int zone_ctor(void *, int, void *, int);
static void zone_dtor(void *, int, void *);
+static inline void item_dtor(uma_zone_t zone, void *item, int size,
+ void *udata, enum zfreeskip skip);
static int zero_init(void *, int, int);
static void zone_foreach(void (*zfunc)(uma_zone_t, void *), void *);
static void zone_foreach_unlocked(void (*zfunc)(uma_zone_t, void *), void *);
@@ -454,9 +457,9 @@
uma_bucket_t bucket;
/*
- * Don't allocate buckets in low memory situations.
+ * Don't allocate buckets early in boot.
*/
- if (bucketdisable)
+ if (__predict_false(booted < BOOT_KVA))
return (NULL);
/*
@@ -488,6 +491,9 @@
#endif
bucket->ub_cnt = 0;
bucket->ub_entries = ubz->ubz_entries;
+ bucket->ub_epoch = 0;
+ CTR3(KTR_UMA, "bucket_alloc: zone %s(%p) allocated bucket %p",
+ zone->uz_name, zone, bucket);
}
return (bucket);
@@ -500,6 +506,8 @@
KASSERT(bucket->ub_cnt == 0,
("bucket_free: Freeing a non free bucket."));
+ KASSERT(bucket->ub_epoch == 0,
+ ("bucket_free: Freeing an SMR bucket."));
if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
udata = (void *)(uintptr_t)zone->uz_flags;
ubz = bucket_zone_lookup(bucket->ub_entries);
@@ -517,23 +525,38 @@
/*
* Attempt to satisfy an allocation by retrieving a full bucket from one of the
- * zone's caches.
+ * zone's caches. If a bucket is found the zone is not locked on return.
*/
static uma_bucket_t
zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom)
{
uma_bucket_t bucket;
+ int i;
+ bool dtor = false;
ZONE_LOCK_ASSERT(zone);
- if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) != NULL) {
- MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
- TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
- zdom->uzd_nitems -= bucket->ub_cnt;
- if (zdom->uzd_imin > zdom->uzd_nitems)
- zdom->uzd_imin = zdom->uzd_nitems;
- zone->uz_bkt_count -= bucket->ub_cnt;
+ if ((bucket = TAILQ_FIRST(&zdom->uzd_buckets)) == NULL)
+ return (NULL);
+
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && bucket->ub_epoch) {
+ if (!uma_smr_poll(zone->uz_smr, bucket->ub_epoch, false))
+ return (NULL);
+ bucket->ub_epoch = 0;
+ dtor = true;
}
+ MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
+ TAILQ_REMOVE(&zdom->uzd_buckets, bucket, ub_link);
+ zdom->uzd_nitems -= bucket->ub_cnt;
+ if (zdom->uzd_imin > zdom->uzd_nitems)
+ zdom->uzd_imin = zdom->uzd_nitems;
+ zone->uz_bkt_count -= bucket->ub_cnt;
+ ZONE_UNLOCK(zone);
+ if (dtor)
+ for (i = 0; i < bucket->ub_cnt; i++)
+ item_dtor(zone, bucket->ub_bucket[i], zone->uz_size,
+ NULL, SKIP_NONE);
+
return (bucket);
}
@@ -551,10 +574,7 @@
KASSERT(!ws || zone->uz_bkt_count < zone->uz_bkt_max,
("%s: zone %p overflow", __func__, zone));
- if (ws)
- TAILQ_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
- else
- TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
+ TAILQ_INSERT_TAIL(&zdom->uzd_buckets, bucket, ub_link);
zdom->uzd_nitems += bucket->ub_cnt;
if (ws && zdom->uzd_imax < zdom->uzd_nitems)
zdom->uzd_imax = zdom->uzd_nitems;
@@ -941,12 +961,22 @@
if (bucket == NULL || bucket->ub_cnt == 0)
return;
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && bucket->ub_epoch != 0) {
+ uma_smr_wait(zone->uz_smr, bucket->ub_epoch);
+ for (i = 0; i < bucket->ub_cnt; i++)
+ item_dtor(zone, bucket->ub_bucket[i],
+ zone->uz_size, NULL, SKIP_NONE);
+ bucket->ub_epoch = 0;
+ }
if (zone->uz_fini)
for (i = 0; i < bucket->ub_cnt; i++)
zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
if (zone->uz_max_items > 0)
zone_free_limit(zone, bucket->ub_cnt);
+#ifdef INVARIANTS
+ bzero(bucket->ub_bucket, sizeof(void *) * bucket->ub_cnt);
+#endif
bucket->ub_cnt = 0;
}
@@ -1035,12 +1065,21 @@
zone_put_bucket(zone, &zone->uz_domain[domain], b1, false);
b1 = NULL;
}
+
+ /*
+ * Don't flush SMR zone buckets. Doing so would leave the zone
+ * without a bucket and force every free to synchronize().
+ */
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
+ goto out;
b2 = cache_bucket_unload_free(cache);
if (b2 != NULL && b2->ub_cnt != 0) {
zone_put_bucket(zone, &zone->uz_domain[domain], b2, false);
b2 = NULL;
}
b3 = cache_bucket_unload_cross(cache);
+
+out:
critical_exit();
ZONE_UNLOCK(zone);
if (b1)
@@ -2356,6 +2395,8 @@
ZONE_LOCK(zone);
LIST_FOREACH(z, &keg->uk_zones, uz_link) {
if (LIST_NEXT(z, uz_link) == NULL) {
+ /* Keg zones share smr state. */
+ zone->uz_smr = z->uz_smr;
LIST_INSERT_AFTER(z, zone, uz_link);
break;
}
@@ -2399,6 +2440,11 @@
zone->uz_fails = EARLY_COUNTER;
}
+ /* Caller requests a private SMR context. */
+ if ((zone->uz_flags & (UMA_ZONE_SMR | UMA_ZONE_SECONDARY)) ==
+ UMA_ZONE_SMR)
+ zone->uz_smr = uma_smr_create(zone->uz_name);
+
KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
(UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
("Invalid zone flag combination"));
@@ -2600,6 +2646,7 @@
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
bucket_init();
+ uma_smr_init();
}
#ifndef UMA_MD_SMALL_ALLOC
@@ -3014,9 +3061,14 @@
/*
* If we have run out of items in our alloc bucket see
* if we can switch with the free bucket.
+ *
+ * SMR Zones can't re-use the free bucket until the epoch has
+ * expired.
*/
- if (cache->uc_freebucket.ucb_cnt != 0) {
- cache_bucket_swap(&cache->uc_freebucket, &cache->uc_allocbucket);
+ if ((zone->uz_flags & UMA_ZONE_SMR) == 0 &&
+ cache->uc_freebucket.ucb_cnt != 0) {
+ cache_bucket_swap(&cache->uc_freebucket,
+ &cache->uc_allocbucket);
return (true);
}
@@ -3070,7 +3122,6 @@
}
if ((bucket = zone_fetch_bucket(zone, zdom)) != NULL) {
- ZONE_UNLOCK(zone);
KASSERT(bucket->ub_cnt != 0,
("uma_zalloc_arg: Returning an empty bucket."));
cache_bucket_load_alloc(cache, bucket);
@@ -3665,8 +3716,8 @@
*/
cache = &zone->uz_cpu[curcpu];
uz_flags = cache_uz_flags(cache);
- if (__predict_false((uz_flags & UMA_ZFLAG_CTORDTOR) != 0 ||
- UMA_ALWAYS_CTORDTOR))
+ if (__predict_false(((uz_flags & UMA_ZFLAG_CTORDTOR) != 0 ||
+ UMA_ALWAYS_CTORDTOR) && (uz_flags & UMA_ZONE_SMR) == 0))
item_dtor(zone, item, cache_uz_size(cache), udata, SKIP_NONE);
/*
@@ -3697,6 +3748,13 @@
critical_enter();
do {
cache = &zone->uz_cpu[curcpu];
+ /*
+ * Try to free into the allocbucket first to give LIFO
+ * ordering for cache-hot datastructures. Spill over
+ * into the freebucket if necessary. Alloc will swap
+ * them if one runs dry.
+ */
+ bucket = &cache->uc_allocbucket;
#ifdef NUMA
domain = PCPU_GET(domain);
if ((uz_flags & UMA_ZONE_FIRSTTOUCH) != 0 &&
@@ -3704,18 +3762,13 @@
bucket = &cache->uc_crossbucket;
} else
#endif
- {
- /*
- * Try to free into the allocbucket first to give LIFO
- * ordering for cache-hot datastructures. Spill over
- * into the freebucket if necessary. Alloc will swap
- * them if one runs dry.
- */
- bucket = &cache->uc_allocbucket;
- if (__predict_false(bucket->ucb_cnt >=
- bucket->ucb_entries))
- bucket = &cache->uc_freebucket;
- }
+ /*
+ * If this is an SMR zone or the allocbucket is full, use
+ * the free bucket.
+ */
+ if (__predict_false((uz_flags & UMA_ZONE_SMR) != 0 ||
+ bucket->ucb_cnt >= bucket->ucb_entries))
+ bucket = &cache->uc_freebucket;
if (__predict_true(bucket->ucb_cnt < bucket->ucb_entries)) {
cache_bucket_push(cache, bucket, item);
critical_exit();
@@ -3728,7 +3781,8 @@
* If nothing else caught this, we'll just do an internal free.
*/
zfree_item:
- zone_free_item(zone, item, udata, SKIP_DTOR);
+ zone_free_item(zone, item, udata,
+ (uz_flags & UMA_ZONE_SMR) == 0 ? SKIP_DTOR : SKIP_NONE);
}
#ifdef NUMA
@@ -3778,6 +3832,9 @@
if (!TAILQ_EMPTY(&fullbuckets)) {
ZONE_LOCK(zone);
while ((b = TAILQ_FIRST(&fullbuckets)) != NULL) {
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
+ bucket->ub_epoch = uma_smr_current(
+ zone->uz_smr);
TAILQ_REMOVE(&fullbuckets, b, ub_link);
if (zone->uz_bkt_count >= zone->uz_bkt_max) {
ZONE_UNLOCK(zone);
@@ -3796,6 +3853,7 @@
}
if (bucket->ub_cnt != 0)
bucket_drain(zone, bucket);
+ bucket->ub_epoch = 0;
bucket_free(zone, bucket, udata);
}
#endif
@@ -3862,15 +3920,16 @@
int itemdomain)
{
uma_cache_bucket_t cbucket;
- uma_bucket_t bucket;
+ uma_bucket_t newbucket, bucket;
int domain;
CRITICAL_ASSERT(curthread);
- if (zone->uz_bucket_size == 0 || bucketdisable)
+ if (zone->uz_bucket_size == 0)
return false;
cache = &zone->uz_cpu[curcpu];
+ newbucket = NULL;
/*
* FIRSTTOUCH domains need to free to the correct zdom. When
@@ -3895,14 +3954,29 @@
/* We are no longer associated with this CPU. */
critical_exit();
+ /*
+ * Don't let SMR zones operate without a free bucket. Force
+ * a synchronize and re-use this one. We will only degrade
+ * to a synchronize every bucket_size items rather than every
+ * item if we fail to allocate a bucket.
+ */
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0) {
+ if (bucket != NULL)
+ bucket->ub_epoch = uma_smr_advance(zone->uz_smr);
+ newbucket = bucket_alloc(zone, udata, M_NOWAIT);
+ if (newbucket == NULL && bucket != NULL) {
+ bucket_drain(zone, bucket);
+ newbucket = bucket;
+ bucket = NULL;
+ }
+ } else if (!bucketdisable)
+ newbucket = bucket_alloc(zone, udata, M_NOWAIT);
+
if (bucket != NULL)
zone_free_bucket(zone, bucket, udata, domain, itemdomain);
- bucket = bucket_alloc(zone, udata, M_NOWAIT);
- CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
- zone->uz_name, zone, bucket);
critical_enter();
- if (bucket == NULL)
+ if ((bucket = newbucket) == NULL)
return (false);
cache = &zone->uz_cpu[curcpu];
#ifdef NUMA
@@ -4031,6 +4105,15 @@
zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
{
+ /*
+ * If a free is sent directly to an SMR zone we have to
+ * synchronize immediately because the item can instantly
+ * be reallocated. This should only happen in degenerate
+ * cases when no memory is available for per-cpu caches.
+ */
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0 && skip == SKIP_DTOR)
+ uma_smr_synchronize(zone->uz_smr);
+
item_dtor(zone, item, zone->uz_size, udata, skip);
if (skip < SKIP_FINI && zone->uz_fini)
@@ -4255,6 +4338,31 @@
keg->uk_allocf = allocf;
}
+/* See uma.h */
+void
+uma_zone_set_smr(uma_zone_t zone, uma_smr_t smr)
+{
+ uma_keg_t keg;
+
+ ZONE_ASSERT_COLD(zone);
+
+ KEG_GET(zone, keg);
+ KEG_ASSERT_COLD(keg);
+ keg->uk_flags |= UMA_ZONE_SMR;
+ ZONE_LOCK(zone);
+ zone->uz_flags |= UMA_ZONE_SMR;
+ zone->uz_smr = smr;
+ zone_update_caches(zone);
+ ZONE_UNLOCK(zone);
+}
+
+uma_smr_t
+uma_zone_get_smr(uma_zone_t zone)
+{
+
+ return (zone->uz_smr);
+}
+
/* See uma.h */
void
uma_zone_reserve(uma_zone_t zone, int items)
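
The uma_core.c changes above stamp retired free buckets with an epoch goal (ub_epoch) obtained from uma_smr_advance()/uma_smr_current(), and only recycle or drain a bucket once uma_smr_poll()/uma_smr_wait() reports that the goal has expired. The same retire-then-poll idea reduced to a free-standing sketch; struct batch and these helpers are illustrative, not UMA internals.

/*
 * Free-standing sketch of the retire-then-poll batching used by the
 * bucket code above; struct batch and the helpers are illustrative.
 */
#include <sys/param.h>
#include <sys/systm.h>

#include <vm/uma_smr.h>

struct batch {
        uma_epoch_t     epoch;          /* goal recorded at retire time */
        int             cnt;
        void            *items[64];
};

static void
batch_retire(uma_smr_t smr, struct batch *b)
{
        /* Items were already unlinked from any shared structures. */
        b->epoch = uma_smr_advance(smr);
}

static bool
batch_try_recycle(uma_smr_t smr, struct batch *b)
{
        /* Non-blocking check, as zone_fetch_bucket() does above. */
        if (!uma_smr_poll(smr, b->epoch, false))
                return (false);
        b->epoch = 0;
        return (true);          /* the items may now be freed or reused */
}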
Index: sys/vm/uma_int.h
===================================================================
--- sys/vm/uma_int.h
+++ sys/vm/uma_int.h
@@ -247,6 +247,7 @@
TAILQ_ENTRY(uma_bucket) ub_link; /* Link into the zone */
int16_t ub_cnt; /* Count of items in bucket. */
int16_t ub_entries; /* Max items. */
+ uint32_t ub_epoch; /* epoch value for SMR. */
void *ub_bucket[]; /* actual allocation storage */
};
@@ -529,6 +530,8 @@
struct sysctl_oid *uz_oid; /* sysctl oid pointer. */
int uz_namecnt; /* duplicate name count. */
+ uma_smr_t uz_smr; /* Safe memory reclaim context. */
+
/*
* This HAS to be the last item because we adjust the zone size
* based on NCPU and then allocate the space for the zones.
Index: sys/vm/uma_smr.h
===================================================================
--- /dev/null
+++ sys/vm/uma_smr.h
@@ -0,0 +1,149 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 Jeffrey Roberson <jeff@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef _VM_UMA_SMR_H_
+#define _VM_UMA_SMR_H_
+
+typedef uint32_t uma_epoch_t;
+
+/*
+ * Safe memory reclamation. See uma_smr.c for a description of the
+ * algorithm.
+ *
+ * Readers synchronize with uma_smr_enter()/exit() and writers may either
+ * free directly to a SMR UMA zone or use uma_smr_synchronize or wait.
+ */
+
+/* Global SMR state. */
+struct uma_smr_g {
+ const char *smrg_name; /* Name for debugging/reporting. */
+ volatile uma_epoch_t smrg_epoch; /* Newest global epoch value. */
+ volatile uma_epoch_t smrg_epoch_min; /* Minimum observed epoch. */
+};
+typedef struct uma_smr_g * uma_smr_g_t;
+
+/* Per-cpu SMR state. */
+struct uma_smr {
+ volatile uma_epoch_t smr_epoch; /* Last epoch value. */
+ uma_smr_g_t smr_global; /* Global SMR state. */
+};
+
+/*
+ * Enter a read section.
+ */
+static inline void
+uma_smr_enter(uma_smr_t smr)
+{
+ uma_smr_g_t smrg;
+ uma_epoch_t g;
+
+ critical_enter();
+ smr = zpcpu_get(smr);
+ smrg = smr->smr_global;
+ KASSERT(smr->smr_epoch == 0,
+ ("uma_smr_enter(%s) does not support recursion.",
+ smrg->smrg_name));
+
+ /* Acquire barrier to synchronize with smr_exit and smr_wait(). */
+ g = smrg->smrg_epoch;
+ /*
+ * XXX It is possible that a long delay between this load and
+ * store will leave the cpu running on an epoch below min. The
+ * store fence will prevent it from loading stale data but the
+ * bookkeeping can become inaccurate.
+ */
+ atomic_add_acq_int(&smr->smr_epoch, g);
+}
+
+/*
+ * Exit a safe memory reclamation section.
+ */
+static inline void
+uma_smr_exit(uma_smr_t smr)
+{
+ uma_smr_g_t smrg;
+
+ smr = zpcpu_get(smr);
+ smrg = smr->smr_global;
+ CRITICAL_ASSERT(curthread);
+ KASSERT(smr->smr_epoch != 0,
+ ("uma_smr_exit(%s) not in a smr section.", smrg->smrg_name));
+
+ /*
+ * Release from above and to retire memory references accessed
+ * in the recorded epoch.
+ */
+ atomic_store_rel_int(&smr->smr_epoch, 0);
+ critical_exit();
+}
+
+/*
+ * Synchronize returns when all readers have observed the current
+ * epoch.
+ */
+void uma_smr_synchronize(uma_smr_t smr);
+
+/*
+ * Advances the epoch to indicate a write. Returns the goal epoch
+ * required to ensure that all modifications are visible.
+ */
+uma_epoch_t uma_smr_advance(uma_smr_t smr);
+
+/*
+ * Returns true if a goal epoch has been reached. If
+ * wait is true it will busy loop until success.
+ */
+bool uma_smr_poll(uma_smr_t smr, uma_epoch_t goal, bool wait);
+
+static inline bool
+uma_smr_wait(uma_smr_t smr, uma_epoch_t goal)
+{
+
+ return (uma_smr_poll(smr, goal, true));
+}
+
+/*
+ * Return the current epoch value.
+ */
+static inline uma_epoch_t
+uma_smr_current(uma_smr_t smr)
+{
+ return (smr->smr_global->smrg_epoch);
+}
+
+/* Create a new SMR context. */
+uma_smr_t uma_smr_create(const char *name);
+void uma_smr_destroy(uma_smr_t smr);
+
+/* Only at startup. */
+void uma_smr_init(void);
+
+#endif /* _VM_UMA_SMR_H_ */
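
Outside of a UMA zone, the writer-side API declared above can defer reclamation directly: record the goal from uma_smr_advance() after unlinking an object, then uma_smr_wait() (or repeated uma_smr_poll()) before freeing it. A minimal sketch, assuming a hypothetical struct node allocated with malloc(9) and already unlinked by the caller.

/*
 * Sketch of deferred reclamation without an SMR UMA zone; struct node
 * and node_retire() are hypothetical and assume the node was allocated
 * with malloc(9) and already unlinked from every shared structure.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>

#include <vm/uma_smr.h>

struct node {
        struct node     *next;
        int             key;
};

static void
node_retire(uma_smr_t smr, struct node *n)
{
        uma_epoch_t goal;

        /*
         * Publish the removal and wait for every reader that might
         * still hold a reference; equivalent to uma_smr_synchronize().
         */
        goal = uma_smr_advance(smr);
        uma_smr_wait(smr, goal);

        free(n, M_TEMP);                /* now safe to reclaim */
}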
Index: sys/vm/uma_smr.c
===================================================================
--- /dev/null
+++ sys/vm/uma_smr.c
@@ -0,0 +1,312 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 Jeffrey Roberson <jeff@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/smp.h>
+
+#include <vm/uma.h>
+#include <vm/uma_smr.h>
+
+/*
+ * This is a novel safe memory reclamation technique inspired by
+ * epoch based reclamation from Samy Al Bahra's concurrency kit which
+ * in turn was based on work described in:
+ * Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University
+ * of Cambridge Computing Laboratory.
+ *
+ * This is not an implementation of hazard pointers or related
+ * techniques. The term safe memory reclamation is used as a
+ * generic descriptor.
+ *
+ * This approach could be called 'unbounded epoch' because we
+ * do not maintain the invariant that only 2 epochs are active.
+ * This tradeoff allows us to decouple advancing the epoch from
+ * waiting for threads to observe the epoch, which further allows
+ * deferring the expiration check until long after it is likely
+ * to require busy loops and many epochs may be batched and expired
+ * by a single scan. Waits for expired epochs may return immediately.
+ *
+ * This implements a two handed clock approach to epoch values. One
+ * hand, the global epoch, is incremented every time a write changes
+ * the state of the system. Another hand, min_epoch, keeps track of
+ * the oldest write that was observed by any cpu. Goal epochs that
+ * fall outside of this range have expired and wrapped. Special
+ * consideration is given to safely handle wrapping and stale epoch
+ * values.
+ *
+ * Writers that can naturally advance the epoch and wait for it to
+ * expire sometime later will batch together in a single wait. If
+ * this time exceeds the uma_smr_enter/exit section time the scan will
+ * never spinwait, allowing effective overlapping of latencies and
+ * relatively few traversals.
+ *
+ * By integrating with the allocator we avoid all of the callout
+ * queue machinery and are provided with an efficient way to batch
+ * epoch advancement and waiting. The allocator accumulates a full
+ * per-cpu cache of memory before advancing the epoch. It then
+ * delays waiting for this epoch to expire until the memory is
+ * selected for reuse. In this way we only increment the epoch
+ * value every 1/(cache size) frees and the waits are done long
+ * after the epoch has been expired so they need only be verified
+ * to account for pathological conditions.
+ *
+ * If the read overhead of accessing the global cacheline becomes
+ * burdensome an invariant TSC could be used in place of the
+ * epoch. The algorithm would then only need to maintain the minimum
+ * observed tsc. This would trade potential cache synchronization
+ * overhead for local serialization and computational expense.
+ */
+uma_zone_t uma_smr_g_zone;
+uma_zone_t uma_smr_zone;
+
+/*
+ * Modular arithmetic for comparing epoch numbers that have
+ * potentially wrapped. Copied from tcp_seq.h.
+ */
+#define EPOCH_LT(a, b) ((int)((a)-(b)) < 0)
+#define EPOCH_LEQ(a, b) ((int)((a)-(b)) <= 0)
+#define EPOCH_GT(a, b) ((int)((a)-(b)) > 0)
+#define EPOCH_GEQ(a, b) ((int)((a)-(b)) >= 0)
+
+#ifndef INVARIANTS
+#define EPOCH_INIT 1
+#define EPOCH_INCR 2
+#else
+/* We want to test the wrapping feature in invariants kernels. */
+#define EPOCH_INCR (UINT_MAX / 10000)
+#define EPOCH_INIT (UINT_MAX - 100000)
+#endif
+
+/*
+ * Advance the global epoch and return the new value for use as the
+ * wait goal. This guarantees that any changes made by the calling
+ * thread prior to this call will be visible to all threads after
+ * epoch_min meets or exceeds the return value.
+ */
+uma_epoch_t
+uma_smr_advance(uma_smr_t smr)
+{
+ uma_smr_g_t g;
+ uma_epoch_t goal;
+
+ /*
+ * Modifications not done in a smr section need to be visible
+ * before advancing the epoch.
+ */
+ atomic_thread_fence_rel();
+
+ /*
+ * Increment the global epoch value by 2. Since the epoch is
+ * initialized to 1 this means the only valid epoch values are
+ * odd and an observed value of 0 in a particular CPU means it
+ * is not currently in an epoch section.
+ */
+ g = smr->smr_global;
+ goal = atomic_fetchadd_int(&g->smrg_epoch, EPOCH_INCR) + EPOCH_INCR;
+
+ /*
+ * Force a synchronization here if the goal is getting too
+ * far ahead of the minimum observed epoch. This keeps the
+ * wrap detecting arithmetic working in pathological cases.
+ */
+ if (goal - g->smrg_epoch_min == INT_MAX / 2)
+ uma_smr_wait(smr, goal);
+
+ return (goal);
+}
+
+/*
+ * Loop updating epoch_min if it is below goal. If wait is true
+ * this will spin until the goal is met.
+ *
+ * Returns true if the goal is met and false if not.
+ */
+bool
+uma_smr_poll(uma_smr_t smr, uma_epoch_t goal, bool wait)
+{
+ uma_smr_g_t g;
+ uma_smr_t c;
+ uma_epoch_t epoch, gmin, newmin, e;
+ int i;
+ bool success;
+
+ /*
+ * Use a critical section so that we can avoid ABA races
+ * caused by long preemption sleeps.
+ */
+ critical_enter();
+ g = smr->smr_global;
+
+ /*
+ * Acquire barrier loads epoch after min so that we can not
+ * observe an updated minimum that is larger than the epoch.
+ */
+ gmin = atomic_load_acq_int(&g->smrg_epoch_min);
+
+ /*
+ * Load epoch prior to reading any of the per-cpu epochs to prevent
+ * stale comparisons.
+ */
+ epoch = atomic_load_acq_int(&g->smrg_epoch);
+
+ /*
+ * Detect whether the goal epoch has already been observed.
+ *
+ * The goal must be in the range of epoch >= goal >= epoch_min for
+ * it to be valid. If it is not then the caller held on to it and
+ * the integer wrapped. If we wrapped back within range the caller
+ * will harmlessly scan.
+ */
+ if (EPOCH_GEQ(gmin, goal) || EPOCH_LT(epoch, goal)) {
+ critical_exit();
+ return (true);
+ }
+
+ /*
+ * Loop until all cores have observed the goal epoch
+ * or have gone inactive. Keep track of the oldest
+ * epoch currently active.
+ */
+ success = true;
+ newmin = epoch;
+ CPU_FOREACH(i) {
+ c = zpcpu_get_cpu(smr, i);
+ e = 0;
+ for (;;) {
+ e = c->smr_epoch;
+ if (e == 0)
+ break;
+
+ /*
+ * XXX There is a race described in uma_smr.h that
+ * can lead to a stale epoch value but not stale data
+ * access. If we find a value out of range here we
+ * pin it to the current min to prevent it from
+ * advancing until that stale section has expired.
+ */
+ if (EPOCH_LT(e, gmin))
+ e = gmin;
+
+ /*
+ * Check to see if another thread has
+ * advanced the epoch while being careful
+ * to handle integer wrapping.
+ */
+ if (EPOCH_GEQ(e, goal))
+ break;
+
+ if (!wait) {
+ success = false;
+ break;
+ }
+ cpu_spinwait();
+ gmin = atomic_load_acq_int(&g->smrg_epoch_min);
+ }
+
+ /*
+ * Limit the minimum observed epoch whether we met the goal
+ * or not.
+ */
+ if (e != 0 && EPOCH_GT(newmin, e))
+ newmin = e;
+ }
+
+ /*
+ * Advance the min observed epoch as long as we observed the most
+ * recent one.
+ */
+ gmin = g->smrg_epoch_min;
+ do {
+ if (EPOCH_LEQ(newmin, gmin))
+ break;
+ } while (atomic_fcmpset_int(&g->smrg_epoch_min, &gmin, newmin) == 0);
+ critical_exit();
+
+ return (success);
+}
+
+void
+uma_smr_synchronize(uma_smr_t smr)
+{
+ uma_epoch_t goal;
+
+ goal = uma_smr_advance(smr);
+ uma_smr_poll(smr, goal, true);
+}
+
+uma_smr_t
+uma_smr_create(const char *name)
+{
+ uma_smr_t smr, c;
+ uma_smr_g_t g;
+ int i;
+
+ g = uma_zalloc(uma_smr_g_zone, M_WAITOK);
+ smr = uma_zalloc(uma_smr_zone, M_WAITOK);
+
+ g->smrg_name = name;
+ g->smrg_epoch_min = g->smrg_epoch = EPOCH_INIT;
+ /* Initialize all CPUs, not just those running. */
+ for (i = 0; i <= mp_maxid; i++) {
+ c = zpcpu_get_cpu(smr, i);
+ c->smr_epoch = 0;
+ c->smr_global = g;
+ }
+ atomic_thread_fence_seq_cst();
+
+ return (smr);
+}
+
+void
+uma_smr_destroy(uma_smr_t smr)
+{
+
+ uma_smr_synchronize(smr);
+ uma_zfree(uma_smr_g_zone, smr->smr_global);
+ uma_zfree(uma_smr_zone, smr);
+}
+
+/*
+ * Initialize the SMR zones at UMA startup.
+ */
+void
+uma_smr_init(void)
+{
+
+ uma_smr_g_zone = uma_zcreate("SMR GLOBAL", sizeof(struct uma_smr_g),
+ NULL, NULL, NULL, NULL, (CACHE_LINE_SIZE * 2) - 1, 0);
+ uma_smr_zone = uma_zcreate("SMR CPU", sizeof(struct uma_smr),
+ NULL, NULL, NULL, NULL, (CACHE_LINE_SIZE * 2) - 1, UMA_ZONE_PCPU);
+}