sys/vm/uma_smr.c
- This file was added.
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2019 Jeffrey Roberson <jeff@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/smp.h>

#include <vm/uma.h>
#include <vm/uma_smr.h>

/*
 * This is a novel safe memory reclamation technique inspired by
 * epoch based reclamation from Samy Al Bahra's concurrency kit which
 * in turn was based on work described in:
 *	Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University
 *	of Cambridge Computing Laboratory.
 *
 * This is not an implementation of hazard pointers or related
 * techniques.  The term safe memory reclamation is used as a
 * generic descriptor.
 *
 * This approach could be called 'unbounded epoch' because we do not
 * maintain the invariant that only 2 epochs are active.  This tradeoff
 * allows us to decouple advancing the epoch from waiting for threads
 * to observe the epoch.  This in turn allows the expiration check to
 * be deferred until it is unlikely to require busy loops, and many
 * epochs may be batched and expired by a single scan.  Waits for
 * already expired epochs may return immediately.
 *
 * This implements a two-handed clock approach to epoch values.  One
 * hand, the global epoch, is incremented every time a write changes
 * the state of the system.  The other hand, epoch_min, keeps track of
 * the oldest write that was observed by any CPU.  Goal epochs that
 * fall outside of this range have expired and wrapped.  Special
 * consideration is given to safely handle wrapping and stale epoch
 * values.
 *
 * Writers that can naturally advance the epoch and wait for it to
 * expire sometime later will batch together in a single wait.  If
 * this time exceeds the zsmr_enter/exit section time the scan will
 * never spinwait, allowing effective overlapping of latencies and
 * relatively few traversals.
 *
 * By integrating with the allocator we avoid all of the callout
 * queue machinery and are provided with an efficient way to batch
 * epoch advancement and waiting.  The allocator accumulates a full
 * per-cpu cache of memory before advancing the epoch.  It then
 * delays waiting for this epoch to expire until the memory is
 * selected for reuse.  In this way we only increment the epoch
 * value every 1/(cache size) frees and the waits are done long
 * after the epoch has been expired so they need only be verified
 * to account for pathological conditions.
 *
 * If the read overhead of accessing the global cacheline becomes
 * burdensome an invariant TSC could be used in place of the
 * epoch.  The algorithm would then only need to maintain the minimum
 * observed TSC.  This would trade potential cache synchronization
 * overhead for local serialization and computational expense.
 */
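
/*
 * A minimal sketch of the deferred write-side pattern described above,
 * assuming a caller that batches freed memory and records a wait goal
 * alongside it.  The structure and helper names (smr_free_batch,
 * fb_retire, fb_reuse) are hypothetical; only uma_smr_advance() and
 * uma_smr_poll() below are real.
 *
 *	struct smr_free_batch {
 *		uma_epoch_t	 fb_goal;	// epoch to wait for
 *		void		*fb_items[64];	// memory awaiting reuse
 *	};
 *
 *	// At free time: advance the epoch and remember the goal.
 *	// No waiting happens here.
 *	static void
 *	fb_retire(uma_smr_t smr, struct smr_free_batch *fb)
 *	{
 *		fb->fb_goal = uma_smr_advance(smr);
 *	}
 *
 *	// Much later, at reuse time: by now the goal has almost always
 *	// expired, so this rarely spins.
 *	static void
 *	fb_reuse(uma_smr_t smr, struct smr_free_batch *fb)
 *	{
 *		uma_smr_poll(smr, fb->fb_goal, true);
 *		// fb_items[] may now be reused safely.
 *	}
 */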

uma_zone_t uma_smr_g_zone;
uma_zone_t uma_smr_zone;

/*
 * Modular arithmetic for comparing epoch numbers that have
 * potentially wrapped.  Copied from tcp_seq.h.
 */
#define EPOCH_LT(a, b)		((int)((a)-(b)) < 0)
#define EPOCH_LEQ(a, b)		((int)((a)-(b)) <= 0)
#define EPOCH_GT(a, b)		((int)((a)-(b)) > 0)
#define EPOCH_GEQ(a, b)		((int)((a)-(b)) >= 0)

#ifndef INVARIANTS
#define EPOCH_INIT		1
#define EPOCH_INCR		2
#else
/* We want to test the wrapping feature in INVARIANTS kernels. */
#define EPOCH_INCR		(UINT_MAX / 10000)
#define EPOCH_INIT		(UINT_MAX - 100000)
#endif
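
/*
 * Worked example of the wrap-safe comparisons above: the macros compare
 * epochs through a signed interpretation of their unsigned difference,
 * so an epoch recorded just before the counter wraps still compares as
 * less than one recorded just after.  With 32-bit epochs:
 *
 *	EPOCH_LT(UINT_MAX - 1, 3)
 *	    == ((int)((UINT_MAX - 1) - 3) < 0)
 *	    == ((int)0xfffffffb < 0)	// -5 in two's complement
 *	    == true
 *
 * This only holds while the two values are within INT_MAX of each
 * other, which is why uma_smr_advance() below forces a synchronization
 * well before the goal drifts that far from the observed minimum.
 */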

/*
 * Advance the global epoch and return the new value for use as the
 * wait goal.  This guarantees that any changes made by the calling
 * thread prior to this call will be visible to all threads after
 * epoch_min meets or exceeds the return value.
 */
uma_epoch_t
uma_smr_advance(uma_smr_t smr)
{
	uma_smr_g_t g;
	uma_epoch_t goal;

	/*
	 * Modifications not done in a smr section need to be visible
	 * before advancing the epoch.
	 */
hselasky: Need to cast or compute this difference in its own variable to avoid the compiler making predictions?
jeff: Can you elaborate?  I'm not sure I follow.
hselasky: From what I understand the counter g->smrg_epoch_min will eventually wrap around, so that the subtraction gets a sign overflow, i.e. you are subtracting a higher unsigned value from a lower one, which is undefined behaviour from what I understand.  Then you either need to cast the subtraction or put it into a temporary unsigned variable.  @kib can probably explain this better than I can.  See also the commit which added sys/conf/kern.mk: CFLAGS+= -fwrapv.
kib: Yes, -fwrapv should make the signed arithmetic wrap on overflow, same as unsigned.  Of course this is a less-tested compiler option, so the usual approach is to avoid undefined behavior either by changing the types of the epoch counters to unsigned ints, or at least casting the expression elements to unsigned.
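
A sketch of kib's suggestion (one possible reading, not what the patch does): let the wrap happen in explicitly unsigned, fixed-width arithmetic and only convert the final difference to a signed type for the sign test, e.g.:

	#define EPOCH_LT(a, b)	((int32_t)((uint32_t)(a) - (uint32_t)(b)) < 0)

This keeps the subtraction free of signed overflow regardless of how uma_epoch_t is declared; only the conversion of an out-of-range difference to a signed type remains, which is implementation-defined rather than undefined.
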
	atomic_thread_fence_rel();
	/*
	 * Increment the global epoch value by EPOCH_INCR.  Since the
	 * epoch is initialized to an odd value and the increment is
	 * even, the only valid epoch values are odd and an observed
	 * value of 0 in a particular CPU means it is not currently in
	 * an epoch section.
	 */
	g = smr->smr_global;
	goal = atomic_fetchadd_int(&g->smrg_epoch, EPOCH_INCR) + EPOCH_INCR;
	/*
	 * Force a synchronization here if the goal is getting too
	 * far ahead of the minimum observed epoch.  This keeps the
	 * wrap detecting arithmetic working in pathological cases.
	 */
	if (goal - g->smrg_epoch_min == INT_MAX / 2)
		uma_smr_wait(smr, goal);
	return (goal);
}

/*
 * Loop updating epoch_min if it is below goal.  If wait is true
 * this will spin until the goal is met.
 *
 * Returns true if the goal is met and false if not.
 */
bool
uma_smr_poll(uma_smr_t smr, uma_epoch_t goal, bool wait)
{
	uma_smr_g_t g;
	uma_smr_t c;
	uma_epoch_t epoch, gmin, newmin, e;
	int i;
	bool success;

	/*
	 * Use a critical section so that we can avoid ABA races
	 * caused by long preemption sleeps.
	 */
	critical_enter();
	g = smr->smr_global;
	/*
	 * Acquire barrier loads epoch after min so that we can not
	 * observe an updated minimum that is larger than the epoch.
	 */
hselasky: You probably need to cast these differences or compute them into their own variable to avoid predictions made by the compiler.
	gmin = atomic_load_acq_int(&g->smrg_epoch_min);
	/*
	 * Load epoch prior to reading any of the per-cpu epochs to prevent
	 * stale comparisons.
	 */
	epoch = atomic_load_acq_int(&g->smrg_epoch);
	/*
	 * Detect whether the goal epoch has already been observed.
	 *
	 * The goal must be in the range of epoch >= goal >= epoch_min for
	 * it to be valid.  If it is not then the caller held on to it and
	 * the integer wrapped.  If we wrapped back within range the caller
	 * will harmlessly scan.
	 */
	if (EPOCH_GEQ(gmin, goal) || EPOCH_LT(epoch, goal)) {
		critical_exit();
		return (true);
	}
	/*
	 * Loop until all cores have observed the goal epoch
	 * or have gone inactive.  Keep track of the oldest
	 * epoch currently active.
	 */
hselasky: Ditto.
	success = true;
	newmin = epoch;
	CPU_FOREACH(i) {
		c = zpcpu_get_cpu(smr, i);
		e = 0;
		for (;;) {
			e = c->smr_epoch;
			if (e == 0)
				break;
markj: Should we perhaps use a named constant akin to EPOCH_INIT here?  EPOCH_INACTIVE?
jeff: Sure can.
			/*
			 * XXX There is a race described in uma_smr.h that
			 * can lead to a stale epoch value but not stale data
			 * access.  If we find a value out of range here we
			 * pin it to the current min to prevent it from
			 * advancing until that stale section has expired.
			 */
hselasky: Ditto.
			if (EPOCH_LT(e, gmin))
				e = gmin;
			/*
			 * Check to see if another thread has
			 * advanced the epoch while being careful
			 * to handle integer wrapping.
			 */
			if (EPOCH_GEQ(e, goal))
				break;
			if (!wait) {
				success = false;
				break;
			}
			cpu_spinwait();
			gmin = atomic_load_acq_int(&g->smrg_epoch_min);
		}
		/*
		 * Limit the minimum observed epoch whether we met the goal
		 * or not.
		 */
		if (e != 0 && EPOCH_GT(newmin, e))
			newmin = e;
	}
	/*
	 * Advance the min observed epoch as long as we observed the most
	 * recent one.
	 */
	gmin = g->smrg_epoch_min;
	do {
		if (EPOCH_LEQ(newmin, gmin))
			break;
	} while (atomic_fcmpset_int(&g->smrg_epoch_min, &gmin, newmin) == 0);
	critical_exit();
	return (success);
}
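
/*
 * A minimal sketch of a non-blocking uma_smr_poll() check, assuming a
 * hypothetical bucket of deferred memory with a recorded goal (ub_goal)
 * and a fallback helper; only uma_smr_poll() is real.
 *
 *	// Reuse this bucket only if its goal has already expired;
 *	// otherwise look elsewhere instead of spinning.
 *	if (!uma_smr_poll(smr, bucket->ub_goal, false))
 *		bucket = choose_other_bucket();
 */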

void
uma_smr_synchronize(uma_smr_t smr)
{
	uma_epoch_t goal;

	goal = uma_smr_advance(smr);
	uma_smr_poll(smr, goal, true);
}
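
/*
 * A minimal sketch of the classic blocking use of uma_smr_synchronize(),
 * assuming hypothetical object and list names; only the SMR call is real.
 *
 *	// Writer: unlink the object so no new readers can find it,
 *	// wait out any readers that may still hold a reference to it,
 *	// then free it.
 *	LIST_REMOVE(obj, link);
 *	uma_smr_synchronize(smr);
 *	free(obj, M_TEMP);
 */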

uma_smr_t
uma_smr_create(const char *name)
{
	uma_smr_t smr, c;
	uma_smr_g_t g;
	int i;

	g = uma_zalloc(uma_smr_g_zone, M_WAITOK);
	smr = uma_zalloc(uma_smr_zone, M_WAITOK);
	g->smrg_name = name;
	g->smrg_epoch_min = g->smrg_epoch = EPOCH_INIT;
	/* Initialize all CPUs, not just those running. */
	for (i = 0; i <= mp_maxid; i++) {
		c = zpcpu_get_cpu(smr, i);
		c->smr_epoch = 0;
		c->smr_global = g;
	}
	atomic_thread_fence_seq_cst();

	return (smr);
}

void
uma_smr_destroy(uma_smr_t smr)
{

	uma_smr_synchronize(smr);
	uma_zfree(uma_smr_g_zone, smr->smr_global);
	uma_zfree(uma_smr_zone, smr);
}

/*
 * Initialize the SMR zones.
 */
void
uma_smr_init(void)
{

	uma_smr_g_zone = uma_zcreate("SMR GLOBAL", sizeof(struct uma_smr_g),
	    NULL, NULL, NULL, NULL, (CACHE_LINE_SIZE * 2) - 1, 0);
	uma_smr_zone = uma_zcreate("SMR CPU", sizeof(struct uma_smr),
	    NULL, NULL, NULL, NULL, (CACHE_LINE_SIZE * 2) - 1, UMA_ZONE_PCPU);
}