sys/kern/kern_membarrier.c
- This file was added.
/*-
 * Copyright (c) 2021 The FreeBSD Foundation
 *
 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/lock.h>
#include <sys/malloc.h>		/* malloc(9)/free(9) and M_TEMP below */
#include <sys/membarrier.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>

#include <vm/vm_param.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#define	MEMBARRIER_SUPPORTED_CMDS	( \
    MEMBARRIER_CMD_GLOBAL | \
    MEMBARRIER_CMD_GLOBAL_EXPEDITED | \
    MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED | \
    MEMBARRIER_CMD_PRIVATE_EXPEDITED | \
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED | \
    MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE | \
    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
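
MEMBARRIER_CMD_QUERY is absent from this mask because its value is zero; it exists so userland can probe which commands a kernel supports before relying on one. A minimal probe might look like the sketch below. This is not part of the change: it assumes the Linux-compatible command values from <sys/membarrier.h> and a SYS_membarrier syscall number wired up by the companion syscalls.master change, and the membarrier() wrapper (reused by the later sketches) is hypothetical, since no libc stub exists at the time of this review.

```c
#include <sys/types.h>
#include <sys/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical wrapper around the raw syscall; no libc stub yet. */
static int
membarrier(int cmd, unsigned flags, int cpu_id)
{
	return (syscall(SYS_membarrier, cmd, flags, cpu_id));
}

/* QUERY returns the supported-command mask, or -1 on error. */
static int
have_private_expedited(void)
{
	int mask = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);

	return (mask > 0 &&
	    (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED) != 0);
}
```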
/* Rendezvous action: execute a full memory fence on the target CPU. */
static void
membarrier_action_seqcst(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
}

/* Same, plus serialize the core so stale instruction fetch is discarded. */
static void
membarrier_action_seqcst_sync_core(void *arg __unused)
{
	atomic_thread_fence_seq_cst();
	cpu_sync_core();
}

/*
 * Fence locally, IPI the CPUs in *csp to run func, then fence again so
 * the caller is ordered against the remote actions on both sides.
 */
static void
do_membarrier_ipi(cpuset_t *csp, void (*func)(void *))
{
	atomic_thread_fence_seq_cst();
	smp_rendezvous_cpus(*csp, smp_no_rendezvous_barrier, func,
	    smp_no_rendezvous_barrier, NULL);
	atomic_thread_fence_seq_cst();
}
/*
 * Decide whether CPU c can be considered covered: it is either idle or
 * has context-switched since its switch time was sampled.  Covered CPUs
 * accumulate in *csp; on the init pass the switch times are only
 * recorded into swt[].
 */
static bool
check_cpu_switched(int c, cpuset_t *csp, uint64_t *swt, bool init)
{
	struct pcpu *pc;
	uint64_t sw;

	if (CPU_ISSET(c, csp))
		return (false);

	pc = cpuid_to_pcpu[c];
	if (pc->pc_curthread == pc->pc_idlethread) {
		CPU_SET(c, csp);
		return (true);
	}

	/*
	 * Sync with context switch to ensure that override of
	 * pc_curthread with non-idle thread pointer is visible before
	 * reading of pc_switchtime.
	 */
	atomic_thread_fence_acq();
	sw = pc->pc_switchtime;
	if (init) {
		swt[c] = sw;
	} else if (sw != swt[c]) {
		CPU_SET(c, csp);
		return (true);
	}

theraven: This is functionally correct, but does not have the performance characteristics that are expected from the Linux documentation. The non-_EXPEDITED versions are allowed to block and are expected to wait until all of the other cores have done at least one context switch. Callers are likely to expect this to potentially take one scheduling quantum to return but not to disrupt the other threads' execution at all.

kib: In theory this is unbounded sleep, but ok. I changed the _GLOBAL implementation to wait for each core to do at least one non-idle context switch.

	return (false);
}
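
theraven's expectation above, that a non-expedited call may take about one scheduling quantum without disturbing the other cores, can be checked from userland. A hedged sketch, reusing the hypothetical membarrier() wrapper from the earlier sketch:

```c
#include <time.h>

/* Time one blocking MEMBARRIER_CMD_GLOBAL call, in seconds. */
static double
time_membarrier_global(void)
{
	struct timespec a, b;

	clock_gettime(CLOCK_MONOTONIC, &a);
	/* Returns once every other CPU idled or context-switched. */
	membarrier(MEMBARRIER_CMD_GLOBAL, 0, 0);
	clock_gettime(CLOCK_MONOTONIC, &b);
	return ((b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) * 1e-9);
}
```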

/*
 * XXXKIB: We execute the requested action (seq_cst and possibly
 * sync_core) on the current CPU as well.  There is no guarantee that
 * the current thread executes anything with the full fence semantics
 * during syscall execution.  Similarly, cpu_sync_core() semantics
 * might not be provided by the syscall return path.  E.g. on amd64 we
 * typically return without IRET.
 */
int
kern_membarrier(struct thread *td, int cmd, unsigned flags, int cpu_id)
{
	struct proc *p, *p1;
	struct thread *td1;
	cpuset_t cs;
	uint64_t *swt;
	int c, error;
	bool changed;

	if (flags != 0 || (cmd & ~MEMBARRIER_SUPPORTED_CMDS) != 0)
		return (EINVAL);

	if (cmd == MEMBARRIER_CMD_QUERY) {
		td->td_retval[0] = MEMBARRIER_SUPPORTED_CMDS;
		return (0);
	}

	p = td->td_proc;

theraven: If I understand correctly (some comments would be nice so I could check my understanding), this is collecting the set of CPUs that are running threads in the same process. It would be worth a comment explaining the inherent optimisation. For functional correctness, this does not need to be an accurate snapshot of the set of CPUs: it can be a racy access because any thread that is [de]scheduled out from under you during the check is implicitly meeting the requirements (anything that does a context switch is sufficiently ordered).

kib: Technically this is not just running threads, it is also e.g. rfork()ed siblings. A more correct explanation is the set of threads sharing the same address space. I believe this is identical to the set which is IPI-ed by Linux.

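kib's point about rfork() siblings can be made concrete: the private expedited commands target an address space, not a process. A hedged userland sketch of two processes that share one address space and so are both reached by the pmap-derived IPI mask below (real code must give the child its own stack, cf. rfork_thread(3)):

```c
#include <sys/types.h>
#include <err.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid;

	/*
	 * RFMEM makes the child share the parent's address space, so
	 * both processes run on the same pmap and a private expedited
	 * membarrier issued by either one covers the CPUs running the
	 * other.
	 */
	pid = rfork(RFPROC | RFMEM);
	if (pid == -1)
		err(1, "rfork");
	if (pid == 0) {
		/* Child: exit immediately; the shared stack makes any
		 * further work here unsafe without rfork_thread(3). */
		_exit(0);
	}
	return (0);
}
```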
	error = 0;
	switch (cmd) {
	case MEMBARRIER_CMD_GLOBAL:
		/* mp_maxid is the highest CPU id, hence the + 1. */
		swt = malloc((mp_maxid + 1) * sizeof(*swt), M_TEMP, M_WAITOK);
		CPU_ZERO(&cs);
		sched_pin();
		CPU_SET(PCPU_GET(cpuid), &cs);
		CPU_FOREACH(c)
			check_cpu_switched(c, &cs, swt, true);
		if (CPU_CMP(&cs, &all_cpus) != 0) {
			/*
			 * Wait until every CPU not yet covered performs
			 * a context switch or is found idle.
			 */
			for (;;) {
				changed = false;
				CPU_FOREACH(c) {
					changed |= check_cpu_switched(c,
					    &cs, swt, false);
				}
				if (changed && CPU_CMP(&cs, &all_cpus) == 0)
					break;
				if (!changed)
					pause("mmbr", 1);
			}
		}
		sched_unpin();
		free(swt, M_TEMP);
		atomic_thread_fence_seq_cst();
		break;

	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			error = EPERM;
		} else {
			CPU_ZERO(&cs);
			CPU_FOREACH(c) {
				td1 = cpuid_to_pcpu[c]->pc_curthread;
				p1 = td1->td_proc;
				if (p1 != NULL &&
				    (p1->p_flag2 & P2_MEMBAR_GLOBE) != 0)
					CPU_SET(c, &cs);
			}
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_GLOBE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			error = EPERM;
		} else {
			/*
			 * The pmap active mask is a racy snapshot of the
			 * CPUs running this address space, which is fine:
			 * any CPU switched from or to meanwhile performs
			 * a context switch, which is sufficiently ordered
			 * on its own.
			 */
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
			do_membarrier_ipi(&cs, membarrier_action_seqcst);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE;
			PROC_UNLOCK(p);
		}
		break;

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			error = EPERM;
		} else {
			/*
			 * Calculating the IPI multicast mask from the
			 * pmap active mask means that we do not call
			 * cpu_sync_core() on CPUs that were missed from
			 * the pmap active mask but could be switched
			 * from or to in the meantime.  This is fine at
			 * least on amd64 because threads always use the
			 * slow (IRETQ) path to return from a syscall
			 * after a context switch.
			 */
			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
			do_membarrier_ipi(&cs,
			    membarrier_action_seqcst_sync_core);
		}
		break;

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
			PROC_LOCK(p);
			p->p_flag2 |= P2_MEMBAR_PRIVE_SYNCORE;
			PROC_UNLOCK(p);
		}
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}

int
sys_membarrier(struct thread *td, struct membarrier_args *uap)
{
	return (kern_membarrier(td, uap->cmd, uap->flags, uap->cpu_id));
}
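
The registration cases above explain the EPERM checks: a process must opt in before using an expedited command. To show how the pieces fit together from userland, here is a hedged sketch of the asymmetric-fence pattern that private expedited membarriers are designed for, again reusing the hypothetical membarrier() wrapper from the first sketch; the hot path pays only a compiler barrier, and the cold path buys full ordering for both sides.

```c
#include <stdatomic.h>

static _Atomic int fastpath_flag, slowpath_flag;

static void
membar_setup(void)
{
	/* Opt in once up front; otherwise the command returns EPERM. */
	membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);
}

/* Hot path: the full fence is replaced by a compiler-only barrier. */
static int
fast_side(void)
{
	atomic_store_explicit(&fastpath_flag, 1, memory_order_relaxed);
	atomic_signal_fence(memory_order_seq_cst);
	return (atomic_load_explicit(&slowpath_flag, memory_order_relaxed));
}

/*
 * Cold path: the membarrier IPIs interpose a full fence on every CPU
 * running this address space, upgrading the fast side's ordering.
 */
static int
slow_side(void)
{
	atomic_store_explicit(&slowpath_flag, 1, memory_order_relaxed);
	membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
	return (atomic_load_explicit(&fastpath_flag, memory_order_relaxed));
}
```

If membar_setup() ran first, concurrent calls to fast_side() and slow_side() cannot both return 0, the same Dekker-style guarantee that seq_cst fences on both sides would give, without a fence instruction on the hot path.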