sys/amd64/amd64/pmap.c
Show First 20 Lines • Show All 3,004 Lines • ▼ Show 20 Lines | pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) | ||||
* local APIC is always uncached, so we don't need to flush | * local APIC is always uncached, so we don't need to flush | ||||
* for that range anyway. | * for that range anyway. | ||||
*/ | */ | ||||
if (pmap_kextract(sva) == lapic_paddr) | if (pmap_kextract(sva) == lapic_paddr) | ||||
return; | return; | ||||
if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { | if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { | ||||
/* | /* | ||||
* Do per-cache line flush. Use the sfence | * Do per-cache line flush. Use a locked | ||||
* instruction to insure that previous stores are | * instruction to insure that previous stores are | ||||
* included in the write-back. The processor | * included in the write-back. The processor | ||||
* propagates flush to other processors in the cache | * propagates flush to other processors in the cache | ||||
* coherence domain. | * coherence domain. | ||||
*/ | */ | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
scottph: What is the ordering that we're establishing here? clflushopt is ordered with respect to earlier writes to the same cacheline.
kib (author): Right, only to the same cacheline. We want this op to follow normal TSO rules of x86.
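A minimal sketch of kib's point, assuming the atomic_thread_fence_seq_cst(), clflushopt(), and vm_offset_t primitives already used in this file; the helper and its arguments are hypothetical and not part of the patch. clflushopt by itself is only ordered against older stores to the cache line it flushes, so the surrounding locked fences are what give the flush sequence ordinary TSO ordering with respect to all earlier stores:

```
/*
 * Hypothetical illustration only: the fences, not clflushopt itself,
 * order the flush against stores to other cache lines.
 */
static void
flush_after_unrelated_store(volatile uint64_t *other, volatile uint64_t *line)
{
	*other = 1;			/* store to an unrelated cache line */
	*line = 2;			/* store to the line being flushed */
	atomic_thread_fence_seq_cst();	/* also orders the store to *other */
	clflushopt((vm_offset_t)line);	/* alone, ordered only vs. *line */
	atomic_thread_fence_seq_cst();	/* flush done before later stores */
}
```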
for (; sva < eva; sva += cpu_clflush_line_size) | for (; sva < eva; sva += cpu_clflush_line_size) | ||||
clflushopt(sva); | clflushopt(sva); | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
} else { | } else { | ||||
/* | /* | ||||
* Writes are ordered by CLFLUSH on Intel CPUs. | * Writes are ordered by CLFLUSH on Intel CPUs. | ||||
*/ | */ | ||||
if (cpu_vendor_id != CPU_VENDOR_INTEL) | if (cpu_vendor_id != CPU_VENDOR_INTEL) | ||||
mfence(); | mfence(); | ||||
for (; sva < eva; sva += cpu_clflush_line_size) | for (; sva < eva; sva += cpu_clflush_line_size) | ||||
clflush(sva); | clflush(sva); | ||||
Show All 25 Lines | pmap_invalidate_cache_pages(vm_page_t *pages, int count) | ||||
bool useclflushopt; | bool useclflushopt; | ||||
useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; | useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; | ||||
if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || | if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || | ||||
((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) | ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) | ||||
pmap_invalidate_cache(); | pmap_invalidate_cache(); | ||||
else { | else { | ||||
if (useclflushopt) | if (useclflushopt) | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
else if (cpu_vendor_id != CPU_VENDOR_INTEL) | else if (cpu_vendor_id != CPU_VENDOR_INTEL) | ||||
mfence(); | mfence(); | ||||
cem: This is extra information in answer to my own question; I'm not requesting any change here. The AMD APM vol 3, rev 3.28 is slightly nuanced (p. 139). The mfence() sandwich as used today is required IFF the CPU does not support CLFLUSHOPT. If the CPU does support CLFLUSHOPT, CLFLUSH is ordered w.r.t. locked ops, fence instructions other than mfence, as well as same-cacheline {clflushopt, clflush, and writes}. So, the new logic looks correct to me: on future AMD models that support CLFLUSHOPT and have stronger CLFLUSH semantics, we'll just use CLFLUSHOPT anyway due to our existing preference, and the faster locked primitive is adequate. If there is some theoretical reason we might set useclflushopt=false on a platform with the cpuid bit set, then it might make sense to optimize the fencing on CLFLUSH. But I do not know any reason we would do that.
for (i = 0; i < count; i++) { | for (i = 0; i < count; i++) { | ||||
daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); | daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); | ||||
eva = daddr + PAGE_SIZE; | eva = daddr + PAGE_SIZE; | ||||
for (; daddr < eva; daddr += cpu_clflush_line_size) { | for (; daddr < eva; daddr += cpu_clflush_line_size) { | ||||
if (useclflushopt) | if (useclflushopt) | ||||
clflushopt(daddr); | clflushopt(daddr); | ||||
else | else | ||||
clflush(daddr); | clflush(daddr); | ||||
} | } | ||||
} | } | ||||
if (useclflushopt) | if (useclflushopt) | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
else if (cpu_vendor_id != CPU_VENDOR_INTEL) | else if (cpu_vendor_id != CPU_VENDOR_INTEL) | ||||
mfence(); | mfence(); | ||||
cem: Again, not a request for any change. I'm not sure the strong mfence is actually needed afterwards on AMD. The language (for both clflush and clflushopt in APM 3.28) is confusing to me; the CLFLUSHOPT language is identical to the CLFLUSH language. An invalidated prefetched cacheline doesn't sound like a correctness problem to me. On the other hand, the CLFLUSH section does explicitly say that non-CLFLUSHOPT models do not order CLFLUSH against LFENCE, SFENCE, or serializing instructions. So I'm not sure what cheaper store-store barrier would be safe. Maybe none.
} | } | ||||
} | } | ||||
void | void | ||||
pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) | pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) | ||||
{ | { | ||||
pmap_invalidate_cache_range_check_align(sva, eva); | pmap_invalidate_cache_range_check_align(sva, eva); | ||||
if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { | if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { | ||||
pmap_force_invalidate_cache_range(sva, eva); | pmap_force_invalidate_cache_range(sva, eva); | ||||
return; | return; | ||||
} | } | ||||
/* See comment in pmap_force_invalidate_cache_range(). */ | /* See comment in pmap_force_invalidate_cache_range(). */ | ||||
if (pmap_kextract(sva) == lapic_paddr) | if (pmap_kextract(sva) == lapic_paddr) | ||||
return; | return; | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
scottph: Isn't it the case that the writes we want to flush have either happened on the same processor, so CLWB is implicitly ordered with them, or they have happened on another processor and we have since migrated, and the ordering has been established by the thread lock?
kib (author): Again, we want this flush to be TSO-consistent with older writes. I suspect this is esp. important there, because the function was added for, and is used with, non-coherent hardware.
scottph: What I mean is: which earlier stores done by this logical processor and not in the range being flushed are important to be globally visible before cache flushing begins? The fence after makes sense to me (don't go on to tell somebody outside the coherence domain about data in memory until that data is actually in memory), but I don't yet see the objective of the fence before.
kib (author): The fence before the flushing ensures that the writes are properly ordered with the clearing of the cache line, including the writes before the flush. In fact, please look at the Intel software optimization manual rev. 042b, section 8.4.7, esp. example 8.2. (A sketch of that kind of sequence follows this function.)
for (; sva < eva; sva += cpu_clflush_line_size) | for (; sva < eva; sva += cpu_clflush_line_size) | ||||
clwb(sva); | clwb(sva); | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
cem: The AMD APM Vol 3 3.28 language for CLWB is:

> The CLWB instruction is weakly ordered with …

The "such as" suggests SFENCE is not the only option, but I'm not sure on the semantics of "store-ordering instructions." It is the only use of the term in the document. Do locked instructions count as "store-ordering"? (It's not relevant to CLWB, but sort of similar: the CLZERO language says something slightly different.)
} | } | ||||
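For context on the Intel optimization-manual example kib cites above, here is a rough sketch of that kind of write-back and publish sequence, built from the primitives this file already uses; the function, its arguments, and the assumption that dst is cache-line aligned are hypothetical and not something this patch adds:

```
static void
publish_range(char *dst, const char *src, size_t len, volatile uint64_t *flag)
{
	size_t off;

	memcpy(dst, src, len);
	atomic_thread_fence_seq_cst();	/* data stores ordered before write-backs */
	for (off = 0; off < len; off += cpu_clflush_line_size)
		clwb((vm_offset_t)(dst + off));
	atomic_thread_fence_seq_cst();	/* write-backs ordered before the flag */
	*flag = 1;			/* consumers may now rely on the data */
}
```

The leading fence matters for the reason discussed in the inline comments: clwb is only ordered against older stores to the line it writes back, so without the fence the write-backs could be observed ahead of earlier, unrelated stores.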
void | void | ||||
pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) | pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) | ||||
{ | { | ||||
pt_entry_t *pte; | pt_entry_t *pte; | ||||
vm_offset_t vaddr; | vm_offset_t vaddr; | ||||
int error, pte_bits; | int error, pte_bits; | ||||
Show All 16 Lines | pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) | ||||
error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, | error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, | ||||
&vaddr); | &vaddr); | ||||
KASSERT(error == 0, ("vmem_alloc failed: %d", error)); | KASSERT(error == 0, ("vmem_alloc failed: %d", error)); | ||||
pte = vtopte(vaddr); | pte = vtopte(vaddr); | ||||
for (; spa < epa; spa += PAGE_SIZE) { | for (; spa < epa; spa += PAGE_SIZE) { | ||||
sched_pin(); | sched_pin(); | ||||
pte_store(pte, spa | pte_bits); | pte_store(pte, spa | pte_bits); | ||||
invlpg(vaddr); | invlpg(vaddr); | ||||
/* XXXKIB sfences inside flush_cache_range are excessive */ | /* XXXKIB atomic inside flush_cache_range are excessive */ | ||||
pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); | pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); | ||||
cem: `static inline` variants that take a `bool barrier` could be added. (A rough sketch follows this function.)
sched_unpin(); | sched_unpin(); | ||||
} | } | ||||
vmem_free(kernel_arena, vaddr, PAGE_SIZE); | vmem_free(kernel_arena, vaddr, PAGE_SIZE); | ||||
} | } | ||||
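A sketch of what cem's suggestion could look like; the helper name and the split are hypothetical, and the CLWB feature and lapic checks done by the real pmap_flush_cache_range() are omitted for brevity. It only illustrates how the redundant fences flagged by the XXXKIB comment could be avoided:

```
/* Hypothetical barrier-optional variant; not part of this revision. */
static inline void
pmap_flush_cache_range_impl(vm_offset_t sva, vm_offset_t eva, bool barrier)
{
	if (barrier)
		atomic_thread_fence_seq_cst();
	for (; sva < eva; sva += cpu_clflush_line_size)
		clwb(sva);
	if (barrier)
		atomic_thread_fence_seq_cst();
}
```

pmap_flush_cache_phys_range() could then fence once around its whole per-page loop and pass barrier = false from inside it.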
/* | /* | ||||
* Routine: pmap_extract | * Routine: pmap_extract | ||||
* Function: | * Function: | ||||
▲ Show 20 Lines • Show All 5,545 Lines • ▼ Show 20 Lines | |||||
void | void | ||||
pmap_activate_sw(struct thread *td) | pmap_activate_sw(struct thread *td) | ||||
{ | { | ||||
pmap_t oldpmap, pmap; | pmap_t oldpmap, pmap; | ||||
u_int cpuid; | u_int cpuid; | ||||
oldpmap = PCPU_GET(curpmap); | oldpmap = PCPU_GET(curpmap); | ||||
pmap = vmspace_pmap(td->td_proc->p_vmspace); | pmap = vmspace_pmap(td->td_proc->p_vmspace); | ||||
if (oldpmap == pmap) | if (oldpmap == pmap) { | ||||
mfence(); | |||||
scottph: Won't the thread lock already establish the ordering that we want here?
kib (author): Not on AMD.
scottph: Here's the execution trace I'm considering; tell me if it's wrong or there's something I'm missing:
kib (author): Unfortunately the AMD manual is self-contradicting. Please look at the description of the CLFLUSH instruction in vol. 3, specifically the paragraph explaining CLFLUSH ordering for CPUs which do not implement CLFLUSHOPT.
scottph: I see, you're looking at:

> The CLFLUSH instruction may also take effect on a cache line
> …

I think the case we're discussing probably still can't cause this CLFLUSH to miss those earlier stores, because of the migration. That is, even though this CLFLUSH will pass store-ordering instructions, those stores from the other processor's store buffer have to have been flushed so that the new processor is able to see that the thread is available for migration. But here I don't think we can build a fully airtight case for that, because this CLFLUSH is so weakly ordered. As specified, it's free to run back in time as far as the last mfence, which... who knows. So I doubt this could happen in practice, but as specified it can.
return; | return; | ||||
} | |||||
cpuid = PCPU_GET(cpuid); | cpuid = PCPU_GET(cpuid); | ||||
#ifdef SMP | #ifdef SMP | ||||
CPU_SET_ATOMIC(cpuid, &pmap->pm_active); | CPU_SET_ATOMIC(cpuid, &pmap->pm_active); | ||||
#else | #else | ||||
CPU_SET(cpuid, &pmap->pm_active); | CPU_SET(cpuid, &pmap->pm_active); | ||||
#endif | #endif | ||||
pmap_activate_sw_mode(td, pmap, cpuid); | pmap_activate_sw_mode(td, pmap, cpuid); | ||||
#ifdef SMP | #ifdef SMP | ||||
▲ Show 20 Lines • Show All 676 Lines • ▼ Show 20 Lines | |||||
static void | static void | ||||
pmap_large_map_wb_fence_mfence(void) | pmap_large_map_wb_fence_mfence(void) | ||||
{ | { | ||||
mfence(); | mfence(); | ||||
} | } | ||||
static void | static void | ||||
pmap_large_map_wb_fence_sfence(void) | pmap_large_map_wb_fence_atomic(void) | ||||
{ | { | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
} | } | ||||
static void | static void | ||||
pmap_large_map_wb_fence_nop(void) | pmap_large_map_wb_fence_nop(void) | ||||
{ | { | ||||
} | } | ||||
DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) | DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) | ||||
{ | { | ||||
if (cpu_vendor_id != CPU_VENDOR_INTEL) | if (cpu_vendor_id != CPU_VENDOR_INTEL) | ||||
return (pmap_large_map_wb_fence_mfence); | return (pmap_large_map_wb_fence_mfence); | ||||
else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | | else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | | ||||
CPUID_STDEXT_CLFLUSHOPT)) == 0) | CPUID_STDEXT_CLFLUSHOPT)) == 0) | ||||
return (pmap_large_map_wb_fence_sfence); | return (pmap_large_map_wb_fence_atomic); | ||||
else | else | ||||
/* clflush is strongly enough ordered */ | /* clflush is strongly enough ordered */ | ||||
return (pmap_large_map_wb_fence_nop); | return (pmap_large_map_wb_fence_nop); | ||||
cem: Orthogonal to this revision, but I think this logic may be sort of incorrect and should more closely mirror the pmap_large_map_flush_range ifunc selection. (Condition first on features, then vendor if we must.) mfence is stronger than needed on at least some AMD CPUs, or may be needed on one side but not the other. (A rough feature-first sketch follows below.)
} | } | ||||
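Not something this revision changes, but to illustrate the feature-first ordering cem describes, the selector might look roughly like the following. Whether each branch is exactly right on every CPU is precisely what the inline discussion above is about, so treat this as a sketch only:

```
/* Hypothetical feature-first variant of the selector above. */
DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void))
{
	if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB |
	    CPUID_STDEXT_CLFLUSHOPT)) != 0)
		/* Weakly ordered write-backs: fence with a locked op. */
		return (pmap_large_map_wb_fence_atomic);
	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
		/* Writes are ordered by CLFLUSH on Intel CPUs. */
		return (pmap_large_map_wb_fence_nop);
	else
		/* Pre-CLFLUSHOPT non-Intel CPUs need the mfence sandwich. */
		return (pmap_large_map_wb_fence_mfence);
}
```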
static void | static void | ||||
pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) | pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) | ||||
{ | { | ||||
for (; len > 0; len -= cpu_clflush_line_size, | for (; len > 0; len -= cpu_clflush_line_size, | ||||
va += cpu_clflush_line_size) | va += cpu_clflush_line_size) | ||||
▲ Show 20 Lines • Show All 1,028 Lines • Show Last 20 Lines |