sys/amd64/amd64/pmap.c
Show First 20 Lines • Show All 3,004 Lines • ▼ Show 20 Lines | pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) | ||||
* local APIC is always uncached, so we don't need to flush | * local APIC is always uncached, so we don't need to flush | ||||
* for that range anyway. | * for that range anyway. | ||||
*/ | */ | ||||
if (pmap_kextract(sva) == lapic_paddr) | if (pmap_kextract(sva) == lapic_paddr) | ||||
return; | return; | ||||
if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { | if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { | ||||
/* | /* | ||||
* Do per-cache line flush. Use the sfence | * Do per-cache line flush. Use a locked | ||||
* instruction to insure that previous stores are | * instruction to insure that previous stores are | ||||
* included in the write-back. The processor | * included in the write-back. The processor | ||||
* propagates flush to other processors in the cache | * propagates flush to other processors in the cache | ||||
* coherence domain. | * coherence domain. | ||||
*/ | */ | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
scottph: What is the ordering that we're establishing here? clflushopt is ordered with respect to earlier writes to the same cacheline.
kib (author): Right, only to the same cacheline. We want this op to follow normal TSO rules of x86.
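A minimal sketch of kib's point, assuming the atomic_thread_fence_seq_cst(), clflushopt(), and vm_offset_t primitives already used in this file; the helper and its arguments are hypothetical and not part of the patch. clflushopt by itself is only ordered against older stores to the cache line it flushes, so the surrounding locked fences are what give the flush sequence ordinary TSO ordering with respect to all earlier stores:

```
/*
 * Hypothetical illustration only: the fences, not clflushopt itself,
 * order the flush against stores to other cache lines.
 */
static void
flush_after_unrelated_store(volatile uint64_t *other, volatile uint64_t *line)
{
	*other = 1;			/* store to an unrelated cache line */
	*line = 2;			/* store to the line being flushed */
	atomic_thread_fence_seq_cst();	/* also orders the store to *other */
	clflushopt((vm_offset_t)line);	/* alone, ordered only vs. *line */
	atomic_thread_fence_seq_cst();	/* flush done before later stores */
}
```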
for (; sva < eva; sva += cpu_clflush_line_size) | for (; sva < eva; sva += cpu_clflush_line_size) | ||||
clflushopt(sva); | clflushopt(sva); | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
} else { | } else { | ||||
/* | /* | ||||
* Writes are ordered by CLFLUSH on Intel CPUs. | * Writes are ordered by CLFLUSH on Intel CPUs. | ||||
*/ | */ | ||||
if (cpu_vendor_id != CPU_VENDOR_INTEL) | if (cpu_vendor_id != CPU_VENDOR_INTEL) | ||||
mfence(); | mfence(); | ||||
for (; sva < eva; sva += cpu_clflush_line_size) | for (; sva < eva; sva += cpu_clflush_line_size) | ||||
clflush(sva); | clflush(sva); | ||||
Show All 25 Lines | pmap_invalidate_cache_pages(vm_page_t *pages, int count) | ||||
bool useclflushopt; | bool useclflushopt; | ||||
useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; | useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; | ||||
if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || | if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || | ||||
((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) | ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) | ||||
pmap_invalidate_cache(); | pmap_invalidate_cache(); | ||||
else { | else { | ||||
if (useclflushopt) | if (useclflushopt) | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
else if (cpu_vendor_id != CPU_VENDOR_INTEL) | else if (cpu_vendor_id != CPU_VENDOR_INTEL) | ||||
mfence(); | mfence(); | ||||
cem: This is extra information in answer to my own question; I'm not requesting any change here. The AMD APM vol 3, rev 3.28 is slightly nuanced (p. 139). The mfence() sandwich as used today is required IFF the CPU does not support CLFLUSHOPT. If the CPU does support CLFLUSHOPT, CLFLUSH is ordered w.r.t. locked ops, fence instructions other than mfence, as well as same-cacheline {clflushopt, clflush, and writes}. So, the new logic looks correct to me: on future AMD models that support CLFLUSHOPT and have stronger CLFLUSH semantics, we'll just use CLFLUSHOPT anyway due to our existing preference, and the faster locked primitive is adequate. If there is some theoretical reason we might set useclflushopt=false on a platform with the cpuid bit set, then it might make sense to optimize the fencing on CLFLUSH. But I do not know any reason we would do that.
for (i = 0; i < count; i++) { | for (i = 0; i < count; i++) { | ||||
daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); | daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); | ||||
eva = daddr + PAGE_SIZE; | eva = daddr + PAGE_SIZE; | ||||
for (; daddr < eva; daddr += cpu_clflush_line_size) { | for (; daddr < eva; daddr += cpu_clflush_line_size) { | ||||
if (useclflushopt) | if (useclflushopt) | ||||
clflushopt(daddr); | clflushopt(daddr); | ||||
else | else | ||||
clflush(daddr); | clflush(daddr); | ||||
} | } | ||||
} | } | ||||
if (useclflushopt) | if (useclflushopt) | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
else if (cpu_vendor_id != CPU_VENDOR_INTEL) | else if (cpu_vendor_id != CPU_VENDOR_INTEL) | ||||
mfence(); | mfence(); | ||||
cem: Again, not a request for any change. I'm not sure the strong mfence is actually needed afterwards on AMD. The language (for both clflush and clflushopt in APM 3.28) is confusing to me; the CLFLUSHOPT language is identical to the CLFLUSH language. An invalidated prefetched cacheline doesn't sound like a correctness problem to me. On the other hand, the CLFLUSH section does explicitly say that non-CLFLUSHOPT models do not order CLFLUSH against LFENCE, SFENCE, or serializing instructions. So I'm not sure what cheaper store-store barrier would be safe. Maybe none.
} | } | ||||
} | } | ||||
void | void | ||||
pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) | pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) | ||||
{ | { | ||||
pmap_invalidate_cache_range_check_align(sva, eva); | pmap_invalidate_cache_range_check_align(sva, eva); | ||||
if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { | if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { | ||||
pmap_force_invalidate_cache_range(sva, eva); | pmap_force_invalidate_cache_range(sva, eva); | ||||
return; | return; | ||||
} | } | ||||
/* See comment in pmap_force_invalidate_cache_range(). */ | /* See comment in pmap_force_invalidate_cache_range(). */ | ||||
if (pmap_kextract(sva) == lapic_paddr) | if (pmap_kextract(sva) == lapic_paddr) | ||||
return; | return; | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
scottph: Isn't it the case that the writes we want to flush have either happened on the same processor, so CLWB is implicitly ordered with them, or they have happened on another processor and we have since migrated, and the ordering has been established by the thread lock?
kib (author): Again, we want this flush to be TSO-consistent with older writes. I suspect this is esp. important there, because the function was added for, and is used with, non-coherent hardware.
scottph: What I mean is: which earlier stores done by this logical processor and not in the range being flushed are important to be globally visible before cache flushing begins? The fence after makes sense to me (don't go on to tell somebody outside the coherence domain about data in memory until that data is actually in memory), but I don't yet see the objective of the fence before.
kib (author): The fence before the flushing ensures that the writes are properly ordered with the clearing of the cache line, including the writes before the flush. In fact, please look at the Intel software optimization manual rev. 042b, section 8.4.7, esp. example 8.2. (A sketch of that kind of sequence follows this function.)
for (; sva < eva; sva += cpu_clflush_line_size) | for (; sva < eva; sva += cpu_clflush_line_size) | ||||
clwb(sva); | clwb(sva); | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
cem: The AMD APM Vol 3 3.28 language for CLWB is:

> The CLWB instruction is weakly ordered with …

The "such as" suggests SFENCE is not the only option, but I'm not sure on the semantics of "store-ordering instructions." It is the only use of the term in the document. Do locked instructions count as "store-ordering"? (It's not relevant to CLWB, but sort of similar: the CLZERO language says something slightly different.)
} | } | ||||
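For context on the Intel optimization-manual example kib cites above, here is a rough sketch of that kind of write-back and publish sequence, built from the primitives this file already uses; the function, its arguments, and the assumption that dst is cache-line aligned are hypothetical and not something this patch adds:

```
static void
publish_range(char *dst, const char *src, size_t len, volatile uint64_t *flag)
{
	size_t off;

	memcpy(dst, src, len);
	atomic_thread_fence_seq_cst();	/* data stores ordered before write-backs */
	for (off = 0; off < len; off += cpu_clflush_line_size)
		clwb((vm_offset_t)(dst + off));
	atomic_thread_fence_seq_cst();	/* write-backs ordered before the flag */
	*flag = 1;			/* consumers may now rely on the data */
}
```

The leading fence matters for the reason discussed in the inline comments: clwb is only ordered against older stores to the line it writes back, so without the fence the write-backs could be observed ahead of earlier, unrelated stores.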
void | void | ||||
pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) | pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) | ||||
{ | { | ||||
pt_entry_t *pte; | pt_entry_t *pte; | ||||
vm_offset_t vaddr; | vm_offset_t vaddr; | ||||
int error, pte_bits; | int error, pte_bits; | ||||
Show All 16 Lines | pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) | ||||
error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, | error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, | ||||
&vaddr); | &vaddr); | ||||
KASSERT(error == 0, ("vmem_alloc failed: %d", error)); | KASSERT(error == 0, ("vmem_alloc failed: %d", error)); | ||||
pte = vtopte(vaddr); | pte = vtopte(vaddr); | ||||
for (; spa < epa; spa += PAGE_SIZE) { | for (; spa < epa; spa += PAGE_SIZE) { | ||||
sched_pin(); | sched_pin(); | ||||
pte_store(pte, spa | pte_bits); | pte_store(pte, spa | pte_bits); | ||||
invlpg(vaddr); | invlpg(vaddr); | ||||
/* XXXKIB sfences inside flush_cache_range are excessive */ | /* XXXKIB atomic inside flush_cache_range are excessive */ | ||||
pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); | pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); | ||||
cem: `static inline` variants that take a `bool barrier` could be added. (A rough sketch follows this function.)
sched_unpin(); | sched_unpin(); | ||||
} | } | ||||
vmem_free(kernel_arena, vaddr, PAGE_SIZE); | vmem_free(kernel_arena, vaddr, PAGE_SIZE); | ||||
} | } | ||||
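A sketch of what cem's suggestion could look like; the helper name and the split are hypothetical, and the CLWB feature and lapic checks done by the real pmap_flush_cache_range() are omitted for brevity. It only illustrates how the redundant fences flagged by the XXXKIB comment could be avoided:

```
/* Hypothetical barrier-optional variant; not part of this revision. */
static inline void
pmap_flush_cache_range_impl(vm_offset_t sva, vm_offset_t eva, bool barrier)
{
	if (barrier)
		atomic_thread_fence_seq_cst();
	for (; sva < eva; sva += cpu_clflush_line_size)
		clwb(sva);
	if (barrier)
		atomic_thread_fence_seq_cst();
}
```

pmap_flush_cache_phys_range() could then fence once around its whole per-page loop and pass barrier = false from inside it.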
/* | /* | ||||
* Routine: pmap_extract | * Routine: pmap_extract | ||||
* Function: | * Function: | ||||
▲ Show 20 Lines • Show All 5,545 Lines • ▼ Show 20 Lines | |||||
void | void | ||||
pmap_activate_sw(struct thread *td) | pmap_activate_sw(struct thread *td) | ||||
{ | { | ||||
pmap_t oldpmap, pmap; | pmap_t oldpmap, pmap; | ||||
u_int cpuid; | u_int cpuid; | ||||
oldpmap = PCPU_GET(curpmap); | oldpmap = PCPU_GET(curpmap); | ||||
pmap = vmspace_pmap(td->td_proc->p_vmspace); | pmap = vmspace_pmap(td->td_proc->p_vmspace); | ||||
if (oldpmap == pmap) | if (oldpmap == pmap) { | ||||
mfence(); | |||||
scottph: Won't the thread lock already establish the ordering that we want here?
kib (author): Not on AMD.
scottph: Here's the execution trace I'm considering; tell me if it's wrong or there's something I'm missing:
kib (author): Unfortunately the AMD manual is self-contradicting. Please look at the description of the CLFLUSH instruction in vol. 3, specifically the paragraph explaining CLFLUSH ordering for CPUs which do not implement CLFLUSHOPT.
scottph: I see, you're looking at:

> The CLFLUSH instruction may also take effect on a cache line
> …

I think the case we're discussing probably still can't cause this CLFLUSH to miss those earlier stores, because of the migration. That is, even though this CLFLUSH will pass store-ordering instructions, those stores from the other processor's store buffer have to have been flushed so that the new processor is able to see that the thread is available for migration. But here I don't think we can build a fully airtight case for that, because this CLFLUSH is so weakly ordered. As specified, it's free to run back in time as far as the last mfence, which... who knows. So I doubt this could happen in practice, but as specified it can.
return; | return; | ||||
} | |||||
cpuid = PCPU_GET(cpuid); | cpuid = PCPU_GET(cpuid); | ||||
#ifdef SMP | #ifdef SMP | ||||
CPU_SET_ATOMIC(cpuid, &pmap->pm_active); | CPU_SET_ATOMIC(cpuid, &pmap->pm_active); | ||||
#else | #else | ||||
CPU_SET(cpuid, &pmap->pm_active); | CPU_SET(cpuid, &pmap->pm_active); | ||||
#endif | #endif | ||||
pmap_activate_sw_mode(td, pmap, cpuid); | pmap_activate_sw_mode(td, pmap, cpuid); | ||||
#ifdef SMP | #ifdef SMP | ||||
▲ Show 20 Lines • Show All 676 Lines • ▼ Show 20 Lines | |||||
static void | static void | ||||
pmap_large_map_wb_fence_mfence(void) | pmap_large_map_wb_fence_mfence(void) | ||||
{ | { | ||||
mfence(); | mfence(); | ||||
} | } | ||||
static void | static void | ||||
pmap_large_map_wb_fence_sfence(void) | pmap_large_map_wb_fence_atomic(void) | ||||
{ | { | ||||
sfence(); | atomic_thread_fence_seq_cst(); | ||||
} | } | ||||
static void | static void | ||||
pmap_large_map_wb_fence_nop(void) | pmap_large_map_wb_fence_nop(void) | ||||
{ | { | ||||
} | } | ||||
DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) | DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) | ||||
{ | { | ||||
if (cpu_vendor_id != CPU_VENDOR_INTEL) | if (cpu_vendor_id != CPU_VENDOR_INTEL) | ||||
return (pmap_large_map_wb_fence_mfence); | return (pmap_large_map_wb_fence_mfence); | ||||
else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | | else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | | ||||
CPUID_STDEXT_CLFLUSHOPT)) == 0) | CPUID_STDEXT_CLFLUSHOPT)) == 0) | ||||
return (pmap_large_map_wb_fence_sfence); | return (pmap_large_map_wb_fence_atomic); | ||||
else | else | ||||
/* clflush is strongly enough ordered */ | /* clflush is strongly enough ordered */ | ||||
return (pmap_large_map_wb_fence_nop); | return (pmap_large_map_wb_fence_nop); | ||||
cem: Orthogonal to this revision, but I think this logic may be sort of incorrect and should more closely mirror the pmap_large_map_flush_range ifunc selection. (Condition first on features, then vendor if we must.) mfence is stronger than needed on at least some AMD CPUs, or may be needed on one side but not the other. (A rough feature-first sketch follows below.)
} | } | ||||
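Not something this revision changes, but to illustrate the feature-first ordering cem describes, the selector might look roughly like the following. Whether each branch is exactly right on every CPU is precisely what the inline discussion above is about, so treat this as a sketch only:

```
/* Hypothetical feature-first variant of the selector above. */
DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void))
{
	if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB |
	    CPUID_STDEXT_CLFLUSHOPT)) != 0)
		/* Weakly ordered write-backs: fence with a locked op. */
		return (pmap_large_map_wb_fence_atomic);
	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
		/* Writes are ordered by CLFLUSH on Intel CPUs. */
		return (pmap_large_map_wb_fence_nop);
	else
		/* Pre-CLFLUSHOPT non-Intel CPUs need the mfence sandwich. */
		return (pmap_large_map_wb_fence_mfence);
}
```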
static void | static void | ||||
pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) | pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) | ||||
{ | { | ||||
for (; len > 0; len -= cpu_clflush_line_size, | for (; len > 0; len -= cpu_clflush_line_size, | ||||
va += cpu_clflush_line_size) | va += cpu_clflush_line_size) | ||||
▲ Show 20 Lines • Show All 1,028 Lines • Show Last 20 Lines |