Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F143977468
D8041.id21037.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
6 KB
Referenced Files
None
Subscribers
None
D8041.id21037.diff
View Options
Index: head/sys/amd64/amd64/mp_machdep.c
===================================================================
--- head/sys/amd64/amd64/mp_machdep.c
+++ head/sys/amd64/amd64/mp_machdep.c
@@ -409,6 +409,7 @@
invltlb_invpcid_handler(void)
{
struct invpcid_descr d;
+ uint32_t generation;
#ifdef COUNT_XINVLTLB_HITS
xhits_gbl[PCPU_GET(cpuid)]++;
@@ -417,17 +418,20 @@
(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */
+ generation = smp_tlb_generation;
d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
d.pad = 0;
d.addr = 0;
invpcid(&d, smp_tlb_pmap == kernel_pmap ? INVPCID_CTXGLOB :
INVPCID_CTX);
- atomic_add_int(&smp_tlb_wait, 1);
+ PCPU_SET(smp_tlb_done, generation);
}
void
invltlb_pcid_handler(void)
{
+ uint32_t generation;
+
#ifdef COUNT_XINVLTLB_HITS
xhits_gbl[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */
@@ -435,6 +439,7 @@
(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */
+ generation = smp_tlb_generation; /* Overlap with serialization */
if (smp_tlb_pmap == kernel_pmap) {
invltlb_glob();
} else {
@@ -450,5 +455,5 @@
smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid);
}
}
- atomic_add_int(&smp_tlb_wait, 1);
+ PCPU_SET(smp_tlb_done, generation);
}
Index: head/sys/amd64/include/pcpu.h
===================================================================
--- head/sys/amd64/include/pcpu.h
+++ head/sys/amd64/include/pcpu.h
@@ -65,7 +65,8 @@
u_int pc_vcpu_id; /* Xen vCPU ID */ \
uint32_t pc_pcid_next; \
uint32_t pc_pcid_gen; \
- char __pad[149] /* be divisor of PAGE_SIZE \
+ uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \
+ char __pad[145] /* be divisor of PAGE_SIZE \
after cache alignment */
#define PC_DBREG_CMD_NONE 0
Index: head/sys/i386/include/pcpu.h
===================================================================
--- head/sys/i386/include/pcpu.h
+++ head/sys/i386/include/pcpu.h
@@ -59,7 +59,8 @@
u_int pc_cmci_mask; /* MCx banks for CMCI */ \
u_int pc_vcpu_id; /* Xen vCPU ID */ \
vm_offset_t pc_qmap_addr; /* KVA for temporary mappings */\
- char __pad[229]
+ uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \
+ char __pad[225]
#ifdef _KERNEL
Index: head/sys/x86/include/x86_smp.h
===================================================================
--- head/sys/x86/include/x86_smp.h
+++ head/sys/x86/include/x86_smp.h
@@ -35,7 +35,7 @@
extern struct mtx ap_boot_mtx;
extern int cpu_logical;
extern int cpu_cores;
-extern volatile int smp_tlb_wait;
+extern volatile uint32_t smp_tlb_generation;
extern struct pmap *smp_tlb_pmap;
extern u_int xhits_gbl[];
extern u_int xhits_pg[];
Index: head/sys/x86/x86/mp_x86.c
===================================================================
--- head/sys/x86/x86/mp_x86.c
+++ head/sys/x86/x86/mp_x86.c
@@ -1304,12 +1304,22 @@
void
invlcache_handler(void)
{
+ uint32_t generation;
+
#ifdef COUNT_IPIS
(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */
+ /*
+ * Reading the generation here allows greater parallelism
+ * since wbinvd is a serializing instruction. Without the
+ * temporary, we'd wait for wbinvd to complete, then the read
+ * would execute, then the dependent write, which must then
+ * complete before return from interrupt.
+ */
+ generation = smp_tlb_generation;
wbinvd();
- atomic_add_int(&smp_tlb_wait, 1);
+ PCPU_SET(smp_tlb_done, generation);
}
/*
@@ -1367,7 +1377,7 @@
/* Variables needed for SMP tlb shootdown. */
static vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
pmap_t smp_tlb_pmap;
-volatile int smp_tlb_wait;
+volatile uint32_t smp_tlb_generation;
#ifdef __amd64__
#define read_eflags() read_rflags()
@@ -1377,15 +1387,16 @@
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
vm_offset_t addr1, vm_offset_t addr2)
{
- int cpu, ncpu, othercpus;
-
- othercpus = mp_ncpus - 1; /* does not shootdown self */
+ cpuset_t other_cpus;
+ volatile uint32_t *p_cpudone;
+ uint32_t generation;
+ int cpu;
/*
* Check for other cpus. Return if none.
*/
if (CPU_ISFULLSET(&mask)) {
- if (othercpus < 1)
+ if (mp_ncpus <= 1)
return;
} else {
CPU_CLR(PCPU_GET(cpuid), &mask);
@@ -1399,23 +1410,28 @@
smp_tlb_addr1 = addr1;
smp_tlb_addr2 = addr2;
smp_tlb_pmap = pmap;
- smp_tlb_wait = 0;
+ generation = ++smp_tlb_generation;
if (CPU_ISFULLSET(&mask)) {
- ncpu = othercpus;
ipi_all_but_self(vector);
+ other_cpus = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
} else {
- ncpu = 0;
+ other_cpus = mask;
while ((cpu = CPU_FFS(&mask)) != 0) {
cpu--;
CPU_CLR(cpu, &mask);
CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
cpu, vector);
ipi_send_cpu(cpu, vector);
- ncpu++;
}
}
- while (smp_tlb_wait < ncpu)
- ia32_pause();
+ while ((cpu = CPU_FFS(&other_cpus)) != 0) {
+ cpu--;
+ CPU_CLR(cpu, &other_cpus);
+ p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
+ while (*p_cpudone != generation)
+ ia32_pause();
+ }
mtx_unlock_spin(&smp_ipi_mtx);
}
@@ -1473,6 +1489,8 @@
void
invltlb_handler(void)
{
+ uint32_t generation;
+
#ifdef COUNT_XINVLTLB_HITS
xhits_gbl[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */
@@ -1480,16 +1498,23 @@
(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */
+ /*
+ * Reading the generation here allows greater parallelism
+ * since invalidating the TLB is a serializing operation.
+ */
+ generation = smp_tlb_generation;
if (smp_tlb_pmap == kernel_pmap)
invltlb_glob();
else
invltlb();
- atomic_add_int(&smp_tlb_wait, 1);
+ PCPU_SET(smp_tlb_done, generation);
}
void
invlpg_handler(void)
{
+ uint32_t generation;
+
#ifdef COUNT_XINVLTLB_HITS
xhits_pg[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */
@@ -1497,14 +1522,16 @@
(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */
+ generation = smp_tlb_generation; /* Overlap with serialization */
invlpg(smp_tlb_addr1);
- atomic_add_int(&smp_tlb_wait, 1);
+ PCPU_SET(smp_tlb_done, generation);
}
void
invlrng_handler(void)
{
- vm_offset_t addr;
+ vm_offset_t addr, addr2;
+ uint32_t generation;
#ifdef COUNT_XINVLTLB_HITS
xhits_rng[PCPU_GET(cpuid)]++;
@@ -1514,10 +1541,12 @@
#endif /* COUNT_IPIS */
addr = smp_tlb_addr1;
+ addr2 = smp_tlb_addr2;
+ generation = smp_tlb_generation; /* Overlap with serialization */
do {
invlpg(addr);
addr += PAGE_SIZE;
- } while (addr < smp_tlb_addr2);
+ } while (addr < addr2);
- atomic_add_int(&smp_tlb_wait, 1);
+ PCPU_SET(smp_tlb_done, generation);
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Tue, Feb 3, 8:48 PM (16 h, 3 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
28426858
Default Alt Text
D8041.id21037.diff (6 KB)
Attached To
Mode
D8041: Reduce the cost of TLB invalidation on x86 by using per-CPU completion flags.
Attached
Detach File
Event Timeline
Log In to Comment