Index: head/sys/amd64/amd64/mp_machdep.c =================================================================== --- head/sys/amd64/amd64/mp_machdep.c +++ head/sys/amd64/amd64/mp_machdep.c @@ -409,6 +409,7 @@ invltlb_invpcid_handler(void) { struct invpcid_descr d; + uint32_t generation; #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; @@ -417,17 +418,20 @@ (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ + generation = smp_tlb_generation; d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, smp_tlb_pmap == kernel_pmap ? INVPCID_CTXGLOB : INVPCID_CTX); - atomic_add_int(&smp_tlb_wait, 1); + PCPU_SET(smp_tlb_done, generation); } void invltlb_pcid_handler(void) { + uint32_t generation; + #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ @@ -435,6 +439,7 @@ (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ + generation = smp_tlb_generation; /* Overlap with serialization */ if (smp_tlb_pmap == kernel_pmap) { invltlb_glob(); } else { @@ -450,5 +455,5 @@ smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid); } } - atomic_add_int(&smp_tlb_wait, 1); + PCPU_SET(smp_tlb_done, generation); } Index: head/sys/amd64/include/pcpu.h =================================================================== --- head/sys/amd64/include/pcpu.h +++ head/sys/amd64/include/pcpu.h @@ -65,7 +65,8 @@ u_int pc_vcpu_id; /* Xen vCPU ID */ \ uint32_t pc_pcid_next; \ uint32_t pc_pcid_gen; \ - char __pad[149] /* be divisor of PAGE_SIZE \ + uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \ + char __pad[145] /* be divisor of PAGE_SIZE \ after cache alignment */ #define PC_DBREG_CMD_NONE 0 Index: head/sys/i386/include/pcpu.h =================================================================== --- head/sys/i386/include/pcpu.h +++ head/sys/i386/include/pcpu.h @@ -59,7 +59,8 @@ u_int pc_cmci_mask; /* MCx banks for CMCI */ \ u_int pc_vcpu_id; /* Xen vCPU ID */ \ vm_offset_t pc_qmap_addr; /* KVA for temporary mappings */\ - char __pad[229] + uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \ + char __pad[225] #ifdef _KERNEL Index: head/sys/x86/include/x86_smp.h =================================================================== --- head/sys/x86/include/x86_smp.h +++ head/sys/x86/include/x86_smp.h @@ -35,7 +35,7 @@ extern struct mtx ap_boot_mtx; extern int cpu_logical; extern int cpu_cores; -extern volatile int smp_tlb_wait; +extern volatile uint32_t smp_tlb_generation; extern struct pmap *smp_tlb_pmap; extern u_int xhits_gbl[]; extern u_int xhits_pg[]; Index: head/sys/x86/x86/mp_x86.c =================================================================== --- head/sys/x86/x86/mp_x86.c +++ head/sys/x86/x86/mp_x86.c @@ -1304,12 +1304,22 @@ void invlcache_handler(void) { + uint32_t generation; + #ifdef COUNT_IPIS (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ + /* + * Reading the generation here allows greater parallelism + * since wbinvd is a serializing instruction. Without the + * temporary, we'd wait for wbinvd to complete, then the read + * would execute, then the dependent write, whuch must then + * complete before return from interrupt. + */ + generation = smp_tlb_generation; wbinvd(); - atomic_add_int(&smp_tlb_wait, 1); + PCPU_SET(smp_tlb_done, generation); } /* @@ -1367,7 +1377,7 @@ /* Variables needed for SMP tlb shootdown. */ static vm_offset_t smp_tlb_addr1, smp_tlb_addr2; pmap_t smp_tlb_pmap; -volatile int smp_tlb_wait; +volatile uint32_t smp_tlb_generation; #ifdef __amd64__ #define read_eflags() read_rflags() @@ -1377,15 +1387,16 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2) { - int cpu, ncpu, othercpus; - - othercpus = mp_ncpus - 1; /* does not shootdown self */ + cpuset_t other_cpus; + volatile uint32_t *p_cpudone; + uint32_t generation; + int cpu; /* * Check for other cpus. Return if none. */ if (CPU_ISFULLSET(&mask)) { - if (othercpus < 1) + if (mp_ncpus <= 1) return; } else { CPU_CLR(PCPU_GET(cpuid), &mask); @@ -1399,23 +1410,28 @@ smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; smp_tlb_pmap = pmap; - smp_tlb_wait = 0; + generation = ++smp_tlb_generation; if (CPU_ISFULLSET(&mask)) { - ncpu = othercpus; ipi_all_but_self(vector); + other_cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &other_cpus); } else { - ncpu = 0; + other_cpus = mask; while ((cpu = CPU_FFS(&mask)) != 0) { cpu--; CPU_CLR(cpu, &mask); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, vector); ipi_send_cpu(cpu, vector); - ncpu++; } } - while (smp_tlb_wait < ncpu) - ia32_pause(); + while ((cpu = CPU_FFS(&other_cpus)) != 0) { + cpu--; + CPU_CLR(cpu, &other_cpus); + p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done; + while (*p_cpudone != generation) + ia32_pause(); + } mtx_unlock_spin(&smp_ipi_mtx); } @@ -1473,6 +1489,8 @@ void invltlb_handler(void) { + uint32_t generation; + #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ @@ -1480,16 +1498,23 @@ (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ + /* + * Reading the generation here allows greater parallelism + * since invalidating the TLB is a serializing operation. + */ + generation = smp_tlb_generation; if (smp_tlb_pmap == kernel_pmap) invltlb_glob(); else invltlb(); - atomic_add_int(&smp_tlb_wait, 1); + PCPU_SET(smp_tlb_done, generation); } void invlpg_handler(void) { + uint32_t generation; + #ifdef COUNT_XINVLTLB_HITS xhits_pg[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ @@ -1497,14 +1522,16 @@ (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ + generation = smp_tlb_generation; /* Overlap with serialization */ invlpg(smp_tlb_addr1); - atomic_add_int(&smp_tlb_wait, 1); + PCPU_SET(smp_tlb_done, generation); } void invlrng_handler(void) { - vm_offset_t addr; + vm_offset_t addr, addr2; + uint32_t generation; #ifdef COUNT_XINVLTLB_HITS xhits_rng[PCPU_GET(cpuid)]++; @@ -1514,10 +1541,12 @@ #endif /* COUNT_IPIS */ addr = smp_tlb_addr1; + addr2 = smp_tlb_addr2; + generation = smp_tlb_generation; /* Overlap with serialization */ do { invlpg(addr); addr += PAGE_SIZE; - } while (addr < smp_tlb_addr2); + } while (addr < addr2); - atomic_add_int(&smp_tlb_wait, 1); + PCPU_SET(smp_tlb_done, generation); }