Index: sys/amd64/amd64/machdep.c =================================================================== --- sys/amd64/amd64/machdep.c +++ sys/amd64/amd64/machdep.c @@ -216,9 +216,6 @@ static struct trapframe proc0_tf; struct region_descriptor r_idt; -struct pcpu *__pcpu; -struct pcpu temp_bsp_pcpu; - struct mtx icu_lock; struct mem_range_softc mem_range_softc; @@ -1670,13 +1667,20 @@ */ pmap_thread_init_invl_gen(&thread0); - pc = &temp_bsp_pcpu; + /* + * Initialize the static and dynamic per-CPU areas. The latter must + * immediately follow the former. + */ + pc = (struct pcpu *)(physfree + KERNBASE); + physfree += sizeof(struct pcpu); pcpu_init(pc, 0, sizeof(struct pcpu)); - gdt = &temp_bsp_pcpu.pc_gdt[0]; + dpcpu_init((void *)(physfree + KERNBASE), 0); + physfree += DPCPU_SIZE; /* * make gdt memory segments */ + gdt = &pc->pc_gdt[0]; for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1) @@ -1694,8 +1698,6 @@ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ - dpcpu_init((void *)(physfree + KERNBASE), 0); - physfree += DPCPU_SIZE; amd64_bsp_pcpu_init1(pc); /* Non-late cninit() and printf() can be moved up to here. */ Index: sys/amd64/amd64/mp_machdep.c =================================================================== --- sys/amd64/amd64/mp_machdep.c +++ sys/amd64/amd64/mp_machdep.c @@ -61,8 +61,13 @@ #include #include #include +#include +#include #include #include +#include +#include +#include #include #include @@ -124,6 +129,173 @@ return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem); } +/* + * Initialize the bootstrap allocator for dynamic per-CPU memory allocations. + * 2MB is reserved by pmap_bootstrap() for the BSP, from which its pcpu and + * dpcpu regions are allocated. The rest can be used by UMA to satisfy dynamic + * per-CPU allocations until SI_SUB_CPU, when the pcpu regions for the APs are + * laid out. At that point, unused portions of the initial 2MB allocation may + * be used for APs as well as the BSP. + */ +static void +pcpu_bootstrap(void *arg __unused) +{ + uma_pcpu_init1(VM_PCPU_BASE_START + sizeof(struct pcpu) + DPCPU_SIZE, + VM_PCPU_BOOTSTRAP_SIZE - (sizeof(struct pcpu) + DPCPU_SIZE)); +} +SYSINIT(pcpu_bootstrap, SI_SUB_VM, SI_ORDER_ANY, pcpu_bootstrap, NULL); + +static int +pcpu_domidx(int domain) +{ + int bspdom; + + bspdom = PCPU_GET(domain); + if (bspdom == 0) + return (domain); + if (domain == bspdom) + return (0); + return (domain > bspdom ? domain : domain + 1); +} + +/* + * Place per-CPU structures. Each AP requires a pcpu and dpcpu region. The + * pcpu region of a CPU is its base pcpu address. A pointer to per-CPU data is + * an offset relative to the base pcpu address, and UMA's per-CPU allocator + * ensures that adding that offset to the base address always gives the address + * of memory allocated for the corresponding CPU. + * + * The layout attempts to maximize use of 2MB mappings while also providing + * domain-local memory on NUMA systems. It uses 2 parameters, N, the number of + * 4KB pages per CPU, and M, the number of 2MB pages per allocation quantum. M + * is a multiple of vm_ndomains and they are usually equal. N has a lower bound + * of L = sizeof(struct pcpu) + DPCPU_SIZE + uma_pcpu_bootstrap_used(), where + * the last term is the amount of memory used by the bootstrap per-CPU + * allocator. 
Each 2MB page hosts per-CPU data for CPUs belonging to the domain + * from which the page was allocated, so we first compute M by determining the + * maximum number of CPUs per domain and multiplying that by L. Then N is given + * by M*2MB divided by the number of CPUs per domain. + * + * __________ N 4KB pages __________ + * / \ + * VM_PCPU_BASE_START -------> +----------+-----------+--------------+ + * | BSP pcpu | BSP dpcpu | UMA data ... |\ + * +----------+-----------+--------------+ | + * | AP1 pcpu | AP1 dpcpu | UMA data ... | | + * +----------+-----------+--------------+ | + * | ... | | M 2MB + * +----------+--------------------------+ | pages + * | APi pcpu | APi dpcpu | UMA data ... | | + * +----------+-----------+--------------+ | + * | ... | | + * | ... |/ + * +-------------------------------------+ + * + * If the original region is exhausted, for example because a subsystem + * allocates many per-CPU counters, UMA allocaates another M*2MB region of KVA + * to mirror the base region. + */ +static void +pcpu_layout(void) +{ + vm_offset_t addr; + vm_size_t size, used; + int count[MAXMEMDOM], domoff[MAXMEMDOM]; + int domain, error, i, maxcpupdom, n2mpgpdom, n4kpgpcpu, nbpdom; + + /* + * Compute the maximum count of CPUs in a single domain. Domains are + * typically symmetric but this is not required. + */ + memset(count, 0, sizeof(count)); + for (i = 0; i <= mp_maxid; i++) { + if (vm_ndomains > 1 && cpu_apic_ids[i] != -1) + domain = acpi_pxm_get_cpu_locality(cpu_apic_ids[i]); + else + domain = 0; + count[domain]++; + } + for (i = 0, maxcpupdom = -1; i < vm_ndomains; i++) + if (count[i] > maxcpupdom) + maxcpupdom = count[i]; + + /* + * Compute layout parameters: the number of 4KB pages per CPU, and the + * number of 2MB pages per domain. The amount of memory already + * allocated by the bootstrap allocator gives a lower bound for the + * former, and we use that bound to compute the number of 2MB pages + * per domain. + */ + used = uma_pcpu_bootstrap_used(); + n2mpgpdom = howmany(atop(used) * maxcpupdom, NPDEPG); + n4kpgpcpu = atop(NBPDR * n2mpgpdom) / maxcpupdom; + + /* + * Assign a pcpu base address to each CPU. Handle the possibility that + * the BSP is not local to domain 0. + */ + memset(domoff, 0, sizeof(domoff)); + for (i = 0; i <= mp_maxid; i++) { + if (vm_ndomains > 1 && cpu_apic_ids[i] != -1) + domain = acpi_pxm_get_cpu_locality(cpu_apic_ids[i]); + else + domain = 0; + + addr = VM_PCPU_BASE_START + + pcpu_domidx(domain) * n2mpgpdom * NBPDR + + domoff[domain] * n4kpgpcpu * PAGE_SIZE; + cpuid_to_pcpu[i] = (struct pcpu *)addr; + domoff[domain]++; + } + + /* + * Ensure that the remaining bootstrap region is backed by physical + * pages. + */ + nbpdom = n2mpgpdom * NBPDR; + for (domain = 0; domain < vm_ndomains; domain++) { + addr = VM_PCPU_BASE_START + nbpdom * pcpu_domidx(domain); + size = nbpdom; + if (domain == PCPU_GET(domain)) { + /* This 2MB page was allocated by pmap_bootstrap(). */ + addr += NBPDR; + size -= NBPDR; + if (size == 0) + continue; + } + if (VM_DOMAIN_EMPTY(domain)) + error = kmem_back(kernel_object, addr, size, + M_WAITOK | M_ZERO); + else + error = kmem_back_domain(domain, kernel_object, addr, + size, M_WAITOK | M_ZERO); + if (error != KERN_SUCCESS) + panic("%s: failed to allocate memory: %d", + __func__, error); + } + + /* + * Release reserved, unused KVA back to the system. 
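+	 * Only vm_ndomains * n2mpgpdom 2MB pages of the static
+	 * VM_PCPU_BASE_SIZE reservation are needed; delete the tail of the
+	 * range so that its KVA can be reused by the rest of the kernel.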
+ */ + vm_map_lock(kernel_map); + error = vm_map_delete(kernel_map, + VM_PCPU_BASE_START + vm_ndomains * nbpdom, + VM_PCPU_BASE_START + VM_PCPU_BASE_SIZE); + if (error != KERN_SUCCESS) + panic("%s: failed to release KVA: %d", __func__, error); + vm_map_unlock(kernel_map); + + /* + * Finally, provide layout parameters to the allocator so that it can + * finish bootstrapping. + */ + uma_pcpu_init2(n4kpgpcpu, n2mpgpdom); + + if (bootverbose) + printf("%s: %d 2MB pages per domain, %d 4KB pages per CPU\n", + __func__, n2mpgpdom, n4kpgpcpu); +} + /* * Calculate usable address in base memory for AP trampoline code. */ @@ -263,6 +435,9 @@ assign_cpu_ids(); + /* Place AP pcpu structures now that CPU IDs are defined. */ + pcpu_layout(); + /* Start each Application Processor */ init_ops.start_all_aps(); @@ -292,12 +467,9 @@ /* Update microcode before doing anything else. */ ucode_load_ap(cpu); - /* Get per-cpu data and save */ - pc = &__pcpu[cpu]; - - /* prime data page for it to use */ + pc = cpuid_to_pcpu[cpu]; pcpu_init(pc, cpu, sizeof(struct pcpu)); - dpcpu_init(dpcpu, cpu); + dpcpu_init((void *)DPCPU_BASE(pc), cpu); pc->pc_apic_id = cpu_apic_ids[cpu]; pc->pc_prvspace = pc; pc->pc_curthread = 0; @@ -315,7 +487,7 @@ pc->pc_pcid_gen = 1; /* Init tss */ - pc->pc_common_tss = __pcpu[0].pc_common_tss; + pc->pc_common_tss = cpuid_to_pcpu[0]->pc_common_tss; pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; pc->pc_common_tss.tss_rsp0 = 0; @@ -388,27 +560,6 @@ * local functions and data */ -#ifdef NUMA -static void -mp_realloc_pcpu(int cpuid, int domain) -{ - vm_page_t m; - vm_offset_t oa, na; - - oa = (vm_offset_t)&__pcpu[cpuid]; - if (_vm_phys_domain(pmap_kextract(oa)) == domain) - return; - m = vm_page_alloc_domain(NULL, 0, domain, - VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); - if (m == NULL) - return; - na = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); - pagecopy((void *)oa, (void *)na); - pmap_qenter((vm_offset_t)&__pcpu[cpuid], &m, 1); - /* XXX old pcpu page leaked. */ -} -#endif - /* * start each AP in our list */ @@ -456,16 +607,6 @@ outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ - /* Relocate pcpu areas to the correct domain. */ -#ifdef NUMA - if (vm_ndomains > 1) - for (cpu = 1; cpu < mp_ncpus; cpu++) { - apic_id = cpu_apic_ids[cpu]; - domain = acpi_pxm_get_cpu_locality(apic_id); - mp_realloc_pcpu(cpu, domain); - } -#endif - /* start each AP */ domain = 0; for (cpu = 1; cpu < mp_ncpus; cpu++) { @@ -484,8 +625,6 @@ DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO); dbg_stack = (char *)kmem_malloc_domainset( DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO); - dpcpu = (void *)kmem_malloc_domainset(DOMAINSET_PREF(domain), - DPCPU_SIZE, M_WAITOK | M_ZERO); bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 8; Index: sys/amd64/amd64/pmap.c =================================================================== --- sys/amd64/amd64/pmap.c +++ sys/amd64/amd64/pmap.c @@ -1415,6 +1415,17 @@ return (ret); } +static uint64_t +alloc2mpage(vm_paddr_t *firstaddr) +{ + uint64_t ret; + + ret = roundup2(*firstaddr, NBPDR); + bzero((void *)ret, NBPDR); + *firstaddr = ret + NBPDR; + return (ret); +} + CTASSERT(powerof2(NDMPML4E)); /* number of kernel PDP slots */ @@ -1660,6 +1671,59 @@ } } +static void +bootstrap_pcpu(vm_paddr_t pcpupg, vm_paddr_t pdppg) +{ + struct region_descriptor r_gdt; + struct pcpu *oldpc, *pc; + void *dpcpu; + vm_offset_t va; + pdp_entry_t *pdpe; + pd_entry_t *pde; + + /* + * Map the bootstrap per-CPU region. 
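+	 * pmap_bootstrap() reserved a 2MB physical page (pcpupg) for the data
+	 * and a 4KB page (pdppg) for the page directory that maps it.  Install
+	 * both so that a single 2MB mapping backs the BSP's pcpu and dpcpu
+	 * regions as well as early UMA per-CPU allocations.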
+ */ + va = VM_PCPU_BASE_START; + pdpe = pmap_pdpe(kernel_pmap, va); + if ((*pdpe & X86_PG_V) != 0) + panic("pdpe for %#lx is already valid", va); + *pdpe = pdppg | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; + pde = pmap_pde(kernel_pmap, va); + pde_store(pde, pcpupg | X86_PG_V | X86_PG_PS | X86_PG_RW | X86_PG_A | + X86_PG_M | pg_nx | pg_g); + + /* + * Re-initialize PCPU area for BSP after switching. + * Make hardware use gdt and common_tss from the new PCPU. + * Copy dynamic PCPU data following the PCPU structure. + */ + STAILQ_INIT(&cpuhead); + pc = (struct pcpu *)va; + oldpc = get_pcpu(); + wrmsr(MSR_GSBASE, (uintptr_t)pc); + pcpu_init(pc, 0, sizeof(struct pcpu)); + amd64_bsp_pcpu_init1(pc); + amd64_bsp_ist_init(pc); + pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) + + IOPERM_BITMAP_SIZE; + memcpy(pc->pc_gdt, oldpc->pc_gdt, NGDT * + sizeof(struct user_segment_descriptor)); + gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss; + ssdtosyssd(&gdt_segs[GPROC0_SEL], + (struct system_segment_descriptor *)&pc->pc_gdt[GPROC0_SEL]); + r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; + r_gdt.rd_base = (long)pc->pc_gdt; + lgdt(&r_gdt); + wrmsr(MSR_GSBASE, (uintptr_t)pc); + ltr(GSEL(GPROC0_SEL, SEL_KPL)); + pc->pc_acpi_id = oldpc->pc_acpi_id; + + dpcpu = (void *)DPCPU_BASE(pc); + dpcpu_init(dpcpu, 0); + memcpy(dpcpu, (void *)DPCPU_BASE(oldpc), DPCPU_BYTES); +} + /* * Bootstrap the system enough to run with virtual memory. * @@ -1674,10 +1738,9 @@ pmap_bootstrap(vm_paddr_t *firstaddr) { vm_offset_t va; - pt_entry_t *pte, *pcpu_pte; - struct region_descriptor r_gdt; - uint64_t cr4, pcpu_phys; - u_long res; + pt_entry_t *pte; + uint64_t cr4; + u_long res, pcpupg, pdppg; int i; KERNend = *firstaddr; @@ -1691,8 +1754,6 @@ */ create_pagetables(firstaddr); - pcpu_phys = allocpages(firstaddr, MAXCPU); - /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures @@ -1708,6 +1769,20 @@ virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend); virtual_end = VM_MAX_KERNEL_ADDRESS; + /* + * Reserve physical memory to bootstrap the per-CPU allocator, as well + * as a PD page used to map it into the kernel map. Minimize the amount + * of memory wasted to maintain alignment. + */ + if ((*firstaddr & PDRMASK) != 0) { + pdppg = allocpages(firstaddr, 1); + pcpupg = alloc2mpage(firstaddr); + } else { + pcpupg = alloc2mpage(firstaddr); + pdppg = allocpages(firstaddr, 1); + } + vm_phys_early_add_seg(pcpupg, pcpupg + NBPDR); + /* * Enable PG_G global pages, then switch to the kernel page * table from the bootstrap page table. After the switch, it @@ -1759,38 +1834,12 @@ */ SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) CADDR1 = crashdumpmap; - - SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU); virtual_avail = va; - for (i = 0; i < MAXCPU; i++) { - pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW | - pg_g | pg_nx | X86_PG_M | X86_PG_A; - } - /* - * Re-initialize PCPU area for BSP after switching. - * Make hardware use gdt and common_tss from the new PCPU. + * Bootstrap the per-CPU allocator. 
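+	 * bootstrap_pcpu() switches the BSP from the temporary pcpu region set
+	 * up by hammer_time() onto its final region at VM_PCPU_BASE_START,
+	 * reloading GSBASE, the GDT and the TSS to reference the new area.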
*/ - STAILQ_INIT(&cpuhead); - wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); - pcpu_init(&__pcpu[0], 0, sizeof(struct pcpu)); - amd64_bsp_pcpu_init1(&__pcpu[0]); - amd64_bsp_ist_init(&__pcpu[0]); - __pcpu[0].pc_common_tss.tss_iobase = sizeof(struct amd64tss) + - IOPERM_BITMAP_SIZE; - memcpy(__pcpu[0].pc_gdt, temp_bsp_pcpu.pc_gdt, NGDT * - sizeof(struct user_segment_descriptor)); - gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&__pcpu[0].pc_common_tss; - ssdtosyssd(&gdt_segs[GPROC0_SEL], - (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]); - r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; - r_gdt.rd_base = (long)__pcpu[0].pc_gdt; - lgdt(&r_gdt); - wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]); - ltr(GSEL(GPROC0_SEL, SEL_KPL)); - __pcpu[0].pc_dynamic = temp_bsp_pcpu.pc_dynamic; - __pcpu[0].pc_acpi_id = temp_bsp_pcpu.pc_acpi_id; + bootstrap_pcpu(pcpupg, pdppg); /* * Initialize the PAT MSR. @@ -4109,7 +4158,7 @@ vm_page_array_size = pages; - start = VM_MIN_KERNEL_ADDRESS; + start = VM_PAGE_ARRAY_START; end = start + pages * sizeof(struct vm_page); for (va = start; va < end; va += NBPDR) { pfn = first_page + (va - start) / sizeof(struct vm_page); @@ -9818,6 +9867,7 @@ { vm_page_t pml4_pg; pdp_entry_t *pdpe; + struct pcpu *pc; vm_offset_t va; int i; @@ -9832,23 +9882,24 @@ pdpe = pmap_pti_pdpe(va); pmap_pti_wire_pte(pdpe); } - pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], - (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + sizeof(struct gate_descriptor) * NIDT, false); CPU_FOREACH(i) { + pc = cpuid_to_pcpu[i]; + pmap_pti_add_kva_locked((vm_offset_t)pc, (vm_offset_t)(pc + 1), + false); /* Doublefault stack IST 1 */ - va = __pcpu[i].pc_common_tss.tss_ist1; + va = pc->pc_common_tss.tss_ist1; pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* NMI stack IST 2 */ - va = __pcpu[i].pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu); + va = pc->pc_common_tss.tss_ist2 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* MC# stack IST 3 */ - va = __pcpu[i].pc_common_tss.tss_ist3 + + va = pc->pc_common_tss.tss_ist3 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* DB# stack IST 4 */ - va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu); + va = pc->pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); } pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, Index: sys/amd64/amd64/uma_machdep.c =================================================================== --- sys/amd64/amd64/uma_machdep.c +++ sys/amd64/amd64/uma_machdep.c @@ -3,6 +3,10 @@ * * Copyright (c) 2003 Alan L. Cox * All rights reserved. + * Copyright (c) 2020 The FreeBSD Foundation + * + * Portions of this software were developed by Mark Johnston under + * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -31,18 +35,36 @@ #include #include +#include #include #include +#include #include +#include #include + #include +#include +#include +#include +#include #include #include +#include +#include #include #include + #include #include +/* Bootstrap data. 
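+ * uma_pcpu_init1() seeds the bootstrap range; per-CPU slab allocations are
+ * carved from it until uma_pcpu_init2() creates the vmem arena.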
*/ +static bool uma_pcpu_bootstrapped = false; +static vm_offset_t uma_pcpu_bootstrap_addr; +static vm_size_t uma_pcpu_bootstrap_size; + +static vmem_t *uma_pcpu_arena; + void * uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags, int wait) @@ -77,3 +99,150 @@ vm_page_unwire_noq(m); vm_page_free(m); } + +void * +uma_pcpu_alloc(uma_zone_t zone, vm_size_t size, int domain, uint8_t *flags, + int wait) +{ + void *pcpu_offset; + vm_offset_t addr, pcpu_addr; + vm_size_t pcpu_size; + int error, i; + + KASSERT(size == (mp_maxid + 1) * PAGE_SIZE, + ("%s: unexpected alloc size %#lx", __func__, size)); + + *flags = UMA_SLAB_PRIV; + pcpu_size = PAGE_SIZE; + + if (!uma_pcpu_bootstrapped) { + if (uma_pcpu_bootstrap_size == 0) + panic("%s: ran out of per-CPU pages", __func__); + addr = uma_pcpu_bootstrap_addr; + uma_pcpu_bootstrap_addr += pcpu_size; + uma_pcpu_bootstrap_size -= pcpu_size; + return ((void *)addr); + } + + error = vmem_alloc(uma_pcpu_arena, pcpu_size, M_BESTFIT | wait, &addr); + if (error != 0) + return (NULL); + + /* + * If the address comes from the bootstrap region, it is already backed + * by physical memory. Otherwise we must allocate memory. + */ + pcpu_offset = zpcpu_base_to_offset((void *)addr); + if ((vm_offset_t)pcpu_offset >= VM_PCPU_BOOTSTRAP_SIZE) { + for (i = 0; i <= mp_maxid; i++) { + domain = cpuid_to_pcpu[i]->pc_domain; + pcpu_addr = (vm_offset_t)zpcpu_get_cpu(pcpu_offset, i); + if (VM_DOMAIN_EMPTY(domain)) + error = kmem_back(kernel_object, pcpu_addr, + pcpu_size, wait | M_ZERO); + else + error = kmem_back_domain(domain, kernel_object, + pcpu_addr, pcpu_size, wait | M_ZERO); + if (error != KERN_SUCCESS) + goto fail; + } + } + return ((void *)addr); + +fail: + for (; i > 0; i--) { + pcpu_addr = (vm_offset_t)zpcpu_get_cpu(pcpu_offset, i - 1); + kmem_unback(kernel_object, pcpu_addr, pcpu_size); + } + vmem_xfree(uma_pcpu_arena, addr, pcpu_size); + return (NULL); +} + +void +uma_pcpu_free(void *mem, vm_size_t size, uint8_t flags) +{ + void *pcpu_offset; + vm_offset_t pcpu_addr; + vm_size_t pcpu_size; + int i; + + KASSERT(uma_pcpu_bootstrapped, + ("%s: not bootstrapped", __func__)); + KASSERT(size == (mp_maxid + 1) * PAGE_SIZE, + ("%s: unexpected free size %#lx", __func__, size)); + + pcpu_offset = zpcpu_base_to_offset(mem); + pcpu_size = PAGE_SIZE; + + /* + * Memory allocated from the bootstrap region remains permanently + * allocated. 
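+	 * Its backing pages are left in place; only the KVA is returned to
+	 * the arena, where uma_pcpu_alloc() will see it as already backed.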
+ */ + if ((vm_offset_t)pcpu_offset >= VM_PCPU_BOOTSTRAP_SIZE) + for (i = 0; i <= mp_maxid; i++) { + pcpu_addr = (vm_offset_t)zpcpu_get_cpu(pcpu_offset, i); + kmem_unback(kernel_object, pcpu_addr, pcpu_size); + } + + vmem_free(uma_pcpu_arena, (vm_offset_t)mem, pcpu_size); +} + +static int +pcpu_import(void *arg, vmem_size_t size, int flags, vmem_addr_t *addrp) +{ + vm_size_t kvasize, nbpdom; + + nbpdom = (int)(uintptr_t)arg * NBPDR; + kvasize = nbpdom * vm_ndomains; + return (vmem_xalloc(kernel_arena, kvasize, VM_PCPU_ALIGN, 0, 0, + 0, ~(vmem_addr_t)0, M_BESTFIT | flags, addrp)); +} + +void +uma_pcpu_init1(vm_offset_t addr, vm_size_t size) +{ + uma_pcpu_bootstrap_addr = addr; + uma_pcpu_bootstrap_size = size; +} + +void +uma_pcpu_init2(int n4kpgpcpu, int n2mpgpdom) +{ + vmem_addr_t addr, addr1; + vmem_size_t pcpu_size; + int error; + + KASSERT(!smp_started, ("%s: called after SMP is started", __func__)); + + pcpu_size = PAGE_SIZE; + + uma_pcpu_arena = vmem_create("UMA pcpu arena", 0, 0, pcpu_size, 0, + M_WAITOK); + vmem_set_import(uma_pcpu_arena, pcpu_import, NULL, + (void *)(uintptr_t)n2mpgpdom, ptoa(n4kpgpcpu)); + + /* + * Add the bootstrap region. Structures allocated during boot may be + * freed, for example if a preloaded module is unloaded, so they are + * marked here as allocated. + */ + error = vmem_add(uma_pcpu_arena, VM_PCPU_BASE_START, ptoa(n4kpgpcpu), + M_WAITOK); + if (error != 0) + panic("%s: vmem_add() failed: %d", __func__, error); + for (addr = VM_PCPU_BASE_START; addr < uma_pcpu_bootstrap_addr; + addr += pcpu_size) { + error = vmem_xalloc(uma_pcpu_arena, pcpu_size, 0, 0, 0, + addr, addr + pcpu_size, M_BESTFIT | M_WAITOK, &addr1); + if (error != 0) + panic("%s: vmem_xalloc() failed: %d", __func__, error); + } + + uma_pcpu_bootstrapped = true; +} + +vm_size_t +uma_pcpu_bootstrap_used(void) +{ + return (uma_pcpu_bootstrap_addr - VM_PCPU_BASE_START); +} Index: sys/amd64/include/pcpu.h =================================================================== --- sys/amd64/include/pcpu.h +++ sys/amd64/include/pcpu.h @@ -37,6 +37,7 @@ #include #include +#include #define PC_PTI_STACK_SZ 16 @@ -238,11 +239,23 @@ #define PCPU_PTR(member) __PCPU_PTR(pc_ ## member) #define PCPU_SET(member, val) __PCPU_SET(pc_ ## member, val) +#define DPCPU_BASE(pc) ((uintptr_t)((struct pcpu *)(pc) + 1)) + +/* + * Kernel modules use a dynamically allocated region in the DPCPU area, + * so they must fall back to the indirection through pc_dynamic. + */ +#ifndef KLD_MODULE +#define DPCPU_BASE_OFFSET(pc) (DPCPU_BASE(pc) - DPCPU_START) +#endif + #define IS_BSP() (PCPU_GET(cpuid) == 0) -#define zpcpu_offset_cpu(cpu) ((uintptr_t)&__pcpu[0] + UMA_PCPU_ALLOC_SIZE * cpu) -#define zpcpu_base_to_offset(base) (void *)((uintptr_t)(base) - (uintptr_t)&__pcpu[0]) -#define zpcpu_offset_to_base(base) (void *)((uintptr_t)(base) + (uintptr_t)&__pcpu[0]) +#define zpcpu_offset_cpu(cpu) ((uintptr_t)cpuid_to_pcpu[cpu]) +#define zpcpu_base_to_offset(base) ((void *)((uintptr_t)(base) - \ + (uintptr_t)VM_PCPU_BASE_START)) +#define zpcpu_offset_to_base(base) ((void *)((uintptr_t)(base) + \ + (uintptr_t)VM_PCPU_BASE_START)) #define zpcpu_sub_protected(base, n) do { \ ZPCPU_ASSERT_PROTECTED(); \ Index: sys/amd64/include/pcpu_aux.h =================================================================== --- sys/amd64/include/pcpu_aux.h +++ sys/amd64/include/pcpu_aux.h @@ -42,10 +42,7 @@ #endif /* Required for counters(9) to work on x86. 
*/ -_Static_assert(sizeof(struct pcpu) == UMA_PCPU_ALLOC_SIZE, "fix pcpu size"); - -extern struct pcpu *__pcpu; -extern struct pcpu temp_bsp_pcpu; +_Static_assert(sizeof(struct pcpu) % PAGE_SIZE == 0, "fix pcpu size"); static __inline __pure2 struct thread * __curthread(void) Index: sys/amd64/include/vmparam.h =================================================================== --- sys/amd64/include/vmparam.h +++ sys/amd64/include/vmparam.h @@ -78,6 +78,12 @@ */ #define UMA_MD_SMALL_ALLOC +/* + * We provide a machine specific per-CPU allocator which returns 2MB mappings + * when possible. + */ +#define UMA_MD_PCPU_ALLOC + /* * The physical address space is densely populated. */ @@ -165,7 +171,8 @@ * * Within the kernel map: * - * 0xfffffe0000000000 vm_page_array + * 0xfffffe0000000000 bootstrap pcpu region + * 0xfffffe0020000000 vm_page_array * 0xffffffff80000000 KERNBASE */ @@ -192,6 +199,13 @@ #define VM_MAX_ADDRESS UPT_MAX_ADDRESS #define VM_MIN_ADDRESS (0) +#define VM_PCPU_BASE_START VM_MIN_KERNEL_ADDRESS +#define VM_PCPU_BASE_SIZE (MAXCPU * NBPDR) +#define VM_PCPU_BOOTSTRAP_SIZE NBPDR +#define VM_PCPU_ALIGN NBPDR + +#define VM_PAGE_ARRAY_START (VM_PCPU_BASE_START + VM_PCPU_BASE_SIZE) + /* * XXX Allowing dmaplimit == 0 is a temporary workaround for vt(4) efifb's * early use of PHYS_TO_DMAP before the mapping is actually setup. This works Index: sys/i386/i386/mp_machdep.c =================================================================== --- sys/i386/i386/mp_machdep.c +++ sys/i386/i386/mp_machdep.c @@ -146,6 +146,9 @@ static char *ap_copyout_buf; static char *ap_tramp_stack_base; + +static void *dpcpu; + /* * Initialize the IPI handlers and start up the AP's. */ Index: sys/sys/pcpu.h =================================================================== --- sys/sys/pcpu.h +++ sys/sys/pcpu.h @@ -109,6 +109,10 @@ static t DPCPU_NAME(n) __section(DPCPU_SETNAME) __used #endif +#ifndef DPCPU_BASE_OFFSET +#define DPCPU_BASE_OFFSET(pc) ((pc)->pc_dynamic) +#endif + /* * Accessors with a given base. */ @@ -120,7 +124,7 @@ /* * Accessors for the current cpu. 
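+ * On amd64, DPCPU_BASE_OFFSET() locates the dynamic per-CPU area at a fixed
+ * offset from the pcpu pointer instead of loading pc_dynamic on each access;
+ * kernel modules still go through the pc_dynamic indirection.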
*/ -#define DPCPU_PTR(n) _DPCPU_PTR(PCPU_GET(dynamic), n) +#define DPCPU_PTR(n) _DPCPU_PTR(DPCPU_BASE_OFFSET(get_pcpu()), n) #define DPCPU_GET(n) (*DPCPU_PTR(n)) #define DPCPU_SET(n, v) (*DPCPU_PTR(n) = v) Index: sys/vm/uma_core.c =================================================================== --- sys/vm/uma_core.c +++ sys/vm/uma_core.c @@ -280,11 +280,13 @@ static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); +#ifndef UMA_MD_SMALL_ALLOC static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); +static void pcpu_page_free(void *, vm_size_t, uint8_t); +#endif static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void *contig_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); static void page_free(void *, vm_size_t, uint8_t); -static void pcpu_page_free(void *, vm_size_t, uint8_t); static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); @@ -1514,6 +1516,7 @@ uma_alloc allocf; uma_slab_t slab; unsigned long size; + int pperslab; uint8_t *mem; uint8_t sflags; int i; @@ -1569,10 +1572,18 @@ else slab_tohashslab(slab)->uhs_data = mem; - if (keg->uk_flags & UMA_ZFLAG_VTOSLAB) - for (i = 0; i < keg->uk_ppera; i++) - vsetzoneslab((vm_offset_t)mem + (i * PAGE_SIZE), + if ((keg->uk_flags & UMA_ZFLAG_VTOSLAB) != 0) { + /* + * Per-CPU slabs have a special layout. Only pages belonging to + * the base of the allocation need to be marked, and the slab + * may not be contiguous. + */ + pperslab = (keg->uk_flags & UMA_ZONE_PCPU) != 0 ? + atop(UMA_PCPU_ALLOC_SIZE) : keg->uk_ppera; + for (i = 0; i < pperslab; i++) + vsetzoneslab((vm_offset_t)mem + i * PAGE_SIZE, zone, slab); + } slab->us_freecount = keg->uk_ipers; slab->us_flags = sflags; @@ -1701,6 +1712,7 @@ return (p); } +#ifndef UMA_MD_PCPU_ALLOC static void * pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait) @@ -1755,6 +1767,7 @@ } return (NULL); } +#endif /* * Allocates a number of pages from within an object @@ -1856,6 +1869,7 @@ kmem_free((vm_offset_t)mem, size); } +#ifndef UMA_MD_PCPU_ALLOC /* * Frees pcpu zone allocations * @@ -1891,7 +1905,7 @@ pmap_qremove(sva, size >> PAGE_SHIFT); kva_free(sva, size); } - +#endif /* * Zero fill initializer @@ -2243,7 +2257,11 @@ if (booted < BOOT_KVA) keg->uk_allocf = startup_alloc; else if (keg->uk_flags & UMA_ZONE_PCPU) +#ifdef UMA_MD_PCPU_ALLOC + keg->uk_allocf = uma_pcpu_alloc; +#else keg->uk_allocf = pcpu_page_alloc; +#endif else if ((keg->uk_flags & UMA_ZONE_CONTIG) != 0 && keg->uk_ppera > 1) keg->uk_allocf = contig_alloc; else @@ -2254,7 +2272,11 @@ else #endif if (keg->uk_flags & UMA_ZONE_PCPU) +#ifdef UMA_MD_PCPU_ALLOC + keg->uk_freef = uma_pcpu_free; +#else keg->uk_freef = pcpu_page_free; +#endif else keg->uk_freef = page_free; @@ -3114,10 +3136,21 @@ if (item == NULL) return (NULL); pcpu_item = zpcpu_base_to_offset(item); - if (flags & M_ZERO) { + if ((flags & M_ZERO) != 0) { #ifdef SMP - for (i = 0; i <= mp_maxid; i++) + for (i = 0; i <= mp_maxid; i++) { bzero(zpcpu_get_cpu(pcpu_item, i), zone->uz_size); +#ifdef UMA_MD_PCPU_ALLOC + if (__predict_false(booted < BOOT_RUNNING)) + /* + * Only CPU's 0 memory is accessible if the + * per-CPU allocator is still being + * bootstrapped. The allocator guarantees that + * early allocations will be zero-filled. 
+ */ + break; +#endif + } #else bzero(item, zone->uz_size); #endif Index: sys/vm/uma_int.h =================================================================== --- sys/vm/uma_int.h +++ sys/vm/uma_int.h @@ -664,6 +664,7 @@ uma_reclaim_wakeup(); } +#ifdef UMA_MD_SMALL_ALLOC /* * The following two functions may be defined by architecture specific code * if they can provide more efficient allocation functions. This is useful @@ -672,6 +673,19 @@ void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag, int wait); void uma_small_free(void *mem, vm_size_t size, uint8_t flags); +#endif + +#ifdef UMA_MD_PCPU_ALLOC +void *uma_pcpu_alloc(uma_zone_t zone, vm_size_t bytes, int domain, + uint8_t *pflag, int wait); +void uma_pcpu_free(void *mem, vm_size_t size, uint8_t flags); + +#ifdef __amd64__ +void uma_pcpu_init1(vm_offset_t addr, vm_size_t size); +void uma_pcpu_init2(int ptpstride, int npdepdom); +vm_size_t uma_pcpu_bootstrap_used(void); +#endif +#endif /* Set a global soft limit on UMA managed memory. */ void uma_set_limit(unsigned long limit); Index: sys/vm/vm_kern.c =================================================================== --- sys/vm/vm_kern.c +++ sys/vm/vm_kern.c @@ -767,6 +767,14 @@ /* ... and ending with the completion of the above `insert' */ #ifdef __amd64__ + /* + * Mark the PCPU bootstrap region as allocated. In practice most of + * this region will be released back to the VM during boot. + */ + (void)vm_map_insert(m, NULL, 0, VM_PCPU_BASE_START, + VM_PCPU_BASE_START + VM_PCPU_BASE_SIZE, + VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT); + /* * Mark KVA used for the page array as allocated. Other platforms * that handle vm_page_array allocation can simply adjust virtual_avail Index: sys/x86/include/x86_smp.h =================================================================== --- sys/x86/include/x86_smp.h +++ sys/x86/include/x86_smp.h @@ -29,7 +29,6 @@ extern struct pcb stoppcbs[]; extern int cpu_apic_ids[]; extern int bootAP; -extern void *dpcpu; extern char *bootSTK; extern void *bootstacks[]; extern unsigned int boot_address; Index: sys/x86/x86/mp_x86.c =================================================================== --- sys/x86/x86/mp_x86.c +++ sys/x86/x86/mp_x86.c @@ -94,7 +94,6 @@ /* Free these after use */ void *bootstacks[MAXCPU]; -void *dpcpu; struct pcb stoppcbs[MAXCPU]; struct susppcb **susppcbs; Index: sys/x86/xen/pv.c =================================================================== --- sys/x86/xen/pv.c +++ sys/x86/xen/pv.c @@ -365,7 +365,6 @@ mce_stack = (char *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO); nmi_stack = (char *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO); dbg_stack = (void *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO); - dpcpu = (void *)kmem_malloc(DPCPU_SIZE, M_WAITOK | M_ZERO); bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 8; bootAP = cpu;
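Note (reviewer sketch, not part of the patch): the minimal example below shows how a per-CPU pointer is resolved under the new layout, using the cpuid_to_pcpu[] bases assigned by pcpu_layout() and the offset convention from the amd64 zpcpu macros above. The helper name is hypothetical; it mirrors what zpcpu_get_cpu() expands to with these definitions.

/*
 * Illustrative only.  A per-CPU allocation is handed out as an offset from
 * VM_PCPU_BASE_START (see zpcpu_base_to_offset()); adding that offset to a
 * CPU's pcpu base yields that CPU's copy, which uma_pcpu_alloc() ensures is
 * backed, using memory from the CPU's domain when possible.
 */
static inline void *
zpcpu_resolve_example(void *pcpu_offset, int cpu)
{
	/* Equivalent to zpcpu_offset_cpu(cpu) under the new scheme. */
	uintptr_t base = (uintptr_t)cpuid_to_pcpu[cpu];

	return ((void *)(base + (uintptr_t)pcpu_offset));
}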