Changeset View
Standalone View
sys/arm64/arm64/mp_machdep.c
Show First 20 Lines • Show All 140 Lines • ▼ Show 20 Lines | ||||||||||
static void *bootstacks[MAXCPU]; | static void *bootstacks[MAXCPU]; | |||||||||
/* Count of started APs, used to synchronize access to bootstack. */ | /* Count of started APs, used to synchronize access to bootstack. */ | |||||||||
static volatile int aps_started; | static volatile int aps_started; | |||||||||
/* Set to 1 once we're ready to let the APs out of the pen. */ | /* Set to 1 once we're ready to let the APs out of the pen. */ | |||||||||
static volatile int aps_ready; | static volatile int aps_ready; | |||||||||
/* Base address of the current L2-sized PCPU chunk, one per NUMA domain. */
static vm_offset_t pcpu_mem[MAXMEMDOM];
/* Bump-allocator cursor for the next allocation within pcpu_mem[domain]. */
static vm_offset_t pcpu_off[MAXMEMDOM];
#ifdef INVARIANTS
/* Number of L2 chunks handed out so far; indexes pcpu_allocations. */
static int pcpu_alloc_id;
/* Start address of each chunk, recorded so we can check it isn't promoted. */
vm_offset_t pcpu_allocations[MAXCPU];
#endif
/* Temporary variables for init_secondary() */ | /* Temporary variables for init_secondary() */ | |||||||||
void *dpcpu[MAXCPU - 1]; | void *dpcpu[MAXCPU - 1]; | |||||||||
static bool | static bool | |||||||||
is_boot_cpu(uint64_t target_cpu) | is_boot_cpu(uint64_t target_cpu) | |||||||||
{ | { | |||||||||
return (cpuid_to_pcpu[0]->pc_mpidr == (target_cpu & CPU_AFF_MASK)); | return (cpuid_to_pcpu[0]->pc_mpidr == (target_cpu & CPU_AFF_MASK)); | |||||||||
▲ Show 20 Lines • Show All 323 Lines • ▼ Show 20 Lines | ||||||||||
cpu_mp_probe(void) | cpu_mp_probe(void) | |||||||||
{ | { | |||||||||
/* ARM64TODO: Read the u bit of mpidr_el1 to determine this */ | /* ARM64TODO: Read the u bit of mpidr_el1 to determine this */ | |||||||||
return (1); | return (1); | |||||||||
} | } | |||||||||
/* | /* | |||||||||
* Allocate memory for the PCPU and DPCPU data. We allocate a level 2 | ||||||||||
markj: IMO this is a hack, especially given that it'll waste memory on small systems, so we definitely… | ||||||||||
* sized and aligned region so later allocations won't cause this memory | ||||||||||
* to be promoted to a level 2 block. This can cause recursive exceptions | ||||||||||
* as the exception handler needs to dereference the PCPU region and | ||||||||||
the promotion code needs to temporarily mark the mapping invalid in the
markjUnsubmitted Not Done Inline Actions
markj: | ||||||||||
* page table before marking the new level 2 block valid. This is normally | ||||||||||
* fine as this is fast so the memory has been remapped by the time the | ||||||||||
* exception handler is run, however when running on a VM we may exit to the | ||||||||||
* host while this mapping is invalid meaning the memory is not remapped | ||||||||||
* in time. | ||||||||||
* | ||||||||||
* Work around this by making sure the allocation takes an entire level 2 | ||||||||||
* block so won't be promoted. To reduce the wasted memory reuse this for | ||||||||||
as many CPUs in the same domain as possible.
markjUnsubmitted Not Done Inline ActionsTo be clear, this change doesn't actually guarantee that the allocation is mapped by an L2 block (transparent superpages could be administratively disabled), and we're also assuming here that we'll never transparently use L1 blocks. So the approach really isn't ideal. Taking a step back, I wonder if we can use IPIs to pause all other CPUs when pmap_update_entry() is promoting the L2 block containing pcpu pages? markj: To be clear, this change doesn't actually guarantee that the allocation is mapped by an L2… | ||||||||||
kibUnsubmitted Not Done Inline Actions
Wouldn't IPI have the same problem: you need to ensure that IPI does not touch anything that could be broken for either promotion or demotion. So for instance we must ensure that pages containing global variables used by smp_rendezvous_cpus() are safe. kib: Wouldn't IPI have the same problem: you need to ensure that IPI does not touch anything that… | ||||||||||
markjUnsubmitted Not Done Inline ActionsHmm, can we do all of the work from the smp_rendezvous callback? That is, the initiator's callback looks like this: while (atomic_load_acq_int(&spinning) != mp_ncpus - 1) cpu_spinwait(); pmap_clear_bits(pte, ATTR_DESCR_VALID); pmap_invalidate_range(pmap, va, va + size, false); pmap_store(pte, newpte); dsb(ishst); atomic_store_rel_int(&done, 1); and on targets: atomic_add_rel_int(&spinning, 1); while (atomic_load_acq_int(&done) == 0) cpu_spinwait(); I think this would solve the problem you pointed out. markj: Hmm, can we do all of the work from the smp_rendezvous callback? That is, the initiator's… | ||||||||||
kibUnsubmitted Not Done Inline ActionsWe need to execute the code to get into the callback? For instance, what if the demotion needs to occur for L2 page where smp_ipi_mtx is located? Might be, there should be a dedicated IPI vector and dedicated L1 page with the spinning indicator (kind of barrier) that would allow to safely 'ground down' all other CPUs while current one is doing in-kernel promotion/demotion for specific unsafe places. kib: We need to execute the code to get into the callback? For instance, what if the demotion needs… | ||||||||||
markjUnsubmitted Not Done Inline ActionsBut with this approach all CPUs are "parked" while the L2 PTE is updated. Nothing will try to acquire the smp_ipi mutex during the window where the mapping is invalid. Maybe I'm missing something. markj: But with this approach all CPUs are "parked" while the L2 PTE is updated. Nothing will try to… | ||||||||||
kibUnsubmitted Not Done Inline ActionsOk, it is not smp_ipi mutex itself, but still a page containing some variable you need to re-check in the loop to detect the parking end. kib: Ok, it is not smp_ipi mutex itself, but still a page containing some variable you need to re… | ||||||||||
*/ | ||||||||||
static vm_offset_t | ||||||||||
alloc_pcpu(int domain, vm_size_t size) | ||||||||||
{ | ||||||||||
vm_offset_t addr; | ||||||||||
domain = domain % MAXMEMDOM; | ||||||||||
if (pcpu_mem[domain] == 0 || | ||||||||||
(pcpu_off[domain] + size) >= (pcpu_mem[domain] + L2_SIZE)) { | ||||||||||
pcpu_mem[domain] = kmem_alloc_contig_domainset( | ||||||||||
DOMAINSET_PREF(domain), L2_SIZE, M_WAITOK | M_ZERO, | ||||||||||
0, ~(vm_paddr_t)0, L2_SIZE, 0, VM_MEMATTR_DEFAULT); | ||||||||||
pcpu_off[domain] = pcpu_mem[domain]; | ||||||||||
#ifdef INVARIANTS | ||||||||||
/* Record the allocation to check we don't promote it */ | ||||||||||
pcpu_allocations[pcpu_alloc_id] = pcpu_mem[domain]; | ||||||||||
pcpu_alloc_id++; | ||||||||||
#endif | ||||||||||
} | ||||||||||
addr = pcpu_off[domain]; | ||||||||||
/* Align the next allocation */ | ||||||||||
pcpu_off[domain] = roundup2(addr + size, _Alignof(void *)); | ||||||||||
return (addr); | ||||||||||
} | ||||||||||
/* | ||||||||||
* Starts a given CPU. If the CPU is already running, i.e. it is the boot CPU, | * Starts a given CPU. If the CPU is already running, i.e. it is the boot CPU, | |||||||||
* do nothing. Returns true if the CPU is present and running. | * do nothing. Returns true if the CPU is present and running. | |||||||||
*/ | */ | |||||||||
static bool | static bool | |||||||||
start_cpu(u_int cpuid, uint64_t target_cpu, int domain) | start_cpu(u_int cpuid, uint64_t target_cpu, int domain) | |||||||||
{ | { | |||||||||
struct pcpu *pcpup; | struct pcpu *pcpup; | |||||||||
vm_paddr_t pa; | vm_paddr_t pa; | |||||||||
int err, naps; | int err, naps; | |||||||||
/* Check we are able to start this cpu */ | /* Check we are able to start this cpu */ | |||||||||
if (cpuid > mp_maxid) | if (cpuid > mp_maxid) | |||||||||
return (false); | return (false); | |||||||||
/* Skip boot CPU */ | /* Skip boot CPU */ | |||||||||
if (is_boot_cpu(target_cpu)) | if (is_boot_cpu(target_cpu)) | |||||||||
return (true); | return (true); | |||||||||
KASSERT(cpuid < MAXCPU, ("Too many CPUs")); | KASSERT(cpuid < MAXCPU, ("Too many CPUs")); | |||||||||
pcpup = (void *)kmem_malloc_domainset(DOMAINSET_PREF(domain), | pcpup = (void *)alloc_pcpu(domain, sizeof(*pcpup)); | |||||||||
sizeof(*pcpup), M_WAITOK | M_ZERO); | ||||||||||
pcpu_init(pcpup, cpuid, sizeof(struct pcpu)); | pcpu_init(pcpup, cpuid, sizeof(struct pcpu)); | |||||||||
pcpup->pc_mpidr = target_cpu & CPU_AFF_MASK; | pcpup->pc_mpidr = target_cpu & CPU_AFF_MASK; | |||||||||
dpcpu[cpuid - 1] = (void *)kmem_malloc_domainset( | dpcpu[cpuid - 1] = (void *)alloc_pcpu(domain, DPCPU_SIZE); | |||||||||
DOMAINSET_PREF(domain), DPCPU_SIZE, M_WAITOK | M_ZERO); | ||||||||||
dpcpu_init(dpcpu[cpuid - 1], cpuid); | dpcpu_init(dpcpu[cpuid - 1], cpuid); | |||||||||
bootstacks[cpuid] = (void *)kmem_malloc_domainset( | bootstacks[cpuid] = (void *)kmem_malloc_domainset( | |||||||||
DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO); | DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO); | |||||||||
naps = atomic_load_int(&aps_started); | naps = atomic_load_int(&aps_started); | |||||||||
bootstack = (char *)bootstacks[cpuid] + PAGE_SIZE; | bootstack = (char *)bootstacks[cpuid] + PAGE_SIZE; | |||||||||
printf("Starting CPU %u (%lx)\n", cpuid, target_cpu); | printf("Starting CPU %u (%lx)\n", cpuid, target_cpu); | |||||||||
pa = pmap_extract(kernel_pmap, (vm_offset_t)mpentry); | pa = pmap_extract(kernel_pmap, (vm_offset_t)mpentry); | |||||||||
err = psci_cpu_on(target_cpu, pa, cpuid); | err = psci_cpu_on(target_cpu, pa, cpuid); | |||||||||
if (err != PSCI_RETVAL_SUCCESS) { | if (err != PSCI_RETVAL_SUCCESS) { | |||||||||
/* | /* | |||||||||
* Panic here if INVARIANTS are enabled and PSCI failed to | * Panic here if INVARIANTS are enabled and PSCI failed to | |||||||||
* start the requested CPU. psci_cpu_on() returns PSCI_MISSING | * start the requested CPU. psci_cpu_on() returns PSCI_MISSING | |||||||||
* to indicate we are unable to use it to start the given CPU. | * to indicate we are unable to use it to start the given CPU. | |||||||||
*/ | */ | |||||||||
KASSERT(err == PSCI_MISSING || | KASSERT(err == PSCI_MISSING || | |||||||||
(mp_quirks & MP_QUIRK_CPULIST) == MP_QUIRK_CPULIST, | (mp_quirks & MP_QUIRK_CPULIST) == MP_QUIRK_CPULIST, | |||||||||
("Failed to start CPU %u (%lx), error %d\n", | ("Failed to start CPU %u (%lx), error %d\n", | |||||||||
cpuid, target_cpu, err)); | cpuid, target_cpu, err)); | |||||||||
pcpu_destroy(pcpup); | pcpu_destroy(pcpup); | |||||||||
kmem_free((vm_offset_t)dpcpu[cpuid - 1], DPCPU_SIZE); | ||||||||||
dpcpu[cpuid - 1] = NULL; | dpcpu[cpuid - 1] = NULL; | |||||||||
kmem_free((vm_offset_t)bootstacks[cpuid], PAGE_SIZE); | kmem_free((vm_offset_t)bootstacks[cpuid], PAGE_SIZE); | |||||||||
Done Inline Actionsbootstacks are still allocated directly, so this line should stay. markj: bootstacks are still allocated directly, so this line should stay. | ||||||||||
bootstacks[cpuid] = NULL; | bootstacks[cpuid] = NULL; | |||||||||
mp_ncpus--; | mp_ncpus--; | |||||||||
return (false); | return (false); | |||||||||
} | } | |||||||||
/* Wait for the AP to switch to its boot stack. */ | /* Wait for the AP to switch to its boot stack. */ | |||||||||
while (atomic_load_int(&aps_started) < naps + 1) | while (atomic_load_int(&aps_started) < naps + 1) | |||||||||
cpu_spinwait(); | cpu_spinwait(); | |||||||||
▲ Show 20 Lines • Show All 397 Lines • Show Last 20 Lines |
IMO this is a hack, especially given that it'll waste memory on small systems, so we definitely need a comment explaining why allocation is done this way.