diff --git a/sys/riscv/include/md_var.h b/sys/riscv/include/md_var.h
--- a/sys/riscv/include/md_var.h
+++ b/sys/riscv/include/md_var.h
@@ -44,6 +44,7 @@
 /* Supervisor-mode extension support */
 extern bool has_sstc;
 extern bool has_sscofpmf;
+extern bool has_svpbmt;
 
 struct dumperinfo;
 struct minidumpstate;
diff --git a/sys/riscv/include/pte.h b/sys/riscv/include/pte.h
--- a/sys/riscv/include/pte.h
+++ b/sys/riscv/include/pte.h
@@ -83,6 +83,27 @@
 #define	PTE_PROMOTE	(PTE_V | PTE_RWX | PTE_D | PTE_G | PTE_U | \
 			 PTE_SW_MANAGED | PTE_SW_WIRED)
 
+/*
+ * Svpbmt Memory Attribute (MA) bits [62:61].
+ *
+ * +------+-------+----------------------------------------------------------------+
+ * | Mode | Value | Requested Memory Attributes                                    |
+ * +------+-------+----------------------------------------------------------------+
+ * | PMA  | 00    | None, inherited from Physical Memory Attributes (firmware)     |
+ * | NC   | 01    | Non-cacheable, idempotent, weakly-ordered (RVWMO), main memory |
+ * | IO   | 10    | Non-cacheable, non-idempotent, strongly-ordered, I/O           |
+ * | --   | 11    | Reserved                                                       |
+ * +------+-------+----------------------------------------------------------------+
+ *
+ * PTE_MA_TO_MODE() extracts the two-bit Value column from a PTE.
+ */
+#define	PTE_MA_SHIFT		61
+#define	PTE_MA_MASK		(0x3ul << PTE_MA_SHIFT)
+#define	PTE_MA_TO_MODE(pte)	(((pte) & PTE_MA_MASK) >> PTE_MA_SHIFT)
+#define	PTE_MA_NONE		(0ul)
+#define	PTE_MA_NC		(1ul << PTE_MA_SHIFT)
+#define	PTE_MA_IO		(2ul << PTE_MA_SHIFT)
+
 /* Bits 63 - 54 are reserved for future use. */
 #define	PTE_HI_MASK	0xFFC0000000000000ULL
 
diff --git a/sys/riscv/include/vm.h b/sys/riscv/include/vm.h
--- a/sys/riscv/include/vm.h
+++ b/sys/riscv/include/vm.h
@@ -28,10 +28,14 @@
 #define	_MACHINE_VM_H_
 
 /* Memory attribute configuration. */
-#define	VM_MEMATTR_DEVICE		0
+#define	VM_MEMATTR_PMA			0
 #define	VM_MEMATTR_UNCACHEABLE		1
-#define	VM_MEMATTR_WRITE_BACK		2
+#define	VM_MEMATTR_DEVICE		2
 
-#define	VM_MEMATTR_DEFAULT		VM_MEMATTR_WRITE_BACK
+#define	VM_MEMATTR_WRITE_BACK		VM_MEMATTR_PMA
+#define	VM_MEMATTR_DEFAULT		VM_MEMATTR_PMA
+
+#define	VM_MEMATTR_LAST			VM_MEMATTR_DEVICE
+#define	VM_MEMATTR_TOTAL		(VM_MEMATTR_LAST + 1)
 
 #endif /* !_MACHINE_VM_H_ */
diff --git a/sys/riscv/riscv/identcpu.c b/sys/riscv/riscv/identcpu.c
--- a/sys/riscv/riscv/identcpu.c
+++ b/sys/riscv/riscv/identcpu.c
@@ -74,6 +74,7 @@
 /* Supervisor-mode extension support. */
 bool __read_frequently has_sstc;
 bool __read_frequently has_sscofpmf;
+bool __read_frequently has_svpbmt;
 
 struct cpu_desc {
 	const char	*cpu_mvendor_name;
@@ -414,6 +415,7 @@
 	/* Supervisor-mode extension support. */
 	UPDATE_CAP(has_sstc, (desc->smode_extensions & SV_SSTC) != 0);
 	UPDATE_CAP(has_sscofpmf, (desc->smode_extensions & SV_SSCOFPMF) != 0);
+	UPDATE_CAP(has_svpbmt, (desc->smode_extensions & SV_SVPBMT) != 0);
 
 #undef UPDATE_CAP
 }
diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c
--- a/sys/riscv/riscv/pmap.c
+++ b/sys/riscv/riscv/pmap.c
@@ -527,6 +527,24 @@
 	mtx_unlock(&allpmaps_lock);
 }
 
+/*
+ * Holds the PTE mode bits (defined in pte.h) for defining e.g. cacheability.
+ *
+ * The indices correspond to the VM_MEMATTR_* defines in riscv/include/vm.h.
+ *
+ * Every entry stays zero if no mode bits are supported by the CPU, e.g. when
+ * lacking the Svpbmt extension.
+ */
+static __read_frequently pt_entry_t memattr_bits[VM_MEMATTR_TOTAL];
+
+static __inline pt_entry_t
+pmap_memattr_bits(vm_memattr_t mode)
+{
+	KASSERT(pmap_is_valid_memattr(kernel_pmap, mode),
+	    ("invalid memory mode %u", mode));
+	return (memattr_bits[(int)mode]);
+}
+
 /*
  * This should only be used during pmap bootstrap e.g. by
  * pmap_create_pagetables().
@@ -560,6 +578,7 @@
 	vm_offset_t va;
 	vm_paddr_t min_pa, max_pa, pa, endpa;
 	pd_entry_t *l2;
+	pt_entry_t memattr;
 	u_int l1slot, l2slot;
 	int physmap_idx;
 
@@ -574,6 +593,8 @@
 	/* Lower physical address aligned to 1GB. */
 	dmap_phys_base = rounddown(min_pa, L1_SIZE);
 
+	memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT);
+
 	/* Walk the physmap table. */
 	l2 = NULL;
 	l1slot = Ln_ENTRIES; /* sentinel value */
@@ -607,7 +628,7 @@
 
 			/* map l2 pages */
 			l2slot = pmap_l2_index(va);
-			pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN));
+			pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr));
 
 			pa += L2_SIZE;
 			va += L2_SIZE;
@@ -617,7 +638,7 @@
 		while (pa + L1_SIZE - 1 < endpa) {
 			/* map l1 pages */
 			l1slot = pmap_l1_index(va);
-			pmap_store(&l1[l1slot], L1_PTE(pa, PTE_KERN));
+			pmap_store(&l1[l1slot], L1_PTE(pa, PTE_KERN | memattr));
 
 			pa += L1_SIZE;
 			va += L1_SIZE;
@@ -637,7 +658,7 @@
 
 			/* map l2 pages */
 			l2slot = pmap_l2_index(va);
-			pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN));
+			pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr));
 
 			pa += L2_SIZE;
 			va += L2_SIZE;
@@ -669,6 +690,7 @@
 pmap_create_pagetables(vm_paddr_t kernstart, vm_size_t kernlen)
 {
 	pt_entry_t *l0, *l1, *kern_l2, *kern_l3, *devmap_l3;
+	pt_entry_t memattr;
 	pd_entry_t *devmap_l2;
 	vm_paddr_t kernend, freemempos, pa;
 	int nkernl2, nkernl3, ndevmapl3;
@@ -741,6 +763,9 @@
 	if (freemempos < roundup2(kernend, L2_SIZE))
 		freemempos = roundup2(kernend, L2_SIZE);
 
+	/* Memory attributes for standard/main memory. */
+	memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT);
+
 	/*
 	 * Map the kernel (and preloaded modules or data) using L2 superpages.
 	 *
@@ -753,7 +778,8 @@
 	 */
 	slot = pmap_l2_index(KERNBASE);
 	for (pa = kernstart; pa < kernend; pa += L2_SIZE, slot++) {
-		pmap_store(&kern_l2[slot], L2_PTE(pa, PTE_KERN | PTE_X));
+		pmap_store(&kern_l2[slot],
+		    L2_PTE(pa, PTE_KERN | PTE_X | memattr));
 	}
 
 	/*
@@ -825,6 +851,15 @@
 	 */
 	CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active);
 
+	/*
+	 * Set up the memory attribute bits; without Svpbmt they stay zero.
+	 */
+	if (has_svpbmt) {
+		memattr_bits[VM_MEMATTR_PMA] = PTE_MA_NONE;
+		memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_MA_NC;
+		memattr_bits[VM_MEMATTR_DEVICE] = PTE_MA_IO;
+	}
+
 	/* Create a new set of pagetables to run the kernel in. */
 	freemempos = pmap_create_pagetables(kernstart, kernlen);
 
@@ -890,7 +925,7 @@
 {
 
 	TAILQ_INIT(&m->md.pv_list);
-	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
+	m->md.pv_memattr = VM_MEMATTR_DEFAULT;
 }
 
 /*
@@ -1143,10 +1178,11 @@
  ***************************************************/
 
 void
-pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode __unused)
+pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
 {
 	pt_entry_t entry;
 	pt_entry_t *l3;
+	pt_entry_t memattr;
 	vm_offset_t va;
 	pn_t pn;
 
@@ -1157,6 +1193,7 @@
 	KASSERT((size & PAGE_MASK) == 0,
 	    ("pmap_kenter_device: Mapping is not page-sized"));
 
+	memattr = pmap_memattr_bits(mode);
 	va = sva;
 	while (size != 0) {
 		l3 = pmap_l3(kernel_pmap, va);
@@ -1164,6 +1201,7 @@
 
 		pn = (pa / PAGE_SIZE);
 		entry = PTE_KERN;
+		entry |= memattr;
 		entry |= (pn << PTE_PPN0_S);
 		pmap_store(l3, entry);
 
@@ -1251,7 +1289,8 @@
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 {
-	pt_entry_t *l3, pa;
+	pt_entry_t *l3;
+	vm_paddr_t pa;
 	vm_offset_t va;
 	vm_page_t m;
 	pt_entry_t entry;
@@ -1266,6 +1305,7 @@
 		l3 = pmap_l3(kernel_pmap, va);
 
 		entry = PTE_KERN;
+		entry |= pmap_memattr_bits(m->md.pv_memattr);
 		entry |= (pn << PTE_PPN0_S);
 		pmap_store(l3, entry);
 
@@ -3044,6 +3084,7 @@
 	new_l3 |= (pn << PTE_PPN0_S);
 	if ((flags & PMAP_ENTER_WIRED) != 0)
 		new_l3 |= PTE_SW_WIRED;
+	new_l3 |= pmap_memattr_bits(m->md.pv_memattr);
 
 	/*
 	 * Set modified bit gratuitously for writeable mappings if
@@ -3251,13 +3292,12 @@
 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     struct rwlock **lockp)
 {
-	pd_entry_t new_l2;
-	pn_t pn;
+	pt_entry_t new_l2;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
-	pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE;
-	new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V);
+	new_l2 = L2_PTE(VM_PAGE_TO_PHYS(m), PTE_R | PTE_V |
+	    pmap_memattr_bits(m->md.pv_memattr));
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		new_l2 |= PTE_SW_MANAGED;
 	if ((prot & VM_PROT_EXECUTE) != 0)
@@ -3600,7 +3640,7 @@
 	pmap_resident_count_inc(pmap, 1);
 
 	newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) |
-	    PTE_V | PTE_R;
+	    PTE_V | PTE_R | pmap_memattr_bits(m->md.pv_memattr);
 	if ((prot & VM_PROT_EXECUTE) != 0)
 		newl3 |= PTE_X;
 	if ((m->oflags & VPO_UNMANAGED) == 0)
@@ -4707,6 +4747,8 @@
 	pd_entry_t *l1, l1e;
 	pd_entry_t *l2, l2e;
 	pt_entry_t *l3, l3e;
+	pt_entry_t bits, mask;
+	bool anychanged = false;
 
 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 	base = trunc_page(va);
@@ -4717,45 +4759,126 @@
 	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
 		return (EINVAL);
 
+	bits = pmap_memattr_bits(mode);
+	mask = PTE_MA_MASK;
+
+	/* First loop: perform PTE validation and demotions as necessary. */
 	for (tmpva = base; tmpva < base + size; ) {
 		l1 = pmap_l1(kernel_pmap, tmpva);
 		if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0)
 			return (EINVAL);
 		if ((l1e & PTE_RWX) != 0) {
 			/*
-			 * TODO: Demote if attributes don't match and there
-			 * isn't an L1 page left in the range, and update the
-			 * L1 entry if the attributes don't match but there is
-			 * an L1 page left in the range, once we support the
-			 * upcoming Svpbmt extension.
+			 * If the existing PTE has the correct attributes, then
+			 * no need to demote.
 			 */
-			tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
-			continue;
+			if ((l1e & mask) == bits) {
+				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
+				continue;
+			}
+
+			/*
+			 * If the 1GB page lies entirely within the remaining
+			 * range (ending exactly at its end counts), no demotion.
+			 */
+			if ((tmpva & L1_OFFSET) == 0 &&
+			    tmpva + L1_SIZE <= base + size) {
+				tmpva += L1_SIZE;
+				continue;
+			}
+
+			/* TODO */
+			panic("l1 demotion");
 		}
 		l2 = pmap_l1_to_l2(l1, tmpva);
 		if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0)
 			return (EINVAL);
 		if ((l2e & PTE_RWX) != 0) {
 			/*
-			 * TODO: Demote if attributes don't match and there
-			 * isn't an L2 page left in the range, and update the
-			 * L2 entry if the attributes don't match but there is
-			 * an L2 page left in the range, once we support the
-			 * upcoming Svpbmt extension.
+			 * If the existing PTE has the correct attributes, then
+			 * no need to demote.
 			 */
-			tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
-			continue;
+			if ((l2e & mask) == bits) {
+				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
+				continue;
+			}
+
+			/*
+			 * If the 2MB page lies entirely within the remaining
+			 * range (ending exactly at its end counts), no demotion.
+			 */
+			if ((tmpva & L2_OFFSET) == 0 &&
+			    tmpva + L2_SIZE <= base + size) {
+				tmpva += L2_SIZE;
+				continue;
+			}
+
+			if (!pmap_demote_l2(kernel_pmap, l2, tmpva))
+				panic("l2 demotion failed");
 		}
 		l3 = pmap_l2_to_l3(l2, tmpva);
 		if (l3 == NULL || ((l3e = pmap_load(l3)) & PTE_V) == 0)
 			return (EINVAL);
-		/*
-		 * TODO: Update the L3 entry if the attributes don't match once
-		 * we support the upcoming Svpbmt extension.
-		 */
+
+		tmpva += PAGE_SIZE;
+	}
+
+	/* Second loop: perform PTE updates. */
+	for (tmpva = base; tmpva < base + size; ) {
+		l1 = pmap_l1(kernel_pmap, tmpva);
+		l1e = pmap_load(l1);
+		if ((l1e & PTE_RWX) != 0) {
+			/* Update PTE if changed. */
+			if ((l1e & mask) != bits) {
+				l1e &= ~mask;
+				l1e |= bits;
+				pmap_store(l1, l1e);
+				anychanged = true;
+			}
+
+			/* TODO: update direct map */
+			/* Realign: base may start in the middle of this page. */
+			tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
+			continue;
+		}
+
+		l2 = pmap_l1_to_l2(l1, tmpva);
+		l2e = pmap_load(l2);
+		if ((l2e & PTE_RWX) != 0) {
+			/* Update PTE if changed. */
+			if ((l2e & mask) != bits) {
+				l2e &= ~mask;
+				l2e |= bits;
+				pmap_store(l2, l2e);
+				anychanged = true;
+			}
+
+			/* TODO: update direct map */
+			/* Realign: base may start in the middle of this page. */
+			tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
+			continue;
+		}
+
+		l3 = pmap_l2_to_l3(l2, tmpva);
+		l3e = pmap_load(l3);
+
+		if ((l3e & mask) != bits) {
+			l3e &= ~mask;
+			l3e |= bits;
+			pmap_store(l3, l3e);
+			anychanged = true;
+		}
+
+		/* TODO: update direct map */
 		tmpva += PAGE_SIZE;
 	}
 
+	if (anychanged) {
+		pmap_invalidate_range(kernel_pmap, base, base + size);
+		/* TODO: conditionally flush data cache? */
+	}
+
 	return (0);
 }
 
@@ -4993,7 +5115,7 @@
 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
 {
 
-	return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK);
+	return (mode >= VM_MEMATTR_PMA && mode <= VM_MEMATTR_LAST);
 }
 
 bool
@@ -5049,17 +5171,33 @@
 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
     vm_offset_t eva)
 {
+	const char *mode;
 
 	if (eva <= range->sva)
 		return;
 
-	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n",
+	switch (range->attrs & PTE_MA_MASK) {
+	case PTE_MA_NONE:
+		mode = "PMA";
+		break;
+	case PTE_MA_NC:
+		mode = "NC ";
+		break;
+	case PTE_MA_IO:
+		mode = "IO ";
+		break;
+	default:
+		mode = "???";
+		break;
+	}
+
+	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
 	    range->sva, eva,
 	    (range->attrs & PTE_W) == PTE_W ? 'w' : '-',
 	    (range->attrs & PTE_X) == PTE_X ? 'x' : '-',
 	    (range->attrs & PTE_U) == PTE_U ? 'u' : 's',
 	    (range->attrs & PTE_G) == PTE_G ? 'g' : '-',
-	    range->l1pages, range->l2pages, range->l3pages);
+	    mode, range->l1pages, range->l2pages, range->l3pages);
 
 	/* Reset to sentinel value. */
 	range->sva = 0xfffffffffffffffful;
@@ -5099,14 +5237,19 @@
 
 		/* The PTE global bit is inherited by lower levels. */
 		attrs = l1e & PTE_G;
-		if ((l1e & PTE_RWX) != 0)
+		if ((l1e & PTE_RWX) != 0) {
 			attrs |= l1e & (PTE_RWX | PTE_U);
-		else if (l2e != 0)
+			attrs |= l1e & PTE_MA_MASK;
+		} else if (l2e != 0)
 			attrs |= l2e & PTE_G;
-		if ((l2e & PTE_RWX) != 0)
+
+		if ((l2e & PTE_RWX) != 0) {
 			attrs |= l2e & (PTE_RWX | PTE_U);
-		else if (l3e != 0)
+			attrs |= l2e & PTE_MA_MASK;
+		} else if (l3e != 0) {
 			attrs |= l3e & (PTE_RWX | PTE_U | PTE_G);
+			attrs |= l3e & PTE_MA_MASK;
+		}
 
 		if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
 			sysctl_kmaps_dump(sb, range, va);