sys/amd64/amd64/pmap.c
(109 lines elided)
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/_unrhdr.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
(270 lines elided; the following hunk is inside pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS))
        CPU_FOREACH(i) {
                res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
        }
        return (sysctl_handle_64(oidp, &res, 0, req));
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
    "Count of saved TLB context on switch");

The following declarations from the old code (the scratch pages used by pmap_copy_pages() for non-DMAP physical addresses) are removed by this change:

/* pmap_copy_pages() over non-DMAP */
static struct mtx cpage_lock;
static vm_offset_t cpage_a;
static vm_offset_t cpage_b;

kib: Using DPCPU for something that cannot be loaded as a module is excessive. PMAP traditionally adds the required members to the static pcpu structure. Looking at the big picture, is the non-DMAP mapped access so contended that it makes the per-CPU scratch page frames reasonable?

royger: OK, I will add them to the static pcpu structure then. I've just used DPCPU because it seemed cleaner to have these variables declared in the file where they are used. When running FreeBSD as Dom0, physical addresses outside of the memory map are used to map memory from other domains (remember the privcmd driver). This is used to map memory used by the guest to perform I/O operations, and also to map all the guest memory during save/restore/migration. If there are several HVM guests performing heavy I/O this is going to be quite contended.

(A sketch of the static-pcpu alternative kib suggests follows the crashdumpmap declaration below.)

kib: You use sx because uiomove_fromphys() may fault while keeping the address allocated, right? I am not sure whether it could recurse on the lock in a page fault. Also, this lock is held for an unbounded time. E.g., an NFS server could stop responding while the pager requests the page content, and the situation results in a lock cascade for unlucky unrelated threads which happen to execute on the same CPU as the thread that initiated the pagein.

royger: Yes, an sx lock is needed since uiomove_fromphys may fault. So far I haven't seen it recurse, and looking at the paths in the VM fault handler it doesn't look like it can (although it's quite convoluted, so I wouldn't be surprised if I've missed something). Maybe it would be better to get rid of the pcpu pages/lock and dynamically allocate them using kva_alloc/kva_free?

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;
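
To illustrate the static-pcpu alternative kib suggests in the discussion above: a machine-dependent member (say vm_offset_t pc_pmap_cpage, a hypothetical name) would be appended to PCPU_MD_FIELDS in sys/amd64/include/pcpu.h, initialized once (e.g. with kva_alloc(PAGE_SIZE) in pmap_init()), and read through the regular PCPU accessors while the thread is pinned. A rough sketch only, assuming the headers already pulled in by pmap.c; none of this is part of the diff:

/*
 * Hypothetical sketch: map a physical page at this CPU's private scratch KVA.
 * Assumes a pc_pmap_cpage member added to PCPU_MD_FIELDS and set up during
 * pmap_init().
 */
static void *
pmap_cpage_enter(vm_paddr_t pa)
{
        vm_offset_t cpage;
        pt_entry_t *pte;

        sched_pin();                    /* stay on the CPU owning the page */
        cpage = PCPU_GET(pmap_cpage);   /* hypothetical per-CPU member */
        pte = vtopte(cpage);
        pte_store(pte, pa | X86_PG_RW | X86_PG_V);
        invlpg(cpage);                  /* flush per AMD64 APM vol. 2, 7.3.1 */
        return ((void *)cpage);         /* caller sched_unpin()s when done */
}

The earlier revision reportedly declared the scratch addresses with DPCPU_DEFINE() in pmap.c itself; kib's point is that the DPCPU machinery exists for code that can be loaded as a module, which pmap is not.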

static void     free_pv_chunk(struct pv_chunk *pc);
static void     free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static int      popcnt_pc_map_elem(uint64_t elem);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
(647 lines elided; the following hunk is inside pmap_init(void))
         * Allocate memory for the pv head table for superpages.
         */
        s = (vm_size_t)(pv_npg * sizeof(struct md_page));
        s = round_page(s);
        pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
            M_WAITOK | M_ZERO);
        for (i = 0; i < pv_npg; i++)
                TAILQ_INIT(&pv_table[i].pv_list);

The following cpage initialization in pmap_init() is removed by this change:

        mtx_init(&cpage_lock, "cpage", NULL, MTX_DEF);
        cpage_a = kva_alloc(PAGE_SIZE);
        cpage_b = kva_alloc(PAGE_SIZE);

}

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2MB page demotions");
(3,964 lines elided)

int unmapped_buf_allowed = 1;

pmap_copy_pages() before this change (transient mappings through the global cpage_a/cpage_b scratch pages):

void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
    vm_offset_t b_offset, int xfersize)
{
        void *a_cp, *b_cp;
        vm_page_t m_a, m_b;
        vm_paddr_t p_a, p_b;
        pt_entry_t *pte;
        vm_offset_t a_pg_offset, b_pg_offset;
        int cnt;
        boolean_t pinned;

        /*
         * NB: The sequence of updating a page table followed by accesses
         * to the corresponding pages used in the !DMAP case is subject to
         * the situation described in the "AMD64 Architecture Programmer's
         * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
         * Coherency Considerations".  Therefore, issuing the INVLPG right
         * after modifying the PTE bits is crucial.
         */
        pinned = FALSE;
        while (xfersize > 0) {
                a_pg_offset = a_offset & PAGE_MASK;
                m_a = ma[a_offset >> PAGE_SHIFT];
                p_a = m_a->phys_addr;
                b_pg_offset = b_offset & PAGE_MASK;
                m_b = mb[b_offset >> PAGE_SHIFT];
                p_b = m_b->phys_addr;
                cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
                cnt = min(cnt, PAGE_SIZE - b_pg_offset);
                if (__predict_false(p_a < DMAP_MIN_ADDRESS ||
                    p_a > DMAP_MIN_ADDRESS + dmaplimit)) {
                        mtx_lock(&cpage_lock);
                        sched_pin();
                        pinned = TRUE;
                        pte = vtopte(cpage_a);
                        *pte = p_a | X86_PG_A | X86_PG_V |
                            pmap_cache_bits(kernel_pmap, m_a->md.pat_mode, 0);
                        invlpg(cpage_a);
                        a_cp = (char *)cpage_a + a_pg_offset;
                } else {
                        a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
                }
                if (__predict_false(p_b < DMAP_MIN_ADDRESS ||
                    p_b > DMAP_MIN_ADDRESS + dmaplimit)) {
                        if (!pinned) {
                                mtx_lock(&cpage_lock);
                                sched_pin();
                                pinned = TRUE;
                        }
                        pte = vtopte(cpage_b);
                        *pte = p_b | X86_PG_A | X86_PG_M | X86_PG_RW |
                            X86_PG_V | pmap_cache_bits(kernel_pmap,
                            m_b->md.pat_mode, 0);
                        invlpg(cpage_b);
                        b_cp = (char *)cpage_b + b_pg_offset;
                } else {
                        b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
                }
                bcopy(a_cp, b_cp, cnt);
                if (__predict_false(pinned)) {
                        sched_unpin();
                        mtx_unlock(&cpage_lock);
                        pinned = FALSE;
                }
                a_offset += cnt;
                b_offset += cnt;
                xfersize -= cnt;
        }
}

pmap_copy_pages() after this change (transient mappings through pmap_get_vaddr()/pmap_remove_vaddr()):

void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
    vm_offset_t b_offset, int xfersize)
{
        void *a_cp, *b_cp;
        vm_page_t pages[2];
        vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
        int cnt;
        boolean_t mapped;

        while (xfersize > 0) {
                a_pg_offset = a_offset & PAGE_MASK;
                pages[0] = ma[a_offset >> PAGE_SHIFT];
                b_pg_offset = b_offset & PAGE_MASK;
                pages[1] = mb[b_offset >> PAGE_SHIFT];
                cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
                cnt = min(cnt, PAGE_SIZE - b_pg_offset);
                mapped = pmap_get_vaddr(pages, vaddr, 2, FALSE);
                a_cp = (char *)vaddr[0] + a_pg_offset;
                b_cp = (char *)vaddr[1] + b_pg_offset;
                bcopy(a_cp, b_cp, cnt);
                if (__predict_false(mapped))
                        pmap_remove_vaddr(pages, vaddr, 2, FALSE);
                a_offset += cnt;
                b_offset += cnt;
                xfersize -= cnt;
        }
}

/*
 * Returns true if the pmap's pv is one of the first
(1,767 lines elided)

        if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
                goto done;
        pte = pmap_pde_to_pte(pde, va);
        ptr[idx++] = *pte;
done:
        PMAP_UNLOCK(pmap);
        *num = idx;
}

The new transient-mapping helpers added by this change:

/*
 * Get the kernel virtual address of a physical page.  If the physical
 * address is not covered by the DMAP, perform a transient mapping.  If the
 * function has to perform any such mappings, TRUE will be returned and the
 * caller must call pmap_remove_vaddr when done.
 */
boolean_t
pmap_get_vaddr(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
        vm_paddr_t paddr;
        boolean_t needs_mapping;
        pt_entry_t *pte;
        int cache_bits, error, i;

        /*
         * Allocate any KVA space that we need; this is done in a separate
         * loop to avoid calling vmem_alloc while pinned.
         */
        needs_mapping = FALSE;
        for (i = 0; i < count; i++) {
                paddr = VM_PAGE_TO_PHYS(page[i]);
                if (__predict_false(paddr >= dmaplimit)) {
                        error = vmem_alloc(kernel_arena, PAGE_SIZE,
                            M_BESTFIT | M_WAITOK, &vaddr[i]);
                        KASSERT(error == 0, ("vmem_alloc failed: %d", error));
                        needs_mapping = TRUE;
                } else {
                        vaddr[i] = PHYS_TO_DMAP(paddr);
                }
        }

kib: I suggest to not call vmem_alloc() while pinned. The loop could be split into two, first phase…

        /* Exit early if everything is covered by the DMAP */
        if (!needs_mapping)
                goto out;

kib (unsubmitted): This could be replaced with return (FALSE);. It simplifies the logic and avoids an unneeded goto (no, I do not hate goto, but this one only complicates flow control, IMO).

royger (unsubmitted): I originally had a simple return here, but then I thought I could use a single exit point. Since I don't have a strong opinion I will change it to a return.

(A sketch of that early-return form follows the function below.)

kib: Do you need the PG_G (global) bit set for the mapping?

royger: No, I've switched to using vtopte and pte_store instead of pmap_kenter_attr.

        /*
         * NB: The sequence of updating a page table followed by accesses
         * to the corresponding pages used in the !DMAP case is subject to
         * the situation described in the "AMD64 Architecture Programmer's
         * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
         * Coherency Considerations".  Therefore, issuing the INVLPG right
         * after modifying the PTE bits is crucial.
         */
        if (!can_fault)
                sched_pin();
        for (i = 0; i < count; i++) {
                paddr = VM_PAGE_TO_PHYS(page[i]);
                if (paddr >= dmaplimit) {
                        if (can_fault) {
                                /*
                                 * Slow path: since we can get page faults
                                 * while mappings are active, don't pin the
                                 * thread to the CPU and instead add a global
                                 * mapping visible to all CPUs.
                                 */
                                pmap_qenter(vaddr[i], &page[i], 1);
                        } else {
                                pte = vtopte(vaddr[i]);
                                cache_bits = pmap_cache_bits(kernel_pmap,
                                    page[i]->md.pat_mode, 0);
                                pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
                                    cache_bits);
                                invlpg(vaddr[i]);
                        }
                }
        }

out:
        return (needs_mapping);
}
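
The early-return form kib asks for above would reduce the tail of pmap_get_vaddr() to something like this (hypothetical rewrite, not part of the diff):

        /* Exit early if everything is covered by the DMAP. */
        if (!needs_mapping)
                return (FALSE);
        /*
         * ...mapping loop unchanged; the "out:" label and the goto
         * disappear, and the function simply ends with:
         */
        return (TRUE);
}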

kib: I think we can safely unpin before freeing vmem allocations.

void
pmap_remove_vaddr(vm_page_t page[], vm_offset_t vaddr[], int count,
    boolean_t can_fault)
{
        vm_paddr_t paddr;
        int i;

        if (!can_fault)
                sched_unpin();
        for (i = 0; i < count; i++) {
                paddr = VM_PAGE_TO_PHYS(page[i]);
                if (paddr >= dmaplimit) {
                        if (can_fault)
                                pmap_qremove(vaddr[i], 1);
                        vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
                }
        }
}
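
pmap_copy_pages() above is the can_fault == FALSE consumer of this pair: it pins the thread and installs CPU-local PTEs. For contrast, here is a minimal sketch of a faulting consumer, e.g. an uiomove_fromphys()-style path that can sleep while the page is mapped. The function and its name are hypothetical, not part of the diff, and assume the usual kernel includes of a file like pmap.c:

/*
 * Hypothetical consumer: copy part of a page that may lie outside the DMAP
 * out to userspace.  copyout() may fault and sleep, so the mapping is
 * entered with can_fault = TRUE (pmap_qenter(), visible on all CPUs, no
 * sched_pin()).  Assumes off + len <= PAGE_SIZE.
 */
static int
copyout_phys_example(vm_page_t m, vm_offset_t off, void *udaddr, size_t len)
{
        vm_offset_t va;
        boolean_t mapped;
        int error;

        mapped = pmap_get_vaddr(&m, &va, 1, TRUE);
        error = copyout((char *)va + off, udaddr, len);
        if (mapped)
                pmap_remove_vaddr(&m, &va, 1, TRUE);
        return (error);
}

uiomove_fromphys(), discussed earlier in the review, is the kind of caller this faulting path is meant for.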
#include "opt_ddb.h" | #include "opt_ddb.h" | ||||
#ifdef DDB | #ifdef DDB | ||||
#include <ddb/ddb.h> | #include <ddb/ddb.h> | ||||
DB_SHOW_COMMAND(pte, pmap_print_pte) | DB_SHOW_COMMAND(pte, pmap_print_pte) | ||||
{ | { | ||||
(49 lines elided to the end of the file)