diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -97,14 +97,16 @@
 #define VM_PHYSSEG_MAX 63
 
 /*
- * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
- * from which physical pages are allocated and VM_FREEPOOL_DIRECT is
- * the pool from which physical pages for page tables and small UMA
- * objects are allocated.
+ * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool from
+ * which physical pages are allocated and VM_FREEPOOL_DIRECT is the pool from
+ * which physical pages for page tables and small UMA objects are allocated.
+ * VM_FREEPOOL_LAZYINIT is a special-purpose pool that is populated only during
+ * boot and is used to implement deferred initialization of page structures.
  */
-#define VM_NFREEPOOL 2
-#define VM_FREEPOOL_DEFAULT 0
-#define VM_FREEPOOL_DIRECT 1
+#define VM_NFREEPOOL 3
+#define VM_FREEPOOL_LAZYINIT 0
+#define VM_FREEPOOL_DEFAULT 1
+#define VM_FREEPOOL_DIRECT 2
 
 /*
  * Create up to three free page lists: VM_FREELIST_DMA32 is for physical pages
diff --git a/sys/arm64/include/vmparam.h b/sys/arm64/include/vmparam.h
--- a/sys/arm64/include/vmparam.h
+++ b/sys/arm64/include/vmparam.h
@@ -72,14 +72,16 @@
 #define VM_PHYSSEG_MAX 64
 
 /*
- * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
- * from which physical pages are allocated and VM_FREEPOOL_DIRECT is
- * the pool from which physical pages for small UMA objects are
- * allocated.
+ * Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool from
+ * which physical pages are allocated and VM_FREEPOOL_DIRECT is the pool from
+ * which physical pages for page tables and small UMA objects are allocated.
+ * VM_FREEPOOL_LAZYINIT is a special-purpose pool that is populated only during
+ * boot and is used to implement deferred initialization of page structures.
  */
-#define VM_NFREEPOOL 2
-#define VM_FREEPOOL_DEFAULT 0
-#define VM_FREEPOOL_DIRECT 1
+#define VM_NFREEPOOL 3
+#define VM_FREEPOOL_LAZYINIT 0
+#define VM_FREEPOOL_DEFAULT 1
+#define VM_FREEPOOL_DIRECT 2
 
 /*
  * Create one free page lists: VM_FREELIST_DEFAULT is for all physical
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -333,9 +333,9 @@
 	if (m == NULL)
 		return (true); /* page does not exist, no failure */
 
-	vmd = vm_pagequeue_domain(m);
+	vmd = VM_DOMAIN(vm_phys_domain(pa));
 	vm_domain_free_lock(vmd);
-	found = vm_phys_unfree_page(m);
+	found = vm_phys_unfree_page(pa);
 	vm_domain_free_unlock(vmd);
 	if (found) {
 		vm_domain_freecnt_inc(vmd, -1);
@@ -568,6 +568,9 @@
 #if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
 	long ii;
 #endif
+#ifdef VM_FREEPOOL_LAZYINIT
+	int lazyinit;
+#endif
 
 	vaddr = round_page(vaddr);
 
@@ -750,6 +753,11 @@
 	 */
 	vm_phys_init();
 
+#ifdef VM_FREEPOOL_LAZYINIT
+	lazyinit = 1;
+	TUNABLE_INT_FETCH("debug.vm.lazy_page_init", &lazyinit);
+#endif
+
 	/*
 	 * Initialize the page structures and add every available page to the
 	 * physical memory allocator's free lists.
@@ -765,9 +773,50 @@
 	vm_cnt.v_page_count = 0;
 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
 		seg = &vm_phys_segs[segind];
-		for (m = seg->first_page, pa = seg->start; pa < seg->end;
-		    m++, pa += PAGE_SIZE)
-			vm_page_init_page(m, pa, segind, VM_FREEPOOL_DEFAULT);
+
+		/*
+		 * If lazy vm_page initialization is not enabled, simply
+		 * initialize all of the pages in the segment. Otherwise, we
+		 * only initialize:
+		 * 1. Pages not covered by phys_avail[], since they might be
+		 *    freed to the allocator at some future point, e.g., by
+		 *    kmem_bootstrap_free().
+		 * 2. The first page of each run of free pages handed to the
+		 *    vm_phys allocator, which in turn defers initialization
+		 *    of pages until they are needed.
+		 * This avoids blocking the boot process for long periods, which
+		 * may be relevant for VMs (which ought to boot as quickly as
+		 * possible) and/or systems with large amounts of physical
+		 * memory.
+		 */
+#ifdef VM_FREEPOOL_LAZYINIT
+		if (lazyinit) {
+			startp = seg->start;
+			for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+				if (startp >= seg->end)
+					break;
+
+				if (phys_avail[i + 1] < startp)
+					continue;
+				if (phys_avail[i] <= startp) {
+					startp = phys_avail[i + 1];
+					continue;
+				}
+
+				m = &seg->first_page[atop(startp - seg->start)];
+				for (endp = MIN(phys_avail[i], seg->end);
+				    startp < endp; startp += PAGE_SIZE, m++) {
+					vm_page_init_page(m, startp, segind,
+					    VM_FREEPOOL_DEFAULT);
+				}
+			}
+		} else
+#endif
+			for (m = seg->first_page, pa = seg->start;
+			    pa < seg->end; m++, pa += PAGE_SIZE) {
+				vm_page_init_page(m, pa, segind,
+				    VM_FREEPOOL_DEFAULT);
+			}
 
 		/*
 		 * Add the segment's pages that are covered by one of
@@ -785,6 +834,12 @@
 				continue;
 
 			m = seg->first_page + atop(startp - seg->start);
+#ifdef VM_FREEPOOL_LAZYINIT
+			if (lazyinit) {
+				vm_page_init_page(m, startp, segind,
+				    VM_FREEPOOL_LAZYINIT);
+			}
+#endif
 			vmd = VM_DOMAIN(seg->domain);
 			vm_domain_free_lock(vmd);
 			vm_phys_enqueue_contig(m, pagecount);
diff --git a/sys/vm/vm_phys.h b/sys/vm/vm_phys.h
--- a/sys/vm/vm_phys.h
+++ b/sys/vm/vm_phys.h
@@ -42,6 +42,8 @@
 
 #ifdef _KERNEL
 
+#include
+
 extern vm_paddr_t phys_avail[];
 
 /* Domains must be dense (non-sparse) and zero-based. */
@@ -79,7 +81,7 @@
     int *locality);
 vm_page_t vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low,
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary, int options);
-bool vm_phys_unfree_page(vm_page_t m);
+bool vm_phys_unfree_page(vm_paddr_t pa);
 int vm_phys_mem_affinity(int f, int t);
 void vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end);
 vm_paddr_t vm_phys_early_alloc(int domain, size_t alloc_size);
@@ -106,5 +108,19 @@
 #endif
 }
 
+static inline struct vm_phys_seg *
+vm_phys_seg(vm_paddr_t pa)
+{
+	struct vm_phys_seg *seg;
+	int segind;
+
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		seg = &vm_phys_segs[segind];
+		if (pa >= seg->start && pa < seg->end)
+			return (seg);
+	}
+	return (NULL);
+}
+
 #endif /* _KERNEL */
 #endif /* !_VM_PHYS_H_ */
diff --git a/sys/vm/vm_phys.c b/sys/vm/vm_phys.c
--- a/sys/vm/vm_phys.c
+++ b/sys/vm/vm_phys.c
@@ -49,14 +49,18 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
+#include
+#include
 #include
 #include
@@ -135,6 +139,7 @@
  * Provides the mapping from VM_FREELIST_* to free list indices (flind).
  */
 static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
+static int __read_mostly vm_default_freepool;
 
 CTASSERT(VM_FREELIST_DEFAULT == 0);
 
@@ -178,6 +183,16 @@
 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
     int order, int tail);
 
+static bool
+vm_phys_pool_valid(int pool)
+{
+#ifdef VM_FREEPOOL_LAZYINIT
+	if (pool == VM_FREEPOOL_LAZYINIT)
+		return (false);
+#endif
+	return (pool >= 0 && pool < VM_NFREEPOOL);
+}
+
 /*
  * Red-black tree helpers for vm fictitious range management.
  */
@@ -614,6 +629,12 @@
 		}
 	}
 
+#ifdef VM_FREEPOOL_LAZYINIT
+	vm_default_freepool = VM_FREEPOOL_LAZYINIT;
+#else
+	vm_default_freepool = VM_FREEPOOL_DEFAULT;
+#endif
+
 	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
 }
 
@@ -716,15 +737,33 @@
 }
 
 /*
- * Set the pool for a contiguous, power of two-sized set of physical pages.
+ * Set the pool for a contiguous, power of two-sized set of physical pages.
+ *
+ * If the pages currently belong to the lazy init pool, then the corresponding
+ * page structures must be initialized. In this case it is assumed that the
+ * first page in the run has already been initialized.
  */
 static void
 vm_phys_set_pool(int pool, vm_page_t m, int order)
 {
-	vm_page_t m_tmp;
-
-	for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
-		m_tmp->pool = pool;
+#ifdef VM_FREEPOOL_LAZYINIT
+	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
+		vm_paddr_t pa;
+		int segind;
+
+		m->pool = pool;
+
+		TSENTER();
+		pa = m->phys_addr + PAGE_SIZE;
+		segind = m->segind;
+		for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
+		    m_tmp++, pa += PAGE_SIZE)
+			vm_page_init_page(m_tmp, pa, segind, pool);
+		TSEXIT();
+	} else
+#endif
+		for (vm_page_t m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
+			m_tmp->pool = pool;
 }
 
 /*
@@ -748,7 +787,7 @@
 	KASSERT(domain >= 0 && domain < vm_ndomains,
 	    ("vm_phys_alloc_npages: domain %d is out of range", domain));
-	KASSERT(pool < VM_NFREEPOOL,
+	KASSERT(vm_phys_pool_valid(pool),
 	    ("vm_phys_alloc_npages: pool %d is out of range", pool));
 	KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
 	    ("vm_phys_alloc_npages: npages %d is out of range", npages));
@@ -779,7 +818,8 @@
 			}
 		}
 		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
-			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+			for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
+			    pind++) {
 				alt = vm_phys_free_queues[domain][flind][pind];
 				while ((m = TAILQ_FIRST(&alt[oind].pl)) != NULL) {
@@ -847,7 +887,7 @@
 	KASSERT(freelist < VM_NFREELIST,
 	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
 	    freelist));
-	KASSERT(pool < VM_NFREEPOOL,
+	KASSERT(vm_phys_pool_valid(pool),
 	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
@@ -876,7 +916,7 @@
 	 * use them to satisfy the allocation.
 	 */
 	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
-		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
 			alt = &vm_phys_free_queues[domain][flind][pind][0];
 			m = TAILQ_FIRST(&alt[oind].pl);
 			if (m != NULL) {
@@ -1107,7 +1147,7 @@
 	KASSERT(m->order == VM_NFREEORDER,
 	    ("vm_phys_free_pages: page %p has unexpected order %d",
 	    m, m->order));
-	KASSERT(m->pool < VM_NFREEPOOL,
+	KASSERT(vm_phys_pool_valid(m->pool),
 	    ("vm_phys_free_pages: page %p has unexpected pool %d",
 	    m, m->pool));
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_free_pages: order %d is out of range", order));
@@ -1136,6 +1176,107 @@
 	vm_freelist_add(fl, m, order, 1);
 }
 
+#ifdef VM_FREEPOOL_LAZYINIT
+/*
+ * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
+ * them to the default pool. This is a prerequisite for some rare operations
+ * which need to scan the page array and thus depend on all pages being
+ * initialized.
+ */
+static void
+vm_phys_lazy_init_domain(int domain, bool locked)
+{
+	static bool initdone[MAXMEMDOM];
+	struct vm_domain *vmd;
+	struct vm_freelist *fl;
+	vm_page_t m;
+	int pind;
+	bool unlocked;
+
+	if (__predict_true(atomic_load_bool(&initdone[domain])))
+		return;
+
+	vmd = VM_DOMAIN(domain);
+	if (locked)
+		vm_domain_free_assert_locked(vmd);
+	else
+		vm_domain_free_lock(vmd);
+	if (atomic_load_bool(&initdone[domain]))
+		goto out;
+	pind = VM_FREEPOOL_LAZYINIT;
+	for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
+		int flind;
+
+		flind = vm_freelist_to_flind[freelist];
+		if (flind < 0)
+			continue;
+		fl = vm_phys_free_queues[domain][flind][pind];
+		for (int oind = 0; oind < VM_NFREEORDER; oind++) {
+			if (atomic_load_int(&fl[oind].lcnt) == 0)
+				continue;
+			while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
+				/*
+				 * Avoid holding the lock across the
+				 * initialization unless there's a free page
+				 * shortage.
+				 */
+				vm_freelist_rem(fl, m, oind);
+				unlocked = vm_domain_allocate(vmd,
+				    VM_ALLOC_NORMAL, 1 << oind);
+				if (unlocked)
+					vm_domain_free_unlock(vmd);
+				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
+				if (unlocked) {
+					vm_domain_freecnt_inc(vmd, 1 << oind);
+					vm_domain_free_lock(vmd);
+				}
+				vm_phys_free_pages(m, oind);
+			}
+		}
+	}
+	atomic_store_bool(&initdone[domain], true);
+out:
+	if (!locked)
+		vm_domain_free_unlock(vmd);
+}
+
+static void
+vm_phys_lazy_init(void)
+{
+	for (int domain = 0; domain < vm_ndomains; domain++)
+		vm_phys_lazy_init_domain(domain, false);
+	atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
+}
+
+static void
+vm_phys_lazy_init_kthr(void *arg __unused)
+{
+	vm_phys_lazy_init();
+	kthread_exit();
+}
+
+static void
+vm_phys_lazy_sysinit(void *arg __unused)
+{
+	struct thread *td;
+	int error;
+
+	error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
+	    RFSTOPPED, 0, "vmlazyinit");
+	if (error == 0) {
+		thread_lock(td);
+		sched_prio(td, PRI_MIN_IDLE);
+		sched_add(td, SRQ_BORING);
+	} else {
+		printf("%s: could not create lazy init thread: %d\n",
+		    __func__, error);
+		vm_phys_lazy_init();
+	}
+}
+SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
+    NULL);
+#endif /* VM_FREEPOOL_LAZYINIT */
+
 /*
  * Return the largest possible order of a set of pages starting at m.
  */
@@ -1157,6 +1298,7 @@
 static vm_page_t
 vm_phys_enqueue_contig_chunk(struct vm_freelist *fl, vm_page_t m, int order)
 {
+	vm_page_t m_ret;
 	int npages;
 
 	KASSERT(order >= 0 && order < VM_NFREEORDER,
@@ -1164,7 +1306,17 @@
 	npages = 1 << order;
 	vm_freelist_add(fl, m, order, 1);
-	return (m + npages);
+	m_ret = m + npages;
+#ifdef VM_FREEPOOL_LAZYINIT
+	if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
+		vm_paddr_t pa;
+
+		pa = m->phys_addr;
+		vm_page_init_page(m_ret, pa + ptoa(npages), m->segind,
+		    VM_FREEPOOL_LAZYINIT);
+	}
+#endif
+	return (m_ret);
 }
 
 /*
@@ -1284,6 +1436,14 @@
 		pa_end = high <= seg->end ? high : seg->end;
 		if (pa_end - pa_start < ptoa(npages))
 			continue;
+
+#ifdef VM_FREEPOOL_LAZYINIT
+		/*
+		 * The pages on the free lists must be initialized.
+		 */
+		vm_phys_lazy_init_domain(domain, false);
+#endif
+
 		m_start = &seg->first_page[atop(pa_start - seg->start)];
 		m_end = &seg->first_page[atop(pa_end - seg->start)];
 		m_run = vm_page_scan_contig(npages, m_start, m_end,
@@ -1302,21 +1462,30 @@
  * The free page queues must be locked.
  */
 bool
-vm_phys_unfree_page(vm_page_t m)
+vm_phys_unfree_page(vm_paddr_t pa)
 {
 	struct vm_freelist *fl;
 	struct vm_phys_seg *seg;
-	vm_paddr_t pa, pa_half;
-	vm_page_t m_set, m_tmp;
+	vm_paddr_t pa_half;
+	vm_page_t m, m_set, m_tmp;
 	int order;
 
+	seg = vm_phys_seg(pa);
+	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
+
+	/*
+	 * The pages on the free lists must be initialized.
+	 */
+#ifdef VM_FREEPOOL_LAZYINIT
+	vm_phys_lazy_init_domain(seg->domain, true);
+#endif
+
 	/*
 	 * First, find the contiguous, power of two-sized set of free
 	 * physical pages containing the given physical page "m" and
 	 * assign it to "m_set".
 	 */
-	seg = &vm_phys_segs[m->segind];
-	vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
+	m = vm_phys_paddr_to_vm_page(pa);
 	for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
 	    order < VM_NFREEORDER - 1; ) {
 		order++;
@@ -1488,7 +1657,7 @@
 	/* Search for a large enough free block. */
 	size = npages << PAGE_SHIFT;
 	for (oind = order; oind < VM_NFREEORDER; oind++) {
-		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+		for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
 			fl = (*queues)[pind];
 			TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
 				/*
@@ -1509,7 +1678,7 @@
 		return (NULL);
 
 	/* Search for a long-enough sequence of small blocks. */
 	oind = VM_NFREEORDER - 1;
-	for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+	for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
 		fl = (*queues)[pind];
 		m_ret = vm_phys_find_freelist_contig(fl, oind, npages,
 		    low, high, alignment, boundary);