Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -5189,6 +5189,7 @@ vm/uma_core.c standard vm/uma_dbg.c standard vm/memguard.c optional DEBUG_MEMGUARD +vm/vm_compact.c standard vm/vm_domainset.c standard vm/vm_fault.c standard vm/vm_glue.c standard Index: sys/vm/vm_compact.h =================================================================== --- /dev/null +++ sys/vm/vm_compact.h @@ -0,0 +1,56 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023. Bojan Novković + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include + +MALLOC_DECLARE(M_VMCOMPACT); + +struct vm_compact_region { + vm_paddr_t start; + vm_paddr_t end; + SLIST_ENTRY(vm_compact_region) entries; +}; +typedef struct vm_compact_region *vm_compact_region_t; + +SLIST_HEAD(vm_compact_region_head, vm_compact_region); + +typedef int ( + *vm_compact_search_fn)(struct vm_compact_region_head *, int, void *); +typedef size_t ( + *vm_compact_defrag_fn)(struct vm_compact_region_head *, int, void *); +typedef bool (*vm_compact_end_fn)(void); +typedef void (*vm_compact_ctx_init_fn)(void **); + +void *vm_compact_create_job(vm_compact_search_fn sfn, vm_compact_defrag_fn dfn, + vm_compact_ctx_init_fn ctxfn, vm_paddr_t start, vm_paddr_t end, int order, + int domain, int *error); +void vm_compact_free_job(void *ctx); +int vm_compact_run(void *ctx); Index: sys/vm/vm_compact.c =================================================================== --- /dev/null +++ sys/vm/vm_compact.c @@ -0,0 +1,151 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023. Bojan Novković + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define VM_COMPACT_LOCK() mtx_lock(&compact_lock) +#define VM_COMPACT_UNLOCK() mtx_unlock(&compact_lock) + +MALLOC_DEFINE(M_VMCOMPACT, "vm_compact_ctx", "memory compaction context"); + +static struct mtx compact_lock; +static LIST_HEAD(, vm_compact_ctx) active_compactions[MAXMEMDOM]; + +struct vm_compact_ctx { + vm_compact_search_fn search_fn; + vm_compact_defrag_fn defrag_fn; + + vm_paddr_t start; + vm_paddr_t end; + + int order; + int domain; + struct vm_compact_region_head regions; + + void *p_data; + + LIST_ENTRY(vm_compact_ctx) entries; +}; + +static bool +vm_compact_job_overlaps(struct vm_compact_ctx *ctxp1, + struct vm_compact_ctx *ctxp2) +{ + return (ctxp1->start <= ctxp2->start && ctxp2->start <= ctxp1->end); +} + +void * +vm_compact_create_job(vm_compact_search_fn sfn, vm_compact_defrag_fn dfn, + vm_compact_ctx_init_fn ctxfn, vm_paddr_t start, vm_paddr_t end, int order, + int domain, int *error) +{ + struct vm_compact_ctx *ctxp; + /* Arguments sanity check. */ + if (end <= start || order > (VM_NFREEORDER_MAX - 1)) { + *error = (EINVAL); + return (NULL); + } + + ctxp = malloc(sizeof(struct vm_compact_ctx), M_VMCOMPACT, + M_WAITOK | M_ZERO); + + ctxp->search_fn = sfn; + ctxp->defrag_fn = dfn; + ctxp->start = start; + ctxp->order = order; + ctxp->domain = domain; + SLIST_INIT(&ctxp->regions); + + ctxfn(&ctxp->p_data); + + return ((void *)ctxp); +} + +void +vm_compact_free_job(void *ctx) +{ + free(ctx, M_VMCOMPACT); +} + +int +vm_compact_run(void *ctx) +{ + struct vm_compact_ctx *ctxp = (struct vm_compact_ctx *)ctx; + struct vm_compact_ctx *ctxp_tmp; + int retval; + + VM_COMPACT_LOCK(); + /* Check if the requested compaction overlaps with an existing one. */ + LIST_FOREACH (ctxp_tmp, &active_compactions[ctxp->domain], entries) { + if (vm_compact_job_overlaps(ctxp, ctxp_tmp)) { + VM_COMPACT_UNLOCK(); + return (-EINPROGRESS); + } + } + + LIST_INSERT_HEAD(&active_compactions[ctxp->domain], ctxp, entries); + VM_COMPACT_UNLOCK(); + + /* Run compaction job. 
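+ * The search callback gathers candidate regions; a non-zero return value means there is nothing to compact and 0 is returned. Otherwise the defrag callback processes the collected regions and its result, the number of pages it relocated, is handed back to the caller.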
*/ + if (ctxp->search_fn(&ctxp->regions, ctxp->domain, ctxp->p_data)) { + retval = 0; + goto cleanup; + } + + retval = ctxp->defrag_fn(&ctxp->regions, ctxp->domain, ctxp->p_data); + +cleanup: + VM_COMPACT_LOCK(); + LIST_REMOVE(ctxp, entries); + VM_COMPACT_UNLOCK(); + + return retval; +} + +static void +vm_compact_init(void *arg) +{ + mtx_init(&compact_lock, "vm_compact", NULL, MTX_DEF); + for (int i = 0; i < MAXMEMDOM; i++) + LIST_INIT(&active_compactions[i]); +} + +SYSINIT(vm_compact, SI_SUB_KMEM + 2, SI_ORDER_ANY, vm_compact_init, NULL); Index: sys/vm/vm_page.h =================================================================== --- sys/vm/vm_page.h +++ sys/vm/vm_page.h @@ -711,6 +711,7 @@ vm_page_bits_t vm_page_bits(int base, int size); void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid); void vm_page_free_pages_toq(struct spglist *free, bool update_wire_count); +int vm_page_relocate_page(vm_page_t src, vm_page_t dst, int domain); void vm_page_dirty_KBI(vm_page_t m); void vm_page_lock_KBI(vm_page_t m, const char *file, int line); Index: sys/vm/vm_page.c =================================================================== --- sys/vm/vm_page.c +++ sys/vm/vm_page.c @@ -738,6 +738,8 @@ #endif phys_avail[biggestone + 1] = new_end; + vm_phys_search_index_startup(&vaddr); + /* * Add physical memory segments corresponding to the available * physical pages. @@ -5595,6 +5597,86 @@ } #endif +/* + * Tries to move 'src' into 'dst'. The 'src' page must be busied and its object + * locked. Returns 0 on success, 1 if the error was caused by the src page, 2 if + * caused by the dst page. + */ +int +vm_page_relocate_page(vm_page_t src, vm_page_t dst, int domain) +{ + int error = 0; + struct vm_domain *vmd = VM_DOMAIN(domain); + vm_object_t obj = src->object; + + vm_page_assert_xbusied(src); + + VM_OBJECT_ASSERT_WLOCKED(obj); + KASSERT(vm_page_domain(src) == domain, + ("Source page is from a different domain")); + KASSERT(vm_page_domain(dst) == domain, + ("Destination page is from a different domain")); + + vm_domain_free_lock(vmd); + /* Check if the dst page is still eligible and remove it from the + * freelist. */ + if (dst->order != 0 || !vm_page_none_valid(dst)) { + error = 2; + vm_page_xunbusy(src); + vm_domain_free_unlock(vmd); + goto unlock; + } + + vm_page_dequeue(dst); + vm_phys_unfree_page(dst); + vm_domain_free_unlock(vmd); + vm_domain_freecnt_inc(vmd, -1); + + /* Unmap src page */ + if (obj->ref_count != 0 && !vm_page_try_remove_all(src)) { + error = 1; + + vm_page_xunbusy(src); + /* Place dst page back on the freelists. */ + vm_domain_free_lock(vmd); + vm_phys_free_pages(dst, 0); + vm_domain_free_unlock(vmd); + vm_domain_freecnt_inc(vmd, 1); + goto unlock; + } + /* Note - if this is missing the calling process gets stuck at the + * 'vmpfw' channel */ + if (dst->busy_lock == VPB_FREED) { + dst->busy_lock = VPB_UNBUSIED; + } + + /* Copy page attributes */ + dst->a.flags = src->a.flags & ~PGA_QUEUE_STATE_MASK; + dst->oflags = 0; + pmap_copy_page(src, dst); + + dst->valid = src->valid; + dst->dirty = src->dirty; + src->flags &= ~PG_ZERO; + vm_page_dequeue(src); + + if (vm_page_replace_hold(dst, obj, src->pindex, src) && + vm_page_free_prep(src)) { + /* Return src page to freelist. 
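+ * The replacement succeeded: dst has taken over src's place in the object and the page contents were copied earlier, so hand src back to the physical allocator and bump the domain free count to account for it.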
*/ + vm_domain_free_lock(vmd); + vm_phys_free_pages(src, 0); + vm_domain_free_unlock(vmd); + + vm_domain_freecnt_inc(vmd, 1); + } + + vm_page_deactivate(dst); +unlock: + VM_OBJECT_WUNLOCK(obj); + + return error; +} + #include "opt_ddb.h" #ifdef DDB #include Index: sys/vm/vm_phys.h =================================================================== --- sys/vm/vm_phys.h +++ sys/vm/vm_phys.h @@ -87,7 +87,8 @@ int vm_phys_avail_largest(void); vm_paddr_t vm_phys_avail_size(int i); bool vm_phys_is_dumpable(vm_paddr_t pa); - +int vm_phys_fragmentation_index(int order, int domain); +void vm_phys_search_index_startup(vm_offset_t *vaddr); static inline int vm_phys_domain(vm_paddr_t pa) { Index: sys/vm/vm_phys.c =================================================================== --- sys/vm/vm_phys.c +++ sys/vm/vm_phys.c @@ -47,8 +47,10 @@ #include #include #include -#include +#include #include +#include +#include #include #include #include @@ -62,6 +64,8 @@ #include #include + +#include #include #include #include @@ -131,6 +135,53 @@ vm_paddr_t phys_avail[PHYS_AVAIL_COUNT]; vm_paddr_t dump_avail[PHYS_AVAIL_COUNT]; +/* + * Structures used for memory compaction. + */ + +/* Tracks invalid physical memory ranges. */ +struct vm_phys_hole { + vm_paddr_t start; + vm_paddr_t end; + int domain; +}; + +/* Used to track valid memory ranges inside search index chunks containing + * memory holes. */ +struct vm_phys_subseg { + struct vm_compact_region region; + SLIST_ENTRY(vm_phys_subseg) link; +}; +SLIST_HEAD(vm_phys_subseg_head, vm_phys_subseg); + +/* Tracks various metrics and valid memory segments for a fixed-size physical + * memory region. */ +struct vm_phys_search_chunk { + int holecnt; + int score; + int skipidx; + struct vm_phys_subseg_head *shp; +}; + +struct vm_phys_search_index { + struct vm_phys_search_chunk *chunks; + int nchunks; + vm_paddr_t dom_start; + vm_paddr_t dom_end; +}; + +static void vm_phys_update_search_index(vm_page_t m, int order, bool alloc); + +static struct vm_phys_search_index vm_phys_search_index[MAXMEMDOM]; + +static struct vm_phys_hole vm_phys_holes[VM_PHYSSEG_MAX * 2]; +static int vm_phys_nholes; + +struct vm_phys_info { + uint64_t free_pages; + uint64_t free_blocks; +}; + /* * Provides the mapping from VM_FREELIST_* to free list indices (flind). 
*/ @@ -156,6 +207,11 @@ sysctl_vm_phys_free, "A", "Phys Free Info"); +static int sysctl_vm_phys_frag_idx(SYSCTL_HANDLER_ARGS); +SYSCTL_OID(_vm, OID_AUTO, phys_frag_idx, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, + sysctl_vm_phys_frag_idx, "A", "Phys Frag Info"); + static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, @@ -291,6 +347,80 @@ return (error); } +static void +vm_phys_get_info(struct vm_phys_info *info, int domain) +{ + struct vm_freelist *fl; + int pind, oind, flind; + + /* Calculate total number of free pages and blocks */ + info->free_pages = info->free_blocks = 0; + for (flind = 0; flind < vm_nfreelists; flind++) { + for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { + for (pind = 0; pind < VM_NFREEPOOL; pind++) { + fl = vm_phys_free_queues[domain][flind][pind]; + info->free_pages += fl[oind].lcnt << oind; + info->free_blocks += fl[oind].lcnt; + } + } + } +} + +int +vm_phys_fragmentation_index(int order, int domain) +{ + struct vm_phys_info info; + + vm_domain_free_assert_locked(VM_DOMAIN(domain)); + vm_phys_get_info(&info, domain); + + if (info.free_blocks == 0) { + return (0); + } + + return (1000 - + ((info.free_pages * 1000) / (1 << order) / info.free_blocks)); +} + +/* + * Outputs the value of the Free Memory Fragmentation Index (FMFI) for each + * domain. + */ +static int +sysctl_vm_phys_frag_idx(SYSCTL_HANDLER_ARGS) +{ + struct sbuf sbuf; + int64_t idx; + int oind, dom, error; + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req); + + for (dom = 0; dom < vm_ndomains; dom++) { + vm_domain_free_lock(VM_DOMAIN(dom)); + + sbuf_printf(&sbuf, "\n--\n"); + sbuf_printf(&sbuf, "\nDOMAIN %d\n", dom); + sbuf_printf(&sbuf, "\n ORDER (SIZE) | FMFI\n"); + sbuf_printf(&sbuf, "--\n"); + + for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) { + idx = vm_phys_fragmentation_index(oind, dom); + sbuf_printf(&sbuf, " %2d (%6dK) ", oind, + 1 << (PAGE_SHIFT - 10 + oind)); + sbuf_printf(&sbuf, "| %ld \n", idx); + } + + vm_domain_free_unlock(VM_DOMAIN(dom)); + } + + error = sbuf_finish(&sbuf); + sbuf_delete(&sbuf); + return (error); +} + /* * Outputs the set of physical memory segments. */ @@ -378,6 +508,7 @@ else TAILQ_INSERT_HEAD(&fl[order].pl, m, listq); fl[order].lcnt++; + vm_phys_update_search_index(m, order, false); } static void @@ -387,6 +518,7 @@ TAILQ_REMOVE(&fl[order].pl, m, listq); fl[order].lcnt--; m->order = VM_NFREEORDER; + vm_phys_update_search_index(m, order, true); } /* @@ -614,6 +746,47 @@ } } + /* + * Initialize hole array. + */ + struct vm_phys_hole *hp; + + vm_phys_nholes = 0; + if (vm_phys_segs[0].start != 0) { + hp = &vm_phys_holes[0]; + hp->start = 0; + hp->end = vm_phys_segs[0].start; + hp->domain = vm_phys_segs[0].domain; + vm_phys_nholes++; + } + + struct vm_phys_search_index *sip; + /* Initialize memory hole array. */ + for (int i = 0; i + 1 < vm_phys_nsegs; i++, vm_phys_nholes++) { + hp = &vm_phys_holes[vm_phys_nholes]; + hp->start = vm_phys_segs[i].end; + hp->end = vm_phys_segs[i + 1].start; + hp->domain = vm_phys_segs[i].domain; + sip = &vm_phys_search_index[hp->domain]; + + /* Does this hole span two domains? 
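+ * If it does, split it in two: clamp this hole at the end of the current domain and record an additional hole running from the start of the next domain to the start of that domain's first segment.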
*/ + if (vm_phys_segs[i].domain != vm_phys_segs[i + 1].domain && + hp->end > sip->dom_end) { + /* Clamp end of current hole to domain end */ + sip = &vm_phys_search_index[hp->domain]; + hp->end = sip->dom_end; + /* Add new hole at beginning of subsequent domain */ + vm_phys_nholes++; + hp = &vm_phys_holes[vm_phys_nholes]; + hp->domain = vm_phys_segs[i + 1].domain; + sip = &vm_phys_search_index[hp->domain]; + /* Hole starts at domain start and ends at the start of + * the first segment. */ + hp->start = sip->dom_start; + hp->end = vm_phys_segs[i + 1].start; + } + } + rw_init(&vm_phys_fictitious_reg_lock, "vmfctr"); } @@ -1892,3 +2065,654 @@ } } #endif + +#define VM_PHYS_SEARCH_CHUNK_ORDER (14) +#define VM_PHYS_SEARCH_CHUNK_NPAGES (1 << (VM_PHYS_SEARCH_CHUNK_ORDER)) +#define VM_PHYS_SEARCH_CHUNK_SIZE \ + (1 << (PAGE_SHIFT + VM_PHYS_SEARCH_CHUNK_ORDER)) +#define VM_PHYS_SEARCH_CHUNK_MASK (VM_PHYS_SEARCH_CHUNK_SIZE - 1) +#define VM_PHYS_HOLECNT_HI ((1 << (VM_PHYS_SEARCH_CHUNK_ORDER)) - 100) +#define VM_PHYS_HOLECNT_LO (16) + +static __inline vm_paddr_t +vm_phys_search_idx_to_paddr(int idx, int domain) +{ + vm_paddr_t paddr; + struct vm_phys_search_index *sip = &vm_phys_search_index[domain]; + + paddr = (vm_paddr_t)idx << ((VM_PHYS_SEARCH_CHUNK_ORDER) + PAGE_SHIFT); + /* Adjust address relative to domain start */ + paddr += sip->dom_start & ~VM_PHYS_SEARCH_CHUNK_MASK; + + return (paddr); +} + +static __inline int +vm_phys_paddr_to_chunk_idx(vm_paddr_t paddr, int domain) +{ + struct vm_phys_search_index *sip = &vm_phys_search_index[domain]; + + /* Adjust address relative to domain start */ + paddr -= sip->dom_start & ~VM_PHYS_SEARCH_CHUNK_MASK; + /* Strip lower bits */ + paddr &= ~VM_PHYS_SEARCH_CHUNK_MASK; + return (int)(paddr >> ((VM_PHYS_SEARCH_CHUNK_ORDER) + PAGE_SHIFT)); +} + +static __inline struct vm_phys_search_chunk * +vm_phys_search_get_chunk(struct vm_phys_search_index *sip, int idx) +{ + KASSERT(idx >= 0 && idx < sip->nchunks, + ("%s: search index out-of-bounds access, idx: %d, dom_start: %p, dom_end: %p, nchunks: %d", + __func__, idx, (void *)sip->dom_start, (void *)sip->dom_end, + sip->nchunks)); + + return (&sip->chunks[idx]); +} + +static struct vm_phys_search_chunk * +vm_phys_paddr_to_search_chunk(vm_paddr_t paddr, int domain) +{ + struct vm_phys_search_index *sip = &vm_phys_search_index[domain]; + int idx = vm_phys_paddr_to_chunk_idx(paddr, domain); + + return vm_phys_search_get_chunk(sip, idx); +} + +/* + * Allocates physical memory required for the memory compaction search index. + */ +void +vm_phys_search_index_startup(vm_offset_t *vaddr) +{ + struct vm_phys_search_index *cur_idx; + vm_paddr_t pa; + vm_paddr_t dom_start, dom_end; + size_t alloc_size; + int dom_nsearch_chunks; + int i; + + for (int dom = 0; dom < vm_ndomains; dom++) { + cur_idx = &vm_phys_search_index[dom]; + dom_nsearch_chunks = 0; + /* Calculate number of of search index chunks for current domain + */ + if (mem_affinity != NULL) { + for (i = 0; mem_affinity[i].end != 0; i++) { + if (mem_affinity[i].domain == dom) { + dom_start = mem_affinity[i].start; + while (mem_affinity[i].domain == dom) { + i++; + } + dom_end = mem_affinity[i - 1].end; + } + } + } else { + dom_start = phys_avail[0]; + i = 1; + while (phys_avail[i + 1] != 0) { + i++; + } + dom_end = phys_avail[i]; + } + /* Allocate search index for current domain */ + dom_nsearch_chunks = atop(dom_end - dom_start) / + VM_PHYS_SEARCH_CHUNK_NPAGES; + /* Add additional chunks if beginning and start aren't search + * chunk-aligned. 
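+ * That is, when dom_start or dom_end does not fall on a search chunk boundary, the partial chunk at that edge needs an index entry of its own.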
*/ + if (dom_start & VM_PHYS_SEARCH_CHUNK_MASK) + dom_nsearch_chunks++; + if (dom_end & VM_PHYS_SEARCH_CHUNK_MASK) + dom_nsearch_chunks++; + + alloc_size = round_page( + dom_nsearch_chunks * sizeof(struct vm_phys_search_chunk)); + pa = vm_phys_early_alloc(dom, alloc_size); + + /* Map and zero the array */ + cur_idx->chunks = (void *)(uintptr_t)pmap_map(vaddr, pa, + pa + alloc_size, VM_PROT_READ | VM_PROT_WRITE); + cur_idx->nchunks = dom_nsearch_chunks; + cur_idx->dom_start = dom_start; + cur_idx->dom_end = dom_end; + + if (cur_idx->chunks == NULL) { + panic("Unable to allocate search index for domain %d\n", + dom); + } + + bzero(cur_idx->chunks, alloc_size); + } +} + +static void +vm_phys_update_search_index(vm_page_t m, int order, bool alloc) +{ + int domain = vm_page_domain(m); + struct vm_phys_search_chunk *scp = + vm_phys_paddr_to_search_chunk(m->phys_addr, domain); + int pgcnt = 1 << order; + + /* Update chunk hole count */ + scp->holecnt += alloc ? -pgcnt : pgcnt; + KASSERT(scp->holecnt >= 0 && + scp->holecnt <= VM_PHYS_SEARCH_CHUNK_NPAGES, + ("%s: inconsistent hole count: %d", __func__, scp->holecnt)); + + /* Update chunk fragmentation score */ + if (order == 0) { + scp->score += alloc ? -1 : 1; + if (scp->score < 0) { + scp->score = 0; + } + } +} + +static void +vm_phys_chunk_register_hole(struct vm_phys_search_chunk *cp, + vm_paddr_t hole_start, vm_paddr_t hole_end) +{ + struct vm_phys_subseg *ssp; + + if (cp->shp == NULL) { + vm_paddr_t chunk_start = hole_start & + ~VM_PHYS_SEARCH_CHUNK_MASK; + cp->shp = malloc(sizeof(*cp->shp), M_TEMP, M_ZERO | M_WAITOK); + SLIST_INIT(cp->shp); + /* Split chunk into a subseg */ + ssp = malloc(sizeof(*ssp), M_TEMP, M_ZERO | M_WAITOK); + ssp->region.start = chunk_start; + ssp->region.end = chunk_start + VM_PHYS_SEARCH_CHUNK_SIZE; + + SLIST_INSERT_HEAD(cp->shp, ssp, link); + } + + /* + * Holes are ordered by paddr - hole registration will + * thus always affect the last subsegment in the list. + * Take last subseg and split it. + */ + ssp = SLIST_FIRST(cp->shp); + while (SLIST_NEXT(ssp, link)) { + ssp = SLIST_NEXT(ssp, link); + } + + if (hole_start == ssp->region.start) { + ssp->region.start = hole_end; + } else if (hole_end == ssp->region.end) { + ssp->region.end = hole_start; + } else { /* Hole splits the subseg - create and enqueue new subseg */ + struct vm_phys_subseg *nssp = malloc(sizeof(*nssp), M_TEMP, + M_ZERO | M_WAITOK); + + nssp->region.start = hole_end; + nssp->region.end = ssp->region.end; + ssp->region.end = hole_start; + KASSERT(nssp->region.end > nssp->region.start, + ("%s: inconsistent subsegment after splitting", __func__)); + + SLIST_INSERT_AFTER(ssp, nssp, link); + } + + KASSERT(ssp->region.end > ssp->region.start, + ("%s: inconsistent subsegment", __func__)); +} + +/* + * Populates compaction search index with hole information. 
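+ * Each hole is clipped against the fixed-size search chunks: a hole contained within a single chunk is recorded as a sub-segment split of that chunk, the partially covered edge chunks of a larger hole get sub-segments for their valid portions, and chunks completely covered by a hole have their skip index set so that scans jump over them.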
+ */ +static void +vm_phys_compact_init_holes(void) +{ + int dom; + struct vm_phys_search_index *sip; + struct vm_phys_search_chunk *start_chunk, *end_chunk; + struct vm_phys_hole *hp; + int start_idx, end_idx; + + for (dom = 0; dom < vm_ndomains; dom++) { + sip = &vm_phys_search_index[dom]; + + /* Add hole information to domain search chunks */ + for (int i = 0; i < vm_phys_nholes; i++) { + hp = &vm_phys_holes[i]; + if (hp->domain != dom) + continue; + + start_idx = vm_phys_paddr_to_chunk_idx(hp->start, dom); + end_idx = vm_phys_paddr_to_chunk_idx(hp->end, dom); + + start_chunk = vm_phys_search_get_chunk(sip, start_idx); + /* + * If the domain end address is search chunk-aligned + * and a hole ends there, decrement the index to avoid + * an out of bounds access to the search index chunks. + */ + if ((sip->dom_end & VM_PHYS_SEARCH_CHUNK_MASK) == 0 && + hp->end == sip->dom_end) { + end_chunk = vm_phys_search_get_chunk(sip, + end_idx - 1); + /* This is the last search chunk, point it to + * the first one */ + end_chunk->skipidx = 1; + } else { + end_chunk = vm_phys_search_get_chunk(sip, + end_idx); + } + + /* Hole is completely inside this chunk */ + if (start_chunk == end_chunk) { + /* Register hole in current chunk. */ + vm_phys_chunk_register_hole(start_chunk, + hp->start, hp->end); + } else { /* Hole spans multiple chunks */ + if (hp->start & VM_PHYS_SEARCH_CHUNK_MASK) { + /* Partial overlap - register hole in + * first chunk. */ + vm_phys_chunk_register_hole(start_chunk, + hp->start, + (hp->start & + ~VM_PHYS_SEARCH_CHUNK_MASK) + + VM_PHYS_SEARCH_CHUNK_SIZE); + start_chunk++; + } + /* Mark all chunks that are completely covered + * by this hole as invalid. */ + while (start_chunk < end_chunk) { + start_chunk->skipidx = end_idx; + start_chunk++; + } + + if (hp->end & VM_PHYS_SEARCH_CHUNK_MASK) { + /* Partial overlap - register hole in + * last chunk. */ + vm_phys_chunk_register_hole(end_chunk, + (hp->end & + ~VM_PHYS_SEARCH_CHUNK_MASK), + hp->end); + } + } + } + /* Register search index holes at domain end */ + if (sip->dom_end & VM_PHYS_SEARCH_CHUNK_MASK) { + end_idx = vm_phys_paddr_to_chunk_idx(sip->dom_end, dom); + end_chunk = vm_phys_paddr_to_search_chunk(sip->dom_end, + dom); + + vm_phys_chunk_register_hole(end_chunk, sip->dom_end, + vm_phys_search_idx_to_paddr(end_idx + 1, dom)); + } + } +} + +/* Initializes holes. */ +static void +vm_phys_init_compact(void *arg) +{ + vm_phys_compact_init_holes(); +} + +SYSINIT(vm_phys_compact, SI_SUB_KMEM + 1, SI_ORDER_ANY, vm_phys_init_compact, + NULL); + +/* Maximum number of memory regions enqueued during a search function run. */ +#define VM_PHYS_COMPACT_MAX_SEARCH_REGIONS 10 + +struct vm_phys_compact_ctx { + int last_idx; + struct vm_compact_region region[VM_PHYS_COMPACT_MAX_SEARCH_REGIONS]; +}; + +static void +vm_phys_compact_ctx_init(void **p_data) +{ + *p_data = (void *)malloc(sizeof(struct vm_phys_compact_ctx), + M_VMCOMPACT, M_ZERO | M_WAITOK); +} + +static struct vm_compact_region * +vm_phys_compact_ctx_get_region(struct vm_phys_compact_ctx *ctxp, int idx) +{ + KASSERT(idx < VM_PHYS_COMPACT_MAX_SEARCH_REGIONS, + ("%s: Not enough memory for regions: %d\n", __func__, idx)); + return (&ctxp->region[idx]); +} + +/* + * Scans the search index for physical memory regions that could be potential + * compaction candidates. Eligible regions are enqueued on a slist. 
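+ * The scan resumes from the chunk where the previous invocation left off and wraps around the index. A chunk is considered eligible when its fragmentation score is high enough and its free-page (hole) count lies between VM_PHYS_HOLECNT_LO and VM_PHYS_HOLECNT_HI. Returns non-zero if no eligible regions were found.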
+ */ +static int +vm_phys_compact_search(struct vm_compact_region_head *headp, int domain, + void *p_data) +{ + struct vm_phys_search_chunk *scp; + struct vm_phys_compact_ctx *ctx = (struct vm_phys_compact_ctx *)p_data; + struct vm_phys_search_index *sip = &vm_phys_search_index[domain]; + struct vm_phys_subseg *ssegp; + struct vm_compact_region *rp; + vm_paddr_t start, end; + int idx, region_cnt = 0; + int ctx_region_idx = 0; + int chunks_scanned = 0; + + SLIST_INIT(headp); + + idx = ctx->last_idx; + while (chunks_scanned < sip->nchunks && + region_cnt < VM_PHYS_COMPACT_MAX_SEARCH_REGIONS) { + for (; + chunks_scanned < sip->nchunks && idx < sip->nchunks - 1 && + region_cnt < VM_PHYS_COMPACT_MAX_SEARCH_REGIONS; + chunks_scanned++, idx++) { + + scp = vm_phys_search_get_chunk(sip, idx); + /* Skip current chunk if it was marked as invalid */ + if (scp->skipidx) { + idx = scp->skipidx - 1; + chunks_scanned += (scp->skipidx - 1) - idx; + continue; + } + + /* Determine whether the current chunk is eligible to be + * compacted */ + if (scp->score > 1 && + scp->holecnt >= VM_PHYS_HOLECNT_LO && + scp->holecnt <= VM_PHYS_HOLECNT_HI) { + if (scp->shp) { + /* Enqueue subsegments in chunks with + * holes. */ + SLIST_FOREACH (ssegp, scp->shp, link) { + SLIST_INSERT_HEAD(headp, + &ssegp->region, entries); + } + + } else { + start = vm_phys_search_idx_to_paddr(idx, + domain); + end = vm_phys_search_idx_to_paddr(idx + + 1, + domain); + + rp = vm_phys_compact_ctx_get_region(ctx, + ctx_region_idx); + rp->start = start; + rp->end = end; + SLIST_INSERT_HEAD(headp, rp, entries); + + ctx_region_idx++; + } + + region_cnt++; + } + } + idx = (idx + 1) % (sip->nchunks - 1); + } + ctx->last_idx = (idx + 1) % (sip->nchunks - 1); + + return SLIST_EMPTY(headp); +} + +/* + * Determine whether a given page is eligible as a relocation destination. + */ +static __noinline bool +vm_phys_defrag_page_free(vm_page_t p) +{ + return (p->order == 0); +} + +/* + * Determine whether a given page is eligible to be relocated. + * A suitable page is left in a xbusied state and its object is locked. + */ +static __noinline bool +vm_phys_defrag_page_relocatable(vm_page_t p) +{ + vm_object_t obj; + + if (p->order != VM_NFREEORDER || vm_page_wired(p) || + (obj = atomic_load_ptr(&p->object)) == NULL) + return false; + + VM_OBJECT_WLOCK(obj); + if (obj != p->object || + (obj->type != OBJT_DEFAULT && obj->type != OBJT_VNODE)) { + goto unlock; + } + + if (vm_page_tryxbusy(p) == 0) + goto unlock; + + if (!vm_page_wired(p) && !vm_page_none_valid(p)) { + return true; + } + + vm_page_xunbusy(p); +unlock: + VM_OBJECT_WUNLOCK(obj); + return false; +} + +static size_t +vm_phys_defrag(struct vm_compact_region_head *headp, int domain, void *p_data) +{ + vm_compact_region_t rp; + size_t nrelocated = 0; + int error; + while (!SLIST_EMPTY(headp)) { + rp = SLIST_FIRST(headp); + SLIST_REMOVE_HEAD(headp, entries); + + vm_page_t free = PHYS_TO_VM_PAGE(rp->start); + vm_page_t scan = PHYS_TO_VM_PAGE(rp->end - PAGE_SIZE); + + KASSERT(free && scan, + ("%s: pages are null %p, %p, region start: %p, region end: %p", + __func__, free, scan, (void *)rp->start, + (void *)rp->end)); + KASSERT(free->phys_addr && scan->phys_addr, + ("%s: pages have null paddr %p, %p", __func__, + (void *)free->phys_addr, (void *)scan->phys_addr)); + + while (free < scan) { + + /* Find suitable destination page ("hole"). */ + while (free < scan && !vm_phys_defrag_page_free(free)) { + free++; + } + + if (__predict_false(free >= scan)) { + break; + } + + /* Find suitable relocation candidate. 
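+ * The scan moves backwards from the end of the region; a suitable candidate is left exclusively busied with its object write-locked by vm_phys_defrag_page_relocatable().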
*/ + while (free < scan && + !vm_phys_defrag_page_relocatable(scan)) { + scan--; + } + + if (__predict_false(free >= scan)) { + break; + } + + /* Swap the two pages and move "fingers". */ + error = vm_page_relocate_page(scan, free, domain); + if (error == 0) { + nrelocated++; + scan--; + free++; + } else if (error == 1) { + scan--; + } else { + free++; + } + } + } + + return nrelocated; +} + +/* + * Value of FMFI metric below which compaction will not start. + */ +static int vm_phys_compact_thresh = 300; /* 200 - 1000 */ +static int sysctl_vm_phys_compact_thresh(SYSCTL_HANDLER_ARGS); +SYSCTL_OID(_vm, OID_AUTO, phys_compact_thresh, CTLTYPE_INT | CTLFLAG_RW, NULL, + 0, sysctl_vm_phys_compact_thresh, "I", + "Fragmentation index threshold for memory compaction"); + +static int +sysctl_vm_phys_compact_thresh(SYSCTL_HANDLER_ARGS) +{ + int error; + int new = vm_phys_compact_thresh; + + error = sysctl_handle_int(oidp, &new, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + + if (new != vm_phys_compact_thresh) { + if (new < 200) { + new = 200; + } else if (new > 1000) { + new = 1000; + } + vm_phys_compact_thresh = new; + } + + return (0); +} + +/* + * Structures and routines used by the compaction daemon. + */ +static struct proc *compactproc; +static struct thread *compact_threads[MAXMEMDOM - 1]; + +static void +vm_phys_compact_thread(void *arg) +{ + void *cctx; + size_t domain = (size_t)arg; + void *chan = (void *)&compact_threads[domain]; + struct vm_domain *dom = VM_DOMAIN(domain); + + int error; + int old_frag_idx, frag_idx, nretries = 0; + int nrelocated; + int timo = hz; + + vm_paddr_t start, end; + + start = vm_phys_search_index[domain].dom_start; + end = vm_phys_search_index[domain].dom_end; + cctx = vm_compact_create_job(vm_phys_compact_search, vm_phys_defrag, + vm_phys_compact_ctx_init, start, end, VM_LEVEL_0_ORDER, domain, + &error); + KASSERT(cctx != NULL, ("Error creating compaction job: %d\n", error)); + + while (true) { + tsleep(chan, PPAUSE | PCATCH | PNOLOCK, "cmpctslp", timo); + kproc_suspend_check(compactproc); + + vm_domain_free_lock(dom); + frag_idx = vm_phys_fragmentation_index(VM_LEVEL_0_ORDER, + domain); + vm_domain_free_unlock(dom); + + nretries = 0; + + /* Run compaction until the fragmentation metric stops + * improving. */ + do { + /* No need to compact if fragmentation is below the + * threshold. */ + if (frag_idx < vm_phys_compact_thresh) { + break; + } + + old_frag_idx = frag_idx; + + nrelocated = vm_compact_run(cctx); + /* An error occured. */ + if (nrelocated < 0) { + break; + } + + vm_domain_free_lock(dom); + frag_idx = vm_phys_fragmentation_index(VM_LEVEL_0_ORDER, + domain); + vm_domain_free_unlock(dom); + + if (nrelocated == 0 || (frag_idx >= old_frag_idx)) { + nretries++; + } else { + nretries = 0; + } + } while (nretries < 5); + + /* If compaction was not able to lower the fragmentation score, + * sleep for a longer period of time. 
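+ * Back off to a 10 * hz sleep instead of hz before re-checking the fragmentation index.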
*/ + if (nretries == 5) { + timo = 10 * hz; + } else { + timo = hz; + } + } + vm_compact_free_job(cctx); +} + +static void +vm_phys_compact_daemon(void) +{ + int error; + + EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, compactproc, + SHUTDOWN_PRI_FIRST); + + for (size_t i = 1; i < vm_ndomains; i++) { + error = kproc_kthread_add(vm_phys_compact_thread, (void *)i, + &compactproc, &compact_threads[i - 1], 0, 0, + "compactdaemon", "compact%zu", i); + if (error) { + panic("%s: cannot start compaction thread, error: %d", + __func__, error); + } + } + + vm_phys_compact_thread((void *)0); +} + +static struct kproc_desc compact_kp = { "compactdaemon", vm_phys_compact_daemon, + &compactproc }; +SYSINIT(compactdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, kproc_start, + &compact_kp); + +static int sysctl_vm_phys_compact(SYSCTL_HANDLER_ARGS); +SYSCTL_OID(_vm, OID_AUTO, phys_compact, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, + sysctl_vm_phys_compact, "A", "Compact physical memory"); + +static int +sysctl_vm_phys_compact(SYSCTL_HANDLER_ARGS) +{ + struct sbuf sbuf; + int error; + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + sbuf_new_for_sysctl(&sbuf, NULL, 32, req); + + for (int i = 0; i < vm_ndomains; i++) { + void *chan = (void *)&compact_threads[i]; + wakeup_one(chan); + } + + sbuf_printf(&sbuf, "Kicked compaction daemon"); + + error = sbuf_finish(&sbuf); + sbuf_delete(&sbuf); + + return (error); +}
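To illustrate the compaction KPI introduced in sys/vm/vm_compact.h, here is a minimal sketch (not part of the patch) of how an in-kernel consumer could supply its own callbacks and drive a job. All example_* names are hypothetical, the callbacks are intentionally trivial, and the order and domain values are placeholders.

/*
 * Hypothetical consumer of the vm_compact KPI; assumes only the
 * declarations from <vm/vm_compact.h> above.
 */
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/queue.h>

#include <vm/vm.h>
#include <vm/vm_compact.h>

static struct vm_compact_region example_region;

/* Search callback: offer one fixed region; non-zero would mean "no work". */
static int
example_search(struct vm_compact_region_head *headp, int domain, void *arg)
{
	SLIST_INIT(headp);
	SLIST_INSERT_HEAD(headp, &example_region, entries);
	return (0);
}

/* Defrag callback: would walk the regions and return the number of pages moved. */
static size_t
example_defrag(struct vm_compact_region_head *headp, int domain, void *arg)
{
	return (0);
}

/* Per-job private data; this consumer keeps none. */
static void
example_ctx_init(void **p_data)
{
	*p_data = NULL;
}

static int
example_compact(vm_paddr_t start, vm_paddr_t end, int domain)
{
	void *job;
	int error;

	example_region.start = start;
	example_region.end = end;

	job = vm_compact_create_job(example_search, example_defrag,
	    example_ctx_init, start, end, 9 /* placeholder order */, domain,
	    &error);
	if (job == NULL)
		return (error);

	/* Returns -EINPROGRESS if the range overlaps an active job. */
	error = vm_compact_run(job);
	vm_compact_free_job(job);
	return (error);
}

The patch's own consumer is the per-domain compaction daemon above, which can also be kicked manually by reading the vm.phys_compact sysctl.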