D40772.id123834.diff

Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -5189,6 +5189,7 @@
vm/uma_core.c standard
vm/uma_dbg.c standard
vm/memguard.c optional DEBUG_MEMGUARD
+vm/vm_compact.c standard
vm/vm_domainset.c standard
vm/vm_fault.c standard
vm/vm_glue.c standard
Index: sys/vm/vm_compact.h
===================================================================
--- /dev/null
+++ sys/vm/vm_compact.h
@@ -0,0 +1,56 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Bojan Novković <bnovkov@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+
+MALLOC_DECLARE(M_VMCOMPACT);
+
+struct vm_compact_region {
+ vm_paddr_t start;
+ vm_paddr_t end;
+ SLIST_ENTRY(vm_compact_region) entries;
+};
+typedef struct vm_compact_region *vm_compact_region_t;
+
+SLIST_HEAD(vm_compact_region_head, vm_compact_region);
+
+typedef int (
+ *vm_compact_search_fn)(struct vm_compact_region_head *, int, void *);
+typedef size_t (
+ *vm_compact_defrag_fn)(struct vm_compact_region_head *, int, void *);
+typedef bool (*vm_compact_end_fn)(void);
+typedef void (*vm_compact_ctx_init_fn)(void **);
+
+void *vm_compact_create_job(vm_compact_search_fn sfn, vm_compact_defrag_fn dfn,
+ vm_compact_ctx_init_fn ctxfn, vm_paddr_t start, vm_paddr_t end, int order,
+ int domain, int *error);
+void vm_compact_free_job(void *ctx);
+int vm_compact_run(void *ctx);
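+
+/*
+ * Illustrative call sequence (a sketch only, mirroring the consumer in
+ * vm_phys.c; 'search_fn', 'defrag_fn' and 'ctx_init_fn' stand for
+ * caller-provided callbacks and are not part of this interface):
+ *
+ *     void *job;
+ *     int error;
+ *
+ *     job = vm_compact_create_job(search_fn, defrag_fn, ctx_init_fn,
+ *         start, end, order, domain, &error);
+ *     if (job != NULL) {
+ *             (void)vm_compact_run(job);
+ *             vm_compact_free_job(job);
+ *     }
+ */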
Index: sys/vm/vm_compact.c
===================================================================
--- /dev/null
+++ sys/vm/vm_compact.c
@@ -0,0 +1,151 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Bojan Novković <bnovkov@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_compact.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pagequeue.h>
+#include <vm/vm_phys.h>
+
+#define VM_COMPACT_LOCK() mtx_lock(&compact_lock)
+#define VM_COMPACT_UNLOCK() mtx_unlock(&compact_lock)
+
+MALLOC_DEFINE(M_VMCOMPACT, "vm_compact_ctx", "memory compaction context");
+
+static struct mtx compact_lock;
+static LIST_HEAD(, vm_compact_ctx) active_compactions[MAXMEMDOM];
+
+struct vm_compact_ctx {
+ vm_compact_search_fn search_fn;
+ vm_compact_defrag_fn defrag_fn;
+
+ vm_paddr_t start;
+ vm_paddr_t end;
+
+ int order;
+ int domain;
+ struct vm_compact_region_head regions;
+
+ void *p_data;
+
+ LIST_ENTRY(vm_compact_ctx) entries;
+};
+
+static bool
+vm_compact_job_overlaps(struct vm_compact_ctx *ctxp1,
+ struct vm_compact_ctx *ctxp2)
+{
+ return (ctxp1->start <= ctxp2->end && ctxp2->start <= ctxp1->end);
+}
+
+void *
+vm_compact_create_job(vm_compact_search_fn sfn, vm_compact_defrag_fn dfn,
+ vm_compact_ctx_init_fn ctxfn, vm_paddr_t start, vm_paddr_t end, int order,
+ int domain, int *error)
+{
+ struct vm_compact_ctx *ctxp;
+ /* Arguments sanity check. */
+ if (end <= start || order > (VM_NFREEORDER_MAX - 1)) {
+ *error = (EINVAL);
+ return (NULL);
+ }
+
+ ctxp = malloc(sizeof(struct vm_compact_ctx), M_VMCOMPACT,
+ M_WAITOK | M_ZERO);
+
+ ctxp->search_fn = sfn;
+ ctxp->defrag_fn = dfn;
+ ctxp->start = start;
+ ctxp->end = end;
+ ctxp->order = order;
+ ctxp->domain = domain;
+ SLIST_INIT(&ctxp->regions);
+
+ ctxfn(&ctxp->p_data);
+
+ return ((void *)ctxp);
+}
+
+void
+vm_compact_free_job(void *ctx)
+{
+ free(ctx, M_VMCOMPACT);
+}
+
+int
+vm_compact_run(void *ctx)
+{
+ struct vm_compact_ctx *ctxp = (struct vm_compact_ctx *)ctx;
+ struct vm_compact_ctx *ctxp_tmp;
+ int retval;
+
+ VM_COMPACT_LOCK();
+ /* Check if the requested compaction overlaps with an existing one. */
+ LIST_FOREACH (ctxp_tmp, &active_compactions[ctxp->domain], entries) {
+ if (vm_compact_job_overlaps(ctxp, ctxp_tmp)) {
+ VM_COMPACT_UNLOCK();
+ return (-EINPROGRESS);
+ }
+ }
+
+ LIST_INSERT_HEAD(&active_compactions[ctxp->domain], ctxp, entries);
+ VM_COMPACT_UNLOCK();
+
+ /* Run compaction job. */
+ if (ctxp->search_fn(&ctxp->regions, ctxp->domain, ctxp->p_data)) {
+ retval = 0;
+ goto cleanup;
+ }
+
+ retval = ctxp->defrag_fn(&ctxp->regions, ctxp->domain, ctxp->p_data);
+
+cleanup:
+ VM_COMPACT_LOCK();
+ LIST_REMOVE(ctxp, entries);
+ VM_COMPACT_UNLOCK();
+
+ return (retval);
+}
+
+static void
+vm_compact_init(void *arg)
+{
+ mtx_init(&compact_lock, "vm_compact", NULL, MTX_DEF);
+ for (int i = 0; i < MAXMEMDOM; i++)
+ LIST_INIT(&active_compactions[i]);
+}
+
+SYSINIT(vm_compact, SI_SUB_KMEM + 2, SI_ORDER_ANY, vm_compact_init, NULL);
Index: sys/vm/vm_page.h
===================================================================
--- sys/vm/vm_page.h
+++ sys/vm/vm_page.h
@@ -711,6 +711,7 @@
vm_page_bits_t vm_page_bits(int base, int size);
void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid);
void vm_page_free_pages_toq(struct spglist *free, bool update_wire_count);
+int vm_page_relocate_page(vm_page_t src, vm_page_t dst, int domain);
void vm_page_dirty_KBI(vm_page_t m);
void vm_page_lock_KBI(vm_page_t m, const char *file, int line);
Index: sys/vm/vm_page.c
===================================================================
--- sys/vm/vm_page.c
+++ sys/vm/vm_page.c
@@ -738,6 +738,8 @@
#endif
phys_avail[biggestone + 1] = new_end;
+ vm_phys_search_index_startup(&vaddr);
+
/*
* Add physical memory segments corresponding to the available
* physical pages.
@@ -5595,6 +5597,86 @@
}
#endif
+/*
+ * Tries to move 'src' into 'dst'.  The 'src' page must be exclusively busied
+ * and its object write-locked; the object lock is released before returning.
+ * Returns 0 on success, 1 if relocation failed because of the src page and 2
+ * if it failed because of the dst page.
+ */
+int
+vm_page_relocate_page(vm_page_t src, vm_page_t dst, int domain)
+{
+ int error = 0;
+ struct vm_domain *vmd = VM_DOMAIN(domain);
+ vm_object_t obj = src->object;
+
+ vm_page_assert_xbusied(src);
+
+ VM_OBJECT_ASSERT_WLOCKED(obj);
+ KASSERT(vm_page_domain(src) == domain,
+ ("Source page is from a different domain"));
+ KASSERT(vm_page_domain(dst) == domain,
+ ("Destination page is from a different domain"));
+
+ vm_domain_free_lock(vmd);
+ /* Check if the dst page is still eligible and remove it from the
+ * freelist. */
+ if (dst->order != 0 || !vm_page_none_valid(dst)) {
+ error = 2;
+ vm_page_xunbusy(src);
+ vm_domain_free_unlock(vmd);
+ goto unlock;
+ }
+
+ vm_page_dequeue(dst);
+ vm_phys_unfree_page(dst);
+ vm_domain_free_unlock(vmd);
+ vm_domain_freecnt_inc(vmd, -1);
+
+ /* Unmap src page */
+ if (obj->ref_count != 0 && !vm_page_try_remove_all(src)) {
+ error = 1;
+
+ vm_page_xunbusy(src);
+ /* Place dst page back on the freelists. */
+ vm_domain_free_lock(vmd);
+ vm_phys_free_pages(dst, 0);
+ vm_domain_free_unlock(vmd);
+ vm_domain_freecnt_inc(vmd, 1);
+ goto unlock;
+ }
+ /*
+ * Clear the VPB_FREED marker; without this, a thread waiting to busy
+ * the page would sleep forever on the "vmpfw" wait channel.
+ */
+ if (dst->busy_lock == VPB_FREED) {
+ dst->busy_lock = VPB_UNBUSIED;
+ }
+
+ /* Copy page attributes */
+ dst->a.flags = src->a.flags & ~PGA_QUEUE_STATE_MASK;
+ dst->oflags = 0;
+ pmap_copy_page(src, dst);
+
+ dst->valid = src->valid;
+ dst->dirty = src->dirty;
+ src->flags &= ~PG_ZERO;
+ vm_page_dequeue(src);
+
+ if (vm_page_replace_hold(dst, obj, src->pindex, src) &&
+ vm_page_free_prep(src)) {
+ /* Return src page to freelist. */
+ vm_domain_free_lock(vmd);
+ vm_phys_free_pages(src, 0);
+ vm_domain_free_unlock(vmd);
+
+ vm_domain_freecnt_inc(vmd, 1);
+ }
+
+ vm_page_deactivate(dst);
+unlock:
+ VM_OBJECT_WUNLOCK(obj);
+
+ return (error);
+}
+
#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>
Index: sys/vm/vm_phys.h
===================================================================
--- sys/vm/vm_phys.h
+++ sys/vm/vm_phys.h
@@ -87,7 +87,8 @@
int vm_phys_avail_largest(void);
vm_paddr_t vm_phys_avail_size(int i);
bool vm_phys_is_dumpable(vm_paddr_t pa);
-
+int vm_phys_fragmentation_index(int order, int domain);
+void vm_phys_search_index_startup(vm_offset_t *vaddr);
static inline int
vm_phys_domain(vm_paddr_t pa)
{
Index: sys/vm/vm_phys.c
===================================================================
--- sys/vm/vm_phys.c
+++ sys/vm/vm_phys.c
@@ -47,8 +47,10 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domainset.h>
-#include <sys/lock.h>
+#include <sys/eventhandler.h>
#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
@@ -62,6 +64,8 @@
#include <ddb/ddb.h>
#include <vm/vm.h>
+
+#include <vm/vm_compact.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
@@ -131,6 +135,53 @@
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
+/*
+ * Structures used for memory compaction.
+ */
+
+/* Tracks invalid physical memory ranges. */
+struct vm_phys_hole {
+ vm_paddr_t start;
+ vm_paddr_t end;
+ int domain;
+};
+
+/*
+ * Tracks valid memory ranges inside search index chunks that contain memory
+ * holes.
+ */
+struct vm_phys_subseg {
+ struct vm_compact_region region;
+ SLIST_ENTRY(vm_phys_subseg) link;
+};
+SLIST_HEAD(vm_phys_subseg_head, vm_phys_subseg);
+
+/*
+ * Tracks per-chunk compaction metrics (free page count, fragmentation score)
+ * and valid memory segments for a fixed-size physical memory region.
+ */
+struct vm_phys_search_chunk {
+ int holecnt;
+ int score;
+ int skipidx;
+ struct vm_phys_subseg_head *shp;
+};
+
+struct vm_phys_search_index {
+ struct vm_phys_search_chunk *chunks;
+ int nchunks;
+ vm_paddr_t dom_start;
+ vm_paddr_t dom_end;
+};
+
+static void vm_phys_update_search_index(vm_page_t m, int order, bool alloc);
+
+static struct vm_phys_search_index vm_phys_search_index[MAXMEMDOM];
+
+static struct vm_phys_hole vm_phys_holes[VM_PHYSSEG_MAX * 2];
+static int vm_phys_nholes;
+
+struct vm_phys_info {
+ uint64_t free_pages;
+ uint64_t free_blocks;
+};
+
/*
* Provides the mapping from VM_FREELIST_* to free list indices (flind).
*/
@@ -156,6 +207,11 @@
sysctl_vm_phys_free, "A",
"Phys Free Info");
+static int sysctl_vm_phys_frag_idx(SYSCTL_HANDLER_ARGS);
+SYSCTL_OID(_vm, OID_AUTO, phys_frag_idx,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_vm_phys_frag_idx, "A", "Phys Frag Info");
+
static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
@@ -291,6 +347,80 @@
return (error);
}
+static void
+vm_phys_get_info(struct vm_phys_info *info, int domain)
+{
+ struct vm_freelist *fl;
+ int pind, oind, flind;
+
+ /* Calculate total number of free pages and blocks */
+ info->free_pages = info->free_blocks = 0;
+ for (flind = 0; flind < vm_nfreelists; flind++) {
+ for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
+ for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+ fl = vm_phys_free_queues[domain][flind][pind];
+ info->free_pages += fl[oind].lcnt << oind;
+ info->free_blocks += fl[oind].lcnt;
+ }
+ }
+ }
+}
+
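+/*
+ * Free Memory Fragmentation Index (FMFI) for the given order, scaled to the
+ * range 0-1000.  The index approaches 1000 as free memory is split into many
+ * blocks that are individually smaller than the requested order.  Rough
+ * worked example (illustrative numbers only): with 1000 free pages spread
+ * across 500 free blocks and order 2, the index is
+ * 1000 - (1000 * 1000) / (1 << 2) / 500 = 500.
+ */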
+int
+vm_phys_fragmentation_index(int order, int domain)
+{
+ struct vm_phys_info info;
+
+ vm_domain_free_assert_locked(VM_DOMAIN(domain));
+ vm_phys_get_info(&info, domain);
+
+ if (info.free_blocks == 0) {
+ return (0);
+ }
+
+ return (1000 -
+ ((info.free_pages * 1000) / (1 << order) / info.free_blocks));
+}
+
+/*
+ * Outputs the value of the Free Memory Fragmentation Index (FMFI) for each
+ * domain.
+ */
+static int
+sysctl_vm_phys_frag_idx(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sbuf;
+ int idx;
+ int oind, dom, error;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
+
+ for (dom = 0; dom < vm_ndomains; dom++) {
+ vm_domain_free_lock(VM_DOMAIN(dom));
+
+ sbuf_printf(&sbuf, "\n--\n");
+ sbuf_printf(&sbuf, "\nDOMAIN %d\n", dom);
+ sbuf_printf(&sbuf, "\n ORDER (SIZE) | FMFI\n");
+ sbuf_printf(&sbuf, "--\n");
+
+ for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
+ idx = vm_phys_fragmentation_index(oind, dom);
+ sbuf_printf(&sbuf, " %2d (%6dK) ", oind,
+ 1 << (PAGE_SHIFT - 10 + oind));
+ sbuf_printf(&sbuf, "| %d\n", idx);
+ }
+
+ vm_domain_free_unlock(VM_DOMAIN(dom));
+ }
+
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+ return (error);
+}
+
/*
* Outputs the set of physical memory segments.
*/
@@ -378,6 +508,7 @@
else
TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
fl[order].lcnt++;
+ vm_phys_update_search_index(m, order, false);
}
static void
@@ -387,6 +518,7 @@
TAILQ_REMOVE(&fl[order].pl, m, listq);
fl[order].lcnt--;
m->order = VM_NFREEORDER;
+ vm_phys_update_search_index(m, order, true);
}
/*
@@ -614,6 +746,47 @@
}
}
+ /*
+ * Initialize hole array.
+ */
+ struct vm_phys_hole *hp;
+
+ vm_phys_nholes = 0;
+ if (vm_phys_segs[0].start != 0) {
+ hp = &vm_phys_holes[0];
+ hp->start = 0;
+ hp->end = vm_phys_segs[0].start;
+ hp->domain = vm_phys_segs[0].domain;
+ vm_phys_nholes++;
+ }
+
+ struct vm_phys_search_index *sip;
+ /* Add holes between consecutive physical memory segments. */
+ for (int i = 0; i + 1 < vm_phys_nsegs; i++, vm_phys_nholes++) {
+ hp = &vm_phys_holes[vm_phys_nholes];
+ hp->start = vm_phys_segs[i].end;
+ hp->end = vm_phys_segs[i + 1].start;
+ hp->domain = vm_phys_segs[i].domain;
+ sip = &vm_phys_search_index[hp->domain];
+
+ /* Does this hole span two domains? */
+ if (vm_phys_segs[i].domain != vm_phys_segs[i + 1].domain &&
+ hp->end > sip->dom_end) {
+ /* Clamp end of current hole to domain end */
+ sip = &vm_phys_search_index[hp->domain];
+ hp->end = sip->dom_end;
+ /* Add new hole at beginning of subsequent domain */
+ vm_phys_nholes++;
+ hp = &vm_phys_holes[vm_phys_nholes];
+ hp->domain = vm_phys_segs[i + 1].domain;
+ sip = &vm_phys_search_index[hp->domain];
+ /* Hole starts at domain start and ends at the start of
+ * the first segment. */
+ hp->start = sip->dom_start;
+ hp->end = vm_phys_segs[i + 1].start;
+ }
+ }
+
rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}
@@ -1892,3 +2065,654 @@
}
}
#endif
+
+#define VM_PHYS_SEARCH_CHUNK_ORDER (14)
+#define VM_PHYS_SEARCH_CHUNK_NPAGES (1 << (VM_PHYS_SEARCH_CHUNK_ORDER))
+#define VM_PHYS_SEARCH_CHUNK_SIZE \
+ (1 << (PAGE_SHIFT + VM_PHYS_SEARCH_CHUNK_ORDER))
+#define VM_PHYS_SEARCH_CHUNK_MASK (VM_PHYS_SEARCH_CHUNK_SIZE - 1)
+#define VM_PHYS_HOLECNT_HI ((1 << (VM_PHYS_SEARCH_CHUNK_ORDER)) - 100)
+#define VM_PHYS_HOLECNT_LO (16)
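+
+/*
+ * Each search chunk covers 2^VM_PHYS_SEARCH_CHUNK_ORDER pages of physical
+ * address space; with 4 KB pages (PAGE_SHIFT == 12, an assumption that does
+ * not hold on every platform) that is 2^14 * 4 KB = 64 MB per chunk.  The
+ * VM_PHYS_HOLECNT_LO/HI bounds select chunks that contain enough free pages
+ * to be worth compacting but are not almost entirely free.
+ */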
+
+static __inline vm_paddr_t
+vm_phys_search_idx_to_paddr(int idx, int domain)
+{
+ vm_paddr_t paddr;
+ struct vm_phys_search_index *sip = &vm_phys_search_index[domain];
+
+ paddr = (vm_paddr_t)idx << ((VM_PHYS_SEARCH_CHUNK_ORDER) + PAGE_SHIFT);
+ /* Adjust address relative to domain start */
+ paddr += sip->dom_start & ~VM_PHYS_SEARCH_CHUNK_MASK;
+
+ return (paddr);
+}
+
+static __inline int
+vm_phys_paddr_to_chunk_idx(vm_paddr_t paddr, int domain)
+{
+ struct vm_phys_search_index *sip = &vm_phys_search_index[domain];
+
+ /* Adjust address relative to domain start */
+ paddr -= sip->dom_start & ~VM_PHYS_SEARCH_CHUNK_MASK;
+ /* Strip lower bits */
+ paddr &= ~VM_PHYS_SEARCH_CHUNK_MASK;
+ return (int)(paddr >> ((VM_PHYS_SEARCH_CHUNK_ORDER) + PAGE_SHIFT));
+}
+
+static __inline struct vm_phys_search_chunk *
+vm_phys_search_get_chunk(struct vm_phys_search_index *sip, int idx)
+{
+ KASSERT(idx >= 0 && idx < sip->nchunks,
+ ("%s: search index out-of-bounds access, idx: %d, dom_start: %p, dom_end: %p, nchunks: %d",
+ __func__, idx, (void *)sip->dom_start, (void *)sip->dom_end,
+ sip->nchunks));
+
+ return (&sip->chunks[idx]);
+}
+
+static struct vm_phys_search_chunk *
+vm_phys_paddr_to_search_chunk(vm_paddr_t paddr, int domain)
+{
+ struct vm_phys_search_index *sip = &vm_phys_search_index[domain];
+ int idx = vm_phys_paddr_to_chunk_idx(paddr, domain);
+
+ return (vm_phys_search_get_chunk(sip, idx));
+}
+
+/*
+ * Allocates physical memory required for the memory compaction search index.
+ */
+void
+vm_phys_search_index_startup(vm_offset_t *vaddr)
+{
+ struct vm_phys_search_index *cur_idx;
+ vm_paddr_t pa;
+ vm_paddr_t dom_start, dom_end;
+ size_t alloc_size;
+ int dom_nsearch_chunks;
+ int i;
+
+ for (int dom = 0; dom < vm_ndomains; dom++) {
+ cur_idx = &vm_phys_search_index[dom];
+ dom_nsearch_chunks = 0;
+ /*
+ * Calculate the number of search index chunks for the current
+ * domain.
+ */
+ if (mem_affinity != NULL) {
+ for (i = 0; mem_affinity[i].end != 0; i++) {
+ if (mem_affinity[i].domain == dom) {
+ dom_start = mem_affinity[i].start;
+ while (mem_affinity[i].domain == dom) {
+ i++;
+ }
+ dom_end = mem_affinity[i - 1].end;
+ }
+ }
+ } else {
+ dom_start = phys_avail[0];
+ i = 1;
+ while (phys_avail[i + 1] != 0) {
+ i++;
+ }
+ dom_end = phys_avail[i];
+ }
+ /* Allocate search index for current domain */
+ dom_nsearch_chunks = atop(dom_end - dom_start) /
+ VM_PHYS_SEARCH_CHUNK_NPAGES;
+ /*
+ * Add additional chunks if the domain start and end aren't
+ * search chunk-aligned.
+ */
+ if (dom_start & VM_PHYS_SEARCH_CHUNK_MASK)
+ dom_nsearch_chunks++;
+ if (dom_end & VM_PHYS_SEARCH_CHUNK_MASK)
+ dom_nsearch_chunks++;
+
+ alloc_size = round_page(
+ dom_nsearch_chunks * sizeof(struct vm_phys_search_chunk));
+ pa = vm_phys_early_alloc(dom, alloc_size);
+
+ /* Map and zero the array */
+ cur_idx->chunks = (void *)(uintptr_t)pmap_map(vaddr, pa,
+ pa + alloc_size, VM_PROT_READ | VM_PROT_WRITE);
+ cur_idx->nchunks = dom_nsearch_chunks;
+ cur_idx->dom_start = dom_start;
+ cur_idx->dom_end = dom_end;
+
+ if (cur_idx->chunks == NULL) {
+ panic("Unable to allocate search index for domain %d\n",
+ dom);
+ }
+
+ bzero(cur_idx->chunks, alloc_size);
+ }
+}
+
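+/*
+ * Keeps the compaction search index in sync with the physical allocator: the
+ * per-chunk free page count is adjusted by the size of the freed or allocated
+ * block, and order-0 operations additionally adjust the chunk's fragmentation
+ * score.
+ */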
+static void
+vm_phys_update_search_index(vm_page_t m, int order, bool alloc)
+{
+ int domain = vm_page_domain(m);
+ struct vm_phys_search_chunk *scp =
+ vm_phys_paddr_to_search_chunk(m->phys_addr, domain);
+ int pgcnt = 1 << order;
+
+ /* Update chunk hole count */
+ scp->holecnt += alloc ? -pgcnt : pgcnt;
+ KASSERT(scp->holecnt >= 0 &&
+ scp->holecnt <= VM_PHYS_SEARCH_CHUNK_NPAGES,
+ ("%s: inconsistent hole count: %d", __func__, scp->holecnt));
+
+ /* Update chunk fragmentation score */
+ if (order == 0) {
+ scp->score += alloc ? -1 : 1;
+ if (scp->score < 0) {
+ scp->score = 0;
+ }
+ }
+}
+
+static void
+vm_phys_chunk_register_hole(struct vm_phys_search_chunk *cp,
+ vm_paddr_t hole_start, vm_paddr_t hole_end)
+{
+ struct vm_phys_subseg *ssp;
+
+ if (cp->shp == NULL) {
+ vm_paddr_t chunk_start = hole_start &
+ ~VM_PHYS_SEARCH_CHUNK_MASK;
+ cp->shp = malloc(sizeof(*cp->shp), M_TEMP, M_ZERO | M_WAITOK);
+ SLIST_INIT(cp->shp);
+ /* Initially cover the whole chunk with a single subsegment. */
+ ssp = malloc(sizeof(*ssp), M_TEMP, M_ZERO | M_WAITOK);
+ ssp->region.start = chunk_start;
+ ssp->region.end = chunk_start + VM_PHYS_SEARCH_CHUNK_SIZE;
+
+ SLIST_INSERT_HEAD(cp->shp, ssp, link);
+ }
+
+ /*
+ * Holes are ordered by paddr - hole registration will
+ * thus always affect the last subsegment in the list.
+ * Take last subseg and split it.
+ */
+ ssp = SLIST_FIRST(cp->shp);
+ while (SLIST_NEXT(ssp, link)) {
+ ssp = SLIST_NEXT(ssp, link);
+ }
+
+ if (hole_start == ssp->region.start) {
+ ssp->region.start = hole_end;
+ } else if (hole_end == ssp->region.end) {
+ ssp->region.end = hole_start;
+ } else { /* Hole splits the subseg - create and enqueue new subseg */
+ struct vm_phys_subseg *nssp = malloc(sizeof(*nssp), M_TEMP,
+ M_ZERO | M_WAITOK);
+
+ nssp->region.start = hole_end;
+ nssp->region.end = ssp->region.end;
+ ssp->region.end = hole_start;
+ KASSERT(nssp->region.end > nssp->region.start,
+ ("%s: inconsistent subsegment after splitting", __func__));
+
+ SLIST_INSERT_AFTER(ssp, nssp, link);
+ }
+
+ KASSERT(ssp->region.end > ssp->region.start,
+ ("%s: inconsistent subsegment", __func__));
+}
+
+/*
+ * Populates compaction search index with hole information.
+ */
+static void
+vm_phys_compact_init_holes(void)
+{
+ int dom;
+ struct vm_phys_search_index *sip;
+ struct vm_phys_search_chunk *start_chunk, *end_chunk;
+ struct vm_phys_hole *hp;
+ int start_idx, end_idx;
+
+ for (dom = 0; dom < vm_ndomains; dom++) {
+ sip = &vm_phys_search_index[dom];
+
+ /* Add hole information to domain search chunks */
+ for (int i = 0; i < vm_phys_nholes; i++) {
+ hp = &vm_phys_holes[i];
+ if (hp->domain != dom)
+ continue;
+
+ start_idx = vm_phys_paddr_to_chunk_idx(hp->start, dom);
+ end_idx = vm_phys_paddr_to_chunk_idx(hp->end, dom);
+
+ start_chunk = vm_phys_search_get_chunk(sip, start_idx);
+ /*
+ * If the domain end address is search chunk-aligned
+ * and a hole ends there, decrement the index to avoid
+ * an out of bounds access to the search index chunks.
+ */
+ if ((sip->dom_end & VM_PHYS_SEARCH_CHUNK_MASK) == 0 &&
+ hp->end == sip->dom_end) {
+ end_chunk = vm_phys_search_get_chunk(sip,
+ end_idx - 1);
+ /* This is the last search chunk, point it to
+ * the first one */
+ end_chunk->skipidx = 1;
+ } else {
+ end_chunk = vm_phys_search_get_chunk(sip,
+ end_idx);
+ }
+
+ /* Hole is completely inside this chunk */
+ if (start_chunk == end_chunk) {
+ /* Register hole in current chunk. */
+ vm_phys_chunk_register_hole(start_chunk,
+ hp->start, hp->end);
+ } else { /* Hole spans multiple chunks */
+ if (hp->start & VM_PHYS_SEARCH_CHUNK_MASK) {
+ /* Partial overlap - register hole in
+ * first chunk. */
+ vm_phys_chunk_register_hole(start_chunk,
+ hp->start,
+ (hp->start &
+ ~VM_PHYS_SEARCH_CHUNK_MASK) +
+ VM_PHYS_SEARCH_CHUNK_SIZE);
+ start_chunk++;
+ }
+ /* Mark all chunks that are completely covered
+ * by this hole as invalid. */
+ while (start_chunk < end_chunk) {
+ start_chunk->skipidx = end_idx;
+ start_chunk++;
+ }
+
+ if (hp->end & VM_PHYS_SEARCH_CHUNK_MASK) {
+ /* Partial overlap - register hole in
+ * last chunk. */
+ vm_phys_chunk_register_hole(end_chunk,
+ (hp->end &
+ ~VM_PHYS_SEARCH_CHUNK_MASK),
+ hp->end);
+ }
+ }
+ }
+ /* Register search index holes at domain end */
+ if (sip->dom_end & VM_PHYS_SEARCH_CHUNK_MASK) {
+ end_idx = vm_phys_paddr_to_chunk_idx(sip->dom_end, dom);
+ end_chunk = vm_phys_paddr_to_search_chunk(sip->dom_end,
+ dom);
+
+ vm_phys_chunk_register_hole(end_chunk, sip->dom_end,
+ vm_phys_search_idx_to_paddr(end_idx + 1, dom));
+ }
+ }
+}
+
+/* Initializes holes. */
+static void
+vm_phys_init_compact(void *arg)
+{
+ vm_phys_compact_init_holes();
+}
+
+SYSINIT(vm_phys_compact, SI_SUB_KMEM + 1, SI_ORDER_ANY, vm_phys_init_compact,
+ NULL);
+
+/* Maximum number of memory regions enqueued during a search function run. */
+#define VM_PHYS_COMPACT_MAX_SEARCH_REGIONS 10
+
+struct vm_phys_compact_ctx {
+ int last_idx;
+ struct vm_compact_region region[VM_PHYS_COMPACT_MAX_SEARCH_REGIONS];
+};
+
+static void
+vm_phys_compact_ctx_init(void **p_data)
+{
+ *p_data = (void *)malloc(sizeof(struct vm_phys_compact_ctx),
+ M_VMCOMPACT, M_ZERO | M_WAITOK);
+}
+
+static struct vm_compact_region *
+vm_phys_compact_ctx_get_region(struct vm_phys_compact_ctx *ctxp, int idx)
+{
+ KASSERT(idx < VM_PHYS_COMPACT_MAX_SEARCH_REGIONS,
+ ("%s: region index out of range: %d", __func__, idx));
+ return (&ctxp->region[idx]);
+}
+
+/*
+ * Scans the search index for physical memory regions that could be potential
+ * compaction candidates. Eligible regions are enqueued on a slist.
+ */
+static int
+vm_phys_compact_search(struct vm_compact_region_head *headp, int domain,
+ void *p_data)
+{
+ struct vm_phys_search_chunk *scp;
+ struct vm_phys_compact_ctx *ctx = (struct vm_phys_compact_ctx *)p_data;
+ struct vm_phys_search_index *sip = &vm_phys_search_index[domain];
+ struct vm_phys_subseg *ssegp;
+ struct vm_compact_region *rp;
+ vm_paddr_t start, end;
+ int idx, region_cnt = 0;
+ int ctx_region_idx = 0;
+ int chunks_scanned = 0;
+
+ SLIST_INIT(headp);
+
+ idx = ctx->last_idx;
+ while (chunks_scanned < sip->nchunks &&
+ region_cnt < VM_PHYS_COMPACT_MAX_SEARCH_REGIONS) {
+ for (;
+ chunks_scanned < sip->nchunks && idx < sip->nchunks - 1 &&
+ region_cnt < VM_PHYS_COMPACT_MAX_SEARCH_REGIONS;
+ chunks_scanned++, idx++) {
+
+ scp = vm_phys_search_get_chunk(sip, idx);
+ /* Skip current chunk if it was marked as invalid */
+ if (scp->skipidx) {
+ idx = scp->skipidx - 1;
+ chunks_scanned += (scp->skipidx - 1) - idx;
+ continue;
+ }
+
+ /* Determine whether the current chunk is eligible to be
+ * compacted */
+ if (scp->score > 1 &&
+ scp->holecnt >= VM_PHYS_HOLECNT_LO &&
+ scp->holecnt <= VM_PHYS_HOLECNT_HI) {
+ if (scp->shp) {
+ /* Enqueue subsegments in chunks with
+ * holes. */
+ SLIST_FOREACH (ssegp, scp->shp, link) {
+ SLIST_INSERT_HEAD(headp,
+ &ssegp->region, entries);
+ }
+
+ } else {
+ start = vm_phys_search_idx_to_paddr(idx,
+ domain);
+ end = vm_phys_search_idx_to_paddr(idx +
+ 1,
+ domain);
+
+ rp = vm_phys_compact_ctx_get_region(ctx,
+ ctx_region_idx);
+ rp->start = start;
+ rp->end = end;
+ SLIST_INSERT_HEAD(headp, rp, entries);
+
+ ctx_region_idx++;
+ }
+
+ region_cnt++;
+ }
+ }
+ idx = (idx + 1) % (sip->nchunks - 1);
+ }
+ ctx->last_idx = (idx + 1) % (sip->nchunks - 1);
+
+ return (SLIST_EMPTY(headp));
+}
+
+/*
+ * Determine whether a given page is eligible as a relocation destination.
+ */
+static __noinline bool
+vm_phys_defrag_page_free(vm_page_t p)
+{
+ return (p->order == 0);
+}
+
+/*
+ * Determine whether a given page is eligible to be relocated.
+ * A suitable page is left in an xbusied state and its object is locked.
+ */
+static __noinline bool
+vm_phys_defrag_page_relocatable(vm_page_t p)
+{
+ vm_object_t obj;
+
+ if (p->order != VM_NFREEORDER || vm_page_wired(p) ||
+ (obj = atomic_load_ptr(&p->object)) == NULL)
+ return (false);
+
+ VM_OBJECT_WLOCK(obj);
+ if (obj != p->object ||
+ (obj->type != OBJT_DEFAULT && obj->type != OBJT_VNODE)) {
+ goto unlock;
+ }
+
+ if (vm_page_tryxbusy(p) == 0)
+ goto unlock;
+
+ if (!vm_page_wired(p) && !vm_page_none_valid(p)) {
+ return (true);
+ }
+
+ vm_page_xunbusy(p);
+unlock:
+ VM_OBJECT_WUNLOCK(obj);
+ return (false);
+}
+
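+/*
+ * Two-finger compaction pass over each region on the list: 'free' walks up
+ * from the start of a region looking for free order-0 pages, 'scan' walks
+ * down from the end looking for relocatable pages, and each candidate found
+ * is moved into the free page until the two pointers meet.  Returns the
+ * number of relocated pages.
+ */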
+static size_t
+vm_phys_defrag(struct vm_compact_region_head *headp, int domain, void *p_data)
+{
+ vm_compact_region_t rp;
+ size_t nrelocated = 0;
+ int error;
+
+ while (!SLIST_EMPTY(headp)) {
+ rp = SLIST_FIRST(headp);
+ SLIST_REMOVE_HEAD(headp, entries);
+
+ vm_page_t free = PHYS_TO_VM_PAGE(rp->start);
+ vm_page_t scan = PHYS_TO_VM_PAGE(rp->end - PAGE_SIZE);
+
+ KASSERT(free && scan,
+ ("%s: pages are null %p, %p, region start: %p, region end: %p",
+ __func__, free, scan, (void *)rp->start,
+ (void *)rp->end));
+ KASSERT(free->phys_addr && scan->phys_addr,
+ ("%s: pages have null paddr %p, %p", __func__,
+ (void *)free->phys_addr, (void *)scan->phys_addr));
+
+ while (free < scan) {
+
+ /* Find suitable destination page ("hole"). */
+ while (free < scan && !vm_phys_defrag_page_free(free)) {
+ free++;
+ }
+
+ if (__predict_false(free >= scan)) {
+ break;
+ }
+
+ /* Find suitable relocation candidate. */
+ while (free < scan &&
+ !vm_phys_defrag_page_relocatable(scan)) {
+ scan--;
+ }
+
+ if (__predict_false(free >= scan)) {
+ break;
+ }
+
+ /* Swap the two pages and move "fingers". */
+ error = vm_page_relocate_page(scan, free, domain);
+ if (error == 0) {
+ nrelocated++;
+ scan--;
+ free++;
+ } else if (error == 1) {
+ scan--;
+ } else {
+ free++;
+ }
+ }
+ }
+
+ return (nrelocated);
+}
+
+/*
+ * Value of FMFI metric below which compaction will not start.
+ */
+static int vm_phys_compact_thresh = 300; /* 200 - 1000 */
+static int sysctl_vm_phys_compact_thresh(SYSCTL_HANDLER_ARGS);
+SYSCTL_OID(_vm, OID_AUTO, phys_compact_thresh,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_vm_phys_compact_thresh, "I",
+ "Fragmentation index threshold for memory compaction");
+
+static int
+sysctl_vm_phys_compact_thresh(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int new = vm_phys_compact_thresh;
+
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ if (new != vm_phys_compact_thresh) {
+ if (new < 200) {
+ new = 200;
+ } else if (new > 1000) {
+ new = 1000;
+ }
+ vm_phys_compact_thresh = new;
+ }
+
+ return (0);
+}
+
+/*
+ * Structures and routines used by the compaction daemon.
+ */
+static struct proc *compactproc;
+static struct thread *compact_threads[MAXMEMDOM];
+
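+/*
+ * Per-domain compaction loop: sleep until kicked or a timeout expires, compare
+ * the domain's fragmentation index against vm_phys_compact_thresh, and keep
+ * running compaction jobs until the index stops improving, backing off to a
+ * longer sleep when no progress is made.
+ */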
+static void
+vm_phys_compact_thread(void *arg)
+{
+ void *cctx;
+ size_t domain = (size_t)arg;
+ void *chan = (void *)&compact_threads[domain];
+ struct vm_domain *dom = VM_DOMAIN(domain);
+
+ int error;
+ int old_frag_idx, frag_idx, nretries = 0;
+ int nrelocated;
+ int timo = hz;
+
+ vm_paddr_t start, end;
+
+ start = vm_phys_search_index[domain].dom_start;
+ end = vm_phys_search_index[domain].dom_end;
+ cctx = vm_compact_create_job(vm_phys_compact_search, vm_phys_defrag,
+ vm_phys_compact_ctx_init, start, end, VM_LEVEL_0_ORDER, domain,
+ &error);
+ KASSERT(cctx != NULL, ("Error creating compaction job: %d\n", error));
+
+ while (true) {
+ tsleep(chan, PPAUSE | PCATCH | PNOLOCK, "cmpctslp", timo);
+ kproc_suspend_check(compactproc);
+
+ vm_domain_free_lock(dom);
+ frag_idx = vm_phys_fragmentation_index(VM_LEVEL_0_ORDER,
+ domain);
+ vm_domain_free_unlock(dom);
+
+ nretries = 0;
+
+ /* Run compaction until the fragmentation metric stops
+ * improving. */
+ do {
+ /* No need to compact if fragmentation is below the
+ * threshold. */
+ if (frag_idx < vm_phys_compact_thresh) {
+ break;
+ }
+
+ old_frag_idx = frag_idx;
+
+ nrelocated = vm_compact_run(cctx);
+ /* An error occurred. */
+ if (nrelocated < 0) {
+ break;
+ }
+
+ vm_domain_free_lock(dom);
+ frag_idx = vm_phys_fragmentation_index(VM_LEVEL_0_ORDER,
+ domain);
+ vm_domain_free_unlock(dom);
+
+ if (nrelocated == 0 || (frag_idx >= old_frag_idx)) {
+ nretries++;
+ } else {
+ nretries = 0;
+ }
+ } while (nretries < 5);
+
+ /* If compaction was not able to lower the fragmentation score,
+ * sleep for a longer period of time. */
+ if (nretries == 5) {
+ timo = 10 * hz;
+ } else {
+ timo = hz;
+ }
+ }
+ vm_compact_free_job(cctx);
+}
+
+static void
+vm_phys_compact_daemon(void)
+{
+ int error;
+
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, compactproc,
+ SHUTDOWN_PRI_FIRST);
+
+ for (size_t i = 1; i < vm_ndomains; i++) {
+ error = kproc_kthread_add(vm_phys_compact_thread, (void *)i,
+ &compactproc, &compact_threads[i - 1], 0, 0,
+ "compactdaemon", "compact%zu", i);
+ if (error) {
+ panic("%s: cannot start compaction thread, error: %d",
+ __func__, error);
+ }
+ }
+
+ vm_phys_compact_thread((void *)0);
+}
+
+static struct kproc_desc compact_kp = { "compactdaemon", vm_phys_compact_daemon,
+ &compactproc };
+SYSINIT(compactdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, kproc_start,
+ &compact_kp);
+
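+/*
+ * Reading vm.phys_compact wakes up the per-domain compaction threads.  Rough
+ * usage sketch from userland (illustrative only):
+ *
+ *     sysctl vm.phys_frag_idx            # inspect per-order FMFI values
+ *     sysctl vm.phys_compact_thresh=400  # adjust the compaction threshold
+ *     sysctl vm.phys_compact             # kick the compaction daemon
+ */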
+static int sysctl_vm_phys_compact(SYSCTL_HANDLER_ARGS);
+SYSCTL_OID(_vm, OID_AUTO, phys_compact,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_vm_phys_compact, "A", "Compact physical memory");
+
+static int
+sysctl_vm_phys_compact(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sbuf;
+ int error;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sbuf_new_for_sysctl(&sbuf, NULL, 32, req);
+
+ for (int i = 0; i < vm_ndomains; i++) {
+ void *chan = (void *)&compact_threads[i];
+ wakeup_one(chan);
+ }
+
+ sbuf_printf(&sbuf, "Kicked compaction daemon");
+
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+
+ return (error);
+}
