diff --git a/sys/vm/default_pager.c b/sys/vm/default_pager.c index 758012692a48..d07a84f8c98e 100644 --- a/sys/vm/default_pager.c +++ b/sys/vm/default_pager.c @@ -1,156 +1,158 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1995, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by David Greenman. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include +#include #include #include #include #include #include static vm_object_t default_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); static void default_pager_dealloc(vm_object_t); static int default_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static void default_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t default_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); /* * pagerops for OBJT_DEFAULT - "default pager". * * This pager handles anonymous (no handle) swap-backed memory, just * like the swap pager. It allows several optimizations based on the * fact that no pages of a default object can be swapped out. The * most important optimization is in vm_fault(), where the pager is * never asked for a non-resident page. Instead, a freshly allocated * zeroed page is used. * * On the first request to page out a page from a default object, the * object is converted to swap pager type. */ const struct pagerops defaultpagerops = { + .pgo_kvme_type = KVME_TYPE_DEFAULT, .pgo_alloc = default_pager_alloc, .pgo_dealloc = default_pager_dealloc, .pgo_getpages = default_pager_getpages, .pgo_putpages = default_pager_putpages, .pgo_haspage = default_pager_haspage, }; /* * Return an initialized object. 
*/ static vm_object_t default_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; if (handle != NULL) panic("default_pager_alloc: handle specified"); if (cred != NULL) { if (!swap_reserve_by_cred(size, cred)) return (NULL); crhold(cred); } object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(round_page(offset + size))); if (cred != NULL) { object->cred = cred; object->charge = size; } return (object); } /* * Deallocate resources associated with the object. */ static void default_pager_dealloc(vm_object_t object) { /* Reserved swap is released by vm_object_destroy(). */ object->type = OBJT_DEAD; } /* * Load pages from backing store. */ static int default_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { /* * Since an OBJT_DEFAULT object is converted to OBJT_SWAP by the first * call to the putpages method, this function will never be called on * a vm_page with assigned swap. */ return (VM_PAGER_FAIL); } /* * Store pages to backing store. */ static void default_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { /* The swap pager will convert the object to OBJT_SWAP. */ swappagerops.pgo_putpages(object, m, count, flags, rtvals); } /* * Tell us whether the requested (object,index) is available from the object's * backing store. */ static boolean_t default_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { /* An OBJT_DEFAULT object has no backing store. */ return (FALSE); } diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c index 25affb114a47..567f8fe55817 100644 --- a/sys/vm/device_pager.c +++ b/sys/vm/device_pager.c @@ -1,472 +1,475 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)device_pager.c 8.1 (Berkeley) 6/11/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include static void dev_pager_init(void); static vm_object_t dev_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); static void dev_pager_dealloc(vm_object_t); static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static void dev_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t dev_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static void dev_pager_free_page(vm_object_t object, vm_page_t m); static int dev_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type, vm_prot_t, vm_pindex_t *first, vm_pindex_t *last); /* list of device pager objects */ static struct pagerlst dev_pager_object_list; /* protect list manipulation */ static struct mtx dev_pager_mtx; const struct pagerops devicepagerops = { + .pgo_kvme_type = KVME_TYPE_DEVICE, .pgo_init = dev_pager_init, .pgo_alloc = dev_pager_alloc, .pgo_dealloc = dev_pager_dealloc, .pgo_getpages = dev_pager_getpages, .pgo_putpages = dev_pager_putpages, .pgo_haspage = dev_pager_haspage, }; const struct pagerops mgtdevicepagerops = { + .pgo_kvme_type = KVME_TYPE_MGTDEVICE, .pgo_alloc = dev_pager_alloc, .pgo_dealloc = dev_pager_dealloc, .pgo_getpages = dev_pager_getpages, .pgo_putpages = dev_pager_putpages, .pgo_haspage = dev_pager_haspage, .pgo_populate = dev_pager_populate, }; static int old_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color); static void old_dev_pager_dtor(void *handle); static int old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres); static const struct cdev_pager_ops old_dev_pager_ops = { .cdev_pg_ctor = old_dev_pager_ctor, .cdev_pg_dtor = old_dev_pager_dtor, .cdev_pg_fault = old_dev_pager_fault }; static void dev_pager_init(void) { TAILQ_INIT(&dev_pager_object_list); mtx_init(&dev_pager_mtx, "dev_pager list", NULL, MTX_DEF); } vm_object_t cdev_pager_lookup(void *handle) { vm_object_t object; mtx_lock(&dev_pager_mtx); object = vm_pager_object_lookup(&dev_pager_object_list, handle); mtx_unlock(&dev_pager_mtx); return (object); } vm_object_t cdev_pager_allocate(void *handle, enum obj_type tp, const struct cdev_pager_ops *ops, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred) { vm_object_t object, object1; vm_pindex_t pindex; u_short color; if (tp != OBJT_DEVICE && tp != OBJT_MGTDEVICE) return (NULL); KASSERT(tp == OBJT_MGTDEVICE || ops->cdev_pg_populate == NULL, ("populate on unmanaged device pager")); /* * Offset should be page aligned. */ if (foff & PAGE_MASK) return (NULL); /* * Treat the mmap(2) file offset as an unsigned value for a * device mapping. This, in effect, allows a user to pass all * possible off_t values as the mapping cookie to the driver. At * this point, we know that both foff and size are a multiple * of the page size. Do a check to avoid wrap. */ size = round_page(size); pindex = OFF_TO_IDX(foff) + OFF_TO_IDX(size); if (pindex > OBJ_MAX_SIZE || pindex < OFF_TO_IDX(foff) || pindex < OFF_TO_IDX(size)) return (NULL); if (ops->cdev_pg_ctor(handle, size, prot, foff, cred, &color) != 0) return (NULL); mtx_lock(&dev_pager_mtx); /* * Look up pager, creating as necessary. 
*/ object1 = NULL; object = vm_pager_object_lookup(&dev_pager_object_list, handle); if (object == NULL) { /* * Allocate object and associate it with the pager. Initialize * the object's pg_color based upon the physical address of the * device's memory. */ mtx_unlock(&dev_pager_mtx); object1 = vm_object_allocate(tp, pindex); object1->flags |= OBJ_COLORED; object1->pg_color = color; object1->handle = handle; object1->un_pager.devp.ops = ops; object1->un_pager.devp.dev = handle; TAILQ_INIT(&object1->un_pager.devp.devp_pglist); mtx_lock(&dev_pager_mtx); object = vm_pager_object_lookup(&dev_pager_object_list, handle); if (object != NULL) { /* * We raced with other thread while allocating object. */ if (pindex > object->size) object->size = pindex; KASSERT(object->type == tp, ("Inconsistent device pager type %p %d", object, tp)); KASSERT(object->un_pager.devp.ops == ops, ("Inconsistent devops %p %p", object, ops)); } else { object = object1; object1 = NULL; object->handle = handle; TAILQ_INSERT_TAIL(&dev_pager_object_list, object, pager_object_list); if (ops->cdev_pg_populate != NULL) vm_object_set_flag(object, OBJ_POPULATE); } } else { if (pindex > object->size) object->size = pindex; KASSERT(object->type == tp, ("Inconsistent device pager type %p %d", object, tp)); } mtx_unlock(&dev_pager_mtx); if (object1 != NULL) { object1->handle = object1; mtx_lock(&dev_pager_mtx); TAILQ_INSERT_TAIL(&dev_pager_object_list, object1, pager_object_list); mtx_unlock(&dev_pager_mtx); vm_object_deallocate(object1); } return (object); } static vm_object_t dev_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred) { return (cdev_pager_allocate(handle, OBJT_DEVICE, &old_dev_pager_ops, size, prot, foff, cred)); } void cdev_pager_free_page(vm_object_t object, vm_page_t m) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->type == OBJT_MGTDEVICE) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("unmanaged %p", m)); pmap_remove_all(m); (void)vm_page_remove(m); } else if (object->type == OBJT_DEVICE) dev_pager_free_page(object, m); } static void dev_pager_free_page(vm_object_t object, vm_page_t m) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->type == OBJT_DEVICE && (m->oflags & VPO_UNMANAGED) != 0), ("Managed device or page obj %p m %p", object, m)); TAILQ_REMOVE(&object->un_pager.devp.devp_pglist, m, plinks.q); vm_page_putfake(m); } static void dev_pager_dealloc(vm_object_t object) { vm_page_t m; VM_OBJECT_WUNLOCK(object); object->un_pager.devp.ops->cdev_pg_dtor(object->un_pager.devp.dev); mtx_lock(&dev_pager_mtx); TAILQ_REMOVE(&dev_pager_object_list, object, pager_object_list); mtx_unlock(&dev_pager_mtx); VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEVICE) { /* * Free up our fake pages. */ while ((m = TAILQ_FIRST(&object->un_pager.devp.devp_pglist)) != NULL) { if (vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL) == 0) continue; dev_pager_free_page(object, m); } } object->handle = NULL; object->type = OBJT_DEAD; } static int dev_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead) { int error; /* Since our haspage reports zero after/before, the count is 1. 
*/ KASSERT(count == 1, ("%s: count %d", __func__, count)); if (object->un_pager.devp.ops->cdev_pg_fault == NULL) return (VM_PAGER_FAIL); VM_OBJECT_WLOCK(object); error = object->un_pager.devp.ops->cdev_pg_fault(object, IDX_TO_OFF(ma[0]->pindex), PROT_READ, &ma[0]); VM_OBJECT_ASSERT_WLOCKED(object); if (error == VM_PAGER_OK) { KASSERT((object->type == OBJT_DEVICE && (ma[0]->oflags & VPO_UNMANAGED) != 0) || (object->type == OBJT_MGTDEVICE && (ma[0]->oflags & VPO_UNMANAGED) == 0), ("Wrong page type %p %p", ma[0], object)); if (object->type == OBJT_DEVICE) { TAILQ_INSERT_TAIL(&object->un_pager.devp.devp_pglist, ma[0], plinks.q); } if (rbehind) *rbehind = 0; if (rahead) *rahead = 0; } VM_OBJECT_WUNLOCK(object); return (error); } static int dev_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->un_pager.devp.ops->cdev_pg_populate == NULL) return (VM_PAGER_FAIL); return (object->un_pager.devp.ops->cdev_pg_populate(object, pidx, fault_type, max_prot, first, last)); } static int old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres) { vm_paddr_t paddr; vm_page_t m_paddr, page; struct cdev *dev; struct cdevsw *csw; struct file *fpop; struct thread *td; vm_memattr_t memattr, memattr1; int ref, ret; memattr = object->memattr; VM_OBJECT_WUNLOCK(object); dev = object->handle; csw = dev_refthread(dev, &ref); if (csw == NULL) { VM_OBJECT_WLOCK(object); return (VM_PAGER_FAIL); } td = curthread; fpop = td->td_fpop; td->td_fpop = NULL; ret = csw->d_mmap(dev, offset, &paddr, prot, &memattr); td->td_fpop = fpop; dev_relthread(dev, ref); if (ret != 0) { printf( "WARNING: dev_pager_getpage: map function returns error %d", ret); VM_OBJECT_WLOCK(object); return (VM_PAGER_FAIL); } /* If "paddr" is a real page, perform a sanity check on "memattr". */ if ((m_paddr = vm_phys_paddr_to_vm_page(paddr)) != NULL && (memattr1 = pmap_page_get_memattr(m_paddr)) != memattr) { /* * For the /dev/mem d_mmap routine to return the * correct memattr, pmap_page_get_memattr() needs to * be called, which we do there. */ if ((csw->d_flags & D_MEM) == 0) { printf("WARNING: Device driver %s has set " "\"memattr\" inconsistently (drv %u pmap %u).\n", csw->d_name, memattr, memattr1); } memattr = memattr1; } if (((*mres)->flags & PG_FICTITIOUS) != 0) { /* * If the passed in result page is a fake page, update it with * the new physical address. */ page = *mres; VM_OBJECT_WLOCK(object); vm_page_updatefake(page, paddr, memattr); } else { /* * Replace the passed in reqpage page with our own fake page and * free up the all of the original pages. */ page = vm_page_getfake(paddr, memattr); VM_OBJECT_WLOCK(object); vm_page_replace(page, object, (*mres)->pindex, *mres); *mres = page; } vm_page_valid(page); return (VM_PAGER_OK); } static void dev_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { panic("dev_pager_putpage called"); } static boolean_t dev_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { if (before != NULL) *before = 0; if (after != NULL) *after = 0; return (TRUE); } static int old_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { struct cdev *dev; struct cdevsw *csw; vm_memattr_t dummy; vm_ooffset_t off; vm_paddr_t paddr; unsigned int npages; int ref; /* * Make sure this device can be mapped. 
*/ dev = handle; csw = dev_refthread(dev, &ref); if (csw == NULL) return (ENXIO); /* * Check that the specified range of the device allows the desired * protection. * * XXX assumes VM_PROT_* == PROT_* */ npages = OFF_TO_IDX(size); paddr = 0; /* Make paddr initialized for the case of size == 0. */ for (off = foff; npages--; off += PAGE_SIZE) { if (csw->d_mmap(dev, off, &paddr, (int)prot, &dummy) != 0) { dev_relthread(dev, ref); return (EINVAL); } } dev_ref(dev); dev_relthread(dev, ref); *color = atop(paddr) - OFF_TO_IDX(off - PAGE_SIZE); return (0); } static void old_dev_pager_dtor(void *handle) { dev_rel(handle); } diff --git a/sys/vm/phys_pager.c b/sys/vm/phys_pager.c index af7b10701b16..6d7c5a4a7291 100644 --- a/sys/vm/phys_pager.c +++ b/sys/vm/phys_pager.c @@ -1,309 +1,311 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000 Peter Wemm * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include /* list of phys pager objects */ static struct pagerlst phys_pager_object_list; /* protect access to phys_pager_object_list */ static struct mtx phys_pager_mtx; static int default_phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead); static int default_phys_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last); static boolean_t default_phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); const struct phys_pager_ops default_phys_pg_ops = { .phys_pg_getpages = default_phys_pager_getpages, .phys_pg_populate = default_phys_pager_populate, .phys_pg_haspage = default_phys_pager_haspage, .phys_pg_ctor = NULL, .phys_pg_dtor = NULL, }; static void phys_pager_init(void) { TAILQ_INIT(&phys_pager_object_list); mtx_init(&phys_pager_mtx, "phys_pager list", NULL, MTX_DEF); } vm_object_t phys_pager_allocate(void *handle, const struct phys_pager_ops *ops, void *data, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred) { vm_object_t object, object1; vm_pindex_t pindex; bool init; /* * Offset should be page aligned. 
*/ if (foff & PAGE_MASK) return (NULL); pindex = OFF_TO_IDX(foff + PAGE_MASK + size); init = true; if (handle != NULL) { mtx_lock(&phys_pager_mtx); /* * Look up pager, creating as necessary. */ object1 = NULL; object = vm_pager_object_lookup(&phys_pager_object_list, handle); if (object == NULL) { /* * Allocate object and associate it with the pager. */ mtx_unlock(&phys_pager_mtx); object1 = vm_object_allocate(OBJT_PHYS, pindex); mtx_lock(&phys_pager_mtx); object = vm_pager_object_lookup(&phys_pager_object_list, handle); if (object != NULL) { /* * We raced with other thread while * allocating object. */ if (pindex > object->size) object->size = pindex; init = false; } else { object = object1; object1 = NULL; object->handle = handle; object->un_pager.phys.ops = ops; object->un_pager.phys.data_ptr = data; if (ops->phys_pg_populate != NULL) vm_object_set_flag(object, OBJ_POPULATE); TAILQ_INSERT_TAIL(&phys_pager_object_list, object, pager_object_list); } } else { if (pindex > object->size) object->size = pindex; } mtx_unlock(&phys_pager_mtx); vm_object_deallocate(object1); } else { object = vm_object_allocate(OBJT_PHYS, pindex); object->un_pager.phys.ops = ops; object->un_pager.phys.data_ptr = data; if (ops->phys_pg_populate != NULL) vm_object_set_flag(object, OBJ_POPULATE); } if (init && ops->phys_pg_ctor != NULL) ops->phys_pg_ctor(object, prot, foff, cred); return (object); } static vm_object_t phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *ucred) { return (phys_pager_allocate(handle, &default_phys_pg_ops, NULL, size, prot, foff, ucred)); } static void phys_pager_dealloc(vm_object_t object) { if (object->handle != NULL) { VM_OBJECT_WUNLOCK(object); mtx_lock(&phys_pager_mtx); TAILQ_REMOVE(&phys_pager_object_list, object, pager_object_list); mtx_unlock(&phys_pager_mtx); VM_OBJECT_WLOCK(object); } object->type = OBJT_DEAD; if (object->un_pager.phys.ops->phys_pg_dtor != NULL) object->un_pager.phys.ops->phys_pg_dtor(object); object->handle = NULL; } /* * Fill as many pages as vm_fault has allocated for us. */ static int default_phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { int i; for (i = 0; i < count; i++) { if (vm_page_none_valid(m[i])) { if ((m[i]->flags & PG_ZERO) == 0) pmap_zero_page(m[i]); vm_page_valid(m[i]); } KASSERT(vm_page_all_valid(m[i]), ("phys_pager_getpages: partially valid page %p", m[i])); KASSERT(m[i]->dirty == 0, ("phys_pager_getpages: dirty page %p", m[i])); } if (rbehind) *rbehind = 0; if (rahead) *rahead = 0; return (VM_PAGER_OK); } static int phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { return (object->un_pager.phys.ops->phys_pg_getpages(object, m, count, rbehind, rahead)); } /* * Implement a pretty aggressive clustered getpages strategy. Hint that * everything in an entire 4MB window should be prefaulted at once. * * 4MB (1024 slots per page table page) is convenient for x86, * but may not be for other arches. */ #ifndef PHYSCLUSTER #define PHYSCLUSTER 1024 #endif static int phys_pager_cluster = PHYSCLUSTER; SYSCTL_INT(_vm, OID_AUTO, phys_pager_cluster, CTLFLAG_RWTUN, &phys_pager_cluster, 0, "prefault window size for phys pager"); /* * Max hint to vm_page_alloc() about the further allocation needs * inside the phys_pager_populate() loop. The number of bits used to * implement VM_ALLOC_COUNT() determines the hard limit on this value. * That limit is currently 65535. 
*/ #define PHYSALLOC 16 static int default_phys_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type __unused, vm_prot_t max_prot __unused, vm_pindex_t *first, vm_pindex_t *last) { vm_page_t m; vm_pindex_t base, end, i; int ahead; base = rounddown(pidx, phys_pager_cluster); end = base + phys_pager_cluster - 1; if (end >= object->size) end = object->size - 1; if (*first > base) base = *first; if (end > *last) end = *last; *first = base; *last = end; for (i = base; i <= end; i++) { ahead = MIN(end - i, PHYSALLOC); m = vm_page_grab(object, i, VM_ALLOC_NORMAL | VM_ALLOC_COUNT(ahead)); if (!vm_page_all_valid(m)) vm_page_zero_invalid(m, TRUE); KASSERT(m->dirty == 0, ("phys_pager_populate: dirty page %p", m)); } return (VM_PAGER_OK); } static int phys_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) { return (object->un_pager.phys.ops->phys_pg_populate(object, pidx, fault_type, max_prot, first, last)); } static void phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync, int *rtvals) { panic("phys_pager_putpage called"); } static boolean_t default_phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { vm_pindex_t base, end; base = rounddown(pindex, phys_pager_cluster); end = base + phys_pager_cluster - 1; if (before != NULL) *before = pindex - base; if (after != NULL) *after = end - pindex; return (TRUE); } static boolean_t phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { return (object->un_pager.phys.ops->phys_pg_haspage(object, pindex, before, after)); } const struct pagerops physpagerops = { + .pgo_kvme_type = KVME_TYPE_PHYS, .pgo_init = phys_pager_init, .pgo_alloc = phys_pager_alloc, .pgo_dealloc = phys_pager_dealloc, .pgo_getpages = phys_pager_getpages, .pgo_putpages = phys_pager_putpages, .pgo_haspage = phys_pager_haspage, .pgo_populate = phys_pager_populate, }; diff --git a/sys/vm/sg_pager.c b/sys/vm/sg_pager.c index 9b3e60bc170a..d7af2aed935a 100644 --- a/sys/vm/sg_pager.c +++ b/sys/vm/sg_pager.c @@ -1,228 +1,230 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); /* * This pager manages OBJT_SG objects. These objects are backed by * a scatter/gather list of physical address ranges. */ #include #include #include #include #include +#include #include #include #include #include #include #include #include #include static vm_object_t sg_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); static void sg_pager_dealloc(vm_object_t); static int sg_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static void sg_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t sg_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); const struct pagerops sgpagerops = { + .pgo_kvme_type = KVME_TYPE_SG, .pgo_alloc = sg_pager_alloc, .pgo_dealloc = sg_pager_dealloc, .pgo_getpages = sg_pager_getpages, .pgo_putpages = sg_pager_putpages, .pgo_haspage = sg_pager_haspage, }; static vm_object_t sg_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred) { struct sglist *sg; vm_object_t object; vm_pindex_t npages, pindex; int i; /* * Offset should be page aligned. */ if (foff & PAGE_MASK) return (NULL); /* * The scatter/gather list must only include page-aligned * ranges. */ npages = 0; sg = handle; for (i = 0; i < sg->sg_nseg; i++) { if ((sg->sg_segs[i].ss_paddr % PAGE_SIZE) != 0 || (sg->sg_segs[i].ss_len % PAGE_SIZE) != 0) return (NULL); npages += sg->sg_segs[i].ss_len / PAGE_SIZE; } /* * The scatter/gather list has a fixed size. Refuse requests * to map beyond that. */ size = round_page(size); pindex = OFF_TO_IDX(foff) + OFF_TO_IDX(size); if (pindex > npages || pindex < OFF_TO_IDX(foff) || pindex < OFF_TO_IDX(size)) return (NULL); /* * Allocate a new object and associate it with the * scatter/gather list. It is ok for our purposes to have * multiple VM objects associated with the same scatter/gather * list because scatter/gather lists are static. This is also * simpler than ensuring a unique object per scatter/gather * list. */ object = vm_object_allocate(OBJT_SG, npages); object->handle = sglist_hold(sg); TAILQ_INIT(&object->un_pager.sgp.sgp_pglist); return (object); } static void sg_pager_dealloc(vm_object_t object) { struct sglist *sg; vm_page_t m; /* * Free up our fake pages. */ while ((m = TAILQ_FIRST(&object->un_pager.sgp.sgp_pglist)) != 0) { if (vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL) == 0) continue; TAILQ_REMOVE(&object->un_pager.sgp.sgp_pglist, m, plinks.q); vm_page_putfake(m); } sg = object->handle; sglist_free(sg); object->handle = NULL; object->type = OBJT_DEAD; } static int sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { struct sglist *sg; vm_page_t m_paddr, page; vm_pindex_t offset; vm_paddr_t paddr; vm_memattr_t memattr; size_t space; int i; /* Since our haspage reports zero after/before, the count is 1. */ KASSERT(count == 1, ("%s: count %d", __func__, count)); /* Handle is stable while paging is in progress. */ sg = object->handle; memattr = object->memattr; offset = m[0]->pindex; /* * Lookup the physical address of the requested page. An initial * value of '1' instead of '0' is used so we can assert that the * page is found since '0' can be a valid page-aligned physical * address. 
*/ space = 0; paddr = 1; for (i = 0; i < sg->sg_nseg; i++) { if (space + sg->sg_segs[i].ss_len <= (offset * PAGE_SIZE)) { space += sg->sg_segs[i].ss_len; continue; } paddr = sg->sg_segs[i].ss_paddr + offset * PAGE_SIZE - space; break; } KASSERT(paddr != 1, ("invalid SG page index")); /* If "paddr" is a real page, perform a sanity check on "memattr". */ if ((m_paddr = vm_phys_paddr_to_vm_page(paddr)) != NULL && pmap_page_get_memattr(m_paddr) != memattr) { memattr = pmap_page_get_memattr(m_paddr); printf( "WARNING: A device driver has set \"memattr\" inconsistently.\n"); } /* Return a fake page for the requested page. */ KASSERT(!(m[0]->flags & PG_FICTITIOUS), ("backing page for SG is fake")); /* Construct a new fake page. */ page = vm_page_getfake(paddr, memattr); VM_OBJECT_WLOCK(object); TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, plinks.q); vm_page_replace(page, object, offset, m[0]); VM_OBJECT_WUNLOCK(object); m[0] = page; vm_page_valid(page); if (rbehind) *rbehind = 0; if (rahead) *rahead = 0; return (VM_PAGER_OK); } static void sg_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync, int *rtvals) { panic("sg_pager_putpage called"); } static boolean_t sg_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { if (before != NULL) *before = 0; if (after != NULL) *after = 0; return (TRUE); } diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 57c953542a88..37db4cbac857 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1,3195 +1,3198 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1998 Matthew Dillon, * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * New Swap System * Matthew Dillon * * Radix Bitmap 'blists'. * * - The new swapper uses the new radix bitmap code. This should scale * to arbitrarily small or arbitrarily large swap spaces and an almost * arbitrary degree of fragmentation. * * Features: * * - on the fly reallocation of swap during putpages. The new system * does not try to keep previously allocated swap blocks for dirty * pages. * * - on the fly deallocation of swap * * - No more garbage collection required. Unnecessarily allocated swap * blocks only exist for dirty vm_page_t's now and these are already * cycled (in a high-load system) by the pager. We also do on-the-fly * removal of invalidated swap blocks when a page is destroyed * or renamed. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64. * The 64-page limit is due to the radix code (kern/subr_blist.c). */ #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 32 #endif #if !defined(SWB_NPAGES) #define SWB_NPAGES MAX_PAGEOUT_CLUSTER #endif #define SWAP_META_PAGES PCTRIE_COUNT /* * A swblk structure maps each page index within a * SWAP_META_PAGES-aligned and sized range to the address of an * on-disk swap block (or SWAPBLK_NONE). The collection of these * mappings for an entire vm object is implemented as a pc-trie. */ struct swblk { vm_pindex_t p; daddr_t d[SWAP_META_PAGES]; }; static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data"); static struct mtx sw_dev_mtx; static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); static struct swdevt *swdevhd; /* Allocate from here next */ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static struct sx swdev_syscall_lock; /* serialize swap(on|off) */ static __exclusive_cache_line u_long swap_reserved; static u_long swap_total; static int sysctl_page_shift(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_vm_stats, OID_AUTO, swap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM swap stats"); SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &swap_reserved, 0, sysctl_page_shift, "A", "Amount of swap storage needed to back all allocated anonymous memory."); SYSCTL_PROC(_vm, OID_AUTO, swap_total, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &swap_total, 0, sysctl_page_shift, "A", "Total amount of available swap storage."); static int overcommit = 0; SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0, "Configure virtual memory overcommit behavior. 
See tuning(7) " "for details."); static unsigned long swzone; SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0, "Actual size of swap metadata zone"); static unsigned long swap_maxpages; SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0, "Maximum amount of swap supported"); static COUNTER_U64_DEFINE_EARLY(swap_free_deferred); SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_deferred, CTLFLAG_RD, &swap_free_deferred, "Number of pages that deferred freeing swap space"); static COUNTER_U64_DEFINE_EARLY(swap_free_completed); SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_completed, CTLFLAG_RD, &swap_free_completed, "Number of deferred frees completed"); /* bits from overcommit */ #define SWAP_RESERVE_FORCE_ON (1 << 0) #define SWAP_RESERVE_RLIMIT_ON (1 << 1) #define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) static int sysctl_page_shift(SYSCTL_HANDLER_ARGS) { uint64_t newval; u_long value = *(u_long *)arg1; newval = ((uint64_t)value) << PAGE_SHIFT; return (sysctl_handle_64(oidp, &newval, 0, req)); } static bool swap_reserve_by_cred_rlimit(u_long pincr, struct ucred *cred, int oc) { struct uidinfo *uip; u_long prev; uip = cred->cr_ruidinfo; prev = atomic_fetchadd_long(&uip->ui_vmsize, pincr); if ((oc & SWAP_RESERVE_RLIMIT_ON) != 0 && prev + pincr > lim_cur(curthread, RLIMIT_SWAP) && priv_check(curthread, PRIV_VM_SWAP_NORLIMIT) != 0) { prev = atomic_fetchadd_long(&uip->ui_vmsize, -pincr); KASSERT(prev >= pincr, ("negative vmsize for uid = %d\n", uip->ui_uid)); return (false); } return (true); } static void swap_release_by_cred_rlimit(u_long pdecr, struct ucred *cred) { struct uidinfo *uip; #ifdef INVARIANTS u_long prev; #endif uip = cred->cr_ruidinfo; #ifdef INVARIANTS prev = atomic_fetchadd_long(&uip->ui_vmsize, -pdecr); KASSERT(prev >= pdecr, ("negative vmsize for uid = %d\n", uip->ui_uid)); #else atomic_subtract_long(&uip->ui_vmsize, pdecr); #endif } static void swap_reserve_force_rlimit(u_long pincr, struct ucred *cred) { struct uidinfo *uip; uip = cred->cr_ruidinfo; atomic_add_long(&uip->ui_vmsize, pincr); } bool swap_reserve(vm_ooffset_t incr) { return (swap_reserve_by_cred(incr, curthread->td_ucred)); } bool swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) { u_long r, s, prev, pincr; #ifdef RACCT int error; #endif int oc; static int curfail; static struct timeval lastfail; KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, (uintmax_t)incr)); #ifdef RACCT if (RACCT_ENABLED()) { PROC_LOCK(curproc); error = racct_add(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); if (error != 0) return (false); } #endif pincr = atop(incr); prev = atomic_fetchadd_long(&swap_reserved, pincr); r = prev + pincr; s = swap_total; oc = atomic_load_int(&overcommit); if (r > s && (oc & SWAP_RESERVE_ALLOW_NONWIRED) != 0) { s += vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_wire_count(); } if ((oc & SWAP_RESERVE_FORCE_ON) != 0 && r > s && priv_check(curthread, PRIV_VM_SWAP_NOQUOTA) != 0) { prev = atomic_fetchadd_long(&swap_reserved, -pincr); KASSERT(prev >= pincr, ("swap_reserved < incr on overcommit fail")); goto out_error; } if (!swap_reserve_by_cred_rlimit(pincr, cred, oc)) { prev = atomic_fetchadd_long(&swap_reserved, -pincr); KASSERT(prev >= pincr, ("swap_reserved < incr on overcommit fail")); goto out_error; } return (true); out_error: if (ppsratecheck(&lastfail, &curfail, 1)) { printf("uid %d, pid %d: swap reservation for %jd bytes failed\n", cred->cr_ruidinfo->ui_uid, curproc->p_pid, incr); } #ifdef RACCT if (RACCT_ENABLED()) { 
PROC_LOCK(curproc); racct_sub(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); } #endif return (false); } void swap_reserve_force(vm_ooffset_t incr) { u_long pincr; KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, (uintmax_t)incr)); #ifdef RACCT if (RACCT_ENABLED()) { PROC_LOCK(curproc); racct_add_force(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); } #endif pincr = atop(incr); atomic_add_long(&swap_reserved, pincr); swap_reserve_force_rlimit(pincr, curthread->td_ucred); } void swap_release(vm_ooffset_t decr) { struct ucred *cred; PROC_LOCK(curproc); cred = curproc->p_ucred; swap_release_by_cred(decr, cred); PROC_UNLOCK(curproc); } void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) { u_long pdecr; #ifdef INVARIANTS u_long prev; #endif KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK", __func__, (uintmax_t)decr)); pdecr = atop(decr); #ifdef INVARIANTS prev = atomic_fetchadd_long(&swap_reserved, -pdecr); KASSERT(prev >= pdecr, ("swap_reserved < decr")); #else atomic_subtract_long(&swap_reserved, pdecr); #endif swap_release_by_cred_rlimit(pdecr, cred); #ifdef RACCT if (racct_enable) racct_sub_cred(cred, RACCT_SWAP, decr); #endif } static int swap_pager_full = 2; /* swap space exhaustion (task killing) */ static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */ static int nsw_wcount_async; /* limit async write buffers */ static int nsw_wcount_async_max;/* assigned maximum */ static int nsw_cluster_max; /* maximum VOP I/O allowed */ static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I", "Maximum running async swap ops"); static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A", "Swap Fragmentation Info"); static struct sx sw_alloc_sx; /* * "named" and "unnamed" anon region objects. Try to reduce the overhead * of searching a named list by hashing it just a little. */ #define NOBJLISTS 8 #define NOBJLIST(handle) \ (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) static struct pagerlst swap_pager_object_list[NOBJLISTS]; static uma_zone_t swwbuf_zone; static uma_zone_t swrbuf_zone; static uma_zone_t swblk_zone; static uma_zone_t swpctrie_zone; /* * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure * calls hooked from other parts of the VM system and do not appear here. * (see vm/swap_pager.h). 
*/ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *); static vm_object_t swap_tmpfs_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *); static void swap_pager_dealloc(vm_object_t object); static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, pgo_getpages_iodone_t, void *); static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); static void swap_pager_init(void); static void swap_pager_unswapped(vm_page_t); static void swap_pager_swapoff(struct swdevt *sp); static void swap_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end); static void swap_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end); static void swap_tmpfs_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp); static void swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size); const struct pagerops swappagerops = { + .pgo_kvme_type = KVME_TYPE_SWAP, .pgo_init = swap_pager_init, /* early system initialization of pager */ .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ .pgo_getpages = swap_pager_getpages, /* pagein */ .pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */ .pgo_putpages = swap_pager_putpages, /* pageout */ .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ .pgo_update_writecount = swap_pager_update_writecount, .pgo_release_writecount = swap_pager_release_writecount, .pgo_freespace = swap_pager_freespace, }; const struct pagerops swaptmpfspagerops = { + .pgo_kvme_type = KVME_TYPE_VNODE, .pgo_alloc = swap_tmpfs_pager_alloc, .pgo_dealloc = swap_pager_dealloc, .pgo_getpages = swap_pager_getpages, .pgo_getpages_async = swap_pager_getpages_async, .pgo_putpages = swap_pager_putpages, .pgo_haspage = swap_pager_haspage, .pgo_pageunswapped = swap_pager_unswapped, .pgo_update_writecount = swap_pager_update_writecount, .pgo_release_writecount = swap_pager_release_writecount, .pgo_set_writeable_dirty = vm_object_set_writeable_dirty_, .pgo_mightbedirty = vm_object_mightbedirty_, .pgo_getvp = swap_tmpfs_pager_getvp, .pgo_freespace = swap_pager_freespace, }; /* * swap_*() routines are externally accessible. swp_*() routines are * internal. 
*/ static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &nsw_cluster_max, 0, "Maximum size of a swap block in pages"); static void swp_sizecheck(void); static void swp_pager_async_iodone(struct buf *bp); static bool swp_pager_swblk_empty(struct swblk *sb, int start, int limit); static void swp_pager_free_empty_swblk(vm_object_t, struct swblk *sb); static int swapongeom(struct vnode *); static int swaponvp(struct thread *, struct vnode *, u_long); static int swapoff_one(struct swdevt *sp, struct ucred *cred); /* * Swap bitmap functions */ static void swp_pager_freeswapspace(daddr_t blk, daddr_t npages); static daddr_t swp_pager_getswapspace(int *npages); /* * Metadata functions */ static daddr_t swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t); static void swp_pager_meta_transfer(vm_object_t src, vm_object_t dst, vm_pindex_t pindex, vm_pindex_t count); static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_lookup(vm_object_t, vm_pindex_t); static void swp_pager_init_freerange(daddr_t *start, daddr_t *num) { *start = SWAPBLK_NONE; *num = 0; } static void swp_pager_update_freerange(daddr_t *start, daddr_t *num, daddr_t addr) { if (*start + *num == addr) { (*num)++; } else { swp_pager_freeswapspace(*start, *num); *start = addr; *num = 1; } } static void * swblk_trie_alloc(struct pctrie *ptree) { return (uma_zalloc(swpctrie_zone, M_NOWAIT | (curproc == pageproc ? M_USE_RESERVE : 0))); } static void swblk_trie_free(struct pctrie *ptree, void *node) { uma_zfree(swpctrie_zone, node); } PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free); /* * SWP_SIZECHECK() - update swap_pager_full indication * * update the swap_pager_almost_full indication and warn when we are * about to run out of swap space, using lowat/hiwat hysteresis. * * Clear swap_pager_full ( task killing ) indication when lowat is met. * * No restrictions on call * This routine may not block. */ static void swp_sizecheck(void) { if (swap_pager_avail < nswap_lowat) { if (swap_pager_almost_full == 0) { printf("swap_pager: out of swap space\n"); swap_pager_almost_full = 1; } } else { swap_pager_full = 0; if (swap_pager_avail > nswap_hiwat) swap_pager_almost_full = 0; } } /* * SWAP_PAGER_INIT() - initialize the swap pager! * * Expected to be started from system init. NOTE: This code is run * before much else so be careful what you depend on. Most of the VM * system has yet to be initialized at this point. */ static void swap_pager_init(void) { /* * Initialize object lists */ int i; for (i = 0; i < NOBJLISTS; ++i) TAILQ_INIT(&swap_pager_object_list[i]); mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); sx_init(&sw_alloc_sx, "swspsx"); sx_init(&swdev_syscall_lock, "swsysc"); } /* * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process * * Expected to be started from pageout process once, prior to entering * its main loop. */ void swap_pager_swap_init(void) { unsigned long n, n2; /* * Number of in-transit swap bp operations. Don't * exhaust the pbufs completely. Make sure we * initialize workable values (0 will work for hysteresis * but it isn't very efficient). * * The nsw_cluster_max is constrained by the bp->b_pages[] * array, which has maxphys / PAGE_SIZE entries, and our locally * defined MAX_PAGEOUT_CLUSTER. 
Also be aware that swap ops are * constrained by the swap device interleave stripe size. * * Currently we hardwire nsw_wcount_async to 4. This limit is * designed to prevent other I/O from having high latencies due to * our pageout I/O. The value 4 works well for one or two active swap * devices but is probably a little low if you have more. Even so, * a higher value would probably generate only a limited improvement * with three or four active swap devices since the system does not * typically have to pageout at extreme bandwidths. We will want * at least 2 per swap devices, and 4 is a pretty good value if you * have one NFS swap device due to the command/ack latency over NFS. * So it all works out pretty well. */ nsw_cluster_max = min(maxphys / PAGE_SIZE, MAX_PAGEOUT_CLUSTER); nsw_wcount_async = 4; nsw_wcount_async_max = nsw_wcount_async; mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF); swwbuf_zone = pbuf_zsecond_create("swwbuf", nswbuf / 4); swrbuf_zone = pbuf_zsecond_create("swrbuf", nswbuf / 2); /* * Initialize our zone, taking the user's requested size or * estimating the number we need based on the number of pages * in the system. */ n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) : vm_cnt.v_page_count / 2; swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 0); if (swpctrie_zone == NULL) panic("failed to create swap pctrie zone."); swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL, NULL, NULL, _Alignof(struct swblk) - 1, 0); if (swblk_zone == NULL) panic("failed to create swap blk zone."); n2 = n; do { if (uma_zone_reserve_kva(swblk_zone, n)) break; /* * if the allocation failed, try a zone two thirds the * size of the previous attempt. */ n -= ((n + 2) / 3); } while (n > 0); /* * Often uma_zone_reserve_kva() cannot reserve exactly the * requested size. Account for the difference when * calculating swap_maxpages. */ n = uma_zone_get_max(swblk_zone); if (n < n2) printf("Swap blk zone entries changed from %lu to %lu.\n", n2, n); /* absolute maximum we can handle assuming 100% efficiency */ swap_maxpages = n * SWAP_META_PAGES; swzone = n * sizeof(struct swblk); if (!uma_zone_reserve_kva(swpctrie_zone, n)) printf("Cannot reserve swap pctrie zone, " "reduce kern.maxswzone.\n"); } static vm_object_t swap_pager_alloc_init(objtype_t otype, void *handle, struct ucred *cred, vm_ooffset_t size, vm_ooffset_t offset) { vm_object_t object; if (cred != NULL) { if (!swap_reserve_by_cred(size, cred)) return (NULL); crhold(cred); } /* * The un_pager.swp.swp_blks trie is initialized by * vm_object_allocate() to ensure the correct order of * visibility to other threads. */ object = vm_object_allocate(otype, OFF_TO_IDX(offset + PAGE_MASK + size)); object->un_pager.swp.writemappings = 0; object->handle = handle; if (cred != NULL) { object->cred = cred; object->charge = size; } return (object); } /* * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate * its metadata structures. * * This routine is called from the mmap and fork code to create a new * OBJT_SWAP object. * * This routine must ensure that no live duplicate is created for * the named object request, which is protected against by * holding the sw_alloc_sx lock in case handle != NULL. */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; if (handle != NULL) { /* * Reference existing named region or allocate new one. 
There * should not be a race here against swp_pager_meta_build() * as called from vm_page_remove() in regards to the lookup * of the handle. */ sx_xlock(&sw_alloc_sx); object = vm_pager_object_lookup(NOBJLIST(handle), handle); if (object == NULL) { object = swap_pager_alloc_init(OBJT_SWAP, handle, cred, size, offset); if (object != NULL) { TAILQ_INSERT_TAIL(NOBJLIST(object->handle), object, pager_object_list); } } sx_xunlock(&sw_alloc_sx); } else { object = swap_pager_alloc_init(OBJT_SWAP, handle, cred, size, offset); } return (object); } static vm_object_t swap_tmpfs_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; MPASS(handle == NULL); object = swap_pager_alloc_init(OBJT_SWAP_TMPFS, handle, cred, size, offset); return (object); } /* * SWAP_PAGER_DEALLOC() - remove swap metadata from object * * The swap backing for the object is destroyed. The code is * designed such that we can reinstantiate it later, but this * routine is typically called only when the entire object is * about to be destroyed. * * The object must be locked. */ static void swap_pager_dealloc(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj")); /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if ((object->flags & OBJ_ANON) == 0 && object->handle != NULL) { VM_OBJECT_WUNLOCK(object); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(object); } vm_object_pip_wait(object, "swpdea"); /* * Free all remaining metadata. We only bother to free it from * the swap meta data. We do not attempt to free swapblk's still * associated with vm_page_t's for this object. We do not care * if paging is still in progress on some objects. */ swp_pager_meta_free_all(object); object->handle = NULL; object->type = OBJT_DEAD; vm_object_clear_flag(object, OBJ_SWAP); } /************************************************************************ * SWAP PAGER BITMAP ROUTINES * ************************************************************************/ /* * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space * * Allocate swap for up to the requested number of pages. The * starting swap block number (a page index) is returned or * SWAPBLK_NONE if the allocation failed. * * Also has the side effect of advising that somebody made a mistake * when they configured swap and didn't configure enough. * * This routine may not sleep. * * We allocate in round-robin fashion from the configured devices. 
*/ static daddr_t swp_pager_getswapspace(int *io_npages) { daddr_t blk; struct swdevt *sp; int mpages, npages; KASSERT(*io_npages >= 1, ("%s: npages not positive", __func__)); blk = SWAPBLK_NONE; mpages = *io_npages; npages = imin(BLIST_MAX_ALLOC, mpages); mtx_lock(&sw_dev_mtx); sp = swdevhd; while (!TAILQ_EMPTY(&swtailq)) { if (sp == NULL) sp = TAILQ_FIRST(&swtailq); if ((sp->sw_flags & SW_CLOSING) == 0) blk = blist_alloc(sp->sw_blist, &npages, mpages); if (blk != SWAPBLK_NONE) break; sp = TAILQ_NEXT(sp, sw_list); if (swdevhd == sp) { if (npages == 1) break; mpages = npages - 1; npages >>= 1; } } if (blk != SWAPBLK_NONE) { *io_npages = npages; blk += sp->sw_first; sp->sw_used += npages; swap_pager_avail -= npages; swp_sizecheck(); swdevhd = TAILQ_NEXT(sp, sw_list); } else { if (swap_pager_full != 2) { printf("swp_pager_getswapspace(%d): failed\n", *io_npages); swap_pager_full = 2; swap_pager_almost_full = 1; } swdevhd = NULL; } mtx_unlock(&sw_dev_mtx); return (blk); } static bool swp_pager_isondev(daddr_t blk, struct swdevt *sp) { return (blk >= sp->sw_first && blk < sp->sw_end); } static void swp_pager_strategy(struct buf *bp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (swp_pager_isondev(bp->b_blkno, sp)) { mtx_unlock(&sw_dev_mtx); if ((sp->sw_flags & SW_UNMAPPED) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { pmap_qenter((vm_offset_t)bp->b_data, &bp->b_pages[0], bp->b_bcount / PAGE_SIZE); } sp->sw_strategy(bp, sp); return; } } panic("Swapdev not found"); } /* * SWP_PAGER_FREESWAPSPACE() - free raw swap space * * This routine returns the specified swap blocks back to the bitmap. * * This routine may not sleep. */ static void swp_pager_freeswapspace(daddr_t blk, daddr_t npages) { struct swdevt *sp; if (npages == 0) return; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (swp_pager_isondev(blk, sp)) { sp->sw_used -= npages; /* * If we are attempting to stop swapping on * this device, we don't want to mark any * blocks free lest they be reused. */ if ((sp->sw_flags & SW_CLOSING) == 0) { blist_free(sp->sw_blist, blk - sp->sw_first, npages); swap_pager_avail += npages; swp_sizecheck(); } mtx_unlock(&sw_dev_mtx); return; } } panic("Swapdev not found"); } /* * SYSCTL_SWAP_FRAGMENTATION() - produce raw swap space stats */ static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct swdevt *sp; const char *devname; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (vn_isdisk(sp->sw_vp)) devname = devtoname(sp->sw_vp->v_rdev); else devname = "[file]"; sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname); blist_stats(sp->sw_blist, &sbuf); } mtx_unlock(&sw_dev_mtx); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page * range within an object. * * This routine removes swapblk assignments from swap metadata. * * The external callers of this routine typically have already destroyed * or renamed vm_page_t's associated with this range in the object so * we should be ok. * * The object must be locked. 
*/ static void swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) { swp_pager_meta_free(object, start, size); } /* * SWAP_PAGER_RESERVE() - reserve swap blocks in object * * Assigns swap blocks to the specified range within the object. The * swap blocks are not zeroed. Any previous swap assignment is destroyed. * * Returns 0 on success, -1 on failure. */ int swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) { daddr_t addr, blk, n_free, s_free; int i, j, n; swp_pager_init_freerange(&s_free, &n_free); VM_OBJECT_WLOCK(object); for (i = 0; i < size; i += n) { n = size - i; blk = swp_pager_getswapspace(&n); if (blk == SWAPBLK_NONE) { swp_pager_meta_free(object, start, i); VM_OBJECT_WUNLOCK(object); return (-1); } for (j = 0; j < n; ++j) { addr = swp_pager_meta_build(object, start + i + j, blk + j); if (addr != SWAPBLK_NONE) swp_pager_update_freerange(&s_free, &n_free, addr); } } swp_pager_freeswapspace(s_free, n_free); VM_OBJECT_WUNLOCK(object); return (0); } static bool swp_pager_xfer_source(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t pindex, daddr_t addr) { daddr_t dstaddr; KASSERT((srcobject->flags & OBJ_SWAP) != 0, ("%s: Srcobject not swappable", __func__)); if ((dstobject->flags & OBJ_SWAP) != 0 && swp_pager_meta_lookup(dstobject, pindex) != SWAPBLK_NONE) { /* Caller should destroy the source block. */ return (false); } /* * Destination has no swapblk and is not resident, transfer source. * swp_pager_meta_build() can sleep. */ VM_OBJECT_WUNLOCK(srcobject); dstaddr = swp_pager_meta_build(dstobject, pindex, addr); KASSERT(dstaddr == SWAPBLK_NONE, ("Unexpected destination swapblk")); VM_OBJECT_WLOCK(srcobject); return (true); } /* * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager * and destroy the source. * * Copy any valid swapblks from the source to the destination. In * cases where both the source and destination have a valid swapblk, * we keep the destination's. * * This routine is allowed to sleep. It may sleep allocating metadata * indirectly through swp_pager_meta_build(). * * The source object contains no vm_page_t's (which is just as well) * * The source object is of type OBJT_SWAP. * * The source and destination objects must be locked. * Both object locks may temporarily be released. */ void swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t offset, int destroysource) { VM_OBJECT_ASSERT_WLOCKED(srcobject); VM_OBJECT_ASSERT_WLOCKED(dstobject); /* * If destroysource is set, we remove the source object from the * swap_pager internal queue now. */ if (destroysource && (srcobject->flags & OBJ_ANON) == 0 && srcobject->handle != NULL) { VM_OBJECT_WUNLOCK(srcobject); VM_OBJECT_WUNLOCK(dstobject); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(dstobject); VM_OBJECT_WLOCK(srcobject); } /* * Transfer source to destination. */ swp_pager_meta_transfer(srcobject, dstobject, offset, dstobject->size); /* * Free left over swap blocks in source. * * We have to revert the type to OBJT_DEFAULT so we do not accidentally * double-remove the object from the swap queues. */ if (destroysource) { swp_pager_meta_free_all(srcobject); /* * Reverting the type is not necessary, the caller is going * to destroy srcobject directly, but I'm doing it here * for consistency since we've removed the object from its * queues. 
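A toy model of the transfer rule applied by swp_pager_xfer_source() above: a source block moves to the destination only when the destination slot is empty, otherwise the destination's block is kept and the source block is freed. The flat arrays and the freed[] log are inventions of the sketch; the kernel works on the chunked swap metadata under the object locks.

/* Simplified model of the copy/transfer rule: destination wins. */
#include <stdio.h>

#define NONE (-1L)	/* stand-in for SWAPBLK_NONE */
#define NIDX 4

static long freed[NIDX];
static int nfreed;

static void
transfer(long *src, long *dst)
{
	int i;

	for (i = 0; i < NIDX; i++) {
		if (src[i] == NONE)
			continue;
		if (dst[i] == NONE)
			dst[i] = src[i];	/* move the block */
		else
			freed[nfreed++] = src[i]; /* both valid: keep dst, free src */
		src[i] = NONE;
	}
}

int
main(void)
{
	long src[NIDX] = { 100, 101, NONE, 103 };
	long dst[NIDX] = { NONE, 201, NONE, NONE };
	int i;

	transfer(src, dst);
	for (i = 0; i < NIDX; i++)
		printf("dst[%d] = %ld\n", i, dst[i]);
	printf("%d source block(s) freed\n", nfreed);
	return (0);
}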
*/ srcobject->type = OBJT_DEFAULT; vm_object_clear_flag(srcobject, OBJ_SWAP); } } /* * SWAP_PAGER_HASPAGE() - determine if we have good backing store for * the requested page. * * We determine whether good backing store exists for the requested * page and return TRUE if it does, FALSE if it doesn't. * * If TRUE, we also try to determine how much valid, contiguous backing * store exists before and after the requested page. */ static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { daddr_t blk, blk0; int i; VM_OBJECT_ASSERT_LOCKED(object); KASSERT((object->flags & OBJ_SWAP) != 0, ("%s: object not swappable", __func__)); /* * do we have good backing store at the requested index ? */ blk0 = swp_pager_meta_lookup(object, pindex); if (blk0 == SWAPBLK_NONE) { if (before) *before = 0; if (after) *after = 0; return (FALSE); } /* * find backwards-looking contiguous good backing store */ if (before != NULL) { for (i = 1; i < SWB_NPAGES; i++) { if (i > pindex) break; blk = swp_pager_meta_lookup(object, pindex - i); if (blk != blk0 - i) break; } *before = i - 1; } /* * find forward-looking contiguous good backing store */ if (after != NULL) { for (i = 1; i < SWB_NPAGES; i++) { blk = swp_pager_meta_lookup(object, pindex + i); if (blk != blk0 + i) break; } *after = i - 1; } return (TRUE); } /* * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page * * This removes any associated swap backing store, whether valid or * not, from the page. * * This routine is typically called when a page is made dirty, at * which point any associated swap can be freed. MADV_FREE also * calls us in a special-case situation * * NOTE!!! If the page is clean and the swap was valid, the caller * should make the page dirty before calling this routine. This routine * does NOT change the m->dirty status of the page. Also: MADV_FREE * depends on it. * * This routine may not sleep. * * The object containing the page may be locked. */ static void swap_pager_unswapped(vm_page_t m) { struct swblk *sb; vm_object_t obj; /* * Handle enqueing deferred frees first. If we do not have the * object lock we wait for the page daemon to clear the space. */ obj = m->object; if (!VM_OBJECT_WOWNED(obj)) { VM_PAGE_OBJECT_BUSY_ASSERT(m); /* * The caller is responsible for synchronization but we * will harmlessly handle races. This is typically provided * by only calling unswapped() when a page transitions from * clean to dirty. */ if ((m->a.flags & (PGA_SWAP_SPACE | PGA_SWAP_FREE)) == PGA_SWAP_SPACE) { vm_page_aflag_set(m, PGA_SWAP_FREE); counter_u64_add(swap_free_deferred, 1); } return; } if ((m->a.flags & PGA_SWAP_FREE) != 0) counter_u64_add(swap_free_completed, 1); vm_page_aflag_clear(m, PGA_SWAP_FREE | PGA_SWAP_SPACE); /* * The meta data only exists if the object is OBJT_SWAP * and even then might not be allocated yet. */ KASSERT((m->object->flags & OBJ_SWAP) != 0, ("Free object not swappable")); sb = SWAP_PCTRIE_LOOKUP(&m->object->un_pager.swp.swp_blks, rounddown(m->pindex, SWAP_META_PAGES)); if (sb == NULL) return; if (sb->d[m->pindex % SWAP_META_PAGES] == SWAPBLK_NONE) return; swp_pager_freeswapspace(sb->d[m->pindex % SWAP_META_PAGES], 1); sb->d[m->pindex % SWAP_META_PAGES] = SWAPBLK_NONE; swp_pager_free_empty_swblk(m->object, sb); } /* * swap_pager_getpages() - bring pages in from swap * * Attempt to page in the pages in array "ma" of length "count". The * caller may optionally specify that additional pages preceding and * succeeding the specified range be paged in. 
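The before/after scan in swap_pager_haspage() above can be illustrated with a flat lookup table in place of the swap metadata. SWB_NPAGES_MODEL, blktab[] and NONE are stand-ins invented for the sketch; the loop structure mirrors the one in the function.

/* Sketch of the contiguity scan, over a toy block table. */
#include <stdio.h>

#define NONE	(-1L)
#define SWB_NPAGES_MODEL 8	/* assumed cluster limit, plays the role of SWB_NPAGES */

static long blktab[16];		/* blktab[pindex] == swap block or NONE */

static long
lookup(long pindex)
{
	return (pindex >= 0 && pindex < 16 ? blktab[pindex] : NONE);
}

static int
haspage(long pindex, int *before, int *after)
{
	long blk0 = lookup(pindex);
	int i;

	if (blk0 == NONE)
		return (0);
	for (i = 1; i < SWB_NPAGES_MODEL; i++)
		if (i > pindex || lookup(pindex - i) != blk0 - i)
			break;
	*before = i - 1;
	for (i = 1; i < SWB_NPAGES_MODEL; i++)
		if (lookup(pindex + i) != blk0 + i)
			break;
	*after = i - 1;
	return (1);
}

int
main(void)
{
	int before, after;
	long p;

	for (p = 0; p < 16; p++)
		blktab[p] = NONE;
	for (p = 3; p <= 7; p++)	/* indices 3..7 map to blocks 500..504 */
		blktab[p] = 500 + (p - 3);
	if (haspage(5, &before, &after))
		printf("pindex 5: %d contiguous before, %d after\n", before, after);
	return (0);
}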
The number of such pages * is returned in the "rbehind" and "rahead" parameters, and they will * be in the inactive queue upon return. * * The pages in "ma" must be busied and will remain busied upon return. */ static int swap_pager_getpages_locked(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead) { struct buf *bp; vm_page_t bm, mpred, msucc, p; vm_pindex_t pindex; daddr_t blk; int i, maxahead, maxbehind, reqcount; VM_OBJECT_ASSERT_WLOCKED(object); reqcount = count; KASSERT((object->flags & OBJ_SWAP) != 0, ("%s: object not swappable", __func__)); if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead)) { VM_OBJECT_WUNLOCK(object); return (VM_PAGER_FAIL); } KASSERT(reqcount - 1 <= maxahead, ("page count %d extends beyond swap block", reqcount)); /* * Do not transfer any pages other than those that are xbusied * when running during a split or collapse operation. This * prevents clustering from re-creating pages which are being * moved into another object. */ if ((object->flags & (OBJ_SPLIT | OBJ_DEAD)) != 0) { maxahead = reqcount - 1; maxbehind = 0; } /* * Clip the readahead and readbehind ranges to exclude resident pages. */ if (rahead != NULL) { *rahead = imin(*rahead, maxahead - (reqcount - 1)); pindex = ma[reqcount - 1]->pindex; msucc = TAILQ_NEXT(ma[reqcount - 1], listq); if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead) *rahead = msucc->pindex - pindex - 1; } if (rbehind != NULL) { *rbehind = imin(*rbehind, maxbehind); pindex = ma[0]->pindex; mpred = TAILQ_PREV(ma[0], pglist, listq); if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind) *rbehind = pindex - mpred->pindex - 1; } bm = ma[0]; for (i = 0; i < count; i++) ma[i]->oflags |= VPO_SWAPINPROG; /* * Allocate readahead and readbehind pages. */ if (rbehind != NULL) { for (i = 1; i <= *rbehind; i++) { p = vm_page_alloc(object, ma[0]->pindex - i, VM_ALLOC_NORMAL); if (p == NULL) break; p->oflags |= VPO_SWAPINPROG; bm = p; } *rbehind = i - 1; } if (rahead != NULL) { for (i = 0; i < *rahead; i++) { p = vm_page_alloc(object, ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL); if (p == NULL) break; p->oflags |= VPO_SWAPINPROG; } *rahead = i; } if (rbehind != NULL) count += *rbehind; if (rahead != NULL) count += *rahead; vm_object_pip_add(object, count); pindex = bm->pindex; blk = swp_pager_meta_lookup(object, pindex); KASSERT(blk != SWAPBLK_NONE, ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex)); VM_OBJECT_WUNLOCK(object); bp = uma_zalloc(swrbuf_zone, M_WAITOK); MPASS((bp->b_flags & B_MAXPHYS) != 0); /* Pages cannot leave the object while busy. */ for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) { MPASS(p->pindex == bm->pindex + i); bp->b_pages[i] = p; } bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_blkno = blk; bp->b_bcount = PAGE_SIZE * count; bp->b_bufsize = PAGE_SIZE * count; bp->b_npages = count; bp->b_pgbefore = rbehind != NULL ? *rbehind : 0; bp->b_pgafter = rahead != NULL ? *rahead : 0; VM_CNT_INC(v_swapin); VM_CNT_ADD(v_swappgsin, count); /* * perform the I/O. NOTE!!! bp cannot be considered valid after * this point because we automatically release it on completion. * Instead, we look at the one page we are interested in which we * still hold a lock on even through the I/O completion. * * The other pages in our ma[] array are also released on completion, * so we cannot assume they are valid anymore either. 
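The clipping of the readbehind/readahead windows above comes down to plain index arithmetic. In this sketch prev_resident and next_resident stand in for the neighboring resident pages found on the object's page list, and maxbehind/maxahead are assumed to be already adjusted for the request size; all names are inventions of the sketch.

/* Sketch of clipping the readbehind/readahead windows to the gaps
 * around the request. */
#include <stdio.h>

static void
clip_window(long req_first, long req_last, long prev_resident,
    long next_resident, int maxbehind, int maxahead,
    int *rbehind, int *rahead)
{
	if (*rahead > maxahead)
		*rahead = maxahead;
	if (next_resident >= 0 && next_resident - req_last - 1 < *rahead)
		*rahead = (int)(next_resident - req_last - 1);
	if (*rbehind > maxbehind)
		*rbehind = maxbehind;
	if (prev_resident >= 0 && req_first - prev_resident - 1 < *rbehind)
		*rbehind = (int)(req_first - prev_resident - 1);
}

int
main(void)
{
	int rbehind = 8, rahead = 8;

	/* Request covers indices 10..11; neighbors resident at 7 and 14. */
	clip_window(10, 11, 7, 14, 6, 6, &rbehind, &rahead);
	printf("rbehind %d rahead %d\n", rbehind, rahead);	/* 2 and 2 */
	return (0);
}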
* * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ BUF_KERNPROC(bp); swp_pager_strategy(bp); /* * Wait for the pages we want to complete. VPO_SWAPINPROG is always * cleared on completion. If an I/O error occurs, SWAPBLK_NONE * is set in the metadata for each page in the request. */ VM_OBJECT_WLOCK(object); /* This could be implemented more efficiently with aflags */ while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) { ma[0]->oflags |= VPO_SWAPSLEEP; VM_CNT_INC(v_intrans); if (VM_OBJECT_SLEEP(object, &object->handle, PSWP, "swread", hz * 20)) { printf( "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n", bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount); } } VM_OBJECT_WUNLOCK(object); /* * If we had an unrecoverable read error pages will not be valid. */ for (i = 0; i < reqcount; i++) if (ma[i]->valid != VM_PAGE_BITS_ALL) return (VM_PAGER_ERROR); return (VM_PAGER_OK); /* * A final note: in a low swap situation, we cannot deallocate swap * and mark a page dirty here because the caller is likely to mark * the page clean when we return, causing the page to possibly revert * to all-zero's later. */ } static int swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead) { VM_OBJECT_WLOCK(object); return (swap_pager_getpages_locked(object, ma, count, rbehind, rahead)); } /* * swap_pager_getpages_async(): * * Right now this is emulation of asynchronous operation on top of * swap_pager_getpages(). */ static int swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg) { int r, error; r = swap_pager_getpages(object, ma, count, rbehind, rahead); switch (r) { case VM_PAGER_OK: error = 0; break; case VM_PAGER_ERROR: error = EIO; break; case VM_PAGER_FAIL: error = EINVAL; break; default: panic("unhandled swap_pager_getpages() error %d", r); } (iodone)(arg, ma, count, error); return (r); } /* * swap_pager_putpages: * * Assign swap (if necessary) and initiate I/O on the specified pages. * * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects * are automatically converted to SWAP objects. * * In a low memory situation we may block in VOP_STRATEGY(), but the new * vm_page reservation system coupled with properly written VFS devices * should ensure that no low-memory deadlock occurs. This is an area * which needs work. * * The parent has N vm_object_pip_add() references prior to * calling us and will remove references for rtvals[] that are * not set to VM_PAGER_PEND. We need to remove the rest on I/O * completion. * * The parent has soft-busy'd the pages it passes us and will unbusy * those whose rtvals[] entry is not set to VM_PAGER_PEND on return. * We need to unbusy the rest on I/O completion. */ static void swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count, int flags, int *rtvals) { struct buf *bp; daddr_t addr, blk, n_free, s_free; vm_page_t mreq; int i, j, n; bool async; KASSERT(count == 0 || ma[0]->object == object, ("%s: object mismatch %p/%p", __func__, object, ma[0]->object)); /* * Step 1 * * Turn object into OBJT_SWAP. Force sync if not a pageout process. */ if ((object->flags & OBJ_SWAP) == 0) { addr = swp_pager_meta_build(object, 0, SWAPBLK_NONE); KASSERT(addr == SWAPBLK_NONE, ("unexpected object swap block")); } VM_OBJECT_WUNLOCK(object); async = curproc == pageproc && (flags & VM_PAGER_PUT_SYNC) == 0; swp_pager_init_freerange(&s_free, &n_free); /* * Step 2 * * Assign swap blocks and issue I/O. 
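swap_pager_getpages_async() above emulates an asynchronous interface by running the synchronous path and then invoking the completion callback itself. A self-contained sketch of that shape; the return-code values and getpages_sync() are stand-ins for the sketch, and the kernel panics rather than defaulting on an unexpected code.

/* Emulating async-with-callback on top of a synchronous call. */
#include <errno.h>
#include <stdio.h>

#define VM_PAGER_OK	0	/* stand-in values for the sketch */
#define VM_PAGER_FAIL	2
#define VM_PAGER_ERROR	4

typedef void (*iodone_cb)(void *arg, int count, int error);

static int
getpages_sync(int count)
{
	return (count > 0 ? VM_PAGER_OK : VM_PAGER_FAIL);
}

static int
getpages_async(int count, iodone_cb iodone, void *arg)
{
	int r, error;

	r = getpages_sync(count);
	switch (r) {
	case VM_PAGER_OK:
		error = 0;
		break;
	case VM_PAGER_ERROR:
		error = EIO;
		break;
	case VM_PAGER_FAIL:
		error = EINVAL;
		break;
	default:
		error = EIO;	/* the kernel panics on an unexpected code */
		break;
	}
	iodone(arg, count, error);	/* completion runs before we return */
	return (r);
}

static void
done(void *arg, int count, int error)
{
	printf("%s: %d page(s), error %d\n", (char *)arg, count, error);
}

int
main(void)
{
	char tag[] = "request";

	getpages_async(3, done, tag);
	return (0);
}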
We reallocate swap on the fly. * The page is left dirty until the pageout operation completes * successfully. */ for (i = 0; i < count; i += n) { /* Maximum I/O size is limited by maximum swap block size. */ n = min(count - i, nsw_cluster_max); if (async) { mtx_lock(&swbuf_mtx); while (nsw_wcount_async == 0) msleep(&nsw_wcount_async, &swbuf_mtx, PVM, "swbufa", 0); nsw_wcount_async--; mtx_unlock(&swbuf_mtx); } /* Get a block of swap of size up to size n. */ VM_OBJECT_WLOCK(object); blk = swp_pager_getswapspace(&n); if (blk == SWAPBLK_NONE) { VM_OBJECT_WUNLOCK(object); mtx_lock(&swbuf_mtx); if (++nsw_wcount_async == 1) wakeup(&nsw_wcount_async); mtx_unlock(&swbuf_mtx); for (j = 0; j < n; ++j) rtvals[i + j] = VM_PAGER_FAIL; continue; } for (j = 0; j < n; ++j) { mreq = ma[i + j]; vm_page_aflag_clear(mreq, PGA_SWAP_FREE); addr = swp_pager_meta_build(mreq->object, mreq->pindex, blk + j); if (addr != SWAPBLK_NONE) swp_pager_update_freerange(&s_free, &n_free, addr); MPASS(mreq->dirty == VM_PAGE_BITS_ALL); mreq->oflags |= VPO_SWAPINPROG; } VM_OBJECT_WUNLOCK(object); bp = uma_zalloc(swwbuf_zone, M_WAITOK); MPASS((bp->b_flags & B_MAXPHYS) != 0); if (async) bp->b_flags |= B_ASYNC; bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_bcount = PAGE_SIZE * n; bp->b_bufsize = PAGE_SIZE * n; bp->b_blkno = blk; for (j = 0; j < n; j++) bp->b_pages[j] = ma[i + j]; bp->b_npages = n; /* * Must set dirty range for NFS to work. */ bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; VM_CNT_INC(v_swapout); VM_CNT_ADD(v_swappgsout, bp->b_npages); /* * We unconditionally set rtvals[] to VM_PAGER_PEND so that we * can call the async completion routine at the end of a * synchronous I/O operation. Otherwise, our caller would * perform duplicate unbusy and wakeup operations on the page * and object, respectively. */ for (j = 0; j < n; j++) rtvals[i + j] = VM_PAGER_PEND; /* * asynchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy. */ if (async) { bp->b_iodone = swp_pager_async_iodone; BUF_KERNPROC(bp); swp_pager_strategy(bp); continue; } /* * synchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy. */ bp->b_iodone = bdone; swp_pager_strategy(bp); /* * Wait for the sync I/O to complete. */ bwait(bp, PVM, "swwrt"); /* * Now that we are through with the bp, we can call the * normal async completion, which frees everything up. */ swp_pager_async_iodone(bp); } swp_pager_freeswapspace(s_free, n_free); VM_OBJECT_WLOCK(object); } /* * swp_pager_async_iodone: * * Completion routine for asynchronous reads and writes from/to swap. * Also called manually by synchronous code to finish up a bp. * * This routine may not sleep. */ static void swp_pager_async_iodone(struct buf *bp) { int i; vm_object_t object = NULL; /* * Report error - unless we ran out of memory, in which case * we've already logged it in swapgeom_strategy(). */ if (bp->b_ioflags & BIO_ERROR && bp->b_error != ENOMEM) { printf( "swap_pager: I/O error - %s failed; blkno %ld," "size %ld, error %d\n", ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"), (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error ); } /* * remove the mapping for kernel virtual */ if (buf_mapped(bp)) pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); else bp->b_data = bp->b_kvabase; if (bp->b_npages) { object = bp->b_pages[0]->object; VM_OBJECT_WLOCK(object); } /* * cleanup pages. If an error occurs writing to swap, we are in * very serious trouble. 
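The s_free/n_free pair threaded through the put path batches frees of contiguous swap blocks so the block bitmap is touched once per run rather than once per page. The helpers themselves, swp_pager_init_freerange() and swp_pager_update_freerange(), are defined earlier in the file and not shown in this hunk; the sketch below is my reading of the pattern, with printf standing in for the call that releases a run.

/* Sketch of batching contiguous block frees into runs. */
#include <stdio.h>

#define NONE (-1L)

static long run_start = NONE;
static long run_len;

static void
flush_run(void)
{
	if (run_len > 0)
		printf("free blocks [%ld, %ld)\n", run_start, run_start + run_len);
	run_start = NONE;
	run_len = 0;
}

static void
note_free(long blk)
{
	if (run_start != NONE && run_start + run_len == blk) {
		run_len++;		/* extends the current run */
		return;
	}
	flush_run();			/* disjoint: release what we have */
	run_start = blk;
	run_len = 1;
}

int
main(void)
{
	long blks[] = { 100, 101, 102, 200, 201 };
	size_t i;

	for (i = 0; i < sizeof(blks) / sizeof(blks[0]); i++)
		note_free(blks[i]);
	flush_run();			/* final flush, like the callers above */
	return (0);
}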
If it happens to be a disk error, though, * we may be able to recover by reassigning the swap later on. So * in this case we remove the m->swapblk assignment for the page * but do not free it in the rlist. The errornous block(s) are thus * never reallocated as swap. Redirty the page and continue. */ for (i = 0; i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; m->oflags &= ~VPO_SWAPINPROG; if (m->oflags & VPO_SWAPSLEEP) { m->oflags &= ~VPO_SWAPSLEEP; wakeup(&object->handle); } /* We always have space after I/O, successful or not. */ vm_page_aflag_set(m, PGA_SWAP_SPACE); if (bp->b_ioflags & BIO_ERROR) { /* * If an error occurs I'd love to throw the swapblk * away without freeing it back to swapspace, so it * can never be used again. But I can't from an * interrupt. */ if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably * be overridden by the original caller of * getpages so don't play cute tricks here. */ vm_page_invalid(m); } else { /* * If a write error occurs, reactivate page * so it doesn't clog the inactive list, * then finish the I/O. */ MPASS(m->dirty == VM_PAGE_BITS_ALL); /* PQ_UNSWAPPABLE? */ vm_page_activate(m); vm_page_sunbusy(m); } } else if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably be * overridden by the original caller of getpages so * we cannot set them in order to free the underlying * swap in a low-swap situation. I don't think we'd * want to do that anyway, but it was an optimization * that existed in the old swapper for a time before * it got ripped out due to precisely this problem. */ KASSERT(!pmap_page_is_mapped(m), ("swp_pager_async_iodone: page %p is mapped", m)); KASSERT(m->dirty == 0, ("swp_pager_async_iodone: page %p is dirty", m)); vm_page_valid(m); if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(m); } else { /* * For write success, clear the dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). * A page is only written to swap after a period of * inactivity. Therefore, we do not expect it to be * reused. */ KASSERT(!pmap_page_is_write_mapped(m), ("swp_pager_async_iodone: page %p is not write" " protected", m)); vm_page_undirty(m); vm_page_deactivate_noreuse(m); vm_page_sunbusy(m); } } /* * adjust pip. NOTE: the original parent may still have its own * pip refs on the object. */ if (object != NULL) { vm_object_pip_wakeupn(object, bp->b_npages); VM_OBJECT_WUNLOCK(object); } /* * swapdev_strategy() manually sets b_vp and b_bufobj before calling * bstrategy(). Set them back to NULL now we're done with it, or we'll * trigger a KASSERT in relpbuf(). */ if (bp->b_vp) { bp->b_vp = NULL; bp->b_bufobj = NULL; } /* * release the physical I/O buffer */ if (bp->b_flags & B_ASYNC) { mtx_lock(&swbuf_mtx); if (++nsw_wcount_async == 1) wakeup(&nsw_wcount_async); mtx_unlock(&swbuf_mtx); } uma_zfree((bp->b_iocmd == BIO_READ) ? 
swrbuf_zone : swwbuf_zone, bp); } int swap_pager_nswapdev(void) { return (nswapdev); } static void swp_pager_force_dirty(vm_page_t m) { vm_page_dirty(m); swap_pager_unswapped(m); vm_page_launder(m); } u_long swap_pager_swapped_pages(vm_object_t object) { struct swblk *sb; vm_pindex_t pi; u_long res; int i; VM_OBJECT_ASSERT_LOCKED(object); if ((object->flags & OBJ_SWAP) == 0) return (0); for (res = 0, pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pi)) != NULL; pi = sb->p + SWAP_META_PAGES) { for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] != SWAPBLK_NONE) res++; } } return (res); } /* * swap_pager_swapoff_object: * * Page in all of the pages that have been paged out for an object * to a swap device. */ static void swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object) { struct swblk *sb; vm_page_t m; vm_pindex_t pi; daddr_t blk; int i, nv, rahead, rv; KASSERT((object->flags & OBJ_SWAP) != 0, ("%s: Object not swappable", __func__)); for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pi)) != NULL; ) { if ((object->flags & OBJ_DEAD) != 0) { /* * Make sure that pending writes finish before * returning. */ vm_object_pip_wait(object, "swpoff"); swp_pager_meta_free_all(object); break; } for (i = 0; i < SWAP_META_PAGES; i++) { /* * Count the number of contiguous valid blocks. */ for (nv = 0; nv < SWAP_META_PAGES - i; nv++) { blk = sb->d[i + nv]; if (!swp_pager_isondev(blk, sp) || blk == SWAPBLK_NONE) break; } if (nv == 0) continue; /* * Look for a page corresponding to the first * valid block and ensure that any pending paging * operations on it are complete. If the page is valid, * mark it dirty and free the swap block. Try to batch * this operation since it may cause sp to be freed, * meaning that we must restart the scan. Avoid busying * valid pages since we may block forever on kernel * stack pages. */ m = vm_page_lookup(object, sb->p + i); if (m == NULL) { m = vm_page_alloc(object, sb->p + i, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); if (m == NULL) break; } else { if ((m->oflags & VPO_SWAPINPROG) != 0) { m->oflags |= VPO_SWAPSLEEP; VM_OBJECT_SLEEP(object, &object->handle, PSWP, "swpoff", 0); break; } if (vm_page_all_valid(m)) { do { swp_pager_force_dirty(m); } while (--nv > 0 && (m = vm_page_next(m)) != NULL && vm_page_all_valid(m) && (m->oflags & VPO_SWAPINPROG) == 0); break; } if (!vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL)) break; } vm_object_pip_add(object, 1); rahead = SWAP_META_PAGES; rv = swap_pager_getpages_locked(object, &m, 1, NULL, &rahead); if (rv != VM_PAGER_OK) panic("%s: read from swap failed: %d", __func__, rv); vm_object_pip_wakeupn(object, 1); VM_OBJECT_WLOCK(object); vm_page_xunbusy(m); /* * The object lock was dropped so we must restart the * scan of this swap block. Pages paged in during this * iteration will be marked dirty in a future iteration. */ break; } if (i == SWAP_META_PAGES) pi = sb->p + SWAP_META_PAGES; } } /* * swap_pager_swapoff: * * Page in all of the pages that have been paged out to the * given device. The corresponding blocks in the bitmap must be * marked as allocated and the device must be flagged SW_CLOSING. * There may be no processes swapped out to the device. * * This routine may block. 
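swap_pager_swapped_pages() above walks the object's metadata in SWAP_META_PAGES-sized chunks and counts the slots that hold a real block number. A compact model of that walk, with an array of chunks in place of the per-object pctrie; struct swblk_model, META_PAGES and NONE are inventions of the sketch.

/* Counting swapped-out pages over fixed-size metadata chunks. */
#include <stdio.h>

#define NONE		(-1L)
#define META_PAGES	4	/* stands in for SWAP_META_PAGES */

struct swblk_model {
	long base;		/* first page index covered (sb->p) */
	long d[META_PAGES];	/* block per page, or NONE */
};

static unsigned long
swapped_pages(const struct swblk_model *sbs, int nsb)
{
	unsigned long res = 0;
	int i, j;

	for (i = 0; i < nsb; i++)
		for (j = 0; j < META_PAGES; j++)
			if (sbs[i].d[j] != NONE)
				res++;
	return (res);
}

int
main(void)
{
	struct swblk_model sbs[2] = {
		{ 0, { 10, NONE, 12, NONE } },
		{ 8, { NONE, NONE, 30, 31 } },
	};

	printf("%lu pages on swap\n", swapped_pages(sbs, 2));
	return (0);
}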
*/ static void swap_pager_swapoff(struct swdevt *sp) { vm_object_t object; int retries; sx_assert(&swdev_syscall_lock, SA_XLOCKED); retries = 0; full_rescan: mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) { if ((object->flags & OBJ_SWAP) == 0) continue; mtx_unlock(&vm_object_list_mtx); /* Depends on type-stability. */ VM_OBJECT_WLOCK(object); /* * Dead objects are eventually terminated on their own. */ if ((object->flags & OBJ_DEAD) != 0) goto next_obj; /* * Sync with fences placed after pctrie * initialization. We must not access pctrie below * unless we checked that our object is swap and not * dead. */ atomic_thread_fence_acq(); if ((object->flags & OBJ_SWAP) == 0) goto next_obj; swap_pager_swapoff_object(sp, object); next_obj: VM_OBJECT_WUNLOCK(object); mtx_lock(&vm_object_list_mtx); } mtx_unlock(&vm_object_list_mtx); if (sp->sw_used) { /* * Objects may be locked or paging to the device being * removed, so we will miss their pages and need to * make another pass. We have marked this device as * SW_CLOSING, so the activity should finish soon. */ retries++; if (retries > 100) { panic("swapoff: failed to locate %d swap blocks", sp->sw_used); } pause("swpoff", hz / 20); goto full_rescan; } EVENTHANDLER_INVOKE(swapoff, sp); } /************************************************************************ * SWAP META DATA * ************************************************************************ * * These routines manipulate the swap metadata stored in the * OBJT_SWAP object. * * Swap metadata is implemented with a global hash and not directly * linked into the object. Instead the object simply contains * appropriate tracking counters. */ /* * SWP_PAGER_SWBLK_EMPTY() - is a range of blocks free? */ static bool swp_pager_swblk_empty(struct swblk *sb, int start, int limit) { int i; MPASS(0 <= start && start <= limit && limit <= SWAP_META_PAGES); for (i = start; i < limit; i++) { if (sb->d[i] != SWAPBLK_NONE) return (false); } return (true); } /* * SWP_PAGER_FREE_EMPTY_SWBLK() - frees if a block is free * * Nothing is done if the block is still in use. */ static void swp_pager_free_empty_swblk(vm_object_t object, struct swblk *sb) { if (swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) { SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); uma_zfree(swblk_zone, sb); } } /* * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object * * We first convert the object to a swap object if it is a default * object. * * The specified swapblk is added to the object's swap metadata. If * the swapblk is not valid, it is freed instead. Any previously * assigned swapblk is returned. */ static daddr_t swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) { static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted; struct swblk *sb, *sb1; vm_pindex_t modpi, rdpi; daddr_t prev_swapblk; int error, i; VM_OBJECT_ASSERT_WLOCKED(object); /* * Convert default object to swap object if necessary */ if ((object->flags & OBJ_SWAP) == 0) { pctrie_init(&object->un_pager.swp.swp_blks); /* * Ensure that swap_pager_swapoff()'s iteration over * object_list does not see a garbage pctrie. 
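The metadata routines here all use the same indexing scheme: a page index selects a struct swblk by rounding down to a SWAP_META_PAGES boundary, a slot inside the chunk by the remainder, and a chunk whose slots are all SWAPBLK_NONE is unlinked from the trie and freed (swp_pager_swblk_empty() above). A userland sketch of that arithmetic; META_PAGES and NONE are placeholders for the sketch, not the kernel's values.

/* Chunk/slot indexing used by the swap metadata. */
#include <stdbool.h>
#include <stdio.h>

#define NONE		(-1L)
#define META_PAGES	32	/* stands in for SWAP_META_PAGES */

static long
chunk_key(long pindex)
{
	return (pindex - pindex % META_PAGES);	/* rounddown(pindex, META_PAGES) */
}

static int
chunk_slot(long pindex)
{
	return ((int)(pindex % META_PAGES));
}

static bool
chunk_empty(const long *d)
{
	int i;

	for (i = 0; i < META_PAGES; i++)
		if (d[i] != NONE)
			return (false);
	return (true);
}

int
main(void)
{
	long d[META_PAGES];
	int i;

	for (i = 0; i < META_PAGES; i++)
		d[i] = NONE;
	printf("pindex 70 -> chunk %ld slot %d\n", chunk_key(70), chunk_slot(70));
	d[chunk_slot(70)] = 12345;
	printf("chunk empty: %d\n", chunk_empty(d));	/* 0: one slot in use */
	d[chunk_slot(70)] = NONE;
	printf("chunk empty: %d\n", chunk_empty(d));	/* 1: can be freed */
	return (0);
}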
*/ atomic_thread_fence_rel(); object->type = OBJT_SWAP; vm_object_set_flag(object, OBJ_SWAP); object->un_pager.swp.writemappings = 0; KASSERT((object->flags & OBJ_ANON) != 0 || object->handle == NULL, ("default pager %p with handle %p", object, object->handle)); } rdpi = rounddown(pindex, SWAP_META_PAGES); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb == NULL) { if (swapblk == SWAPBLK_NONE) return (SWAPBLK_NONE); for (;;) { sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc == pageproc ? M_USE_RESERVE : 0)); if (sb != NULL) { sb->p = rdpi; for (i = 0; i < SWAP_META_PAGES; i++) sb->d[i] = SWAPBLK_NONE; if (atomic_cmpset_int(&swblk_zone_exhausted, 1, 0)) printf("swblk zone ok\n"); break; } VM_OBJECT_WUNLOCK(object); if (uma_zone_exhausted(swblk_zone)) { if (atomic_cmpset_int(&swblk_zone_exhausted, 0, 1)) printf("swap blk zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); pause("swzonxb", 10); } else uma_zwait(swblk_zone); VM_OBJECT_WLOCK(object); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb != NULL) /* * Somebody swapped out a nearby page, * allocating swblk at the rdpi index, * while we dropped the object lock. */ goto allocated; } for (;;) { error = SWAP_PCTRIE_INSERT( &object->un_pager.swp.swp_blks, sb); if (error == 0) { if (atomic_cmpset_int(&swpctrie_zone_exhausted, 1, 0)) printf("swpctrie zone ok\n"); break; } VM_OBJECT_WUNLOCK(object); if (uma_zone_exhausted(swpctrie_zone)) { if (atomic_cmpset_int(&swpctrie_zone_exhausted, 0, 1)) printf("swap pctrie zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); pause("swzonxp", 10); } else uma_zwait(swpctrie_zone); VM_OBJECT_WLOCK(object); sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb1 != NULL) { uma_zfree(swblk_zone, sb); sb = sb1; goto allocated; } } } allocated: MPASS(sb->p == rdpi); modpi = pindex % SWAP_META_PAGES; /* Return prior contents of metadata. */ prev_swapblk = sb->d[modpi]; /* Enter block into metadata. */ sb->d[modpi] = swapblk; /* * Free the swblk if we end up with the empty page run. */ if (swapblk == SWAPBLK_NONE) swp_pager_free_empty_swblk(object, sb); return (prev_swapblk); } /* * SWP_PAGER_META_TRANSFER() - free a range of blocks in the srcobject's swap * metadata, or transfer it into dstobject. * * This routine will free swap metadata structures as they are cleaned * out. */ static void swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t pindex, vm_pindex_t count) { struct swblk *sb; daddr_t n_free, s_free; vm_pindex_t offset, last; int i, limit, start; VM_OBJECT_ASSERT_WLOCKED(srcobject); if ((srcobject->flags & OBJ_SWAP) == 0 || count == 0) return; swp_pager_init_freerange(&s_free, &n_free); offset = pindex; last = pindex + count; for (;;) { sb = SWAP_PCTRIE_LOOKUP_GE(&srcobject->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL || sb->p >= last) break; start = pindex > sb->p ? pindex - sb->p : 0; limit = last - sb->p < SWAP_META_PAGES ? 
last - sb->p : SWAP_META_PAGES; for (i = start; i < limit; i++) { if (sb->d[i] == SWAPBLK_NONE) continue; if (dstobject == NULL || !swp_pager_xfer_source(srcobject, dstobject, sb->p + i - offset, sb->d[i])) { swp_pager_update_freerange(&s_free, &n_free, sb->d[i]); } sb->d[i] = SWAPBLK_NONE; } pindex = sb->p + SWAP_META_PAGES; if (swp_pager_swblk_empty(sb, 0, start) && swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) { SWAP_PCTRIE_REMOVE(&srcobject->un_pager.swp.swp_blks, sb->p); uma_zfree(swblk_zone, sb); } } swp_pager_freeswapspace(s_free, n_free); } /* * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata * * The requested range of blocks is freed, with any associated swap * returned to the swap bitmap. * * This routine will free swap metadata structures as they are cleaned * out. This routine does *NOT* operate on swap metadata associated * with resident pages. */ static void swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count) { swp_pager_meta_transfer(object, NULL, pindex, count); } /* * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object * * This routine locates and destroys all swap metadata associated with * an object. */ static void swp_pager_meta_free_all(vm_object_t object) { struct swblk *sb; daddr_t n_free, s_free; vm_pindex_t pindex; int i; VM_OBJECT_ASSERT_WLOCKED(object); if ((object->flags & OBJ_SWAP) == 0) return; swp_pager_init_freerange(&s_free, &n_free); for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pindex)) != NULL;) { pindex = sb->p + SWAP_META_PAGES; for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] == SWAPBLK_NONE) continue; swp_pager_update_freerange(&s_free, &n_free, sb->d[i]); } SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); uma_zfree(swblk_zone, sb); } swp_pager_freeswapspace(s_free, n_free); } /* * SWP_PAGER_METACTL() - misc control of swap meta data. * * This routine is capable of looking up, or removing swapblk * assignments in the swap meta data. It returns the swapblk being * looked-up, popped, or SWAPBLK_NONE if the block was invalid. * * When acting on a busy resident page and paging is in progress, we * have to wait until paging is complete but otherwise can act on the * busy page. */ static daddr_t swp_pager_meta_lookup(vm_object_t object, vm_pindex_t pindex) { struct swblk *sb; VM_OBJECT_ASSERT_LOCKED(object); /* * The meta data only exists if the object is OBJT_SWAP * and even then might not be allocated yet. */ KASSERT((object->flags & OBJ_SWAP) != 0, ("Lookup object not swappable")); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL) return (SWAPBLK_NONE); return (sb->d[pindex % SWAP_META_PAGES]); } /* * Returns the least page index which is greater than or equal to the * parameter pindex and for which there is a swap block allocated. * Returns object's size if the object's type is not swap or if there * are no allocated swap blocks for the object after the requested * pindex. 
*/ vm_pindex_t swap_pager_find_least(vm_object_t object, vm_pindex_t pindex) { struct swblk *sb; int i; VM_OBJECT_ASSERT_LOCKED(object); if ((object->flags & OBJ_SWAP) == 0) return (object->size); sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL) return (object->size); if (sb->p < pindex) { for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) { if (sb->d[i] != SWAPBLK_NONE) return (sb->p + i); } sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, roundup(pindex, SWAP_META_PAGES)); if (sb == NULL) return (object->size); } for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] != SWAPBLK_NONE) return (sb->p + i); } /* * We get here if a swblk is present in the trie but it * doesn't map any blocks. */ MPASS(0); return (object->size); } /* * System call swapon(name) enables swapping on device name, * which must be in the swdevsw. Return EBUSY * if already swapping on this device. */ #ifndef _SYS_SYSPROTO_H_ struct swapon_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapon(struct thread *td, struct swapon_args *uap) { struct vattr attr; struct vnode *vp; struct nameidata nd; int error; error = priv_check(td, PRIV_SWAPON); if (error) return (error); sx_xlock(&swdev_syscall_lock); /* * Swap metadata may not fit in the KVM if we have physical * memory of >1GB. */ if (swblk_zone == NULL) { error = ENOMEM; goto done; } NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vn_isdisk_error(vp, &error)) { error = swapongeom(vp); } else if (vp->v_type == VREG && (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) { /* * Allow direct swapping to NFS regular files in the same * way that nfs_mountroot() sets up diskless swapping. */ error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); } if (error) vrele(vp); done: sx_xunlock(&swdev_syscall_lock); return (error); } /* * Check that the total amount of swap currently configured does not * exceed half the theoretical maximum. If it does, print a warning * message. */ static void swapon_check_swzone(void) { /* recommend using no more than half that amount */ if (swap_total > swap_maxpages / 2) { printf("warning: total configured swap (%lu pages) " "exceeds maximum recommended amount (%lu pages).\n", swap_total, swap_maxpages / 2); printf("warning: increase kern.maxswzone " "or reduce amount of swap.\n"); } } static void swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags) { struct swdevt *sp, *tsp; daddr_t dvbase; /* * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. * First chop nblks off to page-align it, then convert. * * sw->sw_nblks is in page-sized chunks now too. 
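swap_pager_find_least() above finishes scanning the chunk that contains the starting index and then falls back to the first allocated slot of the following chunk, returning the object size when nothing is left. A simplified model over a sorted array of chunks; OBJ_SIZE plays the role of object->size and the other names are inventions of the sketch.

/* Find the least allocated page index >= pindex. */
#include <stdio.h>

#define NONE		(-1L)
#define META_PAGES	8	/* stands in for SWAP_META_PAGES */
#define OBJ_SIZE	64	/* returned when nothing is allocated past pindex */

struct swblk_model {
	long base;		/* chunks assumed sorted by base */
	long d[META_PAGES];
};

static long
find_least(const struct swblk_model *sbs, int nsb, long pindex)
{
	int i, j;

	for (i = 0; i < nsb; i++) {
		if (sbs[i].base + META_PAGES <= pindex)
			continue;	/* chunk entirely before pindex */
		for (j = 0; j < META_PAGES; j++) {
			if (sbs[i].base + j < pindex)
				continue;
			if (sbs[i].d[j] != NONE)
				return (sbs[i].base + j);
		}
	}
	return (OBJ_SIZE);
}

int
main(void)
{
	struct swblk_model sbs[1] = {
		{ 16, { NONE, NONE, 77, NONE, NONE, NONE, NONE, 78 } },
	};

	printf("least allocated >= 17: %ld\n", find_least(sbs, 1, 17));	/* 18 */
	printf("least allocated >= 20: %ld\n", find_least(sbs, 1, 20));	/* 23 */
	printf("least allocated >= 24: %ld\n", find_least(sbs, 1, 24));	/* 64 */
	return (0);
}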
*/ nblks &= ~(ctodb(1) - 1); nblks = dbtoc(nblks); sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); sp->sw_blist = blist_create(nblks, M_WAITOK); sp->sw_vp = vp; sp->sw_id = id; sp->sw_dev = dev; sp->sw_nblks = nblks; sp->sw_used = 0; sp->sw_strategy = strategy; sp->sw_close = close; sp->sw_flags = flags; /* * Do not free the first blocks in order to avoid overwriting * any bsd label at the front of the partition */ blist_free(sp->sw_blist, howmany(BBSIZE, PAGE_SIZE), nblks - howmany(BBSIZE, PAGE_SIZE)); dvbase = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(tsp, &swtailq, sw_list) { if (tsp->sw_end >= dvbase) { /* * We put one uncovered page between the devices * in order to definitively prevent any cross-device * I/O requests */ dvbase = tsp->sw_end + 1; } } sp->sw_first = dvbase; sp->sw_end = dvbase + nblks; TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks - howmany(BBSIZE, PAGE_SIZE); swap_total += nblks; swapon_check_swzone(); swp_sizecheck(); mtx_unlock(&sw_dev_mtx); EVENTHANDLER_INVOKE(swapon, sp); } /* * SYSCALL: swapoff(devname) * * Disable swapping on the given device. * * XXX: Badly designed system call: it should use a device index * rather than filename as specification. We keep sw_vp around * only to make this work. */ #ifndef _SYS_SYSPROTO_H_ struct swapoff_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapoff(struct thread *td, struct swapoff_args *uap) { struct vnode *vp; struct nameidata nd; struct swdevt *sp; int error; error = priv_check(td, PRIV_SWAPOFF); if (error) return (error); sx_xlock(&swdev_syscall_lock); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_vp == vp) break; } mtx_unlock(&sw_dev_mtx); if (sp == NULL) { error = EINVAL; goto done; } error = swapoff_one(sp, td->td_ucred); done: sx_xunlock(&swdev_syscall_lock); return (error); } static int swapoff_one(struct swdevt *sp, struct ucred *cred) { u_long nblks; #ifdef MAC int error; #endif sx_assert(&swdev_syscall_lock, SA_XLOCKED); #ifdef MAC (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY); error = mac_system_check_swapoff(cred, sp->sw_vp); (void) VOP_UNLOCK(sp->sw_vp); if (error != 0) return (error); #endif nblks = sp->sw_nblks; /* * We can turn off this swap device safely only if the * available virtual memory in the system will fit the amount * of data we will have to page back in, plus an epsilon so * the system doesn't become critically low on swap space. */ if (vm_free_count() + swap_pager_avail < nblks + nswap_lowat) return (ENOMEM); /* * Prevent further allocations on this device. */ mtx_lock(&sw_dev_mtx); sp->sw_flags |= SW_CLOSING; swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks); swap_total -= nblks; mtx_unlock(&sw_dev_mtx); /* * Page in the contents of the device and close it. 
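The size conversion at the top of swaponsomething() above is easy to check with concrete numbers. The sketch below assumes the common values DEV_BSIZE = 512, PAGE_SIZE = 4096 and BBSIZE = 8192; those constants belong to the sketch, not to this change.

/* Worked example of the DEV_BSIZE-to-page conversion and the label
 * reservation at the front of the device. */
#include <stdio.h>

#define DEV_BSIZE_M	512UL
#define PAGE_SIZE_M	4096UL
#define BBSIZE_M	8192UL
#define CTODB		(PAGE_SIZE_M / DEV_BSIZE_M)	/* 512-byte blocks per page */

int
main(void)
{
	unsigned long nblks = 20485;	/* device size in DEV_BSIZE blocks */
	unsigned long reserved;

	nblks &= ~(CTODB - 1);		/* chop to a page boundary: 20480 */
	nblks /= CTODB;			/* convert to pages: 2560 */
	reserved = (BBSIZE_M + PAGE_SIZE_M - 1) / PAGE_SIZE_M; /* howmany(BBSIZE, PAGE_SIZE) */
	printf("%lu swap pages, first %lu kept free for a boot/disk label\n",
	    nblks, reserved);
	printf("%lu pages usable\n", nblks - reserved);
	return (0);
}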
*/ swap_pager_swapoff(sp); sp->sw_close(curthread, sp); mtx_lock(&sw_dev_mtx); sp->sw_id = NULL; TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; if (nswapdev == 0) { swap_pager_full = 2; swap_pager_almost_full = 1; } if (swdevhd == sp) swdevhd = NULL; mtx_unlock(&sw_dev_mtx); blist_destroy(sp->sw_blist); free(sp, M_VMPGDATA); return (0); } void swapoff_all(void) { struct swdevt *sp, *spt; const char *devname; int error; sx_xlock(&swdev_syscall_lock); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { mtx_unlock(&sw_dev_mtx); if (vn_isdisk(sp->sw_vp)) devname = devtoname(sp->sw_vp->v_rdev); else devname = "[file]"; error = swapoff_one(sp, thread0.td_ucred); if (error != 0) { printf("Cannot remove swap device %s (error=%d), " "skipping.\n", devname, error); } else if (bootverbose) { printf("Swap device %s removed.\n", devname); } mtx_lock(&sw_dev_mtx); } mtx_unlock(&sw_dev_mtx); sx_xunlock(&swdev_syscall_lock); } void swap_pager_status(int *total, int *used) { *total = swap_total; *used = swap_total - swap_pager_avail - nswapdev * howmany(BBSIZE, PAGE_SIZE); } int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len) { struct swdevt *sp; const char *tmp_devname; int error, n; n = 0; error = ENOENT; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (n != name) { n++; continue; } xs->xsw_version = XSWDEV_VERSION; xs->xsw_dev = sp->sw_dev; xs->xsw_flags = sp->sw_flags; xs->xsw_nblks = sp->sw_nblks; xs->xsw_used = sp->sw_used; if (devname != NULL) { if (vn_isdisk(sp->sw_vp)) tmp_devname = devtoname(sp->sw_vp->v_rdev); else tmp_devname = "[file]"; strncpy(devname, tmp_devname, len); } error = 0; break; } mtx_unlock(&sw_dev_mtx); return (error); } #if defined(COMPAT_FREEBSD11) #define XSWDEV_VERSION_11 1 struct xswdev11 { u_int xsw_version; uint32_t xsw_dev; int xsw_flags; int xsw_nblks; int xsw_used; }; #endif #if defined(__amd64__) && defined(COMPAT_FREEBSD32) struct xswdev32 { u_int xsw_version; u_int xsw_dev1, xsw_dev2; int xsw_flags; int xsw_nblks; int xsw_used; }; #endif static int sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) { struct xswdev xs; #if defined(__amd64__) && defined(COMPAT_FREEBSD32) struct xswdev32 xs32; #endif #if defined(COMPAT_FREEBSD11) struct xswdev11 xs11; #endif int error; if (arg2 != 1) /* name length */ return (EINVAL); memset(&xs, 0, sizeof(xs)); error = swap_dev_info(*(int *)arg1, &xs, NULL, 0); if (error != 0) return (error); #if defined(__amd64__) && defined(COMPAT_FREEBSD32) if (req->oldlen == sizeof(xs32)) { memset(&xs32, 0, sizeof(xs32)); xs32.xsw_version = XSWDEV_VERSION; xs32.xsw_dev1 = xs.xsw_dev; xs32.xsw_dev2 = xs.xsw_dev >> 32; xs32.xsw_flags = xs.xsw_flags; xs32.xsw_nblks = xs.xsw_nblks; xs32.xsw_used = xs.xsw_used; error = SYSCTL_OUT(req, &xs32, sizeof(xs32)); return (error); } #endif #if defined(COMPAT_FREEBSD11) if (req->oldlen == sizeof(xs11)) { memset(&xs11, 0, sizeof(xs11)); xs11.xsw_version = XSWDEV_VERSION_11; xs11.xsw_dev = xs.xsw_dev; /* truncation */ xs11.xsw_flags = xs.xsw_flags; xs11.xsw_nblks = xs.xsw_nblks; xs11.xsw_used = xs.xsw_used; error = SYSCTL_OUT(req, &xs11, sizeof(xs11)); return (error); } #endif error = SYSCTL_OUT(req, &xs, sizeof(xs)); return (error); } SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, "Number of swap devices"); SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_vm_swap_info, "Swap statistics by device"); /* * Count the approximate swap usage in pages for a vmspace. 
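sysctl_vm_swap_info() above picks the reply layout from the size of the caller's buffer, so binaries built against an older struct xswdev keep receiving the layout they expect. A freestanding sketch of that dispatch; struct xsw_new, struct xsw_old and their fields are simplified stand-ins, not the real xswdev definitions.

/* Choosing an output layout by the caller's buffer size. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct xsw_new {		/* current layout (simplified) */
	unsigned int version;
	uint64_t dev;
	int flags, nblks, used;
};

struct xsw_old {		/* legacy layout with a 32-bit dev */
	unsigned int version;
	uint32_t dev;
	int flags, nblks, used;
};

static size_t
emit(const struct xsw_new *xs, void *buf, size_t oldlen)
{
	if (oldlen == sizeof(struct xsw_old)) {
		struct xsw_old o;

		memset(&o, 0, sizeof(o));
		o.version = 1;			/* old version number */
		o.dev = (uint32_t)xs->dev;	/* truncated, as noted above */
		o.flags = xs->flags;
		o.nblks = xs->nblks;
		o.used = xs->used;
		memcpy(buf, &o, sizeof(o));
		return (sizeof(o));
	}
	memcpy(buf, xs, sizeof(*xs));
	return (sizeof(*xs));
}

int
main(void)
{
	struct xsw_new xs = { 2, 0x1234567890ULL, 0, 1000, 10 };
	unsigned char buf[64];

	printf("emitted %zu bytes for a legacy caller\n",
	    emit(&xs, buf, sizeof(struct xsw_old)));
	printf("emitted %zu bytes for a current caller\n",
	    emit(&xs, buf, sizeof(struct xsw_new)));
	return (0);
}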
The * shadowed or not yet copied on write swap blocks are not accounted. * The map must be locked. */ long vmspace_swap_count(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t cur; vm_object_t object; struct swblk *sb; vm_pindex_t e, pi; long count; int i; map = &vmspace->vm_map; count = 0; VM_MAP_ENTRY_FOREACH(cur, map) { if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; object = cur->object.vm_object; if (object == NULL || (object->flags & OBJ_SWAP) == 0) continue; VM_OBJECT_RLOCK(object); if ((object->flags & OBJ_SWAP) == 0) goto unlock; pi = OFF_TO_IDX(cur->offset); e = pi + OFF_TO_IDX(cur->end - cur->start); for (;; pi = sb->p + SWAP_META_PAGES) { sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pi); if (sb == NULL || sb->p >= e) break; for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->p + i < e && sb->d[i] != SWAPBLK_NONE) count++; } } unlock: VM_OBJECT_RUNLOCK(object); } return (count); } /* * GEOM backend * * Swapping onto disk devices. * */ static g_orphan_t swapgeom_orphan; static struct g_class g_swap_class = { .name = "SWAP", .version = G_VERSION, .orphan = swapgeom_orphan, }; DECLARE_GEOM_CLASS(g_swap_class, g_class); static void swapgeom_close_ev(void *arg, int flags) { struct g_consumer *cp; cp = arg; g_access(cp, -1, -1, 0); g_detach(cp); g_destroy_consumer(cp); } /* * Add a reference to the g_consumer for an inflight transaction. */ static void swapgeom_acquire(struct g_consumer *cp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index++; } /* * Remove a reference from the g_consumer. Post a close event if all * references go away, since the function might be called from the * biodone context. */ static void swapgeom_release(struct g_consumer *cp, struct swdevt *sp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index--; if (cp->index == 0) { if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0) sp->sw_id = NULL; } } static void swapgeom_done(struct bio *bp2) { struct swdevt *sp; struct buf *bp; struct g_consumer *cp; bp = bp2->bio_caller2; cp = bp2->bio_from; bp->b_ioflags = bp2->bio_flags; if (bp2->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bp2->bio_completed; bp->b_error = bp2->bio_error; bp->b_caller1 = NULL; bufdone(bp); sp = bp2->bio_caller1; mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); g_destroy_bio(bp2); } static void swapgeom_strategy(struct buf *bp, struct swdevt *sp) { struct bio *bio; struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sp->sw_id; if (cp == NULL) { mtx_unlock(&sw_dev_mtx); bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } swapgeom_acquire(cp); mtx_unlock(&sw_dev_mtx); if (bp->b_iocmd == BIO_WRITE) bio = g_new_bio(); else bio = g_alloc_bio(); if (bio == NULL) { mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); bp->b_error = ENOMEM; bp->b_ioflags |= BIO_ERROR; printf("swap_pager: cannot allocate bio\n"); bufdone(bp); return; } bp->b_caller1 = bio; bio->bio_caller1 = sp; bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; if (!buf_mapped(bp)) { bio->bio_ma = bp->b_pages; bio->bio_data = unmapped_buf; bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bio->bio_ma_n = bp->b_npages; bio->bio_flags |= BIO_UNMAPPED; } else { bio->bio_data = bp->b_data; bio->bio_ma = NULL; } g_io_request(bio, cp); return; } static void swapgeom_orphan(struct g_consumer *cp) { struct swdevt *sp; int destroy; 
mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == cp) { sp->sw_flags |= SW_CLOSING; break; } } /* * Drop reference we were created with. Do directly since we're in a * special context where we don't have to queue the call to * swapgeom_close_ev(). */ cp->index--; destroy = ((sp != NULL) && (cp->index == 0)); if (destroy) sp->sw_id = NULL; mtx_unlock(&sw_dev_mtx); if (destroy) swapgeom_close_ev(cp, 0); } static void swapgeom_close(struct thread *td, struct swdevt *sw) { struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sw->sw_id; sw->sw_id = NULL; mtx_unlock(&sw_dev_mtx); /* * swapgeom_close() may be called from the biodone context, * where we cannot perform topology changes. Delegate the * work to the events thread. */ if (cp != NULL) g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL); } static int swapongeom_locked(struct cdev *dev, struct vnode *vp) { struct g_provider *pp; struct g_consumer *cp; static struct g_geom *gp; struct swdevt *sp; u_long nblks; int error; pp = g_dev_getprovider(dev); if (pp == NULL) return (ENODEV); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { cp = sp->sw_id; if (cp != NULL && cp->provider == pp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); if (gp == NULL) gp = g_new_geomf(&g_swap_class, "swap"); cp = g_new_consumer(gp); cp->index = 1; /* Number of active I/Os, plus one for being active. */ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; g_attach(cp, pp); /* * XXX: Every time you think you can improve the margin for * footshooting, somebody depends on the ability to do so: * savecore(8) wants to write to our swapdev so we cannot * set an exclusive count :-( */ error = g_access(cp, 1, 1, 0); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } nblks = pp->mediasize / DEV_BSIZE; swaponsomething(vp, cp, nblks, swapgeom_strategy, swapgeom_close, dev2udev(dev), (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0); return (0); } static int swapongeom(struct vnode *vp) { int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type != VCHR || VN_IS_DOOMED(vp)) { error = ENOENT; } else { g_topology_lock(); error = swapongeom_locked(vp->v_rdev, vp); g_topology_unlock(); } VOP_UNLOCK(vp); return (error); } /* * VNODE backend * * This is used mainly for network filesystem (read: probably only tested * with NFS) swapfiles. 
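The GEOM backend above counts in-flight requests on the consumer (cp->index, plus one reference for being configured as swap) and defers the real close until the last reference drops, which may happen from completion context. A minimal model of that scheme; consumer_model and the direct call to close_consumer() are simplifications of the sketch, whereas the kernel posts an event to the GEOM event thread instead.

/* In-flight reference counting with a deferred close. */
#include <stdio.h>

struct consumer_model {
	int index;		/* 1 for being active + one per in-flight I/O */
};

static void
close_consumer(struct consumer_model *cp)
{
	printf("detaching and destroying consumer %p\n", (void *)cp);
}

static void
acquire(struct consumer_model *cp)
{
	cp->index++;
}

static void
release(struct consumer_model *cp)
{
	if (--cp->index == 0)
		close_consumer(cp);	/* the kernel posts an event instead */
}

int
main(void)
{
	struct consumer_model cp = { 1 };	/* configured as a swap device */

	acquire(&cp);		/* I/O submitted */
	release(&cp);		/* I/O completed; still configured, no close */
	release(&cp);		/* swapoff drops the last reference: close */
	return (0);
}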
* */ static void swapdev_strategy(struct buf *bp, struct swdevt *sp) { struct vnode *vp2; bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first); vp2 = sp->sw_id; vhold(vp2); if (bp->b_iocmd == BIO_WRITE) { if (bp->b_bufobj) bufobj_wdrop(bp->b_bufobj); bufobj_wref(&vp2->v_bufobj); } if (bp->b_bufobj != &vp2->v_bufobj) bp->b_bufobj = &vp2->v_bufobj; bp->b_vp = vp2; bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); return; } static void swapdev_close(struct thread *td, struct swdevt *sp) { VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td); vrele(sp->sw_vp); } static int swaponvp(struct thread *td, struct vnode *vp, u_long nblks) { struct swdevt *sp; int error; if (nblks == 0) return (ENXIO); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == vp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_system_check_swapon(td->td_ucred, vp); if (error == 0) #endif error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL); (void) VOP_UNLOCK(vp); if (error) return (error); swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, NODEV, 0); return (0); } static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS) { int error, new, n; new = nsw_wcount_async_max; error = sysctl_handle_int(oidp, &new, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new > nswbuf / 2 || new < 1) return (EINVAL); mtx_lock(&swbuf_mtx); while (nsw_wcount_async_max != new) { /* * Adjust difference. If the current async count is too low, * we will need to sqeeze our update slowly in. Sleep with a * higher priority than getpbuf() to finish faster. */ n = new - nsw_wcount_async_max; if (nsw_wcount_async + n >= 0) { nsw_wcount_async += n; nsw_wcount_async_max += n; wakeup(&nsw_wcount_async); } else { nsw_wcount_async_max -= nsw_wcount_async; nsw_wcount_async = 0; msleep(&nsw_wcount_async, &swbuf_mtx, PSWP, "swpsysctl", 0); } } mtx_unlock(&swbuf_mtx); return (0); } static void swap_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { VM_OBJECT_WLOCK(object); KASSERT((object->flags & OBJ_ANON) == 0, ("Splittable object with writecount")); object->un_pager.swp.writemappings += (vm_ooffset_t)end - start; VM_OBJECT_WUNLOCK(object); } static void swap_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { VM_OBJECT_WLOCK(object); KASSERT((object->flags & OBJ_ANON) == 0, ("Splittable object with writecount")); object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; VM_OBJECT_WUNLOCK(object); } static void swap_tmpfs_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp) { struct vnode *vp; /* * Tmpfs VREG node, which was reclaimed, has OBJT_SWAP_TMPFS * type, but not OBJ_TMPFS flag. In this case there is no * v_writecount to adjust. */ if (vp_heldp != NULL) VM_OBJECT_RLOCK(object); else VM_OBJECT_ASSERT_LOCKED(object); if ((object->flags & OBJ_TMPFS) != 0) { vp = object->un_pager.swp.swp_tmpfs; if (vp != NULL) { *vpp = vp; if (vp_heldp != NULL) { vhold(vp); *vp_heldp = true; } } } if (vp_heldp != NULL) VM_OBJECT_RUNLOCK(object); } diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 1c4e879d82ea..735ab603a09b 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -1,2859 +1,2824 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. 
* * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *allclean, boolean_t *eio); static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *allclean); static void vm_object_backing_remove(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. * * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM object stats"); static COUNTER_U64_DEFINE_EARLY(object_collapses); SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, "VM object collapses"); static COUNTER_U64_DEFINE_EARLY(object_bypasses); SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, "VM object bypasses"); static COUNTER_U64_DEFINE_EARLY(object_collapse_waits); SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, collapse_waits, CTLFLAG_RD, &object_collapse_waits, "Number of sleeps for collapse"); static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(object->ref_count == 0, ("object %p ref_count = %d", object, object->ref_count)); KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages in its memq", object)); KASSERT(vm_radix_is_empty(&object->rtree), ("object %p has resident pages in its trie", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", object)); #endif KASSERT(!vm_object_busied(object), ("object %p busy = %d", object, blockcount_read(&object->busy))); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); KASSERT(object->type == OBJT_DEAD, ("object %p has non-dead type %d", object, object->type)); } 
#endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW); /* These are true for any object that has been freed */ object->type = OBJT_DEAD; vm_radix_init(&object->rtree); refcount_init(&object->ref_count, 0); blockcount_init(&object->paging_in_progress); blockcount_init(&object->busy); object->resident_page_count = 0; object->shadow_count = 0; object->flags = OBJ_DEAD; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); return (0); } static void _vm_object_allocate(objtype_t type, vm_pindex_t size, u_short flags, vm_object_t object, void *handle) { TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->type = type; object->flags = flags; if ((flags & OBJ_SWAP) != 0) pctrie_init(&object->un_pager.swp.swp_blks); /* * Ensure that swap_pager_swapoff() iteration over object_list * sees up to date type and pctrie head if it observed * non-dead object. */ atomic_thread_fence_rel(); object->pg_color = 0; object->size = size; object->domain.dr_policy = NULL; object->generation = 1; object->cleangeneration = 1; refcount_init(&object->ref_count, 1); object->memattr = VM_MEMATTR_DEFAULT; object->cred = NULL; object->charge = 0; object->handle = handle; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif umtx_shm_object_init(object); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); rw_init(&kernel_object->lock, "kernel vm object"); _vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), OBJ_UNMANAGED, kernel_object, NULL); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif kernel_object->un_pager.phys.ops = &default_phys_pg_ops; /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. * * paging_in_progress is valid always. Lockless references to * the objects may acquire pip and then check OBJ_DEAD. */ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vm_radix_zinit(); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_ASSERT_WLOCKED(object); object->flags &= ~bits; } /* * Sets the default memory attribute for the specified object. Pages * that are allocated to this object are by default assigned this memory * attribute. * * Presently, this function must be called before any pages are allocated * to the object. In the future, this requirement may be relaxed for * "default" and "swap" objects. 
*/ int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) { VM_OBJECT_ASSERT_WLOCKED(object); switch (object->type) { case OBJT_DEFAULT: case OBJT_DEVICE: case OBJT_MGTDEVICE: case OBJT_PHYS: case OBJT_SG: case OBJT_SWAP: case OBJT_SWAP_TMPFS: case OBJT_VNODE: if (!TAILQ_EMPTY(&object->memq)) return (KERN_FAILURE); break; case OBJT_DEAD: return (KERN_INVALID_ARGUMENT); default: panic("vm_object_set_memattr: object %p is of undefined type", object); } object->memattr = memattr; return (KERN_SUCCESS); } void vm_object_pip_add(vm_object_t object, short i) { if (i > 0) blockcount_acquire(&object->paging_in_progress, i); } void vm_object_pip_wakeup(vm_object_t object) { vm_object_pip_wakeupn(object, 1); } void vm_object_pip_wakeupn(vm_object_t object, short i) { if (i > 0) blockcount_release(&object->paging_in_progress, i); } /* * Atomically drop the object lock and wait for pip to drain. This protects * from sleep/wakeup races due to identity changes. The lock is not re-acquired * on return. */ static void vm_object_pip_sleep(vm_object_t object, const char *waitid) { (void)blockcount_sleep(&object->paging_in_progress, &object->lock, waitid, PVM | PDROP); } void vm_object_pip_wait(vm_object_t object, const char *waitid) { VM_OBJECT_ASSERT_WLOCKED(object); blockcount_wait(&object->paging_in_progress, &object->lock, waitid, PVM); } void vm_object_pip_wait_unlocked(vm_object_t object, const char *waitid) { VM_OBJECT_ASSERT_UNLOCKED(object); blockcount_wait(&object->paging_in_progress, NULL, waitid, PVM); } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; u_short flags; switch (type) { case OBJT_DEAD: panic("vm_object_allocate: can't create OBJT_DEAD"); case OBJT_DEFAULT: flags = OBJ_COLORED; break; case OBJT_SWAP: case OBJT_SWAP_TMPFS: flags = OBJ_COLORED | OBJ_SWAP; break; case OBJT_DEVICE: case OBJT_SG: flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; break; case OBJT_MGTDEVICE: flags = OBJ_FICTITIOUS; break; case OBJT_PHYS: flags = OBJ_UNMANAGED; break; case OBJT_VNODE: flags = 0; break; default: panic("vm_object_allocate: type %d is undefined", type); } object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, flags, object, NULL); return (object); } /* * vm_object_allocate_anon: * * Returns a new default object of the given size and marked as * anonymous memory for special split/collapse handling. Color * to be initialized by the caller. */ vm_object_t vm_object_allocate_anon(vm_pindex_t size, vm_object_t backing_object, struct ucred *cred, vm_size_t charge) { vm_object_t handle, object; if (backing_object == NULL) handle = NULL; else if ((backing_object->flags & OBJ_ANON) != 0) handle = backing_object->handle; else handle = backing_object; object = uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(OBJT_DEFAULT, size, OBJ_ANON | OBJ_ONEMAPPING, object, handle); object->cred = cred; object->charge = cred != NULL ? charge : 0; return (object); } static void vm_object_reference_vnode(vm_object_t object) { u_int old; /* * vnode objects need the lock for the first reference * to serialize with vnode_object_deallocate(). */ if (!refcount_acquire_if_gt(&object->ref_count, 0)) { VM_OBJECT_RLOCK(object); old = refcount_acquire(&object->ref_count); if (object->type == OBJT_VNODE && old == 0) vref(object->handle); VM_OBJECT_RUNLOCK(object); } } /* * vm_object_reference: * * Acquires a reference to the given object. 
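/*
 * Illustrative sketch only, not part of this change: a hypothetical
 * caller pairing vm_object_allocate() with vm_object_set_memattr(),
 * which (per the comment above) must run before any pages are
 * allocated to the object.  The helper name and parameters are made
 * up; it relies on the headers already included by this file.
 */
#if 0
static vm_object_t
example_alloc_object(vm_pindex_t npages, vm_memattr_t memattr)
{
	vm_object_t obj;

	obj = vm_object_allocate(OBJT_DEFAULT, npages);
	VM_OBJECT_WLOCK(obj);
	if (vm_object_set_memattr(obj, memattr) != KERN_SUCCESS) {
		VM_OBJECT_WUNLOCK(obj);
		vm_object_deallocate(obj);	/* Drops the sole reference. */
		return (NULL);
	}
	VM_OBJECT_WUNLOCK(obj);
	return (obj);
}
#endif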
*/ void vm_object_reference(vm_object_t object) { if (object == NULL) return; if (object->type == OBJT_VNODE) vm_object_reference_vnode(object); else refcount_acquire(&object->ref_count); KASSERT((object->flags & OBJ_DEAD) == 0, ("vm_object_reference: Referenced dead object.")); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { u_int old; VM_OBJECT_ASSERT_LOCKED(object); old = refcount_acquire(&object->ref_count); if (object->type == OBJT_VNODE && old == 0) vref(object->handle); KASSERT((object->flags & OBJ_DEAD) == 0, ("vm_object_reference: Referenced dead object.")); } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_deallocate_vnode(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; bool last; KASSERT(object->type == OBJT_VNODE, ("vm_object_deallocate_vnode: not a vnode object")); KASSERT(vp != NULL, ("vm_object_deallocate_vnode: missing vp")); /* Object lock to protect handle lookup. */ last = refcount_release(&object->ref_count); VM_OBJECT_RUNLOCK(object); if (!last) return; if (!umtx_shm_vnobj_persistent) umtx_shm_object_terminated(object); /* vrele may need the vnode lock. */ vrele(vp); } /* * We dropped a reference on an object and discovered that it had a * single remaining shadow. This is a sibling of the reference we * dropped. Attempt to collapse the sibling and backing object. */ static vm_object_t vm_object_deallocate_anon(vm_object_t backing_object) { vm_object_t object; /* Fetch the final shadow. */ object = LIST_FIRST(&backing_object->shadow_head); KASSERT(object != NULL && backing_object->shadow_count == 1, ("vm_object_anon_deallocate: ref_count: %d, shadow_count: %d", backing_object->ref_count, backing_object->shadow_count)); KASSERT((object->flags & OBJ_ANON) != 0, ("invalid shadow object %p", object)); if (!VM_OBJECT_TRYWLOCK(object)) { /* * Prevent object from disappearing since we do not have a * reference. */ vm_object_pip_add(object, 1); VM_OBJECT_WUNLOCK(backing_object); VM_OBJECT_WLOCK(object); vm_object_pip_wakeup(object); } else VM_OBJECT_WUNLOCK(backing_object); /* * Check for a collapse/terminate race with the last reference holder. */ if ((object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) != 0 || !refcount_acquire_if_not_zero(&object->ref_count)) { VM_OBJECT_WUNLOCK(object); return (NULL); } backing_object = object->backing_object; if (backing_object != NULL && (backing_object->flags & OBJ_ANON) != 0) vm_object_collapse(object); VM_OBJECT_WUNLOCK(object); return (object); } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; bool released; while (object != NULL) { /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. A ref count * of 1 may be a special case depending on the shadow count * being 0 or 1. These cases require a write lock on the * object. 
*/ if ((object->flags & OBJ_ANON) == 0) released = refcount_release_if_gt(&object->ref_count, 1); else released = refcount_release_if_gt(&object->ref_count, 2); if (released) return; if (object->type == OBJT_VNODE) { VM_OBJECT_RLOCK(object); if (object->type == OBJT_VNODE) { vm_object_deallocate_vnode(object); return; } VM_OBJECT_RUNLOCK(object); } VM_OBJECT_WLOCK(object); KASSERT(object->ref_count > 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If this is not the final reference to an anonymous * object we may need to collapse the shadow chain. */ if (!refcount_release(&object->ref_count)) { if (object->ref_count > 1 || object->shadow_count == 0) { if ((object->flags & OBJ_ANON) != 0 && object->ref_count == 1) vm_object_set_flag(object, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(object); return; } /* Handle collapsing last ref on anonymous objects. */ object = vm_object_deallocate_anon(object); continue; } /* * Handle the final reference to an object. We restart * the loop with the backing object to avoid recursion. */ umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { KASSERT(object->type != OBJT_SWAP_TMPFS, ("shadowed tmpfs v_object 2 %p", object)); vm_object_backing_remove(object); } KASSERT((object->flags & OBJ_DEAD) == 0, ("vm_object_deallocate: Terminating dead object.")); vm_object_set_flag(object, OBJ_DEAD); vm_object_terminate(object); object = temp; } } /* * vm_object_destroy removes the object from the global object list * and frees the space for the object. */ void vm_object_destroy(vm_object_t object) { /* * Release the allocation charge. */ if (object->cred != NULL) { swap_release_by_cred(object->charge, object->cred); object->charge = 0; crfree(object->cred); object->cred = NULL; } /* * Free the space for the object. 
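/*
 * Illustrative sketch only, not part of this change: the usual pairing
 * of vm_object_reference() and vm_object_deallocate().  The helper is
 * hypothetical; it holds a reference so the object cannot be
 * terminated while it is inspected, and drops that reference with no
 * object lock held, as vm_object_deallocate() requires.
 */
#if 0
static int
example_resident_pages(vm_object_t obj)
{
	int resident;

	vm_object_reference(obj);
	VM_OBJECT_RLOCK(obj);
	resident = obj->resident_page_count;
	VM_OBJECT_RUNLOCK(obj);
	vm_object_deallocate(obj);
	return (resident);
}
#endif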
*/ uma_zfree(obj_zone, object); } static void vm_object_backing_remove_locked(vm_object_t object) { vm_object_t backing_object; backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT((object->flags & OBJ_COLLAPSING) == 0, ("vm_object_backing_remove: Removing collapsing object.")); if ((object->flags & OBJ_SHADOWLIST) != 0) { LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; object->flags &= ~OBJ_SHADOWLIST; } object->backing_object = NULL; } static void vm_object_backing_remove(vm_object_t object) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); if ((object->flags & OBJ_SHADOWLIST) != 0) { backing_object = object->backing_object; VM_OBJECT_WLOCK(backing_object); vm_object_backing_remove_locked(object); VM_OBJECT_WUNLOCK(backing_object); } else object->backing_object = NULL; } static void vm_object_backing_insert_locked(vm_object_t object, vm_object_t backing_object) { VM_OBJECT_ASSERT_WLOCKED(object); if ((backing_object->flags & OBJ_ANON) != 0) { VM_OBJECT_ASSERT_WLOCKED(backing_object); LIST_INSERT_HEAD(&backing_object->shadow_head, object, shadow_list); backing_object->shadow_count++; object->flags |= OBJ_SHADOWLIST; } object->backing_object = backing_object; } static void vm_object_backing_insert(vm_object_t object, vm_object_t backing_object) { VM_OBJECT_ASSERT_WLOCKED(object); if ((backing_object->flags & OBJ_ANON) != 0) { VM_OBJECT_WLOCK(backing_object); vm_object_backing_insert_locked(object, backing_object); VM_OBJECT_WUNLOCK(backing_object); } else object->backing_object = backing_object; } /* * Insert an object into a backing_object's shadow list with an additional * reference to the backing_object added. */ static void vm_object_backing_insert_ref(vm_object_t object, vm_object_t backing_object) { VM_OBJECT_ASSERT_WLOCKED(object); if ((backing_object->flags & OBJ_ANON) != 0) { VM_OBJECT_WLOCK(backing_object); KASSERT((backing_object->flags & OBJ_DEAD) == 0, ("shadowing dead anonymous object")); vm_object_reference_locked(backing_object); vm_object_backing_insert_locked(object, backing_object); vm_object_clear_flag(backing_object, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(backing_object); } else { vm_object_reference(backing_object); object->backing_object = backing_object; } } /* * Transfer a backing reference from backing_object to object. */ static void vm_object_backing_transfer(vm_object_t object, vm_object_t backing_object) { vm_object_t new_backing_object; /* * Note that the reference to backing_object->backing_object * moves from within backing_object to within object. */ vm_object_backing_remove_locked(object); new_backing_object = backing_object->backing_object; if (new_backing_object == NULL) return; if ((new_backing_object->flags & OBJ_ANON) != 0) { VM_OBJECT_WLOCK(new_backing_object); vm_object_backing_remove_locked(backing_object); vm_object_backing_insert_locked(object, new_backing_object); VM_OBJECT_WUNLOCK(new_backing_object); } else { object->backing_object = new_backing_object; backing_object->backing_object = NULL; } } /* * Wait for a concurrent collapse to settle. */ static void vm_object_collapse_wait(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); while ((object->flags & OBJ_COLLAPSING) != 0) { vm_object_pip_wait(object, "vmcolwait"); counter_u64_add(object_collapse_waits, 1); } } /* * Waits for a backing object to clear a pending collapse and returns * it locked if it is an ANON object. 
*/ static vm_object_t vm_object_backing_collapse_wait(vm_object_t object) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); for (;;) { backing_object = object->backing_object; if (backing_object == NULL || (backing_object->flags & OBJ_ANON) == 0) return (NULL); VM_OBJECT_WLOCK(backing_object); if ((backing_object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) == 0) break; VM_OBJECT_WUNLOCK(object); vm_object_pip_sleep(backing_object, "vmbckwait"); counter_u64_add(object_collapse_waits, 1); VM_OBJECT_WLOCK(object); } return (backing_object); } /* * vm_object_terminate_pages removes any remaining pageable pages * from the object and resets the object to an empty state. */ static void vm_object_terminate_pages(vm_object_t object) { vm_page_t p, p_next; VM_OBJECT_ASSERT_WLOCKED(object); /* * Free any remaining pageable pages. This also removes them from the * paging queues. However, don't free wired pages, just remove them * from the object. Rather than incrementally removing each page from * the object, the page and object are reset to any empty state. */ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { vm_page_assert_unbusied(p); KASSERT(p->object == object && (p->ref_count & VPRC_OBJREF) != 0, ("vm_object_terminate_pages: page %p is inconsistent", p)); p->object = NULL; if (vm_page_drop(p, VPRC_OBJREF) == VPRC_OBJREF) { VM_CNT_INC(v_pfree); vm_page_free(p); } } /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were * modified by the preceding loop. */ if (object->resident_page_count != 0) { vm_radix_reclaim_allnodes(&object->rtree); TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) vdrop(object->handle); } } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_DEAD) != 0, ("terminating non-dead obj %p", object)); KASSERT((object->flags & OBJ_COLLAPSING) == 0, ("terminating collapsing obj %p", object)); KASSERT(object->backing_object == NULL, ("terminating shadow obj %p", object)); /* * Wait for the pageout daemon and other current users to be * done with the object. Note that new paging_in_progress * users can come after this wait, but they must check * OBJ_DEAD flag set (without unlocking the object), and avoid * the object being terminated. */ vm_object_pip_wait(object, "objtrm"); KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); if ((object->flags & OBJ_PG_DTOR) == 0) vm_object_terminate_pages(object); #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || object->type == OBJT_SWAP || object->type == OBJT_SWAP_TMPFS, ("%s: non-swap obj %p has cred", __func__, object)); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); } /* * Make the page read-only so that we can clear the object flags. However, if * this is a nosync mmap then the object is likely to stay dirty so do not * mess with the page and do not clear the object flags. Returns TRUE if the * page should be flushed, and FALSE otherwise. 
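/*
 * Illustrative sketch only, not part of this change: the
 * paging_in_progress protocol that vm_object_terminate() waits on.  A
 * hypothetical consumer pins the object with vm_object_pip_add(),
 * re-checks OBJ_DEAD without dropping the object lock, and releases
 * the pin with vm_object_pip_wakeup() so a pending termination can
 * make progress.
 */
#if 0
static bool
example_start_paging(vm_object_t obj)
{

	VM_OBJECT_ASSERT_WLOCKED(obj);
	vm_object_pip_add(obj, 1);
	if ((obj->flags & OBJ_DEAD) != 0) {
		/* The object is being terminated; back off. */
		vm_object_pip_wakeup(obj);
		return (false);
	}
	/* ... do the paging work, then call vm_object_pip_wakeup(). */
	return (true);
}
#endif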
*/ static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *allclean) { vm_page_assert_busied(p); /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) != 0 && (p->a.flags & PGA_NOSYNC) != 0) { *allclean = FALSE; return (FALSE); } else { pmap_remove_write(p); return (p->dirty != 0); } } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with PGA_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * For swap objects backing tmpfs regular files, do not flush anything, * but remove write protection on the mapped pages to update mtime through * mmaped writes. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. * * Returns FALSE if some page from the range was not written, as * reported by the pager, and TRUE otherwise. */ boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags) { vm_page_t np, p; vm_pindex_t pi, tend, tstart; int curgeneration, n, pagerflags; boolean_t eio, res, allclean; VM_OBJECT_ASSERT_WLOCKED(object); if (!vm_object_mightbedirty(object) || object->resident_page_count == 0) return (TRUE); pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0; tstart = OFF_TO_IDX(start); tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK); allclean = tstart == 0 && tend >= object->size; res = TRUE; rescan: curgeneration = object->generation; for (p = vm_page_find_least(object, tstart); p != NULL; p = np) { pi = p->pindex; if (pi >= tend) break; np = TAILQ_NEXT(p, listq); if (vm_page_none_valid(p)) continue; if (vm_page_busy_acquire(p, VM_ALLOC_WAITFAIL) == 0) { if (object->generation != curgeneration && (flags & OBJPC_SYNC) != 0) goto rescan; np = vm_page_find_least(object, pi); continue; } if (!vm_object_page_remove_write(p, flags, &allclean)) { vm_page_xunbusy(p); continue; } if (object->type == OBJT_VNODE) { n = vm_object_page_collect_flush(object, p, pagerflags, flags, &allclean, &eio); if (eio) { res = FALSE; allclean = FALSE; } if (object->generation != curgeneration && (flags & OBJPC_SYNC) != 0) goto rescan; /* * If the VOP_PUTPAGES() did a truncated write, so * that even the first page of the run is not fully * written, vm_pageout_flush() returns 0 as the run * length. Since the condition that caused truncated * write may be permanent, e.g. exhausted free space, * accepting n == 0 would cause an infinite loop. * * Forwarding the iterator leaves the unwritten page * behind, but there is not much we can do there if * filesystem refuses to write it. */ if (n == 0) { n = 1; allclean = FALSE; } } else { n = 1; vm_page_xunbusy(p); } np = vm_page_find_least(object, pi + n); } #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0); #endif /* * Leave updating cleangeneration for tmpfs objects to tmpfs * scan. It needs to update mtime, which happens for other * filesystems during page writeouts. 
*/ if (allclean && object->type == OBJT_VNODE) object->cleangeneration = curgeneration; return (res); } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *allclean, boolean_t *eio) { vm_page_t ma[vm_pageout_page_count], p_first, tp; int count, i, mreq, runlen; vm_page_lock_assert(p, MA_NOTOWNED); vm_page_assert_xbusied(p); VM_OBJECT_ASSERT_WLOCKED(object); count = 1; mreq = 0; for (tp = p; count < vm_pageout_page_count; count++) { tp = vm_page_next(tp); if (tp == NULL || vm_page_tryxbusy(tp) == 0) break; if (!vm_object_page_remove_write(tp, flags, allclean)) { vm_page_xunbusy(tp); break; } } for (p_first = p; count < vm_pageout_page_count; count++) { tp = vm_page_prev(p_first); if (tp == NULL || vm_page_tryxbusy(tp) == 0) break; if (!vm_object_page_remove_write(tp, flags, allclean)) { vm_page_xunbusy(tp); break; } p_first = tp; mreq++; } for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++) ma[i] = tp; vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio); return (runlen); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * If the backing object is a device object with unmanaged pages, then any * mappings to the specified range of pages must be removed before this * function is called. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ boolean_t vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int error, flags, fsync_after; boolean_t res; if (object == NULL) return (TRUE); res = TRUE; error = 0; VM_OBJECT_WLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_WLOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_WUNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && vm_object_mightbedirty(object) != 0 && ((vp = object->handle)->v_vflag & VV_NOSYNC) == 0) { VM_OBJECT_WUNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (syncio && !invalidate && offset == 0 && atop(size) == object->size) { /* * If syncing the whole mapping of the file, * it is faster to schedule all the writes in * async mode, also allowing the clustering, * and then wait for i/o to complete. */ flags = 0; fsync_after = TRUE; } else { flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? 
(OBJPC_SYNC | OBJPC_INVAL) : 0; fsync_after = FALSE; } VM_OBJECT_WLOCK(object); res = vm_object_page_clean(object, offset, offset + size, flags); VM_OBJECT_WUNLOCK(object); if (fsync_after) error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK(vp); vn_finished_write(mp); if (error != 0) res = FALSE; VM_OBJECT_WLOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { if (object->type == OBJT_DEVICE) /* * The option OBJPR_NOTMAPPED must be passed here * because vm_object_page_remove() cannot remove * unmanaged mappings. */ flags = OBJPR_NOTMAPPED; else if (old_msync) flags = 0; else flags = OBJPR_CLEANONLY; vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); } VM_OBJECT_WUNLOCK(object); return (res); } /* * Determine whether the given advice can be applied to the object. Advice is * not applied to unmanaged pages since they never belong to page queues, and * since MADV_FREE is destructive, it can apply only to anonymous pages that * have been mapped at most once. */ static bool vm_object_advice_applies(vm_object_t object, int advice) { if ((object->flags & OBJ_UNMANAGED) != 0) return (false); if (advice != MADV_FREE) return (true); return ((object->flags & (OBJ_ONEMAPPING | OBJ_ANON)) == (OBJ_ONEMAPPING | OBJ_ANON)); } static void vm_object_madvise_freespace(vm_object_t object, int advice, vm_pindex_t pindex, vm_size_t size) { if (advice == MADV_FREE) vm_pager_freespace(object, pindex, size); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, int advice) { vm_pindex_t tpindex; vm_object_t backing_object, tobject; vm_page_t m, tm; if (object == NULL) return; relookup: VM_OBJECT_WLOCK(object); if (!vm_object_advice_applies(object, advice)) { VM_OBJECT_WUNLOCK(object); return; } for (m = vm_page_find_least(object, pindex); pindex < end; pindex++) { tobject = object; /* * If the next page isn't resident in the top-level object, we * need to search the shadow chain. When applying MADV_FREE, we * take care to release any swap space used to store * non-resident pages. */ if (m == NULL || pindex < m->pindex) { /* * Optimize a common case: if the top-level object has * no backing object, we can skip over the non-resident * range in constant time. */ if (object->backing_object == NULL) { tpindex = (m != NULL && m->pindex < end) ? m->pindex : end; vm_object_madvise_freespace(object, advice, pindex, tpindex - pindex); if ((pindex = tpindex) == end) break; goto next_page; } tpindex = pindex; do { vm_object_madvise_freespace(tobject, advice, tpindex, 1); /* * Prepare to search the next object in the * chain. 
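/*
 * Illustrative sketch only, not part of this change: how a caller in
 * the style of vm_object_sync() above drives vm_object_page_clean().
 * The helper is hypothetical and assumes that, for a vnode-backed
 * object, the caller has already locked the vnode the way
 * vm_object_sync() does with vn_lock().
 */
#if 0
static boolean_t
example_flush_range(vm_object_t obj, vm_ooffset_t off, vm_size_t len)
{
	boolean_t res;

	VM_OBJECT_WLOCK(obj);
	/* Pass an explicit end; end == 0 means "to the end of the object". */
	res = vm_object_page_clean(obj, off, off + len, OBJPC_SYNC);
	VM_OBJECT_WUNLOCK(obj);
	return (res);
}
#endif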
*/ backing_object = tobject->backing_object; if (backing_object == NULL) goto next_pindex; VM_OBJECT_WLOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_WUNLOCK(tobject); tobject = backing_object; if (!vm_object_advice_applies(tobject, advice)) goto next_pindex; } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { next_page: tm = m; m = TAILQ_NEXT(m, listq); } /* * If the page is not in a normal state, skip it. The page * can not be invalidated while the object lock is held. */ if (!vm_page_all_valid(tm) || vm_page_wired(tm)) goto next_pindex; KASSERT((tm->flags & PG_FICTITIOUS) == 0, ("vm_object_madvise: page %p is fictitious", tm)); KASSERT((tm->oflags & VPO_UNMANAGED) == 0, ("vm_object_madvise: page %p is not managed", tm)); if (vm_page_tryxbusy(tm) == 0) { if (object != tobject) VM_OBJECT_WUNLOCK(object); if (advice == MADV_WILLNEED) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(tm, PGA_REFERENCED); } vm_page_busy_sleep(tm, "madvpo", false); goto relookup; } vm_page_advise(tm, advice); vm_page_xunbusy(tm); vm_object_madvise_freespace(tobject, advice, tm->pindex, 1); next_pindex: if (tobject != object) VM_OBJECT_WUNLOCK(tobject); } VM_OBJECT_WUNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow(vm_object_t *object, vm_ooffset_t *offset, vm_size_t length, struct ucred *cred, bool shared) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. * * If we hold the only reference we can guarantee that it won't * increase while we have the map locked. Otherwise the race is * harmless and we will end up with an extra shadow object that * will be collapsed later. */ if (source != NULL && source->ref_count == 1 && (source->flags & OBJ_ANON) != 0) return; /* * Allocate a new object with the given length. */ result = vm_object_allocate_anon(atop(length), source, cred, length); /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (shared || source != NULL) { VM_OBJECT_WLOCK(result); /* * The new object shadows the source object, adding a * reference to it. Our caller changes his reference * to point to the new object, removing a reference to * the source object. Net result: no change of * reference count, unless the caller needs to add one * more reference due to forking a shared map entry. */ if (shared) { vm_object_reference_locked(result); vm_object_clear_flag(result, OBJ_ONEMAPPING); } /* * Try to optimize the result object's page color when * shadowing in order to maintain page coloring * consistency in the combined shadowed object. */ if (source != NULL) { vm_object_backing_insert(result, source); result->domain = source->domain; #if VM_NRESERVLEVEL > 0 result->flags |= source->flags & OBJ_COLORED; result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #endif } VM_OBJECT_WUNLOCK(result); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. 
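/*
 * Illustrative sketch only, not part of this change: a hypothetical
 * caller applying the advice values documented for vm_object_madvise()
 * above.  The object is passed unlocked, indexes are page indexes, and
 * MADV_FREE only takes effect on anonymous, singly-mapped objects (see
 * vm_object_advice_applies()).
 */
#if 0
static void
example_dont_need(vm_object_t obj, vm_ooffset_t off, vm_size_t len)
{

	vm_object_madvise(obj, OFF_TO_IDX(off),
	    OFF_TO_IDX(off + len + PAGE_MASK), MADV_DONTNEED);
}
#endif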
This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_busy, m_next; vm_object_t orig_object, new_object, backing_object; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; KASSERT((orig_object->flags & OBJ_ONEMAPPING) != 0, ("vm_object_split: Splitting object with multiple mappings.")); if ((orig_object->flags & OBJ_ANON) == 0) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_WUNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ new_object = vm_object_allocate_anon(size, orig_object, orig_object->cred, ptoa(size)); /* * We must wait for the orig_object to complete any in-progress * collapse so that the swap blocks are stable below. The * additional reference on backing_object by new object will * prevent further collapse operations until split completes. */ VM_OBJECT_WLOCK(orig_object); vm_object_collapse_wait(orig_object); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. */ VM_OBJECT_WLOCK(new_object); new_object->domain = orig_object->domain; backing_object = orig_object->backing_object; if (backing_object != NULL) { vm_object_backing_insert_ref(new_object, backing_object); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; } if (orig_object->cred != NULL) { crhold(orig_object->cred); KASSERT(orig_object->charge >= ptoa(size), ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } /* * Mark the split operation so that swap_pager_getpages() knows * that the object is in transition. */ vm_object_set_flag(orig_object, OBJ_SPLIT); m_busy = NULL; #ifdef INVARIANTS idx = 0; #endif retry: m = vm_page_find_least(orig_object, offidxstart); KASSERT(m == NULL || idx <= m->pindex - offidxstart, ("%s: object %p was repopulated", __func__, orig_object)); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_tryxbusy(m) == 0) { VM_OBJECT_WUNLOCK(new_object); vm_page_sleep_if_busy(m, "spltwt"); VM_OBJECT_WLOCK(new_object); goto retry; } /* * The page was left invalid. Likely placed there by * an incomplete fault. Just remove and ignore. */ if (vm_page_none_valid(m)) { if (vm_page_remove(m)) vm_page_free(m); continue; } /* vm_page_rename() will dirty the page. */ if (vm_page_rename(m, new_object, idx)) { vm_page_xunbusy(m); VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); vm_radix_wait(); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } #if VM_NRESERVLEVEL > 0 /* * If some of the reservation's allocated pages remain with * the original object, then transferring the reservation to * the new object is neither particularly beneficial nor * particularly harmful as compared to leaving the reservation * with the original object. If, however, all of the * reservation's allocated pages are transferred to the new * object, then transferring the reservation is typically * beneficial. 
Determining which of these two cases applies * would be more costly than unconditionally renaming the * reservation. */ vm_reserv_rename(m, new_object, orig_object, offidxstart); #endif /* * orig_object's type may change while sleeping, so keep track * of the beginning of the busied range. */ if (orig_object->type != OBJT_SWAP) vm_page_xunbusy(m); else if (m_busy == NULL) m_busy = m; } if ((orig_object->flags & OBJ_SWAP) != 0) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); if (m_busy != NULL) TAILQ_FOREACH_FROM(m_busy, &new_object->memq, listq) vm_page_xunbusy(m_busy); } vm_object_clear_flag(orig_object, OBJ_SPLIT); VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_WLOCK(new_object); } static vm_page_t vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT(p == NULL || p->object == object || p->object == backing_object, ("invalid ownership %p %p %p", p, object, backing_object)); /* The page is only NULL when rename fails. */ if (p == NULL) { VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(backing_object); vm_radix_wait(); } else { if (p->object == object) VM_OBJECT_WUNLOCK(backing_object); else VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmocol", false); } VM_OBJECT_WLOCK(object); VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); } static bool vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; vm_page_t p, pp; vm_pindex_t backing_offset_index, new_pindex, pi, ps; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; if ((backing_object->flags & OBJ_ANON) == 0) return (false); pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset); p = vm_page_find_least(backing_object, pi); ps = swap_pager_find_least(backing_object, pi); /* * Only check pages inside the parent object's range and * inside the parent object's mapping of the backing object. */ for (;; pi++) { if (p != NULL && p->pindex < pi) p = TAILQ_NEXT(p, listq); if (ps < pi) ps = swap_pager_find_least(backing_object, pi); if (p == NULL && ps >= backing_object->size) break; else if (p == NULL) pi = ps; else pi = MIN(p->pindex, ps); new_pindex = pi - backing_offset_index; if (new_pindex >= object->size) break; if (p != NULL) { /* * If the backing object page is busy a * grandparent or older page may still be * undergoing CoW. It is not safe to collapse * the backing object until it is quiesced. */ if (vm_page_tryxbusy(p) == 0) return (false); /* * We raced with the fault handler that left * newly allocated invalid page on the object * queue and retried. */ if (!vm_page_all_valid(p)) goto unbusy_ret; } /* * See if the parent has the page or if the parent's object * pager has the page. If the parent has the page but the page * is not valid, the parent's object pager must have the page. * * If this fails, the parent does not completely shadow the * object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); /* * The valid check here is stable due to object lock * being required to clear valid and initiate paging. * Busy of p disallows fault handler to validate pp. 
*/ if ((pp == NULL || vm_page_none_valid(pp)) && !vm_pager_has_page(object, new_pindex, NULL, NULL)) goto unbusy_ret; if (p != NULL) vm_page_xunbusy(p); } return (true); unbusy_ret: if (p != NULL) vm_page_xunbusy(p); return (false); } static void vm_object_collapse_scan(vm_object_t object) { vm_object_t backing_object; vm_page_t next, p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Our scan */ for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); new_pindex = p->pindex - backing_offset_index; /* * Check for busy page */ if (vm_page_tryxbusy(p) == 0) { next = vm_object_collapse_scan_wait(object, p); continue; } KASSERT(object->backing_object == backing_object, ("vm_object_collapse_scan: backing object mismatch %p != %p", object->backing_object, backing_object)); KASSERT(p->object == backing_object, ("vm_object_collapse_scan: object mismatch %p != %p", p->object, backing_object)); if (p->pindex < backing_offset_index || new_pindex >= object->size) { vm_pager_freespace(backing_object, p->pindex, 1); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (vm_page_remove(p)) vm_page_free(p); continue; } if (!vm_page_all_valid(p)) { KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (vm_page_remove(p)) vm_page_free(p); continue; } pp = vm_page_lookup(object, new_pindex); if (pp != NULL && vm_page_tryxbusy(pp) == 0) { vm_page_xunbusy(p); /* * The page in the parent is busy and possibly not * (yet) valid. Until its state is finalized by the * busy bit owner, we can't tell whether it shadows the * original page. */ next = vm_object_collapse_scan_wait(object, pp); continue; } if (pp != NULL && vm_page_none_valid(pp)) { /* * The page was invalid in the parent. Likely placed * there by an incomplete fault. Just remove and * ignore. p can replace it. */ if (vm_page_remove(pp)) vm_page_free(pp); pp = NULL; } if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * The page already exists in the parent OR swap exists * for this location in the parent. Leave the parent's * page alone. Destroy the original page from the * backing object. */ vm_pager_freespace(backing_object, p->pindex, 1); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (vm_page_remove(p)) vm_page_free(p); if (pp != NULL) vm_page_xunbusy(pp); continue; } /* * Page does not exist in parent, rename the page from the * backing object to the main object. * * If the page was mapped to a process, it can remain mapped * through the rename. vm_page_rename() will dirty the page. */ if (vm_page_rename(p, object, new_pindex)) { vm_page_xunbusy(p); next = vm_object_collapse_scan_wait(object, NULL); continue; } /* Use the old pindex to free the right page. */ vm_pager_freespace(backing_object, new_pindex + backing_offset_index, 1); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(p, object, backing_object, backing_offset_index); #endif vm_page_xunbusy(p); } return; } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. 
*/ void vm_object_collapse(vm_object_t object) { vm_object_t backing_object, new_backing_object; VM_OBJECT_ASSERT_WLOCKED(object); while (TRUE) { KASSERT((object->flags & (OBJ_DEAD | OBJ_ANON)) == OBJ_ANON, ("collapsing invalid object")); /* * Wait for the backing_object to finish any pending * collapse so that the caller sees the shortest possible * shadow chain. */ backing_object = vm_object_backing_collapse_wait(object); if (backing_object == NULL) return; KASSERT(object->ref_count > 0 && object->ref_count > object->shadow_count, ("collapse with invalid ref %d or shadow %d count.", object->ref_count, object->shadow_count)); KASSERT((backing_object->flags & (OBJ_COLLAPSING | OBJ_DEAD)) == 0, ("vm_object_collapse: Backing object already collapsing.")); KASSERT((object->flags & (OBJ_COLLAPSING | OBJ_DEAD)) == 0, ("vm_object_collapse: object is already collapsing.")); /* * We know that we can either collapse the backing object if * the parent is the only reference to it, or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. */ if (backing_object->ref_count == 1) { KASSERT(backing_object->shadow_count == 1, ("vm_object_collapse: shadow_count: %d", backing_object->shadow_count)); vm_object_pip_add(object, 1); vm_object_set_flag(object, OBJ_COLLAPSING); vm_object_pip_add(backing_object, 1); vm_object_set_flag(backing_object, OBJ_DEAD); /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_collapse_scan(object); #if VM_NRESERVLEVEL > 0 /* * Break any reservations from backing_object. */ if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) vm_reserv_break_all(backing_object); #endif /* * Move the pager from backing_object to object. */ if ((backing_object->flags & OBJ_SWAP) != 0) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. * Since swap_pager_copy() is being asked to * destroy backing_object, it will change the * type to OBJT_DEFAULT. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); } /* * Object now shadows whatever backing_object did. */ vm_object_clear_flag(object, OBJ_COLLAPSING); vm_object_backing_transfer(object, backing_object); object->backing_object_offset += backing_object->backing_object_offset; VM_OBJECT_WUNLOCK(object); vm_object_pip_wakeup(object); /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ KASSERT(backing_object->ref_count == 1, ( "backing_object %p was somehow re-referenced during collapse!", backing_object)); vm_object_pip_wakeup(backing_object); (void)refcount_release(&backing_object->ref_count); vm_object_terminate(backing_object); counter_u64_add(object_collapses, 1); VM_OBJECT_WLOCK(object); } else { /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. * * The object lock and backing_object lock must not * be dropped during this sequence. */ if (!vm_object_scan_all_shadowed(object)) { VM_OBJECT_WUNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. 
*/ vm_object_backing_remove_locked(object); new_backing_object = backing_object->backing_object; if (new_backing_object != NULL) { vm_object_backing_insert_ref(object, new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ (void)refcount_release(&backing_object->ref_count); KASSERT(backing_object->ref_count >= 1, ( "backing_object %p was somehow dereferenced during collapse!", backing_object)); VM_OBJECT_WUNLOCK(backing_object); counter_u64_add(object_bypasses, 1); } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * For the given object, either frees or invalidates each of the * specified pages. In general, a page is freed. However, if a page is * wired for any reason other than the existence of a managed, wired * mapping, then it may be invalidated but not removed from the object. * Pages are specified by the given range ["start", "end") and the option * OBJPR_CLEANONLY. As a special case, if "end" is zero, then the range * extends from "start" to the end of the object. If the option * OBJPR_CLEANONLY is specified, then only the non-dirty pages within the * specified range are affected. If the option OBJPR_NOTMAPPED is * specified, then the pages within the specified range must have no * mappings. Otherwise, if this option is not specified, any mappings to * the specified pages are removed before the pages are freed or * invalidated. * * In general, this operation should only be performed on objects that * contain managed pages. There are, however, two exceptions. First, it * is performed on the kernel and kmem objects by vm_map_entry_delete(). * Second, it is used by msync(..., MS_INVALIDATE) to invalidate device- * backed pages. In both of these cases, the option OBJPR_CLEANONLY must * not be specified and the option OBJPR_NOTMAPPED must be specified. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) return; vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * If the page is wired for any reason besides the existence * of managed, wired mappings, then it cannot be freed. For * example, fictitious pages, which represent device memory, * are inherently wired and cannot be freed. They can, * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. 
*/ if (vm_page_tryxbusy(p) == 0) { vm_page_sleep_if_busy(p, "vmopar"); goto again; } if (vm_page_wired(p)) { wired: if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { vm_page_invalid(p); vm_page_undirty(p); } vm_page_xunbusy(p); continue; } KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); if ((options & OBJPR_CLEANONLY) != 0 && !vm_page_none_valid(p)) { if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0 && !vm_page_try_remove_write(p)) goto wired; if (p->dirty != 0) { vm_page_xunbusy(p); continue; } } if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0 && !vm_page_try_remove_all(p)) goto wired; vm_page_free(p); } vm_object_pip_wakeup(object); vm_pager_freespace(object, start, (end == 0 ? object->size : end) - start); } /* * vm_object_page_noreuse: * * For the given object, attempt to move the specified pages to * the head of the inactive queue. This bypasses regular LRU * operation and allows the pages to be reused quickly under memory * pressure. If a page is wired for any reason, then it will not * be queued. Pages are specified by the range ["start", "end"). * As a special case, if "end" is zero, then the range extends from * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. * * The object must be locked. */ void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t p, next; VM_OBJECT_ASSERT_LOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); vm_page_deactivate_noreuse(p); } } /* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. * * Note: This function should be optimized to pass a larger array of * pages to vm_pager_get_pages() before it is applied to a non- * OBJT_DEVICE object. * * The object must be locked. */ boolean_t vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t m; vm_pindex_t pindex; int rv; VM_OBJECT_ASSERT_WLOCKED(object); for (pindex = start; pindex < end; pindex++) { rv = vm_page_grab_valid(&m, object, pindex, VM_ALLOC_NORMAL); if (rv != VM_PAGER_OK) break; /* * Keep "m" busy because a subsequent iteration may unlock * the object. */ } if (pindex > start) { m = vm_page_lookup(object, start); while (m != NULL && m->pindex < pindex) { vm_page_xunbusy(m); m = TAILQ_NEXT(m, listq); } } return (pindex == end); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * prev_size Size of reference to prev_object * next_size Size of reference to the second object * reserved Indicator that extension region has * swap accounted for * * Conditions: * The object must *not* be locked. 
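/*
 * Illustrative sketch only, not part of this change: a hypothetical
 * caller of vm_object_page_remove() releasing every page of a managed
 * object, using the documented "end == 0 extends to the end of the
 * object" convention.  With OBJPR_NOTMAPPED absent, the call itself
 * removes any remaining mappings before freeing the pages.
 */
#if 0
static void
example_purge_object(vm_object_t obj)
{

	VM_OBJECT_WLOCK(obj);
	vm_object_page_remove(obj, 0, 0, 0);
	VM_OBJECT_WUNLOCK(obj);
}
#endif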
*/ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size, boolean_t reserved) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); if ((prev_object->flags & OBJ_ANON) == 0) return (FALSE); VM_OBJECT_WLOCK(prev_object); /* * Try to collapse the object first. */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; if (prev_object->ref_count > 1 && prev_object->size != next_pindex && (prev_object->flags & OBJ_ONEMAPPING) == 0) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Account for the charge. */ if (prev_object->cred != NULL) { /* * If prev_object was charged, then this mapping, * although not charged now, may become writable * later. Non-NULL cred in the object would prevent * swap reservation during enabling of the write * access, so reserve swap now. Failed reservation * cause allocation of the separate object for the map * entry, and swap reservation for this entry is * managed in appropriate time. */ if (!reserved && !swap_reserve_by_cred(ptoa(next_size), prev_object->cred)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_object->charge += ptoa(next_size); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, 0); #if 0 if (prev_object->cred != NULL) { KASSERT(prev_object->charge >= ptoa(prev_object->size - next_pindex), ("object %p overcharged 1 %jx %jx", prev_object, (uintmax_t)next_pindex, (uintmax_t)next_size)); prev_object->charge -= ptoa(prev_object->size - next_pindex); } #endif } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_WUNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty_(vm_object_t object) { atomic_add_int(&object->generation, 1); } bool vm_object_mightbedirty_(vm_object_t object) { return (object->generation != object->cleangeneration); } /* * vm_object_unwire: * * For each page offset within the specified range of the given object, * find the highest-level page in the shadow chain and unwire it. A page * must exist at every page offset, and the highest-level page must be * wired. */ void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue) { vm_object_t tobject, t1object; vm_page_t m, tm; vm_pindex_t end_pindex, pindex, tpindex; int depth, locked_depth; KASSERT((offset & PAGE_MASK) == 0, ("vm_object_unwire: offset is not page aligned")); KASSERT((length & PAGE_MASK) == 0, ("vm_object_unwire: length is not a multiple of PAGE_SIZE")); /* The wired count of a fictitious page never changes. */ if ((object->flags & OBJ_FICTITIOUS) != 0) return; pindex = OFF_TO_IDX(offset); end_pindex = pindex + atop(length); again: locked_depth = 1; VM_OBJECT_RLOCK(object); m = vm_page_find_least(object, pindex); while (pindex < end_pindex) { if (m == NULL || pindex < m->pindex) { /* * The first object in the shadow chain doesn't * contain a page at the current index. Therefore, * the page must exist in a backing object. 
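/*
 * Illustrative sketch only, not part of this change: a hypothetical
 * caller of vm_object_unwire(), whose contract is documented above.
 * The offset and length must be page aligned, and the queue argument
 * names the page queue that pages are released to once their wiring
 * is dropped.
 */
#if 0
static void
example_unwire_range(vm_object_t obj, vm_ooffset_t off, vm_size_t len)
{

	KASSERT((off & PAGE_MASK) == 0 && (len & PAGE_MASK) == 0,
	    ("example_unwire_range: unaligned range"));
	vm_object_unwire(obj, off, len, PQ_INACTIVE);
}
#endif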
*/ tobject = object; tpindex = pindex; depth = 0; do { tpindex += OFF_TO_IDX(tobject->backing_object_offset); tobject = tobject->backing_object; KASSERT(tobject != NULL, ("vm_object_unwire: missing page")); if ((tobject->flags & OBJ_FICTITIOUS) != 0) goto next_page; depth++; if (depth == locked_depth) { locked_depth++; VM_OBJECT_RLOCK(tobject); } } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { tm = m; m = TAILQ_NEXT(m, listq); } if (vm_page_trysbusy(tm) == 0) { for (tobject = object; locked_depth >= 1; locked_depth--) { t1object = tobject->backing_object; if (tm->object != tobject) VM_OBJECT_RUNLOCK(tobject); tobject = t1object; } vm_page_busy_sleep(tm, "unwbo", true); goto again; } vm_page_unwire(tm, queue); vm_page_sunbusy(tm); next_page: pindex++; } /* Release the accumulated object locks. */ for (tobject = object; locked_depth >= 1; locked_depth--) { t1object = tobject->backing_object; VM_OBJECT_RUNLOCK(tobject); tobject = t1object; } } /* * Return the vnode for the given object, or NULL if none exists. * For tmpfs objects, the function may return NULL if there is * no vnode allocated at the time of the call. */ struct vnode * vm_object_vnode(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_LOCKED(object); vm_pager_getvp(object, &vp, NULL); return (vp); } /* * Busy the vm object. This prevents new pages belonging to the object from * becoming busy. Existing pages persist as busy. Callers are responsible * for checking page state before proceeding. */ void vm_object_busy(vm_object_t obj) { VM_OBJECT_ASSERT_LOCKED(obj); blockcount_acquire(&obj->busy, 1); /* The fence is required to order loads of page busy. */ atomic_thread_fence_acq_rel(); } void vm_object_unbusy(vm_object_t obj) { blockcount_release(&obj->busy, 1); } void vm_object_busy_wait(vm_object_t obj, const char *wmesg) { VM_OBJECT_ASSERT_UNLOCKED(obj); (void)blockcount_sleep(&obj->busy, NULL, wmesg, PVM); } -/* - * Return the kvme type of the given object. - * If vpp is not NULL, set it to the object's vm_object_vnode() or NULL. - */ -int -vm_object_kvme_type(vm_object_t object, struct vnode **vpp) -{ - - VM_OBJECT_ASSERT_LOCKED(object); - if (vpp != NULL) - *vpp = vm_object_vnode(object); - switch (object->type) { - case OBJT_DEFAULT: - return (KVME_TYPE_DEFAULT); - case OBJT_VNODE: - return (KVME_TYPE_VNODE); - case OBJT_SWAP: - return (KVME_TYPE_SWAP); - case OBJT_SWAP_TMPFS: - return (KVME_TYPE_VNODE); - case OBJT_DEVICE: - return (KVME_TYPE_DEVICE); - case OBJT_PHYS: - return (KVME_TYPE_PHYS); - case OBJT_DEAD: - return (KVME_TYPE_DEAD); - case OBJT_SG: - return (KVME_TYPE_SG); - case OBJT_MGTDEVICE: - return (KVME_TYPE_MGTDEVICE); - default: - return (KVME_TYPE_UNKNOWN); - } -} - static int sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) { struct kinfo_vmobject *kvo; char *fullpath, *freepath; struct vnode *vp; struct vattr va; vm_object_t obj; vm_page_t m; u_long sp; int count, error; if (req->oldptr == NULL) { /* * If an old buffer has not been provided, generate an * estimate of the space needed for a subsequent call. */ mtx_lock(&vm_object_list_mtx); count = 0; TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; count++; } mtx_unlock(&vm_object_list_mtx); return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) * count * 11 / 10)); } kvo = malloc(sizeof(*kvo), M_TEMP, M_WAITOK); error = 0; /* * VM objects are type stable and are never removed from the * list once added. This allows us to safely read obj->object_list * after reacquiring the VM object lock. 
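/*
 * Illustrative sketch only, not part of this change: the object busy
 * interface above, as a hypothetical lockless reader would use it.
 * Busying the object keeps new pages from becoming busy; existing
 * pages may still be busy and must be checked by the caller.
 */
#if 0
static void
example_busy_section(vm_object_t obj)
{

	VM_OBJECT_RLOCK(obj);
	vm_object_busy(obj);
	VM_OBJECT_RUNLOCK(obj);

	/* Inspect pages here; none can newly become busy. */

	vm_object_unbusy(obj);
}
#endif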
*/ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; VM_OBJECT_RLOCK(obj); if (obj->type == OBJT_DEAD) { VM_OBJECT_RUNLOCK(obj); continue; } mtx_unlock(&vm_object_list_mtx); kvo->kvo_size = ptoa(obj->size); kvo->kvo_resident = obj->resident_page_count; kvo->kvo_ref_count = obj->ref_count; kvo->kvo_shadow_count = obj->shadow_count; kvo->kvo_memattr = obj->memattr; kvo->kvo_active = 0; kvo->kvo_inactive = 0; TAILQ_FOREACH(m, &obj->memq, listq) { /* * A page may belong to the object but be * dequeued and set to PQ_NONE while the * object lock is not held. This makes the * reads of m->queue below racy, and we do not * count pages set to PQ_NONE. However, this * sysctl is only meant to give an * approximation of the system anyway. */ if (m->a.queue == PQ_ACTIVE) kvo->kvo_active++; else if (m->a.queue == PQ_INACTIVE) kvo->kvo_inactive++; } kvo->kvo_vn_fileid = 0; kvo->kvo_vn_fsid = 0; kvo->kvo_vn_fsid_freebsd11 = 0; freepath = NULL; fullpath = ""; kvo->kvo_type = vm_object_kvme_type(obj, &vp); if (vp != NULL) { vref(vp); } else if ((obj->flags & OBJ_ANON) != 0) { MPASS(kvo->kvo_type == KVME_TYPE_DEFAULT || kvo->kvo_type == KVME_TYPE_SWAP); kvo->kvo_me = (uintptr_t)obj; /* tmpfs objs are reported as vnodes */ kvo->kvo_backing_obj = (uintptr_t)obj->backing_object; sp = swap_pager_swapped_pages(obj); kvo->kvo_swapped = sp > UINT32_MAX ? UINT32_MAX : sp; } VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { kvo->kvo_vn_fileid = va.va_fileid; kvo->kvo_vn_fsid = va.va_fsid; kvo->kvo_vn_fsid_freebsd11 = va.va_fsid; /* truncate */ } vput(vp); } strlcpy(kvo->kvo_path, fullpath, sizeof(kvo->kvo_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kvo->kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) + strlen(kvo->kvo_path) + 1; kvo->kvo_structsize = roundup(kvo->kvo_structsize, sizeof(uint64_t)); error = SYSCTL_OUT(req, kvo, kvo->kvo_structsize); mtx_lock(&vm_object_list_mtx); if (error) break; } mtx_unlock(&vm_object_list_mtx); free(kvo, M_TEMP); return (error); } SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; if (map == 0) return 0; if (entry == 0) { VM_MAP_ENTRY_FOREACH(tmpe, map) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; VM_MAP_ENTRY_FOREACH(tmpe, tmpm) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere 
* and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if ((object->flags & OBJ_ANON) != 0) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } if (db_pager_quit) return; } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags, object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); if (db_pager_quit) break; } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; vm_pindex_t fidx; vm_paddr_t pa; vm_page_t m, prev_m; int rcount, nl, c; nl = 0; TAILQ_FOREACH(object, &vm_object_list, object_list) { db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; pa = -1; TAILQ_FOREACH(m, &object->memq, listq) { if (m->pindex > 128) break; if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL && prev_m->pindex + 1 != m->pindex) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = m->pindex; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c index 240847903f62..9476058a75bf 100644 --- a/sys/vm/vm_pager.c +++ b/sys/vm/vm_pager.c @@ -1,532 +1,548 @@ /*- * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. 
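Switching briefly to the consumer side of sysctl_vm_object_list() above: each record is packed down to kvo_structsize bytes, so userland must advance by that field rather than by sizeof(struct kinfo_vmobject). A rough sketch under that assumption (error handling abbreviated; not part of this patch):

#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/user.h>

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void
example_dump_vm_objects(void)
{
	struct kinfo_vmobject *kvo;
	char *buf, *bp;
	size_t len;

	/* A NULL old pointer triggers the size-estimate path above. */
	if (sysctlbyname("vm.objects", NULL, &len, NULL, 0) != 0)
		return;
	if ((buf = malloc(len)) == NULL)
		return;
	if (sysctlbyname("vm.objects", buf, &len, NULL, 0) == 0) {
		for (bp = buf; bp < buf + len; bp += kvo->kvo_structsize) {
			kvo = (struct kinfo_vmobject *)(void *)bp;
			printf("type %d resident %ju path %s\n",
			    kvo->kvo_type, (uintmax_t)kvo->kvo_resident,
			    kvo->kvo_path);
		}
	}
	free(buf);
}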
* * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pager.c 8.6 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Paging space routine stubs. Emulates a matchmaker-like interface * for builtin pagers. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include uma_zone_t pbuf_zone; static int pbuf_init(void *, int, int); static int pbuf_ctor(void *, int, void *, int); static void pbuf_dtor(void *, int, void *); static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static vm_object_t dead_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); static void dead_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t dead_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static void dead_pager_dealloc(vm_object_t); static void dead_pager_getvp(vm_object_t, struct vnode **, bool *); static int dead_pager_getpages(vm_object_t obj, vm_page_t *ma, int count, int *rbehind, int *rahead) { return (VM_PAGER_FAIL); } static vm_object_t dead_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t off, struct ucred *cred) { return (NULL); } static void dead_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { int i; for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_AGAIN; } static int dead_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *prev, int *next) { if (prev != NULL) *prev = 0; if (next != NULL) *next = 0; return (FALSE); } static void dead_pager_dealloc(vm_object_t object) { } static void dead_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp) { /* * For OBJT_DEAD objects, v_writecount was handled in * vnode_pager_dealloc(). */ } static const struct pagerops deadpagerops = { + .pgo_kvme_type = KVME_TYPE_DEAD, .pgo_alloc = dead_pager_alloc, .pgo_dealloc = dead_pager_dealloc, .pgo_getpages = dead_pager_getpages, .pgo_putpages = dead_pager_putpages, .pgo_haspage = dead_pager_haspage, .pgo_getvp = dead_pager_getvp, }; const struct pagerops *pagertab[] __read_mostly = { [OBJT_DEFAULT] = &defaultpagerops, [OBJT_SWAP] = &swappagerops, [OBJT_VNODE] = &vnodepagerops, [OBJT_DEVICE] = &devicepagerops, [OBJT_PHYS] = &physpagerops, [OBJT_DEAD] = &deadpagerops, [OBJT_SG] = &sgpagerops, [OBJT_MGTDEVICE] = &mgtdevicepagerops, [OBJT_SWAP_TMPFS] = &swaptmpfspagerops, }; void vm_pager_init(void) { const struct pagerops **pgops; /* * Initialize known pagers */ for (pgops = pagertab; pgops < &pagertab[nitems(pagertab)]; pgops++) if ((*pgops)->pgo_init != NULL) (*(*pgops)->pgo_init)(); } static int nswbuf_max; void vm_pager_bufferinit(void) { /* Main zone for paging bufs. */ pbuf_zone = uma_zcreate("pbuf", sizeof(struct buf) + PBUF_PAGES * sizeof(vm_page_t), pbuf_ctor, pbuf_dtor, pbuf_init, NULL, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); /* Few systems may still use this zone directly, so it needs a limit. */ nswbuf_max += uma_zone_set_max(pbuf_zone, NSWBUF_MIN); } uma_zone_t pbuf_zsecond_create(const char *name, int max) { uma_zone_t zone; zone = uma_zsecond_create(name, pbuf_ctor, pbuf_dtor, NULL, NULL, pbuf_zone); /* * uma_prealloc() rounds up to items per slab. If we would prealloc * immediately on every pbuf_zsecond_create(), we may accumulate too * much of difference between hard limit and prealloced items, which * means wasted memory. 
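To keep that prealloc accounting concrete, a sketch of the consumer pattern pbuf_zsecond_create() expects: create a capped secondary zone once at boot, then allocate and free pbufs from it around each I/O. The zone name "examplepbuf" and the 128-buffer cap are illustrative assumptions, and the I/O itself is elided:

static uma_zone_t example_pbuf_zone;

static void
example_pbuf_init(void *arg __unused)
{
	/* Secondary zone sharing the pbuf keg, capped at 128 buffers. */
	example_pbuf_zone = pbuf_zsecond_create("examplepbuf", 128);
}
SYSINIT(example_pbuf, SI_SUB_CPU, SI_ORDER_ANY, example_pbuf_init, NULL);

static void
example_issue_io(void)
{
	struct buf *bp;

	bp = uma_zalloc(example_pbuf_zone, M_WAITOK);
	/* ... build a minimal buffer header and start the transfer ... */
	uma_zfree(example_pbuf_zone, bp);
}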
*/ if (nswbuf_max > 0) nswbuf_max += uma_zone_set_max(zone, max); else uma_prealloc(pbuf_zone, uma_zone_set_max(zone, max)); return (zone); } static void pbuf_prealloc(void *arg __unused) { uma_prealloc(pbuf_zone, nswbuf_max); nswbuf_max = -1; } SYSINIT(pbuf, SI_SUB_KTHREAD_BUF, SI_ORDER_ANY, pbuf_prealloc, NULL); /* * Allocate an instance of a pager of the given type. * Size, protection and offset parameters are passed in for pagers that * need to perform page-level validation (e.g. the device pager). */ vm_object_t vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t off, struct ucred *cred) { vm_object_t ret; const struct pagerops *ops; ops = pagertab[type]; if (ops) ret = (*ops->pgo_alloc)(handle, size, prot, off, cred); else ret = NULL; return (ret); } /* * The object must be locked. */ void vm_pager_deallocate(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); (*pagertab[object->type]->pgo_dealloc) (object); } static void vm_pager_assert_in(vm_object_t object, vm_page_t *m, int count) { #ifdef INVARIANTS /* * All pages must be consecutive, busied, not mapped, not fully valid, * not dirty and belong to the proper object. Some pages may be the * bogus page, but the first and last pages must be a real ones. */ VM_OBJECT_ASSERT_UNLOCKED(object); VM_OBJECT_ASSERT_PAGING(object); KASSERT(count > 0, ("%s: 0 count", __func__)); for (int i = 0 ; i < count; i++) { if (m[i] == bogus_page) { KASSERT(i != 0 && i != count - 1, ("%s: page %d is the bogus page", __func__, i)); continue; } vm_page_assert_xbusied(m[i]); KASSERT(!pmap_page_is_mapped(m[i]), ("%s: page %p is mapped", __func__, m[i])); KASSERT(m[i]->valid != VM_PAGE_BITS_ALL, ("%s: request for a valid page %p", __func__, m[i])); KASSERT(m[i]->dirty == 0, ("%s: page %p is dirty", __func__, m[i])); KASSERT(m[i]->object == object, ("%s: wrong object %p/%p", __func__, object, m[i]->object)); KASSERT(m[i]->pindex == m[0]->pindex + i, ("%s: page %p isn't consecutive", __func__, m[i])); } #endif } /* * Page in the pages for the object using its associated pager. * The requested page must be fully valid on successful return. */ int vm_pager_get_pages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { #ifdef INVARIANTS vm_pindex_t pindex = m[0]->pindex; #endif int r; vm_pager_assert_in(object, m, count); r = (*pagertab[object->type]->pgo_getpages)(object, m, count, rbehind, rahead); if (r != VM_PAGER_OK) return (r); for (int i = 0; i < count; i++) { /* * If pager has replaced a page, assert that it had * updated the array. */ #ifdef INVARIANTS VM_OBJECT_RLOCK(object); KASSERT(m[i] == vm_page_lookup(object, pindex++), ("%s: mismatch page %p pindex %ju", __func__, m[i], (uintmax_t )pindex - 1)); VM_OBJECT_RUNLOCK(object); #endif /* * Zero out partially filled data. */ if (m[i]->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m[i], TRUE); } return (VM_PAGER_OK); } int vm_pager_get_pages_async(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg) { vm_pager_assert_in(object, m, count); return ((*pagertab[object->type]->pgo_getpages_async)(object, m, count, rbehind, rahead, iodone, arg)); } /* * vm_pager_put_pages() - inline, see vm/vm_pager.h * vm_pager_has_page() - inline, see vm/vm_pager.h */ /* * Search the specified pager object list for an object with the * specified handle. If an object with the specified handle is found, * increase its reference count and return it. Otherwise, return NULL. 
* * The pager object list must be locked. */ vm_object_t vm_pager_object_lookup(struct pagerlst *pg_list, void *handle) { vm_object_t object; TAILQ_FOREACH(object, pg_list, pager_object_list) { if (object->handle == handle) { VM_OBJECT_WLOCK(object); if ((object->flags & OBJ_DEAD) == 0) { vm_object_reference_locked(object); VM_OBJECT_WUNLOCK(object); break; } VM_OBJECT_WUNLOCK(object); } } return (object); } static int pbuf_ctor(void *mem, int size, void *arg, int flags) { struct buf *bp = mem; bp->b_vp = NULL; bp->b_bufobj = NULL; /* copied from initpbuf() */ bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = 0; /* On no queue (QUEUE_NONE) */ bp->b_data = bp->b_kvabase; bp->b_xflags = 0; bp->b_flags = B_MAXPHYS; bp->b_ioflags = 0; bp->b_iodone = NULL; bp->b_error = 0; BUF_LOCK(bp, LK_EXCLUSIVE, NULL); return (0); } static void pbuf_dtor(void *mem, int size, void *arg) { struct buf *bp = mem; if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } BUF_UNLOCK(bp); } static int pbuf_init(void *mem, int size, int flags) { struct buf *bp = mem; bp->b_kvabase = (void *)kva_alloc(ptoa(PBUF_PAGES)); if (bp->b_kvabase == NULL) return (ENOMEM); bp->b_kvasize = ptoa(PBUF_PAGES); BUF_LOCKINIT(bp); LIST_INIT(&bp->b_dep); bp->b_rcred = bp->b_wcred = NOCRED; bp->b_xflags = 0; return (0); } /* * Associate a p-buffer with a vnode. * * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. */ void pbgetvp(struct vnode *vp, struct buf *bp) { KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); KASSERT(bp->b_bufobj == NULL, ("pbgetvp: not free (bufobj)")); bp->b_vp = vp; bp->b_flags |= B_PAGING; bp->b_bufobj = &vp->v_bufobj; } /* * Associate a p-buffer with a vnode. * * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. */ void pbgetbo(struct bufobj *bo, struct buf *bp) { KASSERT(bp->b_vp == NULL, ("pbgetbo: not free (vnode)")); KASSERT(bp->b_bufobj == NULL, ("pbgetbo: not free (bufobj)")); bp->b_flags |= B_PAGING; bp->b_bufobj = bo; } /* * Disassociate a p-buffer from a vnode. */ void pbrelvp(struct buf *bp) { KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); KASSERT(bp->b_bufobj != NULL, ("pbrelvp: NULL bufobj")); KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0, ("pbrelvp: pager buf on vnode list.")); bp->b_vp = NULL; bp->b_bufobj = NULL; bp->b_flags &= ~B_PAGING; } /* * Disassociate a p-buffer from a bufobj. */ void pbrelbo(struct buf *bp) { KASSERT(bp->b_vp == NULL, ("pbrelbo: vnode")); KASSERT(bp->b_bufobj != NULL, ("pbrelbo: NULL bufobj")); KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0, ("pbrelbo: pager buf on vnode list.")); bp->b_bufobj = NULL; bp->b_flags &= ~B_PAGING; } void vm_object_set_writeable_dirty(vm_object_t object) { pgo_set_writeable_dirty_t *method; method = pagertab[object->type]->pgo_set_writeable_dirty; if (method != NULL) method(object); } bool vm_object_mightbedirty(vm_object_t object) { pgo_mightbedirty_t *method; method = pagertab[object->type]->pgo_mightbedirty; if (method == NULL) return (false); return (method(object)); } + +/* + * Return the kvme type of the given object. + * If vpp is not NULL, set it to the object's vm_object_vnode() or NULL. 
+ */ +int +vm_object_kvme_type(vm_object_t object, struct vnode **vpp) +{ + VM_OBJECT_ASSERT_LOCKED(object); + + if (vpp != NULL) + *vpp = vm_object_vnode(object); + return (pagertab[object->type]->pgo_kvme_type); +} diff --git a/sys/vm/vm_pager.h b/sys/vm/vm_pager.h index 68c5fe42351e..5e0261f818cf 100644 --- a/sys/vm/vm_pager.h +++ b/sys/vm/vm_pager.h @@ -1,286 +1,287 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_pager.h 8.4 (Berkeley) 1/12/94 * $FreeBSD$ */ /* * Pager routine interface definition. */ #ifndef _VM_PAGER_ #define _VM_PAGER_ #include TAILQ_HEAD(pagerlst, vm_object); struct vnode; typedef void pgo_init_t(void); typedef vm_object_t pgo_alloc_t(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); typedef void pgo_dealloc_t(vm_object_t); typedef int pgo_getpages_t(vm_object_t, vm_page_t *, int, int *, int *); typedef void pgo_getpages_iodone_t(void *, vm_page_t *, int, int); typedef int pgo_getpages_async_t(vm_object_t, vm_page_t *, int, int *, int *, pgo_getpages_iodone_t, void *); typedef void pgo_putpages_t(vm_object_t, vm_page_t *, int, int, int *); typedef boolean_t pgo_haspage_t(vm_object_t, vm_pindex_t, int *, int *); typedef int pgo_populate_t(vm_object_t, vm_pindex_t, int, vm_prot_t, vm_pindex_t *, vm_pindex_t *); typedef void pgo_pageunswapped_t(vm_page_t); typedef void pgo_writecount_t(vm_object_t, vm_offset_t, vm_offset_t); typedef void pgo_set_writeable_dirty_t(vm_object_t); typedef bool pgo_mightbedirty_t(vm_object_t); typedef void pgo_getvp_t(vm_object_t object, struct vnode **vpp, bool *vp_heldp); typedef void pgo_freespace_t(vm_object_t object, vm_pindex_t start, vm_size_t size); struct pagerops { + int pgo_kvme_type; pgo_init_t *pgo_init; /* Initialize pager. */ pgo_alloc_t *pgo_alloc; /* Allocate pager. 
*/ pgo_dealloc_t *pgo_dealloc; /* Disassociate. */ pgo_getpages_t *pgo_getpages; /* Get (read) page. */ pgo_getpages_async_t *pgo_getpages_async; /* Get page asyncly. */ pgo_putpages_t *pgo_putpages; /* Put (write) page. */ pgo_haspage_t *pgo_haspage; /* Query page. */ pgo_populate_t *pgo_populate; /* Bulk spec pagein. */ pgo_pageunswapped_t *pgo_pageunswapped; pgo_writecount_t *pgo_update_writecount; pgo_writecount_t *pgo_release_writecount; pgo_set_writeable_dirty_t *pgo_set_writeable_dirty; pgo_mightbedirty_t *pgo_mightbedirty; pgo_getvp_t *pgo_getvp; pgo_freespace_t *pgo_freespace; }; extern const struct pagerops defaultpagerops; extern const struct pagerops swappagerops; extern const struct pagerops vnodepagerops; extern const struct pagerops devicepagerops; extern const struct pagerops physpagerops; extern const struct pagerops sgpagerops; extern const struct pagerops mgtdevicepagerops; extern const struct pagerops swaptmpfspagerops; /* * get/put return values * OK operation was successful * BAD specified data was out of the accepted range * FAIL specified data was in range, but doesn't exist * PEND operations was initiated but not completed * ERROR error while accessing data that is in range and exists * AGAIN temporary resource shortage prevented operation from happening */ #define VM_PAGER_OK 0 #define VM_PAGER_BAD 1 #define VM_PAGER_FAIL 2 #define VM_PAGER_PEND 3 #define VM_PAGER_ERROR 4 #define VM_PAGER_AGAIN 5 #define VM_PAGER_PUT_SYNC 0x0001 #define VM_PAGER_PUT_INVAL 0x0002 #define VM_PAGER_PUT_NOREUSE 0x0004 #define VM_PAGER_CLUSTER_OK 0x0008 #ifdef _KERNEL extern const struct pagerops *pagertab[] __read_mostly; extern struct mtx_padalign pbuf_mtx; /* * Number of pages that pbuf buffer can store in b_pages. * It is +1 to allow for unaligned data buffer of maxphys size. */ #define PBUF_PAGES (atop(maxphys) + 1) vm_object_t vm_pager_allocate(objtype_t, void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *); void vm_pager_bufferinit(void); void vm_pager_deallocate(vm_object_t); int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int *, int *); int vm_pager_get_pages_async(vm_object_t, vm_page_t *, int, int *, int *, pgo_getpages_iodone_t, void *); void vm_pager_init(void); vm_object_t vm_pager_object_lookup(struct pagerlst *, void *); static __inline void vm_pager_put_pages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { VM_OBJECT_ASSERT_WLOCKED(object); (*pagertab[object->type]->pgo_putpages) (object, m, count, flags, rtvals); } /* * vm_pager_haspage * * Check to see if an object's pager has the requested page. The * object's pager will also set before and after to give the caller * some idea of the number of pages before and after the requested * page can be I/O'd efficiently. * * The object must be locked. */ static __inline boolean_t vm_pager_has_page(vm_object_t object, vm_pindex_t offset, int *before, int *after) { boolean_t ret; VM_OBJECT_ASSERT_LOCKED(object); ret = (*pagertab[object->type]->pgo_haspage) (object, offset, before, after); return (ret); } static __inline int vm_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) { MPASS((object->flags & OBJ_POPULATE) != 0); MPASS(pidx < object->size); MPASS(blockcount_read(&object->paging_in_progress) > 0); return ((*pagertab[object->type]->pgo_populate)(object, pidx, fault_type, max_prot, first, last)); } /* * vm_pager_page_unswapped * * Destroy swap associated with the page. 
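The new pgo_kvme_type member declared above is what replaces the central switch over object types removed earlier in this patch: each pager now states its kinfo type next to its other methods. A sketch of what that looks like for a hypothetical pager (examplepagerops and the example_pager_* callbacks are assumptions, not part of the patch):

static pgo_alloc_t example_pager_alloc;
static pgo_dealloc_t example_pager_dealloc;
static pgo_getpages_t example_pager_getpages;
static pgo_putpages_t example_pager_putpages;
static pgo_haspage_t example_pager_haspage;

static const struct pagerops examplepagerops = {
	.pgo_kvme_type = KVME_TYPE_UNKNOWN,	/* reported by vm_object_kvme_type() */
	.pgo_alloc = example_pager_alloc,
	.pgo_dealloc = example_pager_dealloc,
	.pgo_getpages = example_pager_getpages,
	.pgo_putpages = example_pager_putpages,
	.pgo_haspage = example_pager_haspage,
};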
* * XXX: A much better name would be "vm_pager_page_dirtied()" * XXX: It is not obvious if this could be profitably used by any * XXX: pagers besides the swap_pager or if it should even be a * XXX: generic pager_op in the first place. */ static __inline void vm_pager_page_unswapped(vm_page_t m) { pgo_pageunswapped_t *method; method = pagertab[m->object->type]->pgo_pageunswapped; if (method != NULL) method(m); } static __inline void vm_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { pgo_writecount_t *method; method = pagertab[object->type]->pgo_update_writecount; if (method != NULL) method(object, start, end); } static __inline void vm_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { pgo_writecount_t *method; method = pagertab[object->type]->pgo_release_writecount; if (method != NULL) method(object, start, end); } static __inline void vm_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp) { pgo_getvp_t *method; *vpp = NULL; if (vp_heldp != NULL) *vp_heldp = false; method = pagertab[object->type]->pgo_getvp; if (method != NULL) method(object, vpp, vp_heldp); } static __inline void vm_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) { pgo_freespace_t *method; method = pagertab[object->type]->pgo_freespace; if (method != NULL) method(object, start, size); } struct cdev_pager_ops { int (*cdev_pg_fault)(vm_object_t vm_obj, vm_ooffset_t offset, int prot, vm_page_t *mres); int (*cdev_pg_populate)(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last); int (*cdev_pg_ctor)(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color); void (*cdev_pg_dtor)(void *handle); }; vm_object_t cdev_pager_allocate(void *handle, enum obj_type tp, const struct cdev_pager_ops *ops, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred); vm_object_t cdev_pager_lookup(void *handle); void cdev_pager_free_page(vm_object_t object, vm_page_t m); struct phys_pager_ops { int (*phys_pg_getpages)(vm_object_t vm_obj, vm_page_t *m, int count, int *rbehind, int *rahead); int (*phys_pg_populate)(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last); boolean_t (*phys_pg_haspage)(vm_object_t obj, vm_pindex_t pindex, int *before, int *after); void (*phys_pg_ctor)(vm_object_t vm_obj, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred); void (*phys_pg_dtor)(vm_object_t vm_obj); }; extern const struct phys_pager_ops default_phys_pg_ops; vm_object_t phys_pager_allocate(void *handle, const struct phys_pager_ops *ops, void *data, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred); #endif /* _KERNEL */ #endif /* _VM_PAGER_ */ diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index fb0b4068183b..d167fcc555fb 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -1,1612 +1,1614 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993, 1994 John S. Dyson * Copyright (c) 1995, David Greenman * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. 
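Since the cdev_pager interface is declared just above, a sketch of the usual driver-side flow: a d_mmap_single callback builds (or looks up) an OBJT_DEVICE object whose faults are served by a driver-supplied cdev_pg_fault. Everything named example_* here is hypothetical, and the ctor/dtor and fault bodies are elided:

static int example_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
    vm_ooffset_t foff, struct ucred *cred, u_short *color);
static void example_pg_dtor(void *handle);
static int example_pg_fault(vm_object_t obj, vm_ooffset_t offset, int prot,
    vm_page_t *mres);

static const struct cdev_pager_ops example_cdev_pager_ops = {
	.cdev_pg_ctor = example_pg_ctor,
	.cdev_pg_dtor = example_pg_dtor,
	.cdev_pg_fault = example_pg_fault,
};

static int
example_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size,
    struct vm_object **objp, int nprot)
{
	vm_object_t obj;

	obj = cdev_pager_allocate(cdev, OBJT_DEVICE, &example_cdev_pager_ops,
	    size, nprot, *offset, curthread->td_ucred);
	if (obj == NULL)
		return (EINVAL);
	*objp = obj;
	return (0);
}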
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 */ /* * Page to/from files (vnodes). */ /* * TODO: * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will * greatly re-simplify the vnode_pager. 
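Before the implementation, a filesystem-side sketch of how the vnode pager gets attached in the first place: a VOP_OPEN implementation calls vnode_create_vobject(), defined below, to make sure the vnode has a backing VM object. example_fs_open() and example_node_size() are hypothetical names used only for illustration:

static int
example_fs_open(struct vop_open_args *ap)
{
	struct vnode *vp = ap->a_vp;

	/* Ensure vp->v_object exists so mmap() and paging can proceed. */
	vnode_create_vobject(vp, example_node_size(vp), ap->a_td);
	return (0);
}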
*/ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run); static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m); static int vnode_pager_input_old(vm_object_t object, vm_page_t m); static void vnode_pager_dealloc(vm_object_t); static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, vop_getpages_iodone_t, void *); static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *cred); static int vnode_pager_generic_getpages_done(struct buf *); static void vnode_pager_generic_getpages_done_async(struct buf *); static void vnode_pager_update_writecount(vm_object_t, vm_offset_t, vm_offset_t); static void vnode_pager_release_writecount(vm_object_t, vm_offset_t, vm_offset_t); static void vnode_pager_getvp(vm_object_t, struct vnode **, bool *); const struct pagerops vnodepagerops = { + .pgo_kvme_type = KVME_TYPE_VNODE, .pgo_alloc = vnode_pager_alloc, .pgo_dealloc = vnode_pager_dealloc, .pgo_getpages = vnode_pager_getpages, .pgo_getpages_async = vnode_pager_getpages_async, .pgo_putpages = vnode_pager_putpages, .pgo_haspage = vnode_pager_haspage, .pgo_update_writecount = vnode_pager_update_writecount, .pgo_release_writecount = vnode_pager_release_writecount, .pgo_set_writeable_dirty = vm_object_set_writeable_dirty_, .pgo_mightbedirty = vm_object_mightbedirty_, .pgo_getvp = vnode_pager_getvp, }; static struct domainset *vnode_domainset = NULL; SYSCTL_PROC(_debug, OID_AUTO, vnode_domainset, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_RW, &vnode_domainset, 0, sysctl_handle_domainset, "A", "Default vnode NUMA policy"); static int nvnpbufs; SYSCTL_INT(_vm, OID_AUTO, vnode_pbufs, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &nvnpbufs, 0, "number of physical buffers allocated for vnode pager"); static uma_zone_t vnode_pbuf_zone; static void vnode_pager_init(void *dummy) { #ifdef __LP64__ nvnpbufs = nswbuf * 2; #else nvnpbufs = nswbuf / 2; #endif TUNABLE_INT_FETCH("vm.vnode_pbufs", &nvnpbufs); vnode_pbuf_zone = pbuf_zsecond_create("vnpbuf", nvnpbufs); } SYSINIT(vnode_pager, SI_SUB_CPU, SI_ORDER_ANY, vnode_pager_init, NULL); /* Create the VM system backing object for this vnode */ int vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td) { vm_object_t object; vm_ooffset_t size = isize; struct vattr va; bool last; if (!vn_isdisk(vp) && vn_canvmio(vp) == FALSE) return (0); object = vp->v_object; if (object != NULL) return (0); if (size == 0) { if (vn_isdisk(vp)) { size = IDX_TO_OFF(INT_MAX); } else { if (VOP_GETATTR(vp, &va, td->td_ucred)) return (0); size = va.va_size; } } object = vnode_pager_alloc(vp, size, 0, 0, td->td_ucred); /* * Dereference the reference we just created. This assumes * that the object is associated with the vp. We still have * to serialize with vnode_pager_dealloc() for the last * potential reference. 
*/ VM_OBJECT_RLOCK(object); last = refcount_release(&object->ref_count); VM_OBJECT_RUNLOCK(object); if (last) vrele(vp); KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object")); return (0); } void vnode_destroy_vobject(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; if (obj == NULL || obj->handle != vp) return; ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject"); VM_OBJECT_WLOCK(obj); MPASS(obj->type == OBJT_VNODE); umtx_shm_object_terminated(obj); if (obj->ref_count == 0) { KASSERT((obj->flags & OBJ_DEAD) == 0, ("vnode_destroy_vobject: Terminating dead object")); vm_object_set_flag(obj, OBJ_DEAD); /* * Clean pages and flush buffers. */ vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(obj); vinvalbuf(vp, V_SAVE, 0, 0); BO_LOCK(&vp->v_bufobj); vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); VM_OBJECT_WLOCK(obj); vm_object_terminate(obj); } else { /* * Woe to the process that tries to page now :-). */ vm_pager_deallocate(obj); VM_OBJECT_WUNLOCK(obj); } KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object)); } /* * Allocate (or lookup) pager for a vnode. * Handle is a vnode pointer. */ vm_object_t vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; struct vnode *vp; /* * Pageout to vnode, no can do yet. */ if (handle == NULL) return (NULL); vp = (struct vnode *)handle; ASSERT_VOP_LOCKED(vp, "vnode_pager_alloc"); VNPASS(vp->v_usecount > 0, vp); retry: object = vp->v_object; if (object == NULL) { /* * Add an object of the appropriate size */ object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size))); object->un_pager.vnp.vnp_size = size; object->un_pager.vnp.writemappings = 0; object->domain.dr_policy = vnode_domainset; object->handle = handle; if ((vp->v_vflag & VV_VMSIZEVNLOCK) != 0) { VM_OBJECT_WLOCK(object); vm_object_set_flag(object, OBJ_SIZEVNLOCK); VM_OBJECT_WUNLOCK(object); } VI_LOCK(vp); if (vp->v_object != NULL) { /* * Object has been created while we were allocating. */ VI_UNLOCK(vp); VM_OBJECT_WLOCK(object); KASSERT(object->ref_count == 1, ("leaked ref %p %d", object, object->ref_count)); object->type = OBJT_DEAD; refcount_init(&object->ref_count, 0); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); goto retry; } vp->v_object = object; VI_UNLOCK(vp); vrefact(vp); } else { vm_object_reference(object); #if VM_NRESERVLEVEL > 0 if ((object->flags & OBJ_COLORED) == 0) { VM_OBJECT_WLOCK(object); vm_object_color(object, 0); VM_OBJECT_WUNLOCK(object); } #endif } return (object); } /* * The object must be locked. */ static void vnode_pager_dealloc(vm_object_t object) { struct vnode *vp; int refs; vp = object->handle; if (vp == NULL) panic("vnode_pager_dealloc: pager already dealloced"); VM_OBJECT_ASSERT_WLOCKED(object); vm_object_pip_wait(object, "vnpdea"); refs = object->ref_count; object->handle = NULL; object->type = OBJT_DEAD; ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc"); if (object->un_pager.vnp.writemappings > 0) { object->un_pager.vnp.writemappings = 0; VOP_ADD_WRITECOUNT_CHECKED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } vp->v_object = NULL; VI_LOCK(vp); /* * vm_map_entry_set_vnode_text() cannot reach this vnode by * following object->handle. Clear all text references now. * This also clears the transient references from * kern_execve(), which is fine because dead_vnodeops uses nop * for VOP_UNSET_TEXT(). 
*/ if (vp->v_writecount < 0) vp->v_writecount = 0; VI_UNLOCK(vp); VM_OBJECT_WUNLOCK(object); if (refs > 0) vunref(vp); VM_OBJECT_WLOCK(object); } static boolean_t vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { struct vnode *vp = object->handle; daddr_t bn; uintptr_t lockstate; int err; daddr_t reqblock; int poff; int bsize; int pagesperblock, blocksperpage; VM_OBJECT_ASSERT_LOCKED(object); /* * If no vp or vp is doomed or marked transparent to VM, we do not * have the page. */ if (vp == NULL || VN_IS_DOOMED(vp)) return FALSE; /* * If the offset is beyond end of file we do * not have the page. */ if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size) return FALSE; bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; blocksperpage = 0; if (pagesperblock > 0) { reqblock = pindex / pagesperblock; } else { blocksperpage = (PAGE_SIZE / bsize); reqblock = pindex * blocksperpage; } lockstate = VM_OBJECT_DROP(object); err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before); VM_OBJECT_PICKUP(object, lockstate); if (err) return TRUE; if (bn == -1) return FALSE; if (pagesperblock > 0) { poff = pindex - (reqblock * pagesperblock); if (before) { *before *= pagesperblock; *before += poff; } if (after) { /* * The BMAP vop can report a partial block in the * 'after', but must not report blocks after EOF. * Assert the latter, and truncate 'after' in case * of the former. */ KASSERT((reqblock + *after) * pagesperblock < roundup2(object->size, pagesperblock), ("%s: reqblock %jd after %d size %ju", __func__, (intmax_t )reqblock, *after, (uintmax_t )object->size)); *after *= pagesperblock; *after += pagesperblock - (poff + 1); if (pindex + *after >= object->size) *after = object->size - 1 - pindex; } } else { if (before) { *before /= blocksperpage; } if (after) { *after /= blocksperpage; } } return TRUE; } /* * Lets the VM system know about a change in size for a file. * We adjust our own internal size and flush any cached pages in * the associated object that are affected by the size change. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. */ void vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize) { vm_object_t object; vm_page_t m; vm_pindex_t nobjsize; if ((object = vp->v_object) == NULL) return; #ifdef DEBUG_VFS_LOCKS { struct mount *mp; mp = vp->v_mount; if (mp != NULL && (mp->mnt_kern_flag & MNTK_VMSETSIZE_BUG) == 0) assert_vop_elocked(vp, "vnode_pager_setsize and not locked vnode"); } #endif VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); return; } KASSERT(object->type == OBJT_VNODE, ("not vnode-backed object %p", object)); if (nsize == object->un_pager.vnp.vnp_size) { /* * Hasn't changed size */ VM_OBJECT_WUNLOCK(object); return; } nobjsize = OFF_TO_IDX(nsize + PAGE_MASK); if (nsize < object->un_pager.vnp.vnp_size) { /* * File has shrunk. Toss any cached pages beyond the new EOF. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode. * * XXX for some reason (I don't know yet), if we take a * completely invalid page and mark it partially valid * it can screw up NFS reads, so we don't allow the case. 
*/ if (!(nsize & PAGE_MASK)) goto out; m = vm_page_grab(object, OFF_TO_IDX(nsize), VM_ALLOC_NOCREAT); if (m == NULL) goto out; if (!vm_page_none_valid(m)) { int base = (int)nsize & PAGE_MASK; int size = PAGE_SIZE - base; /* * Clear out partial-page garbage in case * the page has been mapped. */ pmap_zero_page_area(m, base, size); /* * Update the valid bits to reflect the blocks that * have been zeroed. Some of these valid bits may * have already been set. */ vm_page_set_valid_range(m, base, size); /* * Round "base" to the next block boundary so that the * dirty bit for a partially zeroed block is not * cleared. */ base = roundup2(base, DEV_BSIZE); /* * Clear out partial-page dirty bits. * * note that we do not clear out the valid * bits. This would prevent bogus_page * replacement from working properly. */ vm_page_clear_dirty(m, base, PAGE_SIZE - base); } vm_page_xunbusy(m); } out: #if defined(__powerpc__) && !defined(__powerpc64__) object->un_pager.vnp.vnp_size = nsize; #else atomic_store_64(&object->un_pager.vnp.vnp_size, nsize); #endif object->size = nobjsize; VM_OBJECT_WUNLOCK(object); } /* * calculate the linear (byte) disk address of specified virtual * file address */ static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run) { int bsize; int err; daddr_t vblock; daddr_t voffset; if (VN_IS_DOOMED(vp)) return -1; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL); if (err == 0) { if (*rtaddress != -1) *rtaddress += voffset / DEV_BSIZE; if (run) { *run += 1; *run *= bsize / PAGE_SIZE; *run -= voffset / PAGE_SIZE; } } return (err); } /* * small block filesystem vnode pager input */ static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m) { struct vnode *vp; struct bufobj *bo; struct buf *bp; struct sf_buf *sf; daddr_t fileaddr; vm_offset_t bsize; vm_page_bits_t bits; int error, i; error = 0; vp = object->handle; if (VN_IS_DOOMED(vp)) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &bo, 0, NULL, NULL); sf = sf_buf_alloc(m, 0); for (i = 0; i < PAGE_SIZE / bsize; i++) { vm_ooffset_t address; bits = vm_page_bits(i * bsize, bsize); if (m->valid & bits) continue; address = IDX_TO_OFF(m->pindex) + i * bsize; if (address >= object->un_pager.vnp.vnp_size) { fileaddr = -1; } else { error = vnode_pager_addr(vp, address, &fileaddr, NULL); if (error) break; } if (fileaddr != -1) { bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize; bp->b_blkno = fileaddr; pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bsize; bp->b_bufsize = bsize; bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); bwait(bp, PVM, "vnsrd"); if ((bp->b_ioflags & BIO_ERROR) != 0) { KASSERT(bp->b_error != 0, ("%s: buf error but b_error == 0\n", __func__)); error = bp->b_error; } /* * free the buffer header back to the swap buffer pool */ bp->b_vp = NULL; pbrelbo(bp); uma_zfree(vnode_pbuf_zone, bp); if (error) break; } else bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize); KASSERT((m->dirty & bits) == 0, ("vnode_pager_input_smlfs: 
page %p is dirty", m)); vm_page_bits_set(m, &m->valid, bits); } sf_buf_free(sf); if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; } /* * old style vnode pager input routine */ static int vnode_pager_input_old(vm_object_t object, vm_page_t m) { struct uio auio; struct iovec aiov; int error; int size; struct sf_buf *sf; struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; /* * Return failure if beyond current EOF */ if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex); vp = object->handle; VM_OBJECT_WUNLOCK(object); /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. */ sf = sf_buf_alloc(m, 0); aiov.iov_base = (caddr_t)sf_buf_kva(sf); aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = IDX_TO_OFF(m->pindex); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; auio.uio_td = curthread; error = VOP_READ(vp, &auio, 0, curthread->td_ucred); if (!error) { int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) bzero((caddr_t)sf_buf_kva(sf) + count, PAGE_SIZE - count); } sf_buf_free(sf); VM_OBJECT_WLOCK(object); } KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m)); if (!error) vm_page_valid(m); return error ? VM_PAGER_ERROR : VM_PAGER_OK; } /* * generic vnode pager input routine */ /* * Local media VFS's that do not implement their own VOP_GETPAGES * should have their VOP_GETPAGES call to vnode_pager_generic_getpages() * to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_GETPAGES. */ static int vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { struct vnode *vp; int rtval; /* Handle is stable with paging in progress. */ vp = object->handle; rtval = VOP_GETPAGES(vp, m, count, rbehind, rahead); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages not implemented\n")); return rtval; } static int vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg) { struct vnode *vp; int rtval; vp = object->handle; rtval = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages_async not implemented\n")); return (rtval); } /* * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for * local filesystems, where partially valid pages can only occur at * the end of file. */ int vnode_pager_local_getpages(struct vop_getpages_args *ap) { return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL)); } int vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap) { int error; error = vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg); if (error != 0 && ap->a_iodone != NULL) ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error); return (error); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_GETPAGES. 
*/ int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg) { vm_object_t object; struct bufobj *bo; struct buf *bp; off_t foff; #ifdef INVARIANTS off_t blkno0; #endif int bsize, pagesperblock; int error, before, after, rbehind, rahead, poff, i; int bytecount, secmask; KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("%s does not support devices", __func__)); if (VN_IS_DOOMED(vp)) return (VM_PAGER_BAD); object = vp->v_object; foff = IDX_TO_OFF(m[0]->pindex); bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; KASSERT(foff < object->un_pager.vnp.vnp_size, ("%s: page %p offset beyond vp %p size", __func__, m[0], vp)); KASSERT(count <= atop(maxphys), ("%s: requested %d pages", __func__, count)); /* * The last page has valid blocks. Invalid part can only * exist at the end of file, and the page is made fully valid * by zeroing in vm_pager_get_pages(). */ if (!vm_page_none_valid(m[count - 1]) && --count == 0) { if (iodone != NULL) iodone(arg, m, 1, 0); return (VM_PAGER_OK); } bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK); MPASS((bp->b_flags & B_MAXPHYS) != 0); /* * Get the underlying device blocks for the file with VOP_BMAP(). * If the file system doesn't support VOP_BMAP, use old way of * getting pages via VOP_READ. */ error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before); if (error == EOPNOTSUPP) { uma_zfree(vnode_pbuf_zone, bp); VM_OBJECT_WLOCK(object); for (i = 0; i < count; i++) { VM_CNT_INC(v_vnodein); VM_CNT_INC(v_vnodepgsin); error = vnode_pager_input_old(object, m[i]); if (error) break; } VM_OBJECT_WUNLOCK(object); return (error); } else if (error != 0) { uma_zfree(vnode_pbuf_zone, bp); return (VM_PAGER_ERROR); } /* * If the file system supports BMAP, but blocksize is smaller * than a page size, then use special small filesystem code. */ if (pagesperblock == 0) { uma_zfree(vnode_pbuf_zone, bp); for (i = 0; i < count; i++) { VM_CNT_INC(v_vnodein); VM_CNT_INC(v_vnodepgsin); error = vnode_pager_input_smlfs(object, m[i]); if (error) break; } return (error); } /* * A sparse file can be encountered only for a single page request, * which may not be preceded by call to vm_pager_haspage(). */ if (bp->b_blkno == -1) { KASSERT(count == 1, ("%s: array[%d] request to a sparse file %p", __func__, count, vp)); uma_zfree(vnode_pbuf_zone, bp); pmap_zero_page(m[0]); KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty", __func__, m[0])); vm_page_valid(m[0]); return (VM_PAGER_OK); } #ifdef INVARIANTS blkno0 = bp->b_blkno; #endif bp->b_blkno += (foff % bsize) / DEV_BSIZE; /* Recalculate blocks available after/before to pages. */ poff = (foff % bsize) / PAGE_SIZE; before *= pagesperblock; before += poff; after *= pagesperblock; after += pagesperblock - (poff + 1); if (m[0]->pindex + after >= object->size) after = object->size - 1 - m[0]->pindex; KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d", __func__, count, after + 1)); after -= count - 1; /* Trim requested rbehind/rahead to possible values. */ rbehind = a_rbehind ? *a_rbehind : 0; rahead = a_rahead ? *a_rahead : 0; rbehind = min(rbehind, before); rbehind = min(rbehind, m[0]->pindex); rahead = min(rahead, after); rahead = min(rahead, object->size - m[count - 1]->pindex); /* * Check that total amount of pages fit into buf. Trim rbehind and * rahead evenly if not. 
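To make the trimming below concrete with illustrative numbers: with maxphys of 1 MB, atop(maxphys) is 256 pages. For count = 16, rbehind = 200 and rahead = 200, taking the non-rounding branch, trim = 200 + 200 + 16 - 256 + 1 = 161 and sum = 400, so rbehind and rahead each drop by 161 * 200 / 400 = 80 (integer division), leaving 120 + 16 + 120 = 256 pages, which exactly fits the buffer.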
*/ if (rbehind + rahead + count > atop(maxphys)) { int trim, sum; trim = rbehind + rahead + count - atop(maxphys) + 1; sum = rbehind + rahead; if (rbehind == before) { /* Roundup rbehind trim to block size. */ rbehind -= roundup(trim * rbehind / sum, pagesperblock); if (rbehind < 0) rbehind = 0; } else rbehind -= trim * rbehind / sum; rahead -= trim * rahead / sum; } KASSERT(rbehind + rahead + count <= atop(maxphys), ("%s: behind %d ahead %d count %d maxphys %lu", __func__, rbehind, rahead, count, maxphys)); /* * Fill in the bp->b_pages[] array with requested and optional * read behind or read ahead pages. Read behind pages are looked * up in a backward direction, down to a first cached page. Same * for read ahead pages, but there is no need to shift the array * in case of encountering a cached page. */ i = bp->b_npages = 0; if (rbehind) { vm_pindex_t startpindex, tpindex; vm_page_t p; VM_OBJECT_WLOCK(object); startpindex = m[0]->pindex - rbehind; if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL && p->pindex >= startpindex) startpindex = p->pindex + 1; /* tpindex is unsigned; beware of numeric underflow. */ for (tpindex = m[0]->pindex - 1; tpindex >= startpindex && tpindex < m[0]->pindex; tpindex--, i++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) { /* Shift the array. */ for (int j = 0; j < i; j++) bp->b_pages[j] = bp->b_pages[j + tpindex + 1 - startpindex]; break; } bp->b_pages[tpindex - startpindex] = p; } bp->b_pgbefore = i; bp->b_npages += i; bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE; } else bp->b_pgbefore = 0; /* Requested pages. */ for (int j = 0; j < count; j++, i++) bp->b_pages[i] = m[j]; bp->b_npages += count; if (rahead) { vm_pindex_t endpindex, tpindex; vm_page_t p; if (!VM_OBJECT_WOWNED(object)) VM_OBJECT_WLOCK(object); endpindex = m[count - 1]->pindex + rahead + 1; if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL && p->pindex < endpindex) endpindex = p->pindex; if (endpindex > object->size) endpindex = object->size; for (tpindex = m[count - 1]->pindex + 1; tpindex < endpindex; i++, tpindex++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) break; bp->b_pages[i] = p; } bp->b_pgafter = i - bp->b_npages; bp->b_npages = i; } else bp->b_pgafter = 0; if (VM_OBJECT_WOWNED(object)) VM_OBJECT_WUNLOCK(object); /* Report back actual behind/ahead read. */ if (a_rbehind) *a_rbehind = bp->b_pgbefore; if (a_rahead) *a_rahead = bp->b_pgafter; #ifdef INVARIANTS KASSERT(bp->b_npages <= atop(maxphys), ("%s: buf %p overflowed", __func__, bp)); for (int j = 1, prev = 0; j < bp->b_npages; j++) { if (bp->b_pages[j] == bogus_page) continue; KASSERT(bp->b_pages[j]->pindex - bp->b_pages[prev]->pindex == j - prev, ("%s: pages array not consecutive, bp %p", __func__, bp)); prev = j; } #endif /* * Recalculate first offset and bytecount with regards to read behind. * Truncate bytecount to vnode real size and round up physical size * for real devices. */ foff = IDX_TO_OFF(bp->b_pages[0]->pindex); bytecount = bp->b_npages << PAGE_SHIFT; if ((foff + bytecount) > object->un_pager.vnp.vnp_size) bytecount = object->un_pager.vnp.vnp_size - foff; secmask = bo->bo_bsize - 1; KASSERT(secmask < PAGE_SIZE && secmask > 0, ("%s: sector size %d too large", __func__, secmask + 1)); bytecount = (bytecount + secmask) & ~secmask; /* * And map the pages to be read into the kva, if the filesystem * requires mapped buffers. 
*/ if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } /* Build a minimal buffer header. */ bp->b_iocmd = BIO_READ; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount; bp->b_iooffset = dbtob(bp->b_blkno); KASSERT(IDX_TO_OFF(m[0]->pindex - bp->b_pages[0]->pindex) == (blkno0 - bp->b_blkno) * DEV_BSIZE + IDX_TO_OFF(m[0]->pindex) % bsize, ("wrong offsets bsize %d m[0] %ju b_pages[0] %ju " "blkno0 %ju b_blkno %ju", bsize, (uintmax_t)m[0]->pindex, (uintmax_t)bp->b_pages[0]->pindex, (uintmax_t)blkno0, (uintmax_t)bp->b_blkno)); atomic_add_long(&runningbufspace, bp->b_runningbufspace); VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, bp->b_npages); if (iodone != NULL) { /* async */ bp->b_pgiodone = iodone; bp->b_caller1 = arg; bp->b_iodone = vnode_pager_generic_getpages_done_async; bp->b_flags |= B_ASYNC; BUF_KERNPROC(bp); bstrategy(bp); return (VM_PAGER_OK); } else { bp->b_iodone = bdone; bstrategy(bp); bwait(bp, PVM, "vnread"); error = vnode_pager_generic_getpages_done(bp); for (i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); uma_zfree(vnode_pbuf_zone, bp); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } } static void vnode_pager_generic_getpages_done_async(struct buf *bp) { int error; error = vnode_pager_generic_getpages_done(bp); /* Run the iodone upon the requested range. */ bp->b_pgiodone(bp->b_caller1, bp->b_pages + bp->b_pgbefore, bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error); for (int i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); uma_zfree(vnode_pbuf_zone, bp); } static int vnode_pager_generic_getpages_done(struct buf *bp) { vm_object_t object; off_t tfoff, nextoff; int i, error; KASSERT((bp->b_ioflags & BIO_ERROR) == 0 || bp->b_error != 0, ("%s: buf error but b_error == 0\n", __func__)); error = (bp->b_ioflags & BIO_ERROR) != 0 ? bp->b_error : 0; object = bp->b_vp->v_object; if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) { if (!buf_mapped(bp)) { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } bzero(bp->b_data + bp->b_bcount, PAGE_SIZE * bp->b_npages - bp->b_bcount); } if (buf_mapped(bp)) { pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); bp->b_data = unmapped_buf; } /* * If the read failed, we must free any read ahead/behind pages here. * The requested pages are freed by the caller (for sync requests) * or by the bp->b_pgiodone callback (for async requests). */ if (error != 0) { VM_OBJECT_WLOCK(object); for (i = 0; i < bp->b_pgbefore; i++) vm_page_free_invalid(bp->b_pages[i]); for (i = bp->b_npages - bp->b_pgafter; i < bp->b_npages; i++) vm_page_free_invalid(bp->b_pages[i]); VM_OBJECT_WUNLOCK(object); return (error); } /* Read lock to protect size. */ VM_OBJECT_RLOCK(object); for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex); i < bp->b_npages; i++, tfoff = nextoff) { vm_page_t mt; nextoff = tfoff + PAGE_SIZE; mt = bp->b_pages[i]; if (mt == bogus_page) continue; if (nextoff <= object->un_pager.vnp.vnp_size) { /* * Read filled up entire page. 
static int
vnode_pager_generic_getpages_done(struct buf *bp)
{
	vm_object_t object;
	off_t tfoff, nextoff;
	int i, error;

	KASSERT((bp->b_ioflags & BIO_ERROR) == 0 || bp->b_error != 0,
	    ("%s: buf error but b_error == 0\n", __func__));
	error = (bp->b_ioflags & BIO_ERROR) != 0 ? bp->b_error : 0;
	object = bp->b_vp->v_object;

	if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) {
		if (!buf_mapped(bp)) {
			bp->b_data = bp->b_kvabase;
			pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages,
			    bp->b_npages);
		}
		bzero(bp->b_data + bp->b_bcount,
		    PAGE_SIZE * bp->b_npages - bp->b_bcount);
	}
	if (buf_mapped(bp)) {
		pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
		bp->b_data = unmapped_buf;
	}

	/*
	 * If the read failed, we must free any read ahead/behind pages here.
	 * The requested pages are freed by the caller (for sync requests)
	 * or by the bp->b_pgiodone callback (for async requests).
	 */
	if (error != 0) {
		VM_OBJECT_WLOCK(object);
		for (i = 0; i < bp->b_pgbefore; i++)
			vm_page_free_invalid(bp->b_pages[i]);
		for (i = bp->b_npages - bp->b_pgafter; i < bp->b_npages; i++)
			vm_page_free_invalid(bp->b_pages[i]);
		VM_OBJECT_WUNLOCK(object);
		return (error);
	}

	/* Read lock to protect size. */
	VM_OBJECT_RLOCK(object);
	for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex);
	    i < bp->b_npages; i++, tfoff = nextoff) {
		vm_page_t mt;

		nextoff = tfoff + PAGE_SIZE;
		mt = bp->b_pages[i];
		if (mt == bogus_page)
			continue;

		if (nextoff <= object->un_pager.vnp.vnp_size) {
			/*
			 * The read filled up the entire page.
			 */
			vm_page_valid(mt);
			KASSERT(mt->dirty == 0,
			    ("%s: page %p is dirty", __func__, mt));
			KASSERT(!pmap_page_is_mapped(mt),
			    ("%s: page %p is mapped", __func__, mt));
		} else {
			/*
			 * The read did not fill up the entire page.
			 *
			 * Currently we do not set the entire page valid;
			 * we just try to clear the piece that we could not
			 * read.
			 */
			vm_page_set_valid_range(mt, 0,
			    object->un_pager.vnp.vnp_size - tfoff);
			KASSERT((mt->dirty & vm_page_bits(0,
			    object->un_pager.vnp.vnp_size - tfoff)) == 0,
			    ("%s: page %p is dirty", __func__, mt));
		}

		if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter)
			vm_page_readahead_finish(mt);
	}
	VM_OBJECT_RUNLOCK(object);

	return (error);
}

/*
 * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
 * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call
 * vnode_pager_generic_putpages() to implement the previous behaviour.
 *
 * All other FS's should use the bypass to get to the local media
 * backing vp's VOP_PUTPAGES.
 */
static void
vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count,
    int flags, int *rtvals)
{
	int rtval;
	struct vnode *vp;
	int bytes = count * PAGE_SIZE;

	/*
	 * Force a synchronous operation if we are extremely low on memory
	 * to prevent a low-memory deadlock.  VOP operations often need to
	 * allocate more memory to initiate the I/O (i.e. do a BMAP
	 * operation).  The swapper handles the case by limiting the amount
	 * of asynchronous I/O, but that sort of solution doesn't scale well
	 * for the vnode pager without a lot of work.
	 *
	 * Also, the backing vnode's iodone routine may not wake the pageout
	 * daemon up.  This should probably be addressed XXX.
	 */
	if (vm_page_count_min())
		flags |= VM_PAGER_PUT_SYNC;

	/*
	 * Call the device-specific putpages function.
	 */
	vp = object->handle;
	VM_OBJECT_WUNLOCK(object);
	rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals);
	KASSERT(rtval != EOPNOTSUPP,
	    ("vnode_pager: stale FS putpages\n"));
	VM_OBJECT_WLOCK(object);
}

static int
vn_off2bidx(vm_ooffset_t offset)
{

	return ((offset & PAGE_MASK) / DEV_BSIZE);
}

static bool
vn_dirty_blk(vm_page_t m, vm_ooffset_t offset)
{

	KASSERT(IDX_TO_OFF(m->pindex) <= offset &&
	    offset < IDX_TO_OFF(m->pindex + 1),
	    ("page %p pidx %ju offset %ju", m, (uintmax_t)m->pindex,
	    (uintmax_t)offset));
	return ((m->dirty & ((vm_page_bits_t)1 << vn_off2bidx(offset))) != 0);
}
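/*
 * Illustrative example (assumed parameters): with PAGE_SIZE == 4096 and
 * DEV_BSIZE == 512, each page carries eight per-block dirty bits; an
 * offset of 0xa00 into a page gives vn_off2bidx() == 5, so vn_dirty_blk()
 * tests bit 5 of m->dirty for that block.
 */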
/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_PUTPAGES.
 *
 * This is typically called indirectly via the pageout daemon, and
 * clustering has typically already occurred, so in general we ask the
 * underlying filesystem to write the data out asynchronously rather
 * than delayed.
 */
int
vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount,
    int flags, int *rtvals)
{
	vm_object_t object;
	vm_page_t m;
	vm_ooffset_t maxblksz, next_offset, poffset, prev_offset;
	struct uio auio;
	struct iovec aiov;
	off_t prev_resid, wrsz;
	int count, error, i, maxsize, ncount, pgoff, ppscheck;
	bool in_hole;
	static struct timeval lastfail;
	static int curfail;

	object = vp->v_object;
	count = bytecount / PAGE_SIZE;

	for (i = 0; i < count; i++)
		rtvals[i] = VM_PAGER_ERROR;

	if ((int64_t)ma[0]->pindex < 0) {
		printf("vnode_pager_generic_putpages: "
		    "attempt to write meta-data 0x%jx(%lx)\n",
		    (uintmax_t)ma[0]->pindex, (u_long)ma[0]->dirty);
		rtvals[0] = VM_PAGER_BAD;
		return (VM_PAGER_BAD);
	}

	maxsize = count * PAGE_SIZE;
	ncount = count;

	poffset = IDX_TO_OFF(ma[0]->pindex);

	/*
	 * If the page-aligned write is larger than the actual file, we
	 * have to invalidate pages occurring beyond the file EOF.  However,
	 * there is an edge case where a file may not be page-aligned, in
	 * which case the last page is partially invalid.  In this case the
	 * filesystem may not properly clear the dirty bits for the entire
	 * page (which could be VM_PAGE_BITS_ALL due to the page having been
	 * mmap()d).  With the page busied we are free to fix up the dirty
	 * bits here.
	 *
	 * We do not under any circumstances truncate the valid bits, as
	 * this will screw up bogus page replacement.
	 */
	VM_OBJECT_RLOCK(object);
	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
		if (object->un_pager.vnp.vnp_size > poffset) {
			maxsize = object->un_pager.vnp.vnp_size - poffset;
			ncount = btoc(maxsize);
			if ((pgoff = (int)maxsize & PAGE_MASK) != 0) {
				pgoff = roundup2(pgoff, DEV_BSIZE);

				/*
				 * If the page is busy and the following
				 * conditions hold, then the page's dirty
				 * field cannot be concurrently changed by a
				 * pmap operation.
				 */
				m = ma[ncount - 1];
				vm_page_assert_sbusied(m);
				KASSERT(!pmap_page_is_write_mapped(m),
				    ("vnode_pager_generic_putpages: page %p is not read-only",
				    m));
				MPASS(m->dirty != 0);
				vm_page_clear_dirty(m, pgoff,
				    PAGE_SIZE - pgoff);
			}
		} else {
			maxsize = 0;
			ncount = 0;
		}
		for (i = ncount; i < count; i++)
			rtvals[i] = VM_PAGER_BAD;
	}
	VM_OBJECT_RUNLOCK(object);

	auio.uio_iov = &aiov;
	auio.uio_segflg = UIO_NOCOPY;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = NULL;
	maxblksz = roundup2(poffset + maxsize, DEV_BSIZE);

	for (prev_offset = poffset; prev_offset < maxblksz;) {
		/* Skip clean blocks. */
		for (in_hole = true; in_hole && prev_offset < maxblksz;) {
			m = ma[OFF_TO_IDX(prev_offset - poffset)];
			for (i = vn_off2bidx(prev_offset);
			    i < sizeof(vm_page_bits_t) * NBBY &&
			    prev_offset < maxblksz; i++) {
				if (vn_dirty_blk(m, prev_offset)) {
					in_hole = false;
					break;
				}
				prev_offset += DEV_BSIZE;
			}
		}
		if (in_hole)
			goto write_done;

		/* Find the longest run of dirty blocks. */
		for (next_offset = prev_offset; next_offset < maxblksz;) {
			m = ma[OFF_TO_IDX(next_offset - poffset)];
			for (i = vn_off2bidx(next_offset);
			    i < sizeof(vm_page_bits_t) * NBBY &&
			    next_offset < maxblksz; i++) {
				if (!vn_dirty_blk(m, next_offset))
					goto start_write;
				next_offset += DEV_BSIZE;
			}
		}
start_write:
		if (next_offset > poffset + maxsize)
			next_offset = poffset + maxsize;

		/*
		 * Getting here requires finding a dirty block in the
		 * 'skip clean blocks' loop.
		 */
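		/*
		 * Illustrative example (assumed values): with
		 * DEV_BSIZE == 512, if ma[0]->dirty == 0x3c (blocks 2-5
		 * dirty), the scans above leave prev_offset at
		 * poffset + 1024 and next_offset at poffset + 3072, so the
		 * VOP_WRITE() below covers exactly the 2048 dirty bytes.
		 */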
		MPASS(prev_offset < next_offset);

		aiov.iov_base = NULL;
		auio.uio_iovcnt = 1;
		auio.uio_offset = prev_offset;
		prev_resid = auio.uio_resid = aiov.iov_len = next_offset -
		    prev_offset;
		error = VOP_WRITE(vp, &auio,
		    vnode_pager_putpages_ioflags(flags), curthread->td_ucred);

		wrsz = prev_resid - auio.uio_resid;
		if (wrsz == 0) {
			if (ppsratecheck(&lastfail, &curfail, 1) != 0) {
				vn_printf(vp, "vnode_pager_putpages: "
				    "zero-length write at %ju resid %zd\n",
				    auio.uio_offset, auio.uio_resid);
			}
			break;
		}

		/* Adjust the starting offset for the next iteration. */
		prev_offset += wrsz;
		MPASS(auio.uio_offset == prev_offset);

		ppscheck = 0;
		if (error != 0 && (ppscheck = ppsratecheck(&lastfail,
		    &curfail, 1)) != 0)
			vn_printf(vp, "vnode_pager_putpages: I/O error %d\n",
			    error);
		if (auio.uio_resid != 0 && (ppscheck != 0 ||
		    ppsratecheck(&lastfail, &curfail, 1) != 0))
			vn_printf(vp, "vnode_pager_putpages: residual I/O %zd "
			    "at %ju\n", auio.uio_resid,
			    (uintmax_t)ma[0]->pindex);
		if (error != 0 || auio.uio_resid != 0)
			break;
	}
write_done:
	/* Mark completely processed pages. */
	for (i = 0; i < OFF_TO_IDX(prev_offset - poffset); i++)
		rtvals[i] = VM_PAGER_OK;
	/* Mark the partial EOF page. */
	if (prev_offset == poffset + maxsize && (prev_offset & PAGE_MASK) != 0)
		rtvals[i++] = VM_PAGER_OK;
	/* Unwritten pages in the range: a clean page still counts as paged out. */
	for (; i < ncount; i++)
		rtvals[i] = ma[i]->dirty == 0 ? VM_PAGER_OK : VM_PAGER_ERROR;

	VM_CNT_ADD(v_vnodepgsout, i);
	VM_CNT_INC(v_vnodeout);
	return (rtvals[0]);
}

int
vnode_pager_putpages_ioflags(int pager_flags)
{
	int ioflags;

	/*
	 * Pageouts are already clustered, so use IO_ASYNC to force a
	 * bawrite() rather than a bdwrite() to prevent paging I/O
	 * from saturating the buffer cache.  Dummy-up the sequential
	 * heuristic to cause large ranges to cluster.  If neither
	 * IO_SYNC nor IO_ASYNC is set, the system decides how to
	 * cluster.
	 */
	ioflags = IO_VMIO;
	if ((pager_flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0)
		ioflags |= IO_SYNC;
	else if ((pager_flags & VM_PAGER_CLUSTER_OK) == 0)
		ioflags |= IO_ASYNC;
	ioflags |= (pager_flags & VM_PAGER_PUT_INVAL) != 0 ? IO_INVAL : 0;
	ioflags |= (pager_flags & VM_PAGER_PUT_NOREUSE) != 0 ? IO_NOREUSE : 0;
	ioflags |= IO_SEQMAX << IO_SEQSHIFT;
	return (ioflags);
}
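/*
 * Illustrative example, derived from the flag handling above: a pageout
 * with VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL yields
 * IO_VMIO | IO_SYNC | IO_INVAL | (IO_SEQMAX << IO_SEQSHIFT), while a plain
 * asynchronous pageout without VM_PAGER_CLUSTER_OK yields
 * IO_VMIO | IO_ASYNC | (IO_SEQMAX << IO_SEQSHIFT).
 */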
/*
 * vnode_pager_undirty_pages().
 *
 * A helper to mark pages as clean after a pageout that was possibly
 * done with a short write.  The lpos argument specifies the page run
 * length in bytes, and the written argument specifies how many bytes
 * were actually written.  eof is the offset past the last valid byte
 * in the vnode, using the absolute file position of the first byte in
 * the run as the base from which it is computed.
 */
void
vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written, off_t eof,
    int lpos)
{
	int i, pos, pos_devb;

	if (written == 0 && eof >= lpos)
		return;
	for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) {
		if (pos < trunc_page(written)) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(ma[i]);
		} else {
			/* Partially written page. */
			rtvals[i] = VM_PAGER_AGAIN;
			vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK);
		}
	}
	if (eof >= lpos) /* avoid truncation */
		return;
	for (pos = eof, i = OFF_TO_IDX(trunc_page(pos)); pos < lpos; i++) {
		if (pos != trunc_page(pos)) {
			/*
			 * The page contains the last valid byte in
			 * the vnode; mark the rest of the page as
			 * clean, potentially making the whole page
			 * clean.
			 */
			pos_devb = roundup2(pos & PAGE_MASK, DEV_BSIZE);
			vm_page_clear_dirty(ma[i], pos_devb,
			    PAGE_SIZE - pos_devb);

			/*
			 * If the page was cleaned, report the pageout
			 * on it as successful, so that msync() no longer
			 * needs to write out the page, endlessly creating
			 * write requests and dirty buffers.
			 */
			if (ma[i]->dirty == 0)
				rtvals[i] = VM_PAGER_OK;

			pos = round_page(pos);
		} else {
			/* vm_pageout_flush() clears dirty. */
			rtvals[i] = VM_PAGER_BAD;
			pos += PAGE_SIZE;
		}
	}
}

static void
vnode_pager_update_writecount(vm_object_t object, vm_offset_t start,
    vm_offset_t end)
{
	struct vnode *vp;
	vm_ooffset_t old_wm;

	VM_OBJECT_WLOCK(object);
	if (object->type != OBJT_VNODE) {
		VM_OBJECT_WUNLOCK(object);
		return;
	}
	old_wm = object->un_pager.vnp.writemappings;
	object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start;
	vp = object->handle;
	if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) {
		ASSERT_VOP_LOCKED(vp, "v_writecount inc");
		VOP_ADD_WRITECOUNT_CHECKED(vp, 1);
		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
		    __func__, vp, vp->v_writecount);
	} else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) {
		ASSERT_VOP_LOCKED(vp, "v_writecount dec");
		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
		    __func__, vp, vp->v_writecount);
	}
	VM_OBJECT_WUNLOCK(object);
}

static void
vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
    vm_offset_t end)
{
	struct vnode *vp;
	struct mount *mp;
	vm_offset_t inc;

	VM_OBJECT_WLOCK(object);

	/*
	 * First, recheck the object type to account for the race when
	 * the vnode is reclaimed.
	 */
	if (object->type != OBJT_VNODE) {
		VM_OBJECT_WUNLOCK(object);
		return;
	}

	/*
	 * Optimize for the case when writemappings is not going to
	 * zero.
	 */
	inc = end - start;
	if (object->un_pager.vnp.writemappings != inc) {
		object->un_pager.vnp.writemappings -= inc;
		VM_OBJECT_WUNLOCK(object);
		return;
	}

	vp = object->handle;
	vhold(vp);
	VM_OBJECT_WUNLOCK(object);
	mp = NULL;
	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, LK_SHARED | LK_RETRY);

	/*
	 * Decrement the object's writemappings by swapping the start
	 * and end arguments to vnode_pager_update_writecount().  If
	 * there was not a race with vnode reclamation, then the
	 * vnode's v_writecount is decremented.
	 */
	vnode_pager_update_writecount(object, end, start);
	VOP_UNLOCK(vp);
	vdrop(vp);
	if (mp != NULL)
		vn_finished_write(mp);
}

static void
vnode_pager_getvp(vm_object_t object, struct vnode **vpp, bool *vp_heldp)
{
	*vpp = object->handle;
}
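/*
 * Illustrative example of the writemappings accounting in the two
 * writecount helpers above (values assumed): mapping a three-page region
 * writable adds 3 * PAGE_SIZE to writemappings, and only the transition
 * from zero increments v_writecount; the release that brings
 * writemappings back to zero takes the vnode lock and performs the
 * matching v_writecount decrement.
 */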