Index: user/attilio/rm_vmobj_cache/sys/dev/agp/agp.c =================================================================== --- user/attilio/rm_vmobj_cache/sys/dev/agp/agp.c (revision 267236) +++ user/attilio/rm_vmobj_cache/sys/dev/agp/agp.c (revision 267237) @@ -1,998 +1,998 @@ /*- * Copyright (c) 2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_agp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(agp, 1); MALLOC_DEFINE(M_AGP, "agp", "AGP data structures"); /* agp_drv.c */ static d_open_t agp_open; static d_close_t agp_close; static d_ioctl_t agp_ioctl; static d_mmap_t agp_mmap; static struct cdevsw agp_cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT, .d_open = agp_open, .d_close = agp_close, .d_ioctl = agp_ioctl, .d_mmap = agp_mmap, .d_name = "agp", }; static devclass_t agp_devclass; /* Helper functions for implementing chipset mini drivers. */ void agp_flush_cache() { #if defined(__i386__) || defined(__amd64__) wbinvd(); #endif } u_int8_t agp_find_caps(device_t dev) { int capreg; if (pci_find_cap(dev, PCIY_AGP, &capreg) != 0) capreg = 0; return (capreg); } /* * Find an AGP display device (if any). */ static device_t agp_find_display(void) { devclass_t pci = devclass_find("pci"); device_t bus, dev = 0; device_t *kids; int busnum, numkids, i; for (busnum = 0; busnum < devclass_get_maxunit(pci); busnum++) { bus = devclass_get_device(pci, busnum); if (!bus) continue; if (device_get_children(bus, &kids, &numkids) != 0) continue; for (i = 0; i < numkids; i++) { dev = kids[i]; if (pci_get_class(dev) == PCIC_DISPLAY && pci_get_subclass(dev) == PCIS_DISPLAY_VGA) if (agp_find_caps(dev)) { free(kids, M_TEMP); return dev; } } free(kids, M_TEMP); } return 0; } struct agp_gatt * agp_alloc_gatt(device_t dev) { u_int32_t apsize = AGP_GET_APERTURE(dev); u_int32_t entries = apsize >> AGP_PAGE_SHIFT; struct agp_gatt *gatt; if (bootverbose) device_printf(dev, "allocating GATT for aperture of size %dM\n", apsize / (1024*1024)); if (entries == 0) { device_printf(dev, "bad aperture size\n"); return NULL; } gatt = malloc(sizeof(struct agp_gatt), M_AGP, M_NOWAIT); if (!gatt) return 0; gatt->ag_entries = entries; gatt->ag_virtual = contigmalloc(entries * sizeof(u_int32_t), M_AGP, 0, 0, ~0, PAGE_SIZE, 0); if (!gatt->ag_virtual) { if (bootverbose) device_printf(dev, "contiguous allocation failed\n"); free(gatt, M_AGP); return 0; } bzero(gatt->ag_virtual, entries * sizeof(u_int32_t)); gatt->ag_physical = vtophys((vm_offset_t) gatt->ag_virtual); agp_flush_cache(); return gatt; } void agp_free_gatt(struct agp_gatt *gatt) { contigfree(gatt->ag_virtual, gatt->ag_entries * sizeof(u_int32_t), M_AGP); free(gatt, M_AGP); } static u_int agp_max[][2] = { {0, 0}, {32, 4}, {64, 28}, {128, 96}, {256, 204}, {512, 440}, {1024, 942}, {2048, 1920}, {4096, 3932} }; #define agp_max_size (sizeof(agp_max) / sizeof(agp_max[0])) /** * Sets the PCI resource which represents the AGP aperture. * * If not called, the default AGP aperture resource of AGP_APBASE will * be used. Must be called before agp_generic_attach(). */ void agp_set_aperture_resource(device_t dev, int rid) { struct agp_softc *sc = device_get_softc(dev); sc->as_aperture_rid = rid; } int agp_generic_attach(device_t dev) { struct agp_softc *sc = device_get_softc(dev); int i; u_int memsize; /* * Find and map the aperture, RF_SHAREABLE for DRM but not RF_ACTIVE * because the kernel doesn't need to map it. */ if (sc->as_aperture_rid != -1) { if (sc->as_aperture_rid == 0) sc->as_aperture_rid = AGP_APBASE; sc->as_aperture = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &sc->as_aperture_rid, RF_SHAREABLE); if (!sc->as_aperture) return ENOMEM; } /* * Work out an upper bound for agp memory allocation. This * uses a heurisitc table from the Linux driver. */ memsize = ptoa(realmem) >> 20; for (i = 0; i < agp_max_size; i++) { if (memsize <= agp_max[i][0]) break; } if (i == agp_max_size) i = agp_max_size - 1; sc->as_maxmem = agp_max[i][1] << 20U; /* * The lock is used to prevent re-entry to * agp_generic_bind_memory() since that function can sleep. */ mtx_init(&sc->as_lock, "agp lock", NULL, MTX_DEF); /* * Initialise stuff for the userland device. */ agp_devclass = devclass_find("agp"); TAILQ_INIT(&sc->as_memory); sc->as_nextid = 1; sc->as_devnode = make_dev(&agp_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "agpgart"); sc->as_devnode->si_drv1 = dev; return 0; } void agp_free_cdev(device_t dev) { struct agp_softc *sc = device_get_softc(dev); destroy_dev(sc->as_devnode); } void agp_free_res(device_t dev) { struct agp_softc *sc = device_get_softc(dev); if (sc->as_aperture != NULL) bus_release_resource(dev, SYS_RES_MEMORY, sc->as_aperture_rid, sc->as_aperture); mtx_destroy(&sc->as_lock); agp_flush_cache(); } int agp_generic_detach(device_t dev) { agp_free_cdev(dev); agp_free_res(dev); return 0; } /** * Default AGP aperture size detection which simply returns the size of * the aperture's PCI resource. */ u_int32_t agp_generic_get_aperture(device_t dev) { struct agp_softc *sc = device_get_softc(dev); return rman_get_size(sc->as_aperture); } /** * Default AGP aperture size setting function, which simply doesn't allow * changes to resource size. */ int agp_generic_set_aperture(device_t dev, u_int32_t aperture) { u_int32_t current_aperture; current_aperture = AGP_GET_APERTURE(dev); if (current_aperture != aperture) return EINVAL; else return 0; } /* * This does the enable logic for v3, with the same topology * restrictions as in place for v2 -- one bus, one device on the bus. */ static int agp_v3_enable(device_t dev, device_t mdev, u_int32_t mode) { u_int32_t tstatus, mstatus; u_int32_t command; int rq, sba, fw, rate, arqsz, cal; tstatus = pci_read_config(dev, agp_find_caps(dev) + AGP_STATUS, 4); mstatus = pci_read_config(mdev, agp_find_caps(mdev) + AGP_STATUS, 4); /* Set RQ to the min of mode, tstatus and mstatus */ rq = AGP_MODE_GET_RQ(mode); if (AGP_MODE_GET_RQ(tstatus) < rq) rq = AGP_MODE_GET_RQ(tstatus); if (AGP_MODE_GET_RQ(mstatus) < rq) rq = AGP_MODE_GET_RQ(mstatus); /* * ARQSZ - Set the value to the maximum one. * Don't allow the mode register to override values. */ arqsz = AGP_MODE_GET_ARQSZ(mode); if (AGP_MODE_GET_ARQSZ(tstatus) > rq) rq = AGP_MODE_GET_ARQSZ(tstatus); if (AGP_MODE_GET_ARQSZ(mstatus) > rq) rq = AGP_MODE_GET_ARQSZ(mstatus); /* Calibration cycle - don't allow override by mode register */ cal = AGP_MODE_GET_CAL(tstatus); if (AGP_MODE_GET_CAL(mstatus) < cal) cal = AGP_MODE_GET_CAL(mstatus); /* SBA must be supported for AGP v3. */ sba = 1; /* Set FW if all three support it. */ fw = (AGP_MODE_GET_FW(tstatus) & AGP_MODE_GET_FW(mstatus) & AGP_MODE_GET_FW(mode)); /* Figure out the max rate */ rate = (AGP_MODE_GET_RATE(tstatus) & AGP_MODE_GET_RATE(mstatus) & AGP_MODE_GET_RATE(mode)); if (rate & AGP_MODE_V3_RATE_8x) rate = AGP_MODE_V3_RATE_8x; else rate = AGP_MODE_V3_RATE_4x; if (bootverbose) device_printf(dev, "Setting AGP v3 mode %d\n", rate * 4); pci_write_config(dev, agp_find_caps(dev) + AGP_COMMAND, 0, 4); /* Construct the new mode word and tell the hardware */ command = 0; command = AGP_MODE_SET_RQ(0, rq); command = AGP_MODE_SET_ARQSZ(command, arqsz); command = AGP_MODE_SET_CAL(command, cal); command = AGP_MODE_SET_SBA(command, sba); command = AGP_MODE_SET_FW(command, fw); command = AGP_MODE_SET_RATE(command, rate); command = AGP_MODE_SET_MODE_3(command, 1); command = AGP_MODE_SET_AGP(command, 1); pci_write_config(dev, agp_find_caps(dev) + AGP_COMMAND, command, 4); pci_write_config(mdev, agp_find_caps(mdev) + AGP_COMMAND, command, 4); return 0; } static int agp_v2_enable(device_t dev, device_t mdev, u_int32_t mode) { u_int32_t tstatus, mstatus; u_int32_t command; int rq, sba, fw, rate; tstatus = pci_read_config(dev, agp_find_caps(dev) + AGP_STATUS, 4); mstatus = pci_read_config(mdev, agp_find_caps(mdev) + AGP_STATUS, 4); /* Set RQ to the min of mode, tstatus and mstatus */ rq = AGP_MODE_GET_RQ(mode); if (AGP_MODE_GET_RQ(tstatus) < rq) rq = AGP_MODE_GET_RQ(tstatus); if (AGP_MODE_GET_RQ(mstatus) < rq) rq = AGP_MODE_GET_RQ(mstatus); /* Set SBA if all three can deal with SBA */ sba = (AGP_MODE_GET_SBA(tstatus) & AGP_MODE_GET_SBA(mstatus) & AGP_MODE_GET_SBA(mode)); /* Similar for FW */ fw = (AGP_MODE_GET_FW(tstatus) & AGP_MODE_GET_FW(mstatus) & AGP_MODE_GET_FW(mode)); /* Figure out the max rate */ rate = (AGP_MODE_GET_RATE(tstatus) & AGP_MODE_GET_RATE(mstatus) & AGP_MODE_GET_RATE(mode)); if (rate & AGP_MODE_V2_RATE_4x) rate = AGP_MODE_V2_RATE_4x; else if (rate & AGP_MODE_V2_RATE_2x) rate = AGP_MODE_V2_RATE_2x; else rate = AGP_MODE_V2_RATE_1x; if (bootverbose) device_printf(dev, "Setting AGP v2 mode %d\n", rate); /* Construct the new mode word and tell the hardware */ command = 0; command = AGP_MODE_SET_RQ(0, rq); command = AGP_MODE_SET_SBA(command, sba); command = AGP_MODE_SET_FW(command, fw); command = AGP_MODE_SET_RATE(command, rate); command = AGP_MODE_SET_AGP(command, 1); pci_write_config(dev, agp_find_caps(dev) + AGP_COMMAND, command, 4); pci_write_config(mdev, agp_find_caps(mdev) + AGP_COMMAND, command, 4); return 0; } int agp_generic_enable(device_t dev, u_int32_t mode) { device_t mdev = agp_find_display(); u_int32_t tstatus, mstatus; if (!mdev) { AGP_DPF("can't find display\n"); return ENXIO; } tstatus = pci_read_config(dev, agp_find_caps(dev) + AGP_STATUS, 4); mstatus = pci_read_config(mdev, agp_find_caps(mdev) + AGP_STATUS, 4); /* * Check display and bridge for AGP v3 support. AGP v3 allows * more variety in topology than v2, e.g. multiple AGP devices * attached to one bridge, or multiple AGP bridges in one * system. This doesn't attempt to address those situations, * but should work fine for a classic single AGP slot system * with AGP v3. */ if (AGP_MODE_GET_MODE_3(mode) && AGP_MODE_GET_MODE_3(tstatus) && AGP_MODE_GET_MODE_3(mstatus)) return (agp_v3_enable(dev, mdev, mode)); else return (agp_v2_enable(dev, mdev, mode)); } struct agp_memory * agp_generic_alloc_memory(device_t dev, int type, vm_size_t size) { struct agp_softc *sc = device_get_softc(dev); struct agp_memory *mem; if ((size & (AGP_PAGE_SIZE - 1)) != 0) return 0; if (sc->as_allocated + size > sc->as_maxmem) return 0; if (type != 0) { printf("agp_generic_alloc_memory: unsupported type %d\n", type); return 0; } mem = malloc(sizeof *mem, M_AGP, M_WAITOK); mem->am_id = sc->as_nextid++; mem->am_size = size; mem->am_type = 0; mem->am_obj = vm_object_allocate(OBJT_DEFAULT, atop(round_page(size))); mem->am_physical = 0; mem->am_offset = 0; mem->am_is_bound = 0; TAILQ_INSERT_TAIL(&sc->as_memory, mem, am_link); sc->as_allocated += size; return mem; } int agp_generic_free_memory(device_t dev, struct agp_memory *mem) { struct agp_softc *sc = device_get_softc(dev); if (mem->am_is_bound) return EBUSY; sc->as_allocated -= mem->am_size; TAILQ_REMOVE(&sc->as_memory, mem, am_link); vm_object_deallocate(mem->am_obj); free(mem, M_AGP); return 0; } int agp_generic_bind_memory(device_t dev, struct agp_memory *mem, vm_offset_t offset) { struct agp_softc *sc = device_get_softc(dev); vm_offset_t i, j, k; vm_page_t m; int error; /* Do some sanity checks first. */ if ((offset & (AGP_PAGE_SIZE - 1)) != 0 || offset + mem->am_size > AGP_GET_APERTURE(dev)) { device_printf(dev, "binding memory at bad offset %#x\n", (int)offset); return EINVAL; } /* * Allocate the pages early, before acquiring the lock, * because vm_page_grab() may sleep and we can't hold a mutex * while sleeping. */ VM_OBJECT_WLOCK(mem->am_obj); for (i = 0; i < mem->am_size; i += PAGE_SIZE) { /* * Find a page from the object and wire it * down. This page will be mapped using one or more * entries in the GATT (assuming that PAGE_SIZE >= * AGP_PAGE_SIZE. If this is the first call to bind, * the pages will be allocated and zeroed. */ m = vm_page_grab(mem->am_obj, OFF_TO_IDX(i), VM_ALLOC_WIRED | VM_ALLOC_ZERO); AGP_DPF("found page pa=%#jx\n", (uintmax_t)VM_PAGE_TO_PHYS(m)); } VM_OBJECT_WUNLOCK(mem->am_obj); mtx_lock(&sc->as_lock); if (mem->am_is_bound) { device_printf(dev, "memory already bound\n"); error = EINVAL; VM_OBJECT_WLOCK(mem->am_obj); i = 0; goto bad; } /* * Bind the individual pages and flush the chipset's * TLB. */ VM_OBJECT_WLOCK(mem->am_obj); for (i = 0; i < mem->am_size; i += PAGE_SIZE) { m = vm_page_lookup(mem->am_obj, OFF_TO_IDX(i)); /* * Install entries in the GATT, making sure that if * AGP_PAGE_SIZE < PAGE_SIZE and mem->am_size is not * aligned to PAGE_SIZE, we don't modify too many GATT * entries. */ for (j = 0; j < PAGE_SIZE && i + j < mem->am_size; j += AGP_PAGE_SIZE) { vm_offset_t pa = VM_PAGE_TO_PHYS(m) + j; AGP_DPF("binding offset %#jx to pa %#jx\n", (uintmax_t)offset + i + j, (uintmax_t)pa); error = AGP_BIND_PAGE(dev, offset + i + j, pa); if (error) { /* * Bail out. Reverse all the mappings * and unwire the pages. */ for (k = 0; k < i + j; k += AGP_PAGE_SIZE) AGP_UNBIND_PAGE(dev, offset + k); goto bad; } } vm_page_xunbusy(m); } VM_OBJECT_WUNLOCK(mem->am_obj); /* * Flush the cpu cache since we are providing a new mapping * for these pages. */ agp_flush_cache(); /* * Make sure the chipset gets the new mappings. */ AGP_FLUSH_TLB(dev); mem->am_offset = offset; mem->am_is_bound = 1; mtx_unlock(&sc->as_lock); return 0; bad: mtx_unlock(&sc->as_lock); VM_OBJECT_ASSERT_WLOCKED(mem->am_obj); for (k = 0; k < mem->am_size; k += PAGE_SIZE) { m = vm_page_lookup(mem->am_obj, OFF_TO_IDX(k)); if (k >= i) vm_page_xunbusy(m); vm_page_lock(m); - vm_page_unwire(m, 0); + vm_page_unwire(m, PQ_INACTIVE); vm_page_unlock(m); } VM_OBJECT_WUNLOCK(mem->am_obj); return error; } int agp_generic_unbind_memory(device_t dev, struct agp_memory *mem) { struct agp_softc *sc = device_get_softc(dev); vm_page_t m; int i; mtx_lock(&sc->as_lock); if (!mem->am_is_bound) { device_printf(dev, "memory is not bound\n"); mtx_unlock(&sc->as_lock); return EINVAL; } /* * Unbind the individual pages and flush the chipset's * TLB. Unwire the pages so they can be swapped. */ for (i = 0; i < mem->am_size; i += AGP_PAGE_SIZE) AGP_UNBIND_PAGE(dev, mem->am_offset + i); VM_OBJECT_WLOCK(mem->am_obj); for (i = 0; i < mem->am_size; i += PAGE_SIZE) { m = vm_page_lookup(mem->am_obj, atop(i)); vm_page_lock(m); - vm_page_unwire(m, 0); + vm_page_unwire(m, PQ_INACTIVE); vm_page_unlock(m); } VM_OBJECT_WUNLOCK(mem->am_obj); agp_flush_cache(); AGP_FLUSH_TLB(dev); mem->am_offset = 0; mem->am_is_bound = 0; mtx_unlock(&sc->as_lock); return 0; } /* Helper functions for implementing user/kernel api */ static int agp_acquire_helper(device_t dev, enum agp_acquire_state state) { struct agp_softc *sc = device_get_softc(dev); if (sc->as_state != AGP_ACQUIRE_FREE) return EBUSY; sc->as_state = state; return 0; } static int agp_release_helper(device_t dev, enum agp_acquire_state state) { struct agp_softc *sc = device_get_softc(dev); if (sc->as_state == AGP_ACQUIRE_FREE) return 0; if (sc->as_state != state) return EBUSY; sc->as_state = AGP_ACQUIRE_FREE; return 0; } static struct agp_memory * agp_find_memory(device_t dev, int id) { struct agp_softc *sc = device_get_softc(dev); struct agp_memory *mem; AGP_DPF("searching for memory block %d\n", id); TAILQ_FOREACH(mem, &sc->as_memory, am_link) { AGP_DPF("considering memory block %d\n", mem->am_id); if (mem->am_id == id) return mem; } return 0; } /* Implementation of the userland ioctl api */ static int agp_info_user(device_t dev, agp_info *info) { struct agp_softc *sc = device_get_softc(dev); bzero(info, sizeof *info); info->bridge_id = pci_get_devid(dev); info->agp_mode = pci_read_config(dev, agp_find_caps(dev) + AGP_STATUS, 4); if (sc->as_aperture) info->aper_base = rman_get_start(sc->as_aperture); else info->aper_base = 0; info->aper_size = AGP_GET_APERTURE(dev) >> 20; info->pg_total = info->pg_system = sc->as_maxmem >> AGP_PAGE_SHIFT; info->pg_used = sc->as_allocated >> AGP_PAGE_SHIFT; return 0; } static int agp_setup_user(device_t dev, agp_setup *setup) { return AGP_ENABLE(dev, setup->agp_mode); } static int agp_allocate_user(device_t dev, agp_allocate *alloc) { struct agp_memory *mem; mem = AGP_ALLOC_MEMORY(dev, alloc->type, alloc->pg_count << AGP_PAGE_SHIFT); if (mem) { alloc->key = mem->am_id; alloc->physical = mem->am_physical; return 0; } else { return ENOMEM; } } static int agp_deallocate_user(device_t dev, int id) { struct agp_memory *mem = agp_find_memory(dev, id); if (mem) { AGP_FREE_MEMORY(dev, mem); return 0; } else { return ENOENT; } } static int agp_bind_user(device_t dev, agp_bind *bind) { struct agp_memory *mem = agp_find_memory(dev, bind->key); if (!mem) return ENOENT; return AGP_BIND_MEMORY(dev, mem, bind->pg_start << AGP_PAGE_SHIFT); } static int agp_unbind_user(device_t dev, agp_unbind *unbind) { struct agp_memory *mem = agp_find_memory(dev, unbind->key); if (!mem) return ENOENT; return AGP_UNBIND_MEMORY(dev, mem); } static int agp_chipset_flush(device_t dev) { return (AGP_CHIPSET_FLUSH(dev)); } static int agp_open(struct cdev *kdev, int oflags, int devtype, struct thread *td) { device_t dev = kdev->si_drv1; struct agp_softc *sc = device_get_softc(dev); if (!sc->as_isopen) { sc->as_isopen = 1; device_busy(dev); } return 0; } static int agp_close(struct cdev *kdev, int fflag, int devtype, struct thread *td) { device_t dev = kdev->si_drv1; struct agp_softc *sc = device_get_softc(dev); struct agp_memory *mem; /* * Clear the GATT and force release on last close */ while ((mem = TAILQ_FIRST(&sc->as_memory)) != 0) { if (mem->am_is_bound) AGP_UNBIND_MEMORY(dev, mem); AGP_FREE_MEMORY(dev, mem); } if (sc->as_state == AGP_ACQUIRE_USER) agp_release_helper(dev, AGP_ACQUIRE_USER); sc->as_isopen = 0; device_unbusy(dev); return 0; } static int agp_ioctl(struct cdev *kdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { device_t dev = kdev->si_drv1; switch (cmd) { case AGPIOC_INFO: return agp_info_user(dev, (agp_info *) data); case AGPIOC_ACQUIRE: return agp_acquire_helper(dev, AGP_ACQUIRE_USER); case AGPIOC_RELEASE: return agp_release_helper(dev, AGP_ACQUIRE_USER); case AGPIOC_SETUP: return agp_setup_user(dev, (agp_setup *)data); case AGPIOC_ALLOCATE: return agp_allocate_user(dev, (agp_allocate *)data); case AGPIOC_DEALLOCATE: return agp_deallocate_user(dev, *(int *) data); case AGPIOC_BIND: return agp_bind_user(dev, (agp_bind *)data); case AGPIOC_UNBIND: return agp_unbind_user(dev, (agp_unbind *)data); case AGPIOC_CHIPSET_FLUSH: return agp_chipset_flush(dev); } return EINVAL; } static int agp_mmap(struct cdev *kdev, vm_ooffset_t offset, vm_paddr_t *paddr, int prot, vm_memattr_t *memattr) { device_t dev = kdev->si_drv1; struct agp_softc *sc = device_get_softc(dev); if (offset > AGP_GET_APERTURE(dev)) return -1; if (sc->as_aperture == NULL) return -1; *paddr = rman_get_start(sc->as_aperture) + offset; return 0; } /* Implementation of the kernel api */ device_t agp_find_device() { device_t *children, child; int i, count; if (!agp_devclass) return NULL; if (devclass_get_devices(agp_devclass, &children, &count) != 0) return NULL; child = NULL; for (i = 0; i < count; i++) { if (device_is_attached(children[i])) { child = children[i]; break; } } free(children, M_TEMP); return child; } enum agp_acquire_state agp_state(device_t dev) { struct agp_softc *sc = device_get_softc(dev); return sc->as_state; } void agp_get_info(device_t dev, struct agp_info *info) { struct agp_softc *sc = device_get_softc(dev); info->ai_mode = pci_read_config(dev, agp_find_caps(dev) + AGP_STATUS, 4); if (sc->as_aperture != NULL) info->ai_aperture_base = rman_get_start(sc->as_aperture); else info->ai_aperture_base = 0; info->ai_aperture_size = AGP_GET_APERTURE(dev); info->ai_memory_allowed = sc->as_maxmem; info->ai_memory_used = sc->as_allocated; } int agp_acquire(device_t dev) { return agp_acquire_helper(dev, AGP_ACQUIRE_KERNEL); } int agp_release(device_t dev) { return agp_release_helper(dev, AGP_ACQUIRE_KERNEL); } int agp_enable(device_t dev, u_int32_t mode) { return AGP_ENABLE(dev, mode); } void *agp_alloc_memory(device_t dev, int type, vm_size_t bytes) { return (void *) AGP_ALLOC_MEMORY(dev, type, bytes); } void agp_free_memory(device_t dev, void *handle) { struct agp_memory *mem = (struct agp_memory *) handle; AGP_FREE_MEMORY(dev, mem); } int agp_bind_memory(device_t dev, void *handle, vm_offset_t offset) { struct agp_memory *mem = (struct agp_memory *) handle; return AGP_BIND_MEMORY(dev, mem, offset); } int agp_unbind_memory(device_t dev, void *handle) { struct agp_memory *mem = (struct agp_memory *) handle; return AGP_UNBIND_MEMORY(dev, mem); } void agp_memory_info(device_t dev, void *handle, struct agp_memory_info *mi) { struct agp_memory *mem = (struct agp_memory *) handle; mi->ami_size = mem->am_size; mi->ami_physical = mem->am_physical; mi->ami_offset = mem->am_offset; mi->ami_is_bound = mem->am_is_bound; } Index: user/attilio/rm_vmobj_cache/sys/dev/agp/agp_i810.c =================================================================== --- user/attilio/rm_vmobj_cache/sys/dev/agp/agp_i810.c (revision 267236) +++ user/attilio/rm_vmobj_cache/sys/dev/agp/agp_i810.c (revision 267237) @@ -1,2570 +1,2570 @@ /*- * Copyright (c) 2000 Doug Rabson * Copyright (c) 2000 Ruslan Ermilov * Copyright (c) 2011 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Fixes for 830/845G support: David Dawes * 852GM/855GM/865G support added by David Dawes * * This is generic Intel GTT handling code, morphed from the AGP * bridge code. */ #include __FBSDID("$FreeBSD$"); #if 0 #define KTR_AGP_I810 KTR_DEV #else #define KTR_AGP_I810 0 #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DECLARE(M_AGP); struct agp_i810_match; static int agp_i810_check_active(device_t bridge_dev); static int agp_i830_check_active(device_t bridge_dev); static int agp_i915_check_active(device_t bridge_dev); static int agp_sb_check_active(device_t bridge_dev); static void agp_82852_set_desc(device_t dev, const struct agp_i810_match *match); static void agp_i810_set_desc(device_t dev, const struct agp_i810_match *match); static void agp_i810_dump_regs(device_t dev); static void agp_i830_dump_regs(device_t dev); static void agp_i855_dump_regs(device_t dev); static void agp_i915_dump_regs(device_t dev); static void agp_i965_dump_regs(device_t dev); static void agp_sb_dump_regs(device_t dev); static int agp_i810_get_stolen_size(device_t dev); static int agp_i830_get_stolen_size(device_t dev); static int agp_i915_get_stolen_size(device_t dev); static int agp_sb_get_stolen_size(device_t dev); static int agp_i810_get_gtt_mappable_entries(device_t dev); static int agp_i830_get_gtt_mappable_entries(device_t dev); static int agp_i915_get_gtt_mappable_entries(device_t dev); static int agp_i810_get_gtt_total_entries(device_t dev); static int agp_i965_get_gtt_total_entries(device_t dev); static int agp_gen5_get_gtt_total_entries(device_t dev); static int agp_sb_get_gtt_total_entries(device_t dev); static int agp_i810_install_gatt(device_t dev); static int agp_i830_install_gatt(device_t dev); static void agp_i810_deinstall_gatt(device_t dev); static void agp_i830_deinstall_gatt(device_t dev); static void agp_i810_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical, int flags); static void agp_i830_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical, int flags); static void agp_i915_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical, int flags); static void agp_i965_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical, int flags); static void agp_g4x_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical, int flags); static void agp_sb_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical, int flags); static void agp_i810_write_gtt(device_t dev, u_int index, uint32_t pte); static void agp_i915_write_gtt(device_t dev, u_int index, uint32_t pte); static void agp_i965_write_gtt(device_t dev, u_int index, uint32_t pte); static void agp_g4x_write_gtt(device_t dev, u_int index, uint32_t pte); static void agp_sb_write_gtt(device_t dev, u_int index, uint32_t pte); static u_int32_t agp_i810_read_gtt_pte(device_t dev, u_int index); static u_int32_t agp_i915_read_gtt_pte(device_t dev, u_int index); static u_int32_t agp_i965_read_gtt_pte(device_t dev, u_int index); static u_int32_t agp_g4x_read_gtt_pte(device_t dev, u_int index); static vm_paddr_t agp_i810_read_gtt_pte_paddr(device_t dev, u_int index); static vm_paddr_t agp_i915_read_gtt_pte_paddr(device_t dev, u_int index); static vm_paddr_t agp_sb_read_gtt_pte_paddr(device_t dev, u_int index); static int agp_i810_set_aperture(device_t dev, u_int32_t aperture); static int agp_i830_set_aperture(device_t dev, u_int32_t aperture); static int agp_i915_set_aperture(device_t dev, u_int32_t aperture); static int agp_i810_chipset_flush_setup(device_t dev); static int agp_i915_chipset_flush_setup(device_t dev); static int agp_i965_chipset_flush_setup(device_t dev); static void agp_i810_chipset_flush_teardown(device_t dev); static void agp_i915_chipset_flush_teardown(device_t dev); static void agp_i965_chipset_flush_teardown(device_t dev); static void agp_i810_chipset_flush(device_t dev); static void agp_i830_chipset_flush(device_t dev); static void agp_i915_chipset_flush(device_t dev); enum { CHIP_I810, /* i810/i815 */ CHIP_I830, /* 830M/845G */ CHIP_I855, /* 852GM/855GM/865G */ CHIP_I915, /* 915G/915GM */ CHIP_I965, /* G965 */ CHIP_G33, /* G33/Q33/Q35 */ CHIP_IGD, /* Pineview */ CHIP_G4X, /* G45/Q45 */ CHIP_SB, /* SandyBridge */ }; /* The i810 through i855 have the registers at BAR 1, and the GATT gets * allocated by us. The i915 has registers in BAR 0 and the GATT is at the * start of the stolen memory, and should only be accessed by the OS through * BAR 3. The G965 has registers and GATT in the same BAR (0) -- first 512KB * is registers, second 512KB is GATT. */ static struct resource_spec agp_i810_res_spec[] = { { SYS_RES_MEMORY, AGP_I810_MMADR, RF_ACTIVE | RF_SHAREABLE }, { -1, 0 } }; static struct resource_spec agp_i915_res_spec[] = { { SYS_RES_MEMORY, AGP_I915_MMADR, RF_ACTIVE | RF_SHAREABLE }, { SYS_RES_MEMORY, AGP_I915_GTTADR, RF_ACTIVE | RF_SHAREABLE }, { -1, 0 } }; static struct resource_spec agp_i965_res_spec[] = { { SYS_RES_MEMORY, AGP_I965_GTTMMADR, RF_ACTIVE | RF_SHAREABLE }, { -1, 0 } }; static struct resource_spec agp_g4x_res_spec[] = { { SYS_RES_MEMORY, AGP_G4X_MMADR, RF_ACTIVE | RF_SHAREABLE }, { SYS_RES_MEMORY, AGP_G4X_GTTADR, RF_ACTIVE | RF_SHAREABLE }, { -1, 0 } }; struct agp_i810_softc { struct agp_softc agp; u_int32_t initial_aperture; /* aperture size at startup */ struct agp_gatt *gatt; u_int32_t dcache_size; /* i810 only */ u_int32_t stolen; /* number of i830/845 gtt entries for stolen memory */ u_int stolen_size; /* BIOS-reserved graphics memory */ u_int gtt_total_entries; /* Total number of gtt ptes */ u_int gtt_mappable_entries; /* Number of gtt ptes mappable by CPU */ device_t bdev; /* bridge device */ void *argb_cursor; /* contigmalloc area for ARGB cursor */ struct resource *sc_res[2]; const struct agp_i810_match *match; int sc_flush_page_rid; struct resource *sc_flush_page_res; void *sc_flush_page_vaddr; int sc_bios_allocated_flush_page; }; static device_t intel_agp; struct agp_i810_driver { int chiptype; int gen; int busdma_addr_mask_sz; struct resource_spec *res_spec; int (*check_active)(device_t); void (*set_desc)(device_t, const struct agp_i810_match *); void (*dump_regs)(device_t); int (*get_stolen_size)(device_t); int (*get_gtt_total_entries)(device_t); int (*get_gtt_mappable_entries)(device_t); int (*install_gatt)(device_t); void (*deinstall_gatt)(device_t); void (*write_gtt)(device_t, u_int, uint32_t); void (*install_gtt_pte)(device_t, u_int, vm_offset_t, int); u_int32_t (*read_gtt_pte)(device_t, u_int); vm_paddr_t (*read_gtt_pte_paddr)(device_t , u_int); int (*set_aperture)(device_t, u_int32_t); int (*chipset_flush_setup)(device_t); void (*chipset_flush_teardown)(device_t); void (*chipset_flush)(device_t); }; static const struct agp_i810_driver agp_i810_i810_driver = { .chiptype = CHIP_I810, .gen = 1, .busdma_addr_mask_sz = 32, .res_spec = agp_i810_res_spec, .check_active = agp_i810_check_active, .set_desc = agp_i810_set_desc, .dump_regs = agp_i810_dump_regs, .get_stolen_size = agp_i810_get_stolen_size, .get_gtt_mappable_entries = agp_i810_get_gtt_mappable_entries, .get_gtt_total_entries = agp_i810_get_gtt_total_entries, .install_gatt = agp_i810_install_gatt, .deinstall_gatt = agp_i810_deinstall_gatt, .write_gtt = agp_i810_write_gtt, .install_gtt_pte = agp_i810_install_gtt_pte, .read_gtt_pte = agp_i810_read_gtt_pte, .read_gtt_pte_paddr = agp_i810_read_gtt_pte_paddr, .set_aperture = agp_i810_set_aperture, .chipset_flush_setup = agp_i810_chipset_flush_setup, .chipset_flush_teardown = agp_i810_chipset_flush_teardown, .chipset_flush = agp_i810_chipset_flush, }; static const struct agp_i810_driver agp_i810_i815_driver = { .chiptype = CHIP_I810, .gen = 2, .busdma_addr_mask_sz = 32, .res_spec = agp_i810_res_spec, .check_active = agp_i810_check_active, .set_desc = agp_i810_set_desc, .dump_regs = agp_i810_dump_regs, .get_stolen_size = agp_i810_get_stolen_size, .get_gtt_mappable_entries = agp_i830_get_gtt_mappable_entries, .get_gtt_total_entries = agp_i810_get_gtt_total_entries, .install_gatt = agp_i810_install_gatt, .deinstall_gatt = agp_i810_deinstall_gatt, .write_gtt = agp_i810_write_gtt, .install_gtt_pte = agp_i810_install_gtt_pte, .read_gtt_pte = agp_i810_read_gtt_pte, .read_gtt_pte_paddr = agp_i810_read_gtt_pte_paddr, .set_aperture = agp_i810_set_aperture, .chipset_flush_setup = agp_i810_chipset_flush_setup, .chipset_flush_teardown = agp_i810_chipset_flush_teardown, .chipset_flush = agp_i830_chipset_flush, }; static const struct agp_i810_driver agp_i810_i830_driver = { .chiptype = CHIP_I830, .gen = 2, .busdma_addr_mask_sz = 32, .res_spec = agp_i810_res_spec, .check_active = agp_i830_check_active, .set_desc = agp_i810_set_desc, .dump_regs = agp_i830_dump_regs, .get_stolen_size = agp_i830_get_stolen_size, .get_gtt_mappable_entries = agp_i830_get_gtt_mappable_entries, .get_gtt_total_entries = agp_i810_get_gtt_total_entries, .install_gatt = agp_i830_install_gatt, .deinstall_gatt = agp_i830_deinstall_gatt, .write_gtt = agp_i810_write_gtt, .install_gtt_pte = agp_i830_install_gtt_pte, .read_gtt_pte = agp_i810_read_gtt_pte, .read_gtt_pte_paddr = agp_i810_read_gtt_pte_paddr, .set_aperture = agp_i830_set_aperture, .chipset_flush_setup = agp_i810_chipset_flush_setup, .chipset_flush_teardown = agp_i810_chipset_flush_teardown, .chipset_flush = agp_i830_chipset_flush, }; static const struct agp_i810_driver agp_i810_i855_driver = { .chiptype = CHIP_I855, .gen = 2, .busdma_addr_mask_sz = 32, .res_spec = agp_i810_res_spec, .check_active = agp_i830_check_active, .set_desc = agp_82852_set_desc, .dump_regs = agp_i855_dump_regs, .get_stolen_size = agp_i915_get_stolen_size, .get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries, .get_gtt_total_entries = agp_i810_get_gtt_total_entries, .install_gatt = agp_i830_install_gatt, .deinstall_gatt = agp_i830_deinstall_gatt, .write_gtt = agp_i810_write_gtt, .install_gtt_pte = agp_i830_install_gtt_pte, .read_gtt_pte = agp_i810_read_gtt_pte, .read_gtt_pte_paddr = agp_i810_read_gtt_pte_paddr, .set_aperture = agp_i830_set_aperture, .chipset_flush_setup = agp_i810_chipset_flush_setup, .chipset_flush_teardown = agp_i810_chipset_flush_teardown, .chipset_flush = agp_i830_chipset_flush, }; static const struct agp_i810_driver agp_i810_i865_driver = { .chiptype = CHIP_I855, .gen = 2, .busdma_addr_mask_sz = 32, .res_spec = agp_i810_res_spec, .check_active = agp_i830_check_active, .set_desc = agp_i810_set_desc, .dump_regs = agp_i855_dump_regs, .get_stolen_size = agp_i915_get_stolen_size, .get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries, .get_gtt_total_entries = agp_i810_get_gtt_total_entries, .install_gatt = agp_i830_install_gatt, .deinstall_gatt = agp_i830_deinstall_gatt, .write_gtt = agp_i810_write_gtt, .install_gtt_pte = agp_i830_install_gtt_pte, .read_gtt_pte = agp_i810_read_gtt_pte, .read_gtt_pte_paddr = agp_i810_read_gtt_pte_paddr, .set_aperture = agp_i915_set_aperture, .chipset_flush_setup = agp_i810_chipset_flush_setup, .chipset_flush_teardown = agp_i810_chipset_flush_teardown, .chipset_flush = agp_i830_chipset_flush, }; static const struct agp_i810_driver agp_i810_i915_driver = { .chiptype = CHIP_I915, .gen = 3, .busdma_addr_mask_sz = 32, .res_spec = agp_i915_res_spec, .check_active = agp_i915_check_active, .set_desc = agp_i810_set_desc, .dump_regs = agp_i915_dump_regs, .get_stolen_size = agp_i915_get_stolen_size, .get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries, .get_gtt_total_entries = agp_i810_get_gtt_total_entries, .install_gatt = agp_i830_install_gatt, .deinstall_gatt = agp_i830_deinstall_gatt, .write_gtt = agp_i915_write_gtt, .install_gtt_pte = agp_i915_install_gtt_pte, .read_gtt_pte = agp_i915_read_gtt_pte, .read_gtt_pte_paddr = agp_i915_read_gtt_pte_paddr, .set_aperture = agp_i915_set_aperture, .chipset_flush_setup = agp_i915_chipset_flush_setup, .chipset_flush_teardown = agp_i915_chipset_flush_teardown, .chipset_flush = agp_i915_chipset_flush, }; static const struct agp_i810_driver agp_i810_g965_driver = { .chiptype = CHIP_I965, .gen = 4, .busdma_addr_mask_sz = 36, .res_spec = agp_i965_res_spec, .check_active = agp_i915_check_active, .set_desc = agp_i810_set_desc, .dump_regs = agp_i965_dump_regs, .get_stolen_size = agp_i915_get_stolen_size, .get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries, .get_gtt_total_entries = agp_i965_get_gtt_total_entries, .install_gatt = agp_i830_install_gatt, .deinstall_gatt = agp_i830_deinstall_gatt, .write_gtt = agp_i965_write_gtt, .install_gtt_pte = agp_i965_install_gtt_pte, .read_gtt_pte = agp_i965_read_gtt_pte, .read_gtt_pte_paddr = agp_i915_read_gtt_pte_paddr, .set_aperture = agp_i915_set_aperture, .chipset_flush_setup = agp_i965_chipset_flush_setup, .chipset_flush_teardown = agp_i965_chipset_flush_teardown, .chipset_flush = agp_i915_chipset_flush, }; static const struct agp_i810_driver agp_i810_g33_driver = { .chiptype = CHIP_G33, .gen = 3, .busdma_addr_mask_sz = 36, .res_spec = agp_i915_res_spec, .check_active = agp_i915_check_active, .set_desc = agp_i810_set_desc, .dump_regs = agp_i965_dump_regs, .get_stolen_size = agp_i915_get_stolen_size, .get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries, .get_gtt_total_entries = agp_i965_get_gtt_total_entries, .install_gatt = agp_i830_install_gatt, .deinstall_gatt = agp_i830_deinstall_gatt, .write_gtt = agp_i915_write_gtt, .install_gtt_pte = agp_i915_install_gtt_pte, .read_gtt_pte = agp_i915_read_gtt_pte, .read_gtt_pte_paddr = agp_i915_read_gtt_pte_paddr, .set_aperture = agp_i915_set_aperture, .chipset_flush_setup = agp_i965_chipset_flush_setup, .chipset_flush_teardown = agp_i965_chipset_flush_teardown, .chipset_flush = agp_i915_chipset_flush, }; static const struct agp_i810_driver agp_i810_igd_driver = { .chiptype = CHIP_IGD, .gen = 3, .busdma_addr_mask_sz = 36, .res_spec = agp_i915_res_spec, .check_active = agp_i915_check_active, .set_desc = agp_i810_set_desc, .dump_regs = agp_i915_dump_regs, .get_stolen_size = agp_i915_get_stolen_size, .get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries, .get_gtt_total_entries = agp_i965_get_gtt_total_entries, .install_gatt = agp_i830_install_gatt, .deinstall_gatt = agp_i830_deinstall_gatt, .write_gtt = agp_i915_write_gtt, .install_gtt_pte = agp_i915_install_gtt_pte, .read_gtt_pte = agp_i915_read_gtt_pte, .read_gtt_pte_paddr = agp_i915_read_gtt_pte_paddr, .set_aperture = agp_i915_set_aperture, .chipset_flush_setup = agp_i965_chipset_flush_setup, .chipset_flush_teardown = agp_i965_chipset_flush_teardown, .chipset_flush = agp_i915_chipset_flush, }; static const struct agp_i810_driver agp_i810_g4x_driver = { .chiptype = CHIP_G4X, .gen = 5, .busdma_addr_mask_sz = 36, .res_spec = agp_i965_res_spec, .check_active = agp_i915_check_active, .set_desc = agp_i810_set_desc, .dump_regs = agp_i965_dump_regs, .get_stolen_size = agp_i915_get_stolen_size, .get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries, .get_gtt_total_entries = agp_gen5_get_gtt_total_entries, .install_gatt = agp_i830_install_gatt, .deinstall_gatt = agp_i830_deinstall_gatt, .write_gtt = agp_g4x_write_gtt, .install_gtt_pte = agp_g4x_install_gtt_pte, .read_gtt_pte = agp_g4x_read_gtt_pte, .read_gtt_pte_paddr = agp_i915_read_gtt_pte_paddr, .set_aperture = agp_i915_set_aperture, .chipset_flush_setup = agp_i965_chipset_flush_setup, .chipset_flush_teardown = agp_i965_chipset_flush_teardown, .chipset_flush = agp_i915_chipset_flush, }; static const struct agp_i810_driver agp_i810_sb_driver = { .chiptype = CHIP_SB, .gen = 6, .busdma_addr_mask_sz = 40, .res_spec = agp_g4x_res_spec, .check_active = agp_sb_check_active, .set_desc = agp_i810_set_desc, .dump_regs = agp_sb_dump_regs, .get_stolen_size = agp_sb_get_stolen_size, .get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries, .get_gtt_total_entries = agp_sb_get_gtt_total_entries, .install_gatt = agp_i830_install_gatt, .deinstall_gatt = agp_i830_deinstall_gatt, .write_gtt = agp_sb_write_gtt, .install_gtt_pte = agp_sb_install_gtt_pte, .read_gtt_pte = agp_g4x_read_gtt_pte, .read_gtt_pte_paddr = agp_sb_read_gtt_pte_paddr, .set_aperture = agp_i915_set_aperture, .chipset_flush_setup = agp_i810_chipset_flush_setup, .chipset_flush_teardown = agp_i810_chipset_flush_teardown, .chipset_flush = agp_i810_chipset_flush, }; /* For adding new devices, devid is the id of the graphics controller * (pci:0:2:0, for example). The placeholder (usually at pci:0:2:1) for the * second head should never be added. The bridge_offset is the offset to * subtract from devid to get the id of the hostb that the device is on. */ static const struct agp_i810_match { int devid; char *name; const struct agp_i810_driver *driver; } agp_i810_matches[] = { { .devid = 0x71218086, .name = "Intel 82810 (i810 GMCH) SVGA controller", .driver = &agp_i810_i810_driver }, { .devid = 0x71238086, .name = "Intel 82810-DC100 (i810-DC100 GMCH) SVGA controller", .driver = &agp_i810_i810_driver }, { .devid = 0x71258086, .name = "Intel 82810E (i810E GMCH) SVGA controller", .driver = &agp_i810_i810_driver }, { .devid = 0x11328086, .name = "Intel 82815 (i815 GMCH) SVGA controller", .driver = &agp_i810_i815_driver }, { .devid = 0x35778086, .name = "Intel 82830M (830M GMCH) SVGA controller", .driver = &agp_i810_i830_driver }, { .devid = 0x25628086, .name = "Intel 82845M (845M GMCH) SVGA controller", .driver = &agp_i810_i830_driver }, { .devid = 0x35828086, .name = "Intel 82852/855GM SVGA controller", .driver = &agp_i810_i855_driver }, { .devid = 0x25728086, .name = "Intel 82865G (865G GMCH) SVGA controller", .driver = &agp_i810_i865_driver }, { .devid = 0x25828086, .name = "Intel 82915G (915G GMCH) SVGA controller", .driver = &agp_i810_i915_driver }, { .devid = 0x258A8086, .name = "Intel E7221 SVGA controller", .driver = &agp_i810_i915_driver }, { .devid = 0x25928086, .name = "Intel 82915GM (915GM GMCH) SVGA controller", .driver = &agp_i810_i915_driver }, { .devid = 0x27728086, .name = "Intel 82945G (945G GMCH) SVGA controller", .driver = &agp_i810_i915_driver }, { .devid = 0x27A28086, .name = "Intel 82945GM (945GM GMCH) SVGA controller", .driver = &agp_i810_i915_driver }, { .devid = 0x27AE8086, .name = "Intel 945GME SVGA controller", .driver = &agp_i810_i915_driver }, { .devid = 0x29728086, .name = "Intel 946GZ SVGA controller", .driver = &agp_i810_g965_driver }, { .devid = 0x29828086, .name = "Intel G965 SVGA controller", .driver = &agp_i810_g965_driver }, { .devid = 0x29928086, .name = "Intel Q965 SVGA controller", .driver = &agp_i810_g965_driver }, { .devid = 0x29A28086, .name = "Intel G965 SVGA controller", .driver = &agp_i810_g965_driver }, { .devid = 0x29B28086, .name = "Intel Q35 SVGA controller", .driver = &agp_i810_g33_driver }, { .devid = 0x29C28086, .name = "Intel G33 SVGA controller", .driver = &agp_i810_g33_driver }, { .devid = 0x29D28086, .name = "Intel Q33 SVGA controller", .driver = &agp_i810_g33_driver }, { .devid = 0xA0018086, .name = "Intel Pineview SVGA controller", .driver = &agp_i810_igd_driver }, { .devid = 0xA0118086, .name = "Intel Pineview (M) SVGA controller", .driver = &agp_i810_igd_driver }, { .devid = 0x2A028086, .name = "Intel GM965 SVGA controller", .driver = &agp_i810_g965_driver }, { .devid = 0x2A128086, .name = "Intel GME965 SVGA controller", .driver = &agp_i810_g965_driver }, { .devid = 0x2A428086, .name = "Intel GM45 SVGA controller", .driver = &agp_i810_g4x_driver }, { .devid = 0x2E028086, .name = "Intel Eaglelake SVGA controller", .driver = &agp_i810_g4x_driver }, { .devid = 0x2E128086, .name = "Intel Q45 SVGA controller", .driver = &agp_i810_g4x_driver }, { .devid = 0x2E228086, .name = "Intel G45 SVGA controller", .driver = &agp_i810_g4x_driver }, { .devid = 0x2E328086, .name = "Intel G41 SVGA controller", .driver = &agp_i810_g4x_driver }, { .devid = 0x00428086, .name = "Intel Ironlake (D) SVGA controller", .driver = &agp_i810_g4x_driver }, { .devid = 0x00468086, .name = "Intel Ironlake (M) SVGA controller", .driver = &agp_i810_g4x_driver }, { .devid = 0x01028086, .name = "SandyBridge desktop GT1 IG", .driver = &agp_i810_sb_driver }, { .devid = 0x01128086, .name = "SandyBridge desktop GT2 IG", .driver = &agp_i810_sb_driver }, { .devid = 0x01228086, .name = "SandyBridge desktop GT2+ IG", .driver = &agp_i810_sb_driver }, { .devid = 0x01068086, .name = "SandyBridge mobile GT1 IG", .driver = &agp_i810_sb_driver }, { .devid = 0x01168086, .name = "SandyBridge mobile GT2 IG", .driver = &agp_i810_sb_driver }, { .devid = 0x01268086, .name = "SandyBridge mobile GT2+ IG", .driver = &agp_i810_sb_driver }, { .devid = 0x010a8086, .name = "SandyBridge server IG", .driver = &agp_i810_sb_driver }, { .devid = 0x01528086, .name = "IvyBridge desktop GT1 IG", .driver = &agp_i810_sb_driver }, { .devid = 0x01628086, .name = "IvyBridge desktop GT2 IG", .driver = &agp_i810_sb_driver }, { .devid = 0x01568086, .name = "IvyBridge mobile GT1 IG", .driver = &agp_i810_sb_driver }, { .devid = 0x01668086, .name = "IvyBridge mobile GT2 IG", .driver = &agp_i810_sb_driver }, { .devid = 0x015a8086, .name = "IvyBridge server GT1 IG", .driver = &agp_i810_sb_driver }, { .devid = 0x016a8086, .name = "IvyBridge server GT2 IG", .driver = &agp_i810_sb_driver }, { .devid = 0, } }; static const struct agp_i810_match* agp_i810_match(device_t dev) { int i, devid; if (pci_get_class(dev) != PCIC_DISPLAY || pci_get_subclass(dev) != PCIS_DISPLAY_VGA) return (NULL); devid = pci_get_devid(dev); for (i = 0; agp_i810_matches[i].devid != 0; i++) { if (agp_i810_matches[i].devid == devid) break; } if (agp_i810_matches[i].devid == 0) return (NULL); else return (&agp_i810_matches[i]); } /* * Find bridge device. */ static device_t agp_i810_find_bridge(device_t dev) { return (pci_find_dbsf(0, 0, 0, 0)); } static void agp_i810_identify(driver_t *driver, device_t parent) { if (device_find_child(parent, "agp", -1) == NULL && agp_i810_match(parent)) device_add_child(parent, "agp", -1); } static int agp_i810_check_active(device_t bridge_dev) { u_int8_t smram; smram = pci_read_config(bridge_dev, AGP_I810_SMRAM, 1); if ((smram & AGP_I810_SMRAM_GMS) == AGP_I810_SMRAM_GMS_DISABLED) return (ENXIO); return (0); } static int agp_i830_check_active(device_t bridge_dev) { int gcc1; gcc1 = pci_read_config(bridge_dev, AGP_I830_GCC1, 1); if ((gcc1 & AGP_I830_GCC1_DEV2) == AGP_I830_GCC1_DEV2_DISABLED) return (ENXIO); return (0); } static int agp_i915_check_active(device_t bridge_dev) { int deven; deven = pci_read_config(bridge_dev, AGP_I915_DEVEN, 4); if ((deven & AGP_I915_DEVEN_D2F0) == AGP_I915_DEVEN_D2F0_DISABLED) return (ENXIO); return (0); } static int agp_sb_check_active(device_t bridge_dev) { int deven; deven = pci_read_config(bridge_dev, AGP_I915_DEVEN, 4); if ((deven & AGP_SB_DEVEN_D2EN) == AGP_SB_DEVEN_D2EN_DISABLED) return (ENXIO); return (0); } static void agp_82852_set_desc(device_t dev, const struct agp_i810_match *match) { switch (pci_read_config(dev, AGP_I85X_CAPID, 1)) { case AGP_I855_GME: device_set_desc(dev, "Intel 82855GME (855GME GMCH) SVGA controller"); break; case AGP_I855_GM: device_set_desc(dev, "Intel 82855GM (855GM GMCH) SVGA controller"); break; case AGP_I852_GME: device_set_desc(dev, "Intel 82852GME (852GME GMCH) SVGA controller"); break; case AGP_I852_GM: device_set_desc(dev, "Intel 82852GM (852GM GMCH) SVGA controller"); break; default: device_set_desc(dev, "Intel 8285xM (85xGM GMCH) SVGA controller"); break; } } static void agp_i810_set_desc(device_t dev, const struct agp_i810_match *match) { device_set_desc(dev, match->name); } static int agp_i810_probe(device_t dev) { device_t bdev; const struct agp_i810_match *match; int err; if (resource_disabled("agp", device_get_unit(dev))) return (ENXIO); match = agp_i810_match(dev); if (match == NULL) return (ENXIO); bdev = agp_i810_find_bridge(dev); if (bdev == NULL) { if (bootverbose) printf("I810: can't find bridge device\n"); return (ENXIO); } /* * checking whether internal graphics device has been activated. */ err = match->driver->check_active(bdev); if (err != 0) { if (bootverbose) printf("i810: disabled, not probing\n"); return (err); } match->driver->set_desc(dev, match); return (BUS_PROBE_DEFAULT); } static void agp_i810_dump_regs(device_t dev) { struct agp_i810_softc *sc = device_get_softc(dev); device_printf(dev, "AGP_I810_PGTBL_CTL: %08x\n", bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL)); device_printf(dev, "AGP_I810_MISCC: 0x%04x\n", pci_read_config(sc->bdev, AGP_I810_MISCC, 2)); } static void agp_i830_dump_regs(device_t dev) { struct agp_i810_softc *sc = device_get_softc(dev); device_printf(dev, "AGP_I810_PGTBL_CTL: %08x\n", bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL)); device_printf(dev, "AGP_I830_GCC1: 0x%02x\n", pci_read_config(sc->bdev, AGP_I830_GCC1, 1)); } static void agp_i855_dump_regs(device_t dev) { struct agp_i810_softc *sc = device_get_softc(dev); device_printf(dev, "AGP_I810_PGTBL_CTL: %08x\n", bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL)); device_printf(dev, "AGP_I855_GCC1: 0x%02x\n", pci_read_config(sc->bdev, AGP_I855_GCC1, 1)); } static void agp_i915_dump_regs(device_t dev) { struct agp_i810_softc *sc = device_get_softc(dev); device_printf(dev, "AGP_I810_PGTBL_CTL: %08x\n", bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL)); device_printf(dev, "AGP_I855_GCC1: 0x%02x\n", pci_read_config(sc->bdev, AGP_I855_GCC1, 1)); device_printf(dev, "AGP_I915_MSAC: 0x%02x\n", pci_read_config(sc->bdev, AGP_I915_MSAC, 1)); } static void agp_i965_dump_regs(device_t dev) { struct agp_i810_softc *sc = device_get_softc(dev); device_printf(dev, "AGP_I965_PGTBL_CTL2: %08x\n", bus_read_4(sc->sc_res[0], AGP_I965_PGTBL_CTL2)); device_printf(dev, "AGP_I855_GCC1: 0x%02x\n", pci_read_config(sc->bdev, AGP_I855_GCC1, 1)); device_printf(dev, "AGP_I965_MSAC: 0x%02x\n", pci_read_config(sc->bdev, AGP_I965_MSAC, 1)); } static void agp_sb_dump_regs(device_t dev) { struct agp_i810_softc *sc = device_get_softc(dev); device_printf(dev, "AGP_SNB_GFX_MODE: %08x\n", bus_read_4(sc->sc_res[0], AGP_SNB_GFX_MODE)); device_printf(dev, "AGP_SNB_GCC1: 0x%04x\n", pci_read_config(sc->bdev, AGP_SNB_GCC1, 2)); } static int agp_i810_get_stolen_size(device_t dev) { struct agp_i810_softc *sc; sc = device_get_softc(dev); sc->stolen = 0; sc->stolen_size = 0; return (0); } static int agp_i830_get_stolen_size(device_t dev) { struct agp_i810_softc *sc; unsigned int gcc1; sc = device_get_softc(dev); gcc1 = pci_read_config(sc->bdev, AGP_I830_GCC1, 1); switch (gcc1 & AGP_I830_GCC1_GMS) { case AGP_I830_GCC1_GMS_STOLEN_512: sc->stolen = (512 - 132) * 1024 / 4096; sc->stolen_size = 512 * 1024; break; case AGP_I830_GCC1_GMS_STOLEN_1024: sc->stolen = (1024 - 132) * 1024 / 4096; sc->stolen_size = 1024 * 1024; break; case AGP_I830_GCC1_GMS_STOLEN_8192: sc->stolen = (8192 - 132) * 1024 / 4096; sc->stolen_size = 8192 * 1024; break; default: sc->stolen = 0; device_printf(dev, "unknown memory configuration, disabling (GCC1 %x)\n", gcc1); return (EINVAL); } return (0); } static int agp_i915_get_stolen_size(device_t dev) { struct agp_i810_softc *sc; unsigned int gcc1, stolen, gtt_size; sc = device_get_softc(dev); /* * Stolen memory is set up at the beginning of the aperture by * the BIOS, consisting of the GATT followed by 4kb for the * BIOS display. */ switch (sc->match->driver->chiptype) { case CHIP_I855: gtt_size = 128; break; case CHIP_I915: gtt_size = 256; break; case CHIP_I965: switch (bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL) & AGP_I810_PGTBL_SIZE_MASK) { case AGP_I810_PGTBL_SIZE_128KB: gtt_size = 128; break; case AGP_I810_PGTBL_SIZE_256KB: gtt_size = 256; break; case AGP_I810_PGTBL_SIZE_512KB: gtt_size = 512; break; case AGP_I965_PGTBL_SIZE_1MB: gtt_size = 1024; break; case AGP_I965_PGTBL_SIZE_2MB: gtt_size = 2048; break; case AGP_I965_PGTBL_SIZE_1_5MB: gtt_size = 1024 + 512; break; default: device_printf(dev, "Bad PGTBL size\n"); return (EINVAL); } break; case CHIP_G33: gcc1 = pci_read_config(sc->bdev, AGP_I855_GCC1, 2); switch (gcc1 & AGP_G33_MGGC_GGMS_MASK) { case AGP_G33_MGGC_GGMS_SIZE_1M: gtt_size = 1024; break; case AGP_G33_MGGC_GGMS_SIZE_2M: gtt_size = 2048; break; default: device_printf(dev, "Bad PGTBL size\n"); return (EINVAL); } break; case CHIP_IGD: case CHIP_G4X: gtt_size = 0; break; default: device_printf(dev, "Bad chiptype\n"); return (EINVAL); } /* GCC1 is called MGGC on i915+ */ gcc1 = pci_read_config(sc->bdev, AGP_I855_GCC1, 1); switch (gcc1 & AGP_I855_GCC1_GMS) { case AGP_I855_GCC1_GMS_STOLEN_1M: stolen = 1024; break; case AGP_I855_GCC1_GMS_STOLEN_4M: stolen = 4 * 1024; break; case AGP_I855_GCC1_GMS_STOLEN_8M: stolen = 8 * 1024; break; case AGP_I855_GCC1_GMS_STOLEN_16M: stolen = 16 * 1024; break; case AGP_I855_GCC1_GMS_STOLEN_32M: stolen = 32 * 1024; break; case AGP_I915_GCC1_GMS_STOLEN_48M: stolen = sc->match->driver->gen > 2 ? 48 * 1024 : 0; break; case AGP_I915_GCC1_GMS_STOLEN_64M: stolen = sc->match->driver->gen > 2 ? 64 * 1024 : 0; break; case AGP_G33_GCC1_GMS_STOLEN_128M: stolen = sc->match->driver->gen > 2 ? 128 * 1024 : 0; break; case AGP_G33_GCC1_GMS_STOLEN_256M: stolen = sc->match->driver->gen > 2 ? 256 * 1024 : 0; break; case AGP_G4X_GCC1_GMS_STOLEN_96M: if (sc->match->driver->chiptype == CHIP_I965 || sc->match->driver->chiptype == CHIP_G4X) stolen = 96 * 1024; else stolen = 0; break; case AGP_G4X_GCC1_GMS_STOLEN_160M: if (sc->match->driver->chiptype == CHIP_I965 || sc->match->driver->chiptype == CHIP_G4X) stolen = 160 * 1024; else stolen = 0; break; case AGP_G4X_GCC1_GMS_STOLEN_224M: if (sc->match->driver->chiptype == CHIP_I965 || sc->match->driver->chiptype == CHIP_G4X) stolen = 224 * 1024; else stolen = 0; break; case AGP_G4X_GCC1_GMS_STOLEN_352M: if (sc->match->driver->chiptype == CHIP_I965 || sc->match->driver->chiptype == CHIP_G4X) stolen = 352 * 1024; else stolen = 0; break; default: device_printf(dev, "unknown memory configuration, disabling (GCC1 %x)\n", gcc1); return (EINVAL); } gtt_size += 4; sc->stolen_size = stolen * 1024; sc->stolen = (stolen - gtt_size) * 1024 / 4096; return (0); } static int agp_sb_get_stolen_size(device_t dev) { struct agp_i810_softc *sc; uint16_t gmch_ctl; sc = device_get_softc(dev); gmch_ctl = pci_read_config(sc->bdev, AGP_SNB_GCC1, 2); switch (gmch_ctl & AGP_SNB_GMCH_GMS_STOLEN_MASK) { case AGP_SNB_GMCH_GMS_STOLEN_32M: sc->stolen_size = 32 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_64M: sc->stolen_size = 64 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_96M: sc->stolen_size = 96 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_128M: sc->stolen_size = 128 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_160M: sc->stolen_size = 160 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_192M: sc->stolen_size = 192 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_224M: sc->stolen_size = 224 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_256M: sc->stolen_size = 256 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_288M: sc->stolen_size = 288 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_320M: sc->stolen_size = 320 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_352M: sc->stolen_size = 352 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_384M: sc->stolen_size = 384 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_416M: sc->stolen_size = 416 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_448M: sc->stolen_size = 448 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_480M: sc->stolen_size = 480 * 1024 * 1024; break; case AGP_SNB_GMCH_GMS_STOLEN_512M: sc->stolen_size = 512 * 1024 * 1024; break; } sc->stolen = (sc->stolen_size - 4) / 4096; return (0); } static int agp_i810_get_gtt_mappable_entries(device_t dev) { struct agp_i810_softc *sc; uint32_t ap; uint16_t miscc; sc = device_get_softc(dev); miscc = pci_read_config(sc->bdev, AGP_I810_MISCC, 2); if ((miscc & AGP_I810_MISCC_WINSIZE) == AGP_I810_MISCC_WINSIZE_32) ap = 32; else ap = 64; sc->gtt_mappable_entries = (ap * 1024 * 1024) >> AGP_PAGE_SHIFT; return (0); } static int agp_i830_get_gtt_mappable_entries(device_t dev) { struct agp_i810_softc *sc; uint32_t ap; uint16_t gmch_ctl; sc = device_get_softc(dev); gmch_ctl = pci_read_config(sc->bdev, AGP_I830_GCC1, 2); if ((gmch_ctl & AGP_I830_GCC1_GMASIZE) == AGP_I830_GCC1_GMASIZE_64) ap = 64; else ap = 128; sc->gtt_mappable_entries = (ap * 1024 * 1024) >> AGP_PAGE_SHIFT; return (0); } static int agp_i915_get_gtt_mappable_entries(device_t dev) { struct agp_i810_softc *sc; uint32_t ap; sc = device_get_softc(dev); ap = AGP_GET_APERTURE(dev); sc->gtt_mappable_entries = ap >> AGP_PAGE_SHIFT; return (0); } static int agp_i810_get_gtt_total_entries(device_t dev) { struct agp_i810_softc *sc; sc = device_get_softc(dev); sc->gtt_total_entries = sc->gtt_mappable_entries; return (0); } static int agp_i965_get_gtt_total_entries(device_t dev) { struct agp_i810_softc *sc; uint32_t pgetbl_ctl; int error; sc = device_get_softc(dev); error = 0; pgetbl_ctl = bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL); switch (pgetbl_ctl & AGP_I810_PGTBL_SIZE_MASK) { case AGP_I810_PGTBL_SIZE_128KB: sc->gtt_total_entries = 128 * 1024 / 4; break; case AGP_I810_PGTBL_SIZE_256KB: sc->gtt_total_entries = 256 * 1024 / 4; break; case AGP_I810_PGTBL_SIZE_512KB: sc->gtt_total_entries = 512 * 1024 / 4; break; /* GTT pagetable sizes bigger than 512KB are not possible on G33! */ case AGP_I810_PGTBL_SIZE_1MB: sc->gtt_total_entries = 1024 * 1024 / 4; break; case AGP_I810_PGTBL_SIZE_2MB: sc->gtt_total_entries = 2 * 1024 * 1024 / 4; break; case AGP_I810_PGTBL_SIZE_1_5MB: sc->gtt_total_entries = (1024 + 512) * 1024 / 4; break; default: device_printf(dev, "Unknown page table size\n"); error = ENXIO; } return (error); } static void agp_gen5_adjust_pgtbl_size(device_t dev, uint32_t sz) { struct agp_i810_softc *sc; uint32_t pgetbl_ctl, pgetbl_ctl2; sc = device_get_softc(dev); /* Disable per-process page table. */ pgetbl_ctl2 = bus_read_4(sc->sc_res[0], AGP_I965_PGTBL_CTL2); pgetbl_ctl2 &= ~AGP_I810_PGTBL_ENABLED; bus_write_4(sc->sc_res[0], AGP_I965_PGTBL_CTL2, pgetbl_ctl2); /* Write the new ggtt size. */ pgetbl_ctl = bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL); pgetbl_ctl &= ~AGP_I810_PGTBL_SIZE_MASK; pgetbl_ctl |= sz; bus_write_4(sc->sc_res[0], AGP_I810_PGTBL_CTL, pgetbl_ctl); } static int agp_gen5_get_gtt_total_entries(device_t dev) { struct agp_i810_softc *sc; uint16_t gcc1; sc = device_get_softc(dev); gcc1 = pci_read_config(sc->bdev, AGP_I830_GCC1, 2); switch (gcc1 & AGP_G4x_GCC1_SIZE_MASK) { case AGP_G4x_GCC1_SIZE_1M: case AGP_G4x_GCC1_SIZE_VT_1M: agp_gen5_adjust_pgtbl_size(dev, AGP_I810_PGTBL_SIZE_1MB); break; case AGP_G4x_GCC1_SIZE_VT_1_5M: agp_gen5_adjust_pgtbl_size(dev, AGP_I810_PGTBL_SIZE_1_5MB); break; case AGP_G4x_GCC1_SIZE_2M: case AGP_G4x_GCC1_SIZE_VT_2M: agp_gen5_adjust_pgtbl_size(dev, AGP_I810_PGTBL_SIZE_2MB); break; default: device_printf(dev, "Unknown page table size\n"); return (ENXIO); } return (agp_i965_get_gtt_total_entries(dev)); } static int agp_sb_get_gtt_total_entries(device_t dev) { struct agp_i810_softc *sc; uint16_t gcc1; sc = device_get_softc(dev); gcc1 = pci_read_config(sc->bdev, AGP_SNB_GCC1, 2); switch (gcc1 & AGP_SNB_GTT_SIZE_MASK) { default: case AGP_SNB_GTT_SIZE_0M: printf("Bad GTT size mask: 0x%04x\n", gcc1); return (ENXIO); case AGP_SNB_GTT_SIZE_1M: sc->gtt_total_entries = 1024 * 1024 / 4; break; case AGP_SNB_GTT_SIZE_2M: sc->gtt_total_entries = 2 * 1024 * 1024 / 4; break; } return (0); } static int agp_i810_install_gatt(device_t dev) { struct agp_i810_softc *sc; sc = device_get_softc(dev); /* Some i810s have on-chip memory called dcache. */ if ((bus_read_1(sc->sc_res[0], AGP_I810_DRT) & AGP_I810_DRT_POPULATED) != 0) sc->dcache_size = 4 * 1024 * 1024; else sc->dcache_size = 0; /* According to the specs the gatt on the i810 must be 64k. */ sc->gatt->ag_virtual = contigmalloc(64 * 1024, M_AGP, 0, 0, ~0, PAGE_SIZE, 0); if (sc->gatt->ag_virtual == NULL) { if (bootverbose) device_printf(dev, "contiguous allocation failed\n"); return (ENOMEM); } bzero(sc->gatt->ag_virtual, sc->gatt->ag_entries * sizeof(u_int32_t)); sc->gatt->ag_physical = vtophys((vm_offset_t)sc->gatt->ag_virtual); agp_flush_cache(); /* Install the GATT. */ bus_write_4(sc->sc_res[0], AGP_I810_PGTBL_CTL, sc->gatt->ag_physical | 1); return (0); } static int agp_i830_install_gatt(device_t dev) { struct agp_i810_softc *sc; uint32_t pgtblctl; sc = device_get_softc(dev); /* * The i830 automatically initializes the 128k gatt on boot. * GATT address is already in there, make sure it's enabled. */ pgtblctl = bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL); pgtblctl |= 1; bus_write_4(sc->sc_res[0], AGP_I810_PGTBL_CTL, pgtblctl); sc->gatt->ag_physical = pgtblctl & ~1; return (0); } static int agp_i810_attach(device_t dev) { struct agp_i810_softc *sc; int error; sc = device_get_softc(dev); sc->bdev = agp_i810_find_bridge(dev); if (sc->bdev == NULL) return (ENOENT); sc->match = agp_i810_match(dev); agp_set_aperture_resource(dev, sc->match->driver->gen <= 2 ? AGP_APBASE : AGP_I915_GMADR); error = agp_generic_attach(dev); if (error) return (error); if (ptoa((vm_paddr_t)Maxmem) > (1ULL << sc->match->driver->busdma_addr_mask_sz) - 1) { device_printf(dev, "agp_i810 does not support physical " "memory above %ju.\n", (uintmax_t)(1ULL << sc->match->driver->busdma_addr_mask_sz) - 1); return (ENOENT); } if (bus_alloc_resources(dev, sc->match->driver->res_spec, sc->sc_res)) { agp_generic_detach(dev); return (ENODEV); } sc->initial_aperture = AGP_GET_APERTURE(dev); sc->gatt = malloc(sizeof(struct agp_gatt), M_AGP, M_WAITOK); sc->gatt->ag_entries = AGP_GET_APERTURE(dev) >> AGP_PAGE_SHIFT; if ((error = sc->match->driver->get_stolen_size(dev)) != 0 || (error = sc->match->driver->install_gatt(dev)) != 0 || (error = sc->match->driver->get_gtt_mappable_entries(dev)) != 0 || (error = sc->match->driver->get_gtt_total_entries(dev)) != 0 || (error = sc->match->driver->chipset_flush_setup(dev)) != 0) { bus_release_resources(dev, sc->match->driver->res_spec, sc->sc_res); free(sc->gatt, M_AGP); agp_generic_detach(dev); return (error); } intel_agp = dev; device_printf(dev, "aperture size is %dM", sc->initial_aperture / 1024 / 1024); if (sc->stolen > 0) printf(", detected %dk stolen memory\n", sc->stolen * 4); else printf("\n"); if (bootverbose) { sc->match->driver->dump_regs(dev); device_printf(dev, "Mappable GTT entries: %d\n", sc->gtt_mappable_entries); device_printf(dev, "Total GTT entries: %d\n", sc->gtt_total_entries); } return (0); } static void agp_i810_deinstall_gatt(device_t dev) { struct agp_i810_softc *sc; sc = device_get_softc(dev); bus_write_4(sc->sc_res[0], AGP_I810_PGTBL_CTL, 0); contigfree(sc->gatt->ag_virtual, 64 * 1024, M_AGP); } static void agp_i830_deinstall_gatt(device_t dev) { struct agp_i810_softc *sc; unsigned int pgtblctl; sc = device_get_softc(dev); pgtblctl = bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL); pgtblctl &= ~1; bus_write_4(sc->sc_res[0], AGP_I810_PGTBL_CTL, pgtblctl); } static int agp_i810_detach(device_t dev) { struct agp_i810_softc *sc; sc = device_get_softc(dev); agp_free_cdev(dev); /* Clear the GATT base. */ sc->match->driver->deinstall_gatt(dev); sc->match->driver->chipset_flush_teardown(dev); /* Put the aperture back the way it started. */ AGP_SET_APERTURE(dev, sc->initial_aperture); free(sc->gatt, M_AGP); bus_release_resources(dev, sc->match->driver->res_spec, sc->sc_res); agp_free_res(dev); return (0); } static int agp_i810_resume(device_t dev) { struct agp_i810_softc *sc; sc = device_get_softc(dev); AGP_SET_APERTURE(dev, sc->initial_aperture); /* Install the GATT. */ bus_write_4(sc->sc_res[0], AGP_I810_PGTBL_CTL, sc->gatt->ag_physical | 1); return (bus_generic_resume(dev)); } /** * Sets the PCI resource size of the aperture on i830-class and below chipsets, * while returning failure on later chipsets when an actual change is * requested. * * This whole function is likely bogus, as the kernel would probably need to * reconfigure the placement of the AGP aperture if a larger size is requested, * which doesn't happen currently. */ static int agp_i810_set_aperture(device_t dev, u_int32_t aperture) { struct agp_i810_softc *sc; u_int16_t miscc; sc = device_get_softc(dev); /* * Double check for sanity. */ if (aperture != 32 * 1024 * 1024 && aperture != 64 * 1024 * 1024) { device_printf(dev, "bad aperture size %d\n", aperture); return (EINVAL); } miscc = pci_read_config(sc->bdev, AGP_I810_MISCC, 2); miscc &= ~AGP_I810_MISCC_WINSIZE; if (aperture == 32 * 1024 * 1024) miscc |= AGP_I810_MISCC_WINSIZE_32; else miscc |= AGP_I810_MISCC_WINSIZE_64; pci_write_config(sc->bdev, AGP_I810_MISCC, miscc, 2); return (0); } static int agp_i830_set_aperture(device_t dev, u_int32_t aperture) { struct agp_i810_softc *sc; u_int16_t gcc1; sc = device_get_softc(dev); if (aperture != 64 * 1024 * 1024 && aperture != 128 * 1024 * 1024) { device_printf(dev, "bad aperture size %d\n", aperture); return (EINVAL); } gcc1 = pci_read_config(sc->bdev, AGP_I830_GCC1, 2); gcc1 &= ~AGP_I830_GCC1_GMASIZE; if (aperture == 64 * 1024 * 1024) gcc1 |= AGP_I830_GCC1_GMASIZE_64; else gcc1 |= AGP_I830_GCC1_GMASIZE_128; pci_write_config(sc->bdev, AGP_I830_GCC1, gcc1, 2); return (0); } static int agp_i915_set_aperture(device_t dev, u_int32_t aperture) { return (agp_generic_set_aperture(dev, aperture)); } static int agp_i810_method_set_aperture(device_t dev, u_int32_t aperture) { struct agp_i810_softc *sc; sc = device_get_softc(dev); return (sc->match->driver->set_aperture(dev, aperture)); } /** * Writes a GTT entry mapping the page at the given offset from the * beginning of the aperture to the given physical address. Setup the * caching mode according to flags. * * For gen 1, 2 and 3, GTT start is located at AGP_I810_GTT offset * from corresponding BAR start. For gen 4, offset is 512KB + * AGP_I810_GTT, for gen 5 and 6 it is 2MB + AGP_I810_GTT. * * Also, the bits of the physical page address above 4GB needs to be * placed into bits 40-32 of PTE. */ static void agp_i810_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical, int flags) { uint32_t pte; pte = (u_int32_t)physical | I810_PTE_VALID; if (flags == AGP_DCACHE_MEMORY) pte |= I810_PTE_LOCAL; else if (flags == AGP_USER_CACHED_MEMORY) pte |= I830_PTE_SYSTEM_CACHED; agp_i810_write_gtt(dev, index, pte); } static void agp_i810_write_gtt(device_t dev, u_int index, uint32_t pte) { struct agp_i810_softc *sc; sc = device_get_softc(dev); bus_write_4(sc->sc_res[0], AGP_I810_GTT + index * 4, pte); CTR2(KTR_AGP_I810, "810_pte %x %x", index, pte); } static void agp_i830_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical, int flags) { uint32_t pte; pte = (u_int32_t)physical | I810_PTE_VALID; if (flags == AGP_USER_CACHED_MEMORY) pte |= I830_PTE_SYSTEM_CACHED; agp_i810_write_gtt(dev, index, pte); } static void agp_i915_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical, int flags) { uint32_t pte; pte = (u_int32_t)physical | I810_PTE_VALID; if (flags == AGP_USER_CACHED_MEMORY) pte |= I830_PTE_SYSTEM_CACHED; pte |= (physical & 0x0000000f00000000ull) >> 28; agp_i915_write_gtt(dev, index, pte); } static void agp_i915_write_gtt(device_t dev, u_int index, uint32_t pte) { struct agp_i810_softc *sc; sc = device_get_softc(dev); bus_write_4(sc->sc_res[1], index * 4, pte); CTR2(KTR_AGP_I810, "915_pte %x %x", index, pte); } static void agp_i965_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical, int flags) { uint32_t pte; pte = (u_int32_t)physical | I810_PTE_VALID; if (flags == AGP_USER_CACHED_MEMORY) pte |= I830_PTE_SYSTEM_CACHED; pte |= (physical & 0x0000000f00000000ull) >> 28; agp_i965_write_gtt(dev, index, pte); } static void agp_i965_write_gtt(device_t dev, u_int index, uint32_t pte) { struct agp_i810_softc *sc; sc = device_get_softc(dev); bus_write_4(sc->sc_res[0], index * 4 + (512 * 1024), pte); CTR2(KTR_AGP_I810, "965_pte %x %x", index, pte); } static void agp_g4x_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical, int flags) { uint32_t pte; pte = (u_int32_t)physical | I810_PTE_VALID; if (flags == AGP_USER_CACHED_MEMORY) pte |= I830_PTE_SYSTEM_CACHED; pte |= (physical & 0x0000000f00000000ull) >> 28; agp_g4x_write_gtt(dev, index, pte); } static void agp_g4x_write_gtt(device_t dev, u_int index, uint32_t pte) { struct agp_i810_softc *sc; sc = device_get_softc(dev); bus_write_4(sc->sc_res[0], index * 4 + (2 * 1024 * 1024), pte); CTR2(KTR_AGP_I810, "g4x_pte %x %x", index, pte); } static void agp_sb_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical, int flags) { int type_mask, gfdt; uint32_t pte; pte = (u_int32_t)physical | I810_PTE_VALID; type_mask = flags & ~AGP_USER_CACHED_MEMORY_GFDT; gfdt = (flags & AGP_USER_CACHED_MEMORY_GFDT) != 0 ? GEN6_PTE_GFDT : 0; if (type_mask == AGP_USER_MEMORY) pte |= GEN6_PTE_UNCACHED; else if (type_mask == AGP_USER_CACHED_MEMORY_LLC_MLC) pte |= GEN6_PTE_LLC_MLC | gfdt; else pte |= GEN6_PTE_LLC | gfdt; pte |= (physical & 0x000000ff00000000ull) >> 28; agp_sb_write_gtt(dev, index, pte); } static void agp_sb_write_gtt(device_t dev, u_int index, uint32_t pte) { struct agp_i810_softc *sc; sc = device_get_softc(dev); bus_write_4(sc->sc_res[0], index * 4 + (2 * 1024 * 1024), pte); CTR2(KTR_AGP_I810, "sb_pte %x %x", index, pte); } static int agp_i810_bind_page(device_t dev, vm_offset_t offset, vm_offset_t physical) { struct agp_i810_softc *sc = device_get_softc(dev); u_int index; if (offset >= (sc->gatt->ag_entries << AGP_PAGE_SHIFT)) { device_printf(dev, "failed: offset is 0x%08jx, " "shift is %d, entries is %d\n", (intmax_t)offset, AGP_PAGE_SHIFT, sc->gatt->ag_entries); return (EINVAL); } index = offset >> AGP_PAGE_SHIFT; if (sc->stolen != 0 && index < sc->stolen) { device_printf(dev, "trying to bind into stolen memory\n"); return (EINVAL); } sc->match->driver->install_gtt_pte(dev, index, physical, 0); return (0); } static int agp_i810_unbind_page(device_t dev, vm_offset_t offset) { struct agp_i810_softc *sc; u_int index; sc = device_get_softc(dev); if (offset >= (sc->gatt->ag_entries << AGP_PAGE_SHIFT)) return (EINVAL); index = offset >> AGP_PAGE_SHIFT; if (sc->stolen != 0 && index < sc->stolen) { device_printf(dev, "trying to unbind from stolen memory\n"); return (EINVAL); } sc->match->driver->install_gtt_pte(dev, index, 0, 0); return (0); } static u_int32_t agp_i810_read_gtt_pte(device_t dev, u_int index) { struct agp_i810_softc *sc; u_int32_t pte; sc = device_get_softc(dev); pte = bus_read_4(sc->sc_res[0], AGP_I810_GTT + index * 4); return (pte); } static u_int32_t agp_i915_read_gtt_pte(device_t dev, u_int index) { struct agp_i810_softc *sc; u_int32_t pte; sc = device_get_softc(dev); pte = bus_read_4(sc->sc_res[1], index * 4); return (pte); } static u_int32_t agp_i965_read_gtt_pte(device_t dev, u_int index) { struct agp_i810_softc *sc; u_int32_t pte; sc = device_get_softc(dev); pte = bus_read_4(sc->sc_res[0], index * 4 + (512 * 1024)); return (pte); } static u_int32_t agp_g4x_read_gtt_pte(device_t dev, u_int index) { struct agp_i810_softc *sc; u_int32_t pte; sc = device_get_softc(dev); pte = bus_read_4(sc->sc_res[0], index * 4 + (2 * 1024 * 1024)); return (pte); } static vm_paddr_t agp_i810_read_gtt_pte_paddr(device_t dev, u_int index) { struct agp_i810_softc *sc; u_int32_t pte; vm_paddr_t res; sc = device_get_softc(dev); pte = sc->match->driver->read_gtt_pte(dev, index); res = pte & ~PAGE_MASK; return (res); } static vm_paddr_t agp_i915_read_gtt_pte_paddr(device_t dev, u_int index) { struct agp_i810_softc *sc; u_int32_t pte; vm_paddr_t res; sc = device_get_softc(dev); pte = sc->match->driver->read_gtt_pte(dev, index); res = (pte & ~PAGE_MASK) | ((pte & 0xf0) << 28); return (res); } static vm_paddr_t agp_sb_read_gtt_pte_paddr(device_t dev, u_int index) { struct agp_i810_softc *sc; u_int32_t pte; vm_paddr_t res; sc = device_get_softc(dev); pte = sc->match->driver->read_gtt_pte(dev, index); res = (pte & ~PAGE_MASK) | ((pte & 0xff0) << 28); return (res); } /* * Writing via memory mapped registers already flushes all TLBs. */ static void agp_i810_flush_tlb(device_t dev) { } static int agp_i810_enable(device_t dev, u_int32_t mode) { return (0); } static struct agp_memory * agp_i810_alloc_memory(device_t dev, int type, vm_size_t size) { struct agp_i810_softc *sc; struct agp_memory *mem; vm_page_t m; sc = device_get_softc(dev); if ((size & (AGP_PAGE_SIZE - 1)) != 0 || sc->agp.as_allocated + size > sc->agp.as_maxmem) return (0); if (type == 1) { /* * Mapping local DRAM into GATT. */ if (sc->match->driver->chiptype != CHIP_I810) return (0); if (size != sc->dcache_size) return (0); } else if (type == 2) { /* * Type 2 is the contiguous physical memory type, that hands * back a physical address. This is used for cursors on i810. * Hand back as many single pages with physical as the user * wants, but only allow one larger allocation (ARGB cursor) * for simplicity. */ if (size != AGP_PAGE_SIZE) { if (sc->argb_cursor != NULL) return (0); /* Allocate memory for ARGB cursor, if we can. */ sc->argb_cursor = contigmalloc(size, M_AGP, 0, 0, ~0, PAGE_SIZE, 0); if (sc->argb_cursor == NULL) return (0); } } mem = malloc(sizeof *mem, M_AGP, M_WAITOK); mem->am_id = sc->agp.as_nextid++; mem->am_size = size; mem->am_type = type; if (type != 1 && (type != 2 || size == AGP_PAGE_SIZE)) mem->am_obj = vm_object_allocate(OBJT_DEFAULT, atop(round_page(size))); else mem->am_obj = 0; if (type == 2) { if (size == AGP_PAGE_SIZE) { /* * Allocate and wire down the page now so that we can * get its physical address. */ VM_OBJECT_WLOCK(mem->am_obj); m = vm_page_grab(mem->am_obj, 0, VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); VM_OBJECT_WUNLOCK(mem->am_obj); mem->am_physical = VM_PAGE_TO_PHYS(m); } else { /* Our allocation is already nicely wired down for us. * Just grab the physical address. */ mem->am_physical = vtophys(sc->argb_cursor); } } else mem->am_physical = 0; mem->am_offset = 0; mem->am_is_bound = 0; TAILQ_INSERT_TAIL(&sc->agp.as_memory, mem, am_link); sc->agp.as_allocated += size; return (mem); } static int agp_i810_free_memory(device_t dev, struct agp_memory *mem) { struct agp_i810_softc *sc; vm_page_t m; if (mem->am_is_bound) return (EBUSY); sc = device_get_softc(dev); if (mem->am_type == 2) { if (mem->am_size == AGP_PAGE_SIZE) { /* * Unwire the page which we wired in alloc_memory. */ VM_OBJECT_WLOCK(mem->am_obj); m = vm_page_lookup(mem->am_obj, 0); vm_page_lock(m); - vm_page_unwire(m, 0); + vm_page_unwire(m, PQ_INACTIVE); vm_page_unlock(m); VM_OBJECT_WUNLOCK(mem->am_obj); } else { contigfree(sc->argb_cursor, mem->am_size, M_AGP); sc->argb_cursor = NULL; } } sc->agp.as_allocated -= mem->am_size; TAILQ_REMOVE(&sc->agp.as_memory, mem, am_link); if (mem->am_obj) vm_object_deallocate(mem->am_obj); free(mem, M_AGP); return (0); } static int agp_i810_bind_memory(device_t dev, struct agp_memory *mem, vm_offset_t offset) { struct agp_i810_softc *sc; vm_offset_t i; /* Do some sanity checks first. */ if ((offset & (AGP_PAGE_SIZE - 1)) != 0 || offset + mem->am_size > AGP_GET_APERTURE(dev)) { device_printf(dev, "binding memory at bad offset %#x\n", (int)offset); return (EINVAL); } sc = device_get_softc(dev); if (mem->am_type == 2 && mem->am_size != AGP_PAGE_SIZE) { mtx_lock(&sc->agp.as_lock); if (mem->am_is_bound) { mtx_unlock(&sc->agp.as_lock); return (EINVAL); } /* The memory's already wired down, just stick it in the GTT. */ for (i = 0; i < mem->am_size; i += AGP_PAGE_SIZE) { sc->match->driver->install_gtt_pte(dev, (offset + i) >> AGP_PAGE_SHIFT, mem->am_physical + i, 0); } agp_flush_cache(); mem->am_offset = offset; mem->am_is_bound = 1; mtx_unlock(&sc->agp.as_lock); return (0); } if (mem->am_type != 1) return (agp_generic_bind_memory(dev, mem, offset)); /* * Mapping local DRAM into GATT. */ if (sc->match->driver->chiptype != CHIP_I810) return (EINVAL); for (i = 0; i < mem->am_size; i += AGP_PAGE_SIZE) bus_write_4(sc->sc_res[0], AGP_I810_GTT + (i >> AGP_PAGE_SHIFT) * 4, i | 3); return (0); } static int agp_i810_unbind_memory(device_t dev, struct agp_memory *mem) { struct agp_i810_softc *sc; vm_offset_t i; sc = device_get_softc(dev); if (mem->am_type == 2 && mem->am_size != AGP_PAGE_SIZE) { mtx_lock(&sc->agp.as_lock); if (!mem->am_is_bound) { mtx_unlock(&sc->agp.as_lock); return (EINVAL); } for (i = 0; i < mem->am_size; i += AGP_PAGE_SIZE) { sc->match->driver->install_gtt_pte(dev, (mem->am_offset + i) >> AGP_PAGE_SHIFT, 0, 0); } agp_flush_cache(); mem->am_is_bound = 0; mtx_unlock(&sc->agp.as_lock); return (0); } if (mem->am_type != 1) return (agp_generic_unbind_memory(dev, mem)); if (sc->match->driver->chiptype != CHIP_I810) return (EINVAL); for (i = 0; i < mem->am_size; i += AGP_PAGE_SIZE) { sc->match->driver->install_gtt_pte(dev, i >> AGP_PAGE_SHIFT, 0, 0); } return (0); } static device_method_t agp_i810_methods[] = { /* Device interface */ DEVMETHOD(device_identify, agp_i810_identify), DEVMETHOD(device_probe, agp_i810_probe), DEVMETHOD(device_attach, agp_i810_attach), DEVMETHOD(device_detach, agp_i810_detach), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, agp_i810_resume), /* AGP interface */ DEVMETHOD(agp_get_aperture, agp_generic_get_aperture), DEVMETHOD(agp_set_aperture, agp_i810_method_set_aperture), DEVMETHOD(agp_bind_page, agp_i810_bind_page), DEVMETHOD(agp_unbind_page, agp_i810_unbind_page), DEVMETHOD(agp_flush_tlb, agp_i810_flush_tlb), DEVMETHOD(agp_enable, agp_i810_enable), DEVMETHOD(agp_alloc_memory, agp_i810_alloc_memory), DEVMETHOD(agp_free_memory, agp_i810_free_memory), DEVMETHOD(agp_bind_memory, agp_i810_bind_memory), DEVMETHOD(agp_unbind_memory, agp_i810_unbind_memory), DEVMETHOD(agp_chipset_flush, agp_intel_gtt_chipset_flush), { 0, 0 } }; static driver_t agp_i810_driver = { "agp", agp_i810_methods, sizeof(struct agp_i810_softc), }; static devclass_t agp_devclass; DRIVER_MODULE(agp_i810, vgapci, agp_i810_driver, agp_devclass, 0, 0); MODULE_DEPEND(agp_i810, agp, 1, 1, 1); MODULE_DEPEND(agp_i810, pci, 1, 1, 1); extern vm_page_t bogus_page; void agp_intel_gtt_clear_range(device_t dev, u_int first_entry, u_int num_entries) { struct agp_i810_softc *sc; u_int i; sc = device_get_softc(dev); for (i = 0; i < num_entries; i++) sc->match->driver->install_gtt_pte(dev, first_entry + i, VM_PAGE_TO_PHYS(bogus_page), 0); sc->match->driver->read_gtt_pte(dev, first_entry + num_entries - 1); } void agp_intel_gtt_insert_pages(device_t dev, u_int first_entry, u_int num_entries, vm_page_t *pages, u_int flags) { struct agp_i810_softc *sc; u_int i; sc = device_get_softc(dev); for (i = 0; i < num_entries; i++) { MPASS(pages[i]->valid == VM_PAGE_BITS_ALL); MPASS(pages[i]->wire_count > 0); sc->match->driver->install_gtt_pte(dev, first_entry + i, VM_PAGE_TO_PHYS(pages[i]), flags); } sc->match->driver->read_gtt_pte(dev, first_entry + num_entries - 1); } struct intel_gtt agp_intel_gtt_get(device_t dev) { struct agp_i810_softc *sc; struct intel_gtt res; sc = device_get_softc(dev); res.stolen_size = sc->stolen_size; res.gtt_total_entries = sc->gtt_total_entries; res.gtt_mappable_entries = sc->gtt_mappable_entries; res.do_idle_maps = 0; res.scratch_page_dma = VM_PAGE_TO_PHYS(bogus_page); return (res); } static int agp_i810_chipset_flush_setup(device_t dev) { return (0); } static void agp_i810_chipset_flush_teardown(device_t dev) { /* Nothing to do. */ } static void agp_i810_chipset_flush(device_t dev) { /* Nothing to do. */ } static void agp_i830_chipset_flush(device_t dev) { struct agp_i810_softc *sc; uint32_t hic; int i; sc = device_get_softc(dev); pmap_invalidate_cache(); hic = bus_read_4(sc->sc_res[0], AGP_I830_HIC); bus_write_4(sc->sc_res[0], AGP_I830_HIC, hic | (1U << 31)); for (i = 0; i < 20000 /* 1 sec */; i++) { hic = bus_read_4(sc->sc_res[0], AGP_I830_HIC); if ((hic & (1U << 31)) == 0) break; DELAY(50); } } static int agp_i915_chipset_flush_alloc_page(device_t dev, uint64_t start, uint64_t end) { struct agp_i810_softc *sc; device_t vga; sc = device_get_softc(dev); vga = device_get_parent(dev); sc->sc_flush_page_rid = 100; sc->sc_flush_page_res = BUS_ALLOC_RESOURCE(device_get_parent(vga), dev, SYS_RES_MEMORY, &sc->sc_flush_page_rid, start, end, PAGE_SIZE, RF_ACTIVE); if (sc->sc_flush_page_res == NULL) { device_printf(dev, "Failed to allocate flush page at 0x%jx\n", (uintmax_t)start); return (EINVAL); } sc->sc_flush_page_vaddr = rman_get_virtual(sc->sc_flush_page_res); if (bootverbose) { device_printf(dev, "Allocated flush page phys 0x%jx virt %p\n", (uintmax_t)rman_get_start(sc->sc_flush_page_res), sc->sc_flush_page_vaddr); } return (0); } static void agp_i915_chipset_flush_free_page(device_t dev) { struct agp_i810_softc *sc; device_t vga; sc = device_get_softc(dev); vga = device_get_parent(dev); if (sc->sc_flush_page_res == NULL) return; BUS_DEACTIVATE_RESOURCE(device_get_parent(vga), dev, SYS_RES_MEMORY, sc->sc_flush_page_rid, sc->sc_flush_page_res); BUS_RELEASE_RESOURCE(device_get_parent(vga), dev, SYS_RES_MEMORY, sc->sc_flush_page_rid, sc->sc_flush_page_res); } static int agp_i915_chipset_flush_setup(device_t dev) { struct agp_i810_softc *sc; uint32_t temp; int error; sc = device_get_softc(dev); temp = pci_read_config(sc->bdev, AGP_I915_IFPADDR, 4); if ((temp & 1) != 0) { temp &= ~1; if (bootverbose) device_printf(dev, "Found already configured flush page at 0x%jx\n", (uintmax_t)temp); sc->sc_bios_allocated_flush_page = 1; /* * In the case BIOS initialized the flush pointer (?) * register, expect that BIOS also set up the resource * for the page. */ error = agp_i915_chipset_flush_alloc_page(dev, temp, temp + PAGE_SIZE - 1); if (error != 0) return (error); } else { sc->sc_bios_allocated_flush_page = 0; error = agp_i915_chipset_flush_alloc_page(dev, 0, 0xffffffff); if (error != 0) return (error); temp = rman_get_start(sc->sc_flush_page_res); pci_write_config(sc->bdev, AGP_I915_IFPADDR, temp | 1, 4); } return (0); } static void agp_i915_chipset_flush_teardown(device_t dev) { struct agp_i810_softc *sc; uint32_t temp; sc = device_get_softc(dev); if (sc->sc_flush_page_res == NULL) return; if (!sc->sc_bios_allocated_flush_page) { temp = pci_read_config(sc->bdev, AGP_I915_IFPADDR, 4); temp &= ~1; pci_write_config(sc->bdev, AGP_I915_IFPADDR, temp, 4); } agp_i915_chipset_flush_free_page(dev); } static int agp_i965_chipset_flush_setup(device_t dev) { struct agp_i810_softc *sc; uint64_t temp; uint32_t temp_hi, temp_lo; int error; sc = device_get_softc(dev); temp_hi = pci_read_config(sc->bdev, AGP_I965_IFPADDR + 4, 4); temp_lo = pci_read_config(sc->bdev, AGP_I965_IFPADDR, 4); if ((temp_lo & 1) != 0) { temp = ((uint64_t)temp_hi << 32) | (temp_lo & ~1); if (bootverbose) device_printf(dev, "Found already configured flush page at 0x%jx\n", (uintmax_t)temp); sc->sc_bios_allocated_flush_page = 1; /* * In the case BIOS initialized the flush pointer (?) * register, expect that BIOS also set up the resource * for the page. */ error = agp_i915_chipset_flush_alloc_page(dev, temp, temp + PAGE_SIZE - 1); if (error != 0) return (error); } else { sc->sc_bios_allocated_flush_page = 0; error = agp_i915_chipset_flush_alloc_page(dev, 0, ~0); if (error != 0) return (error); temp = rman_get_start(sc->sc_flush_page_res); pci_write_config(sc->bdev, AGP_I965_IFPADDR + 4, (temp >> 32) & UINT32_MAX, 4); pci_write_config(sc->bdev, AGP_I965_IFPADDR, (temp & UINT32_MAX) | 1, 4); } return (0); } static void agp_i965_chipset_flush_teardown(device_t dev) { struct agp_i810_softc *sc; uint32_t temp_lo; sc = device_get_softc(dev); if (sc->sc_flush_page_res == NULL) return; if (!sc->sc_bios_allocated_flush_page) { temp_lo = pci_read_config(sc->bdev, AGP_I965_IFPADDR, 4); temp_lo &= ~1; pci_write_config(sc->bdev, AGP_I965_IFPADDR, temp_lo, 4); } agp_i915_chipset_flush_free_page(dev); } static void agp_i915_chipset_flush(device_t dev) { struct agp_i810_softc *sc; sc = device_get_softc(dev); *(uint32_t *)sc->sc_flush_page_vaddr = 1; } int agp_intel_gtt_chipset_flush(device_t dev) { struct agp_i810_softc *sc; sc = device_get_softc(dev); sc->match->driver->chipset_flush(dev); return (0); } void agp_intel_gtt_unmap_memory(device_t dev, struct sglist *sg_list) { } int agp_intel_gtt_map_memory(device_t dev, vm_page_t *pages, u_int num_entries, struct sglist **sg_list) { struct agp_i810_softc *sc; struct sglist *sg; int i; #if 0 int error; bus_dma_tag_t dmat; #endif if (*sg_list != NULL) return (0); sc = device_get_softc(dev); sg = sglist_alloc(num_entries, M_WAITOK /* XXXKIB */); for (i = 0; i < num_entries; i++) { sg->sg_segs[i].ss_paddr = VM_PAGE_TO_PHYS(pages[i]); sg->sg_segs[i].ss_len = PAGE_SIZE; } #if 0 error = bus_dma_tag_create(bus_get_dma_tag(dev), 1 /* alignment */, 0 /* boundary */, 1ULL << sc->match->busdma_addr_mask_sz /* lowaddr */, BUS_SPACE_MAXADDR /* highaddr */, NULL /* filtfunc */, NULL /* filtfuncarg */, BUS_SPACE_MAXADDR /* maxsize */, BUS_SPACE_UNRESTRICTED /* nsegments */, BUS_SPACE_MAXADDR /* maxsegsz */, 0 /* flags */, NULL /* lockfunc */, NULL /* lockfuncarg */, &dmat); if (error != 0) { sglist_free(sg); return (error); } /* XXXKIB */ #endif *sg_list = sg; return (0); } void agp_intel_gtt_insert_sg_entries(device_t dev, struct sglist *sg_list, u_int first_entry, u_int flags) { struct agp_i810_softc *sc; vm_paddr_t spaddr; size_t slen; u_int i, j; sc = device_get_softc(dev); for (i = j = 0; j < sg_list->sg_nseg; j++) { spaddr = sg_list->sg_segs[i].ss_paddr; slen = sg_list->sg_segs[i].ss_len; for (; slen > 0; i++) { sc->match->driver->install_gtt_pte(dev, first_entry + i, spaddr, flags); spaddr += AGP_PAGE_SIZE; slen -= AGP_PAGE_SIZE; } } sc->match->driver->read_gtt_pte(dev, first_entry + i - 1); } void intel_gtt_clear_range(u_int first_entry, u_int num_entries) { agp_intel_gtt_clear_range(intel_agp, first_entry, num_entries); } void intel_gtt_insert_pages(u_int first_entry, u_int num_entries, vm_page_t *pages, u_int flags) { agp_intel_gtt_insert_pages(intel_agp, first_entry, num_entries, pages, flags); } struct intel_gtt intel_gtt_get(void) { return (agp_intel_gtt_get(intel_agp)); } int intel_gtt_chipset_flush(void) { return (agp_intel_gtt_chipset_flush(intel_agp)); } void intel_gtt_unmap_memory(struct sglist *sg_list) { agp_intel_gtt_unmap_memory(intel_agp, sg_list); } int intel_gtt_map_memory(vm_page_t *pages, u_int num_entries, struct sglist **sg_list) { return (agp_intel_gtt_map_memory(intel_agp, pages, num_entries, sg_list)); } void intel_gtt_insert_sg_entries(struct sglist *sg_list, u_int first_entry, u_int flags) { agp_intel_gtt_insert_sg_entries(intel_agp, sg_list, first_entry, flags); } device_t intel_gtt_get_bridge_device(void) { struct agp_i810_softc *sc; sc = device_get_softc(intel_agp); return (sc->bdev); } vm_paddr_t intel_gtt_read_pte_paddr(u_int entry) { struct agp_i810_softc *sc; sc = device_get_softc(intel_agp); return (sc->match->driver->read_gtt_pte_paddr(intel_agp, entry)); } u_int32_t intel_gtt_read_pte(u_int entry) { struct agp_i810_softc *sc; sc = device_get_softc(intel_agp); return (sc->match->driver->read_gtt_pte(intel_agp, entry)); } void intel_gtt_write(u_int entry, uint32_t val) { struct agp_i810_softc *sc; sc = device_get_softc(intel_agp); return (sc->match->driver->write_gtt(intel_agp, entry, val)); } Index: user/attilio/rm_vmobj_cache/sys/dev/cxgbe/tom/t4_ddp.c =================================================================== --- user/attilio/rm_vmobj_cache/sys/dev/cxgbe/tom/t4_ddp.c (revision 267236) +++ user/attilio/rm_vmobj_cache/sys/dev/cxgbe/tom/t4_ddp.c (revision 267237) @@ -1,1285 +1,1285 @@ /*- * Copyright (c) 2012 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #include #include #include #ifdef TCP_OFFLOAD #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_tcb.h" #include "tom/t4_tom.h" #define PPOD_SZ(n) ((n) * sizeof(struct pagepod)) #define PPOD_SIZE (PPOD_SZ(1)) /* XXX: must match A_ULP_RX_TDDP_PSZ */ static int t4_ddp_pgsz[] = {4096, 4096 << 2, 4096 << 4, 4096 << 6}; #if 0 static void t4_dump_tcb(struct adapter *sc, int tid) { uint32_t tcb_base, off, i, j; /* Dump TCB for the tid */ tcb_base = t4_read_reg(sc, A_TP_CMM_TCB_BASE); t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2), tcb_base + tid * TCB_SIZE); t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2)); off = 0; printf("\n"); for (i = 0; i < 4; i++) { uint32_t buf[8]; for (j = 0; j < 8; j++, off += 4) buf[j] = htonl(t4_read_reg(sc, MEMWIN2_BASE + off)); printf("%08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); } } #endif #define MAX_DDP_BUFFER_SIZE (M_TCB_RX_DDP_BUF0_LEN) static int alloc_ppods(struct tom_data *td, int n, struct ppod_region *pr) { int ppod; KASSERT(n > 0, ("%s: nonsense allocation (%d)", __func__, n)); mtx_lock(&td->ppod_lock); if (n > td->nppods_free) { mtx_unlock(&td->ppod_lock); return (-1); } if (td->nppods_free_head >= n) { td->nppods_free_head -= n; ppod = td->nppods_free_head; TAILQ_INSERT_HEAD(&td->ppods, pr, link); } else { struct ppod_region *p; ppod = td->nppods_free_head; TAILQ_FOREACH(p, &td->ppods, link) { ppod += p->used + p->free; if (n <= p->free) { ppod -= n; p->free -= n; TAILQ_INSERT_AFTER(&td->ppods, p, pr, link); goto allocated; } } if (__predict_false(ppod != td->nppods)) { panic("%s: ppods TAILQ (%p) corrupt." " At %d instead of %d at the end of the queue.", __func__, &td->ppods, ppod, td->nppods); } mtx_unlock(&td->ppod_lock); return (-1); } allocated: pr->used = n; pr->free = 0; td->nppods_free -= n; mtx_unlock(&td->ppod_lock); return (ppod); } static void free_ppods(struct tom_data *td, struct ppod_region *pr) { struct ppod_region *p; KASSERT(pr->used > 0, ("%s: nonsense free (%d)", __func__, pr->used)); mtx_lock(&td->ppod_lock); p = TAILQ_PREV(pr, ppod_head, link); if (p != NULL) p->free += pr->used + pr->free; else td->nppods_free_head += pr->used + pr->free; td->nppods_free += pr->used; KASSERT(td->nppods_free <= td->nppods, ("%s: nppods_free (%d) > nppods (%d). %d freed this time.", __func__, td->nppods_free, td->nppods, pr->used)); TAILQ_REMOVE(&td->ppods, pr, link); mtx_unlock(&td->ppod_lock); } static inline int pages_to_nppods(int npages, int ddp_pgsz) { int nsegs = npages * PAGE_SIZE / ddp_pgsz; return (howmany(nsegs, PPOD_PAGES)); } static void free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db) { if (db == NULL) return; if (db->pages) free(db->pages, M_CXGBE); if (db->nppods > 0) free_ppods(td, &db->ppod_region); free(db, M_CXGBE); } void release_ddp_resources(struct toepcb *toep) { int i; for (i = 0; i < nitems(toep->db); i++) { if (toep->db[i] != NULL) { free_ddp_buffer(toep->td, toep->db[i]); toep->db[i] = NULL; } } } /* XXX: handle_ddp_data code duplication */ void insert_ddp_data(struct toepcb *toep, uint32_t n) { struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); struct sockbuf *sb = &inp->inp_socket->so_rcv; struct mbuf *m; INP_WLOCK_ASSERT(inp); SOCKBUF_LOCK_ASSERT(sb); m = get_ddp_mbuf(n); tp->rcv_nxt += n; #ifndef USE_DDP_RX_FLOW_CONTROL KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__)); tp->rcv_wnd -= n; #endif KASSERT(toep->sb_cc >= sb->sb_cc, ("%s: sb %p has more data (%d) than last time (%d).", __func__, sb, sb->sb_cc, toep->sb_cc)); toep->rx_credits += toep->sb_cc - sb->sb_cc; #ifdef USE_DDP_RX_FLOW_CONTROL toep->rx_credits -= n; /* adjust for F_RX_FC_DDP */ #endif sbappendstream_locked(sb, m); toep->sb_cc = sb->sb_cc; } /* SET_TCB_FIELD sent as a ULP command looks like this */ #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core)) /* RX_DATA_ACK sent as a ULP command looks like this */ #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \ sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core)) static inline void * mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep, uint64_t word, uint64_t mask, uint64_t val) { struct ulptx_idata *ulpsc; struct cpl_set_tcb_field_core *req; ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(sizeof(*req)); req = (struct cpl_set_tcb_field_core *)(ulpsc + 1); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid)); req->reply_ctrl = htobe16(V_NO_REPLY(1) | V_QUEUENO(toep->ofld_rxq->iq.abs_id)); req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0)); req->mask = htobe64(mask); req->val = htobe64(val); ulpsc = (struct ulptx_idata *)(req + 1); if (LEN__SET_TCB_FIELD_ULP % 16) { ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); ulpsc->len = htobe32(0); return (ulpsc + 1); } return (ulpsc); } static inline void * mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep) { struct ulptx_idata *ulpsc; struct cpl_rx_data_ack_core *req; ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(sizeof(*req)); req = (struct cpl_rx_data_ack_core *)(ulpsc + 1); OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid)); req->credit_dack = htobe32(F_RX_MODULATE_RX); ulpsc = (struct ulptx_idata *)(req + 1); if (LEN__RX_DATA_ACK_ULP % 16) { ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); ulpsc->len = htobe32(0); return (ulpsc + 1); } return (ulpsc); } static inline uint64_t select_ddp_flags(struct socket *so, int flags, int db_idx) { uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0); int waitall = flags & MSG_WAITALL; int nb = so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO); KASSERT(db_idx == 0 || db_idx == 1, ("%s: bad DDP buffer index %d", __func__, db_idx)); if (db_idx == 0) { ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0); if (waitall) ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1); else if (nb) ddp_flags |= V_TF_DDP_BUF0_FLUSH(1); else ddp_flags |= V_TF_DDP_BUF0_FLUSH(0); } else { ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1); if (waitall) ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1); else if (nb) ddp_flags |= V_TF_DDP_BUF1_FLUSH(1); else ddp_flags |= V_TF_DDP_BUF1_FLUSH(0); } return (ddp_flags); } static struct wrqe * mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx, int offset, uint64_t ddp_flags) { struct ddp_buffer *db = toep->db[db_idx]; struct wrqe *wr; struct work_request_hdr *wrh; struct ulp_txpkt *ulpmc; int len; KASSERT(db_idx == 0 || db_idx == 1, ("%s: bad DDP buffer index %d", __func__, db_idx)); /* * We'll send a compound work request that has 3 SET_TCB_FIELDs and an * RX_DATA_ACK (with RX_MODULATE to speed up delivery). * * The work request header is 16B and always ends at a 16B boundary. * The ULPTX master commands that follow must all end at 16B boundaries * too so we round up the size to 16. */ len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) + roundup2(LEN__RX_DATA_ACK_ULP, 16); wr = alloc_wrqe(len, toep->ctrlq); if (wr == NULL) return (NULL); wrh = wrtod(wr); INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ ulpmc = (struct ulp_txpkt *)(wrh + 1); /* Write the buffer's tag */ ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_BUF0_TAG + db_idx, V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), V_TCB_RX_DDP_BUF0_TAG(db->tag)); /* Update the current offset in the DDP buffer and its total length */ if (db_idx == 0) ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_BUF0_OFFSET, V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), V_TCB_RX_DDP_BUF0_OFFSET(offset) | V_TCB_RX_DDP_BUF0_LEN(db->len)); else ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_BUF1_OFFSET, V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32), V_TCB_RX_DDP_BUF1_OFFSET(offset) | V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32)); /* Update DDP flags */ ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags); /* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */ ulpmc = mk_rx_data_ack_ulp(ulpmc, toep); return (wr); } static void discourage_ddp(struct toepcb *toep) { if (toep->ddp_score && --toep->ddp_score == 0) { toep->ddp_flags &= ~DDP_OK; toep->ddp_disabled = time_uptime; CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u", __func__, toep->tid, time_uptime); } } static int handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len) { uint32_t report = be32toh(ddp_report); unsigned int db_flag; struct inpcb *inp = toep->inp; struct tcpcb *tp; struct socket *so; struct sockbuf *sb; struct mbuf *m; db_flag = report & F_DDP_BUF_IDX ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE; if (__predict_false(!(report & F_DDP_INV))) CXGBE_UNIMPLEMENTED("DDP buffer still valid"); INP_WLOCK(inp); so = inp_inpcbtosocket(inp); sb = &so->so_rcv; if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { /* * XXX: think a bit more. * tcpcb probably gone, but socket should still be around * because we always wait for DDP completion in soreceive no * matter what. Just wake it up and let it clean up. */ CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x", __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags); SOCKBUF_LOCK(sb); goto wakeup; } tp = intotcpcb(inp); len += be32toh(rcv_nxt) - tp->rcv_nxt; tp->rcv_nxt += len; tp->t_rcvtime = ticks; #ifndef USE_DDP_RX_FLOW_CONTROL KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__)); tp->rcv_wnd -= len; #endif m = get_ddp_mbuf(len); SOCKBUF_LOCK(sb); if (report & F_DDP_BUF_COMPLETE) toep->ddp_score = DDP_HIGH_SCORE; else discourage_ddp(toep); KASSERT(toep->sb_cc >= sb->sb_cc, ("%s: sb %p has more data (%d) than last time (%d).", __func__, sb, sb->sb_cc, toep->sb_cc)); toep->rx_credits += toep->sb_cc - sb->sb_cc; #ifdef USE_DDP_RX_FLOW_CONTROL toep->rx_credits -= len; /* adjust for F_RX_FC_DDP */ #endif sbappendstream_locked(sb, m); toep->sb_cc = sb->sb_cc; wakeup: KASSERT(toep->ddp_flags & db_flag, ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x", __func__, toep, toep->ddp_flags, report)); toep->ddp_flags &= ~db_flag; sorwakeup_locked(so); SOCKBUF_UNLOCK_ASSERT(sb); INP_WUNLOCK(inp); return (0); } #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR) static int do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); uint32_t vld; struct toepcb *toep = lookup_tid(sc, tid); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); vld = be32toh(cpl->ddpvld); if (__predict_false(vld & DDP_ERR)) { panic("%s: DDP error 0x%x (tid %d, toep %p)", __func__, vld, tid, toep); } handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len)); return (0); } static int do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1); unsigned int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0); return (0); } void enable_ddp(struct adapter *sc, struct toepcb *toep) { KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK, ("%s: toep %p has bad ddp_flags 0x%x", __func__, toep, toep->ddp_flags)); CTR3(KTR_CXGBE, "%s: tid %u (time %u)", __func__, toep->tid, time_uptime); toep->ddp_flags |= DDP_SC_REQ; t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) | V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) | V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1), V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1)); t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS, V_TF_RCV_COALESCE_ENABLE(1), 0); } static inline void disable_ddp(struct adapter *sc, struct toepcb *toep) { KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON, ("%s: toep %p has bad ddp_flags 0x%x", __func__, toep, toep->ddp_flags)); CTR3(KTR_CXGBE, "%s: tid %u (time %u)", __func__, toep->tid, time_uptime); toep->ddp_flags |= DDP_SC_REQ; t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS, V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1)); t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), V_TF_DDP_OFF(1)); } static int hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages) { struct vm_map *map; struct iovec *iov; vm_offset_t start, end; vm_page_t *pp; int n; KASSERT(uio->uio_iovcnt == 1, ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt)); KASSERT(uio->uio_td->td_proc == curproc, ("%s: uio proc (%p) is not curproc (%p)", __func__, uio->uio_td->td_proc, curproc)); map = &curproc->p_vmspace->vm_map; iov = &uio->uio_iov[0]; start = trunc_page((uintptr_t)iov->iov_base); end = round_page((vm_offset_t)iov->iov_base + iov->iov_len); n = howmany(end - start, PAGE_SIZE); if (end - start > MAX_DDP_BUFFER_SIZE) return (E2BIG); pp = malloc(n * sizeof(vm_page_t), M_CXGBE, M_NOWAIT); if (pp == NULL) return (ENOMEM); if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base, iov->iov_len, VM_PROT_WRITE, pp, n) < 0) { free(pp, M_CXGBE); return (EFAULT); } *ppages = pp; *pnpages = n; return (0); } static int bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len) { int i; if (db == NULL || db->npages != npages || db->offset != offset || db->len != len) return (1); for (i = 0; i < npages; i++) { if (pages[i]->phys_addr != db->pages[i]->phys_addr) return (1); } return (0); } static int calculate_hcf(int n1, int n2) { int a, b, t; if (n1 <= n2) { a = n1; b = n2; } else { a = n2; b = n1; } while (a != 0) { t = a; a = b % a; b = t; } return (b); } static struct ddp_buffer * alloc_ddp_buffer(struct tom_data *td, vm_page_t *pages, int npages, int offset, int len) { int i, hcf, seglen, idx, ppod, nppods; struct ddp_buffer *db; /* * The DDP page size is unrelated to the VM page size. We combine * contiguous physical pages into larger segments to get the best DDP * page size possible. This is the largest of the four sizes in * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in * the page list. */ hcf = 0; for (i = 0; i < npages; i++) { seglen = PAGE_SIZE; while (i < npages - 1 && pages[i]->phys_addr + PAGE_SIZE == pages[i + 1]->phys_addr) { seglen += PAGE_SIZE; i++; } hcf = calculate_hcf(hcf, seglen); if (hcf < t4_ddp_pgsz[1]) { idx = 0; goto have_pgsz; /* give up, short circuit */ } } if (hcf % t4_ddp_pgsz[0] != 0) { /* hmmm. This could only happen when PAGE_SIZE < 4K */ KASSERT(PAGE_SIZE < 4096, ("%s: PAGE_SIZE %d, hcf %d", __func__, PAGE_SIZE, hcf)); CTR3(KTR_CXGBE, "%s: PAGE_SIZE %d, hcf %d", __func__, PAGE_SIZE, hcf); return (NULL); } for (idx = nitems(t4_ddp_pgsz) - 1; idx > 0; idx--) { if (hcf % t4_ddp_pgsz[idx] == 0) break; } have_pgsz: db = malloc(sizeof(*db), M_CXGBE, M_NOWAIT); if (db == NULL) { CTR1(KTR_CXGBE, "%s: malloc failed.", __func__); return (NULL); } nppods = pages_to_nppods(npages, t4_ddp_pgsz[idx]); ppod = alloc_ppods(td, nppods, &db->ppod_region); if (ppod < 0) { free(db, M_CXGBE); CTR4(KTR_CXGBE, "%s: no pods, nppods %d, resid %d, pgsz %d", __func__, nppods, len, t4_ddp_pgsz[idx]); return (NULL); } KASSERT(idx <= M_PPOD_PGSZ && ppod <= M_PPOD_TAG, ("%s: DDP pgsz_idx = %d, ppod = %d", __func__, idx, ppod)); db->tag = V_PPOD_PGSZ(idx) | V_PPOD_TAG(ppod); db->nppods = nppods; db->npages = npages; db->pages = pages; db->offset = offset; db->len = len; CTR6(KTR_CXGBE, "New DDP buffer. " "ddp_pgsz %d, ppod 0x%x, npages %d, nppods %d, offset %d, len %d", t4_ddp_pgsz[idx], ppod, db->npages, db->nppods, db->offset, db->len); return (db); } #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE) static int write_page_pods(struct adapter *sc, struct toepcb *toep, struct ddp_buffer *db) { struct wrqe *wr; struct ulp_mem_io *ulpmc; struct ulptx_idata *ulpsc; struct pagepod *ppod; int i, j, k, n, chunk, len, ddp_pgsz, idx, ppod_addr; uint32_t cmd; cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE)); if (is_t4(sc)) cmd |= htobe32(F_ULP_MEMIO_ORDER); else cmd |= htobe32(F_T5_ULP_MEMIO_IMM); ddp_pgsz = t4_ddp_pgsz[G_PPOD_PGSZ(db->tag)]; ppod_addr = sc->vres.ddp.start + G_PPOD_TAG(db->tag) * PPOD_SIZE; for (i = 0; i < db->nppods; ppod_addr += chunk) { /* How many page pods are we writing in this cycle */ n = min(db->nppods - i, NUM_ULP_TX_SC_IMM_PPODS); chunk = PPOD_SZ(n); len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16); wr = alloc_wrqe(len, toep->ctrlq); if (wr == NULL) return (ENOMEM); /* ok to just bail out */ ulpmc = wrtod(wr); INIT_ULPTX_WR(ulpmc, len, 0, 0); ulpmc->cmd = cmd; ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32)); ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16)); ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5)); ulpsc = (struct ulptx_idata *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); ulpsc->len = htobe32(chunk); ppod = (struct pagepod *)(ulpsc + 1); for (j = 0; j < n; i++, j++, ppod++) { ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID | V_PPOD_TID(toep->tid) | db->tag); ppod->len_offset = htobe64(V_PPOD_LEN(db->len) | V_PPOD_OFST(db->offset)); ppod->rsvd = 0; idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE); for (k = 0; k < nitems(ppod->addr); k++) { if (idx < db->npages) { ppod->addr[k] = htobe64(db->pages[idx]->phys_addr); idx += ddp_pgsz / PAGE_SIZE; } else ppod->addr[k] = 0; #if 0 CTR5(KTR_CXGBE, "%s: tid %d ppod[%d]->addr[%d] = %p", __func__, toep->tid, i, k, htobe64(ppod->addr[k])); #endif } } t4_wrq_tx(sc, wr); } return (0); } /* * Reuse, or allocate (and program the page pods for) a new DDP buffer. The * "pages" array is handed over to this function and should not be used in any * way by the caller after that. */ static int select_ddp_buffer(struct adapter *sc, struct toepcb *toep, vm_page_t *pages, int npages, int db_off, int db_len) { struct ddp_buffer *db; struct tom_data *td = sc->tom_softc; int i, empty_slot = -1; /* Try to reuse */ for (i = 0; i < nitems(toep->db); i++) { if (bufcmp(toep->db[i], pages, npages, db_off, db_len) == 0) { free(pages, M_CXGBE); return (i); /* pages still held */ } else if (toep->db[i] == NULL && empty_slot < 0) empty_slot = i; } /* Allocate new buffer, write its page pods. */ db = alloc_ddp_buffer(td, pages, npages, db_off, db_len); if (db == NULL) { vm_page_unhold_pages(pages, npages); free(pages, M_CXGBE); return (-1); } if (write_page_pods(sc, toep, db) != 0) { vm_page_unhold_pages(pages, npages); free_ddp_buffer(td, db); return (-1); } i = empty_slot; if (i < 0) { i = arc4random() % nitems(toep->db); free_ddp_buffer(td, toep->db[i]); } toep->db[i] = db; CTR5(KTR_CXGBE, "%s: tid %d, DDP buffer[%d] = %p (tag 0x%x)", __func__, toep->tid, i, db, db->tag); return (i); } static void wire_ddp_buffer(struct ddp_buffer *db) { int i; vm_page_t p; for (i = 0; i < db->npages; i++) { p = db->pages[i]; vm_page_lock(p); vm_page_wire(p); vm_page_unhold(p); vm_page_unlock(p); } } static void unwire_ddp_buffer(struct ddp_buffer *db) { int i; vm_page_t p; for (i = 0; i < db->npages; i++) { p = db->pages[i]; vm_page_lock(p); - vm_page_unwire(p, 0); + vm_page_unwire(p, PQ_INACTIVE); vm_page_unlock(p); } } static int handle_ddp(struct socket *so, struct uio *uio, int flags, int error) { struct sockbuf *sb = &so->so_rcv; struct tcpcb *tp = so_sototcpcb(so); struct toepcb *toep = tp->t_toe; struct adapter *sc = td_adapter(toep->td); vm_page_t *pages; int npages, db_idx, rc, buf_flag; struct ddp_buffer *db; struct wrqe *wr; uint64_t ddp_flags; SOCKBUF_LOCK_ASSERT(sb); #if 0 if (sb->sb_cc + sc->tt.ddp_thres > uio->uio_resid) { CTR4(KTR_CXGBE, "%s: sb_cc %d, threshold %d, resid %d", __func__, sb->sb_cc, sc->tt.ddp_thres, uio->uio_resid); } #endif /* XXX: too eager to disable DDP, could handle NBIO better than this. */ if (sb->sb_cc >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres || uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 || so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) || error || so->so_error || sb->sb_state & SBS_CANTRCVMORE) goto no_ddp; /* * Fault in and then hold the pages of the uio buffers. We'll wire them * a bit later if everything else works out. */ SOCKBUF_UNLOCK(sb); if (hold_uio(uio, &pages, &npages) != 0) { SOCKBUF_LOCK(sb); goto no_ddp; } SOCKBUF_LOCK(sb); if (__predict_false(so->so_error || sb->sb_state & SBS_CANTRCVMORE)) { vm_page_unhold_pages(pages, npages); free(pages, M_CXGBE); goto no_ddp; } /* * Figure out which one of the two DDP buffers to use this time. */ db_idx = select_ddp_buffer(sc, toep, pages, npages, (uintptr_t)uio->uio_iov->iov_base & PAGE_MASK, uio->uio_resid); pages = NULL; /* handed off to select_ddp_buffer */ if (db_idx < 0) goto no_ddp; db = toep->db[db_idx]; buf_flag = db_idx == 0 ? DDP_BUF0_ACTIVE : DDP_BUF1_ACTIVE; /* * Build the compound work request that tells the chip where to DMA the * payload. */ ddp_flags = select_ddp_flags(so, flags, db_idx); wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sb->sb_cc, ddp_flags); if (wr == NULL) { /* * Just unhold the pages. The DDP buffer's software state is * left as-is in the toep. The page pods were written * successfully and we may have an opportunity to use it in the * future. */ vm_page_unhold_pages(db->pages, db->npages); goto no_ddp; } /* Wire (and then unhold) the pages, and give the chip the go-ahead. */ wire_ddp_buffer(db); t4_wrq_tx(sc, wr); sb->sb_flags &= ~SB_DDP_INDICATE; toep->ddp_flags |= buf_flag; /* * Wait for the DDP operation to complete and then unwire the pages. * The return code from the sbwait will be the final return code of this * function. But we do need to wait for DDP no matter what. */ rc = sbwait(sb); while (toep->ddp_flags & buf_flag) { sb->sb_flags |= SB_WAIT; msleep(&sb->sb_cc, &sb->sb_mtx, PSOCK , "sbwait", 0); } unwire_ddp_buffer(db); return (rc); no_ddp: disable_ddp(sc, toep); discourage_ddp(toep); sb->sb_flags &= ~SB_DDP_INDICATE; return (0); } void t4_init_ddp(struct adapter *sc, struct tom_data *td) { int nppods = sc->vres.ddp.size / PPOD_SIZE; td->nppods = nppods; td->nppods_free = nppods; td->nppods_free_head = nppods; TAILQ_INIT(&td->ppods); mtx_init(&td->ppod_lock, "page pods", NULL, MTX_DEF); t4_register_cpl_handler(sc, CPL_RX_DATA_DDP, do_rx_data_ddp); t4_register_cpl_handler(sc, CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); } void t4_uninit_ddp(struct adapter *sc __unused, struct tom_data *td) { KASSERT(td->nppods == td->nppods_free, ("%s: page pods still in use, nppods = %d, free = %d", __func__, td->nppods, td->nppods_free)); if (mtx_initialized(&td->ppod_lock)) mtx_destroy(&td->ppod_lock); } #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { CXGBE_UNIMPLEMENTED(__func__); } static char ddp_magic_str[] = "nothing to see here"; struct mbuf * get_ddp_mbuf(int len) { struct mbuf *m; m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) CXGBE_UNIMPLEMENTED("mbuf alloc failure"); m->m_len = len; m->m_data = &ddp_magic_str[0]; return (m); } static inline int is_ddp_mbuf(struct mbuf *m) { return (m->m_data == &ddp_magic_str[0]); } /* * Copy an mbuf chain into a uio limited by len if set. */ static int m_mbuftouio_ddp(struct uio *uio, struct mbuf *m, int len) { int error, length, total; int progress = 0; if (len > 0) total = min(uio->uio_resid, len); else total = uio->uio_resid; /* Fill the uio with data from the mbufs. */ for (; m != NULL; m = m->m_next) { length = min(m->m_len, total - progress); if (is_ddp_mbuf(m)) { enum uio_seg segflag = uio->uio_segflg; uio->uio_segflg = UIO_NOCOPY; error = uiomove(mtod(m, void *), length, uio); uio->uio_segflg = segflag; } else error = uiomove(mtod(m, void *), length, uio); if (error) return (error); progress += length; } return (0); } /* * Based on soreceive_stream() in uipc_socket.c */ int t4_soreceive_ddp(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int len = 0, error = 0, flags, oresid, ddp_handled = 0; struct sockbuf *sb; struct mbuf *m, *n = NULL; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (controlp != NULL) return (EINVAL); if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; sb = &so->so_rcv; /* Prevent other readers from entering the socket. */ error = sblock(sb, SBLOCKWAIT(flags)); if (error) goto out; SOCKBUF_LOCK(sb); /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) { error = EINVAL; goto out; } oresid = uio->uio_resid; /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) { /* uio should be just as it was at entry */ KASSERT(oresid == uio->uio_resid, ("%s: oresid = %d, uio_resid = %zd, sb_cc = %d", __func__, oresid, uio->uio_resid, sb->sb_cc)); error = handle_ddp(so, uio, flags, 0); ddp_handled = 1; if (error) goto out; } /* Abort if socket has reported problems. */ if (so->so_error) { if (sb->sb_cc > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sb->sb_cc > 0) goto deliver; else goto out; } /* Socket buffer is empty and we shall not block. */ if (sb->sb_cc == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) && ((sb->sb_flags & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sb->sb_cc >= sb->sb_lowat || sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(sb); if (error) { if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) { (void) handle_ddp(so, uio, flags, 1); ddp_handled = 1; } goto out; } goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) goto restart; /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sb->sb_cc); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { for (*mp0 = m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } sb->sb_mb = m; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); n->m_next = NULL; } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= m->m_len; if (*mp0 != NULL) n->m_next = m; else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio_ddp(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SOCKBUF_LOCK_ASSERT(sb); SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); sbunlock(sb); return (error); } #endif Index: user/attilio/rm_vmobj_cache/sys/dev/drm/via_dmablit.c =================================================================== --- user/attilio/rm_vmobj_cache/sys/dev/drm/via_dmablit.c (revision 267236) +++ user/attilio/rm_vmobj_cache/sys/dev/drm/via_dmablit.c (revision 267237) @@ -1,790 +1,790 @@ /* via_dmablit.c -- PCI DMA BitBlt support for the VIA Unichrome/Pro * * Copyright (C) 2005 Thomas Hellstrom, All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sub license, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. * * Authors: * Thomas Hellstrom. * Partially based on code obtained from Digeo Inc. */ #include __FBSDID("$FreeBSD$"); /* * Unmaps the DMA mappings. * FIXME: Is this a NoOp on x86? Also * FIXME: What happens if this one is called and a pending blit has previously done * the same DMA mappings? */ #include "dev/drm/drmP.h" #include "dev/drm/via_drm.h" #include "dev/drm/via_drv.h" #include "dev/drm/via_dmablit.h" #define VIA_PGDN(x) (((unsigned long)(x)) & ~PAGE_MASK) #define VIA_PGOFF(x) (((unsigned long)(x)) & PAGE_MASK) #define VIA_PFN(x) ((unsigned long)(x) >> PAGE_SHIFT) typedef struct _drm_via_descriptor { uint32_t mem_addr; uint32_t dev_addr; uint32_t size; uint32_t next; } drm_via_descriptor_t; static void via_dmablit_timer(void *arg); /* * Unmap a DMA mapping. */ static void via_unmap_blit_from_device(drm_via_sg_info_t *vsg) { int num_desc = vsg->num_desc; unsigned cur_descriptor_page = num_desc / vsg->descriptors_per_page; unsigned descriptor_this_page = num_desc % vsg->descriptors_per_page; drm_via_descriptor_t *desc_ptr = vsg->desc_pages[cur_descriptor_page] + descriptor_this_page; dma_addr_t next = vsg->chain_start; while(num_desc--) { if (descriptor_this_page-- == 0) { cur_descriptor_page--; descriptor_this_page = vsg->descriptors_per_page - 1; desc_ptr = vsg->desc_pages[cur_descriptor_page] + descriptor_this_page; } next = (dma_addr_t) desc_ptr->next; desc_ptr--; } } /* * If mode = 0, count how many descriptors are needed. * If mode = 1, Map the DMA pages for the device, put together and map also the descriptors. * Descriptors are run in reverse order by the hardware because we are not allowed to update the * 'next' field without syncing calls when the descriptor is already mapped. */ static void via_map_blit_for_device(const drm_via_dmablit_t *xfer, drm_via_sg_info_t *vsg, int mode) { unsigned cur_descriptor_page = 0; unsigned num_descriptors_this_page = 0; unsigned char *mem_addr = xfer->mem_addr; unsigned char *cur_mem; unsigned char *first_addr = (unsigned char *)VIA_PGDN(mem_addr); uint32_t fb_addr = xfer->fb_addr; uint32_t cur_fb; unsigned long line_len; unsigned remaining_len; int num_desc = 0; int cur_line; dma_addr_t next = 0 | VIA_DMA_DPR_EC; drm_via_descriptor_t *desc_ptr = NULL; if (mode == 1) desc_ptr = vsg->desc_pages[cur_descriptor_page]; for (cur_line = 0; cur_line < xfer->num_lines; ++cur_line) { line_len = xfer->line_length; cur_fb = fb_addr; cur_mem = mem_addr; while (line_len > 0) { remaining_len = min(PAGE_SIZE - VIA_PGOFF(cur_mem), line_len); line_len -= remaining_len; if (mode == 1) { desc_ptr->mem_addr = VM_PAGE_TO_PHYS( vsg->pages[VIA_PFN(cur_mem) - VIA_PFN(first_addr)]) + VIA_PGOFF(cur_mem); desc_ptr->dev_addr = cur_fb; desc_ptr->size = remaining_len; desc_ptr->next = (uint32_t) next; next = vtophys(desc_ptr); desc_ptr++; if (++num_descriptors_this_page >= vsg->descriptors_per_page) { num_descriptors_this_page = 0; desc_ptr = vsg->desc_pages[++cur_descriptor_page]; } } num_desc++; cur_mem += remaining_len; cur_fb += remaining_len; } mem_addr += xfer->mem_stride; fb_addr += xfer->fb_stride; } if (mode == 1) { vsg->chain_start = next; vsg->state = dr_via_device_mapped; } vsg->num_desc = num_desc; } /* * Function that frees up all resources for a blit. It is usable even if the * blit info has only been partially built as long as the status enum is consistent * with the actual status of the used resources. */ static void via_free_sg_info(drm_via_sg_info_t *vsg) { vm_page_t page; int i; switch(vsg->state) { case dr_via_device_mapped: via_unmap_blit_from_device(vsg); case dr_via_desc_pages_alloc: for (i=0; inum_desc_pages; ++i) { if (vsg->desc_pages[i] != NULL) free(vsg->desc_pages[i], DRM_MEM_PAGES); } free(vsg->desc_pages, DRM_MEM_DRIVER); case dr_via_pages_locked: for (i=0; i < vsg->num_pages; ++i) { page = vsg->pages[i]; vm_page_lock(page); - vm_page_unwire(page, 0); + vm_page_unwire(page, PQ_INACTIVE); vm_page_unlock(page); } case dr_via_pages_alloc: free(vsg->pages, DRM_MEM_DRIVER); default: vsg->state = dr_via_sg_init; } free(vsg->bounce_buffer, DRM_MEM_DRIVER); vsg->bounce_buffer = NULL; vsg->free_on_sequence = 0; } /* * Fire a blit engine. */ static void via_fire_dmablit(struct drm_device *dev, drm_via_sg_info_t *vsg, int engine) { drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private; VIA_WRITE(VIA_PCI_DMA_MAR0 + engine*0x10, 0); VIA_WRITE(VIA_PCI_DMA_DAR0 + engine*0x10, 0); VIA_WRITE(VIA_PCI_DMA_CSR0 + engine*0x04, VIA_DMA_CSR_DD | VIA_DMA_CSR_TD | VIA_DMA_CSR_DE); VIA_WRITE(VIA_PCI_DMA_MR0 + engine*0x04, VIA_DMA_MR_CM | VIA_DMA_MR_TDIE); VIA_WRITE(VIA_PCI_DMA_BCR0 + engine*0x10, 0); VIA_WRITE(VIA_PCI_DMA_DPR0 + engine*0x10, vsg->chain_start); DRM_WRITEMEMORYBARRIER(); VIA_WRITE(VIA_PCI_DMA_CSR0 + engine*0x04, VIA_DMA_CSR_DE | VIA_DMA_CSR_TS); (void)VIA_READ(VIA_PCI_DMA_CSR0 + engine*0x04); } /* * Obtain a page pointer array and lock all pages into system memory. A segmentation violation will * occur here if the calling user does not have access to the submitted address. */ static int via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer) { unsigned long first_pfn = VIA_PFN(xfer->mem_addr); vm_page_t m; int i; vsg->num_pages = VIA_PFN(xfer->mem_addr + (xfer->num_lines * xfer->mem_stride -1)) - first_pfn + 1; if (NULL == (vsg->pages = malloc(sizeof(vm_page_t) * vsg->num_pages, DRM_MEM_DRIVER, M_NOWAIT))) return -ENOMEM; vsg->state = dr_via_pages_alloc; if (vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)xfer->mem_addr, vsg->num_pages * PAGE_SIZE, VM_PROT_READ | VM_PROT_WRITE, vsg->pages, vsg->num_pages) < 0) return -EACCES; for (i = 0; i < vsg->num_pages; i++) { m = vsg->pages[i]; vm_page_lock(m); vm_page_wire(m); vm_page_unhold(m); vm_page_unlock(m); } vsg->state = dr_via_pages_locked; DRM_DEBUG("DMA pages locked\n"); return 0; } /* * Allocate DMA capable memory for the blit descriptor chain, and an array that keeps track of the * pages we allocate. We don't want to use kmalloc for the descriptor chain because it may be * quite large for some blits, and pages don't need to be contingous. */ static int via_alloc_desc_pages(drm_via_sg_info_t *vsg) { int i; vsg->descriptors_per_page = PAGE_SIZE / sizeof(drm_via_descriptor_t); vsg->num_desc_pages = (vsg->num_desc + vsg->descriptors_per_page - 1) / vsg->descriptors_per_page; if (NULL == (vsg->desc_pages = malloc(vsg->num_desc_pages * sizeof(void *), DRM_MEM_DRIVER, M_NOWAIT | M_ZERO))) return -ENOMEM; vsg->state = dr_via_desc_pages_alloc; for (i = 0; i < vsg->num_desc_pages; ++i) { if (NULL == (vsg->desc_pages[i] = (drm_via_descriptor_t *)malloc(PAGE_SIZE, DRM_MEM_PAGES, M_NOWAIT | M_ZERO))) return -ENOMEM; } DRM_DEBUG("Allocated %d pages for %d descriptors.\n", vsg->num_desc_pages, vsg->num_desc); return 0; } static void via_abort_dmablit(struct drm_device *dev, int engine) { drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private; VIA_WRITE(VIA_PCI_DMA_CSR0 + engine*0x04, VIA_DMA_CSR_TA); } static void via_dmablit_engine_off(struct drm_device *dev, int engine) { drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private; VIA_WRITE(VIA_PCI_DMA_CSR0 + engine*0x04, VIA_DMA_CSR_TD | VIA_DMA_CSR_DD); } /* * The dmablit part of the IRQ handler. Trying to do only reasonably fast things here. * The rest, like unmapping and freeing memory for done blits is done in a separate workqueue * task. Basically the task of the interrupt handler is to submit a new blit to the engine, while * the workqueue task takes care of processing associated with the old blit. */ void via_dmablit_handler(struct drm_device *dev, int engine, int from_irq) { drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private; drm_via_blitq_t *blitq = dev_priv->blit_queues + engine; int cur; int done_transfer; uint32_t status = 0; DRM_DEBUG("DMA blit handler called. engine = %d, from_irq = %d, blitq = 0x%lx\n", engine, from_irq, (unsigned long) blitq); mtx_lock(&blitq->blit_lock); done_transfer = blitq->is_active && (( status = VIA_READ(VIA_PCI_DMA_CSR0 + engine*0x04)) & VIA_DMA_CSR_TD); done_transfer = done_transfer || ( blitq->aborting && !(status & VIA_DMA_CSR_DE)); cur = blitq->cur; if (done_transfer) { blitq->blits[cur]->aborted = blitq->aborting; blitq->done_blit_handle++; DRM_WAKEUP(&blitq->blit_queue[cur]); cur++; if (cur >= VIA_NUM_BLIT_SLOTS) cur = 0; blitq->cur = cur; /* * Clear transfer done flag. */ VIA_WRITE(VIA_PCI_DMA_CSR0 + engine*0x04, VIA_DMA_CSR_TD); blitq->is_active = 0; blitq->aborting = 0; taskqueue_enqueue(taskqueue_swi, &blitq->wq); } else if (blitq->is_active && (ticks >= blitq->end)) { /* * Abort transfer after one second. */ via_abort_dmablit(dev, engine); blitq->aborting = 1; blitq->end = ticks + DRM_HZ; } if (!blitq->is_active) { if (blitq->num_outstanding) { via_fire_dmablit(dev, blitq->blits[cur], engine); blitq->is_active = 1; blitq->cur = cur; blitq->num_outstanding--; blitq->end = ticks + DRM_HZ; if (!callout_pending(&blitq->poll_timer)) callout_reset(&blitq->poll_timer, 1, (timeout_t *)via_dmablit_timer, (void *)blitq); } else { if (callout_pending(&blitq->poll_timer)) { callout_stop(&blitq->poll_timer); } via_dmablit_engine_off(dev, engine); } } mtx_unlock(&blitq->blit_lock); } /* * Check whether this blit is still active, performing necessary locking. */ static int via_dmablit_active(drm_via_blitq_t *blitq, int engine, uint32_t handle, wait_queue_head_t **queue) { uint32_t slot; int active; mtx_lock(&blitq->blit_lock); /* * Allow for handle wraparounds. */ active = ((blitq->done_blit_handle - handle) > (1 << 23)) && ((blitq->cur_blit_handle - handle) <= (1 << 23)); if (queue && active) { slot = handle - blitq->done_blit_handle + blitq->cur -1; if (slot >= VIA_NUM_BLIT_SLOTS) { slot -= VIA_NUM_BLIT_SLOTS; } *queue = blitq->blit_queue + slot; } mtx_unlock(&blitq->blit_lock); return active; } /* * Sync. Wait for at least three seconds for the blit to be performed. */ static int via_dmablit_sync(struct drm_device *dev, uint32_t handle, int engine) { drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private; drm_via_blitq_t *blitq = dev_priv->blit_queues + engine; wait_queue_head_t *queue; int ret = 0; if (via_dmablit_active(blitq, engine, handle, &queue)) { DRM_WAIT_ON(ret, *queue, 3 * DRM_HZ, !via_dmablit_active(blitq, engine, handle, NULL)); } DRM_DEBUG("DMA blit sync handle 0x%x engine %d returned %d\n", handle, engine, ret); return ret; } /* * A timer that regularly polls the blit engine in cases where we don't have interrupts: * a) Broken hardware (typically those that don't have any video capture facility). * b) Blit abort. The hardware doesn't send an interrupt when a blit is aborted. * The timer and hardware IRQ's can and do work in parallel. If the hardware has * irqs, it will shorten the latency somewhat. */ static void via_dmablit_timer(void *arg) { drm_via_blitq_t *blitq = (drm_via_blitq_t *)arg; struct drm_device *dev = blitq->dev; int engine = (int) (blitq - ((drm_via_private_t *)dev->dev_private)->blit_queues); DRM_DEBUG("Polling timer called for engine %d, jiffies %lu\n", engine, (unsigned long) jiffies); via_dmablit_handler(dev, engine, 0); if (!callout_pending(&blitq->poll_timer)) { callout_schedule(&blitq->poll_timer, 1); /* * Rerun handler to delete timer if engines are off, and * to shorten abort latency. This is a little nasty. */ via_dmablit_handler(dev, engine, 0); } } /* * Workqueue task that frees data and mappings associated with a blit. * Also wakes up waiting processes. Each of these tasks handles one * blit engine only and may not be called on each interrupt. */ static void via_dmablit_workqueue(void *arg, int pending) { drm_via_blitq_t *blitq = (drm_via_blitq_t *)arg; struct drm_device *dev = blitq->dev; drm_via_sg_info_t *cur_sg; int cur_released; DRM_DEBUG("task called for blit engine %ld\n",(unsigned long) (blitq - ((drm_via_private_t *)dev->dev_private)->blit_queues)); mtx_lock(&blitq->blit_lock); while(blitq->serviced != blitq->cur) { cur_released = blitq->serviced++; DRM_DEBUG("Releasing blit slot %d\n", cur_released); if (blitq->serviced >= VIA_NUM_BLIT_SLOTS) blitq->serviced = 0; cur_sg = blitq->blits[cur_released]; blitq->num_free++; mtx_unlock(&blitq->blit_lock); DRM_WAKEUP(&blitq->busy_queue); via_free_sg_info(cur_sg); free(cur_sg, DRM_MEM_DRIVER); mtx_lock(&blitq->blit_lock); } mtx_unlock(&blitq->blit_lock); } /* * Init all blit engines. Currently we use two, but some hardware have 4. */ void via_init_dmablit(struct drm_device *dev) { int i,j; drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private; drm_via_blitq_t *blitq; for (i=0; i< VIA_NUM_BLIT_ENGINES; ++i) { blitq = dev_priv->blit_queues + i; blitq->dev = dev; blitq->cur_blit_handle = 0; blitq->done_blit_handle = 0; blitq->head = 0; blitq->cur = 0; blitq->serviced = 0; blitq->num_free = VIA_NUM_BLIT_SLOTS - 1; blitq->num_outstanding = 0; blitq->is_active = 0; blitq->aborting = 0; mtx_init(&blitq->blit_lock, "via_blit_lk", NULL, MTX_DEF); for (j=0; jblit_queue + j); } DRM_INIT_WAITQUEUE(&blitq->busy_queue); TASK_INIT(&blitq->wq, 0, via_dmablit_workqueue, blitq); callout_init(&blitq->poll_timer, 0); } } /* * Build all info and do all mappings required for a blit. */ static int via_build_sg_info(struct drm_device *dev, drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer) { int ret = 0; vsg->bounce_buffer = NULL; vsg->state = dr_via_sg_init; if (xfer->num_lines <= 0 || xfer->line_length <= 0) { DRM_ERROR("Zero size bitblt.\n"); return -EINVAL; } /* * Below check is a driver limitation, not a hardware one. We * don't want to lock unused pages, and don't want to incoporate the * extra logic of avoiding them. Make sure there are no. * (Not a big limitation anyway.) */ if ((xfer->mem_stride - xfer->line_length) > 2 * PAGE_SIZE) { DRM_ERROR("Too large system memory stride. Stride: %d, " "Length: %d\n", xfer->mem_stride, xfer->line_length); return -EINVAL; } if ((xfer->mem_stride == xfer->line_length) && (xfer->fb_stride == xfer->line_length)) { xfer->mem_stride *= xfer->num_lines; xfer->line_length = xfer->mem_stride; xfer->fb_stride = xfer->mem_stride; xfer->num_lines = 1; } /* * Don't lock an arbitrary large number of pages, since that causes a * DOS security hole. */ if (xfer->num_lines > 2048 || (xfer->num_lines*xfer->mem_stride > (2048*2048*4))) { DRM_ERROR("Too large PCI DMA bitblt.\n"); return -EINVAL; } /* * we allow a negative fb stride to allow flipping of images in * transfer. */ if (xfer->mem_stride < xfer->line_length || abs(xfer->fb_stride) < xfer->line_length) { DRM_ERROR("Invalid frame-buffer / memory stride.\n"); return -EINVAL; } /* * A hardware bug seems to be worked around if system memory addresses * start on 16 byte boundaries. This seems a bit restrictive however. * VIA is contacted about this. Meanwhile, impose the following * restrictions: */ #ifdef VIA_BUGFREE if ((((unsigned long)xfer->mem_addr & 3) != ((unsigned long)xfer->fb_addr & 3)) || ((xfer->num_lines > 1) && ((xfer->mem_stride & 3) != (xfer->fb_stride & 3)))) { DRM_ERROR("Invalid DRM bitblt alignment.\n"); return -EINVAL; } #else if ((((unsigned long)xfer->mem_addr & 15) || ((unsigned long)xfer->fb_addr & 3)) || ((xfer->num_lines > 1) && ((xfer->mem_stride & 15) || (xfer->fb_stride & 3)))) { DRM_ERROR("Invalid DRM bitblt alignment.\n"); return -EINVAL; } #endif if (0 != (ret = via_lock_all_dma_pages(vsg, xfer))) { DRM_ERROR("Could not lock DMA pages.\n"); via_free_sg_info(vsg); return ret; } via_map_blit_for_device(xfer, vsg, 0); if (0 != (ret = via_alloc_desc_pages(vsg))) { DRM_ERROR("Could not allocate DMA descriptor pages.\n"); via_free_sg_info(vsg); return ret; } via_map_blit_for_device(xfer, vsg, 1); return 0; } /* * Reserve one free slot in the blit queue. Will wait for one second for one * to become available. Otherwise -EBUSY is returned. */ static int via_dmablit_grab_slot(drm_via_blitq_t *blitq, int engine) { struct drm_device *dev = blitq->dev; int ret=0; DRM_DEBUG("Num free is %d\n", blitq->num_free); mtx_lock(&blitq->blit_lock); while(blitq->num_free == 0) { mtx_unlock(&blitq->blit_lock); DRM_WAIT_ON(ret, blitq->busy_queue, DRM_HZ, blitq->num_free > 0); if (ret) { return (-EINTR == ret) ? -EAGAIN : ret; } mtx_lock(&blitq->blit_lock); } blitq->num_free--; mtx_unlock(&blitq->blit_lock); return 0; } /* * Hand back a free slot if we changed our mind. */ static void via_dmablit_release_slot(drm_via_blitq_t *blitq) { mtx_lock(&blitq->blit_lock); blitq->num_free++; mtx_unlock(&blitq->blit_lock); DRM_WAKEUP( &blitq->busy_queue ); } /* * Grab a free slot. Build blit info and queue a blit. */ static int via_dmablit(struct drm_device *dev, drm_via_dmablit_t *xfer) { drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private; drm_via_sg_info_t *vsg; drm_via_blitq_t *blitq; int ret; int engine; if (dev_priv == NULL) { DRM_ERROR("Called without initialization.\n"); return -EINVAL; } engine = (xfer->to_fb) ? 0 : 1; blitq = dev_priv->blit_queues + engine; if (0 != (ret = via_dmablit_grab_slot(blitq, engine))) { return ret; } if (NULL == (vsg = malloc(sizeof(*vsg), DRM_MEM_DRIVER, M_NOWAIT | M_ZERO))) { via_dmablit_release_slot(blitq); return -ENOMEM; } if (0 != (ret = via_build_sg_info(dev, vsg, xfer))) { via_dmablit_release_slot(blitq); free(vsg, DRM_MEM_DRIVER); return ret; } mtx_lock(&blitq->blit_lock); blitq->blits[blitq->head++] = vsg; if (blitq->head >= VIA_NUM_BLIT_SLOTS) blitq->head = 0; blitq->num_outstanding++; xfer->sync.sync_handle = ++blitq->cur_blit_handle; mtx_unlock(&blitq->blit_lock); xfer->sync.engine = engine; via_dmablit_handler(dev, engine, 0); return 0; } /* * Sync on a previously submitted blit. Note that the X server use signals * extensively, and that there is a very big probability that this IOCTL will * be interrupted by a signal. In that case it returns with -EAGAIN for the * signal to be delivered. The caller should then reissue the IOCTL. This is * similar to what is being done for drmGetLock(). */ int via_dma_blit_sync( struct drm_device *dev, void *data, struct drm_file *file_priv ) { drm_via_blitsync_t *sync = data; int err; if (sync->engine >= VIA_NUM_BLIT_ENGINES) return -EINVAL; err = via_dmablit_sync(dev, sync->sync_handle, sync->engine); if (-EINTR == err) err = -EAGAIN; return err; } /* * Queue a blit and hand back a handle to be used for sync. This IOCTL may be * interrupted by a signal while waiting for a free slot in the blit queue. * In that case it returns with -EAGAIN and should be reissued. See the above * IOCTL code. */ int via_dma_blit( struct drm_device *dev, void *data, struct drm_file *file_priv ) { drm_via_dmablit_t *xfer = data; int err; err = via_dmablit(dev, xfer); return err; } Index: user/attilio/rm_vmobj_cache/sys/dev/drm2/i915/i915_gem.c =================================================================== --- user/attilio/rm_vmobj_cache/sys/dev/drm2/i915/i915_gem.c (revision 267236) +++ user/attilio/rm_vmobj_cache/sys/dev/drm2/i915/i915_gem.c (revision 267237) @@ -1,3785 +1,3785 @@ /*- * Copyright © 2008 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * * Authors: * Eric Anholt * * Copyright (c) 2011 The FreeBSD Foundation * All rights reserved. * * This software was developed by Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include static void i915_gem_object_flush_cpu_write_domain( struct drm_i915_gem_object *obj); static uint32_t i915_gem_get_gtt_size(struct drm_device *dev, uint32_t size, int tiling_mode); static uint32_t i915_gem_get_gtt_alignment(struct drm_device *dev, uint32_t size, int tiling_mode); static int i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj, unsigned alignment, bool map_and_fenceable); static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj, int flags); static void i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj); static int i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write); static void i915_gem_object_set_to_full_cpu_read_domain( struct drm_i915_gem_object *obj); static int i915_gem_object_set_cpu_read_domain_range( struct drm_i915_gem_object *obj, uint64_t offset, uint64_t size); static void i915_gem_object_finish_gtt(struct drm_i915_gem_object *obj); static void i915_gem_object_truncate(struct drm_i915_gem_object *obj); static int i915_gem_object_is_purgeable(struct drm_i915_gem_object *obj); static bool i915_gem_object_is_inactive(struct drm_i915_gem_object *obj); static int i915_gem_object_needs_bit17_swizzle(struct drm_i915_gem_object *obj); static vm_page_t i915_gem_wire_page(vm_object_t object, vm_pindex_t pindex); static void i915_gem_process_flushing_list(struct intel_ring_buffer *ring, uint32_t flush_domains); static void i915_gem_clear_fence_reg(struct drm_device *dev, struct drm_i915_fence_reg *reg); static void i915_gem_reset_fences(struct drm_device *dev); static void i915_gem_retire_task_handler(void *arg, int pending); static int i915_gem_phys_pwrite(struct drm_device *dev, struct drm_i915_gem_object *obj, uint64_t data_ptr, uint64_t offset, uint64_t size, struct drm_file *file_priv); static void i915_gem_lowmem(void *arg); MALLOC_DEFINE(DRM_I915_GEM, "i915gem", "Allocations from i915 gem"); long i915_gem_wired_pages_cnt; static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv, size_t size) { dev_priv->mm.object_count++; dev_priv->mm.object_memory += size; } static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv, size_t size) { dev_priv->mm.object_count--; dev_priv->mm.object_memory -= size; } static int i915_gem_wait_for_error(struct drm_device *dev) { struct drm_i915_private *dev_priv; int ret; dev_priv = dev->dev_private; if (!atomic_load_acq_int(&dev_priv->mm.wedged)) return (0); mtx_lock(&dev_priv->error_completion_lock); while (dev_priv->error_completion == 0) { ret = -msleep(&dev_priv->error_completion, &dev_priv->error_completion_lock, PCATCH, "915wco", 0); if (ret != 0) { mtx_unlock(&dev_priv->error_completion_lock); return (ret); } } mtx_unlock(&dev_priv->error_completion_lock); if (atomic_load_acq_int(&dev_priv->mm.wedged)) { mtx_lock(&dev_priv->error_completion_lock); dev_priv->error_completion++; mtx_unlock(&dev_priv->error_completion_lock); } return (0); } int i915_mutex_lock_interruptible(struct drm_device *dev) { struct drm_i915_private *dev_priv; int ret; dev_priv = dev->dev_private; ret = i915_gem_wait_for_error(dev); if (ret != 0) return (ret); /* * interruptible shall it be. might indeed be if dev_lock is * changed to sx */ ret = sx_xlock_sig(&dev->dev_struct_lock); if (ret != 0) return (-ret); return (0); } static void i915_gem_free_object_tail(struct drm_i915_gem_object *obj) { struct drm_device *dev; drm_i915_private_t *dev_priv; int ret; dev = obj->base.dev; dev_priv = dev->dev_private; ret = i915_gem_object_unbind(obj); if (ret == -ERESTART) { list_move(&obj->mm_list, &dev_priv->mm.deferred_free_list); return; } CTR1(KTR_DRM, "object_destroy_tail %p", obj); drm_gem_free_mmap_offset(&obj->base); drm_gem_object_release(&obj->base); i915_gem_info_remove_obj(dev_priv, obj->base.size); free(obj->page_cpu_valid, DRM_I915_GEM); free(obj->bit_17, DRM_I915_GEM); free(obj, DRM_I915_GEM); } void i915_gem_free_object(struct drm_gem_object *gem_obj) { struct drm_i915_gem_object *obj; struct drm_device *dev; obj = to_intel_bo(gem_obj); dev = obj->base.dev; while (obj->pin_count > 0) i915_gem_object_unpin(obj); if (obj->phys_obj != NULL) i915_gem_detach_phys_object(dev, obj); i915_gem_free_object_tail(obj); } static void init_ring_lists(struct intel_ring_buffer *ring) { INIT_LIST_HEAD(&ring->active_list); INIT_LIST_HEAD(&ring->request_list); INIT_LIST_HEAD(&ring->gpu_write_list); } void i915_gem_load(struct drm_device *dev) { drm_i915_private_t *dev_priv; int i; dev_priv = dev->dev_private; INIT_LIST_HEAD(&dev_priv->mm.active_list); INIT_LIST_HEAD(&dev_priv->mm.flushing_list); INIT_LIST_HEAD(&dev_priv->mm.inactive_list); INIT_LIST_HEAD(&dev_priv->mm.pinned_list); INIT_LIST_HEAD(&dev_priv->mm.fence_list); INIT_LIST_HEAD(&dev_priv->mm.deferred_free_list); INIT_LIST_HEAD(&dev_priv->mm.gtt_list); for (i = 0; i < I915_NUM_RINGS; i++) init_ring_lists(&dev_priv->rings[i]); for (i = 0; i < I915_MAX_NUM_FENCES; i++) INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list); TIMEOUT_TASK_INIT(dev_priv->tq, &dev_priv->mm.retire_task, 0, i915_gem_retire_task_handler, dev_priv); dev_priv->error_completion = 0; /* On GEN3 we really need to make sure the ARB C3 LP bit is set */ if (IS_GEN3(dev)) { u32 tmp = I915_READ(MI_ARB_STATE); if (!(tmp & MI_ARB_C3_LP_WRITE_ENABLE)) { /* * arb state is a masked write, so set bit + * bit in mask. */ tmp = MI_ARB_C3_LP_WRITE_ENABLE | (MI_ARB_C3_LP_WRITE_ENABLE << MI_ARB_MASK_SHIFT); I915_WRITE(MI_ARB_STATE, tmp); } } dev_priv->relative_constants_mode = I915_EXEC_CONSTANTS_REL_GENERAL; /* Old X drivers will take 0-2 for front, back, depth buffers */ if (!drm_core_check_feature(dev, DRIVER_MODESET)) dev_priv->fence_reg_start = 3; if (INTEL_INFO(dev)->gen >= 4 || IS_I945G(dev) || IS_I945GM(dev) || IS_G33(dev)) dev_priv->num_fence_regs = 16; else dev_priv->num_fence_regs = 8; /* Initialize fence registers to zero */ for (i = 0; i < dev_priv->num_fence_regs; i++) { i915_gem_clear_fence_reg(dev, &dev_priv->fence_regs[i]); } i915_gem_detect_bit_6_swizzle(dev); dev_priv->mm.interruptible = true; dev_priv->mm.i915_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, i915_gem_lowmem, dev, EVENTHANDLER_PRI_ANY); } int i915_gem_do_init(struct drm_device *dev, unsigned long start, unsigned long mappable_end, unsigned long end) { drm_i915_private_t *dev_priv; unsigned long mappable; int error; dev_priv = dev->dev_private; mappable = min(end, mappable_end) - start; drm_mm_init(&dev_priv->mm.gtt_space, start, end - start); dev_priv->mm.gtt_start = start; dev_priv->mm.gtt_mappable_end = mappable_end; dev_priv->mm.gtt_end = end; dev_priv->mm.gtt_total = end - start; dev_priv->mm.mappable_gtt_total = mappable; /* Take over this portion of the GTT */ intel_gtt_clear_range(start / PAGE_SIZE, (end-start) / PAGE_SIZE); device_printf(dev->device, "taking over the fictitious range 0x%lx-0x%lx\n", dev->agp->base + start, dev->agp->base + start + mappable); error = -vm_phys_fictitious_reg_range(dev->agp->base + start, dev->agp->base + start + mappable, VM_MEMATTR_WRITE_COMBINING); return (error); } int i915_gem_init_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_init *args; drm_i915_private_t *dev_priv; dev_priv = dev->dev_private; args = data; if (args->gtt_start >= args->gtt_end || (args->gtt_end | args->gtt_start) & (PAGE_SIZE - 1)) return (-EINVAL); if (mtx_initialized(&dev_priv->mm.gtt_space.unused_lock)) return (-EBUSY); /* * XXXKIB. The second-time initialization should be guarded * against. */ return (i915_gem_do_init(dev, args->gtt_start, args->gtt_end, args->gtt_end)); } int i915_gem_idle(struct drm_device *dev) { drm_i915_private_t *dev_priv; int ret; dev_priv = dev->dev_private; if (dev_priv->mm.suspended) return (0); ret = i915_gpu_idle(dev, true); if (ret != 0) return (ret); /* Under UMS, be paranoid and evict. */ if (!drm_core_check_feature(dev, DRIVER_MODESET)) { ret = i915_gem_evict_inactive(dev, false); if (ret != 0) return ret; } i915_gem_reset_fences(dev); /* Hack! Don't let anybody do execbuf while we don't control the chip. * We need to replace this with a semaphore, or something. * And not confound mm.suspended! */ dev_priv->mm.suspended = 1; callout_stop(&dev_priv->hangcheck_timer); i915_kernel_lost_context(dev); i915_gem_cleanup_ringbuffer(dev); /* Cancel the retire work handler, which should be idle now. */ taskqueue_cancel_timeout(dev_priv->tq, &dev_priv->mm.retire_task, NULL); return (ret); } void i915_gem_init_swizzling(struct drm_device *dev) { drm_i915_private_t *dev_priv; dev_priv = dev->dev_private; if (INTEL_INFO(dev)->gen < 5 || dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE) return; I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) | DISP_TILE_SURFACE_SWIZZLING); if (IS_GEN5(dev)) return; I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL); if (IS_GEN6(dev)) I915_WRITE(ARB_MODE, ARB_MODE_ENABLE(ARB_MODE_SWIZZLE_SNB)); else I915_WRITE(ARB_MODE, ARB_MODE_ENABLE(ARB_MODE_SWIZZLE_IVB)); } void i915_gem_init_ppgtt(struct drm_device *dev) { drm_i915_private_t *dev_priv; struct i915_hw_ppgtt *ppgtt; uint32_t pd_offset, pd_entry; vm_paddr_t pt_addr; struct intel_ring_buffer *ring; u_int first_pd_entry_in_global_pt, i; dev_priv = dev->dev_private; ppgtt = dev_priv->mm.aliasing_ppgtt; if (ppgtt == NULL) return; first_pd_entry_in_global_pt = 512 * 1024 - I915_PPGTT_PD_ENTRIES; for (i = 0; i < ppgtt->num_pd_entries; i++) { pt_addr = VM_PAGE_TO_PHYS(ppgtt->pt_pages[i]); pd_entry = GEN6_PDE_ADDR_ENCODE(pt_addr); pd_entry |= GEN6_PDE_VALID; intel_gtt_write(first_pd_entry_in_global_pt + i, pd_entry); } intel_gtt_read_pte(first_pd_entry_in_global_pt); pd_offset = ppgtt->pd_offset; pd_offset /= 64; /* in cachelines, */ pd_offset <<= 16; if (INTEL_INFO(dev)->gen == 6) { uint32_t ecochk = I915_READ(GAM_ECOCHK); I915_WRITE(GAM_ECOCHK, ecochk | ECOCHK_SNB_BIT | ECOCHK_PPGTT_CACHE64B); I915_WRITE(GFX_MODE, GFX_MODE_ENABLE(GFX_PPGTT_ENABLE)); } else if (INTEL_INFO(dev)->gen >= 7) { I915_WRITE(GAM_ECOCHK, ECOCHK_PPGTT_CACHE64B); /* GFX_MODE is per-ring on gen7+ */ } for (i = 0; i < I915_NUM_RINGS; i++) { ring = &dev_priv->rings[i]; if (INTEL_INFO(dev)->gen >= 7) I915_WRITE(RING_MODE_GEN7(ring), GFX_MODE_ENABLE(GFX_PPGTT_ENABLE)); I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G); I915_WRITE(RING_PP_DIR_BASE(ring), pd_offset); } } int i915_gem_init_hw(struct drm_device *dev) { drm_i915_private_t *dev_priv; int ret; dev_priv = dev->dev_private; i915_gem_init_swizzling(dev); ret = intel_init_render_ring_buffer(dev); if (ret != 0) return (ret); if (HAS_BSD(dev)) { ret = intel_init_bsd_ring_buffer(dev); if (ret != 0) goto cleanup_render_ring; } if (HAS_BLT(dev)) { ret = intel_init_blt_ring_buffer(dev); if (ret != 0) goto cleanup_bsd_ring; } dev_priv->next_seqno = 1; i915_gem_init_ppgtt(dev); return (0); cleanup_bsd_ring: intel_cleanup_ring_buffer(&dev_priv->rings[VCS]); cleanup_render_ring: intel_cleanup_ring_buffer(&dev_priv->rings[RCS]); return (ret); } int i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_private *dev_priv; struct drm_i915_gem_get_aperture *args; struct drm_i915_gem_object *obj; size_t pinned; dev_priv = dev->dev_private; args = data; if (!(dev->driver->driver_features & DRIVER_GEM)) return (-ENODEV); pinned = 0; DRM_LOCK(dev); list_for_each_entry(obj, &dev_priv->mm.pinned_list, mm_list) pinned += obj->gtt_space->size; DRM_UNLOCK(dev); args->aper_size = dev_priv->mm.gtt_total; args->aper_available_size = args->aper_size - pinned; return (0); } int i915_gem_object_pin(struct drm_i915_gem_object *obj, uint32_t alignment, bool map_and_fenceable) { struct drm_device *dev; struct drm_i915_private *dev_priv; int ret; dev = obj->base.dev; dev_priv = dev->dev_private; KASSERT(obj->pin_count != DRM_I915_GEM_OBJECT_MAX_PIN_COUNT, ("Max pin count")); if (obj->gtt_space != NULL) { if ((alignment && obj->gtt_offset & (alignment - 1)) || (map_and_fenceable && !obj->map_and_fenceable)) { DRM_DEBUG("bo is already pinned with incorrect alignment:" " offset=%x, req.alignment=%x, req.map_and_fenceable=%d," " obj->map_and_fenceable=%d\n", obj->gtt_offset, alignment, map_and_fenceable, obj->map_and_fenceable); ret = i915_gem_object_unbind(obj); if (ret != 0) return (ret); } } if (obj->gtt_space == NULL) { ret = i915_gem_object_bind_to_gtt(obj, alignment, map_and_fenceable); if (ret) return (ret); } if (obj->pin_count++ == 0 && !obj->active) list_move_tail(&obj->mm_list, &dev_priv->mm.pinned_list); obj->pin_mappable |= map_and_fenceable; #if 1 KIB_NOTYET(); #else WARN_ON(i915_verify_lists(dev)); #endif return (0); } void i915_gem_object_unpin(struct drm_i915_gem_object *obj) { struct drm_device *dev; drm_i915_private_t *dev_priv; dev = obj->base.dev; dev_priv = dev->dev_private; #if 1 KIB_NOTYET(); #else WARN_ON(i915_verify_lists(dev)); #endif KASSERT(obj->pin_count != 0, ("zero pin count")); KASSERT(obj->gtt_space != NULL, ("No gtt mapping")); if (--obj->pin_count == 0) { if (!obj->active) list_move_tail(&obj->mm_list, &dev_priv->mm.inactive_list); obj->pin_mappable = false; } #if 1 KIB_NOTYET(); #else WARN_ON(i915_verify_lists(dev)); #endif } int i915_gem_pin_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_pin *args; struct drm_i915_gem_object *obj; struct drm_gem_object *gobj; int ret; args = data; ret = i915_mutex_lock_interruptible(dev); if (ret != 0) return ret; gobj = drm_gem_object_lookup(dev, file, args->handle); if (gobj == NULL) { ret = -ENOENT; goto unlock; } obj = to_intel_bo(gobj); if (obj->madv != I915_MADV_WILLNEED) { DRM_ERROR("Attempting to pin a purgeable buffer\n"); ret = -EINVAL; goto out; } if (obj->pin_filp != NULL && obj->pin_filp != file) { DRM_ERROR("Already pinned in i915_gem_pin_ioctl(): %d\n", args->handle); ret = -EINVAL; goto out; } obj->user_pin_count++; obj->pin_filp = file; if (obj->user_pin_count == 1) { ret = i915_gem_object_pin(obj, args->alignment, true); if (ret != 0) goto out; } /* XXX - flush the CPU caches for pinned objects * as the X server doesn't manage domains yet */ i915_gem_object_flush_cpu_write_domain(obj); args->offset = obj->gtt_offset; out: drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return (ret); } int i915_gem_unpin_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_pin *args; struct drm_i915_gem_object *obj; int ret; args = data; ret = i915_mutex_lock_interruptible(dev); if (ret != 0) return (ret); obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } if (obj->pin_filp != file) { DRM_ERROR("Not pinned by caller in i915_gem_pin_ioctl(): %d\n", args->handle); ret = -EINVAL; goto out; } obj->user_pin_count--; if (obj->user_pin_count == 0) { obj->pin_filp = NULL; i915_gem_object_unpin(obj); } out: drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return (ret); } int i915_gem_busy_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_busy *args; struct drm_i915_gem_object *obj; struct drm_i915_gem_request *request; int ret; args = data; ret = i915_mutex_lock_interruptible(dev); if (ret != 0) return ret; obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } args->busy = obj->active; if (args->busy) { if (obj->base.write_domain & I915_GEM_GPU_DOMAINS) { ret = i915_gem_flush_ring(obj->ring, 0, obj->base.write_domain); } else if (obj->ring->outstanding_lazy_request == obj->last_rendering_seqno) { request = malloc(sizeof(*request), DRM_I915_GEM, M_WAITOK | M_ZERO); ret = i915_add_request(obj->ring, NULL, request); if (ret != 0) free(request, DRM_I915_GEM); } i915_gem_retire_requests_ring(obj->ring); args->busy = obj->active; } drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return (ret); } static int i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file) { struct drm_i915_private *dev_priv; struct drm_i915_file_private *file_priv; unsigned long recent_enough; struct drm_i915_gem_request *request; struct intel_ring_buffer *ring; u32 seqno; int ret; dev_priv = dev->dev_private; if (atomic_load_acq_int(&dev_priv->mm.wedged)) return (-EIO); file_priv = file->driver_priv; recent_enough = ticks - (20 * hz / 1000); ring = NULL; seqno = 0; mtx_lock(&file_priv->mm.lck); list_for_each_entry(request, &file_priv->mm.request_list, client_list) { if (time_after_eq(request->emitted_jiffies, recent_enough)) break; ring = request->ring; seqno = request->seqno; } mtx_unlock(&file_priv->mm.lck); if (seqno == 0) return (0); ret = 0; mtx_lock(&ring->irq_lock); if (!i915_seqno_passed(ring->get_seqno(ring), seqno)) { if (ring->irq_get(ring)) { while (ret == 0 && !(i915_seqno_passed(ring->get_seqno(ring), seqno) || atomic_load_acq_int(&dev_priv->mm.wedged))) ret = -msleep(ring, &ring->irq_lock, PCATCH, "915thr", 0); ring->irq_put(ring); if (ret == 0 && atomic_load_acq_int(&dev_priv->mm.wedged)) ret = -EIO; } else if (_intel_wait_for(dev, i915_seqno_passed(ring->get_seqno(ring), seqno) || atomic_load_acq_int(&dev_priv->mm.wedged), 3000, 0, "915rtr")) { ret = -EBUSY; } } mtx_unlock(&ring->irq_lock); if (ret == 0) taskqueue_enqueue_timeout(dev_priv->tq, &dev_priv->mm.retire_task, 0); return (ret); } int i915_gem_throttle_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return (i915_gem_ring_throttle(dev, file_priv)); } int i915_gem_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_i915_gem_madvise *args; struct drm_i915_gem_object *obj; int ret; args = data; switch (args->madv) { case I915_MADV_DONTNEED: case I915_MADV_WILLNEED: break; default: return (-EINVAL); } ret = i915_mutex_lock_interruptible(dev); if (ret != 0) return (ret); obj = to_intel_bo(drm_gem_object_lookup(dev, file_priv, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } if (obj->pin_count != 0) { ret = -EINVAL; goto out; } if (obj->madv != I915_MADV_PURGED_INTERNAL) obj->madv = args->madv; if (i915_gem_object_is_purgeable(obj) && obj->gtt_space == NULL) i915_gem_object_truncate(obj); args->retained = obj->madv != I915_MADV_PURGED_INTERNAL; out: drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return (ret); } void i915_gem_cleanup_ringbuffer(struct drm_device *dev) { drm_i915_private_t *dev_priv; int i; dev_priv = dev->dev_private; for (i = 0; i < I915_NUM_RINGS; i++) intel_cleanup_ring_buffer(&dev_priv->rings[i]); } int i915_gem_entervt_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { drm_i915_private_t *dev_priv; int ret, i; if (drm_core_check_feature(dev, DRIVER_MODESET)) return (0); dev_priv = dev->dev_private; if (atomic_load_acq_int(&dev_priv->mm.wedged) != 0) { DRM_ERROR("Reenabling wedged hardware, good luck\n"); atomic_store_rel_int(&dev_priv->mm.wedged, 0); } dev_priv->mm.suspended = 0; ret = i915_gem_init_hw(dev); if (ret != 0) { return (ret); } KASSERT(list_empty(&dev_priv->mm.active_list), ("active list")); KASSERT(list_empty(&dev_priv->mm.flushing_list), ("flushing list")); KASSERT(list_empty(&dev_priv->mm.inactive_list), ("inactive list")); for (i = 0; i < I915_NUM_RINGS; i++) { KASSERT(list_empty(&dev_priv->rings[i].active_list), ("ring %d active list", i)); KASSERT(list_empty(&dev_priv->rings[i].request_list), ("ring %d request list", i)); } DRM_UNLOCK(dev); ret = drm_irq_install(dev); DRM_LOCK(dev); if (ret) goto cleanup_ringbuffer; return (0); cleanup_ringbuffer: i915_gem_cleanup_ringbuffer(dev); dev_priv->mm.suspended = 1; return (ret); } int i915_gem_leavevt_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { if (drm_core_check_feature(dev, DRIVER_MODESET)) return 0; drm_irq_uninstall(dev); return (i915_gem_idle(dev)); } int i915_gem_create(struct drm_file *file, struct drm_device *dev, uint64_t size, uint32_t *handle_p) { struct drm_i915_gem_object *obj; uint32_t handle; int ret; size = roundup(size, PAGE_SIZE); if (size == 0) return (-EINVAL); obj = i915_gem_alloc_object(dev, size); if (obj == NULL) return (-ENOMEM); handle = 0; ret = drm_gem_handle_create(file, &obj->base, &handle); if (ret != 0) { drm_gem_object_release(&obj->base); i915_gem_info_remove_obj(dev->dev_private, obj->base.size); free(obj, DRM_I915_GEM); return (-ret); } /* drop reference from allocate - handle holds it now */ drm_gem_object_unreference(&obj->base); CTR2(KTR_DRM, "object_create %p %x", obj, size); *handle_p = handle; return (0); } int i915_gem_dumb_create(struct drm_file *file, struct drm_device *dev, struct drm_mode_create_dumb *args) { /* have to work out size/pitch and return them */ args->pitch = roundup2(args->width * ((args->bpp + 7) / 8), 64); args->size = args->pitch * args->height; return (i915_gem_create(file, dev, args->size, &args->handle)); } int i915_gem_dumb_destroy(struct drm_file *file, struct drm_device *dev, uint32_t handle) { return (drm_gem_handle_delete(file, handle)); } int i915_gem_create_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_create *args = data; return (i915_gem_create(file, dev, args->size, &args->handle)); } static int i915_gem_swap_io(struct drm_device *dev, struct drm_i915_gem_object *obj, uint64_t data_ptr, uint64_t size, uint64_t offset, enum uio_rw rw, struct drm_file *file) { vm_object_t vm_obj; vm_page_t m; struct sf_buf *sf; vm_offset_t mkva; vm_pindex_t obj_pi; int cnt, do_bit17_swizzling, length, obj_po, ret, swizzled_po; if (obj->gtt_offset != 0 && rw == UIO_READ) do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj); else do_bit17_swizzling = 0; obj->dirty = 1; vm_obj = obj->base.vm_obj; ret = 0; VM_OBJECT_WLOCK(vm_obj); vm_object_pip_add(vm_obj, 1); while (size > 0) { obj_pi = OFF_TO_IDX(offset); obj_po = offset & PAGE_MASK; m = i915_gem_wire_page(vm_obj, obj_pi); VM_OBJECT_WUNLOCK(vm_obj); sched_pin(); sf = sf_buf_alloc(m, SFB_CPUPRIVATE); mkva = sf_buf_kva(sf); length = min(size, PAGE_SIZE - obj_po); while (length > 0) { if (do_bit17_swizzling && (VM_PAGE_TO_PHYS(m) & (1 << 17)) != 0) { cnt = roundup2(obj_po + 1, 64); cnt = min(cnt - obj_po, length); swizzled_po = obj_po ^ 64; } else { cnt = length; swizzled_po = obj_po; } if (rw == UIO_READ) ret = -copyout_nofault( (char *)mkva + swizzled_po, (void *)(uintptr_t)data_ptr, cnt); else ret = -copyin_nofault( (void *)(uintptr_t)data_ptr, (char *)mkva + swizzled_po, cnt); if (ret != 0) break; data_ptr += cnt; size -= cnt; length -= cnt; offset += cnt; obj_po += cnt; } sf_buf_free(sf); sched_unpin(); VM_OBJECT_WLOCK(vm_obj); if (rw == UIO_WRITE) vm_page_dirty(m); vm_page_reference(m); vm_page_lock(m); - vm_page_unwire(m, 1); + vm_page_unwire(m, PQ_ACTIVE); vm_page_unlock(m); atomic_add_long(&i915_gem_wired_pages_cnt, -1); if (ret != 0) break; } vm_object_pip_wakeup(vm_obj); VM_OBJECT_WUNLOCK(vm_obj); return (ret); } static int i915_gem_gtt_write(struct drm_device *dev, struct drm_i915_gem_object *obj, uint64_t data_ptr, uint64_t size, uint64_t offset, struct drm_file *file) { vm_offset_t mkva; vm_pindex_t obj_pi; int obj_po, ret; obj_pi = OFF_TO_IDX(offset); obj_po = offset & PAGE_MASK; mkva = (vm_offset_t)pmap_mapdev_attr(dev->agp->base + obj->gtt_offset + IDX_TO_OFF(obj_pi), size, PAT_WRITE_COMBINING); ret = -copyin_nofault((void *)(uintptr_t)data_ptr, (char *)mkva + obj_po, size); pmap_unmapdev(mkva, size); return (ret); } static int i915_gem_obj_io(struct drm_device *dev, uint32_t handle, uint64_t data_ptr, uint64_t size, uint64_t offset, enum uio_rw rw, struct drm_file *file) { struct drm_i915_gem_object *obj; vm_page_t *ma; vm_offset_t start, end; int npages, ret; if (size == 0) return (0); start = trunc_page(data_ptr); end = round_page(data_ptr + size); npages = howmany(end - start, PAGE_SIZE); ma = malloc(npages * sizeof(vm_page_t), DRM_I915_GEM, M_WAITOK | M_ZERO); npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)data_ptr, size, (rw == UIO_READ ? VM_PROT_WRITE : 0 ) | VM_PROT_READ, ma, npages); if (npages == -1) { ret = -EFAULT; goto free_ma; } ret = i915_mutex_lock_interruptible(dev); if (ret != 0) goto unlocked; obj = to_intel_bo(drm_gem_object_lookup(dev, file, handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } if (offset > obj->base.size || size > obj->base.size - offset) { ret = -EINVAL; goto out; } if (rw == UIO_READ) { CTR3(KTR_DRM, "object_pread %p %jx %jx", obj, offset, size); ret = i915_gem_object_set_cpu_read_domain_range(obj, offset, size); if (ret != 0) goto out; ret = i915_gem_swap_io(dev, obj, data_ptr, size, offset, UIO_READ, file); } else { if (obj->phys_obj) { CTR3(KTR_DRM, "object_phys_write %p %jx %jx", obj, offset, size); ret = i915_gem_phys_pwrite(dev, obj, data_ptr, offset, size, file); } else if (obj->gtt_space && obj->base.write_domain != I915_GEM_DOMAIN_CPU) { CTR3(KTR_DRM, "object_gtt_write %p %jx %jx", obj, offset, size); ret = i915_gem_object_pin(obj, 0, true); if (ret != 0) goto out; ret = i915_gem_object_set_to_gtt_domain(obj, true); if (ret != 0) goto out_unpin; ret = i915_gem_object_put_fence(obj); if (ret != 0) goto out_unpin; ret = i915_gem_gtt_write(dev, obj, data_ptr, size, offset, file); out_unpin: i915_gem_object_unpin(obj); } else { CTR3(KTR_DRM, "object_pwrite %p %jx %jx", obj, offset, size); ret = i915_gem_object_set_to_cpu_domain(obj, true); if (ret != 0) goto out; ret = i915_gem_swap_io(dev, obj, data_ptr, size, offset, UIO_WRITE, file); } } out: drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); unlocked: vm_page_unhold_pages(ma, npages); free_ma: free(ma, DRM_I915_GEM); return (ret); } int i915_gem_pread_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_pread *args; args = data; return (i915_gem_obj_io(dev, args->handle, args->data_ptr, args->size, args->offset, UIO_READ, file)); } int i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_pwrite *args; args = data; return (i915_gem_obj_io(dev, args->handle, args->data_ptr, args->size, args->offset, UIO_WRITE, file)); } int i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_set_domain *args; struct drm_i915_gem_object *obj; uint32_t read_domains; uint32_t write_domain; int ret; if ((dev->driver->driver_features & DRIVER_GEM) == 0) return (-ENODEV); args = data; read_domains = args->read_domains; write_domain = args->write_domain; if ((write_domain & I915_GEM_GPU_DOMAINS) != 0 || (read_domains & I915_GEM_GPU_DOMAINS) != 0 || (write_domain != 0 && read_domains != write_domain)) return (-EINVAL); ret = i915_mutex_lock_interruptible(dev); if (ret != 0) return (ret); obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } if ((read_domains & I915_GEM_DOMAIN_GTT) != 0) { ret = i915_gem_object_set_to_gtt_domain(obj, write_domain != 0); if (ret == -EINVAL) ret = 0; } else ret = i915_gem_object_set_to_cpu_domain(obj, write_domain != 0); drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return (ret); } int i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_sw_finish *args; struct drm_i915_gem_object *obj; int ret; args = data; ret = 0; if ((dev->driver->driver_features & DRIVER_GEM) == 0) return (ENODEV); ret = i915_mutex_lock_interruptible(dev); if (ret != 0) return (ret); obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } if (obj->pin_count != 0) i915_gem_object_flush_cpu_write_domain(obj); drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return (ret); } int i915_gem_mmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_mmap *args; struct drm_gem_object *obj; struct proc *p; vm_map_t map; vm_offset_t addr; vm_size_t size; int error, rv; args = data; if ((dev->driver->driver_features & DRIVER_GEM) == 0) return (-ENODEV); obj = drm_gem_object_lookup(dev, file, args->handle); if (obj == NULL) return (-ENOENT); error = 0; if (args->size == 0) goto out; p = curproc; map = &p->p_vmspace->vm_map; size = round_page(args->size); PROC_LOCK(p); if (map->size + size > lim_cur(p, RLIMIT_VMEM)) { PROC_UNLOCK(p); error = ENOMEM; goto out; } PROC_UNLOCK(p); addr = 0; vm_object_reference(obj->vm_obj); DRM_UNLOCK(dev); rv = vm_map_find(map, obj->vm_obj, args->offset, &addr, args->size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, MAP_INHERIT_SHARE); if (rv != KERN_SUCCESS) { vm_object_deallocate(obj->vm_obj); error = -vm_mmap_to_errno(rv); } else { args->addr_ptr = (uint64_t)addr; } DRM_LOCK(dev); out: drm_gem_object_unreference(obj); return (error); } static int i915_gem_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { *color = 0; /* XXXKIB */ return (0); } int i915_intr_pf; static int i915_gem_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot, vm_page_t *mres) { struct drm_gem_object *gem_obj; struct drm_i915_gem_object *obj; struct drm_device *dev; drm_i915_private_t *dev_priv; vm_page_t m, oldm; int cause, ret; bool write; gem_obj = vm_obj->handle; obj = to_intel_bo(gem_obj); dev = obj->base.dev; dev_priv = dev->dev_private; #if 0 write = (prot & VM_PROT_WRITE) != 0; #else write = true; #endif vm_object_pip_add(vm_obj, 1); /* * Remove the placeholder page inserted by vm_fault() from the * object before dropping the object lock. If * i915_gem_release_mmap() is active in parallel on this gem * object, then it owns the drm device sx and might find the * placeholder already. Then, since the page is busy, * i915_gem_release_mmap() sleeps waiting for the busy state * of the page cleared. We will be not able to acquire drm * device lock until i915_gem_release_mmap() is able to make a * progress. */ if (*mres != NULL) { oldm = *mres; vm_page_lock(oldm); vm_page_remove(oldm); vm_page_unlock(oldm); *mres = NULL; } else oldm = NULL; VM_OBJECT_WUNLOCK(vm_obj); retry: cause = ret = 0; m = NULL; if (i915_intr_pf) { ret = i915_mutex_lock_interruptible(dev); if (ret != 0) { cause = 10; goto out; } } else DRM_LOCK(dev); /* * Since the object lock was dropped, other thread might have * faulted on the same GTT address and instantiated the * mapping for the page. Recheck. */ VM_OBJECT_WLOCK(vm_obj); m = vm_page_lookup(vm_obj, OFF_TO_IDX(offset)); if (m != NULL) { if (vm_page_busied(m)) { DRM_UNLOCK(dev); vm_page_lock(m); VM_OBJECT_WUNLOCK(vm_obj); vm_page_busy_sleep(m, "915pee"); goto retry; } goto have_page; } else VM_OBJECT_WUNLOCK(vm_obj); /* Now bind it into the GTT if needed */ if (!obj->map_and_fenceable) { ret = i915_gem_object_unbind(obj); if (ret != 0) { cause = 20; goto unlock; } } if (!obj->gtt_space) { ret = i915_gem_object_bind_to_gtt(obj, 0, true); if (ret != 0) { cause = 30; goto unlock; } ret = i915_gem_object_set_to_gtt_domain(obj, write); if (ret != 0) { cause = 40; goto unlock; } } if (obj->tiling_mode == I915_TILING_NONE) ret = i915_gem_object_put_fence(obj); else ret = i915_gem_object_get_fence(obj, NULL); if (ret != 0) { cause = 50; goto unlock; } if (i915_gem_object_is_inactive(obj)) list_move_tail(&obj->mm_list, &dev_priv->mm.inactive_list); obj->fault_mappable = true; VM_OBJECT_WLOCK(vm_obj); m = vm_phys_fictitious_to_vm_page(dev->agp->base + obj->gtt_offset + offset); if (m == NULL) { VM_OBJECT_WUNLOCK(vm_obj); cause = 60; ret = -EFAULT; goto unlock; } KASSERT((m->flags & PG_FICTITIOUS) != 0, ("not fictitious %p", m)); KASSERT(m->wire_count == 1, ("wire_count not 1 %p", m)); if (vm_page_busied(m)) { DRM_UNLOCK(dev); vm_page_lock(m); VM_OBJECT_WUNLOCK(vm_obj); vm_page_busy_sleep(m, "915pbs"); goto retry; } if (vm_page_insert(m, vm_obj, OFF_TO_IDX(offset))) { DRM_UNLOCK(dev); VM_OBJECT_WUNLOCK(vm_obj); VM_WAIT; goto retry; } m->valid = VM_PAGE_BITS_ALL; have_page: *mres = m; vm_page_xbusy(m); CTR4(KTR_DRM, "fault %p %jx %x phys %x", gem_obj, offset, prot, m->phys_addr); DRM_UNLOCK(dev); if (oldm != NULL) { vm_page_lock(oldm); vm_page_free(oldm); vm_page_unlock(oldm); } vm_object_pip_wakeup(vm_obj); return (VM_PAGER_OK); unlock: DRM_UNLOCK(dev); out: KASSERT(ret != 0, ("i915_gem_pager_fault: wrong return")); CTR5(KTR_DRM, "fault_fail %p %jx %x err %d %d", gem_obj, offset, prot, -ret, cause); if (ret == -EAGAIN || ret == -EIO || ret == -EINTR) { kern_yield(PRI_USER); goto retry; } VM_OBJECT_WLOCK(vm_obj); vm_object_pip_wakeup(vm_obj); return (VM_PAGER_ERROR); } static void i915_gem_pager_dtor(void *handle) { struct drm_gem_object *obj; struct drm_device *dev; obj = handle; dev = obj->dev; DRM_LOCK(dev); drm_gem_free_mmap_offset(obj); i915_gem_release_mmap(to_intel_bo(obj)); drm_gem_object_unreference(obj); DRM_UNLOCK(dev); } struct cdev_pager_ops i915_gem_pager_ops = { .cdev_pg_fault = i915_gem_pager_fault, .cdev_pg_ctor = i915_gem_pager_ctor, .cdev_pg_dtor = i915_gem_pager_dtor }; int i915_gem_mmap_gtt(struct drm_file *file, struct drm_device *dev, uint32_t handle, uint64_t *offset) { struct drm_i915_private *dev_priv; struct drm_i915_gem_object *obj; int ret; if (!(dev->driver->driver_features & DRIVER_GEM)) return (-ENODEV); dev_priv = dev->dev_private; ret = i915_mutex_lock_interruptible(dev); if (ret != 0) return (ret); obj = to_intel_bo(drm_gem_object_lookup(dev, file, handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } if (obj->base.size > dev_priv->mm.gtt_mappable_end) { ret = -E2BIG; goto out; } if (obj->madv != I915_MADV_WILLNEED) { DRM_ERROR("Attempting to mmap a purgeable buffer\n"); ret = -EINVAL; goto out; } ret = drm_gem_create_mmap_offset(&obj->base); if (ret != 0) goto out; *offset = DRM_GEM_MAPPING_OFF(obj->base.map_list.key) | DRM_GEM_MAPPING_KEY; out: drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return (ret); } int i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_private *dev_priv; struct drm_i915_gem_mmap_gtt *args; dev_priv = dev->dev_private; args = data; return (i915_gem_mmap_gtt(file, dev, args->handle, &args->offset)); } struct drm_i915_gem_object * i915_gem_alloc_object(struct drm_device *dev, size_t size) { struct drm_i915_private *dev_priv; struct drm_i915_gem_object *obj; dev_priv = dev->dev_private; obj = malloc(sizeof(*obj), DRM_I915_GEM, M_WAITOK | M_ZERO); if (drm_gem_object_init(dev, &obj->base, size) != 0) { free(obj, DRM_I915_GEM); return (NULL); } obj->base.write_domain = I915_GEM_DOMAIN_CPU; obj->base.read_domains = I915_GEM_DOMAIN_CPU; if (HAS_LLC(dev)) obj->cache_level = I915_CACHE_LLC; else obj->cache_level = I915_CACHE_NONE; obj->base.driver_private = NULL; obj->fence_reg = I915_FENCE_REG_NONE; INIT_LIST_HEAD(&obj->mm_list); INIT_LIST_HEAD(&obj->gtt_list); INIT_LIST_HEAD(&obj->ring_list); INIT_LIST_HEAD(&obj->exec_list); INIT_LIST_HEAD(&obj->gpu_write_list); obj->madv = I915_MADV_WILLNEED; /* Avoid an unnecessary call to unbind on the first bind. */ obj->map_and_fenceable = true; i915_gem_info_add_obj(dev_priv, size); return (obj); } void i915_gem_clflush_object(struct drm_i915_gem_object *obj) { /* If we don't have a page list set up, then we're not pinned * to GPU, and we can ignore the cache flush because it'll happen * again at bind time. */ if (obj->pages == NULL) return; /* If the GPU is snooping the contents of the CPU cache, * we do not need to manually clear the CPU cache lines. However, * the caches are only snooped when the render cache is * flushed/invalidated. As we always have to emit invalidations * and flushes when moving into and out of the RENDER domain, correct * snooping behaviour occurs naturally as the result of our domain * tracking. */ if (obj->cache_level != I915_CACHE_NONE) return; CTR1(KTR_DRM, "object_clflush %p", obj); drm_clflush_pages(obj->pages, obj->base.size / PAGE_SIZE); } static void i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj) { uint32_t old_write_domain; if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) return; i915_gem_clflush_object(obj); intel_gtt_chipset_flush(); old_write_domain = obj->base.write_domain; obj->base.write_domain = 0; CTR3(KTR_DRM, "object_change_domain flush_cpu_write %p %x %x", obj, obj->base.read_domains, old_write_domain); } static int i915_gem_object_flush_gpu_write_domain(struct drm_i915_gem_object *obj) { if ((obj->base.write_domain & I915_GEM_GPU_DOMAINS) == 0) return (0); return (i915_gem_flush_ring(obj->ring, 0, obj->base.write_domain)); } static void i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj) { uint32_t old_write_domain; if (obj->base.write_domain != I915_GEM_DOMAIN_GTT) return; wmb(); old_write_domain = obj->base.write_domain; obj->base.write_domain = 0; CTR3(KTR_DRM, "object_change_domain flush gtt_write %p %x %x", obj, obj->base.read_domains, old_write_domain); } int i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write) { uint32_t old_write_domain, old_read_domains; int ret; if (obj->gtt_space == NULL) return (-EINVAL); if (obj->base.write_domain == I915_GEM_DOMAIN_GTT) return 0; ret = i915_gem_object_flush_gpu_write_domain(obj); if (ret != 0) return (ret); if (obj->pending_gpu_write || write) { ret = i915_gem_object_wait_rendering(obj); if (ret != 0) return (ret); } i915_gem_object_flush_cpu_write_domain(obj); old_write_domain = obj->base.write_domain; old_read_domains = obj->base.read_domains; KASSERT((obj->base.write_domain & ~I915_GEM_DOMAIN_GTT) == 0, ("In GTT write domain")); obj->base.read_domains |= I915_GEM_DOMAIN_GTT; if (write) { obj->base.read_domains = I915_GEM_DOMAIN_GTT; obj->base.write_domain = I915_GEM_DOMAIN_GTT; obj->dirty = 1; } CTR3(KTR_DRM, "object_change_domain set_to_gtt %p %x %x", obj, old_read_domains, old_write_domain); return (0); } int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj, enum i915_cache_level cache_level) { struct drm_device *dev; drm_i915_private_t *dev_priv; int ret; if (obj->cache_level == cache_level) return 0; if (obj->pin_count) { DRM_DEBUG("can not change the cache level of pinned objects\n"); return (-EBUSY); } dev = obj->base.dev; dev_priv = dev->dev_private; if (obj->gtt_space) { ret = i915_gem_object_finish_gpu(obj); if (ret != 0) return (ret); i915_gem_object_finish_gtt(obj); /* Before SandyBridge, you could not use tiling or fence * registers with snooped memory, so relinquish any fences * currently pointing to our region in the aperture. */ if (INTEL_INFO(obj->base.dev)->gen < 6) { ret = i915_gem_object_put_fence(obj); if (ret != 0) return (ret); } i915_gem_gtt_rebind_object(obj, cache_level); if (obj->has_aliasing_ppgtt_mapping) i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt, obj, cache_level); } if (cache_level == I915_CACHE_NONE) { u32 old_read_domains, old_write_domain; /* If we're coming from LLC cached, then we haven't * actually been tracking whether the data is in the * CPU cache or not, since we only allow one bit set * in obj->write_domain and have been skipping the clflushes. * Just set it to the CPU cache for now. */ KASSERT((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) == 0, ("obj %p in CPU write domain", obj)); KASSERT((obj->base.read_domains & ~I915_GEM_DOMAIN_CPU) == 0, ("obj %p in CPU read domain", obj)); old_read_domains = obj->base.read_domains; old_write_domain = obj->base.write_domain; obj->base.read_domains = I915_GEM_DOMAIN_CPU; obj->base.write_domain = I915_GEM_DOMAIN_CPU; CTR3(KTR_DRM, "object_change_domain set_cache_level %p %x %x", obj, old_read_domains, old_write_domain); } obj->cache_level = cache_level; return (0); } int i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, u32 alignment, struct intel_ring_buffer *pipelined) { u32 old_read_domains, old_write_domain; int ret; ret = i915_gem_object_flush_gpu_write_domain(obj); if (ret != 0) return (ret); if (pipelined != obj->ring) { ret = i915_gem_object_wait_rendering(obj); if (ret == -ERESTART || ret == -EINTR) return (ret); } ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE); if (ret != 0) return (ret); ret = i915_gem_object_pin(obj, alignment, true); if (ret != 0) return (ret); i915_gem_object_flush_cpu_write_domain(obj); old_write_domain = obj->base.write_domain; old_read_domains = obj->base.read_domains; KASSERT((obj->base.write_domain & ~I915_GEM_DOMAIN_GTT) == 0, ("obj %p in GTT write domain", obj)); obj->base.read_domains |= I915_GEM_DOMAIN_GTT; CTR3(KTR_DRM, "object_change_domain pin_to_display_plan %p %x %x", obj, old_read_domains, obj->base.write_domain); return (0); } int i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj) { int ret; if ((obj->base.read_domains & I915_GEM_GPU_DOMAINS) == 0) return (0); if (obj->base.write_domain & I915_GEM_GPU_DOMAINS) { ret = i915_gem_flush_ring(obj->ring, 0, obj->base.write_domain); if (ret != 0) return (ret); } ret = i915_gem_object_wait_rendering(obj); if (ret != 0) return (ret); obj->base.read_domains &= ~I915_GEM_GPU_DOMAINS; return (0); } static int i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write) { uint32_t old_write_domain, old_read_domains; int ret; if (obj->base.write_domain == I915_GEM_DOMAIN_CPU) return 0; ret = i915_gem_object_flush_gpu_write_domain(obj); if (ret != 0) return (ret); ret = i915_gem_object_wait_rendering(obj); if (ret != 0) return (ret); i915_gem_object_flush_gtt_write_domain(obj); i915_gem_object_set_to_full_cpu_read_domain(obj); old_write_domain = obj->base.write_domain; old_read_domains = obj->base.read_domains; if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) { i915_gem_clflush_object(obj); obj->base.read_domains |= I915_GEM_DOMAIN_CPU; } KASSERT((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) == 0, ("In cpu write domain")); if (write) { obj->base.read_domains = I915_GEM_DOMAIN_CPU; obj->base.write_domain = I915_GEM_DOMAIN_CPU; } CTR3(KTR_DRM, "object_change_domain set_to_cpu %p %x %x", obj, old_read_domains, old_write_domain); return (0); } static void i915_gem_object_set_to_full_cpu_read_domain(struct drm_i915_gem_object *obj) { int i; if (obj->page_cpu_valid == NULL) return; if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) != 0) { for (i = 0; i <= (obj->base.size - 1) / PAGE_SIZE; i++) { if (obj->page_cpu_valid[i] != 0) continue; drm_clflush_pages(obj->pages + i, 1); } } free(obj->page_cpu_valid, DRM_I915_GEM); obj->page_cpu_valid = NULL; } static int i915_gem_object_set_cpu_read_domain_range(struct drm_i915_gem_object *obj, uint64_t offset, uint64_t size) { uint32_t old_read_domains; int i, ret; if (offset == 0 && size == obj->base.size) return (i915_gem_object_set_to_cpu_domain(obj, 0)); ret = i915_gem_object_flush_gpu_write_domain(obj); if (ret != 0) return (ret); ret = i915_gem_object_wait_rendering(obj); if (ret != 0) return (ret); i915_gem_object_flush_gtt_write_domain(obj); if (obj->page_cpu_valid == NULL && (obj->base.read_domains & I915_GEM_DOMAIN_CPU) != 0) return (0); if (obj->page_cpu_valid == NULL) { obj->page_cpu_valid = malloc(obj->base.size / PAGE_SIZE, DRM_I915_GEM, M_WAITOK | M_ZERO); } else if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) memset(obj->page_cpu_valid, 0, obj->base.size / PAGE_SIZE); for (i = offset / PAGE_SIZE; i <= (offset + size - 1) / PAGE_SIZE; i++) { if (obj->page_cpu_valid[i]) continue; drm_clflush_pages(obj->pages + i, 1); obj->page_cpu_valid[i] = 1; } KASSERT((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) == 0, ("In gpu write domain")); old_read_domains = obj->base.read_domains; obj->base.read_domains |= I915_GEM_DOMAIN_CPU; CTR3(KTR_DRM, "object_change_domain set_cpu_read %p %x %x", obj, old_read_domains, obj->base.write_domain); return (0); } static uint32_t i915_gem_get_gtt_size(struct drm_device *dev, uint32_t size, int tiling_mode) { uint32_t gtt_size; if (INTEL_INFO(dev)->gen >= 4 || tiling_mode == I915_TILING_NONE) return (size); /* Previous chips need a power-of-two fence region when tiling */ if (INTEL_INFO(dev)->gen == 3) gtt_size = 1024*1024; else gtt_size = 512*1024; while (gtt_size < size) gtt_size <<= 1; return (gtt_size); } /** * i915_gem_get_gtt_alignment - return required GTT alignment for an object * @obj: object to check * * Return the required GTT alignment for an object, taking into account * potential fence register mapping. */ static uint32_t i915_gem_get_gtt_alignment(struct drm_device *dev, uint32_t size, int tiling_mode) { /* * Minimum alignment is 4k (GTT page size), but might be greater * if a fence register is needed for the object. */ if (INTEL_INFO(dev)->gen >= 4 || tiling_mode == I915_TILING_NONE) return (4096); /* * Previous chips need to be aligned to the size of the smallest * fence register that can contain the object. */ return (i915_gem_get_gtt_size(dev, size, tiling_mode)); } uint32_t i915_gem_get_unfenced_gtt_alignment(struct drm_device *dev, uint32_t size, int tiling_mode) { if (tiling_mode == I915_TILING_NONE) return (4096); /* * Minimum alignment is 4k (GTT page size) for sane hw. */ if (INTEL_INFO(dev)->gen >= 4 || IS_G33(dev)) return (4096); /* * Previous hardware however needs to be aligned to a power-of-two * tile height. The simplest method for determining this is to reuse * the power-of-tile object size. */ return (i915_gem_get_gtt_size(dev, size, tiling_mode)); } static int i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj, unsigned alignment, bool map_and_fenceable) { struct drm_device *dev; struct drm_i915_private *dev_priv; struct drm_mm_node *free_space; uint32_t size, fence_size, fence_alignment, unfenced_alignment; bool mappable, fenceable; int ret; dev = obj->base.dev; dev_priv = dev->dev_private; if (obj->madv != I915_MADV_WILLNEED) { DRM_ERROR("Attempting to bind a purgeable object\n"); return (-EINVAL); } fence_size = i915_gem_get_gtt_size(dev, obj->base.size, obj->tiling_mode); fence_alignment = i915_gem_get_gtt_alignment(dev, obj->base.size, obj->tiling_mode); unfenced_alignment = i915_gem_get_unfenced_gtt_alignment(dev, obj->base.size, obj->tiling_mode); if (alignment == 0) alignment = map_and_fenceable ? fence_alignment : unfenced_alignment; if (map_and_fenceable && (alignment & (fence_alignment - 1)) != 0) { DRM_ERROR("Invalid object alignment requested %u\n", alignment); return (-EINVAL); } size = map_and_fenceable ? fence_size : obj->base.size; /* If the object is bigger than the entire aperture, reject it early * before evicting everything in a vain attempt to find space. */ if (obj->base.size > (map_and_fenceable ? dev_priv->mm.gtt_mappable_end : dev_priv->mm.gtt_total)) { DRM_ERROR( "Attempting to bind an object larger than the aperture\n"); return (-E2BIG); } search_free: if (map_and_fenceable) free_space = drm_mm_search_free_in_range( &dev_priv->mm.gtt_space, size, alignment, 0, dev_priv->mm.gtt_mappable_end, 0); else free_space = drm_mm_search_free(&dev_priv->mm.gtt_space, size, alignment, 0); if (free_space != NULL) { if (map_and_fenceable) obj->gtt_space = drm_mm_get_block_range_generic( free_space, size, alignment, 0, dev_priv->mm.gtt_mappable_end, 1); else obj->gtt_space = drm_mm_get_block_generic(free_space, size, alignment, 1); } if (obj->gtt_space == NULL) { ret = i915_gem_evict_something(dev, size, alignment, map_and_fenceable); if (ret != 0) return (ret); goto search_free; } ret = i915_gem_object_get_pages_gtt(obj, 0); if (ret != 0) { drm_mm_put_block(obj->gtt_space); obj->gtt_space = NULL; /* * i915_gem_object_get_pages_gtt() cannot return * ENOMEM, since we use vm_page_grab(). */ return (ret); } ret = i915_gem_gtt_bind_object(obj); if (ret != 0) { i915_gem_object_put_pages_gtt(obj); drm_mm_put_block(obj->gtt_space); obj->gtt_space = NULL; if (i915_gem_evict_everything(dev, false)) return (ret); goto search_free; } list_add_tail(&obj->gtt_list, &dev_priv->mm.gtt_list); list_add_tail(&obj->mm_list, &dev_priv->mm.inactive_list); KASSERT((obj->base.read_domains & I915_GEM_GPU_DOMAINS) == 0, ("Object in gpu read domain")); KASSERT((obj->base.write_domain & I915_GEM_GPU_DOMAINS) == 0, ("Object in gpu write domain")); obj->gtt_offset = obj->gtt_space->start; fenceable = obj->gtt_space->size == fence_size && (obj->gtt_space->start & (fence_alignment - 1)) == 0; mappable = obj->gtt_offset + obj->base.size <= dev_priv->mm.gtt_mappable_end; obj->map_and_fenceable = mappable && fenceable; CTR4(KTR_DRM, "object_bind %p %x %x %d", obj, obj->gtt_offset, obj->base.size, map_and_fenceable); return (0); } static void i915_gem_object_finish_gtt(struct drm_i915_gem_object *obj) { u32 old_write_domain, old_read_domains; /* Act a barrier for all accesses through the GTT */ mb(); /* Force a pagefault for domain tracking on next user access */ i915_gem_release_mmap(obj); if ((obj->base.read_domains & I915_GEM_DOMAIN_GTT) == 0) return; old_read_domains = obj->base.read_domains; old_write_domain = obj->base.write_domain; obj->base.read_domains &= ~I915_GEM_DOMAIN_GTT; obj->base.write_domain &= ~I915_GEM_DOMAIN_GTT; CTR3(KTR_DRM, "object_change_domain finish gtt %p %x %x", obj, old_read_domains, old_write_domain); } int i915_gem_object_unbind(struct drm_i915_gem_object *obj) { drm_i915_private_t *dev_priv; int ret; dev_priv = obj->base.dev->dev_private; ret = 0; if (obj->gtt_space == NULL) return (0); if (obj->pin_count != 0) { DRM_ERROR("Attempting to unbind pinned buffer\n"); return (-EINVAL); } ret = i915_gem_object_finish_gpu(obj); if (ret == -ERESTART || ret == -EINTR) return (ret); i915_gem_object_finish_gtt(obj); if (ret == 0) ret = i915_gem_object_set_to_cpu_domain(obj, 1); if (ret == -ERESTART || ret == -EINTR) return (ret); if (ret != 0) { i915_gem_clflush_object(obj); obj->base.read_domains = obj->base.write_domain = I915_GEM_DOMAIN_CPU; } ret = i915_gem_object_put_fence(obj); if (ret == -ERESTART) return (ret); i915_gem_gtt_unbind_object(obj); if (obj->has_aliasing_ppgtt_mapping) { i915_ppgtt_unbind_object(dev_priv->mm.aliasing_ppgtt, obj); obj->has_aliasing_ppgtt_mapping = 0; } i915_gem_object_put_pages_gtt(obj); list_del_init(&obj->gtt_list); list_del_init(&obj->mm_list); obj->map_and_fenceable = true; drm_mm_put_block(obj->gtt_space); obj->gtt_space = NULL; obj->gtt_offset = 0; if (i915_gem_object_is_purgeable(obj)) i915_gem_object_truncate(obj); CTR1(KTR_DRM, "object_unbind %p", obj); return (ret); } static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj, int flags) { struct drm_device *dev; vm_object_t vm_obj; vm_page_t m; int page_count, i, j; dev = obj->base.dev; KASSERT(obj->pages == NULL, ("Obj already has pages")); page_count = obj->base.size / PAGE_SIZE; obj->pages = malloc(page_count * sizeof(vm_page_t), DRM_I915_GEM, M_WAITOK); vm_obj = obj->base.vm_obj; VM_OBJECT_WLOCK(vm_obj); for (i = 0; i < page_count; i++) { if ((obj->pages[i] = i915_gem_wire_page(vm_obj, i)) == NULL) goto failed; } VM_OBJECT_WUNLOCK(vm_obj); if (i915_gem_object_needs_bit17_swizzle(obj)) i915_gem_object_do_bit_17_swizzle(obj); return (0); failed: for (j = 0; j < i; j++) { m = obj->pages[j]; vm_page_lock(m); - vm_page_unwire(m, 0); + vm_page_unwire(m, PQ_INACTIVE); vm_page_unlock(m); atomic_add_long(&i915_gem_wired_pages_cnt, -1); } VM_OBJECT_WUNLOCK(vm_obj); free(obj->pages, DRM_I915_GEM); obj->pages = NULL; return (-EIO); } #define GEM_PARANOID_CHECK_GTT 0 #if GEM_PARANOID_CHECK_GTT static void i915_gem_assert_pages_not_mapped(struct drm_device *dev, vm_page_t *ma, int page_count) { struct drm_i915_private *dev_priv; vm_paddr_t pa; unsigned long start, end; u_int i; int j; dev_priv = dev->dev_private; start = OFF_TO_IDX(dev_priv->mm.gtt_start); end = OFF_TO_IDX(dev_priv->mm.gtt_end); for (i = start; i < end; i++) { pa = intel_gtt_read_pte_paddr(i); for (j = 0; j < page_count; j++) { if (pa == VM_PAGE_TO_PHYS(ma[j])) { panic("Page %p in GTT pte index %d pte %x", ma[i], i, intel_gtt_read_pte(i)); } } } } #endif static void i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj) { vm_page_t m; int page_count, i; KASSERT(obj->madv != I915_MADV_PURGED_INTERNAL, ("Purged object")); if (obj->tiling_mode != I915_TILING_NONE) i915_gem_object_save_bit_17_swizzle(obj); if (obj->madv == I915_MADV_DONTNEED) obj->dirty = 0; page_count = obj->base.size / PAGE_SIZE; VM_OBJECT_WLOCK(obj->base.vm_obj); #if GEM_PARANOID_CHECK_GTT i915_gem_assert_pages_not_mapped(obj->base.dev, obj->pages, page_count); #endif for (i = 0; i < page_count; i++) { m = obj->pages[i]; if (obj->dirty) vm_page_dirty(m); if (obj->madv == I915_MADV_WILLNEED) vm_page_reference(m); vm_page_lock(m); - vm_page_unwire(obj->pages[i], 1); + vm_page_unwire(obj->pages[i], PQ_ACTIVE); vm_page_unlock(m); atomic_add_long(&i915_gem_wired_pages_cnt, -1); } VM_OBJECT_WUNLOCK(obj->base.vm_obj); obj->dirty = 0; free(obj->pages, DRM_I915_GEM); obj->pages = NULL; } void i915_gem_release_mmap(struct drm_i915_gem_object *obj) { vm_object_t devobj; vm_page_t m; int i, page_count; if (!obj->fault_mappable) return; CTR3(KTR_DRM, "release_mmap %p %x %x", obj, obj->gtt_offset, OFF_TO_IDX(obj->base.size)); devobj = cdev_pager_lookup(obj); if (devobj != NULL) { page_count = OFF_TO_IDX(obj->base.size); VM_OBJECT_WLOCK(devobj); retry: for (i = 0; i < page_count; i++) { m = vm_page_lookup(devobj, i); if (m == NULL) continue; if (vm_page_sleep_if_busy(m, "915unm")) goto retry; cdev_pager_free_page(devobj, m); } VM_OBJECT_WUNLOCK(devobj); vm_object_deallocate(devobj); } obj->fault_mappable = false; } int i915_gem_object_wait_rendering(struct drm_i915_gem_object *obj) { int ret; KASSERT((obj->base.write_domain & I915_GEM_GPU_DOMAINS) == 0, ("In GPU write domain")); CTR5(KTR_DRM, "object_wait_rendering %p %s %x %d %d", obj, obj->ring != NULL ? obj->ring->name : "none", obj->gtt_offset, obj->active, obj->last_rendering_seqno); if (obj->active) { ret = i915_wait_request(obj->ring, obj->last_rendering_seqno, true); if (ret != 0) return (ret); } return (0); } void i915_gem_object_move_to_active(struct drm_i915_gem_object *obj, struct intel_ring_buffer *ring, uint32_t seqno) { struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_fence_reg *reg; obj->ring = ring; KASSERT(ring != NULL, ("NULL ring")); /* Add a reference if we're newly entering the active list. */ if (!obj->active) { drm_gem_object_reference(&obj->base); obj->active = 1; } /* Move from whatever list we were on to the tail of execution. */ list_move_tail(&obj->mm_list, &dev_priv->mm.active_list); list_move_tail(&obj->ring_list, &ring->active_list); obj->last_rendering_seqno = seqno; if (obj->fenced_gpu_access) { obj->last_fenced_seqno = seqno; obj->last_fenced_ring = ring; /* Bump MRU to take account of the delayed flush */ if (obj->fence_reg != I915_FENCE_REG_NONE) { reg = &dev_priv->fence_regs[obj->fence_reg]; list_move_tail(®->lru_list, &dev_priv->mm.fence_list); } } } static void i915_gem_object_move_off_active(struct drm_i915_gem_object *obj) { list_del_init(&obj->ring_list); obj->last_rendering_seqno = 0; obj->last_fenced_seqno = 0; } static void i915_gem_object_move_to_flushing(struct drm_i915_gem_object *obj) { struct drm_device *dev = obj->base.dev; drm_i915_private_t *dev_priv = dev->dev_private; KASSERT(obj->active, ("Object not active")); list_move_tail(&obj->mm_list, &dev_priv->mm.flushing_list); i915_gem_object_move_off_active(obj); } static void i915_gem_object_move_to_inactive(struct drm_i915_gem_object *obj) { struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = dev->dev_private; if (obj->pin_count != 0) list_move_tail(&obj->mm_list, &dev_priv->mm.pinned_list); else list_move_tail(&obj->mm_list, &dev_priv->mm.inactive_list); KASSERT(list_empty(&obj->gpu_write_list), ("On gpu_write_list")); KASSERT(obj->active, ("Object not active")); obj->ring = NULL; obj->last_fenced_ring = NULL; i915_gem_object_move_off_active(obj); obj->fenced_gpu_access = false; obj->active = 0; obj->pending_gpu_write = false; drm_gem_object_unreference(&obj->base); #if 1 KIB_NOTYET(); #else WARN_ON(i915_verify_lists(dev)); #endif } static void i915_gem_object_truncate(struct drm_i915_gem_object *obj) { vm_object_t vm_obj; vm_obj = obj->base.vm_obj; VM_OBJECT_WLOCK(vm_obj); vm_object_page_remove(vm_obj, 0, 0, false); VM_OBJECT_WUNLOCK(vm_obj); obj->madv = I915_MADV_PURGED_INTERNAL; } static inline int i915_gem_object_is_purgeable(struct drm_i915_gem_object *obj) { return (obj->madv == I915_MADV_DONTNEED); } static void i915_gem_process_flushing_list(struct intel_ring_buffer *ring, uint32_t flush_domains) { struct drm_i915_gem_object *obj, *next; uint32_t old_write_domain; list_for_each_entry_safe(obj, next, &ring->gpu_write_list, gpu_write_list) { if (obj->base.write_domain & flush_domains) { old_write_domain = obj->base.write_domain; obj->base.write_domain = 0; list_del_init(&obj->gpu_write_list); i915_gem_object_move_to_active(obj, ring, i915_gem_next_request_seqno(ring)); CTR3(KTR_DRM, "object_change_domain process_flush %p %x %x", obj, obj->base.read_domains, old_write_domain); } } } static int i915_gem_object_needs_bit17_swizzle(struct drm_i915_gem_object *obj) { drm_i915_private_t *dev_priv; dev_priv = obj->base.dev->dev_private; return (dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_9_10_17 && obj->tiling_mode != I915_TILING_NONE); } static vm_page_t i915_gem_wire_page(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; int rv; VM_OBJECT_ASSERT_WLOCKED(object); m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); if (m->valid != VM_PAGE_BITS_ALL) { if (vm_pager_has_page(object, pindex, NULL, NULL)) { rv = vm_pager_get_pages(object, &m, 1, 0); m = vm_page_lookup(object, pindex); if (m == NULL) return (NULL); if (rv != VM_PAGER_OK) { vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); return (NULL); } } else { pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; m->dirty = 0; } } vm_page_lock(m); vm_page_wire(m); vm_page_unlock(m); vm_page_xunbusy(m); atomic_add_long(&i915_gem_wired_pages_cnt, 1); return (m); } int i915_gem_flush_ring(struct intel_ring_buffer *ring, uint32_t invalidate_domains, uint32_t flush_domains) { int ret; if (((invalidate_domains | flush_domains) & I915_GEM_GPU_DOMAINS) == 0) return 0; CTR3(KTR_DRM, "ring_flush %s %x %x", ring->name, invalidate_domains, flush_domains); ret = ring->flush(ring, invalidate_domains, flush_domains); if (ret) return ret; if (flush_domains & I915_GEM_GPU_DOMAINS) i915_gem_process_flushing_list(ring, flush_domains); return 0; } static int i915_ring_idle(struct intel_ring_buffer *ring, bool do_retire) { int ret; if (list_empty(&ring->gpu_write_list) && list_empty(&ring->active_list)) return 0; if (!list_empty(&ring->gpu_write_list)) { ret = i915_gem_flush_ring(ring, I915_GEM_GPU_DOMAINS, I915_GEM_GPU_DOMAINS); if (ret != 0) return ret; } return (i915_wait_request(ring, i915_gem_next_request_seqno(ring), do_retire)); } int i915_gpu_idle(struct drm_device *dev, bool do_retire) { drm_i915_private_t *dev_priv = dev->dev_private; int ret, i; /* Flush everything onto the inactive list. */ for (i = 0; i < I915_NUM_RINGS; i++) { ret = i915_ring_idle(&dev_priv->rings[i], do_retire); if (ret) return ret; } return 0; } int i915_wait_request(struct intel_ring_buffer *ring, uint32_t seqno, bool do_retire) { drm_i915_private_t *dev_priv; struct drm_i915_gem_request *request; uint32_t ier; int flags, ret; bool recovery_complete; KASSERT(seqno != 0, ("Zero seqno")); dev_priv = ring->dev->dev_private; ret = 0; if (atomic_load_acq_int(&dev_priv->mm.wedged) != 0) { /* Give the error handler a chance to run. */ mtx_lock(&dev_priv->error_completion_lock); recovery_complete = (&dev_priv->error_completion) > 0; mtx_unlock(&dev_priv->error_completion_lock); return (recovery_complete ? -EIO : -EAGAIN); } if (seqno == ring->outstanding_lazy_request) { request = malloc(sizeof(*request), DRM_I915_GEM, M_WAITOK | M_ZERO); if (request == NULL) return (-ENOMEM); ret = i915_add_request(ring, NULL, request); if (ret != 0) { free(request, DRM_I915_GEM); return (ret); } seqno = request->seqno; } if (!i915_seqno_passed(ring->get_seqno(ring), seqno)) { if (HAS_PCH_SPLIT(ring->dev)) ier = I915_READ(DEIER) | I915_READ(GTIER); else ier = I915_READ(IER); if (!ier) { DRM_ERROR("something (likely vbetool) disabled " "interrupts, re-enabling\n"); ring->dev->driver->irq_preinstall(ring->dev); ring->dev->driver->irq_postinstall(ring->dev); } CTR2(KTR_DRM, "request_wait_begin %s %d", ring->name, seqno); ring->waiting_seqno = seqno; mtx_lock(&ring->irq_lock); if (ring->irq_get(ring)) { flags = dev_priv->mm.interruptible ? PCATCH : 0; while (!i915_seqno_passed(ring->get_seqno(ring), seqno) && !atomic_load_acq_int(&dev_priv->mm.wedged) && ret == 0) { ret = -msleep(ring, &ring->irq_lock, flags, "915gwr", 0); } ring->irq_put(ring); mtx_unlock(&ring->irq_lock); } else { mtx_unlock(&ring->irq_lock); if (_intel_wait_for(ring->dev, i915_seqno_passed(ring->get_seqno(ring), seqno) || atomic_load_acq_int(&dev_priv->mm.wedged), 3000, 0, "i915wrq") != 0) ret = -EBUSY; } ring->waiting_seqno = 0; CTR3(KTR_DRM, "request_wait_end %s %d %d", ring->name, seqno, ret); } if (atomic_load_acq_int(&dev_priv->mm.wedged)) ret = -EAGAIN; /* Directly dispatch request retiring. While we have the work queue * to handle this, the waiter on a request often wants an associated * buffer to have made it to the inactive list, and we would need * a separate wait queue to handle that. */ if (ret == 0 && do_retire) i915_gem_retire_requests_ring(ring); return (ret); } static u32 i915_gem_get_seqno(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; u32 seqno = dev_priv->next_seqno; /* reserve 0 for non-seqno */ if (++dev_priv->next_seqno == 0) dev_priv->next_seqno = 1; return seqno; } u32 i915_gem_next_request_seqno(struct intel_ring_buffer *ring) { if (ring->outstanding_lazy_request == 0) ring->outstanding_lazy_request = i915_gem_get_seqno(ring->dev); return ring->outstanding_lazy_request; } int i915_add_request(struct intel_ring_buffer *ring, struct drm_file *file, struct drm_i915_gem_request *request) { drm_i915_private_t *dev_priv; struct drm_i915_file_private *file_priv; uint32_t seqno; u32 request_ring_position; int was_empty; int ret; KASSERT(request != NULL, ("NULL request in add")); DRM_LOCK_ASSERT(ring->dev); dev_priv = ring->dev->dev_private; seqno = i915_gem_next_request_seqno(ring); request_ring_position = intel_ring_get_tail(ring); ret = ring->add_request(ring, &seqno); if (ret != 0) return ret; CTR2(KTR_DRM, "request_add %s %d", ring->name, seqno); request->seqno = seqno; request->ring = ring; request->tail = request_ring_position; request->emitted_jiffies = ticks; was_empty = list_empty(&ring->request_list); list_add_tail(&request->list, &ring->request_list); if (file != NULL) { file_priv = file->driver_priv; mtx_lock(&file_priv->mm.lck); request->file_priv = file_priv; list_add_tail(&request->client_list, &file_priv->mm.request_list); mtx_unlock(&file_priv->mm.lck); } ring->outstanding_lazy_request = 0; if (!dev_priv->mm.suspended) { if (i915_enable_hangcheck) { callout_schedule(&dev_priv->hangcheck_timer, DRM_I915_HANGCHECK_PERIOD); } if (was_empty) taskqueue_enqueue_timeout(dev_priv->tq, &dev_priv->mm.retire_task, hz); } return (0); } static inline void i915_gem_request_remove_from_client(struct drm_i915_gem_request *request) { struct drm_i915_file_private *file_priv = request->file_priv; if (!file_priv) return; DRM_LOCK_ASSERT(request->ring->dev); mtx_lock(&file_priv->mm.lck); if (request->file_priv != NULL) { list_del(&request->client_list); request->file_priv = NULL; } mtx_unlock(&file_priv->mm.lck); } void i915_gem_release(struct drm_device *dev, struct drm_file *file) { struct drm_i915_file_private *file_priv; struct drm_i915_gem_request *request; file_priv = file->driver_priv; /* Clean up our request list when the client is going away, so that * later retire_requests won't dereference our soon-to-be-gone * file_priv. */ mtx_lock(&file_priv->mm.lck); while (!list_empty(&file_priv->mm.request_list)) { request = list_first_entry(&file_priv->mm.request_list, struct drm_i915_gem_request, client_list); list_del(&request->client_list); request->file_priv = NULL; } mtx_unlock(&file_priv->mm.lck); } static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv, struct intel_ring_buffer *ring) { if (ring->dev != NULL) DRM_LOCK_ASSERT(ring->dev); while (!list_empty(&ring->request_list)) { struct drm_i915_gem_request *request; request = list_first_entry(&ring->request_list, struct drm_i915_gem_request, list); list_del(&request->list); i915_gem_request_remove_from_client(request); free(request, DRM_I915_GEM); } while (!list_empty(&ring->active_list)) { struct drm_i915_gem_object *obj; obj = list_first_entry(&ring->active_list, struct drm_i915_gem_object, ring_list); obj->base.write_domain = 0; list_del_init(&obj->gpu_write_list); i915_gem_object_move_to_inactive(obj); } } static void i915_gem_reset_fences(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; int i; for (i = 0; i < dev_priv->num_fence_regs; i++) { struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i]; struct drm_i915_gem_object *obj = reg->obj; if (!obj) continue; if (obj->tiling_mode) i915_gem_release_mmap(obj); reg->obj->fence_reg = I915_FENCE_REG_NONE; reg->obj->fenced_gpu_access = false; reg->obj->last_fenced_seqno = 0; reg->obj->last_fenced_ring = NULL; i915_gem_clear_fence_reg(dev, reg); } } void i915_gem_reset(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_gem_object *obj; int i; for (i = 0; i < I915_NUM_RINGS; i++) i915_gem_reset_ring_lists(dev_priv, &dev_priv->rings[i]); /* Remove anything from the flushing lists. The GPU cache is likely * to be lost on reset along with the data, so simply move the * lost bo to the inactive list. */ while (!list_empty(&dev_priv->mm.flushing_list)) { obj = list_first_entry(&dev_priv->mm.flushing_list, struct drm_i915_gem_object, mm_list); obj->base.write_domain = 0; list_del_init(&obj->gpu_write_list); i915_gem_object_move_to_inactive(obj); } /* Move everything out of the GPU domains to ensure we do any * necessary invalidation upon reuse. */ list_for_each_entry(obj, &dev_priv->mm.inactive_list, mm_list) { obj->base.read_domains &= ~I915_GEM_GPU_DOMAINS; } /* The fence registers are invalidated so clear them out */ i915_gem_reset_fences(dev); } /** * This function clears the request list as sequence numbers are passed. */ void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring) { uint32_t seqno; int i; if (list_empty(&ring->request_list)) return; seqno = ring->get_seqno(ring); CTR2(KTR_DRM, "retire_request_ring %s %d", ring->name, seqno); for (i = 0; i < DRM_ARRAY_SIZE(ring->sync_seqno); i++) if (seqno >= ring->sync_seqno[i]) ring->sync_seqno[i] = 0; while (!list_empty(&ring->request_list)) { struct drm_i915_gem_request *request; request = list_first_entry(&ring->request_list, struct drm_i915_gem_request, list); if (!i915_seqno_passed(seqno, request->seqno)) break; CTR2(KTR_DRM, "retire_request_seqno_passed %s %d", ring->name, seqno); ring->last_retired_head = request->tail; list_del(&request->list); i915_gem_request_remove_from_client(request); free(request, DRM_I915_GEM); } /* Move any buffers on the active list that are no longer referenced * by the ringbuffer to the flushing/inactive lists as appropriate. */ while (!list_empty(&ring->active_list)) { struct drm_i915_gem_object *obj; obj = list_first_entry(&ring->active_list, struct drm_i915_gem_object, ring_list); if (!i915_seqno_passed(seqno, obj->last_rendering_seqno)) break; if (obj->base.write_domain != 0) i915_gem_object_move_to_flushing(obj); else i915_gem_object_move_to_inactive(obj); } if (ring->trace_irq_seqno && i915_seqno_passed(seqno, ring->trace_irq_seqno)) { mtx_lock(&ring->irq_lock); ring->irq_put(ring); mtx_unlock(&ring->irq_lock); ring->trace_irq_seqno = 0; } } void i915_gem_retire_requests(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; struct drm_i915_gem_object *obj, *next; int i; if (!list_empty(&dev_priv->mm.deferred_free_list)) { list_for_each_entry_safe(obj, next, &dev_priv->mm.deferred_free_list, mm_list) i915_gem_free_object_tail(obj); } for (i = 0; i < I915_NUM_RINGS; i++) i915_gem_retire_requests_ring(&dev_priv->rings[i]); } static int sandybridge_write_fence_reg(struct drm_i915_gem_object *obj, struct intel_ring_buffer *pipelined) { struct drm_device *dev = obj->base.dev; drm_i915_private_t *dev_priv = dev->dev_private; u32 size = obj->gtt_space->size; int regnum = obj->fence_reg; uint64_t val; val = (uint64_t)((obj->gtt_offset + size - 4096) & 0xfffff000) << 32; val |= obj->gtt_offset & 0xfffff000; val |= (uint64_t)((obj->stride / 128) - 1) << SANDYBRIDGE_FENCE_PITCH_SHIFT; if (obj->tiling_mode == I915_TILING_Y) val |= 1 << I965_FENCE_TILING_Y_SHIFT; val |= I965_FENCE_REG_VALID; if (pipelined) { int ret = intel_ring_begin(pipelined, 6); if (ret) return ret; intel_ring_emit(pipelined, MI_NOOP); intel_ring_emit(pipelined, MI_LOAD_REGISTER_IMM(2)); intel_ring_emit(pipelined, FENCE_REG_SANDYBRIDGE_0 + regnum*8); intel_ring_emit(pipelined, (u32)val); intel_ring_emit(pipelined, FENCE_REG_SANDYBRIDGE_0 + regnum*8 + 4); intel_ring_emit(pipelined, (u32)(val >> 32)); intel_ring_advance(pipelined); } else I915_WRITE64(FENCE_REG_SANDYBRIDGE_0 + regnum * 8, val); return 0; } static int i965_write_fence_reg(struct drm_i915_gem_object *obj, struct intel_ring_buffer *pipelined) { struct drm_device *dev = obj->base.dev; drm_i915_private_t *dev_priv = dev->dev_private; u32 size = obj->gtt_space->size; int regnum = obj->fence_reg; uint64_t val; val = (uint64_t)((obj->gtt_offset + size - 4096) & 0xfffff000) << 32; val |= obj->gtt_offset & 0xfffff000; val |= ((obj->stride / 128) - 1) << I965_FENCE_PITCH_SHIFT; if (obj->tiling_mode == I915_TILING_Y) val |= 1 << I965_FENCE_TILING_Y_SHIFT; val |= I965_FENCE_REG_VALID; if (pipelined) { int ret = intel_ring_begin(pipelined, 6); if (ret) return ret; intel_ring_emit(pipelined, MI_NOOP); intel_ring_emit(pipelined, MI_LOAD_REGISTER_IMM(2)); intel_ring_emit(pipelined, FENCE_REG_965_0 + regnum*8); intel_ring_emit(pipelined, (u32)val); intel_ring_emit(pipelined, FENCE_REG_965_0 + regnum*8 + 4); intel_ring_emit(pipelined, (u32)(val >> 32)); intel_ring_advance(pipelined); } else I915_WRITE64(FENCE_REG_965_0 + regnum * 8, val); return 0; } static int i915_write_fence_reg(struct drm_i915_gem_object *obj, struct intel_ring_buffer *pipelined) { struct drm_device *dev = obj->base.dev; drm_i915_private_t *dev_priv = dev->dev_private; u32 size = obj->gtt_space->size; u32 fence_reg, val, pitch_val; int tile_width; if ((obj->gtt_offset & ~I915_FENCE_START_MASK) || (size & -size) != size || (obj->gtt_offset & (size - 1))) { printf( "object 0x%08x [fenceable? %d] not 1M or pot-size (0x%08x) aligned\n", obj->gtt_offset, obj->map_and_fenceable, size); return -EINVAL; } if (obj->tiling_mode == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev)) tile_width = 128; else tile_width = 512; /* Note: pitch better be a power of two tile widths */ pitch_val = obj->stride / tile_width; pitch_val = ffs(pitch_val) - 1; val = obj->gtt_offset; if (obj->tiling_mode == I915_TILING_Y) val |= 1 << I830_FENCE_TILING_Y_SHIFT; val |= I915_FENCE_SIZE_BITS(size); val |= pitch_val << I830_FENCE_PITCH_SHIFT; val |= I830_FENCE_REG_VALID; fence_reg = obj->fence_reg; if (fence_reg < 8) fence_reg = FENCE_REG_830_0 + fence_reg * 4; else fence_reg = FENCE_REG_945_8 + (fence_reg - 8) * 4; if (pipelined) { int ret = intel_ring_begin(pipelined, 4); if (ret) return ret; intel_ring_emit(pipelined, MI_NOOP); intel_ring_emit(pipelined, MI_LOAD_REGISTER_IMM(1)); intel_ring_emit(pipelined, fence_reg); intel_ring_emit(pipelined, val); intel_ring_advance(pipelined); } else I915_WRITE(fence_reg, val); return 0; } static int i830_write_fence_reg(struct drm_i915_gem_object *obj, struct intel_ring_buffer *pipelined) { struct drm_device *dev = obj->base.dev; drm_i915_private_t *dev_priv = dev->dev_private; u32 size = obj->gtt_space->size; int regnum = obj->fence_reg; uint32_t val; uint32_t pitch_val; if ((obj->gtt_offset & ~I830_FENCE_START_MASK) || (size & -size) != size || (obj->gtt_offset & (size - 1))) { printf( "object 0x%08x not 512K or pot-size 0x%08x aligned\n", obj->gtt_offset, size); return -EINVAL; } pitch_val = obj->stride / 128; pitch_val = ffs(pitch_val) - 1; val = obj->gtt_offset; if (obj->tiling_mode == I915_TILING_Y) val |= 1 << I830_FENCE_TILING_Y_SHIFT; val |= I830_FENCE_SIZE_BITS(size); val |= pitch_val << I830_FENCE_PITCH_SHIFT; val |= I830_FENCE_REG_VALID; if (pipelined) { int ret = intel_ring_begin(pipelined, 4); if (ret) return ret; intel_ring_emit(pipelined, MI_NOOP); intel_ring_emit(pipelined, MI_LOAD_REGISTER_IMM(1)); intel_ring_emit(pipelined, FENCE_REG_830_0 + regnum*4); intel_ring_emit(pipelined, val); intel_ring_advance(pipelined); } else I915_WRITE(FENCE_REG_830_0 + regnum * 4, val); return 0; } static bool ring_passed_seqno(struct intel_ring_buffer *ring, u32 seqno) { return i915_seqno_passed(ring->get_seqno(ring), seqno); } static int i915_gem_object_flush_fence(struct drm_i915_gem_object *obj, struct intel_ring_buffer *pipelined) { int ret; if (obj->fenced_gpu_access) { if (obj->base.write_domain & I915_GEM_GPU_DOMAINS) { ret = i915_gem_flush_ring(obj->last_fenced_ring, 0, obj->base.write_domain); if (ret) return ret; } obj->fenced_gpu_access = false; } if (obj->last_fenced_seqno && pipelined != obj->last_fenced_ring) { if (!ring_passed_seqno(obj->last_fenced_ring, obj->last_fenced_seqno)) { ret = i915_wait_request(obj->last_fenced_ring, obj->last_fenced_seqno, true); if (ret) return ret; } obj->last_fenced_seqno = 0; obj->last_fenced_ring = NULL; } /* Ensure that all CPU reads are completed before installing a fence * and all writes before removing the fence. */ if (obj->base.read_domains & I915_GEM_DOMAIN_GTT) mb(); return 0; } int i915_gem_object_put_fence(struct drm_i915_gem_object *obj) { int ret; if (obj->tiling_mode) i915_gem_release_mmap(obj); ret = i915_gem_object_flush_fence(obj, NULL); if (ret) return ret; if (obj->fence_reg != I915_FENCE_REG_NONE) { struct drm_i915_private *dev_priv = obj->base.dev->dev_private; if (dev_priv->fence_regs[obj->fence_reg].pin_count != 0) printf("%s: pin_count %d\n", __func__, dev_priv->fence_regs[obj->fence_reg].pin_count); i915_gem_clear_fence_reg(obj->base.dev, &dev_priv->fence_regs[obj->fence_reg]); obj->fence_reg = I915_FENCE_REG_NONE; } return 0; } static struct drm_i915_fence_reg * i915_find_fence_reg(struct drm_device *dev, struct intel_ring_buffer *pipelined) { struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_fence_reg *reg, *first, *avail; int i; /* First try to find a free reg */ avail = NULL; for (i = dev_priv->fence_reg_start; i < dev_priv->num_fence_regs; i++) { reg = &dev_priv->fence_regs[i]; if (!reg->obj) return reg; if (!reg->pin_count) avail = reg; } if (avail == NULL) return NULL; /* None available, try to steal one or wait for a user to finish */ avail = first = NULL; list_for_each_entry(reg, &dev_priv->mm.fence_list, lru_list) { if (reg->pin_count) continue; if (first == NULL) first = reg; if (!pipelined || !reg->obj->last_fenced_ring || reg->obj->last_fenced_ring == pipelined) { avail = reg; break; } } if (avail == NULL) avail = first; return avail; } int i915_gem_object_get_fence(struct drm_i915_gem_object *obj, struct intel_ring_buffer *pipelined) { struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_fence_reg *reg; int ret; pipelined = NULL; ret = 0; if (obj->fence_reg != I915_FENCE_REG_NONE) { reg = &dev_priv->fence_regs[obj->fence_reg]; list_move_tail(®->lru_list, &dev_priv->mm.fence_list); if (obj->tiling_changed) { ret = i915_gem_object_flush_fence(obj, pipelined); if (ret) return ret; if (!obj->fenced_gpu_access && !obj->last_fenced_seqno) pipelined = NULL; if (pipelined) { reg->setup_seqno = i915_gem_next_request_seqno(pipelined); obj->last_fenced_seqno = reg->setup_seqno; obj->last_fenced_ring = pipelined; } goto update; } if (!pipelined) { if (reg->setup_seqno) { if (!ring_passed_seqno(obj->last_fenced_ring, reg->setup_seqno)) { ret = i915_wait_request( obj->last_fenced_ring, reg->setup_seqno, true); if (ret) return ret; } reg->setup_seqno = 0; } } else if (obj->last_fenced_ring && obj->last_fenced_ring != pipelined) { ret = i915_gem_object_flush_fence(obj, pipelined); if (ret) return ret; } if (!obj->fenced_gpu_access && !obj->last_fenced_seqno) pipelined = NULL; KASSERT(pipelined || reg->setup_seqno == 0, ("!pipelined")); if (obj->tiling_changed) { if (pipelined) { reg->setup_seqno = i915_gem_next_request_seqno(pipelined); obj->last_fenced_seqno = reg->setup_seqno; obj->last_fenced_ring = pipelined; } goto update; } return 0; } reg = i915_find_fence_reg(dev, pipelined); if (reg == NULL) return -EDEADLK; ret = i915_gem_object_flush_fence(obj, pipelined); if (ret) return ret; if (reg->obj) { struct drm_i915_gem_object *old = reg->obj; drm_gem_object_reference(&old->base); if (old->tiling_mode) i915_gem_release_mmap(old); ret = i915_gem_object_flush_fence(old, pipelined); if (ret) { drm_gem_object_unreference(&old->base); return ret; } if (old->last_fenced_seqno == 0 && obj->last_fenced_seqno == 0) pipelined = NULL; old->fence_reg = I915_FENCE_REG_NONE; old->last_fenced_ring = pipelined; old->last_fenced_seqno = pipelined ? i915_gem_next_request_seqno(pipelined) : 0; drm_gem_object_unreference(&old->base); } else if (obj->last_fenced_seqno == 0) pipelined = NULL; reg->obj = obj; list_move_tail(®->lru_list, &dev_priv->mm.fence_list); obj->fence_reg = reg - dev_priv->fence_regs; obj->last_fenced_ring = pipelined; reg->setup_seqno = pipelined ? i915_gem_next_request_seqno(pipelined) : 0; obj->last_fenced_seqno = reg->setup_seqno; update: obj->tiling_changed = false; switch (INTEL_INFO(dev)->gen) { case 7: case 6: ret = sandybridge_write_fence_reg(obj, pipelined); break; case 5: case 4: ret = i965_write_fence_reg(obj, pipelined); break; case 3: ret = i915_write_fence_reg(obj, pipelined); break; case 2: ret = i830_write_fence_reg(obj, pipelined); break; } return ret; } static void i915_gem_clear_fence_reg(struct drm_device *dev, struct drm_i915_fence_reg *reg) { drm_i915_private_t *dev_priv = dev->dev_private; uint32_t fence_reg = reg - dev_priv->fence_regs; switch (INTEL_INFO(dev)->gen) { case 7: case 6: I915_WRITE64(FENCE_REG_SANDYBRIDGE_0 + fence_reg*8, 0); break; case 5: case 4: I915_WRITE64(FENCE_REG_965_0 + fence_reg*8, 0); break; case 3: if (fence_reg >= 8) fence_reg = FENCE_REG_945_8 + (fence_reg - 8) * 4; else case 2: fence_reg = FENCE_REG_830_0 + fence_reg * 4; I915_WRITE(fence_reg, 0); break; } list_del_init(®->lru_list); reg->obj = NULL; reg->setup_seqno = 0; reg->pin_count = 0; } int i915_gem_init_object(struct drm_gem_object *obj) { printf("i915_gem_init_object called\n"); return (0); } static bool i915_gem_object_is_inactive(struct drm_i915_gem_object *obj) { return (obj->gtt_space && !obj->active && obj->pin_count == 0); } static void i915_gem_retire_task_handler(void *arg, int pending) { drm_i915_private_t *dev_priv; struct drm_device *dev; bool idle; int i; dev_priv = arg; dev = dev_priv->dev; /* Come back later if the device is busy... */ if (!sx_try_xlock(&dev->dev_struct_lock)) { taskqueue_enqueue_timeout(dev_priv->tq, &dev_priv->mm.retire_task, hz); return; } CTR0(KTR_DRM, "retire_task"); i915_gem_retire_requests(dev); /* Send a periodic flush down the ring so we don't hold onto GEM * objects indefinitely. */ idle = true; for (i = 0; i < I915_NUM_RINGS; i++) { struct intel_ring_buffer *ring = &dev_priv->rings[i]; if (!list_empty(&ring->gpu_write_list)) { struct drm_i915_gem_request *request; int ret; ret = i915_gem_flush_ring(ring, 0, I915_GEM_GPU_DOMAINS); request = malloc(sizeof(*request), DRM_I915_GEM, M_WAITOK | M_ZERO); if (ret || request == NULL || i915_add_request(ring, NULL, request)) free(request, DRM_I915_GEM); } idle &= list_empty(&ring->request_list); } if (!dev_priv->mm.suspended && !idle) taskqueue_enqueue_timeout(dev_priv->tq, &dev_priv->mm.retire_task, hz); DRM_UNLOCK(dev); } void i915_gem_lastclose(struct drm_device *dev) { int ret; if (drm_core_check_feature(dev, DRIVER_MODESET)) return; ret = i915_gem_idle(dev); if (ret != 0) DRM_ERROR("failed to idle hardware: %d\n", ret); } static int i915_gem_init_phys_object(struct drm_device *dev, int id, int size, int align) { drm_i915_private_t *dev_priv; struct drm_i915_gem_phys_object *phys_obj; int ret; dev_priv = dev->dev_private; if (dev_priv->mm.phys_objs[id - 1] != NULL || size == 0) return (0); phys_obj = malloc(sizeof(struct drm_i915_gem_phys_object), DRM_I915_GEM, M_WAITOK | M_ZERO); phys_obj->id = id; phys_obj->handle = drm_pci_alloc(dev, size, align, ~0); if (phys_obj->handle == NULL) { ret = -ENOMEM; goto free_obj; } pmap_change_attr((vm_offset_t)phys_obj->handle->vaddr, size / PAGE_SIZE, PAT_WRITE_COMBINING); dev_priv->mm.phys_objs[id - 1] = phys_obj; return (0); free_obj: free(phys_obj, DRM_I915_GEM); return (ret); } static void i915_gem_free_phys_object(struct drm_device *dev, int id) { drm_i915_private_t *dev_priv; struct drm_i915_gem_phys_object *phys_obj; dev_priv = dev->dev_private; if (dev_priv->mm.phys_objs[id - 1] == NULL) return; phys_obj = dev_priv->mm.phys_objs[id - 1]; if (phys_obj->cur_obj != NULL) i915_gem_detach_phys_object(dev, phys_obj->cur_obj); drm_pci_free(dev, phys_obj->handle); free(phys_obj, DRM_I915_GEM); dev_priv->mm.phys_objs[id - 1] = NULL; } void i915_gem_free_all_phys_object(struct drm_device *dev) { int i; for (i = I915_GEM_PHYS_CURSOR_0; i <= I915_MAX_PHYS_OBJECT; i++) i915_gem_free_phys_object(dev, i); } void i915_gem_detach_phys_object(struct drm_device *dev, struct drm_i915_gem_object *obj) { vm_page_t m; struct sf_buf *sf; char *vaddr, *dst; int i, page_count; if (obj->phys_obj == NULL) return; vaddr = obj->phys_obj->handle->vaddr; page_count = obj->base.size / PAGE_SIZE; VM_OBJECT_WLOCK(obj->base.vm_obj); for (i = 0; i < page_count; i++) { m = i915_gem_wire_page(obj->base.vm_obj, i); if (m == NULL) continue; /* XXX */ VM_OBJECT_WUNLOCK(obj->base.vm_obj); sf = sf_buf_alloc(m, 0); if (sf != NULL) { dst = (char *)sf_buf_kva(sf); memcpy(dst, vaddr + IDX_TO_OFF(i), PAGE_SIZE); sf_buf_free(sf); } drm_clflush_pages(&m, 1); VM_OBJECT_WLOCK(obj->base.vm_obj); vm_page_reference(m); vm_page_lock(m); vm_page_dirty(m); - vm_page_unwire(m, 0); + vm_page_unwire(m, PQ_INACTIVE); vm_page_unlock(m); atomic_add_long(&i915_gem_wired_pages_cnt, -1); } VM_OBJECT_WUNLOCK(obj->base.vm_obj); intel_gtt_chipset_flush(); obj->phys_obj->cur_obj = NULL; obj->phys_obj = NULL; } int i915_gem_attach_phys_object(struct drm_device *dev, struct drm_i915_gem_object *obj, int id, int align) { drm_i915_private_t *dev_priv; vm_page_t m; struct sf_buf *sf; char *dst, *src; int i, page_count, ret; if (id > I915_MAX_PHYS_OBJECT) return (-EINVAL); if (obj->phys_obj != NULL) { if (obj->phys_obj->id == id) return (0); i915_gem_detach_phys_object(dev, obj); } dev_priv = dev->dev_private; if (dev_priv->mm.phys_objs[id - 1] == NULL) { ret = i915_gem_init_phys_object(dev, id, obj->base.size, align); if (ret != 0) { DRM_ERROR("failed to init phys object %d size: %zu\n", id, obj->base.size); return (ret); } } /* bind to the object */ obj->phys_obj = dev_priv->mm.phys_objs[id - 1]; obj->phys_obj->cur_obj = obj; page_count = obj->base.size / PAGE_SIZE; VM_OBJECT_WLOCK(obj->base.vm_obj); ret = 0; for (i = 0; i < page_count; i++) { m = i915_gem_wire_page(obj->base.vm_obj, i); if (m == NULL) { ret = -EIO; break; } VM_OBJECT_WUNLOCK(obj->base.vm_obj); sf = sf_buf_alloc(m, 0); src = (char *)sf_buf_kva(sf); dst = (char *)obj->phys_obj->handle->vaddr + IDX_TO_OFF(i); memcpy(dst, src, PAGE_SIZE); sf_buf_free(sf); VM_OBJECT_WLOCK(obj->base.vm_obj); vm_page_reference(m); vm_page_lock(m); - vm_page_unwire(m, 0); + vm_page_unwire(m, PQ_INACTIVE); vm_page_unlock(m); atomic_add_long(&i915_gem_wired_pages_cnt, -1); } VM_OBJECT_WUNLOCK(obj->base.vm_obj); return (0); } static int i915_gem_phys_pwrite(struct drm_device *dev, struct drm_i915_gem_object *obj, uint64_t data_ptr, uint64_t offset, uint64_t size, struct drm_file *file_priv) { char *user_data, *vaddr; int ret; vaddr = (char *)obj->phys_obj->handle->vaddr + offset; user_data = (char *)(uintptr_t)data_ptr; if (copyin_nofault(user_data, vaddr, size) != 0) { /* The physical object once assigned is fixed for the lifetime * of the obj, so we can safely drop the lock and continue * to access vaddr. */ DRM_UNLOCK(dev); ret = -copyin(user_data, vaddr, size); DRM_LOCK(dev); if (ret != 0) return (ret); } intel_gtt_chipset_flush(); return (0); } static int i915_gpu_is_active(struct drm_device *dev) { drm_i915_private_t *dev_priv; dev_priv = dev->dev_private; return (!list_empty(&dev_priv->mm.flushing_list) || !list_empty(&dev_priv->mm.active_list)); } static void i915_gem_lowmem(void *arg) { struct drm_device *dev; struct drm_i915_private *dev_priv; struct drm_i915_gem_object *obj, *next; int cnt, cnt_fail, cnt_total; dev = arg; dev_priv = dev->dev_private; if (!sx_try_xlock(&dev->dev_struct_lock)) return; CTR0(KTR_DRM, "gem_lowmem"); rescan: /* first scan for clean buffers */ i915_gem_retire_requests(dev); cnt_total = cnt_fail = cnt = 0; list_for_each_entry_safe(obj, next, &dev_priv->mm.inactive_list, mm_list) { if (i915_gem_object_is_purgeable(obj)) { if (i915_gem_object_unbind(obj) != 0) cnt_total++; } else cnt_total++; } /* second pass, evict/count anything still on the inactive list */ list_for_each_entry_safe(obj, next, &dev_priv->mm.inactive_list, mm_list) { if (i915_gem_object_unbind(obj) == 0) cnt++; else cnt_fail++; } if (cnt_fail > cnt_total / 100 && i915_gpu_is_active(dev)) { /* * We are desperate for pages, so as a last resort, wait * for the GPU to finish and discard whatever we can. * This has a dramatic impact to reduce the number of * OOM-killer events whilst running the GPU aggressively. */ if (i915_gpu_idle(dev, true) == 0) goto rescan; } DRM_UNLOCK(dev); } void i915_gem_unload(struct drm_device *dev) { struct drm_i915_private *dev_priv; dev_priv = dev->dev_private; EVENTHANDLER_DEREGISTER(vm_lowmem, dev_priv->mm.i915_lowmem); } Index: user/attilio/rm_vmobj_cache/sys/kern/uipc_syscalls.c =================================================================== --- user/attilio/rm_vmobj_cache/sys/kern/uipc_syscalls.c (revision 267236) +++ user/attilio/rm_vmobj_cache/sys/kern/uipc_syscalls.c (revision 267237) @@ -1,3668 +1,3668 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * sendfile(2) and related extensions: * Copyright (c) 1998, David Greenman. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #ifdef SCTP #include #include #endif /* SCTP */ #endif /* INET || INET6 */ /* * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC * and SOCK_NONBLOCK. */ #define ACCEPT4_INHERIT 0x1 #define ACCEPT4_COMPAT 0x2 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, int s, struct sockaddr *uname, socklen_t *anamelen, int flags); static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; static int filt_sfsync_attach(struct knote *kn); static void filt_sfsync_detach(struct knote *kn); static int filt_sfsync(struct knote *kn, long hint); /* * sendfile(2)-related variables and associated sysctls */ static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0, "sendfile(2) tunables"); static int sfreadahead = 1; SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW, &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks"); #ifdef SFSYNC_DEBUG static int sf_sync_debug = 0; SYSCTL_INT(_debug, OID_AUTO, sf_sync_debug, CTLFLAG_RW, &sf_sync_debug, 0, "Output debugging during sf_sync lifecycle"); #define SFSYNC_DPRINTF(s, ...) \ do { \ if (sf_sync_debug) \ printf((s), ##__VA_ARGS__); \ } while (0) #else #define SFSYNC_DPRINTF(c, ...) #endif static uma_zone_t zone_sfsync; static struct filterops sendfile_filtops = { .f_isfd = 0, .f_attach = filt_sfsync_attach, .f_detach = filt_sfsync_detach, .f_event = filt_sfsync, }; static void sfstat_init(const void *unused) { COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t), M_WAITOK); } SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL); static void sf_sync_init(const void *unused) { zone_sfsync = uma_zcreate("sendfile_sync", sizeof(struct sendfile_sync), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); kqueue_add_filteropts(EVFILT_SENDFILE, &sendfile_filtops); } SYSINIT(sf_sync, SI_SUB_MBUF, SI_ORDER_FIRST, sf_sync_init, NULL); static int sfstat_sysctl(SYSCTL_HANDLER_ARGS) { struct sfstat s; COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t)); if (req->newptr) COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t)); return (SYSCTL_OUT(req, &s, sizeof(s))); } SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW, NULL, 0, sfstat_sysctl, "I", "sendfile statistics"); /* * Convert a user file descriptor to a kernel file entry and check if required * capability rights are present. * A reference on the file entry is held upon returning. */ static int getsock_cap(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp, u_int *fflagp) { struct file *fp; int error; error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, curthread); return (ENOTSOCK); } if (fflagp != NULL) *fflagp = fp->f_flag; *fpp = fp; return (0); } /* * System call interface to the socket abstraction. */ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif int sys_socket(td, uap) struct thread *td; struct socket_args /* { int domain; int type; int protocol; } */ *uap; { struct socket *so; struct file *fp; int fd, error, type, oflag, fflag; AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol); type = uap->type; oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC error = mac_socket_check_create(td->td_ucred, uap->domain, type, uap->protocol); if (error != 0) return (error); #endif error = falloc(td, &fp, &fd, oflag); if (error != 0) return (error); /* An extra reference on `fp' has been held for us by falloc(). */ error = socreate(uap->domain, &so, type, uap->protocol, td->td_ucred, td); if (error != 0) { fdclose(td->td_proc->p_fd, fp, fd, td); } else { finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); if ((fflag & FNONBLOCK) != 0) (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } /* ARGSUSED */ int sys_bind(td, uap) struct thread *td; struct bind_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bind(td, uap->s, sa); free(sa, M_SONAME); } return (error); } static int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td->td_proc->p_fd, fd, cap_rights_init(&rights, CAP_BIND), &fp, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_bind(td->td_ucred, so, sa); if (error == 0) { #endif if (dirfd == AT_FDCWD) error = sobind(so, sa, td); else error = sobindat(dirfd, so, sa, td); #ifdef MAC } #endif fdrop(fp, td); return (error); } int kern_bind(struct thread *td, int fd, struct sockaddr *sa) { return (kern_bindat(td, AT_FDCWD, fd, sa)); } /* ARGSUSED */ int sys_bindat(td, uap) struct thread *td; struct bindat_args /* { int fd; int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } /* ARGSUSED */ int sys_listen(td, uap) struct thread *td; struct listen_args /* { int s; int backlog; } */ *uap; { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->s); error = getsock_cap(td->td_proc->p_fd, uap->s, cap_rights_init(&rights, CAP_LISTEN), &fp, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC error = mac_socket_check_listen(td->td_ucred, so); if (error == 0) #endif error = solisten(so, uap->backlog, td); fdrop(fp, td); } return(error); } /* * accept1() */ static int accept1(td, s, uname, anamelen, flags) struct thread *td; int s; struct sockaddr *uname; socklen_t *anamelen; int flags; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uname == NULL) return (kern_accept4(td, s, NULL, NULL, flags, NULL)); error = copyin(anamelen, &namelen, sizeof (namelen)); if (error != 0) return (error); error = kern_accept4(td, s, &name, &namelen, flags, &fp); if (error != 0) return (error); if (error == 0 && uname != NULL) { #ifdef COMPAT_OLDSOCK if (flags & ACCEPT4_COMPAT) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uname, namelen); } if (error == 0) error = copyout(&namelen, anamelen, sizeof(namelen)); if (error != 0) fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); } int kern_accept4(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, int flags, struct file **fp) { struct filedesc *fdp; struct file *headfp, *nfp = NULL; struct sockaddr *sa = NULL; struct socket *head, *so; cap_rights_t rights; u_int fflag; pid_t pgid; int error, fd, tmp; if (name != NULL) *name = NULL; AUDIT_ARG_FD(s); fdp = td->td_proc->p_fd; error = getsock_cap(fdp, s, cap_rights_init(&rights, CAP_ACCEPT), &headfp, &fflag); if (error != 0) return (error); head = headfp->f_data; if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(td->td_ucred, head); if (error != 0) goto done; #endif error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0); if (error != 0) goto done; ACCEPT_LOCK(); if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { ACCEPT_UNLOCK(); error = EWOULDBLOCK; goto noconnection; } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, "accept", 0); if (error != 0) { ACCEPT_UNLOCK(); goto noconnection; } } if (head->so_error) { error = head->so_error; head->so_error = 0; ACCEPT_UNLOCK(); goto noconnection; } so = TAILQ_FIRST(&head->so_comp); KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); /* soref() and so_state update */ soref(so); /* file descriptor reference */ TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; so->so_qstate &= ~SQ_COMP; so->so_head = NULL; SOCK_UNLOCK(so); ACCEPT_UNLOCK(); /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); } else { fflag &= ~(FNONBLOCK | FASYNC); if (flags & SOCK_NONBLOCK) fflag |= FNONBLOCK; } finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); sa = 0; error = soaccept(so, &sa); if (error != 0) goto noconnection; if (sa == NULL) { if (name) *namelen = 0; goto done; } AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif *name = sa; sa = NULL; } noconnection: free(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(fdp, nfp, fd, td); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); return (error); } int sys_accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); } int sys_accept4(td, uap) struct thread *td; struct accept4_args *uap; { if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return (EINVAL); return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); } #ifdef COMPAT_OLDSOCK int oaccept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT | ACCEPT4_COMPAT)); } #endif /* COMPAT_OLDSOCK */ /* ARGSUSED */ int sys_connect(td, uap) struct thread *td; struct connect_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connect(td, uap->s, sa); free(sa, M_SONAME); } return (error); } static int kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; cap_rights_t rights; int error, interrupted = 0; AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td->td_proc->p_fd, fd, cap_rights_init(&rights, CAP_CONNECT), &fp, NULL); if (error != 0) return (error); so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_connect(td->td_ucred, so, sa); if (error != 0) goto bad; #endif if (dirfd == AT_FDCWD) error = soconnect(so, sa, td); else error = soconnectat(dirfd, so, sa, td); if (error != 0) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, "connec", 0); if (error != 0) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); return (error); } int kern_connect(struct thread *td, int fd, struct sockaddr *sa) { return (kern_connectat(td, AT_FDCWD, fd, sa)); } /* ARGSUSED */ int sys_connectat(td, uap) struct thread *td; struct connectat_args /* { int fd; int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_socketpair(struct thread *td, int domain, int type, int protocol, int *rsv) { struct filedesc *fdp = td->td_proc->p_fd; struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = socreate(domain, &so1, type, protocol, td->td_ucred, td); if (error != 0) return (error); error = socreate(domain, &so2, type, protocol, td->td_ucred, td); if (error != 0) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ error = falloc(td, &fp1, &fd, oflag); if (error != 0) goto free2; rsv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd, oflag); if (error != 0) goto free3; fp2->f_data = so2; /* so2 already has ref count */ rsv[1] = fd; error = soconnect2(so1, so2); if (error != 0) goto free4; if (type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error != 0) goto free4; } finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, &socketops); finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, &socketops); if ((fflag & FNONBLOCK) != 0) { (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); } fdrop(fp1, td); fdrop(fp2, td); return (0); free4: fdclose(fdp, fp2, rsv[1], td); fdrop(fp2, td); free3: fdclose(fdp, fp1, rsv[0], td); fdrop(fp1, td); free2: if (so2 != NULL) (void)soclose(so2); free1: if (so1 != NULL) (void)soclose(so1); return (error); } int sys_socketpair(struct thread *td, struct socketpair_args *uap) { int error, sv[2]; error = kern_socketpair(td, uap->domain, uap->type, uap->protocol, sv); if (error != 0) return (error); error = copyout(sv, uap->rsv, 2 * sizeof(int)); if (error != 0) { (void)kern_close(td, sv[0]); (void)kern_close(td, sv[1]); } return (error); } static int sendit(td, s, mp, flags) struct thread *td; int s; struct msghdr *mp; int flags; { struct mbuf *control; struct sockaddr *to; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) return (ECAPMODE); #endif if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error != 0) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && mp->msg_flags != MSG_COMPAT #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error != 0) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAITOK); cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: free(to, M_SONAME); return (error); } int kern_sendit(td, s, mp, flags, control, segflg) struct thread *td; int s; struct msghdr *mp; int flags; struct mbuf *control; enum uio_seg segflg; { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int i, error; AUDIT_ARG_FD(s); cap_rights_init(&rights, CAP_SEND); if (mp->msg_name != NULL) { AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); cap_rights_set(&rights, CAP_CONNECT); } error = getsock_cap(td->td_proc->p_fd, s, &rights, &fp, NULL); if (error != 0) return (error); so = (struct socket *)fp->f_data; #ifdef KTRACE if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(mp->msg_name); #endif #ifdef MAC if (mp->msg_name != NULL) { error = mac_socket_check_connect(td->td_ucred, so, mp->msg_name); if (error != 0) goto bad; } error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto bad; #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); return (error); } int sys_sendto(td, uap) struct thread *td; struct sendto_args /* { int s; caddr_t buf; size_t len; int flags; caddr_t to; int tolen; } */ *uap; { struct msghdr msg; struct iovec aiov; msg.msg_name = uap->to; msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif aiov.iov_base = uap->buf; aiov.iov_len = uap->len; return (sendit(td, uap->s, &msg, uap->flags)); } #ifdef COMPAT_OLDSOCK int osend(td, uap) struct thread *td; struct osend_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; return (sendit(td, uap->s, &msg, uap->flags)); } int osendmsg(td, uap) struct thread *td; struct osendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sys_sendmsg(td, uap) struct thread *td; struct sendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(td, s, mp, fromseg, controlp) struct thread *td; int s; struct msghdr *mp; enum uio_seg fromseg; struct mbuf **controlp; { struct uio auio; struct iovec *iov; struct mbuf *m, *control = NULL; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = NULL; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, i; if (controlp != NULL) *controlp = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights, CAP_RECV), &fp, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) { fdrop(fp, td); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, NULL, (mp->msg_control || controlp) ? &control : NULL, &mp->msg_flags); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } if (fromsa != NULL) AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == NULL) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error != 0) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. */ if (control && mp->msg_flags & MSG_COMPAT) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout(mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: fdrop(fp, td); #ifdef KTRACE if (fromsa && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif free(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control) m_freem(control); return (error); } static int recvit(td, s, mp, namelenp) struct thread *td; int s; struct msghdr *mp; void *namelenp; { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error != 0) return (error); if (namelenp != NULL) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ #endif } return (error); } int sys_recvfrom(td, uap) struct thread *td; struct recvfrom_args /* { int s; caddr_t buf; size_t len; int flags; struct sockaddr * __restrict from; socklen_t * __restrict fromlenaddr; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error != 0) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return (error); } #ifdef COMPAT_OLDSOCK int orecvfrom(td, uap) struct thread *td; struct recvfrom_args *uap; { uap->flags |= MSG_COMPAT; return (sys_recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK int orecv(td, uap) struct thread *td; struct orecv_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; return (recvit(td, uap->s, &msg, NULL)); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. */ int orecvmsg(td, uap) struct thread *td; struct orecvmsg_args /* { int s; struct omsghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif int sys_recvmsg(td, uap) struct thread *td; struct recvmsg_args /* { int s; struct msghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } /* ARGSUSED */ int sys_shutdown(td, uap) struct thread *td; struct shutdown_args /* { int s; int how; } */ *uap; { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->s); error = getsock_cap(td->td_proc->p_fd, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, uap->how); fdrop(fp, td); } return (error); } /* ARGSUSED */ int sys_setsockopt(td, uap) struct thread *td; struct setsockopt_args /* { int s; int level; int name; caddr_t val; int valsize; } */ *uap; { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } int kern_setsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t valsize; { struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; int error; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } return(error); } /* ARGSUSED */ int sys_getsockopt(td, uap) struct thread *td; struct getsockopt_args /* { int s; int level; int name; void * __restrict val; socklen_t * __restrict avalsize; } */ *uap; { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error != 0) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * optval can be a userland or userspace. optlen is always a kernel pointer. */ int kern_getsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t *valsize; { struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; int error; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } return (error); } /* * getsockname1() - Get socket name. */ /* ARGSUSED */ static int getsockname1(td, uap, compat) struct thread *td; struct getsockname_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error != 0) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td->td_proc->p_fd, fd, cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL); if (error != 0) return (error); so = fp->f_data; *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: fdrop(fp, td); if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } return (error); } int sys_getsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. */ /* ARGSUSED */ static int getpeername1(td, uap, compat) struct thread *td; struct getpeername_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error != 0) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td->td_proc->p_fd, fd, cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL); if (error != 0) return (error); so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done; } *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } done: fdrop(fp, td); return (error); } int sys_getpeername(td, uap) struct thread *td; struct getpeername_args *uap; { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetpeername(td, uap) struct thread *td; struct ogetpeername_args *uap; { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ int sockargs(mp, buf, buflen, type) struct mbuf **mp; caddr_t buf; int buflen, type; { struct sockaddr *sa; struct mbuf *m; int error; if (buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && buflen <= 112) buflen = MLEN; /* unix domain compat. hack */ else #endif if (buflen > MCLBYTES) return (EINVAL); } m = m_get2(buflen, M_WAITOK, type, 0); m->m_len = buflen; error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); if (error != 0) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(namp, uaddr, len) struct sockaddr **namp; caddr_t uaddr; size_t len; { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); sa = malloc(len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error != 0) { free(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } static int filt_sfsync_attach(struct knote *kn) { struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_sdata; struct knlist *knl = &sfs->klist; SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs); /* * Validate that we actually received this via the kernel API. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_v = sfs; kn->kn_flags &= ~EV_FLAG1; knl->kl_lock(knl->kl_lockarg); /* * If we're in the "freeing" state, * don't allow the add. That way we don't * end up racing with some other thread that * is trying to finish some setup. */ if (sfs->state == SF_STATE_FREEING) { knl->kl_unlock(knl->kl_lockarg); return (EINVAL); } knlist_add(&sfs->klist, kn, 1); knl->kl_unlock(knl->kl_lockarg); return (0); } /* * Called when a knote is being detached. */ static void filt_sfsync_detach(struct knote *kn) { struct knlist *knl; struct sendfile_sync *sfs; int do_free = 0; sfs = kn->kn_ptr.p_v; knl = &sfs->klist; SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs); knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); /* * If the list is empty _AND_ the refcount is 0 * _AND_ we've finished the setup phase and now * we're in the running phase, we can free the * underlying sendfile_sync. * * But we shouldn't do it before finishing the * underlying divorce from the knote. * * So, we have the sfsync lock held; transition * it to "freeing", then unlock, then free * normally. */ if (knlist_empty(knl)) { if (sfs->state == SF_STATE_COMPLETED && sfs->count == 0) { SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, " "count==0, empty list: time to free!\n", __func__, (unsigned long long) curthread->td_tid, sfs); sf_sync_set_state(sfs, SF_STATE_FREEING, 1); do_free = 1; } } knl->kl_unlock(knl->kl_lockarg); /* * Only call free if we're the one who has transitioned things * to free. Otherwise we could race with another thread that * is currently tearing things down. */ if (do_free == 1) { SFSYNC_DPRINTF("%s: (%llu) sfs=%p, %s:%d\n", __func__, (unsigned long long) curthread->td_tid, sfs, __FILE__, __LINE__); sf_sync_free(sfs); } } static int filt_sfsync(struct knote *kn, long hint) { struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_ptr.p_v; int ret; SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs); /* * XXX add a lock assertion here! */ ret = (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED); return (ret); } /* * Detach mapped page and release resources back to the system. */ int sf_buf_mext(struct mbuf *mb, void *addr, void *args) { vm_page_t m; struct sendfile_sync *sfs; m = sf_buf_page(args); sf_buf_free(args); vm_page_lock(m); - vm_page_unwire(m, 0); + vm_page_unwire(m, PQ_INACTIVE); /* * Check for the object going away on us. This can * happen since we don't hold a reference to it. * If so, we're responsible for freeing the page. */ if (m->wire_count == 0 && m->object == NULL) vm_page_free(m); vm_page_unlock(m); if (addr != NULL) { sfs = addr; sf_sync_deref(sfs); } /* * sfs may be invalid at this point, don't use it! */ return (EXT_FREE_OK); } /* * Called to remove a reference to a sf_sync object. * * This is generally done during the mbuf free path to signify * that one of the mbufs in the transaction has been completed. * * If we're doing SF_SYNC and the refcount is zero then we'll wake * up any waiters. * * IF we're doing SF_KQUEUE and the refcount is zero then we'll * fire off the knote. */ void sf_sync_deref(struct sendfile_sync *sfs) { int do_free = 0; if (sfs == NULL) return; mtx_lock(&sfs->mtx); KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0")); sfs->count --; /* * Only fire off the wakeup / kqueue notification if * we are in the running state. */ if (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED) { if (sfs->flags & SF_SYNC) cv_signal(&sfs->cv); if (sfs->flags & SF_KQUEUE) { SFSYNC_DPRINTF("%s: (%llu) sfs=%p: knote!\n", __func__, (unsigned long long) curthread->td_tid, sfs); KNOTE_LOCKED(&sfs->klist, 1); } /* * If we're not waiting around for a sync, * check if the knote list is empty. * If it is, we transition to free. * * XXX I think it's about time I added some state * or flag that says whether we're supposed to be * waiting around until we've done a signal. * * XXX Ie, the reason that I don't free it here * is because the caller will free the last reference, * not us. That should be codified in some flag * that indicates "self-free" rather than checking * for SF_SYNC all the time. */ if ((sfs->flags & SF_SYNC) == 0 && knlist_empty(&sfs->klist)) { SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, " "count==0, empty list: time to free!\n", __func__, (unsigned long long) curthread->td_tid, sfs); sf_sync_set_state(sfs, SF_STATE_FREEING, 1); do_free = 1; } } mtx_unlock(&sfs->mtx); /* * Attempt to do a free here. * * We do this outside of the lock because it may destroy the * lock in question as it frees things. We can optimise this * later. * * XXX yes, we should make it a requirement to hold the * lock across sf_sync_free(). */ if (do_free == 1) { SFSYNC_DPRINTF("%s: (%llu) sfs=%p\n", __func__, (unsigned long long) curthread->td_tid, sfs); sf_sync_free(sfs); } } /* * Allocate a sendfile_sync state structure. * * For now this only knows about the "sleep" sync, but later it will * grow various other personalities. */ struct sendfile_sync * sf_sync_alloc(uint32_t flags) { struct sendfile_sync *sfs; sfs = uma_zalloc(zone_sfsync, M_WAITOK | M_ZERO); mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF); cv_init(&sfs->cv, "sendfile"); sfs->flags = flags; sfs->state = SF_STATE_SETUP; knlist_init_mtx(&sfs->klist, &sfs->mtx); SFSYNC_DPRINTF("%s: sfs=%p, flags=0x%08x\n", __func__, sfs, sfs->flags); return (sfs); } /* * Take a reference to a sfsync instance. * * This has to map 1:1 to free calls coming in via sf_buf_mext(), * so typically this will be referenced once for each mbuf allocated. */ void sf_sync_ref(struct sendfile_sync *sfs) { if (sfs == NULL) return; mtx_lock(&sfs->mtx); sfs->count++; mtx_unlock(&sfs->mtx); } void sf_sync_syscall_wait(struct sendfile_sync *sfs) { if (sfs == NULL) return; KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!", __func__, sfs)); /* * If we're not requested to wait during the syscall, * don't bother waiting. */ if ((sfs->flags & SF_SYNC) == 0) goto out; /* * This is a bit suboptimal and confusing, so bear with me. * * Ideally sf_sync_syscall_wait() will wait until * all pending mbuf transmit operations are done. * This means that when sendfile becomes async, it'll * run in the background and will transition from * RUNNING to COMPLETED when it's finished acquiring * new things to send. Then, when the mbufs finish * sending, COMPLETED + sfs->count == 0 is enough to * know that no further work is being done. * * So, we will sleep on both RUNNING and COMPLETED. * It's up to the (in progress) async sendfile loop * to transition the sf_sync from RUNNING to * COMPLETED so the wakeup above will actually * do the cv_signal() call. */ if (sfs->state != SF_STATE_COMPLETED && sfs->state != SF_STATE_RUNNING) goto out; if (sfs->count != 0) cv_wait(&sfs->cv, &sfs->mtx); KASSERT(sfs->count == 0, ("sendfile sync still busy")); out: return; } /* * Free an sf_sync if it's appropriate to. */ void sf_sync_free(struct sendfile_sync *sfs) { if (sfs == NULL) return; SFSYNC_DPRINTF("%s: (%lld) sfs=%p; called; state=%d, flags=0x%08x " "count=%d\n", __func__, (long long) curthread->td_tid, sfs, sfs->state, sfs->flags, sfs->count); mtx_lock(&sfs->mtx); /* * We keep the sf_sync around if the state is active, * we are doing kqueue notification and we have active * knotes. * * If the caller wants to free us right this second it * should transition this to the freeing state. * * So, complain loudly if they break this rule. */ if (sfs->state != SF_STATE_FREEING) { printf("%s: (%llu) sfs=%p; not freeing; let's wait!\n", __func__, (unsigned long long) curthread->td_tid, sfs); mtx_unlock(&sfs->mtx); return; } KASSERT(sfs->count == 0, ("sendfile sync still busy")); cv_destroy(&sfs->cv); /* * This doesn't call knlist_detach() on each knote; it just frees * the entire list. */ knlist_delete(&sfs->klist, curthread, 1); mtx_destroy(&sfs->mtx); SFSYNC_DPRINTF("%s: (%llu) sfs=%p; freeing\n", __func__, (unsigned long long) curthread->td_tid, sfs); uma_zfree(zone_sfsync, sfs); } /* * Setup a sf_sync to post a kqueue notification when things are complete. */ int sf_sync_kqueue_setup(struct sendfile_sync *sfs, struct sf_hdtr_kq *sfkq) { struct kevent kev; int error; sfs->flags |= SF_KQUEUE; /* Check the flags are valid */ if ((sfkq->kq_flags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) return (EINVAL); SFSYNC_DPRINTF("%s: sfs=%p: kqfd=%d, flags=0x%08x, ident=%p, udata=%p\n", __func__, sfs, sfkq->kq_fd, sfkq->kq_flags, (void *) sfkq->kq_ident, (void *) sfkq->kq_udata); /* Setup and register a knote on the given kqfd. */ kev.ident = (uintptr_t) sfkq->kq_ident; kev.filter = EVFILT_SENDFILE; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | sfkq->kq_flags; kev.data = (intptr_t) sfs; kev.udata = sfkq->kq_udata; error = kqfd_register(sfkq->kq_fd, &kev, curthread, 1); if (error != 0) { SFSYNC_DPRINTF("%s: returned %d\n", __func__, error); } return (error); } void sf_sync_set_state(struct sendfile_sync *sfs, sendfile_sync_state_t state, int islocked) { sendfile_sync_state_t old_state; if (! islocked) mtx_lock(&sfs->mtx); /* * Update our current state. */ old_state = sfs->state; sfs->state = state; SFSYNC_DPRINTF("%s: (%llu) sfs=%p; going from %d to %d\n", __func__, (unsigned long long) curthread->td_tid, sfs, old_state, state); /* * If we're transitioning from RUNNING to COMPLETED and the count is * zero, then post the knote. The caller may have completed the * send before we updated the state to COMPLETED and we need to make * sure this is communicated. */ if (old_state == SF_STATE_RUNNING && state == SF_STATE_COMPLETED && sfs->count == 0 && sfs->flags & SF_KQUEUE) { SFSYNC_DPRINTF("%s: (%llu) sfs=%p: triggering knote!\n", __func__, (unsigned long long) curthread->td_tid, sfs); KNOTE_LOCKED(&sfs->klist, 1); } if (! islocked) mtx_unlock(&sfs->mtx); } /* * Set the retval/errno for the given transaction. * * This will eventually/ideally be used when the KNOTE is fired off * to signify the completion of this transaction. * * The sfsync lock should be held before entering this function. */ void sf_sync_set_retval(struct sendfile_sync *sfs, off_t retval, int xerrno) { KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!", __func__, sfs)); SFSYNC_DPRINTF("%s: (%llu) sfs=%p: errno=%d, retval=%jd\n", __func__, (unsigned long long) curthread->td_tid, sfs, xerrno, (intmax_t) retval); sfs->retval = retval; sfs->xerrno = xerrno; } /* * sendfile(2) * * int sendfile(int fd, int s, off_t offset, size_t nbytes, * struct sf_hdtr *hdtr, off_t *sbytes, int flags) * * Send a file specified by 'fd' and starting at 'offset' to a socket * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == * 0. Optionally add a header and/or trailer to the socket output. If * specified, write the total number of bytes sent into *sbytes. */ int sys_sendfile(struct thread *td, struct sendfile_args *uap) { return (do_sendfile(td, uap, 0)); } int _do_sendfile(struct thread *td, int src_fd, int sock_fd, int flags, int compat, off_t offset, size_t nbytes, off_t *sbytes, struct uio *hdr_uio, struct uio *trl_uio, struct sf_hdtr_kq *hdtr_kq) { cap_rights_t rights; struct sendfile_sync *sfs = NULL; struct file *fp; int error; int do_kqueue = 0; int do_free = 0; AUDIT_ARG_FD(src_fd); if (hdtr_kq != NULL) do_kqueue = 1; /* * sendfile(2) can start at any offset within a file so we require * CAP_READ+CAP_SEEK = CAP_PREAD. */ if ((error = fget_read(td, src_fd, cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) { goto out; } /* * IF SF_KQUEUE is set but we haven't copied in anything for * kqueue data, error out. */ if (flags & SF_KQUEUE && do_kqueue == 0) { SFSYNC_DPRINTF("%s: SF_KQUEUE but no KQUEUE data!\n", __func__); goto out; } /* * If we need to wait for completion, initialise the sfsync * state here. */ if (flags & (SF_SYNC | SF_KQUEUE)) sfs = sf_sync_alloc(flags & (SF_SYNC | SF_KQUEUE)); if (flags & SF_KQUEUE) { error = sf_sync_kqueue_setup(sfs, hdtr_kq); if (error) { SFSYNC_DPRINTF("%s: (%llu) error; sfs=%p\n", __func__, (unsigned long long) curthread->td_tid, sfs); sf_sync_set_state(sfs, SF_STATE_FREEING, 0); sf_sync_free(sfs); goto out; } } /* * Do the sendfile call. * * If this fails, it'll free the mbuf chain which will free up the * sendfile_sync references. */ error = fo_sendfile(fp, sock_fd, hdr_uio, trl_uio, offset, nbytes, sbytes, flags, compat ? SFK_COMPAT : 0, sfs, td); /* * If the sendfile call succeeded, transition the sf_sync state * to RUNNING, then COMPLETED. * * If the sendfile call failed, then the sendfile call may have * actually sent some data first - so we check to see whether * any data was sent. If some data was queued (ie, count > 0) * then we can't call free; we have to wait until the partial * transaction completes before we continue along. * * This has the side effect of firing off the knote * if the refcount has hit zero by the time we get here. */ if (sfs != NULL) { mtx_lock(&sfs->mtx); if (error == 0 || sfs->count > 0) { /* * When it's time to do async sendfile, the transition * to RUNNING signifies that we're actually actively * adding and completing mbufs. When the last disk * buffer is read (ie, when we're not doing any * further read IO and all subsequent stuff is mbuf * transmissions) we'll transition to COMPLETED * and when the final mbuf is freed, the completion * will be signaled. */ sf_sync_set_state(sfs, SF_STATE_RUNNING, 1); /* * Set the retval before we signal completed. * If we do it the other way around then transitioning to * COMPLETED may post the knote before you set the return * status! * * XXX for now, errno is always 0, as we don't post * knotes if sendfile failed. Maybe that'll change later. */ sf_sync_set_retval(sfs, *sbytes, error); /* * And now transition to completed, which will kick off * the knote if required. */ sf_sync_set_state(sfs, SF_STATE_COMPLETED, 1); } else { /* * Error isn't zero, sfs_count is zero, so we * won't have some other thing to wake things up. * Thus free. */ sf_sync_set_state(sfs, SF_STATE_FREEING, 1); do_free = 1; } /* * Next - wait if appropriate. */ sf_sync_syscall_wait(sfs); /* * If we're not doing kqueue notifications, we can * transition this immediately to the freeing state. */ if ((sfs->flags & SF_KQUEUE) == 0) { sf_sync_set_state(sfs, SF_STATE_FREEING, 1); do_free = 1; } mtx_unlock(&sfs->mtx); } /* * If do_free is set, free here. * * If we're doing no-kqueue notification and it's just sleep notification, * we also do free; it's the only chance we have. */ if (sfs != NULL && do_free == 1) { sf_sync_free(sfs); } /* * XXX Should we wait until the send has completed before freeing the source * file handle? It's the previous behaviour, sure, but is it required? * We've wired down the page references after all. */ fdrop(fp, td); out: /* Return error */ return (error); } static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat) { struct sf_hdtr hdtr; struct sf_hdtr_kq hdtr_kq; struct uio *hdr_uio, *trl_uio; int error; off_t sbytes; int do_kqueue = 0; /* * File offset must be positive. If it goes beyond EOF * we send only the header/trailer and no payload data. */ if (uap->offset < 0) return (EINVAL); hdr_uio = trl_uio = NULL; if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error != 0) goto out; if (hdtr.headers != NULL) { error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio); if (error != 0) goto out; } if (hdtr.trailers != NULL) { error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio); if (error != 0) goto out; } /* * If SF_KQUEUE is set, then we need to also copy in * the kqueue data after the normal hdtr set and set * do_kqueue=1. */ if (uap->flags & SF_KQUEUE) { error = copyin(((char *) uap->hdtr) + sizeof(hdtr), &hdtr_kq, sizeof(hdtr_kq)); if (error != 0) goto out; do_kqueue = 1; } } /* Call sendfile */ error = _do_sendfile(td, uap->fd, uap->s, uap->flags, compat, uap->offset, uap->nbytes, &sbytes, hdr_uio, trl_uio, &hdtr_kq); if (uap->sbytes != NULL) { copyout(&sbytes, uap->sbytes, sizeof(off_t)); } out: free(hdr_uio, M_IOV); free(trl_uio, M_IOV); return (error); } #ifdef COMPAT_FREEBSD4 int freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) { struct sendfile_args args; args.fd = uap->fd; args.s = uap->s; args.offset = uap->offset; args.nbytes = uap->nbytes; args.hdtr = uap->hdtr; args.sbytes = uap->sbytes; args.flags = uap->flags; return (do_sendfile(td, &args, 1)); } #endif /* COMPAT_FREEBSD4 */ static int sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd, off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res) { vm_page_t m; vm_pindex_t pindex; ssize_t resid; int error, readahead, rv; pindex = OFF_TO_IDX(off); VM_OBJECT_WLOCK(obj); m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY | VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL); /* * Check if page is valid for what we need, otherwise initiate I/O. * * The non-zero nd argument prevents disk I/O, instead we * return the caller what he specified in nd. In particular, * if we already turned some pages into mbufs, nd == EAGAIN * and the main function send them the pages before we come * here again and block. */ if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) { if (vp == NULL) vm_page_xunbusy(m); VM_OBJECT_WUNLOCK(obj); *res = m; return (0); } else if (nd != 0) { if (vp == NULL) vm_page_xunbusy(m); error = nd; goto free_page; } /* * Get the page from backing store. */ error = 0; if (vp != NULL) { VM_OBJECT_WUNLOCK(obj); readahead = sfreadahead * MAXBSIZE; /* * Use vn_rdwr() instead of the pager interface for * the vnode, to allow the read-ahead. * * XXXMAC: Because we don't have fp->f_cred here, we * pass in NOCRED. This is probably wrong, but is * consistent with our original implementation. */ error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off), UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead / bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); SFSTAT_INC(sf_iocnt); VM_OBJECT_WLOCK(obj); } else { if (vm_pager_has_page(obj, pindex, NULL, NULL)) { rv = vm_pager_get_pages(obj, &m, 1, 0); SFSTAT_INC(sf_iocnt); m = vm_page_lookup(obj, pindex); if (m == NULL) error = EIO; else if (rv != VM_PAGER_OK) { vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); m = NULL; error = EIO; } } else { pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; m->dirty = 0; } if (m != NULL) vm_page_xunbusy(m); } if (error == 0) { *res = m; } else if (m != NULL) { free_page: vm_page_lock(m); - vm_page_unwire(m, 0); + vm_page_unwire(m, PQ_INACTIVE); /* * See if anyone else might know about this page. If * not and it is not valid, then free it. */ if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m)) vm_page_free(m); vm_page_unlock(m); } KASSERT(error != 0 || (m->wire_count > 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)), ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off, xfsize)); VM_OBJECT_WUNLOCK(obj); return (error); } static int sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, int *bsize) { struct vattr va; vm_object_t obj; struct vnode *vp; struct shmfd *shmfd; int error; vp = *vp_res = NULL; obj = NULL; shmfd = *shmfd_res = NULL; *bsize = 0; /* * The file descriptor must be a regular file and have a * backing VM object. */ if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_type != VREG) { error = EINVAL; goto out; } *bsize = vp->v_mount->mnt_stat.f_iosize; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0) goto out; *obj_size = va.va_size; obj = vp->v_object; if (obj == NULL) { error = EINVAL; goto out; } } else if (fp->f_type == DTYPE_SHM) { shmfd = fp->f_data; obj = shmfd->shm_object; *obj_size = shmfd->shm_size; } else { error = EINVAL; goto out; } VM_OBJECT_WLOCK(obj); if ((obj->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(obj); error = EBADF; goto out; } /* * Temporarily increase the backing VM object's reference * count so that a forced reclamation of its vnode does not * immediately destroy it. */ vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); *obj_res = obj; *vp_res = vp; *shmfd_res = shmfd; out: if (vp != NULL) VOP_UNLOCK(vp, 0); return (error); } static int kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp, struct socket **so) { cap_rights_t rights; int error; *sock_fp = NULL; *so = NULL; /* * The socket must be a stream socket and connected. */ error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights, CAP_SEND), sock_fp, NULL); if (error != 0) return (error); *so = (*sock_fp)->f_data; if ((*so)->so_type != SOCK_STREAM) return (EINVAL); if (((*so)->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); return (0); } int vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, int kflags, struct sendfile_sync *sfs, struct thread *td) { struct file *sock_fp; struct vnode *vp; struct vm_object *obj; struct socket *so; struct mbuf *m; struct sf_buf *sf; struct vm_page *pg; struct shmfd *shmfd; struct vattr va; off_t off, xfsize, fsbytes, sbytes, rem, obj_size; int error, bsize, nd, hdrlen, mnw; pg = NULL; obj = NULL; so = NULL; m = NULL; fsbytes = sbytes = 0; hdrlen = mnw = 0; rem = nbytes; obj_size = 0; error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); if (error != 0) return (error); if (rem == 0) rem = obj_size; error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so); if (error != 0) goto out; /* * Do not wait on memory allocations but return ENOMEM for * caller to retry later. * XXX: Experimental. */ if (flags & SF_MNOWAIT) mnw = 1; #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto out; #endif /* If headers are specified copy them into mbufs. */ if (hdr_uio != NULL) { hdr_uio->uio_td = td; hdr_uio->uio_rw = UIO_WRITE; if (hdr_uio->uio_resid > 0) { /* * In FBSD < 5.0 the nbytes to send also included * the header. If compat is specified subtract the * header size from nbytes. */ if (kflags & SFK_COMPAT) { if (nbytes > hdr_uio->uio_resid) nbytes -= hdr_uio->uio_resid; else nbytes = 0; } m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK), 0, 0, 0); if (m == NULL) { error = mnw ? EAGAIN : ENOBUFS; goto out; } hdrlen = m_length(m, NULL); } } /* * Protect against multiple writers to the socket. * * XXXRW: Historically this has assumed non-interruptibility, so now * we implement that, but possibly shouldn't. */ (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); /* * Loop through the pages of the file, starting with the requested * offset. Get a file page (do I/O if necessary), map the file page * into an sf_buf, attach an mbuf header to the sf_buf, and queue * it on the socket. * This is done in two loops. The inner loop turns as many pages * as it can, up to available socket buffer space, without blocking * into mbufs to have it bulk delivered into the socket send buffer. * The outer loop checks the state and available space of the socket * and takes care of the overall progress. */ for (off = offset; ; ) { struct mbuf *mtail; int loopbytes; int space; int done; if ((nbytes != 0 && nbytes == fsbytes) || (nbytes == 0 && obj_size == fsbytes)) break; mtail = NULL; loopbytes = 0; space = 0; done = 0; /* * Check the socket state for ongoing connection, * no errors and space in socket buffer. * If space is low allow for the remainder of the * file to be processed if it fits the socket buffer. * Otherwise block in waiting for sufficient space * to proceed, or if the socket is nonblocking, return * to userland with EAGAIN while reporting how far * we've come. * We wait until the socket buffer has significant free * space to do bulk sends. This makes good use of file * system read ahead and allows packet segmentation * offloading hardware to take over lots of work. If * we were not careful here we would send off only one * sfbuf at a time. */ SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; retry_space: if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } else if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto done; } space = sbspace(&so->so_snd); if (space < rem && (space <= 0 || space < so->so_snd.sb_lowat)) { if (so->so_state & SS_NBIO) { SOCKBUF_UNLOCK(&so->so_snd); error = EAGAIN; goto done; } /* * sbwait drops the lock while sleeping. * When we loop back to retry_space the * state may have changed and we retest * for it. */ error = sbwait(&so->so_snd); /* * An error from sbwait usually indicates that we've * been interrupted by a signal. If we've sent anything * then return bytes sent, otherwise return the error. */ if (error != 0) { SOCKBUF_UNLOCK(&so->so_snd); goto done; } goto retry_space; } SOCKBUF_UNLOCK(&so->so_snd); /* * Reduce space in the socket buffer by the size of * the header mbuf chain. * hdrlen is set to 0 after the first loop. */ space -= hdrlen; if (vp != NULL) { error = vn_lock(vp, LK_SHARED); if (error != 0) goto done; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0 || off >= va.va_size) { VOP_UNLOCK(vp, 0); goto done; } obj_size = va.va_size; } /* * Loop and construct maximum sized mbuf chain to be bulk * dumped into socket buffer. */ while (space > loopbytes) { vm_offset_t pgoff; struct mbuf *m0; /* * Calculate the amount to transfer. * Not to exceed a page, the EOF, * or the passed in nbytes. */ pgoff = (vm_offset_t)(off & PAGE_MASK); rem = obj_size - offset; if (nbytes != 0) rem = omin(rem, nbytes); rem -= fsbytes + loopbytes; xfsize = omin(PAGE_SIZE - pgoff, rem); xfsize = omin(space - loopbytes, xfsize); if (xfsize <= 0) { done = 1; /* all data sent */ break; } /* * Attempt to look up the page. Allocate * if not found or wait and loop if busy. */ if (m != NULL) nd = EAGAIN; /* send what we already got */ else if ((flags & SF_NODISKIO) != 0) nd = EBUSY; else nd = 0; error = sendfile_readpage(obj, vp, nd, off, xfsize, bsize, td, &pg); if (error != 0) { if (error == EAGAIN) error = 0; /* not a real error */ break; } /* * Get a sendfile buf. When allocating the * first buffer for mbuf chain, we usually * wait as long as necessary, but this wait * can be interrupted. For consequent * buffers, do not sleep, since several * threads might exhaust the buffers and then * deadlock. */ sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT : SFB_CATCH); if (sf == NULL) { SFSTAT_INC(sf_allocfail); vm_page_lock(pg); - vm_page_unwire(pg, 0); + vm_page_unwire(pg, PQ_INACTIVE); KASSERT(pg->object != NULL, ("%s: object disappeared", __func__)); vm_page_unlock(pg); if (m == NULL) error = (mnw ? EAGAIN : EINTR); break; } /* * Get an mbuf and set it up as having * external storage. */ m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA); if (m0 == NULL) { error = (mnw ? EAGAIN : ENOBUFS); (void)sf_buf_mext(NULL, NULL, sf); break; } if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF, (mnw ? M_NOWAIT : M_WAITOK)) != 0) { error = (mnw ? EAGAIN : ENOBUFS); (void)sf_buf_mext(NULL, NULL, sf); m_freem(m0); break; } m0->m_data = (char *)sf_buf_kva(sf) + pgoff; m0->m_len = xfsize; /* Append to mbuf chain. */ if (mtail != NULL) mtail->m_next = m0; else if (m != NULL) m_last(m)->m_next = m0; else m = m0; mtail = m0; /* Keep track of bits processed. */ loopbytes += xfsize; off += xfsize; /* * XXX eventually this should be a sfsync * method call! */ if (sfs != NULL) sf_sync_ref(sfs); } if (vp != NULL) VOP_UNLOCK(vp, 0); /* Add the buffer chain to the socket buffer. */ if (m != NULL) { int mlen, err; mlen = m_length(m, NULL); SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } SOCKBUF_UNLOCK(&so->so_snd); CURVNET_SET(so->so_vnet); /* Avoid error aliasing. */ err = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); CURVNET_RESTORE(); if (err == 0) { /* * We need two counters to get the * file offset and nbytes to send * right: * - sbytes contains the total amount * of bytes sent, including headers. * - fsbytes contains the total amount * of bytes sent from the file. */ sbytes += mlen; fsbytes += mlen; if (hdrlen) { fsbytes -= hdrlen; hdrlen = 0; } } else if (error == 0) error = err; m = NULL; /* pru_send always consumes */ } /* Quit outer loop on error or when we're done. */ if (done) break; if (error != 0) goto done; } /* * Send trailers. Wimp out and use writev(2). */ if (trl_uio != NULL) { sbunlock(&so->so_snd); error = kern_writev(td, sockfd, trl_uio); if (error == 0) sbytes += td->td_retval[0]; goto out; } done: sbunlock(&so->so_snd); out: /* * If there was no error we have to clear td->td_retval[0] * because it may have been set by writev. */ if (error == 0) { td->td_retval[0] = 0; } if (sent != NULL) { (*sent) = sbytes; } if (obj != NULL) vm_object_deallocate(obj); if (so) fdrop(sock_fp, td); if (m) m_freem(m); if (error == ERESTART) error = EINTR; return (error); } /* * SCTP syscalls. * Functionality only compiled in if SCTP is defined in the kernel Makefile, * otherwise all return EOPNOTSUPP. * XXX: We should make this loadable one day. */ int sys_sctp_peeloff(td, uap) struct thread *td; struct sctp_peeloff_args /* { int sd; caddr_t name; } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) struct file *nfp = NULL; struct socket *head, *so; cap_rights_t rights; u_int fflag; int error, fd; AUDIT_ARG_FD(uap->sd); error = fgetsock(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF), &head, &fflag); if (error != 0) goto done2; if (head->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto done; } error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name); if (error != 0) goto done; /* * At this point we know we do have a assoc to pull * we proceed to get the fd setup. This may block * but that is ok. */ error = falloc(td, &nfp, &fd, 0); if (error != 0) goto done; td->td_retval[0] = fd; CURVNET_SET(head->so_vnet); so = sonewconn(head, SS_ISCONNECTED); if (so == NULL) { error = ENOMEM; goto noconnection; } /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); soref(so); /* file descriptor reference */ SOCK_UNLOCK(so); ACCEPT_LOCK(); TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; so->so_state |= (head->so_state & SS_NBIO); so->so_state &= ~SS_NOFDREF; so->so_qstate &= ~SQ_COMP; so->so_head = NULL; ACCEPT_UNLOCK(); finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); if (error != 0) goto noconnection; if (head->so_sigio != NULL) fsetown(fgetown(&head->so_sigio), &so->so_sigio); noconnection: /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td->td_proc->p_fd, nfp, fd, td); /* * Release explicitly held references before returning. */ CURVNET_RESTORE(); done: if (nfp != NULL) fdrop(nfp, td); fputsock(head); done2: return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sys_sctp_generic_sendmsg (td, uap) struct thread *td; struct sctp_generic_sendmsg_args /* { int sd, caddr_t msg, int mlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp = NULL; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec iov[1]; cap_rights_t rights; int error = 0, len; if (uap->sinfo != NULL) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error != 0) return (error); u_sinfo = &sinfo; } cap_rights_init(&rights, CAP_SEND); if (uap->tolen != 0) { error = getsockaddr(&to, uap->to, uap->tolen); if (error != 0) { to = NULL; goto sctp_bad2; } cap_rights_set(&rights, CAP_CONNECT); } AUDIT_ARG_FD(uap->sd); error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL); if (error != 0) goto sctp_bad; #ifdef KTRACE if (to && (KTRPOINT(td, KTR_STRUCT))) ktrsockaddr(to); #endif iov[0].iov_base = uap->msg; iov[0].iov_len = uap->mlen; so = (struct socket *)fp->f_data; if (so->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto sctp_bad; } #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; len = auio.uio_resid = uap->mlen; CURVNET_SET(so->so_vnet); error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, u_sinfo, td); CURVNET_RESTORE(); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket. */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: if (fp != NULL) fdrop(fp, td); sctp_bad2: free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sys_sctp_generic_sendmsg_iov(td, uap) struct thread *td; struct sctp_generic_sendmsg_iov_args /* { int sd, struct iovec *iov, int iovlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp = NULL; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec *iov, *tiov; cap_rights_t rights; ssize_t len; int error, i; if (uap->sinfo != NULL) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error != 0) return (error); u_sinfo = &sinfo; } cap_rights_init(&rights, CAP_SEND); if (uap->tolen != 0) { error = getsockaddr(&to, uap->to, uap->tolen); if (error != 0) { to = NULL; goto sctp_bad2; } cap_rights_set(&rights, CAP_CONNECT); } AUDIT_ARG_FD(uap->sd); error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL); if (error != 0) goto sctp_bad1; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) error = freebsd32_copyiniov((struct iovec32 *)uap->iov, uap->iovlen, &iov, EMSGSIZE); else #endif error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error != 0) goto sctp_bad1; #ifdef KTRACE if (to && (KTRPOINT(td, KTR_STRUCT))) ktrsockaddr(to); #endif so = (struct socket *)fp->f_data; if (so->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto sctp_bad; } #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto sctp_bad; } } len = auio.uio_resid; CURVNET_SET(so->so_vnet); error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, u_sinfo, td); CURVNET_RESTORE(); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: free(iov, M_IOV); sctp_bad1: if (fp != NULL) fdrop(fp, td); sctp_bad2: free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sys_sctp_generic_recvmsg(td, uap) struct thread *td; struct sctp_generic_recvmsg_args /* { int sd, struct iovec *iov, int iovlen, struct sockaddr *from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) uint8_t sockbufstore[256]; struct uio auio; struct iovec *iov, *tiov; struct sctp_sndrcvinfo sinfo; struct socket *so; struct file *fp = NULL; struct sockaddr *fromsa; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, fromlen, i, msg_flags; AUDIT_ARG_FD(uap->sd); error = getsock_cap(td->td_proc->p_fd, uap->sd, cap_rights_init(&rights, CAP_RECV), &fp, NULL); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) error = freebsd32_copyiniov((struct iovec32 *)uap->iov, uap->iovlen, &iov, EMSGSIZE); else #endif error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error != 0) goto out1; so = fp->f_data; if (so->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto out; } #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) goto out; #endif /* MAC */ if (uap->fromlenaddr != NULL) { error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen)); if (error != 0) goto out; } else { fromlen = 0; } if (uap->msg_flags) { error = copyin(uap->msg_flags, &msg_flags, sizeof (int)); if (error != 0) goto out; } else { msg_flags = 0; } auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto out; } } len = auio.uio_resid; fromsa = (struct sockaddr *)sockbufstore; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo)); CURVNET_SET(so->so_vnet); error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL, fromsa, fromlen, &msg_flags, (struct sctp_sndrcvinfo *)&sinfo, 1); CURVNET_RESTORE(); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } else { if (uap->sinfo) error = copyout(&sinfo, uap->sinfo, sizeof (sinfo)); } #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(uap->sd, UIO_READ, ktruio, error); } #endif /* KTRACE */ if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (fromlen && uap->from) { len = fromlen; if (len <= 0 || fromsa == 0) len = 0; else { len = MIN(len, fromsa->sa_len); error = copyout(fromsa, uap->from, (size_t)len); if (error != 0) goto out; } error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t)); if (error != 0) goto out; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif if (uap->msg_flags) { error = copyout(&msg_flags, uap->msg_flags, sizeof (int)); if (error != 0) goto out; } out: free(iov, M_IOV); out1: if (fp != NULL) fdrop(fp, td); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } Index: user/attilio/rm_vmobj_cache/sys/kern/vfs_bio.c =================================================================== --- user/attilio/rm_vmobj_cache/sys/kern/vfs_bio.c (revision 267236) +++ user/attilio/rm_vmobj_cache/sys/kern/vfs_bio.c (revision 267237) @@ -1,4607 +1,4607 @@ /*- * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_compat.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; /* * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. */ struct buf *buf; /* buffer header pool */ caddr_t unmapped_buf; static struct proc *bufdaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_release(struct buf *bp); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static int buf_flush(int); static int flushbufqueues(int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); #endif int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); static long bufspace; #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers"); #else SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "Virtual memory used for buffers"); #endif static long unmapped_bufspace; SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD, &unmapped_bufspace, 0, "Amount of unmapped buffers, inclusive in the bufspace"); static long maxbufspace; SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, "Maximum allowed value of bufspace (including buf_daemon)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, "Maximum allowed value of bufspace (excluding buf_daemon)"); static int bufreusecnt; SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, "Number of times we have reused a buffer"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "XXX Unused"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "XXX Complicatedly unused"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer aquisition"); static int mappingrestarts; SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static long notbufdflushes; SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, 0, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); /* * Lock for the non-dirty bufqueues */ static struct mtx_padalign bqclean; /* * Lock for the dirty queue. */ static struct mtx_padalign bqdirty; /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign rbreqlock; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct mtx_padalign nblock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign bdirtylock; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(), * getnewbuf(), and getblk(). */ static int needsbuffer; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 5 /* number of free buffer queues */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_EMPTYKVA 3 /* empty buffer headers w/KVA assignment */ #define QUEUE_EMPTY 4 /* empty buffer headers */ #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */ /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; #ifdef INVARIANTS static int bq_len[BUFFER_QUEUES]; #endif /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, arg1, arg2, req)); lvalue = *(long *)arg1; if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #endif /* * bqlock: * * Return the appropriate queue lock based on the index. */ static inline struct mtx * bqlock(int qindex) { if (qindex == QUEUE_DIRTY) return (struct mtx *)(&bqdirty); return (struct mtx *)(&bqclean); } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). */ static void bdirtysub(void) { if (atomic_fetchadd_int(&numdirtybuffers, -1) == (lodirtybuffers + hidirtybuffers) / 2) bdirtywakeup(); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(void) { /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ if (atomic_fetchadd_int(&numdirtybuffers, 1) == (lodirtybuffers + hidirtybuffers) / 2) bd_wakeup(); } /* * bufspacewakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static __inline void bufspacewakeup(void) { /* * If someone is waiting for BUF space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ mtx_lock(&nblock); if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; wakeup(&needsbuffer); } mtx_unlock(&nblock); } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. */ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } /* * bufcountadd: * * Called when a buffer has been added to one of the free queues to * account for the buffer and to wakeup anyone waiting for free buffers. * This typically occurs when large amounts of metadata are being handled * by the buffer cache ( else buffer space runs out first, usually ). */ static __inline void bufcountadd(struct buf *bp) { int old; KASSERT((bp->b_flags & B_INFREECNT) == 0, ("buf %p already counted as free", bp)); bp->b_flags |= B_INFREECNT; old = atomic_fetchadd_int(&numfreebuffers, 1); KASSERT(old >= 0 && old < nbuf, ("numfreebuffers climbed to %d", old + 1)); mtx_lock(&nblock); if (needsbuffer) { needsbuffer &= ~VFS_BIO_NEED_ANY; if (numfreebuffers >= hifreebuffers) needsbuffer &= ~VFS_BIO_NEED_FREE; wakeup(&needsbuffer); } mtx_unlock(&nblock); } /* * bufcountsub: * * Decrement the numfreebuffers count as needed. */ static void bufcountsub(struct buf *bp) { int old; /* * Fixup numfreebuffers count. If the buffer is invalid or not * delayed-write, the buffer was free and we must decrement * numfreebuffers. */ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { KASSERT((bp->b_flags & B_INFREECNT) != 0, ("buf %p not counted in numfreebuffers", bp)); bp->b_flags &= ~B_INFREECNT; old = atomic_fetchadd_int(&numfreebuffers, -1); KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1)); } } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { VM_OBJECT_ASSERT_LOCKED(m->object); if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static __inline void bd_wakeup(void) { mtx_lock(&bdlock); if (bd_request == 0) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * bd_speedup - speedup the buffer cache flushing code */ void bd_speedup(void) { int needwake; mtx_lock(&bdlock); needwake = 0; if (bd_speedupreq == 0 || bd_request == 0) needwake = 1; bd_speedupreq = 1; bd_request = 1; if (needwake) wakeup(&bd_request); mtx_unlock(&bdlock); } #ifdef __i386__ #define TRANSIENT_DENOM 5 #else #define TRANSIENT_DENOM 10 #endif /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 32 * 1024 * 1024 / (factor * 5)); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; tuned_nbuf = 1; } else tuned_nbuf = 0; /* XXX Avoid unsigned long overflows later on with maxbufspace. */ maxbuf = (LONG_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) { if (!tuned_nbuf) printf("Warning: nbufs lowered from %d to %ld\n", nbuf, maxbuf); nbuf = maxbuf; } /* * Ideal allocation size for the transient bio submap is 10% * of the maximal space buffer map. This roughly corresponds * to the amount of the buffer mapped for typical UFS load. * * Clip the buffer map to reserve space for the transient * BIOs, if its extent is bigger than 90% (80% on i386) of the * maximum buffer map extent on the platform. * * The fall-back to the maxbuf in case of maxbcache unset, * allows to not trim the buffer KVA for the architectures * with ample KVA space. */ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > MAXPHYS) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / MAXPHYS; /* * Artifically limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = max(min(nbuf/4, 256), 16); #ifdef NSWBUF_MIN if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; #endif /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF); mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL | B_INFREECNT; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_xflags = 0; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); #ifdef INVARIANTS bq_len[QUEUE_EMPTY]++; #endif } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by buf_daemon. hibufspace is the nominal maximum * used by most other processes. The differential is required to * ensure that buf_daemon is able to run when other processes might * be blocked waiting for buffer space. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); lobufspace = hibufspace - MAXBSIZE; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBSIZE), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, MAXBSIZE); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on average * (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occuring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers cannot * eat up all available buffer space. This occurs when our minimum cannot * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming * BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * Try to keep the number of free buffers in the specified range, * and give special processes (e.g. like buf_daemon) access to an * emergency reserve. */ lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT((bp->b_flags & B_UNMAPPED) == 0, ("mapped buf %p %x", bp, bp->b_flags)); KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED, ("unmapped buf %p %x", bp, bp->b_flags)); KASSERT(bp->b_kvabase == unmapped_buf, ("unmapped buf: corrupted b_kvabase %p", bp)); KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } /* * bfreekva() - free the kva allocation for a buffer. * * Since this call frees up buffer space, we call bufspacewakeup(). */ static void bfreekva(struct buf *bp) { if (bp->b_kvasize == 0) return; atomic_add_int(&buffreekvacnt, 1); atomic_subtract_long(&bufspace, bp->b_kvasize); if ((bp->b_flags & B_UNMAPPED) == 0) { BUF_CHECK_MAPPED(bp); vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); } else { BUF_CHECK_UNMAPPED(bp); if ((bp->b_flags & B_KVAALLOC) != 0) { vmem_free(buffer_arena, (vm_offset_t)bp->b_kvaalloc, bp->b_kvasize); } atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize); bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC); } bp->b_kvasize = 0; bufspacewakeup(); } /* * binsfree: * * Insert the buffer into the appropriate free list. */ static void binsfree(struct buf *bp, int qindex) { struct mtx *olock, *nlock; BUF_ASSERT_XLOCKED(bp); olock = bqlock(bp->b_qindex); nlock = bqlock(qindex); mtx_lock(olock); /* Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) bremfreel(bp); if (bp->b_qindex != QUEUE_NONE) panic("binsfree: free buffer onto another queue???"); bp->b_qindex = qindex; if (olock != nlock) { mtx_unlock(olock); mtx_lock(nlock); } if (bp->b_flags & B_AGE) TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); else TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS bq_len[bp->b_qindex]++; #endif mtx_unlock(nlock); /* * Something we can maybe free or reuse. */ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) bufspacewakeup(); if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) bufcountadd(bp); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; bufcountsub(bp); } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct mtx *qlock; qlock = bqlock(bp->b_qindex); mtx_lock(qlock); bremfreel(bp); mtx_unlock(qlock); } /* * bremfreel: * * Removes a buffer from the free list, must be called with the * correct qlock held. */ static void bremfreel(struct buf *bp) { CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); mtx_assert(bqlock(bp->b_qindex), MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); #ifdef INVARIANTS KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow", bp->b_qindex)); bq_len[bp->b_qindex]--; #endif bp->b_qindex = QUEUE_NONE; /* * If this was a delayed bremfree() we only need to remove the buffer * from the queue and return the stats are already done. */ if (bp->b_flags & B_REMFREE) { bp->b_flags &= ~B_REMFREE; return; } bufcountsub(bp); } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred) { struct buf *rabp; int i; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) curthread->td_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } else { brelse(rabp); } } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. */ int breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp) { struct buf *bp; int rv = 0, readwait = 0; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); /* * Can only return NULL if GB_LOCK_NOWAIT flag is specified. */ *bpp = bp = getblk(vp, blkno, size, 0, 0, flags); if (bp == NULL) return (EBUSY); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) curthread->td_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } breada(vp, rablkno, rabsize, cnt, cred); if (readwait) { rv = bufwait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (bp->b_flags & B_BARRIER) barrierwrites++; oldflags = bp->b_flags; BUF_ASSERT_HELD(bp); if (bp->b_pin_count > 0) bunpin_wait(bp); KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent other thread from seeing * empty dirty list and zero counter for writes in progress, * falsely indicating that the bufobj is clean. */ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); if (!TD_IS_IDLETHREAD(curthread)) curthread->td_ru.ru_oublock++; if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. */ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); BUF_ASSERT_HELD(bp); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { mtx_lock(&bdirtylock); while (numdirtybuffers >= hidirtybuffers) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } static __noinline int buf_vm_page_count_severe(void) { KFAIL_POINT_CODE(DEBUG_FP, buf_pressure, return 1); return vm_page_count_severe(); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && bp->b_error == EIO && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. Must clear BIO_ERROR to prevent * pages from being scrapped. If the error is anything * other than an I/O error (EIO), assume that retrying * is futile. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed I/O or we were asked to free or not * cache the buffer. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. * * If B_DELWRI is not set we may have to set B_RELBUF if we are low * on pages to return pages to the VM page queues. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; else if (buf_vm_page_count_severe()) { /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if (!(bp->b_vflags & BV_BKGRDINPROG)) bp->b_flags |= B_RELBUF; } /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI)) ) { int i, j, resid; vm_page_t m; off_t foff; vm_pindex_t poff; vm_object_t obj; obj = bp->b_bufobj->bo_object; /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ resid = bp->b_bufsize; foff = bp->b_offset; for (i = 0; i < bp->b_npages; i++) { int had_bogus = 0; m = bp->b_pages[i]; /* * If we hit a bogus page, fixup *all* the bogus pages * now. */ if (m == bogus_page) { poff = OFF_TO_IDX(bp->b_offset); had_bogus = 1; VM_OBJECT_RLOCK(obj); for (j = i; j < bp->b_npages; j++) { vm_page_t mtmp; mtmp = bp->b_pages[j]; if (mtmp == bogus_page) { mtmp = vm_page_lookup(obj, poff + j); if (!mtmp) { panic("brelse: page missing\n"); } bp->b_pages[j] = mtmp; } } VM_OBJECT_RUNLOCK(obj); if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) { BUF_CHECK_MAPPED(bp); pmap_qenter( trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } m = bp->b_pages[i]; } if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) { int poffset = foff & PAGE_MASK; int presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); VM_OBJECT_WLOCK(obj); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, "mbncsh"); VM_OBJECT_WLOCK(obj); } if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); VM_OBJECT_WUNLOCK(obj); if (had_bogus) printf("avoided corruption bug in bogus_page/brelse code\n"); } resid -= PAGE_SIZE - (foff & PAGE_MASK); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } else if (bp->b_flags & B_VMIO) { if (bp->b_flags & (B_INVAL | B_RELBUF)) { vfs_vmio_release(bp); } } else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) { if (bp->b_bufsize != 0) allocbuf(bp, 0); if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents signal it and eventually * clean up B_DELWRI and diassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_kvasize) qindex = QUEUE_EMPTYKVA; else qindex = QUEUE_EMPTY; bp->b_flags |= B_AGE; /* buffers with junk contents */ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; binsfree(bp, qindex); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if (bp->b_flags & B_DELWRI) { qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if (buf_vm_page_count_severe() && (bp->b_vflags & BV_BKGRDINPROG) == 0) { /* * We are too low on memory, we have to try to free * the buffer (most importantly: the wired pages * making up its backing store) *now*. */ brelse(bp); return; } qindex = QUEUE_CLEAN; } binsfree(bp, qindex); out: /* unlock */ BUF_UNLOCK(bp); } /* Give pages used by the bp back to the VM system (where possible) */ static void vfs_vmio_release(struct buf *bp) { int i; vm_page_t m; if ((bp->b_flags & B_UNMAPPED) == 0) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; bp->b_pages[i] = NULL; /* * In order to keep page LRU ordering consistent, put * everything on the inactive queue. */ vm_page_lock(m); - vm_page_unwire(m, 0); + vm_page_unwire(m, PQ_INACTIVE); /* * Might as well free the page if we can and it has * no valid data. We also free the page if the * buffer was used for direct I/O */ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) { if (m->wire_count == 0 && !vm_page_busied(m)) vm_page_free(m); } else if (bp->b_flags & B_DIRECT) vm_page_try_to_free(m); else if (buf_vm_page_count_severe()) vm_page_try_to_cache(m); vm_page_unlock(m); } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); if (bp->b_bufsize) { bufspacewakeup(); bp->b_bufsize = 0; } bp->b_npages = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } static void setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags) { KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 && bp->b_kvasize == 0, ("call bfreekva(%p)", bp)); if ((gbflags & GB_UNMAPPED) == 0) { bp->b_kvabase = (caddr_t)addr; } else if ((gbflags & GB_KVAALLOC) != 0) { KASSERT((gbflags & GB_UNMAPPED) != 0, ("GB_KVAALLOC without GB_UNMAPPED")); bp->b_kvaalloc = (caddr_t)addr; bp->b_flags |= B_UNMAPPED | B_KVAALLOC; atomic_add_long(&unmapped_bufspace, bp->b_kvasize); } bp->b_kvasize = maxsize; } /* * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if * needed. */ static int allocbufkva(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; bfreekva(bp); addr = 0; if (vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr)) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. */ atomic_add_int(&bufdefragcnt, 1); return (1); } setbufkva(bp, addr, maxsize, gbflags); atomic_add_long(&bufspace, bp->b_kvasize); return (0); } /* * Ask the bufdaemon for help, or act as bufdaemon itself, when a * locked vnode is supplied. */ static void getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo, int defrag) { struct thread *td; char *waitmsg; int cnt, error, flags, norunbuf, wait; mtx_assert(&bqclean, MA_OWNED); if (defrag) { flags = VFS_BIO_NEED_BUFSPACE; waitmsg = "nbufkv"; } else if (bufspace >= hibufspace) { waitmsg = "nbufbs"; flags = VFS_BIO_NEED_BUFSPACE; } else { waitmsg = "newbuf"; flags = VFS_BIO_NEED_ANY; } mtx_lock(&nblock); needsbuffer |= flags; mtx_unlock(&nblock); mtx_unlock(&bqclean); bd_speedup(); /* heeeelp */ if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; cnt = 0; wait = MNT_NOWAIT; mtx_lock(&nblock); while (needsbuffer & flags) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { mtx_unlock(&nblock); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make a progress that * cannot be achieved by the buf_daemon, that * cannot lock the vnode. */ if (cnt++ > 2) wait = MNT_WAIT; ASSERT_VOP_LOCKED(vp, "bufd_helper"); error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : vn_lock(vp, LK_TRYUPGRADE); if (error == 0) { /* play bufdaemon */ norunbuf = curthread_pflags_set(TDP_BUFNEED | TDP_NORUNNINGBUF); VOP_FSYNC(vp, wait, td); atomic_add_long(¬bufdflushes, 1); curthread_pflags_restore(norunbuf); } mtx_lock(&nblock); if ((needsbuffer & flags) == 0) break; } if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) break; } mtx_unlock(&nblock); } static void getnewbuf_reuse_bp(struct buf *bp, int qindex) { CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d " "queue %d (recycling)", bp, bp->b_vp, bp->b_flags, bp->b_kvasize, bp->b_bufsize, qindex); mtx_assert(&bqclean, MA_NOTOWNED); /* * Note: we no longer distinguish between VMIO and non-VMIO * buffers. */ KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); if (qindex == QUEUE_CLEAN) { if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; vfs_vmio_release(bp); } if (bp->b_vp != NULL) brelvp(bp); } /* * Get the rest of the buffer freed up. b_kva* is still valid * after this operation. */ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 3"); KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d", bp, bp->b_vp, qindex)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); if (bp->b_bufsize) allocbuf(bp, 0); bp->b_flags &= B_UNMAPPED | B_KVAALLOC; bp->b_ioflags = 0; bp->b_xflags = 0; KASSERT((bp->b_flags & B_INFREECNT) == 0, ("buf %p still counted as free?", bp)); bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_pin_count = 0; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); } static int flushingbufs; static struct buf * getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata) { struct buf *bp, *nbp; int nqindex, qindex, pass; KASSERT(!unmapped || !defrag, ("both unmapped and defrag")); pass = 1; restart: atomic_add_int(&getnewbufrestarts, 1); /* * Setup for scan. If we do not have enough free buffers, * we setup a degenerate case that immediately fails. Note * that if we are specially marked process, we are allowed to * dip into our reserves. * * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN * for the allocation of the mapped buffer. For unmapped, the * easiest is to start with EMPTY outright. * * We start with EMPTYKVA. If the list is empty we backup to EMPTY. * However, there are a number of cases (defragging, reusing, ...) * where we cannot backup. */ nbp = NULL; mtx_lock(&bqclean); if (!defrag && unmapped) { nqindex = QUEUE_EMPTY; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); } if (nbp == NULL) { nqindex = QUEUE_EMPTYKVA; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); } /* * If no EMPTYKVA buffers and we are either defragging or * reusing, locate a CLEAN buffer to free or reuse. If * bufspace useage is low skip this step so we can allocate a * new buffer. */ if (nbp == NULL && (defrag || bufspace >= lobufspace)) { nqindex = QUEUE_CLEAN; nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); } /* * If we could not find or were not allowed to reuse a CLEAN * buffer, check to see if it is ok to use an EMPTY buffer. * We can only use an EMPTY buffer if allocating its KVA would * not otherwise run us out of buffer space. No KVA is needed * for the unmapped allocation. */ if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace || metadata)) { nqindex = QUEUE_EMPTY; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); } /* * All available buffers might be clean, retry ignoring the * lobufspace as the last resort. */ if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) { nqindex = QUEUE_CLEAN; nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); } /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { qindex = nqindex; /* * Calculate next bp (we can only use it if we do not * block or do other fancy things). */ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { switch (qindex) { case QUEUE_EMPTY: nqindex = QUEUE_EMPTYKVA; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); if (nbp != NULL) break; /* FALLTHROUGH */ case QUEUE_EMPTYKVA: nqindex = QUEUE_CLEAN; nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); if (nbp != NULL) break; /* FALLTHROUGH */ case QUEUE_CLEAN: if (metadata && pass == 1) { pass = 2; nqindex = QUEUE_EMPTY; nbp = TAILQ_FIRST( &bufqueues[QUEUE_EMPTY]); } /* * nbp is NULL. */ break; } } /* * If we are defragging then we need a buffer with * b_kvasize != 0. XXX this situation should no longer * occur, if defrag is non-zero the buffer's b_kvasize * should also be non-zero at this point. XXX */ if (defrag && bp->b_kvasize == 0) { printf("Warning: defrag empty buffer %p\n", bp); continue; } /* * Start freeing the bp. This is somewhat involved. nbp * remains valid only for QUEUE_EMPTY[KVA] bp's. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if (bp->b_vflags & BV_BKGRDINPROG) { BUF_UNLOCK(bp); continue; } KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); bremfreel(bp); mtx_unlock(&bqclean); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ getnewbuf_reuse_bp(bp, qindex); mtx_assert(&bqclean, MA_NOTOWNED); /* * If we are defragging then free the buffer. */ if (defrag) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); defrag = 0; goto restart; } /* * Notify any waiters for the buffer lock about * identity change by freeing the buffer. */ if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } if (metadata) break; /* * If we are overcomitted then recover the buffer and its * KVM space. This occurs in rare situations when multiple * processes are blocked in getnewbuf() or allocbuf(). */ if (bufspace >= hibufspace) flushingbufs = 1; if (flushingbufs && bp->b_kvasize != 0) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } if (bufspace < lobufspace) flushingbufs = 0; break; } return (bp); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * Important: B_INVAL is not set. If the caller wishes to throw the * buffer away, the caller must set B_INVAL prior to calling brelse(). * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize, int gbflags) { struct buf *bp; int defrag, metadata; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); defrag = 0; if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = 1; else metadata = 0; /* * We can't afford to block since we might be holding a vnode lock, * which may prevent system daemons from running. We deal with * low-memory situations by proactively returning memory and running * async I/O rather then sync I/O. */ atomic_add_int(&getnewbufcalls, 1); atomic_subtract_int(&getnewbufrestarts, 1); restart: bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED, metadata); if (bp != NULL) defrag = 0; /* * If we exhausted our list, sleep as appropriate. We may have to * wakeup various daemons and write out some dirty buffers. * * Generally we are sleeping due to insufficient buffer space. */ if (bp == NULL) { mtx_assert(&bqclean, MA_OWNED); getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag); mtx_assert(&bqclean, MA_NOTOWNED); } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) { mtx_assert(&bqclean, MA_NOTOWNED); bfreekva(bp); bp->b_flags |= B_UNMAPPED; bp->b_kvabase = bp->b_data = unmapped_buf; bp->b_kvasize = maxsize; atomic_add_long(&bufspace, bp->b_kvasize); atomic_add_long(&unmapped_bufspace, bp->b_kvasize); atomic_add_int(&bufreusecnt, 1); } else { mtx_assert(&bqclean, MA_NOTOWNED); /* * We finally have a valid bp. We aren't quite out of the * woods, we still have to reserve kva space. In order * to keep fragmentation sane we only allocate kva in * BKVASIZE chunks. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == B_UNMAPPED) { if (allocbufkva(bp, maxsize, gbflags)) { defrag = 1; bp->b_flags |= B_INVAL; brelse(bp); goto restart; } atomic_add_int(&bufreusecnt, 1); } else if ((bp->b_flags & B_KVAALLOC) != 0 && (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) { /* * If the reused buffer has KVA allocated, * reassign b_kvaalloc to b_kvabase. */ bp->b_kvabase = bp->b_kvaalloc; bp->b_flags &= ~B_KVAALLOC; atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize); atomic_add_int(&bufreusecnt, 1); } else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 && (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED | GB_KVAALLOC)) { /* * The case of reused buffer already have KVA * mapped, but the request is for unmapped * buffer with KVA allocated. */ bp->b_kvaalloc = bp->b_kvabase; bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_flags |= B_UNMAPPED | B_KVAALLOC; atomic_add_long(&unmapped_bufspace, bp->b_kvasize); atomic_add_int(&bufreusecnt, 1); } if ((gbflags & GB_UNMAPPED) == 0) { bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; bp->b_flags &= ~B_UNMAPPED; BUF_CHECK_MAPPED(bp); } } return (bp); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(int target) { int flushed; flushed = flushbufqueues(target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ flushed = flushbufqueues(target, 1); } return (flushed); } static void buf_daemon() { int lodirty; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kproc_suspend_check(bufdaemonproc); lodirty = lodirtybuffers; if (bd_speedupreq) { lodirty = numdirtybuffers / 2; bd_speedupreq = 0; } /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. */ while (numdirtybuffers > lodirty) { if (buf_flush(numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. */ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int flushbufqueues(int target, int flushdeps) { struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int queue; int error; flushed = 0; queue = QUEUE_DIRTY; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; mtx_lock(&bqdirty); TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqdirty); while (flushed != target) { maybe_yield(); mtx_lock(&bqdirty); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel, b_freelist); } else { mtx_unlock(&bqdirty); break; } KASSERT(bp->b_qindex != QUEUE_SENTINEL, ("parallel calls to flushbufqueues() bp %p", bp)); error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); mtx_unlock(&bqdirty); if (error != 0) continue; if (bp->b_pin_count > 0) { BUF_UNLOCK(bp); continue; } /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); vfs_bio_awrite(bp); vn_finished_write(mp); VOP_UNLOCK(vp, 0); flushwithdeps += hasdeps; flushed++; if (runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } mtx_lock(&bqdirty); TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist); mtx_unlock(&bqdirty); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; BO_RLOCK(bo); bp = gbincore(bo, blkno); BO_RUNLOCK(bo); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_RLOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_RUNLOCK(obj); return 1; notinmem: VM_OBJECT_RUNLOCK(obj); return (0); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); vfs_drain_busy_pages(bp); vfs_setdirty_locked_object(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } static void vfs_setdirty_locked_object(struct buf *bp) { vm_object_t object; int i; object = bp->b_bufobj->bo_object; VM_OBJECT_ASSERT_WLOCKED(object); /* * We qualify the scan for modified pages on whether the * object has been flushed yet. */ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * Allocate the KVA mapping for an existing buffer. It handles the * cases of both B_UNMAPPED buffer, and buffer with the preallocated * KVA which is not mapped (B_KVAALLOC). */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { struct buf *scratch_bp; int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = (bp->b_flags & B_UNMAPPED) != 0 && (gbflags & GB_UNMAPPED) == 0; need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ bp->b_flags &= ~B_KVAALLOC; KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0")); bp->b_kvabase = bp->b_kvaalloc; atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize); goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); mapping_loop: if (allocbufkva(bp, maxsize, gbflags)) { /* * Request defragmentation. getnewbuf() returns us the * allocated space by the scratch buffer KVA. */ scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags | (GB_UNMAPPED | GB_KVAALLOC)); if (scratch_bp == NULL) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp); } atomic_add_int(&mappingrestarts, 1); goto mapping_loop; } KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0, ("scratch bp !B_KVAALLOC %p", scratch_bp)); setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc, scratch_bp->b_kvasize, gbflags); /* Get rid of the scratch buffer. */ scratch_bp->b_kvasize = 0; scratch_bp->b_flags |= B_INVAL; scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC); brelse(scratch_bp); } if (!need_mapping) return; has_addr: bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */ bp->b_flags &= ~B_UNMAPPED; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successfull read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; struct bufobj *bo; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error) return (NULL); /* If recursed, assume caller knows the rules. */ else if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { /* * If buffer is pinned and caller does * not want sleep waiting for it to be * unpinned, bail out * */ if (bp->b_pin_count > 0) { if (flags & GB_LOCK_NOWAIT) { bqrelse(bp); return (NULL); } else { bunpin_wait(bp); } } bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ if (bp->b_bcount != size) allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ BO_RUNLOCK(bo); /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return NULL; if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread)) return NULL; bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; offset = blkno * bsize; vmio = vp->v_object != NULL; if (vmio) { maxsize = size + (offset & PAGE_MASK); } else { maxsize = size; /* Do not allow non-VMIO notmapped buffers. */ flags &= ~GB_UNMAPPED; } maxsize = imax(maxsize, bsize); bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return NULL; goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); BUF_ASSERT_HELD(bp); end: KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ BUF_ASSERT_HELD(bp); return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize, mbsize; int i; BUF_ASSERT_HELD(bp); if (bp->b_kvasize < size) panic("allocbuf: buffer too small"); if ((bp->b_flags & B_VMIO) == 0) { caddr_t origbuf; int origbufsize; /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); if (bp->b_flags & B_MALLOC) newbsize = mbsize; else newbsize = round_page(size); if (newbsize < bp->b_bufsize) { /* * malloced buffers are not shrunk */ if (bp->b_flags & B_MALLOC) { if (newbsize) { bp->b_bcount = size; } else { free(bp->b_data, M_BIOBUF); if (bp->b_bufsize) { atomic_subtract_long( &bufmallocspace, bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; bp->b_bcount = 0; bp->b_flags &= ~B_MALLOC; } return 1; } vm_hold_free_pages(bp, newbsize); } else if (newbsize > bp->b_bufsize) { /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. */ /* * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if ( (bufmallocspace < maxbufmallocspace) && (bp->b_bufsize == 0) && (mbsize <= PAGE_SIZE/2)) { bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); bp->b_bufsize = mbsize; bp->b_bcount = size; bp->b_flags |= B_MALLOC; atomic_add_long(&bufmallocspace, mbsize); return 1; } origbuf = NULL; origbufsize = 0; /* * If the buffer is growing on its other-than-first allocation, * then we revert to the page-allocation scheme. */ if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; if (bp->b_bufsize) { atomic_subtract_long(&bufmallocspace, bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } } } else { int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) { /* * DEV_BSIZE aligned new buffer size is less then the * DEV_BSIZE aligned existing buffer size. Figure out * if we have to remove any pages. */ if (desiredpages < bp->b_npages) { vm_page_t m; if ((bp->b_flags & B_UNMAPPED) == 0) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page( (vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); } else BUF_CHECK_UNMAPPED(bp); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = desiredpages; i < bp->b_npages; i++) { /* * the page is not freed here -- it * is the responsibility of * vnode_pager_setsize */ m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); while (vm_page_sleep_if_busy(m, "biodep")) continue; bp->b_pages[i] = NULL; vm_page_lock(m); - vm_page_unwire(m, 0); + vm_page_unwire(m, PQ_INACTIVE); vm_page_unlock(m); } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); bp->b_npages = desiredpages; } } else if (size > bp->b_bcount) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; /* * Step 1, bring in the VM pages from the object, * allocating them if necessary. We must clear * B_CACHE if these pages are not valid for the * range covered by the buffer. */ obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); while (bp->b_npages < desiredpages) { vm_page_t m; /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_COUNT(desiredpages - bp->b_npages)); if (m->valid == 0) bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), new the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; vfs_buf_test_cache( bp, bp->b_offset, toff, tinc, bp->b_pages[pi] ); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_WUNLOCK(obj); /* * Step 3, fixup the KVM pmap. */ if ((bp->b_flags & B_UNMAPPED) == 0) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } } if (newbsize < bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = newbsize; /* actual buffer allocation */ bp->b_bcount = size; /* requested buffer size */ return 1; } extern int inflight_transient_maps; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); pmap_qremove(start, OFF_TO_IDX(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; if (done == NULL) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else { bp->bio_flags |= BIO_DONE; done(bp); } } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wchan, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * Call back function from struct bio back up to struct buf. */ static void bufdonebio(struct bio *bip) { struct buf *bp; bp = bip->bio_caller2; bp->b_resid = bip->bio_resid; bp->b_ioflags = bip->bio_flags; bp->b_error = bip->bio_error; if (bp->b_error) bp->b_ioflags |= BIO_ERROR; bufdone(bp); g_destroy_bio(bip); } void dev_strategy(struct cdev *dev, struct buf *bp) { struct cdevsw *csw; int ref; KASSERT(dev->si_refcount > 0, ("dev_strategy on un-referenced struct cdev *(%s) %p", devtoname(dev), dev)); csw = dev_refthread(dev, &ref); dev_strategy_csw(dev, csw, bp); dev_relthread(dev, ref); } void dev_strategy_csw(struct cdev *dev, struct cdevsw *csw, struct buf *bp) { struct bio *bip; KASSERT(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE, ("b_iocmd botch")); KASSERT(((dev->si_flags & SI_ETERNAL) != 0 && csw != NULL) || dev->si_threadcount > 0, ("dev_strategy_csw threadcount cdev *(%s) %p", devtoname(dev), dev)); if (csw == NULL) { bp->b_error = ENXIO; bp->b_ioflags = BIO_ERROR; bufdone(bp); return; } for (;;) { bip = g_new_bio(); if (bip != NULL) break; /* Try again later */ tsleep(&bp, PRIBIO, "dev_strat", hz/10); } bip->bio_cmd = bp->b_iocmd; bip->bio_offset = bp->b_iooffset; bip->bio_length = bp->b_bcount; bip->bio_bcount = bp->b_bcount; /* XXX: remove */ bdata2bio(bp, bip); bip->bio_done = bufdonebio; bip->bio_caller2 = bp; bip->bio_dev = dev; (*csw->d_strategy)(bip); } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occured, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existance * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); BUF_ASSERT_HELD(bp); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } bufdone_finish(bp); if (dropobj) bufobj_wdrop(dropobj); } void bufdone_finish(struct buf *bp) { BUF_ASSERT_HELD(bp); if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp; int bogus, i, iosize; obj = bp->b_bufobj->bo_object; KASSERT(obj->paging_in_progress >= bp->b_npages, ("biodone_finish: paging in progress(%d) < b_npages(%d)", obj->paging_in_progress, bp->b_npages)); vp = bp->b_vp; KASSERT(vp->v_holdcnt > 0, ("biodone_finish: vnode %p has zero hold count", vp)); KASSERT(vp->v_object != NULL, ("biodone_finish: vnode %p has no vm_object", vp)); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("biodone_finish: bp %p has no buffer offset", bp)); /* * Set B_CACHE if the op was a normal read and no error * occured. B_CACHE is set for writes in the b*write() * routines. */ iosize = bp->b_bcount - bp->b_resid; if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_CACHE; } bogus = 0; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; int resid; resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("biodone_finish: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) { KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("bufdone_finish:" " page %p has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } vm_page_sunbusy(m); vm_object_pip_subtract(obj, 1); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, 0); VM_OBJECT_WUNLOCK(obj); if (bogus && (bp->b_flags & B_UNMAPPED) == 0) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if ((bp->b_flags & B_UNMAPPED) == 0) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_object_pip_subtract(obj, 1); vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, 0); VM_OBJECT_WUNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. */ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundry or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Ensure that all buffer pages are not exclusive busied. If any page is * exclusive busy, drain it. */ void vfs_drain_busy_pages(struct buf *bp) { vm_page_t m; int i, last_busied; VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); last_busied = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_xbusied(m)) { for (; last_busied < i; last_busied++) vm_page_sbusy(bp->b_pages[last_busied]); while (vm_page_xbusied(m)) { vm_page_lock(m); VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); vm_page_busy_sleep(m, "vbpage"); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); } } } for (i = 0; i < last_busied; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistant state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { int i, bogus; vm_object_t obj; vm_ooffset_t foff; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); VM_OBJECT_WLOCK(obj); vfs_drain_busy_pages(bp); if (bp->b_bufsize != 0) vfs_setdirty_locked_object(bp); bogus = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_sbusy(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus++; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_WUNLOCK(obj); if (bogus && (bp->b_flags & B_UNMAPPED) == 0) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if ((bp->b_pages[0]->valid & mask) == 0) { pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } bp->b_pages[i]->valid |= mask; } unlock: VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if ((bp->b_flags & B_UNMAPPED) == 0) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. */ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT)); if (p == NULL) { VM_WAIT; goto tryagain; } pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; if (vm_page_sbusied(p)) printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); vm_page_free(p); } bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. */ int vmapbuf(struct buf *bp, int mapbuf) { caddr_t kva; vm_prot_t prot; int pidx; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, btoc(MAXPHYS))) < 0) return (-1); bp->b_npages = pidx; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx); kva = bp->b_saveaddr; bp->b_saveaddr = bp->b_data; bp->b_data = kva + (((vm_offset_t)bp->b_data) & PAGE_MASK); bp->b_flags &= ~B_UNMAPPED; } else { bp->b_flags |= B_UNMAPPED; bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; bp->b_saveaddr = bp->b_data; bp->b_data = unmapped_buf; } return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. */ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (bp->b_flags & B_UNMAPPED) bp->b_flags &= ~B_UNMAPPED; else pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = bp->b_saveaddr; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i = 0; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } void bpin(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_pin_count++; mtx_unlock(mtxp); } void bunpin(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); if (--bp->b_pin_count == 0) wakeup(bp); mtx_unlock(mtxp); } void bunpin_wait(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while (bp->b_pin_count > 0) msleep(bp, mtxp, PRIBIO, "bwunpin", 0); mtx_unlock(mtxp); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if ((bp->b_flags & B_UNMAPPED) != 0) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, " "b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_dep.lh_first); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } db_printf(" "); BUF_LOCKPRINTINFO(bp); } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND(countfreebufs, db_coundfreebufs) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = &buf[i]; if ((bp->b_flags & B_INFREECNT) != 0) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ Index: user/attilio/rm_vmobj_cache/sys/net/bpf_zerocopy.c =================================================================== --- user/attilio/rm_vmobj_cache/sys/net/bpf_zerocopy.c (revision 267236) +++ user/attilio/rm_vmobj_cache/sys/net/bpf_zerocopy.c (revision 267237) @@ -1,599 +1,599 @@ /*- * Copyright (c) 2007 Seccuris Inc. * All rights reserved. * * This software was developed by Robert N. M. Watson under contract to * Seccuris Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_bpf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Zero-copy buffer scheme for BPF: user space "donates" two buffers, which * are mapped into the kernel address space using sf_bufs and used directly * by BPF. Memory is wired since page faults cannot be tolerated in the * contexts where the buffers are copied to (locks held, interrupt context, * etc). Access to shared memory buffers is synchronized using a header on * each buffer, allowing the number of system calls to go to zero as BPF * reaches saturation (buffers filled as fast as they can be drained by the * user process). Full details of the protocol for communicating between the * user process and BPF may be found in bpf(4). */ /* * Maximum number of pages per buffer. Since all BPF devices use two, the * maximum per device is 2*BPF_MAX_PAGES. Resource limits on the number of * sf_bufs may be an issue, so do not set this too high. On older systems, * kernel address space limits may also be an issue. */ #define BPF_MAX_PAGES 512 /* * struct zbuf describes a memory buffer loaned by a user process to the * kernel. We represent this as a series of pages managed using an array of * sf_bufs. Even though the memory is contiguous in user space, it may not * be mapped contiguously in the kernel (i.e., a set of physically * non-contiguous pages in the direct map region) so we must implement * scatter-gather copying. One significant mitigating factor is that on * systems with a direct memory map, we can avoid TLB misses. * * At the front of the shared memory region is a bpf_zbuf_header, which * contains shared control data to allow user space and the kernel to * synchronize; this is included in zb_size, but not bpf_bufsize, so that BPF * knows that the space is not available. */ struct zbuf { vm_offset_t zb_uaddr; /* User address at time of setup. */ size_t zb_size; /* Size of buffer, incl. header. */ u_int zb_numpages; /* Number of pages. */ int zb_flags; /* Flags on zbuf. */ struct sf_buf **zb_pages; /* Pages themselves. */ struct bpf_zbuf_header *zb_header; /* Shared header. */ }; /* * When a buffer has been assigned to userspace, flag it as such, as the * buffer may remain in the store position as a result of the user process * not yet having acknowledged the buffer in the hold position yet. */ #define ZBUF_FLAG_ASSIGNED 0x00000001 /* Set when owned by user. */ /* * Release a page we've previously wired. */ static void zbuf_page_free(vm_page_t pp) { vm_page_lock(pp); - vm_page_unwire(pp, 0); + vm_page_unwire(pp, PQ_INACTIVE); if (pp->wire_count == 0 && pp->object == NULL) vm_page_free(pp); vm_page_unlock(pp); } /* * Free an sf_buf with attached page. */ static void zbuf_sfbuf_free(struct sf_buf *sf) { vm_page_t pp; pp = sf_buf_page(sf); sf_buf_free(sf); zbuf_page_free(pp); } /* * Free a zbuf, including its page array, sbufs, and pages. Allow partially * allocated zbufs to be freed so that it may be used even during a zbuf * setup. */ static void zbuf_free(struct zbuf *zb) { int i; for (i = 0; i < zb->zb_numpages; i++) { if (zb->zb_pages[i] != NULL) zbuf_sfbuf_free(zb->zb_pages[i]); } free(zb->zb_pages, M_BPF); free(zb, M_BPF); } /* * Given a user pointer to a page of user memory, return an sf_buf for the * page. Because we may be requesting quite a few sf_bufs, prefer failure to * deadlock and use SFB_NOWAIT. */ static struct sf_buf * zbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr) { struct sf_buf *sf; vm_page_t pp; if (vm_fault_quick_hold_pages(map, uaddr, PAGE_SIZE, VM_PROT_READ | VM_PROT_WRITE, &pp, 1) < 0) return (NULL); vm_page_lock(pp); vm_page_wire(pp); vm_page_unhold(pp); vm_page_unlock(pp); sf = sf_buf_alloc(pp, SFB_NOWAIT); if (sf == NULL) { zbuf_page_free(pp); return (NULL); } return (sf); } /* * Create a zbuf describing a range of user address space memory. Validate * page alignment, size requirements, etc. */ static int zbuf_setup(struct thread *td, vm_offset_t uaddr, size_t len, struct zbuf **zbp) { struct zbuf *zb; struct vm_map *map; int error, i; *zbp = NULL; /* * User address must be page-aligned. */ if (uaddr & PAGE_MASK) return (EINVAL); /* * Length must be an integer number of full pages. */ if (len & PAGE_MASK) return (EINVAL); /* * Length must not exceed per-buffer resource limit. */ if ((len / PAGE_SIZE) > BPF_MAX_PAGES) return (EINVAL); /* * Allocate the buffer and set up each page with is own sf_buf. */ error = 0; zb = malloc(sizeof(*zb), M_BPF, M_ZERO | M_WAITOK); zb->zb_uaddr = uaddr; zb->zb_size = len; zb->zb_numpages = len / PAGE_SIZE; zb->zb_pages = malloc(sizeof(struct sf_buf *) * zb->zb_numpages, M_BPF, M_ZERO | M_WAITOK); map = &td->td_proc->p_vmspace->vm_map; for (i = 0; i < zb->zb_numpages; i++) { zb->zb_pages[i] = zbuf_sfbuf_get(map, uaddr + (i * PAGE_SIZE)); if (zb->zb_pages[i] == NULL) { error = EFAULT; goto error; } } zb->zb_header = (struct bpf_zbuf_header *)sf_buf_kva(zb->zb_pages[0]); bzero(zb->zb_header, sizeof(*zb->zb_header)); *zbp = zb; return (0); error: zbuf_free(zb); return (error); } /* * Copy bytes from a source into the specified zbuf. The caller is * responsible for performing bounds checking, etc. */ void bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { u_int count, page, poffset; u_char *src_bytes; struct zbuf *zb; KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, ("bpf_zerocopy_append_bytes: not in zbuf mode")); KASSERT(buf != NULL, ("bpf_zerocopy_append_bytes: NULL buf")); src_bytes = (u_char *)src; zb = (struct zbuf *)buf; KASSERT((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0, ("bpf_zerocopy_append_bytes: ZBUF_FLAG_ASSIGNED")); /* * Scatter-gather copy to user pages mapped into kernel address space * using sf_bufs: copy up to a page at a time. */ offset += sizeof(struct bpf_zbuf_header); page = offset / PAGE_SIZE; poffset = offset % PAGE_SIZE; while (len > 0) { KASSERT(page < zb->zb_numpages, ("bpf_zerocopy_append_bytes:" " page overflow (%d p %d np)\n", page, zb->zb_numpages)); count = min(len, PAGE_SIZE - poffset); bcopy(src_bytes, ((u_char *)sf_buf_kva(zb->zb_pages[page])) + poffset, count); poffset += count; if (poffset == PAGE_SIZE) { poffset = 0; page++; } KASSERT(poffset < PAGE_SIZE, ("bpf_zerocopy_append_bytes: page offset overflow (%d)", poffset)); len -= count; src_bytes += count; } } /* * Copy bytes from an mbuf chain to the specified zbuf: copying will be * scatter-gather both from mbufs, which may be fragmented over memory, and * to pages, which may not be contiguously mapped in kernel address space. * As with bpf_zerocopy_append_bytes(), the caller is responsible for * checking that this will not exceed the buffer limit. */ void bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { u_int count, moffset, page, poffset; const struct mbuf *m; struct zbuf *zb; KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, ("bpf_zerocopy_append_mbuf not in zbuf mode")); KASSERT(buf != NULL, ("bpf_zerocopy_append_mbuf: NULL buf")); m = (struct mbuf *)src; zb = (struct zbuf *)buf; KASSERT((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0, ("bpf_zerocopy_append_mbuf: ZBUF_FLAG_ASSIGNED")); /* * Scatter gather both from an mbuf chain and to a user page set * mapped into kernel address space using sf_bufs. If we're lucky, * each mbuf requires one copy operation, but if page alignment and * mbuf alignment work out less well, we'll be doing two copies per * mbuf. */ offset += sizeof(struct bpf_zbuf_header); page = offset / PAGE_SIZE; poffset = offset % PAGE_SIZE; moffset = 0; while (len > 0) { KASSERT(page < zb->zb_numpages, ("bpf_zerocopy_append_mbuf: page overflow (%d p %d " "np)\n", page, zb->zb_numpages)); KASSERT(m != NULL, ("bpf_zerocopy_append_mbuf: end of mbuf chain")); count = min(m->m_len - moffset, len); count = min(count, PAGE_SIZE - poffset); bcopy(mtod(m, u_char *) + moffset, ((u_char *)sf_buf_kva(zb->zb_pages[page])) + poffset, count); poffset += count; if (poffset == PAGE_SIZE) { poffset = 0; page++; } KASSERT(poffset < PAGE_SIZE, ("bpf_zerocopy_append_mbuf: page offset overflow (%d)", poffset)); moffset += count; if (moffset == m->m_len) { m = m->m_next; moffset = 0; } len -= count; } } /* * Notification from the BPF framework that a buffer in the store position is * rejecting packets and may be considered full. We mark the buffer as * immutable and assign to userspace so that it is immediately available for * the user process to access. */ void bpf_zerocopy_buffull(struct bpf_d *d) { struct zbuf *zb; KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, ("bpf_zerocopy_buffull: not in zbuf mode")); zb = (struct zbuf *)d->bd_sbuf; KASSERT(zb != NULL, ("bpf_zerocopy_buffull: zb == NULL")); if ((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0) { zb->zb_flags |= ZBUF_FLAG_ASSIGNED; zb->zb_header->bzh_kernel_len = d->bd_slen; atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1); } } /* * Notification from the BPF framework that a buffer has moved into the held * slot on a descriptor. Zero-copy BPF will update the shared page to let * the user process know and flag the buffer as assigned if it hasn't already * been marked assigned due to filling while it was in the store position. * * Note: identical logic as in bpf_zerocopy_buffull(), except that we operate * on bd_hbuf and bd_hlen. */ void bpf_zerocopy_bufheld(struct bpf_d *d) { struct zbuf *zb; KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, ("bpf_zerocopy_bufheld: not in zbuf mode")); zb = (struct zbuf *)d->bd_hbuf; KASSERT(zb != NULL, ("bpf_zerocopy_bufheld: zb == NULL")); if ((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0) { zb->zb_flags |= ZBUF_FLAG_ASSIGNED; zb->zb_header->bzh_kernel_len = d->bd_hlen; atomic_add_rel_int(&zb->zb_header->bzh_kernel_gen, 1); } } /* * Notification from the BPF framework that the free buffer has been been * rotated out of the held position to the free position. This happens when * the user acknowledges the held buffer. */ void bpf_zerocopy_buf_reclaimed(struct bpf_d *d) { struct zbuf *zb; KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, ("bpf_zerocopy_reclaim_buf: not in zbuf mode")); KASSERT(d->bd_fbuf != NULL, ("bpf_zerocopy_buf_reclaimed: NULL free buf")); zb = (struct zbuf *)d->bd_fbuf; zb->zb_flags &= ~ZBUF_FLAG_ASSIGNED; } /* * Query from the BPF framework regarding whether the buffer currently in the * held position can be moved to the free position, which can be indicated by * the user process making their generation number equal to the kernel * generation number. */ int bpf_zerocopy_canfreebuf(struct bpf_d *d) { struct zbuf *zb; KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, ("bpf_zerocopy_canfreebuf: not in zbuf mode")); zb = (struct zbuf *)d->bd_hbuf; if (zb == NULL) return (0); if (zb->zb_header->bzh_kernel_gen == atomic_load_acq_int(&zb->zb_header->bzh_user_gen)) return (1); return (0); } /* * Query from the BPF framework as to whether or not the buffer current in * the store position can actually be written to. This may return false if * the store buffer is assigned to userspace before the hold buffer is * acknowledged. */ int bpf_zerocopy_canwritebuf(struct bpf_d *d) { struct zbuf *zb; KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, ("bpf_zerocopy_canwritebuf: not in zbuf mode")); zb = (struct zbuf *)d->bd_sbuf; KASSERT(zb != NULL, ("bpf_zerocopy_canwritebuf: bd_sbuf NULL")); if (zb->zb_flags & ZBUF_FLAG_ASSIGNED) return (0); return (1); } /* * Free zero copy buffers at request of descriptor. */ void bpf_zerocopy_free(struct bpf_d *d) { struct zbuf *zb; KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, ("bpf_zerocopy_free: not in zbuf mode")); zb = (struct zbuf *)d->bd_sbuf; if (zb != NULL) zbuf_free(zb); zb = (struct zbuf *)d->bd_hbuf; if (zb != NULL) zbuf_free(zb); zb = (struct zbuf *)d->bd_fbuf; if (zb != NULL) zbuf_free(zb); } /* * Ioctl to return the maximum buffer size. */ int bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) { KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, ("bpf_zerocopy_ioctl_getzmax: not in zbuf mode")); *i = BPF_MAX_PAGES * PAGE_SIZE; return (0); } /* * Ioctl to force rotation of the two buffers, if there's any data available. * This can be used by user space to implement timeouts when waiting for a * buffer to fill. */ int bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { struct zbuf *bzh; bzero(bz, sizeof(*bz)); BPFD_LOCK(d); if (d->bd_hbuf == NULL && d->bd_slen != 0) { ROTATE_BUFFERS(d); bzh = (struct zbuf *)d->bd_hbuf; bz->bz_bufa = (void *)bzh->zb_uaddr; bz->bz_buflen = d->bd_hlen; } BPFD_UNLOCK(d); return (0); } /* * Ioctl to configure zero-copy buffers -- may be done only once. */ int bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { struct zbuf *zba, *zbb; int error; KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF, ("bpf_zerocopy_ioctl_setzbuf: not in zbuf mode")); /* * Must set both buffers. Cannot clear them. */ if (bz->bz_bufa == NULL || bz->bz_bufb == NULL) return (EINVAL); /* * Buffers must have a size greater than 0. Alignment and other size * validity checking is done in zbuf_setup(). */ if (bz->bz_buflen == 0) return (EINVAL); /* * Allocate new buffers. */ error = zbuf_setup(td, (vm_offset_t)bz->bz_bufa, bz->bz_buflen, &zba); if (error) return (error); error = zbuf_setup(td, (vm_offset_t)bz->bz_bufb, bz->bz_buflen, &zbb); if (error) { zbuf_free(zba); return (error); } /* * We only allow buffers to be installed once, so atomically check * that no buffers are currently installed and install new buffers. */ BPFD_LOCK(d); if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL || d->bd_bif != NULL) { BPFD_UNLOCK(d); zbuf_free(zba); zbuf_free(zbb); return (EINVAL); } /* * Point BPF descriptor at buffers; initialize sbuf as zba so that * it is always filled first in the sequence, per bpf(4). */ d->bd_fbuf = (caddr_t)zbb; d->bd_sbuf = (caddr_t)zba; d->bd_slen = 0; d->bd_hlen = 0; /* * We expose only the space left in the buffer after the size of the * shared management region. */ d->bd_bufsize = bz->bz_buflen - sizeof(struct bpf_zbuf_header); BPFD_UNLOCK(d); return (0); } Index: user/attilio/rm_vmobj_cache/sys/vm/vm_fault.c =================================================================== --- user/attilio/rm_vmobj_cache/sys/vm/vm_fault.c (revision 267236) +++ user/attilio/rm_vmobj_cache/sys/vm/vm_fault.c (revision 267237) @@ -1,1565 +1,1565 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Page fault handling module. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #define PFBAK 4 #define PFFOR 4 static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *); #define VM_FAULT_READ_BEHIND 8 #define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX) #define VM_FAULT_NINCR (VM_FAULT_READ_MAX / VM_FAULT_READ_BEHIND) #define VM_FAULT_SUM (VM_FAULT_NINCR * (VM_FAULT_NINCR + 1) / 2) #define VM_FAULT_CACHE_BEHIND (VM_FAULT_READ_BEHIND * VM_FAULT_SUM) struct faultstate { vm_page_t m; vm_object_t object; vm_pindex_t pindex; vm_page_t first_m; vm_object_t first_object; vm_pindex_t first_pindex; vm_map_t map; vm_map_entry_t entry; int lookup_still_valid; struct vnode *vp; }; static void vm_fault_cache_behind(const struct faultstate *fs, int distance); static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int faultcount, int reqpage); static inline void release_page(struct faultstate *fs) { vm_page_xunbusy(fs->m); vm_page_lock(fs->m); vm_page_deactivate(fs->m); vm_page_unlock(fs->m); fs->m = NULL; } static inline void unlock_map(struct faultstate *fs) { if (fs->lookup_still_valid) { vm_map_lookup_done(fs->map, fs->entry); fs->lookup_still_valid = FALSE; } } static void unlock_and_deallocate(struct faultstate *fs) { vm_object_pip_wakeup(fs->object); VM_OBJECT_WUNLOCK(fs->object); if (fs->object != fs->first_object) { VM_OBJECT_WLOCK(fs->first_object); vm_page_lock(fs->first_m); vm_page_free(fs->first_m); vm_page_unlock(fs->first_m); vm_object_pip_wakeup(fs->first_object); VM_OBJECT_WUNLOCK(fs->first_object); fs->first_m = NULL; } vm_object_deallocate(fs->first_object); unlock_map(fs); if (fs->vp != NULL) { vput(fs->vp); fs->vp = NULL; } } /* * TRYPAGER - used by vm_fault to calculate whether the pager for the * current object *might* contain the page. * * default objects are zero-fill, there is no real pager. */ #define TRYPAGER (fs.object->type != OBJT_DEFAULT && \ ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 || wired)) /* * vm_fault: * * Handle a page fault occurring at the given address, * requiring the given permissions, in the map specified. * If successful, the page is inserted into the * associated physical map. * * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. * * The map in question must be referenced, and remains so. * Caller may hold no locks. */ int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags) { struct thread *td; int result; td = curthread; if ((td->td_pflags & TDP_NOFAULTING) != 0) return (KERN_PROTECTION_FAILURE); #ifdef KTRACE if (map != kernel_map && KTRPOINT(td, KTR_FAULT)) ktrfault(vaddr, fault_type); #endif result = vm_fault_hold(map, trunc_page(vaddr), fault_type, fault_flags, NULL); #ifdef KTRACE if (map != kernel_map && KTRPOINT(td, KTR_FAULTEND)) ktrfaultend(result); #endif return (result); } int vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { vm_prot_t prot; long ahead, behind; int alloc_req, era, faultcount, nera, reqpage, result; boolean_t growstack, is_first_object_locked, wired; int map_generation; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ_MAX]; int hardfault; struct faultstate fs; struct vnode *vp; int locked, error; hardfault = 0; growstack = TRUE; PCPU_INC(cnt.v_vm_faults); fs.vp = NULL; faultcount = reqpage = 0; RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map) { result = vm_map_growstack(curproc, vaddr); if (result != KERN_SUCCESS) return (KERN_FAILURE); growstack = FALSE; goto RetryFault; } return (result); } map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { if ((curthread->td_pflags & TDP_DEVMEMIO) != 0) { vm_map_unlock_read(fs.map); return (KERN_FAILURE); } panic("vm_fault: fault on nofault entry, addr: %lx", (u_long)vaddr); } if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION && fs.entry->wiring_thread != curthread) { vm_map_unlock_read(fs.map); vm_map_lock(fs.map); if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) && (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) { fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; vm_map_unlock_and_wait(fs.map, 0); } else vm_map_unlock(fs.map); goto RetryFault; } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. This must be done after * obtaining the vnode lock in order to avoid possible deadlocks. */ VM_OBJECT_WLOCK(fs.first_object); vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = TRUE; if (wired) fault_type = prot | (fault_type & VM_PROT_COPY); fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is dead, we stop here */ if (fs.object->flags & OBJ_DEAD) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { /* * Wait/Retry if the page is busy. We have to do this * if the page is either exclusive or shared busy * because the vm_pager may be using read busy for * pageouts (and even pageins if it is the vnode * pager), and we could end up trying to pagein and * pageout the same page simultaneously. * * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a shared busied page except, perhaps, * to pmap it. */ if (vm_page_busied(fs.m)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(fs.m, PGA_REFERENCED); if (fs.object != fs.first_object) { if (!VM_OBJECT_TRYWLOCK( fs.first_object)) { VM_OBJECT_WUNLOCK(fs.object); VM_OBJECT_WLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.object); } vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_WUNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); PCPU_INC(cnt.v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } vm_page_lock(fs.m); vm_page_remque(fs.m); vm_page_unlock(fs.m); /* * Mark page busy for other processes, and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ vm_page_xbusy(fs.m); if (fs.m->valid != VM_PAGE_BITS_ALL) goto readrest; break; } /* * Page is not resident, If this is the search termination * or the pager might contain the page, allocate a new page. */ if (TRYPAGER || fs.object == fs.first_object) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At * worst, the P_KILLED might be not observed * there, and allocation can fail, causing * restart and new reading of the p_flag. */ fs.m = NULL; if (!vm_page_count_severe() || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 if ((fs.object->flags & OBJ_COLORED) == 0) { fs.object->flags |= OBJ_COLORED; fs.object->pg_color = atop(vaddr) - fs.pindex; } #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs.object->type != OBJT_VNODE && fs.object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; if ((fs.object->flags & OBJ_UNMANAGED) != 0) alloc_req |= VM_ALLOC_WIRED; fs.m = vm_page_alloc(fs.object, fs.pindex, alloc_req); } if (fs.m == NULL) { unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; } else if (fs.m->valid == VM_PAGE_BITS_ALL) break; } readrest: /* * We have found a valid page or we have allocated a new page. * The page thus may not be valid or may not be entirely * valid. * * Attempt to fault-in the page if there is a chance that the * pager has it, and potentially fault in additional pages * at the same time. */ if (TRYPAGER) { int rv; u_char behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { behind = 0; ahead = atop(fs.entry->end - vaddr) - 1; if (ahead > VM_FAULT_READ_AHEAD_MAX) ahead = VM_FAULT_READ_AHEAD_MAX; if (fs.pindex == fs.entry->next_read) vm_fault_cache_behind(&fs, VM_FAULT_READ_MAX); } else { /* * If this is a sequential page fault, then * arithmetically increase the number of pages * in the read-ahead window. Otherwise, reset * the read-ahead window to its smallest size. */ behind = atop(vaddr - fs.entry->start); if (behind > VM_FAULT_READ_BEHIND) behind = VM_FAULT_READ_BEHIND; ahead = atop(fs.entry->end - vaddr) - 1; era = fs.entry->read_ahead; if (fs.pindex == fs.entry->next_read) { nera = era + behind; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; behind = 0; if (ahead > nera) ahead = nera; if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_cache_behind(&fs, VM_FAULT_CACHE_BEHIND); } else if (ahead > VM_FAULT_READ_AHEAD_MIN) ahead = VM_FAULT_READ_AHEAD_MIN; if (era != ahead) fs.entry->read_ahead = ahead; } /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. We hold a ref on * fs.object and the pages are exclusive busied. */ unlock_map(&fs); if (fs.object->type == OBJT_VNODE) { vp = fs.object->handle; if (vp == fs.vp) goto vnode_locked; else if (fs.vp != NULL) { vput(fs.vp); fs.vp = NULL; } locked = VOP_ISLOCKED(vp); if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* Do not sleep for vnode lock while fs.m is busy */ error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); if (error != 0) { vhold(vp); release_page(&fs); unlock_and_deallocate(&fs); error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); vdrop(vp); fs.vp = vp; KASSERT(error == 0, ("vm_fault: vget failed")); goto RetryFault; } fs.vp = vp; } vnode_locked: KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. * * fs.m plus the additional pages are exclusive busied. */ faultcount = vm_fault_additional_pages( fs.m, behind, ahead, marray, &reqpage); rv = faultcount ? vm_pager_get_pages(fs.object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (!fs.m) { unlock_and_deallocate(&fs); goto RetryFault; } hardfault++; break; /* break to PAGE HAS BEEN FOUND */ } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. * * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. */ if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; unlock_and_deallocate(&fs); return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (fs.object != fs.first_object) { vm_page_lock(fs.m); vm_page_free(fs.m); vm_page_unlock(fs.m); fs.m = NULL; /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! */ } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_WLOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { PCPU_INC(cnt.v_ozfod); } PCPU_INC(cnt.v_zfod); fs.m->valid = VM_PAGE_BITS_ALL; /* Don't try to prefault neighboring pages. */ faultcount = 1; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_WLOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); fs.object = next_object; } } vm_page_assert_xbusied(fs.m); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. */ is_first_object_locked = FALSE; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { /* * get rid of the unnecessary page */ vm_page_lock(fs.first_m); vm_page_free(fs.first_m); vm_page_unlock(fs.first_m); /* * grab the page and put it into the * process'es object. The page is * automatically made dirty. */ if (vm_page_rename(fs.m, fs.first_object, fs.first_pindex)) { unlock_and_deallocate(&fs); goto RetryFault; } vm_page_xbusy(fs.m); fs.first_m = fs.m; fs.m = NULL; PCPU_INC(cnt.v_cow_optim); } else { /* * Oh, well, lets copy it. */ pmap_copy_page(fs.m, fs.first_m); fs.first_m->valid = VM_PAGE_BITS_ALL; if (wired && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) { vm_page_lock(fs.first_m); vm_page_wire(fs.first_m); vm_page_unlock(fs.first_m); vm_page_lock(fs.m); - vm_page_unwire(fs.m, FALSE); + vm_page_unwire(fs.m, PQ_INACTIVE); vm_page_unlock(fs.m); } /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); VM_OBJECT_WUNLOCK(fs.object); /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_WLOCK(fs.object); PCPU_INC(cnt.v_cow_faults); curthread->td_cow++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = TRUE; if (fs.map->timestamp != map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } } /* * If the page was filled by a pager, update the map entry's * last read offset. Since the pager does not return the * actual set of pages that it read, this update is based on * the requested set. Typically, the requested and actual * sets are the same. * * XXX The following assignment modifies the map * without holding a write lock on it. */ if (hardfault) fs.entry->next_read = fs.pindex + faultcount - reqpage; if ((prot & VM_PROT_WRITE) != 0 || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_object_set_writeable_dirty(fs.object); /* * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if (fs.entry->eflags & MAP_ENTRY_NOSYNC) { if (fs.m->dirty == 0) fs.m->oflags |= VPO_NOSYNC; } else { fs.m->oflags &= ~VPO_NOSYNC; } /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also tell the backing pager, if any, that it should remove * any swap backing since the page is now dirty. */ if (((fault_type & VM_PROT_WRITE) != 0 && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) || (fault_flags & VM_FAULT_DIRTY) != 0) { vm_page_dirty(fs.m); vm_pager_page_unswapped(fs.m); } } vm_page_assert_xbusied(fs.m); /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ KASSERT(fs.m->valid == VM_PAGE_BITS_ALL, ("vm_fault: page %p partially invalid", fs.m)); VM_OBJECT_WUNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired); if (faultcount != 1 && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0) vm_fault_prefault(&fs, vaddr, faultcount, reqpage); VM_OBJECT_WLOCK(fs.object); vm_page_lock(fs.m); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (fault_flags & VM_FAULT_CHANGE_WIRING) { if (wired) vm_page_wire(fs.m); else - vm_page_unwire(fs.m, 1); + vm_page_unwire(fs.m, PQ_ACTIVE); } else vm_page_activate(fs.m); if (m_hold != NULL) { *m_hold = fs.m; vm_page_hold(fs.m); } vm_page_unlock(fs.m); vm_page_xunbusy(fs.m); /* * Unlock everything, and return */ unlock_and_deallocate(&fs); if (hardfault) { PCPU_INC(cnt.v_io_faults); curthread->td_ru.ru_majflt++; } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); } /* * Speed up the reclamation of up to "distance" pages that precede the * faulting pindex within the first object of the shadow chain. */ static void vm_fault_cache_behind(const struct faultstate *fs, int distance) { vm_object_t first_object, object; vm_page_t m, m_prev; vm_pindex_t pindex; object = fs->object; VM_OBJECT_ASSERT_WLOCKED(object); first_object = fs->first_object; if (first_object != object) { if (!VM_OBJECT_TRYWLOCK(first_object)) { VM_OBJECT_WUNLOCK(object); VM_OBJECT_WLOCK(first_object); VM_OBJECT_WLOCK(object); } } /* Neither fictitious nor unmanaged pages can be cached. */ if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) { if (fs->first_pindex < distance) pindex = 0; else pindex = fs->first_pindex - distance; if (pindex < OFF_TO_IDX(fs->entry->offset)) pindex = OFF_TO_IDX(fs->entry->offset); m = first_object != object ? fs->first_m : fs->m; vm_page_assert_xbusied(m); m_prev = vm_page_prev(m); while ((m = m_prev) != NULL && m->pindex >= pindex && m->valid == VM_PAGE_BITS_ALL) { m_prev = vm_page_prev(m); if (vm_page_busied(m)) continue; vm_page_lock(m); if (m->hold_count == 0 && m->wire_count == 0) { pmap_remove_all(m); vm_page_aflag_clear(m, PGA_REFERENCED); if (m->dirty != 0) vm_page_deactivate(m); else vm_page_cache(m); } vm_page_unlock(m); } } if (first_object != object) VM_OBJECT_WUNLOCK(first_object); } /* * vm_fault_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of vm_map_pmap_enter, except it runs at page fault time instead * of mmap time. */ static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int faultcount, int reqpage) { pmap_t pmap; vm_map_entry_t entry; vm_object_t backing_object, lobject; vm_offset_t addr, starta; vm_pindex_t pindex; vm_page_t m; int backward, forward, i; pmap = fs->map->pmap; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) return; if (faultcount > 0) { backward = reqpage; forward = faultcount - reqpage - 1; } else { backward = PFBAK; forward = PFFOR; } entry = fs->entry; starta = addra - backward * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } /* * Generate the sequence of virtual addresses that are candidates for * prefaulting in an outward spiral from the faulting virtual address, * "addra". Specifically, the sequence is "addra - PAGE_SIZE", "addra * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ... * If the candidate address doesn't have a backing physical page, then * the loop immediately terminates. */ for (i = 0; i < 2 * imax(backward, forward); i++) { addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE : PAGE_SIZE); if (addr > addra + forward * PAGE_SIZE) addr = 0; if (addr < starta || addr >= entry->end) continue; if (!pmap_is_prefaultable(pmap, addr)) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = entry->object.vm_object; VM_OBJECT_RLOCK(lobject); while ((m = vm_page_lookup(lobject, pindex)) == NULL && lobject->type == OBJT_DEFAULT && (backing_object = lobject->backing_object) != NULL) { KASSERT((lobject->backing_object_offset & PAGE_MASK) == 0, ("vm_fault_prefault: unaligned object offset")); pindex += lobject->backing_object_offset >> PAGE_SHIFT; VM_OBJECT_RLOCK(backing_object); VM_OBJECT_RUNLOCK(lobject); lobject = backing_object; } if (m == NULL) { VM_OBJECT_RUNLOCK(lobject); break; } if (m->valid == VM_PAGE_BITS_ALL && (m->flags & PG_FICTITIOUS) == 0) pmap_enter_quick(pmap, addr, m, entry->protection); VM_OBJECT_RUNLOCK(lobject); } } /* * Hold each of the physical pages that are mapped by the specified range of * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid * and allow the specified types of access, "prot". If all of the implied * pages are successfully held, then the number of held pages is returned * together with pointers to those pages in the array "ma". However, if any * of the pages cannot be held, -1 is returned. */ int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, vm_prot_t prot, vm_page_t *ma, int max_count) { vm_offset_t end, va; vm_page_t *mp; int count; boolean_t pmap_failed; if (len == 0) return (0); end = round_page(addr + len); addr = trunc_page(addr); /* * Check for illegal addresses. */ if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map)) return (-1); if (atop(end - addr) > max_count) panic("vm_fault_quick_hold_pages: count > max_count"); count = atop(end - addr); /* * Most likely, the physical pages are resident in the pmap, so it is * faster to try pmap_extract_and_hold() first. */ pmap_failed = FALSE; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) { *mp = pmap_extract_and_hold(map->pmap, va, prot); if (*mp == NULL) pmap_failed = TRUE; else if ((prot & VM_PROT_WRITE) != 0 && (*mp)->dirty != VM_PAGE_BITS_ALL) { /* * Explicitly dirty the physical page. Otherwise, the * caller's changes may go unnoticed because they are * performed through an unmanaged mapping or by a DMA * operation. * * The object lock is not held here. * See vm_page_clear_dirty_mask(). */ vm_page_dirty(*mp); } } if (pmap_failed) { /* * One or more pages could not be held by the pmap. Either no * page was mapped at the specified virtual address or that * mapping had insufficient permissions. Attempt to fault in * and hold these pages. */ for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) if (*mp == NULL && vm_fault_hold(map, va, prot, VM_FAULT_NORMAL, mp) != KERN_SUCCESS) goto error; } return (count); error: for (mp = ma; mp < ma + count; mp++) if (*mp != NULL) { vm_page_lock(*mp); vm_page_unhold(*mp); vm_page_unlock(*mp); } return (-1); } /* * vm_fault_wire: * * Wire down a range of virtual addresses in a map. */ int vm_fault_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t fictitious) { vm_offset_t va; int rv; /* * We simulate a fault to get the page and enter it in the physical * map. For user wiring, we only ask for read access on currently * read-only sections. */ for (va = start; va < end; va += PAGE_SIZE) { rv = vm_fault(map, va, VM_PROT_NONE, VM_FAULT_CHANGE_WIRING); if (rv) { if (va != start) vm_fault_unwire(map, start, va, fictitious); return (rv); } } return (KERN_SUCCESS); } /* * vm_fault_unwire: * * Unwire a range of virtual addresses in a map. */ void vm_fault_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t fictitious) { vm_paddr_t pa; vm_offset_t va; vm_page_t m; pmap_t pmap; pmap = vm_map_pmap(map); /* * Since the pages are wired down, we must be able to get their * mappings from the physical map system. */ for (va = start; va < end; va += PAGE_SIZE) { pa = pmap_extract(pmap, va); if (pa != 0) { pmap_change_wiring(pmap, va, FALSE); if (!fictitious) { m = PHYS_TO_VM_PAGE(pa); vm_page_lock(m); - vm_page_unwire(m, TRUE); + vm_page_unwire(m, PQ_ACTIVE); vm_page_unlock(m); } } } } /* * Routine: * vm_fault_copy_entry * Function: * Create new shadow object backing dst_entry with private copy of * all underlying pages. When src_entry is equal to dst_entry, * function implements COW for wired-down map entry. Otherwise, * it forks wired entry into dst_map. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, vm_map_entry_t dst_entry, vm_map_entry_t src_entry, vm_ooffset_t *fork_charge) { vm_object_t backing_object, dst_object, object, src_object; vm_pindex_t dst_pindex, pindex, src_pindex; vm_prot_t access, prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; boolean_t upgrade; #ifdef lint src_map++; #endif /* lint */ upgrade = src_entry == dst_entry; access = prot = dst_entry->protection; src_object = src_entry->object.vm_object; src_pindex = OFF_TO_IDX(src_entry->offset); if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { dst_object = src_object; vm_object_reference(dst_object); } else { /* * Create the top-level object for the destination entry. (Doesn't * actually shadow anything - we copy the pages directly.) */ dst_object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(dst_entry->end - dst_entry->start)); #if VM_NRESERVLEVEL > 0 dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif } VM_OBJECT_WLOCK(dst_object); KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); if (src_object != dst_object) { dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; dst_object->charge = dst_entry->end - dst_entry->start; } if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, ("vm_fault_copy_entry: leaked swp charge")); dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; } else if (dst_object->cred == NULL) { KASSERT(dst_entry->cred != NULL, ("no cred for entry %p", dst_entry)); dst_object->cred = dst_entry->cred; dst_entry->cred = NULL; } /* * If not an upgrade, then enter the mappings in the pmap as * read and/or execute accesses. Otherwise, enter them as * write accesses. * * A writeable large page mapping is only created if all of * the constituent small page mappings are modified. Marking * PTEs as modified on inception allows promotion to happen * without taking potentially large number of soft faults. */ if (!upgrade) access &= ~VM_PROT_WRITE; /* * Loop through all of the virtual pages within the entry's * range, copying each page from the source object to the * destination object. Since the source is wired, those pages * must exist. In contrast, the destination is pageable. * Since the destination object does share any backing storage * with the source object, all of its pages must be dirtied, * regardless of whether they can be written. */ for (vaddr = dst_entry->start, dst_pindex = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_pindex++) { again: /* * Find the page in the source object, and copy it in. * Because the source is wired down, the page will be * in memory. */ if (src_object != dst_object) VM_OBJECT_RLOCK(src_object); object = src_object; pindex = src_pindex + dst_pindex; while ((src_m = vm_page_lookup(object, pindex)) == NULL && (backing_object = object->backing_object) != NULL) { /* * Unless the source mapping is read-only or * it is presently being upgraded from * read-only, the first object in the shadow * chain should provide all of the pages. In * other words, this loop body should never be * executed when the source mapping is already * read/write. */ KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 || upgrade, ("vm_fault_copy_entry: main object missing page")); VM_OBJECT_RLOCK(backing_object); pindex += OFF_TO_IDX(object->backing_object_offset); if (object != dst_object) VM_OBJECT_RUNLOCK(object); object = backing_object; } KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing")); if (object != dst_object) { /* * Allocate a page in the destination object. */ dst_m = vm_page_alloc(dst_object, (src_object == dst_object ? src_pindex : 0) + dst_pindex, VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_OBJECT_WUNLOCK(dst_object); VM_OBJECT_RUNLOCK(object); VM_WAIT; VM_OBJECT_WLOCK(dst_object); goto again; } pmap_copy_page(src_m, dst_m); VM_OBJECT_RUNLOCK(object); dst_m->valid = VM_PAGE_BITS_ALL; dst_m->dirty = VM_PAGE_BITS_ALL; } else { dst_m = src_m; if (vm_page_sleep_if_busy(dst_m, "fltupg")) goto again; vm_page_xbusy(dst_m); KASSERT(dst_m->valid == VM_PAGE_BITS_ALL, ("invalid dst page %p", dst_m)); } VM_OBJECT_WUNLOCK(dst_object); /* * Enter it in the pmap. If a wired, copy-on-write * mapping is being replaced by a write-enabled * mapping, then wire that new mapping. */ pmap_enter(dst_map->pmap, vaddr, access, dst_m, prot, upgrade); /* * Mark it no longer busy, and put it on the active list. */ VM_OBJECT_WLOCK(dst_object); if (upgrade) { if (src_m != dst_m) { vm_page_lock(src_m); - vm_page_unwire(src_m, 0); + vm_page_unwire(src_m, PQ_INACTIVE); vm_page_unlock(src_m); vm_page_lock(dst_m); vm_page_wire(dst_m); vm_page_unlock(dst_m); } else { KASSERT(dst_m->wire_count > 0, ("dst_m %p is not wired", dst_m)); } } else { vm_page_lock(dst_m); vm_page_activate(dst_m); vm_page_unlock(dst_m); } vm_page_xunbusy(dst_m); } VM_OBJECT_WUNLOCK(dst_object); if (upgrade) { dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY); vm_object_deallocate(src_object); } } /* * This routine checks around the requested page for other pages that * might be able to be faulted in. This routine brackets the viable * pages for the pages to be paged in. * * Inputs: * m, rbehind, rahead * * Outputs: * marray (array of vm_page_t), reqpage (index of requested page) * * Return value: * number of pages in marray */ static int vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) vm_page_t m; int rbehind; int rahead; vm_page_t *marray; int *reqpage; { int i,j; vm_object_t object; vm_pindex_t pindex, startpindex, endpindex, tpindex; vm_page_t rtm; int alloc_req, cbehind, cahead; VM_OBJECT_ASSERT_WLOCKED(m->object); object = m->object; pindex = m->pindex; cbehind = cahead = 0; /* * if the requested page is not available, then give up now */ if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) { return 0; } if ((cbehind == 0) && (cahead == 0)) { *reqpage = 0; marray[0] = m; return 1; } if (rahead > cahead) { rahead = cahead; } if (rbehind > cbehind) { rbehind = cbehind; } alloc_req = VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED; if ((object->flags & OBJ_UNMANAGED) != 0) alloc_req |= VM_ALLOC_WIRED; /* * scan backward for the read behind pages -- in memory */ if (pindex > 0) { if (rbehind > pindex) { rbehind = pindex; startpindex = 0; } else { startpindex = pindex - rbehind; } if ((rtm = TAILQ_PREV(m, pglist, listq)) != NULL && rtm->pindex >= startpindex) startpindex = rtm->pindex + 1; /* tpindex is unsigned; beware of numeric underflow. */ for (i = 0, tpindex = pindex - 1; tpindex >= startpindex && tpindex < pindex; i++, tpindex--) { rtm = vm_page_alloc(object, tpindex, alloc_req); if (rtm == NULL) { /* * Shift the allocated pages to the * beginning of the array. */ for (j = 0; j < i; j++) { marray[j] = marray[j + tpindex + 1 - startpindex]; } break; } marray[tpindex - startpindex] = rtm; } } else { startpindex = 0; i = 0; } marray[i] = m; /* page offset of the required page */ *reqpage = i; tpindex = pindex + 1; i++; /* * scan forward for the read ahead pages */ endpindex = tpindex + rahead; if ((rtm = TAILQ_NEXT(m, listq)) != NULL && rtm->pindex < endpindex) endpindex = rtm->pindex; if (endpindex > object->size) endpindex = object->size; for (; tpindex < endpindex; i++, tpindex++) { rtm = vm_page_alloc(object, tpindex, alloc_req); if (rtm == NULL) { break; } marray[i] = rtm; } /* return number of pages */ return i; } /* * Block entry into the machine-independent layer's page fault handler by * the calling thread. Subsequent calls to vm_fault() by that thread will * return KERN_PROTECTION_FAILURE. Enable machine-dependent handling of * spurious page faults. */ int vm_fault_disable_pagefaults(void) { return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR)); } void vm_fault_enable_pagefaults(int save) { curthread_pflags_restore(save); } Index: user/attilio/rm_vmobj_cache/sys/vm/vm_glue.c =================================================================== --- user/attilio/rm_vmobj_cache/sys/vm/vm_glue.c (revision 267236) +++ user/attilio/rm_vmobj_cache/sys/vm/vm_glue.c (revision 267237) @@ -1,1044 +1,1044 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include "opt_kstack_pages.h" #include "opt_kstack_max_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef NO_SWAPPING static int swapout(struct proc *); static void swapclear(struct proc *); static void vm_thread_swapin(struct thread *td); static void vm_thread_swapout(struct thread *td); #endif /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. In most cases * just checking the vm_map_entry is sufficient within the kernel's address * space. */ int kernacc(addr, len, rw) void *addr; int len, rw; { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to kernacc (%x)\n", rw)); if ((vm_offset_t)addr + len > kernel_map->max_offset || (vm_offset_t)addr + len < (vm_offset_t)addr) return (FALSE); prot = rw; saddr = trunc_page((vm_offset_t)addr); eaddr = round_page((vm_offset_t)addr + len); vm_map_lock_read(kernel_map); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); vm_map_unlock_read(kernel_map); return (rv == TRUE); } /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. vmapbuf(), * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be * used in conjuction with this call. */ int useracc(addr, len, rw) void *addr; int len, rw; { boolean_t rv; vm_prot_t prot; vm_map_t map; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to useracc (%x)\n", rw)); prot = rw; map = &curproc->p_vmspace->vm_map; if ((vm_offset_t)addr + len > vm_map_max(map) || (vm_offset_t)addr + len < (vm_offset_t)addr) { return (FALSE); } vm_map_lock_read(map); rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), prot); vm_map_unlock_read(map); return (rv == TRUE); } int vslock(void *addr, size_t len) { vm_offset_t end, last, start; vm_size_t npages; int error; last = (vm_offset_t)addr + len; start = trunc_page((vm_offset_t)addr); end = round_page(last); if (last < (vm_offset_t)addr || end < (vm_offset_t)addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return (ENOMEM); #if 0 /* * XXX - not yet * * The limit for transient usage of wired pages should be * larger than for "permanent" wired pages (mlock()). * * Also, the sysctl code, which is the only present user * of vslock(), does a hard loop on EAGAIN. */ if (npages + vm_cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #endif error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); /* * Return EFAULT on error to match copy{in,out}() behaviour * rather than returning ENOMEM like mlock() would. */ return (error == KERN_SUCCESS ? 0 : EFAULT); } void vsunlock(void *addr, size_t len) { /* Rely on the parameter sanity checks performed by vslock(). */ (void)vm_map_unwire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); } /* * Pin the page contained within the given object at the given offset. If the * page is not resident, allocate and load it using the given object's pager. * Return the pinned page if successful; otherwise, return NULL. */ static vm_page_t vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset) { vm_page_t m, ma[1]; vm_pindex_t pindex; int rv; VM_OBJECT_WLOCK(object); pindex = OFF_TO_IDX(offset); m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); if (m->valid != VM_PAGE_BITS_ALL) { ma[0] = m; rv = vm_pager_get_pages(object, ma, 1, 0); m = vm_page_lookup(object, pindex); if (m == NULL) goto out; if (rv != VM_PAGER_OK) { vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); m = NULL; goto out; } } vm_page_xunbusy(m); vm_page_lock(m); vm_page_hold(m); vm_page_unlock(m); out: VM_OBJECT_WUNLOCK(object); return (m); } /* * Return a CPU private mapping to the page at the given offset within the * given object. The page is pinned before it is mapped. */ struct sf_buf * vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset) { vm_page_t m; m = vm_imgact_hold_page(object, offset); if (m == NULL) return (NULL); sched_pin(); return (sf_buf_alloc(m, SFB_CPUPRIVATE)); } /* * Destroy the given CPU private mapping and unpin the page that it mapped. */ void vm_imgact_unmap_page(struct sf_buf *sf) { vm_page_t m; m = sf_buf_page(sf); sf_buf_free(sf); sched_unpin(); vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } void vm_sync_icache(vm_map_t map, vm_offset_t va, vm_offset_t sz) { pmap_sync_icache(map->pmap, va, sz); } struct kstack_cache_entry *kstack_cache; static int kstack_cache_size = 128; static int kstacks; static struct mtx kstack_cache_mtx; MTX_SYSINIT(kstack_cache, &kstack_cache_mtx, "kstkch", MTX_DEF); SYSCTL_INT(_vm, OID_AUTO, kstack_cache_size, CTLFLAG_RW, &kstack_cache_size, 0, ""); SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0, ""); #ifndef KSTACK_MAX_PAGES #define KSTACK_MAX_PAGES 32 #endif /* * Create the kernel stack (including pcb for i386) for a new thread. * This routine directly affects the fork perf for a process and * create performance for a thread. */ int vm_thread_new(struct thread *td, int pages) { vm_object_t ksobj; vm_offset_t ks; vm_page_t m, ma[KSTACK_MAX_PAGES]; struct kstack_cache_entry *ks_ce; int i; /* Bounds check */ if (pages <= 1) pages = KSTACK_PAGES; else if (pages > KSTACK_MAX_PAGES) pages = KSTACK_MAX_PAGES; if (pages == KSTACK_PAGES) { mtx_lock(&kstack_cache_mtx); if (kstack_cache != NULL) { ks_ce = kstack_cache; kstack_cache = ks_ce->next_ks_entry; mtx_unlock(&kstack_cache_mtx); td->td_kstack_obj = ks_ce->ksobj; td->td_kstack = (vm_offset_t)ks_ce; td->td_kstack_pages = KSTACK_PAGES; return (1); } mtx_unlock(&kstack_cache_mtx); } /* * Allocate an object for the kstack. */ ksobj = vm_object_allocate(OBJT_DEFAULT, pages); /* * Get a kernel virtual address for this thread's kstack. */ #if defined(__mips__) /* * We need to align the kstack's mapped address to fit within * a single TLB entry. */ if (vmem_xalloc(kernel_arena, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE, PAGE_SIZE * 2, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_BESTFIT | M_NOWAIT, &ks)) { ks = 0; } #else ks = kva_alloc((pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); #endif if (ks == 0) { printf("vm_thread_new: kstack allocation failed\n"); vm_object_deallocate(ksobj); return (0); } atomic_add_int(&kstacks, 1); if (KSTACK_GUARD_PAGES != 0) { pmap_qremove(ks, KSTACK_GUARD_PAGES); ks += KSTACK_GUARD_PAGES * PAGE_SIZE; } td->td_kstack_obj = ksobj; td->td_kstack = ks; /* * Knowing the number of pages allocated is useful when you * want to deallocate them. */ td->td_kstack_pages = pages; /* * For the length of the stack, link in a real page of ram for each * page of stack. */ VM_OBJECT_WLOCK(ksobj); for (i = 0; i < pages; i++) { /* * Get a kernel stack page. */ m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); ma[i] = m; m->valid = VM_PAGE_BITS_ALL; } VM_OBJECT_WUNLOCK(ksobj); pmap_qenter(ks, ma, pages); return (1); } static void vm_thread_stack_dispose(vm_object_t ksobj, vm_offset_t ks, int pages) { vm_page_t m; int i; atomic_add_int(&kstacks, -1); pmap_qremove(ks, pages); VM_OBJECT_WLOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_dispose: kstack already missing?"); vm_page_lock(m); - vm_page_unwire(m, 0); + vm_page_unwire(m, PQ_INACTIVE); vm_page_free(m); vm_page_unlock(m); } VM_OBJECT_WUNLOCK(ksobj); vm_object_deallocate(ksobj); kva_free(ks - (KSTACK_GUARD_PAGES * PAGE_SIZE), (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); } /* * Dispose of a thread's kernel stack. */ void vm_thread_dispose(struct thread *td) { vm_object_t ksobj; vm_offset_t ks; struct kstack_cache_entry *ks_ce; int pages; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; ks = td->td_kstack; td->td_kstack = 0; td->td_kstack_pages = 0; if (pages == KSTACK_PAGES && kstacks <= kstack_cache_size) { ks_ce = (struct kstack_cache_entry *)ks; ks_ce->ksobj = ksobj; mtx_lock(&kstack_cache_mtx); ks_ce->next_ks_entry = kstack_cache; kstack_cache = ks_ce; mtx_unlock(&kstack_cache_mtx); return; } vm_thread_stack_dispose(ksobj, ks, pages); } static void vm_thread_stack_lowmem(void *nulll) { struct kstack_cache_entry *ks_ce, *ks_ce1; mtx_lock(&kstack_cache_mtx); ks_ce = kstack_cache; kstack_cache = NULL; mtx_unlock(&kstack_cache_mtx); while (ks_ce != NULL) { ks_ce1 = ks_ce; ks_ce = ks_ce->next_ks_entry; vm_thread_stack_dispose(ks_ce1->ksobj, (vm_offset_t)ks_ce1, KSTACK_PAGES); } } static void kstack_cache_init(void *nulll) { EVENTHANDLER_REGISTER(vm_lowmem, vm_thread_stack_lowmem, NULL, EVENTHANDLER_PRI_ANY); } SYSINIT(vm_kstacks, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, kstack_cache_init, NULL); #ifndef NO_SWAPPING /* * Allow a thread's kernel stack to be paged out. */ static void vm_thread_swapout(struct thread *td) { vm_object_t ksobj; vm_page_t m; int i, pages; cpu_thread_swapout(td); pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; pmap_qremove(td->td_kstack, pages); VM_OBJECT_WLOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_swapout: kstack already missing?"); vm_page_dirty(m); vm_page_lock(m); - vm_page_unwire(m, 0); + vm_page_unwire(m, PQ_INACTIVE); vm_page_unlock(m); } VM_OBJECT_WUNLOCK(ksobj); } /* * Bring the kernel stack for a specified thread back in. */ static void vm_thread_swapin(struct thread *td) { vm_object_t ksobj; vm_page_t ma[KSTACK_MAX_PAGES]; int i, j, k, pages, rv; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; VM_OBJECT_WLOCK(ksobj); for (i = 0; i < pages; i++) ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_WIRED); for (i = 0; i < pages; i++) { if (ma[i]->valid != VM_PAGE_BITS_ALL) { vm_page_assert_xbusied(ma[i]); vm_object_pip_add(ksobj, 1); for (j = i + 1; j < pages; j++) { if (ma[j]->valid != VM_PAGE_BITS_ALL) vm_page_assert_xbusied(ma[j]); if (ma[j]->valid == VM_PAGE_BITS_ALL) break; } rv = vm_pager_get_pages(ksobj, ma + i, j - i, 0); if (rv != VM_PAGER_OK) panic("vm_thread_swapin: cannot get kstack for proc: %d", td->td_proc->p_pid); vm_object_pip_wakeup(ksobj); for (k = i; k < j; k++) ma[k] = vm_page_lookup(ksobj, k); vm_page_xunbusy(ma[i]); } else if (vm_page_xbusied(ma[i])) vm_page_xunbusy(ma[i]); } VM_OBJECT_WUNLOCK(ksobj); pmap_qenter(td->td_kstack, ma, pages); cpu_thread_swapin(td); } #endif /* !NO_SWAPPING */ /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. The new process is set up so that it returns directly * to user mode to avoid stack copying and relocation problems. */ int vm_forkproc(td, p2, td2, vm2, flags) struct thread *td; struct proc *p2; struct thread *td2; struct vmspace *vm2; int flags; { struct proc *p1 = td->td_proc; int error; if ((flags & RFPROC) == 0) { /* * Divorce the memory, if it is shared, essentially * this changes shared memory amongst threads, into * COW locally. */ if ((flags & RFMEM) == 0) { if (p1->p_vmspace->vm_refcnt > 1) { error = vmspace_unshare(p1); if (error) return (error); } } cpu_fork(td, p2, td2, flags); return (0); } if (flags & RFMEM) { p2->p_vmspace = p1->p_vmspace; atomic_add_int(&p1->p_vmspace->vm_refcnt, 1); } while (vm_page_count_severe()) { VM_WAIT; } if ((flags & RFMEM) == 0) { p2->p_vmspace = vm2; if (p1->p_vmspace->vm_shm) shmfork(p1, p2); } /* * cpu_fork will copy and update the pcb, set up the kernel stack, * and make the child ready to run. */ cpu_fork(td, p2, td2, flags); return (0); } /* * Called after process has been wait(2)'ed apon and is being reaped. * The idea is to reclaim resources that we could not reclaim while * the process was still executing. */ void vm_waitproc(p) struct proc *p; { vmspace_exitfree(p); /* and clean-out the vmspace */ } void faultin(p) struct proc *p; { #ifdef NO_SWAPPING PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_INMEM) == 0) panic("faultin: proc swapped out with NO_SWAPPING!"); #else /* !NO_SWAPPING */ struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If another process is swapping in this process, * just wait until it finishes. */ if (p->p_flag & P_SWAPPINGIN) { while (p->p_flag & P_SWAPPINGIN) msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0); return; } if ((p->p_flag & P_INMEM) == 0) { /* * Don't let another thread swap process p out while we are * busy swapping it in. */ ++p->p_lock; p->p_flag |= P_SWAPPINGIN; PROC_UNLOCK(p); /* * We hold no lock here because the list of threads * can not change while all threads in the process are * swapped out. */ FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapin(td); PROC_LOCK(p); swapclear(p); p->p_swtick = ticks; wakeup(&p->p_flag); /* Allow other threads to swap p out now. */ --p->p_lock; } #endif /* NO_SWAPPING */ } /* * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. * * Giant is held on entry. */ void swapper(void) { struct proc *p; struct thread *td; struct proc *pp; int slptime; int swtime; int ppri; int pri; loop: if (vm_page_count_min()) { VM_WAIT; goto loop; } pp = NULL; ppri = INT_MIN; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW || p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) { PROC_UNLOCK(p); continue; } swtime = (ticks - p->p_swtick) / hz; FOREACH_THREAD_IN_PROC(p, td) { /* * An otherwise runnable thread of a process * swapped out has only the TDI_SWAPPED bit set. * */ thread_lock(td); if (td->td_inhibitors == TDI_SWAPPED) { slptime = (ticks - td->td_slptick) / hz; pri = swtime + slptime; if ((td->td_flags & TDF_SWAPINREQ) == 0) pri -= p->p_nice * 8; /* * if this thread is higher priority * and there is enough space, then select * this process instead of the previous * selection. */ if (pri > ppri) { pp = p; ppri = pri; } } thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* * Nothing to do, back to sleep. */ if ((p = pp) == NULL) { tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2); goto loop; } PROC_LOCK(p); /* * Another process may be bringing or may have already * brought this process in while we traverse all threads. * Or, this process may even be being swapped out again. */ if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) { PROC_UNLOCK(p); goto loop; } /* * We would like to bring someone in. (only if there is space). * [What checks the space? ] */ faultin(p); PROC_UNLOCK(p); goto loop; } void kick_proc0(void) { wakeup(&proc0); } #ifndef NO_SWAPPING /* * Swap_idle_threshold1 is the guaranteed swapped in time for a process */ static int swap_idle_threshold1 = 2; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW, &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process"); /* * Swap_idle_threshold2 is the time that a process can be idle before * it will be swapped out, if idle swapping is enabled. */ static int swap_idle_threshold2 = 10; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW, &swap_idle_threshold2, 0, "Time before a process will be swapped out"); /* * First, if any processes have been sleeping or stopped for at least * "swap_idle_threshold1" seconds, they are swapped out. If, however, * no such processes exist, then the longest-sleeping or stopped * process is swapped out. Finally, and only as a last resort, if * there are no sleeping or stopped processes, the longest-resident * process is swapped out. */ void swapout_procs(action) int action; { struct proc *p; struct thread *td; int didswap = 0; retry: sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { struct vmspace *vm; int minslptime = 100000; int slptime; /* * Watch out for a process in * creation. It may have no * address space or lock yet. */ if (p->p_state == PRS_NEW) continue; /* * An aio daemon switches its * address space while running. * Perform a quick check whether * a process has P_SYSTEM. */ if ((p->p_flag & P_SYSTEM) != 0) continue; /* * Do not swapout a process that * is waiting for VM data * structures as there is a possible * deadlock. Test this first as * this may block. * * Lock the map until swapout * finishes, or a thread of this * process may attempt to alter * the map. */ vm = vmspace_acquire_ref(p); if (vm == NULL) continue; if (!vm_map_trylock(&vm->vm_map)) goto nextproc1; PROC_LOCK(p); if (p->p_lock != 0 || (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT) ) != 0) { goto nextproc; } /* * only aiod changes vmspace, however it will be * skipped because of the if statement above checking * for P_SYSTEM */ if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM) goto nextproc; switch (p->p_state) { default: /* Don't swap out processes in any sort * of 'special' state. */ break; case PRS_NORMAL: /* * do not swapout a realtime process * Check all the thread groups.. */ FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (PRI_IS_REALTIME(td->td_pri_class)) { thread_unlock(td); goto nextproc; } slptime = (ticks - td->td_slptick) / hz; /* * Guarantee swap_idle_threshold1 * time in memory. */ if (slptime < swap_idle_threshold1) { thread_unlock(td); goto nextproc; } /* * Do not swapout a process if it is * waiting on a critical event of some * kind or there is a thread whose * pageable memory may be accessed. * * This could be refined to support * swapping out a thread. */ if (!thread_safetoswapout(td)) { thread_unlock(td); goto nextproc; } /* * If the system is under memory stress, * or if we are swapping * idle processes >= swap_idle_threshold2, * then swap the process out. */ if (((action & VM_SWAP_NORMAL) == 0) && (((action & VM_SWAP_IDLE) == 0) || (slptime < swap_idle_threshold2))) { thread_unlock(td); goto nextproc; } if (minslptime > slptime) minslptime = slptime; thread_unlock(td); } /* * If the pageout daemon didn't free enough pages, * or if this process is idle and the system is * configured to swap proactively, swap it out. */ if ((action & VM_SWAP_NORMAL) || ((action & VM_SWAP_IDLE) && (minslptime > swap_idle_threshold2))) { if (swapout(p) == 0) didswap++; PROC_UNLOCK(p); vm_map_unlock(&vm->vm_map); vmspace_free(vm); sx_sunlock(&allproc_lock); goto retry; } } nextproc: PROC_UNLOCK(p); vm_map_unlock(&vm->vm_map); nextproc1: vmspace_free(vm); continue; } sx_sunlock(&allproc_lock); /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. */ if (didswap) wakeup(&proc0); } static void swapclear(p) struct proc *p; { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); td->td_flags |= TDF_INMEM; td->td_flags &= ~TDF_SWAPINREQ; TD_CLR_SWAPPED(td); if (TD_CAN_RUN(td)) if (setrunnable(td)) { #ifdef INVARIANTS /* * XXX: We just cleared TDI_SWAPPED * above and set TDF_INMEM, so this * should never happen. */ panic("not waking up swapper"); #endif } thread_unlock(td); } p->p_flag &= ~(P_SWAPPINGIN|P_SWAPPINGOUT); p->p_flag |= P_INMEM; } static int swapout(p) struct proc *p; { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); #if defined(SWAP_DEBUG) printf("swapping out %d\n", p->p_pid); #endif /* * The states of this process and its threads may have changed * by now. Assuming that there is only one pageout daemon thread, * this process should still be in memory. */ KASSERT((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) == P_INMEM, ("swapout: lost a swapout race?")); /* * remember the process resident count */ p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace); /* * Check and mark all threads before we proceed. */ p->p_flag &= ~P_INMEM; p->p_flag |= P_SWAPPINGOUT; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!thread_safetoswapout(td)) { thread_unlock(td); swapclear(p); return (EBUSY); } td->td_flags &= ~TDF_INMEM; TD_SET_SWAPPED(td); thread_unlock(td); } td = FIRST_THREAD_IN_PROC(p); ++td->td_ru.ru_nswap; PROC_UNLOCK(p); /* * This list is stable because all threads are now prevented from * running. The list is only modified in the context of a running * thread in this process. */ FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapout(td); PROC_LOCK(p); p->p_flag &= ~P_SWAPPINGOUT; p->p_swtick = ticks; return (0); } #endif /* !NO_SWAPPING */ Index: user/attilio/rm_vmobj_cache/sys/vm/vm_page.c =================================================================== --- user/attilio/rm_vmobj_cache/sys/vm/vm_page.c (revision 267236) +++ user/attilio/rm_vmobj_cache/sys/vm/vm_page.c (revision 267237) @@ -1,3210 +1,3212 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * GENERAL RULES ON VM_PAGE MANIPULATION * * - A page queue lock is required when adding or removing a page from a * page queue regardless of other locks or the busy state of a page. * * * In general, no thread besides the page daemon can acquire or * hold more than one page queue lock at a time. * * * The page daemon can acquire and hold any pair of page queue * locks in any order. * * - The object lock is required when inserting or removing * pages from an object (vm_page_insert() or vm_page_remove()). * */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Associated with page of user-allocatable memory is a * page structure. */ struct vm_domain vm_dom[MAXMEMDOM]; struct mtx_padalign vm_page_queue_free_mtx; struct mtx_padalign pa_lock[PA_LOCK_COUNT]; vm_page_t vm_page_array; long vm_page_array_size; long first_page; int vm_page_zero_count; static int boot_pages = UMA_BOOT_PAGES; TUNABLE_INT("vm.boot_pages", &boot_pages); SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0, "number of pages allocated for bootstrapping the VM system"); static int pa_tryrelock_restart; SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD, &pa_tryrelock_restart, 0, "Number of tryrelock restarts"); static uma_zone_t fakepg_zone; static struct vnode *vm_page_alloc_init(vm_page_t m); static void vm_page_cache_turn_free(vm_page_t m); static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits); -static void vm_page_enqueue(int queue, vm_page_t m); +static void vm_page_enqueue(uint8_t queue, vm_page_t m); static void vm_page_init_fakepg(void *dummy); static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred); static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred); SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL); static void vm_page_init_fakepg(void *dummy) { fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); } /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */ #if PAGE_SIZE == 32768 #ifdef CTASSERT CTASSERT(sizeof(u_long) >= 8); #endif #endif /* * Try to acquire a physical address lock while a pmap is locked. If we * fail to trylock we unlock and lock the pmap directly and cache the * locked pa in *locked. The caller should then restart their loop in case * the virtual to physical mapping has changed. */ int vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked) { vm_paddr_t lockpa; lockpa = *locked; *locked = pa; if (lockpa) { PA_LOCK_ASSERT(lockpa, MA_OWNED); if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa)) return (0); PA_UNLOCK(lockpa); } if (PA_TRYLOCK(pa)) return (0); PMAP_UNLOCK(pmap); atomic_add_int(&pa_tryrelock_restart, 1); PA_LOCK(pa); PMAP_LOCK(pmap); return (EAGAIN); } /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { if (vm_cnt.v_page_size == 0) vm_cnt.v_page_size = PAGE_SIZE; if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_blacklist_lookup: * * See if a physical address in this page has been listed * in the blacklist tunable. Entries in the tunable are * separated by spaces or commas. If an invalid integer is * encountered then the rest of the string is skipped. */ static int vm_page_blacklist_lookup(char *list, vm_paddr_t pa) { vm_paddr_t bad; char *cp, *pos; for (pos = list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); if (*cp != '\0') { if (*cp == ' ' || *cp == ',') { cp++; if (cp == pos) continue; } else break; } if (pa == trunc_page(bad)) return (1); } return (0); } static void vm_page_domain_init(struct vm_domain *vmd) { struct vm_pagequeue *pq; int i; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = "vm inactive pagequeue"; *__DECONST(int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = &vm_cnt.v_inactive_count; *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = "vm active pagequeue"; *__DECONST(int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = &vm_cnt.v_active_count; vmd->vmd_page_count = 0; vmd->vmd_free_count = 0; vmd->vmd_segs = 0; vmd->vmd_oom = FALSE; vmd->vmd_pass = 0; for (i = 0; i < PQ_COUNT; i++) { pq = &vmd->vmd_pagequeues[i]; TAILQ_INIT(&pq->pq_pl); mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); } } /* * vm_page_startup: * * Initializes the resident memory module. * * Allocates memory for the page cells, and * for the object/offset-to-page hash table headers. * Each page cell is initialized and placed on the free list. */ vm_offset_t vm_page_startup(vm_offset_t vaddr) { vm_offset_t mapped; vm_paddr_t page_range; vm_paddr_t new_end; int i; vm_paddr_t pa; vm_paddr_t last_pa; char *list; /* the biggest memory array is the second group of pages */ vm_paddr_t end; vm_paddr_t biggestsize; vm_paddr_t low_water, high_water; int biggestone; biggestsize = 0; biggestone = 0; vaddr = round_page(vaddr); for (i = 0; phys_avail[i + 1]; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } low_water = phys_avail[0]; high_water = phys_avail[1]; for (i = 0; phys_avail[i + 1]; i += 2) { vm_paddr_t size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { biggestone = i; biggestsize = size; } if (phys_avail[i] < low_water) low_water = phys_avail[i]; if (phys_avail[i + 1] > high_water) high_water = phys_avail[i + 1]; } #ifdef XEN low_water = 0; #endif end = phys_avail[biggestone+1]; /* * Initialize the page and queue locks. */ mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); for (i = 0; i < vm_ndomains; i++) vm_page_domain_init(&vm_dom[i]); /* * Allocate memory for use when boot strapping the kernel memory * allocator. */ new_end = end - (boot_pages * UMA_SLAB_SIZE); new_end = trunc_page(new_end); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, end - new_end); uma_startup((void *)mapped, boot_pages); #if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \ defined(__mips__) /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. * * The amd64 port needs this to indicate which direct map pages * need to be dumped, via calls to dump_add_page()/dump_drop_page(). * * However, i386 still needs this workspace internally within the * minidump code. In theory, they are not needed on i386, but are * included should the sf_buf code decide to use them. */ last_pa = 0; for (i = 0; dump_avail[i + 1] != 0; i += 2) if (dump_avail[i + 1] > last_pa) last_pa = dump_avail[i + 1]; page_range = last_pa / PAGE_SIZE; vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); new_end -= vm_page_dump_size; vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); #endif #ifdef __amd64__ /* * Request that the physical pages underlying the message buffer be * included in a crash dump. Since the message buffer is accessed * through the direct map, they are not automatically included. */ pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr); last_pa = pa + round_page(msgbufsize); while (pa < last_pa) { dump_add_page(pa); pa += PAGE_SIZE; } #endif /* * Compute the number of pages of memory that will be available for * use (taking into account the overhead of a page structure per * page). */ first_page = low_water / PAGE_SIZE; #ifdef VM_PHYSSEG_SPARSE page_range = 0; for (i = 0; phys_avail[i + 1] != 0; i += 2) page_range += atop(phys_avail[i + 1] - phys_avail[i]); #elif defined(VM_PHYSSEG_DENSE) page_range = high_water / PAGE_SIZE - first_page; #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif end = new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. */ vaddr += PAGE_SIZE; /* * Initialize the mem entry structures now, and put them in the free * queue. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array = (vm_page_t) mapped; #if VM_NRESERVLEVEL > 0 /* * Allocate memory for the reservation management system's data * structures. */ new_end = vm_reserv_startup(&vaddr, new_end, high_water); #endif #if defined(__amd64__) || defined(__mips__) /* * pmap_map on amd64 and mips can come out of the direct-map, not kvm * like i386, so the pages must be tracked for a crashdump to include * this data. This includes the vm_page_array and the early UMA * bootstrap pages. */ for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; /* * Clear all of the page structures */ bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); for (i = 0; i < page_range; i++) vm_page_array[i].order = VM_NFREEORDER; vm_page_array_size = page_range; /* * Initialize the physical memory allocator. */ vm_phys_init(); /* * Add every available physical page that is not blacklisted to * the free lists. */ vm_cnt.v_page_count = 0; vm_cnt.v_free_count = 0; list = getenv("vm.blacklist"); for (i = 0; phys_avail[i + 1] != 0; i += 2) { pa = phys_avail[i]; last_pa = phys_avail[i + 1]; while (pa < last_pa) { if (list != NULL && vm_page_blacklist_lookup(list, pa)) printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); else vm_phys_add_page(pa); pa += PAGE_SIZE; } } freeenv(list); #if VM_NRESERVLEVEL > 0 /* * Initialize the reservation management system. */ vm_reserv_init(); #endif return (vaddr); } void vm_page_reference(vm_page_t m) { vm_page_aflag_set(m, PGA_REFERENCED); } /* * vm_page_busy_downgrade: * * Downgrade an exclusive busy page into a single shared busy page. */ void vm_page_busy_downgrade(vm_page_t m) { u_int x; vm_page_assert_xbusied(m); for (;;) { x = m->busy_lock; x &= VPB_BIT_WAITERS; if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1) | x)) break; } } /* * vm_page_sbusied: * * Return a positive value if the page is shared busied, 0 otherwise. */ int vm_page_sbusied(vm_page_t m) { u_int x; x = m->busy_lock; return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED); } /* * vm_page_sunbusy: * * Shared unbusy a page. */ void vm_page_sunbusy(vm_page_t m) { u_int x; vm_page_assert_sbusied(m); for (;;) { x = m->busy_lock; if (VPB_SHARERS(x) > 1) { if (atomic_cmpset_int(&m->busy_lock, x, x - VPB_ONE_SHARER)) break; continue; } if ((x & VPB_BIT_WAITERS) == 0) { KASSERT(x == VPB_SHARERS_WORD(1), ("vm_page_sunbusy: invalid lock state")); if (atomic_cmpset_int(&m->busy_lock, VPB_SHARERS_WORD(1), VPB_UNBUSIED)) break; continue; } KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS), ("vm_page_sunbusy: invalid lock state for waiters")); vm_page_lock(m); if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) { vm_page_unlock(m); continue; } wakeup(m); vm_page_unlock(m); break; } } /* * vm_page_busy_sleep: * * Sleep and release the page lock, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * * The given page must be locked. */ void vm_page_busy_sleep(vm_page_t m, const char *wmesg) { u_int x; vm_page_lock_assert(m, MA_OWNED); x = m->busy_lock; if (x == VPB_UNBUSIED) { vm_page_unlock(m); return; } if ((x & VPB_BIT_WAITERS) == 0 && !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS)) { vm_page_unlock(m); return; } msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0); } /* * vm_page_trysbusy: * * Try to shared busy a page. * If the operation succeeds 1 is returned otherwise 0. * The operation never sleeps. */ int vm_page_trysbusy(vm_page_t m) { u_int x; for (;;) { x = m->busy_lock; if ((x & VPB_BIT_SHARED) == 0) return (0); if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER)) return (1); } } /* * vm_page_xunbusy_hard: * * Called after the first try the exclusive unbusy of a page failed. * It is assumed that the waiters bit is on. */ void vm_page_xunbusy_hard(vm_page_t m) { vm_page_assert_xbusied(m); vm_page_lock(m); atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); wakeup(m); vm_page_unlock(m); } /* * vm_page_flash: * * Wakeup anyone waiting for the page. * The ownership bits do not change. * * The given page must be locked. */ void vm_page_flash(vm_page_t m) { u_int x; vm_page_lock_assert(m, MA_OWNED); for (;;) { x = m->busy_lock; if ((x & VPB_BIT_WAITERS) == 0) return; if (atomic_cmpset_int(&m->busy_lock, x, x & (~VPB_BIT_WAITERS))) break; } wakeup(m); } /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). */ void vm_page_hold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); mem->hold_count++; } void vm_page_unhold(vm_page_t mem) { vm_page_lock_assert(mem, MA_OWNED); KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!")); --mem->hold_count; if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0) vm_page_free_toq(mem); } /* * vm_page_unhold_pages: * * Unhold each of the pages that is referenced by the given array. */ void vm_page_unhold_pages(vm_page_t *ma, int count) { struct mtx *mtx, *new_mtx; mtx = NULL; for (; count != 0; count--) { /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(*ma); if (mtx != new_mtx) { if (mtx != NULL) mtx_unlock(mtx); mtx = new_mtx; mtx_lock(mtx); } vm_page_unhold(*ma); ma++; } if (mtx != NULL) mtx_unlock(mtx); } vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa) { vm_page_t m; #ifdef VM_PHYSSEG_SPARSE m = vm_phys_paddr_to_vm_page(pa); if (m == NULL) m = vm_phys_fictitious_to_vm_page(pa); return (m); #elif defined(VM_PHYSSEG_DENSE) long pi; pi = atop(pa); if (pi >= first_page && (pi - first_page) < vm_page_array_size) { m = &vm_page_array[pi - first_page]; return (m); } return (vm_phys_fictitious_to_vm_page(pa)); #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif } /* * vm_page_getfake: * * Create a fictitious page with the specified physical address and * memory attribute. The memory attribute is the only the machine- * dependent aspect of a fictitious page that must be initialized. */ vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr) { vm_page_t m; m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO); vm_page_initfake(m, paddr, memattr); return (m); } void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { if ((m->flags & PG_FICTITIOUS) != 0) { /* * The page's memattr might have changed since the * previous initialization. Update the pmap to the * new memattr. */ goto memattr; } m->phys_addr = paddr; m->queue = PQ_NONE; /* Fictitious pages don't use "segind". */ m->flags = PG_FICTITIOUS; /* Fictitious pages don't use "order" or "pool". */ m->oflags = VPO_UNMANAGED; m->busy_lock = VPB_SINGLE_EXCLUSIVER; m->wire_count = 1; pmap_page_init(m); memattr: pmap_page_set_memattr(m, memattr); } /* * vm_page_putfake: * * Release a fictitious page. */ void vm_page_putfake(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m)); KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_putfake: bad page %p", m)); uma_zfree(fakepg_zone, m); } /* * vm_page_updatefake: * * Update the given fictitious page to the specified physical address and * memory attribute. */ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr) { KASSERT((m->flags & PG_FICTITIOUS) != 0, ("vm_page_updatefake: bad page %p", m)); m->phys_addr = paddr; pmap_page_set_memattr(m, memattr); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * Unbusy and handle the page queueing for a page from the VOP_GETPAGES() * array which is not the request page. */ void vm_page_readahead_finish(vm_page_t m) { if (m->valid != 0) { /* * Since the page is not the requested page, whether * it should be activated or deactivated is not * obvious. Empirical results have shown that * deactivating the page is usually the best choice, * unless the page is wanted by another thread. */ vm_page_lock(m); if ((m->busy_lock & VPB_BIT_WAITERS) != 0) vm_page_activate(m); else vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } else { /* * Free the completely invalid page. Such page state * occurs due to the short read operation which did * not covered our page at all, or in case when a read * error happens. */ vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); } } /* * vm_page_sleep_if_busy: * * Sleep and release the page queues lock if the page is busied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must * be locked. */ int vm_page_sleep_if_busy(vm_page_t m, const char *msg) { vm_object_t obj; vm_page_lock_assert(m, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); if (vm_page_busied(m)) { /* * The page-specific object must be cached because page * identity can change during the sleep, causing the * re-lock of a different object. * It is assumed that a reference to the object is already * held by the callers. */ obj = m->object; vm_page_lock(m); VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, msg); VM_OBJECT_WLOCK(obj); return (TRUE); } return (FALSE); } /* * vm_page_dirty_KBI: [ internal use only ] * * Set all bits in the page's dirty field. * * The object containing the specified page must be locked if the * call is made from the machine-independent layer. * * See vm_page_clear_dirty_mask(). * * This function should only be called by vm_page_dirty(). */ void vm_page_dirty_KBI(vm_page_t m) { /* These assertions refer to this operation by its public name. */ KASSERT((m->flags & PG_CACHED) == 0, ("vm_page_dirty: page in cache!")); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("vm_page_dirty: page is invalid!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The object must be locked. */ int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t mpred; VM_OBJECT_ASSERT_WLOCKED(object); mpred = vm_radix_lookup_le(&object->rtree, pindex); return (vm_page_insert_after(m, object, pindex, mpred)); } /* * vm_page_insert_after: * * Inserts the page "m" into the specified object at offset "pindex". * * The page "mpred" must immediately precede the offset "pindex" within * the specified object. * * The object must be locked. */ static int vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex, vm_page_t mpred) { vm_pindex_t sidx; vm_object_t sobj; vm_page_t msucc; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(m->object == NULL, ("vm_page_insert_after: page already inserted")); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) KASSERT(msucc->pindex > pindex, ("vm_page_insert_after: msucc doesn't succeed pindex")); /* * Record the object/offset pair in this page */ sobj = m->object; sidx = m->pindex; m->object = object; m->pindex = pindex; /* * Now link into the object's ordered list of backed pages. */ if (vm_radix_insert(&object->rtree, m)) { m->object = sobj; m->pindex = sidx; return (1); } vm_page_insert_radixdone(m, object, mpred); return (0); } /* * vm_page_insert_radixdone: * * Complete page "m" insertion into the specified object after the * radix trie hooking. * * The page "mpred" must precede the offset "m->pindex" within the * specified object. * * The object must be locked. */ static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object != NULL && m->object == object, ("vm_page_insert_radixdone: page %p has inconsistent object", m)); if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_page_insert_after: object doesn't contain mpred")); KASSERT(mpred->pindex < m->pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); } if (mpred != NULL) TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq); else TAILQ_INSERT_HEAD(&object->memq, m, listq); /* * Show that the object has one more resident page. */ object->resident_page_count++; /* * Hold the vnode until the last page is released. */ if (object->resident_page_count == 1 && object->type == OBJT_VNODE) vhold(object->handle); /* * Since we are inserting a new and possibly dirty page, * update the object's OBJ_MIGHTBEDIRTY flag. */ if (pmap_page_is_write_mapped(m)) vm_object_set_writeable_dirty(object); } /* * vm_page_remove: * * Removes the given mem entry from the object/offset-page * table and the object page list, but do not invalidate/terminate * the backing store. * * The object must be locked. The page must be locked if it is managed. */ void vm_page_remove(vm_page_t m) { vm_object_t object; boolean_t lockacq; if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_lock_assert(m, MA_OWNED); if ((object = m->object) == NULL) return; VM_OBJECT_ASSERT_WLOCKED(object); if (vm_page_xbusied(m)) { lockacq = FALSE; if ((m->oflags & VPO_UNMANAGED) != 0 && !mtx_owned(vm_page_lockptr(m))) { lockacq = TRUE; vm_page_lock(m); } vm_page_flash(m); atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); if (lockacq) vm_page_unlock(m); } /* * Now remove from the object's list of backed pages. */ vm_radix_remove(&object->rtree, m->pindex); TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; /* * The vnode may now be recycled. */ if (object->resident_page_count == 0 && object->type == OBJT_VNODE) vdrop(object->handle); m->object = NULL; } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { VM_OBJECT_ASSERT_LOCKED(object); return (vm_radix_lookup(&object->rtree, pindex)); } /* * vm_page_find_least: * * Returns the page associated with the object with least pindex * greater than or equal to the parameter pindex, or NULL. * * The object must be locked. */ vm_page_t vm_page_find_least(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_ASSERT_LOCKED(object); if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex) m = vm_radix_lookup_ge(&object->rtree, pindex); return (m); } /* * Returns the given page's successor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_next(vm_page_t m) { vm_page_t next; VM_OBJECT_ASSERT_WLOCKED(m->object); if ((next = TAILQ_NEXT(m, listq)) != NULL && next->pindex != m->pindex + 1) next = NULL; return (next); } /* * Returns the given page's predecessor (by pindex) within the object if it is * resident; if none is found, NULL is returned. * * The object must be locked. */ vm_page_t vm_page_prev(vm_page_t m) { vm_page_t prev; VM_OBJECT_ASSERT_WLOCKED(m->object); if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL && prev->pindex != m->pindex - 1) prev = NULL; return (prev); } /* * Uses the page mnew as a replacement for an existing page at index * pindex which must be already present in the object. * * The existing page must not be on a paging queue. */ vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) { vm_page_t mold, mpred; VM_OBJECT_ASSERT_WLOCKED(object); /* * This function mostly follows vm_page_insert() and * vm_page_remove() without the radix, object count and vnode * dance. Double check such functions for more comments. */ mpred = vm_radix_lookup(&object->rtree, pindex); KASSERT(mpred != NULL, ("vm_page_replace: replacing page not present with pindex")); mpred = TAILQ_PREV(mpred, respgs, listq); if (mpred != NULL) KASSERT(mpred->pindex < pindex, ("vm_page_insert_after: mpred doesn't precede pindex")); mnew->object = object; mnew->pindex = pindex; mold = vm_radix_replace(&object->rtree, mnew); KASSERT(mold->queue == PQ_NONE, ("vm_page_replace: mold is on a paging queue")); /* Detach the old page from the resident tailq. */ TAILQ_REMOVE(&object->memq, mold, listq); mold->object = NULL; vm_page_xunbusy(mold); /* Insert the new page in the resident tailq. */ if (mpred != NULL) TAILQ_INSERT_AFTER(&object->memq, mpred, mnew, listq); else TAILQ_INSERT_HEAD(&object->memq, mnew, listq); if (pmap_page_is_write_mapped(mnew)) vm_object_set_writeable_dirty(object); return (mold); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. If the page is on the cache, we have to deactivate it * or vm_page_dirty() will panic. Dirty pages are not allowed * on the cache. * * The objects must be locked. */ int vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_t mpred; vm_pindex_t opidx; VM_OBJECT_ASSERT_WLOCKED(new_object); mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex); KASSERT(mpred == NULL || mpred->pindex != new_pindex, ("vm_page_rename: pindex already renamed")); /* * Create a custom version of vm_page_insert() which does not depend * by m_prev and can cheat on the implementation aspects of the * function. */ opidx = m->pindex; m->pindex = new_pindex; if (vm_radix_insert(&new_object->rtree, m)) { m->pindex = opidx; return (1); } /* * The operation cannot fail anymore. The removal must happen before * the listq iterator is tainted. */ m->pindex = opidx; vm_page_lock(m); vm_page_remove(m); /* Return back to the new pindex to complete vm_page_insert(). */ m->pindex = new_pindex; m->object = new_object; vm_page_unlock(m); vm_page_insert_radixdone(m, new_object, mpred); vm_page_dirty(m); return (0); } /* * Convert all of the given object's cached pages that have a * pindex within the given range into free pages. If the value * zero is given for "end", then the range's upper bound is * infinity. If the given object is backed by a vnode and it * transitions from having one or more cached pages to none, the * vnode's hold count is reduced. */ void vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t m; boolean_t empty; mtx_lock(&vm_page_queue_free_mtx); if (__predict_false(vm_radix_is_empty(&object->cache))) { mtx_unlock(&vm_page_queue_free_mtx); return; } while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) { if (end != 0 && m->pindex >= end) break; vm_radix_remove(&object->cache, m->pindex); vm_page_cache_turn_free(m); } empty = vm_radix_is_empty(&object->cache); mtx_unlock(&vm_page_queue_free_mtx); if (object->type == OBJT_VNODE && empty) vdrop(object->handle); } /* * Returns the cached page that is associated with the given * object and offset. If, however, none exists, returns NULL. * * The free page queue must be locked. */ static inline vm_page_t vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); return (vm_radix_lookup(&object->cache, pindex)); } /* * Remove the given cached page from its containing object's * collection of cached pages. * * The free page queue must be locked. */ static void vm_page_cache_remove(vm_page_t m) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); KASSERT((m->flags & PG_CACHED) != 0, ("vm_page_cache_remove: page %p is not cached", m)); vm_radix_remove(&m->object->cache, m->pindex); m->object = NULL; vm_cnt.v_cache_count--; } /* * Transfer all of the cached pages with offset greater than or * equal to 'offidxstart' from the original object's cache to the * new object's cache. However, any cached pages with offset * greater than or equal to the new object's size are kept in the * original object. Initially, the new object's cache must be * empty. Offset 'offidxstart' in the original object must * correspond to offset zero in the new object. * * The new object must be locked. */ void vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart, vm_object_t new_object) { vm_page_t m; /* * Insertion into an object's collection of cached pages * requires the object to be locked. In contrast, removal does * not. */ VM_OBJECT_ASSERT_WLOCKED(new_object); KASSERT(vm_radix_is_empty(&new_object->cache), ("vm_page_cache_transfer: object %p has cached pages", new_object)); mtx_lock(&vm_page_queue_free_mtx); while ((m = vm_radix_lookup_ge(&orig_object->cache, offidxstart)) != NULL) { /* * Transfer all of the pages with offset greater than or * equal to 'offidxstart' from the original object's * cache to the new object's cache. */ if ((m->pindex - offidxstart) >= new_object->size) break; vm_radix_remove(&orig_object->cache, m->pindex); /* Update the page's object and offset. */ m->object = new_object; m->pindex -= offidxstart; if (vm_radix_insert(&new_object->cache, m)) vm_page_cache_turn_free(m); } mtx_unlock(&vm_page_queue_free_mtx); } /* * Returns TRUE if a cached page is associated with the given object and * offset, and FALSE otherwise. * * The object must be locked. */ boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; /* * Insertion into an object's collection of cached pages requires the * object to be locked. Therefore, if the object is locked and the * object's collection is empty, there is no need to acquire the free * page queues lock in order to prove that the specified page doesn't * exist. */ VM_OBJECT_ASSERT_WLOCKED(object); if (__predict_true(vm_object_cache_is_empty(object))) return (FALSE); mtx_lock(&vm_page_queue_free_mtx); m = vm_page_cache_lookup(object, pindex); mtx_unlock(&vm_page_queue_free_mtx); return (m != NULL); } /* * vm_page_alloc: * * Allocate and return a page that is associated with the specified * object and offset pair. By default, this page is exclusive busied. * * The caller must always specify an allocation class. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_IFCACHED return page only if it is cached * VM_ALLOC_IFNOTCACHED return NULL, do not reactivate if the page * is cached * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NODUMP do not include the page in a kernel core dump * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_WIRED wire the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { struct vnode *vp = NULL; vm_object_t m_object; vm_page_t m, mpred; int flags, req_class, unmanaged; mpred = 0; /* XXX: pacify gcc */ KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object, req)); if (object != NULL) VM_OBJECT_ASSERT_WLOCKED(object); unmanaged = (object == NULL || (object->flags & OBJ_UNMANAGED) != 0); KASSERT(unmanaged == 0 || (req & VM_ALLOC_WIRED) != 0, ("vm_page_alloc: unmanaged but unwired request req(%x)", req)); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; if (object != NULL) { mpred = vm_radix_lookup_le(&object->rtree, pindex); KASSERT(mpred == NULL || mpred->pindex != pindex, ("vm_page_alloc: pindex already allocated")); } /* * The page allocation request can came from consumers which already * hold the free page queue mutex, like vm_page_insert() in * vm_page_cache(). */ mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE); if (vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count + vm_cnt.v_cache_count > 0)) { /* * Allocate from the free queue if the number of free pages * exceeds the minimum for the request class. */ if (object != NULL && (m = vm_page_cache_lookup(object, pindex)) != NULL) { if ((req & VM_ALLOC_IFNOTCACHED) != 0) { mtx_unlock(&vm_page_queue_free_mtx); return (NULL); } if (vm_phys_unfree_page(m)) vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0); #if VM_NRESERVLEVEL > 0 else if (!vm_reserv_reactivate_page(m)) #else else #endif panic("vm_page_alloc: cache page %p is missing" " from the free queue", m); } else if ((req & VM_ALLOC_IFCACHED) != 0) { mtx_unlock(&vm_page_queue_free_mtx); return (NULL); #if VM_NRESERVLEVEL > 0 } else if (object == NULL || (object->flags & (OBJ_COLORED | OBJ_FICTITIOUS)) != OBJ_COLORED || (m = vm_reserv_alloc_page(object, pindex, mpred)) == NULL) { #else } else { #endif m = vm_phys_alloc_pages(object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); #if VM_NRESERVLEVEL > 0 if (m == NULL && vm_reserv_reclaim_inactive()) { m = vm_phys_alloc_pages(object != NULL ? VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0); } #endif } } else { /* * Not allocatable, give up. */ mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); pagedaemon_wakeup(); return (NULL); } /* * At this point we had better have found a good page. */ KASSERT(m != NULL, ("vm_page_alloc: missing page")); KASSERT(m->queue == PQ_NONE, ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue)); KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m)); KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m)); KASSERT(!vm_page_sbusied(m), ("vm_page_alloc: page %p is busy", m)); KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("vm_page_alloc: page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); if ((m->flags & PG_CACHED) != 0) { KASSERT((m->flags & PG_ZERO) == 0, ("vm_page_alloc: cached page %p is PG_ZERO", m)); KASSERT(m->valid != 0, ("vm_page_alloc: cached page %p is invalid", m)); if (m->object == object && m->pindex == pindex) vm_cnt.v_reactivated++; else m->valid = 0; m_object = m->object; vm_page_cache_remove(m); if (m_object->type == OBJT_VNODE && vm_object_cache_is_empty(m_object)) vp = m_object->handle; } else { KASSERT(m->valid == 0, ("vm_page_alloc: free page %p is valid", m)); vm_phys_freecnt_adj(m, -1); if ((m->flags & PG_ZERO) != 0) vm_page_zero_count--; } mtx_unlock(&vm_page_queue_free_mtx); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; flags &= m->flags; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; m->flags = flags; m->aflags = 0; m->oflags = (unmanaged != 0) ? VPO_UNMANAGED : 0; m->busy_lock = VPB_UNBUSIED; if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0) m->busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) m->busy_lock = VPB_SHARERS_WORD(1); if (req & VM_ALLOC_WIRED) { /* * The page lock is not required for wiring a page until that * page is inserted into the object. */ atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; } m->act_count = 0; if (object != NULL) { if (vm_page_insert_after(m, object, pindex, mpred)) { /* See the comment below about hold count. */ if (vp != NULL) vdrop(vp); pagedaemon_wakeup(); if ((req & VM_ALLOC_WIRED) != 0 && unmanaged == 0) { atomic_subtract_int(&vm_cnt.v_wire_count, 1); m->wire_count = 0; } m->object = NULL; vm_page_free(m); return (NULL); } /* Ignore device objects; the pager sets "memattr" for them. */ if (object->memattr != VM_MEMATTR_DEFAULT && (object->flags & OBJ_FICTITIOUS) == 0) pmap_page_set_memattr(m, object->memattr); } else m->pindex = pindex; /* * The following call to vdrop() must come after the above call * to vm_page_insert() in case both affect the same object and * vnode. Otherwise, the affected vnode's hold count could * temporarily become zero. */ if (vp != NULL) vdrop(vp); /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ if (vm_paging_needed()) pagedaemon_wakeup(); return (m); } static void vm_page_alloc_contig_vdrop(struct spglist *lst) { while (!SLIST_EMPTY(lst)) { vdrop((struct vnode *)SLIST_FIRST(lst)-> plinks.s.pv); SLIST_REMOVE_HEAD(lst, plinks.s.ss); } } /* * vm_page_alloc_contig: * * Allocate a contiguous set of physical pages of the given size "npages" * from the free lists. All of the physical pages must be at or above * the given physical address "low" and below the given physical address * "high". The given value "alignment" determines the alignment of the * first physical page in the set. If the given value "boundary" is * non-zero, then the set of physical pages cannot cross any physical * address boundary that is a multiple of that value. Both "alignment" * and "boundary" must be a power of two. * * If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT, * then the memory attribute setting for the physical pages is configured * to the object's memory attribute setting. Otherwise, the memory * attribute setting for the physical pages is configured to "memattr", * overriding the object's memory attribute setting. However, if the * object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the * memory attribute setting for the physical pages cannot be configured * to VM_MEMATTR_DEFAULT. * * The caller must always specify an allocation class. * * The returned pages will all be wired. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_NOBUSY do not exclusive busy the page * VM_ALLOC_NOOBJ page is not associated with an object and * should not be exclusive busy * VM_ALLOC_SBUSY shared busy the allocated page * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_memattr_t memattr) { struct vnode *drop; struct spglist deferred_vdrop_list; vm_page_t m, m_tmp, m_ret; u_int flags; int req_class; KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) && (object != NULL || (req & VM_ALLOC_SBUSY) == 0) && ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) != (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)), ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object, req)); KASSERT((req & VM_ALLOC_WIRED) == 0, ("vm_page_alloc_contig: VM_ALLOC_WIRED passed in req (%x)", req)); if (object != NULL) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_PHYS, ("vm_page_alloc_contig: object %p isn't OBJT_PHYS", object)); } KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero")); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; SLIST_INIT(&deferred_vdrop_list); mtx_lock(&vm_page_queue_free_mtx); if (vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages + vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages + vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages)) { #if VM_NRESERVLEVEL > 0 retry: if (object == NULL || (object->flags & OBJ_COLORED) == 0 || (m_ret = vm_reserv_alloc_contig(object, pindex, npages, low, high, alignment, boundary)) == NULL) #endif m_ret = vm_phys_alloc_contig(npages, low, high, alignment, boundary); } else { mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, npages); pagedaemon_wakeup(); return (NULL); } if (m_ret != NULL) for (m = m_ret; m < &m_ret[npages]; m++) { drop = vm_page_alloc_init(m); if (drop != NULL) { /* * Enqueue the vnode for deferred vdrop(). */ m->plinks.s.pv = drop; SLIST_INSERT_HEAD(&deferred_vdrop_list, m, plinks.s.ss); } } else { #if VM_NRESERVLEVEL > 0 if (vm_reserv_reclaim_contig(npages, low, high, alignment, boundary)) goto retry; #endif } mtx_unlock(&vm_page_queue_free_mtx); if (m_ret == NULL) return (NULL); /* * Initialize the pages. Only the PG_ZERO flag is inherited. */ flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; if ((req & VM_ALLOC_NODUMP) != 0) flags |= PG_NODUMP; atomic_add_int(&vm_cnt.v_wire_count, npages); if (object != NULL) { if (object->memattr != VM_MEMATTR_DEFAULT && memattr == VM_MEMATTR_DEFAULT) memattr = object->memattr; } for (m = m_ret; m < &m_ret[npages]; m++) { m->aflags = 0; m->flags = (m->flags | PG_NODUMP) & flags; m->busy_lock = VPB_UNBUSIED; if (object != NULL) { if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) m->busy_lock = VPB_SINGLE_EXCLUSIVER; if ((req & VM_ALLOC_SBUSY) != 0) m->busy_lock = VPB_SHARERS_WORD(1); } m->wire_count = 1; /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; if (object != NULL) { if (vm_page_insert(m, object, pindex)) { vm_page_alloc_contig_vdrop( &deferred_vdrop_list); if (vm_paging_needed()) pagedaemon_wakeup(); for (m_tmp = m, m = m_ret; m < &m_ret[npages]; m++) { m->wire_count = 1; m->oflags = VPO_UNMANAGED; if (m >= m_tmp) m->object = NULL; vm_page_free(m); } return (NULL); } } else m->pindex = pindex; if (memattr != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, memattr); pindex++; } vm_page_alloc_contig_vdrop(&deferred_vdrop_list); if (vm_paging_needed()) pagedaemon_wakeup(); return (m_ret); } /* * Initialize a page that has been freshly dequeued from a freelist. * The caller has to drop the vnode returned, if it is not NULL. * * This function may only be used to initialize unmanaged pages. * * To be called with vm_page_queue_free_mtx held. */ static struct vnode * vm_page_alloc_init(vm_page_t m) { struct vnode *drop; vm_object_t m_object; KASSERT(m->queue == PQ_NONE, ("vm_page_alloc_init: page %p has unexpected queue %d", m, m->queue)); KASSERT(m->wire_count == 0, ("vm_page_alloc_init: page %p is wired", m)); KASSERT(m->hold_count == 0, ("vm_page_alloc_init: page %p is held", m)); KASSERT(!vm_page_sbusied(m), ("vm_page_alloc_init: page %p is busy", m)); KASSERT(m->dirty == 0, ("vm_page_alloc_init: page %p is dirty", m)); KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT, ("vm_page_alloc_init: page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); drop = NULL; if ((m->flags & PG_CACHED) != 0) { KASSERT((m->flags & PG_ZERO) == 0, ("vm_page_alloc_init: cached page %p is PG_ZERO", m)); m->valid = 0; m_object = m->object; vm_page_cache_remove(m); if (m_object->type == OBJT_VNODE && vm_object_cache_is_empty(m_object)) drop = m_object->handle; } else { KASSERT(m->valid == 0, ("vm_page_alloc_init: free page %p is valid", m)); vm_phys_freecnt_adj(m, -1); if ((m->flags & PG_ZERO) != 0) vm_page_zero_count--; } return (drop); } /* * vm_page_alloc_freelist: * * Allocate a physical page from the specified free page list. * * The caller must always specify an allocation class. * * The returned page will be wired. * * allocation classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * * optional allocation flags: * VM_ALLOC_COUNT(number) the number of additional pages that the caller * intends to allocate * VM_ALLOC_ZERO prefer a zeroed page * * This routine may not sleep. */ vm_page_t vm_page_alloc_freelist(int flind, int req) { struct vnode *drop; vm_page_t m; u_int flags; int req_class; KASSERT((req & VM_ALLOC_WIRED) == 0, ("vm_page_alloc_freelist: VM_ALLOC_WIRED passed in req (%x)", req)); req_class = req & VM_ALLOC_CLASS_MASK; /* * The page daemon is allowed to dig deeper into the free page list. */ if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT) req_class = VM_ALLOC_SYSTEM; /* * Do not allocate reserved pages unless the req has asked for it. */ mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE); if (vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM && vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT && vm_cnt.v_free_count + vm_cnt.v_cache_count > 0)) m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0); else { mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1)); pagedaemon_wakeup(); return (NULL); } if (m == NULL) { mtx_unlock(&vm_page_queue_free_mtx); return (NULL); } drop = vm_page_alloc_init(m); mtx_unlock(&vm_page_queue_free_mtx); /* * Initialize the page. Only the PG_ZERO flag is inherited. */ m->aflags = 0; flags = 0; if ((req & VM_ALLOC_ZERO) != 0) flags = PG_ZERO; m->flags &= flags; /* * The page lock is not required for wiring a page that does * not belong to an object. */ atomic_add_int(&vm_cnt.v_wire_count, 1); m->wire_count = 1; /* Unmanaged pages don't use "act_count". */ m->oflags = VPO_UNMANAGED; if (drop != NULL) vdrop(drop); if (vm_paging_needed()) pagedaemon_wakeup(); return (m); } /* * vm_wait: (also see VM_WAIT macro) * * Sleep until free pages are available for allocation. * - Called in various places before memory allocations. */ void vm_wait(void) { mtx_lock(&vm_page_queue_free_mtx); if (curproc == pageproc) { vm_pageout_pages_needed = 1; msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, PDROP | PSWP, "VMWait", 0); } else { if (!vm_pages_needed) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, "vmwait", 0); } } /* * vm_waitpfault: (also see VM_WAITPFAULT macro) * * Sleep until free pages are available for allocation. * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(void) { mtx_lock(&vm_page_queue_free_mtx); if (!vm_pages_needed) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER, "pfault", 0); } struct vm_pagequeue * vm_page_pagequeue(vm_page_t m) { return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); } /* * vm_page_dequeue: * * Remove the given page from its current page queue. * * The page must be locked. */ void vm_page_dequeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_assert_locked(m); - KASSERT(m->queue == PQ_ACTIVE || m->queue == PQ_INACTIVE, - ("vm_page_dequeue: page %p is not queued", m)); + KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued", + m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); vm_pagequeue_unlock(pq); } /* * vm_page_dequeue_locked: * * Remove the given page from its current page queue. * * The page and page queue must be locked. */ void vm_page_dequeue_locked(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_dec(pq); } /* * vm_page_enqueue: * * Add the given page to the specified page queue. * * The page must be locked. */ static void -vm_page_enqueue(int queue, vm_page_t m) +vm_page_enqueue(uint8_t queue, vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); + KASSERT(queue < PQ_COUNT, + ("vm_page_enqueue: invalid queue %u request for page %m", + queue, m)); + pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); m->queue = queue; TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } /* * vm_page_requeue: * * Move the given page to the tail of its current page queue. * * The page must be locked. */ void vm_page_requeue(vm_page_t m) { struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); KASSERT(m->queue != PQ_NONE, ("vm_page_requeue: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_unlock(pq); } /* * vm_page_requeue_locked: * * Move the given page to the tail of its current page queue. * * The page queue must be locked. */ void vm_page_requeue_locked(vm_page_t m) { struct vm_pagequeue *pq; KASSERT(m->queue != PQ_NONE, ("vm_page_requeue_locked: page %p is not queued", m)); pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * Ensure that act_count is at least ACT_INIT but do not otherwise * mess with it. * * The page must be locked. */ void vm_page_activate(vm_page_t m) { int queue; vm_page_lock_assert(m, MA_OWNED); if ((queue = m->queue) != PQ_ACTIVE) { if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; if (queue != PQ_NONE) vm_page_dequeue(m); vm_page_enqueue(PQ_ACTIVE, m); } else KASSERT(queue == PQ_NONE, ("vm_page_activate: wired page %p is queued", m)); } else { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; } } /* * vm_page_free_wakeup: * * Helper routine for vm_page_free_toq() and vm_page_cache(). This * routine is called when a page has been added to the cache or free * queues. * * The page queues must be locked. */ static inline void vm_page_free_wakeup(void) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* * if pageout daemon needs pages, then tell it that there are * some free. */ if (vm_pageout_pages_needed && vm_cnt.v_cache_count + vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = 0; wakeup(&vm_cnt.v_free_count); } } /* * Turn a cached page into a free page, by changing its attributes. * Keep the statistics up-to-date. * * The free page queue must be locked. */ static void vm_page_cache_turn_free(vm_page_t m) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); m->object = NULL; m->valid = 0; KASSERT((m->flags & PG_CACHED) != 0, ("vm_page_cache_turn_free: page %p is not cached", m)); m->flags &= ~PG_CACHED; vm_cnt.v_cache_count--; vm_phys_freecnt_adj(m, 1); } /* * vm_page_free_toq: * * Returns the given page to the free list, * disassociating it with any VM object. * * The object must be locked. The page must be locked if it is managed. */ void vm_page_free_toq(vm_page_t m) { if ((m->oflags & VPO_UNMANAGED) == 0) { vm_page_lock_assert(m, MA_OWNED); KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_toq: freeing mapped page %p", m)); } else { KASSERT(m->queue == PQ_NONE, ("vm_page_free_toq: unmanaged page %p is queued", m)); KASSERT(m->wire_count == 1, ("vm_page_free_toq: invalid wired count %u for unmanaged page %p", m->wire_count, m)); m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); } PCPU_INC(cnt.v_tfree); if (vm_page_sbusied(m)) panic("vm_page_free: freeing busy page %p", m); /* * Unqueue, then remove page. Note that we cannot destroy * the page here because we do not want to call the pager's * callback routine until after we've put the page on the * appropriate free queue. */ vm_page_remque(m); vm_page_remove(m); /* * If fictitious remove object association and * return, otherwise delay object association removal. */ if ((m->flags & PG_FICTITIOUS) != 0) { return; } m->valid = 0; vm_page_undirty(m); if (m->wire_count != 0) panic("vm_page_free: freeing wired page %p", m); if (m->hold_count != 0) { m->flags &= ~PG_ZERO; KASSERT((m->flags & PG_UNHOLDFREE) == 0, ("vm_page_free: freeing PG_UNHOLDFREE page %p", m)); m->flags |= PG_UNHOLDFREE; } else { /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); /* * Insert the page into the physical memory allocator's * cache/free page queues. */ mtx_lock(&vm_page_queue_free_mtx); vm_phys_freecnt_adj(m, 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #else if (TRUE) #endif vm_phys_free_pages(m, 0); if ((m->flags & PG_ZERO) != 0) ++vm_page_zero_count; else vm_page_zero_idle_wakeup(); vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); } } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * If the page is fictitious, then its wire count must remain one. * * The page must be locked. */ void vm_page_wire(vm_page_t m) { /* * Only bump the wire statistics if the page is not already wired, * and only unqueue the page if it is on some queue (if it is unmanaged * it is already off the queues). */ vm_page_lock_assert(m, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_wire: fictitious page %p's wire count isn't one", m)); return; } if (m->wire_count == 0) { KASSERT((m->oflags & VPO_UNMANAGED) == 0 || m->queue == PQ_NONE, ("vm_page_wire: unmanaged page %p is queued", m)); vm_page_remque(m); atomic_add_int(&vm_cnt.v_wire_count, 1); } m->wire_count++; KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); } /* * vm_page_unwire: * * Release one wiring of the specified page, potentially enabling it to be * paged again. If paging is enabled, then the value of the parameter - * "activate" determines to which queue the page is added. If "activate" is - * non-zero, then the page is added to the active queue. Otherwise, it is - * added to the inactive queue. + * "queue" determines to which queue the page is added. * - * However, unless the page belongs to an object, it is not enqueued because - * it cannot be paged out. + * If a page is fictitious or managed, then its wire count must always be one. * - * If a page is fictitious, then its wire count must always be one. - * * A managed page must be locked. */ void -vm_page_unwire(vm_page_t m, int activate) +vm_page_unwire(vm_page_t m, uint8_t queue) { + KASSERT(queue < PQ_COUNT, + ("vm_page_unwire: invalid queue %u request for page %m", + queue, m)); if ((m->oflags & VPO_UNMANAGED) == 0) vm_page_lock_assert(m, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) { KASSERT(m->wire_count == 1, ("vm_page_unwire: fictitious page %p's wire count isn't one", m)); return; } if (m->wire_count > 0) { m->wire_count--; if (m->wire_count == 0) { if ((m->oflags & VPO_UNMANAGED) != 0 || m->object == NULL) panic("vm_page_unwire: unmanaged page %p's wire count is one", m); atomic_subtract_int(&vm_cnt.v_wire_count, 1); - if (!activate) + if (queue == PQ_INACTIVE) m->flags &= ~PG_WINATCFLS; - vm_page_enqueue(activate ? PQ_ACTIVE : PQ_INACTIVE, m); + vm_page_enqueue(queue, m); } } else panic("vm_page_unwire: page %p's wire count is zero", m); } /* * Move the specified page to the inactive queue. * * Many pages placed on the inactive queue should actually go * into the cache, but it is difficult to figure out which. What * we do instead, if the inactive target is well met, is to put * clean pages at the head of the inactive queue instead of the tail. * This will cause them to be moved to the cache more quickly and * if not actively re-referenced, reclaimed more quickly. If we just * stick these pages at the end of the inactive queue, heavy filesystem * meta-data accesses can cause an unnecessary paging load on memory bound * processes. This optimization causes one-time-use metadata to be * reused more quickly. * * Normally athead is 0 resulting in LRU operation. athead is set * to 1 if we want this page to be 'as if it were placed in the cache', * except without unmapping it from the process address space. * * The page must be locked. */ static inline void _vm_page_deactivate(vm_page_t m, int athead) { struct vm_pagequeue *pq; int queue; vm_page_lock_assert(m, MA_OWNED); /* * Ignore if already inactive. */ if ((queue = m->queue) == PQ_INACTIVE) return; if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) { if (queue != PQ_NONE) vm_page_dequeue(m); m->flags &= ~PG_WINATCFLS; pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE]; vm_pagequeue_lock(pq); m->queue = PQ_INACTIVE; if (athead) TAILQ_INSERT_HEAD(&pq->pq_pl, m, plinks.q); else TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q); vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } } /* * Move the specified page to the inactive queue. * * The page must be locked. */ void vm_page_deactivate(vm_page_t m) { _vm_page_deactivate(m, 0); } /* * vm_page_try_to_cache: * * Returns 0 on failure, 1 on success */ int vm_page_try_to_cache(vm_page_t m) { vm_page_lock_assert(m, MA_OWNED); VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty || m->hold_count || m->wire_count || (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) return (0); pmap_remove_all(m); if (m->dirty) return (0); vm_page_cache(m); return (1); } /* * vm_page_try_to_free() * * Attempt to free the page. If we cannot free it, we do nothing. * 1 is returned on success, 0 on failure. */ int vm_page_try_to_free(vm_page_t m) { vm_page_lock_assert(m, MA_OWNED); if (m->object != NULL) VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty || m->hold_count || m->wire_count || (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m)) return (0); pmap_remove_all(m); if (m->dirty) return (0); vm_page_free(m); return (1); } /* * vm_page_cache * * Put the specified page onto the page cache queue (if appropriate). * * The object and page must be locked. */ void vm_page_cache(vm_page_t m) { vm_object_t object; boolean_t cache_was_empty; vm_page_lock_assert(m, MA_OWNED); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); if (vm_page_busied(m) || (m->oflags & VPO_UNMANAGED) || m->hold_count || m->wire_count) panic("vm_page_cache: attempting to cache busy page"); KASSERT(!pmap_page_is_mapped(m), ("vm_page_cache: page %p is mapped", m)); KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m)); if (m->valid == 0 || object->type == OBJT_DEFAULT || (object->type == OBJT_SWAP && !vm_pager_has_page(object, m->pindex, NULL, NULL))) { /* * Hypothesis: A cache-elgible page belonging to a * default object or swap object but without a backing * store must be zero filled. */ vm_page_free(m); return; } KASSERT((m->flags & PG_CACHED) == 0, ("vm_page_cache: page %p is already cached", m)); /* * Remove the page from the paging queues. */ vm_page_remque(m); /* * Remove the page from the object's collection of resident * pages. */ vm_radix_remove(&object->rtree, m->pindex); TAILQ_REMOVE(&object->memq, m, listq); object->resident_page_count--; /* * Restore the default memory attribute to the page. */ if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT) pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT); /* * Insert the page into the object's collection of cached pages * and the physical memory allocator's cache/free page queues. */ m->flags &= ~PG_ZERO; mtx_lock(&vm_page_queue_free_mtx); cache_was_empty = vm_radix_is_empty(&object->cache); if (vm_radix_insert(&object->cache, m)) { mtx_unlock(&vm_page_queue_free_mtx); if (object->resident_page_count == 0) vdrop(object->handle); m->object = NULL; vm_page_free(m); return; } /* * The above call to vm_radix_insert() could reclaim the one pre- * existing cached page from this object, resulting in a call to * vdrop(). */ if (!cache_was_empty) cache_was_empty = vm_radix_is_singleton(&object->cache); m->flags |= PG_CACHED; vm_cnt.v_cache_count++; PCPU_INC(cnt.v_tcached); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) { #else if (TRUE) { #endif vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0); vm_phys_free_pages(m, 0); } vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); /* * Increment the vnode's hold count if this is the object's only * cached page. Decrement the vnode's hold count if this was * the object's only resident page. */ if (object->type == OBJT_VNODE) { if (cache_was_empty && object->resident_page_count != 0) vhold(object->handle); else if (!cache_was_empty && object->resident_page_count == 0) vdrop(object->handle); } } /* * vm_page_advise * * Cache, deactivate, or do nothing as appropriate. This routine * is used by madvise(). * * Generally speaking we want to move the page into the cache so * it gets reused quickly. However, this can result in a silly syndrome * due to the page recycling too quickly. Small objects will not be * fully cached. On the other hand, if we move the page to the inactive * queue we wind up with a problem whereby very large objects * unnecessarily blow away our inactive and cache queues. * * The solution is to move the pages based on a fixed weighting. We * either leave them alone, deactivate them, or move them to the cache, * where moving them to the cache has the highest weighting. * By forcing some pages into other queues we eventually force the * system to balance the queues, potentially recovering other unrelated * space from active. The idea is to not force this to happen too * often. * * The object and page must be locked. */ void vm_page_advise(vm_page_t m, int advice) { int dnw, head; vm_page_assert_locked(m); VM_OBJECT_ASSERT_WLOCKED(m->object); if (advice == MADV_FREE) { /* * Mark the page clean. This will allow the page to be freed * up by the system. However, such pages are often reused * quickly by malloc() so we do not do anything that would * cause a page fault if we can help it. * * Specifically, we do not try to actually free the page now * nor do we try to put it in the cache (which would cause a * page fault on reuse). * * But we do make the page is freeable as we can without * actually taking the step of unmapping it. */ m->dirty = 0; m->act_count = 0; } else if (advice != MADV_DONTNEED) return; dnw = PCPU_GET(dnweight); PCPU_INC(dnweight); /* * Occasionally leave the page alone. */ if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) { if (m->act_count >= ACT_INIT) --m->act_count; return; } /* * Clear any references to the page. Otherwise, the page daemon will * immediately reactivate the page. */ vm_page_aflag_clear(m, PGA_REFERENCED); if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); if (m->dirty || (dnw & 0x0070) == 0) { /* * Deactivate the page 3 times out of 32. */ head = 0; } else { /* * Cache the page 28 times out of every 32. Note that * the page is deactivated instead of cached, but placed * at the head of the queue instead of the tail. */ head = 1; } _vm_page_deactivate(m, head); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may sleep. * * The object must be locked on entry. The lock will, however, be released * and reacquired if the routine sleeps. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; int sleep; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 || (allocflags & VM_ALLOC_IGN_SBUSY) != 0, ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch")); retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ? vm_page_xbusied(m) : vm_page_busied(m); if (sleep) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); vm_page_lock(m); VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(m, "pgrbwt"); VM_OBJECT_WLOCK(object); goto retrylookup; } else { if ((allocflags & VM_ALLOC_WIRED) != 0) { vm_page_lock(m); vm_page_wire(m); vm_page_unlock(m); } if ((allocflags & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0) vm_page_xbusy(m); if ((allocflags & VM_ALLOC_SBUSY) != 0) vm_page_sbusy(m); return (m); } } m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_IGN_SBUSY); if (m == NULL) { VM_OBJECT_WUNLOCK(object); VM_WAIT; VM_OBJECT_WLOCK(object); goto retrylookup; } else if (m->valid != 0) return (m); if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } /* * Mapping function for valid or dirty bits in a page. * * Inputs are required to range within a page. */ vm_page_bits_t vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return (((vm_page_bits_t)2 << last_bit) - ((vm_page_bits_t)1 << first_bit)); } /* * vm_page_set_valid_range: * * Sets portions of a page valid. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zeroed. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_valid_range(vm_page_t m, int base, int size) { int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = base & ~(DEV_BSIZE - 1)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Assert that no previously invalid block that is now being validated * is already dirty. */ KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0, ("vm_page_set_valid_range: page %p is dirty", m)); /* * Set valid bits inclusive of any overlap. */ m->valid |= vm_page_bits(base, size); } /* * Clear the given bits from the specified page's dirty field. */ static __inline void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits) { uintptr_t addr; #if PAGE_SIZE < 16384 int shift; #endif /* * If the object is locked and the page is neither exclusive busy nor * write mapped, then the page's dirty field cannot possibly be * set by a concurrent pmap operation. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) m->dirty &= ~pagebits; else { /* * The pmap layer can call vm_page_dirty() without * holding a distinguished lock. The combination of * the object's lock and an atomic operation suffice * to guarantee consistency of the page dirty field. * * For PAGE_SIZE == 32768 case, compiler already * properly aligns the dirty field, so no forcible * alignment is needed. Only require existence of * atomic_clear_64 when page size is 32768. */ addr = (uintptr_t)&m->dirty; #if PAGE_SIZE == 32768 atomic_clear_64((uint64_t *)addr, pagebits); #elif PAGE_SIZE == 16384 atomic_clear_32((uint32_t *)addr, pagebits); #else /* PAGE_SIZE <= 8192 */ /* * Use a trick to perform a 32-bit atomic on the * containing aligned word, to not depend on the existence * of atomic_clear_{8, 16}. */ shift = addr & (sizeof(uint32_t) - 1); #if BYTE_ORDER == BIG_ENDIAN shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY; #else shift *= NBBY; #endif addr &= ~(sizeof(uint32_t) - 1); atomic_clear_32((uint32_t *)addr, pagebits << shift); #endif /* PAGE_SIZE */ } } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { vm_page_bits_t oldvalid, pagebits; int endoff, frag; VM_OBJECT_ASSERT_WLOCKED(m->object); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = base & ~(DEV_BSIZE - 1)) != base && (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the VPO_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ oldvalid = m->valid; pagebits = vm_page_bits(base, size); m->valid |= pagebits; #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif if (base == 0 && size == PAGE_SIZE) { /* * The page can only be modified within the pmap if it is * mapped, and it can only be mapped if it was previously * fully valid. */ if (oldvalid == VM_PAGE_BITS_ALL) /* * Perform the pmap_clear_modify() first. Otherwise, * a concurrent pmap operation, such as * pmap_protect(), could clear a modification in the * pmap and set the dirty field on the page before * pmap_clear_modify() had begun and after the dirty * field was cleared here. */ pmap_clear_modify(m); m->dirty = 0; m->oflags &= ~VPO_NOSYNC; } else if (oldvalid != VM_PAGE_BITS_ALL) m->dirty &= ~pagebits; else vm_page_clear_dirty_mask(m, pagebits); } void vm_page_clear_dirty(vm_page_t m, int base, int size) { vm_page_clear_dirty_mask(m, vm_page_bits(base, size)); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { vm_page_bits_t bits; vm_object_t object; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) + size >= object->un_pager.vnp.vnp_size) bits = VM_PAGE_BITS_ALL; else bits = vm_page_bits(base, size); if (m->valid == VM_PAGE_BITS_ALL && bits != 0) pmap_remove_all(m); KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) || !pmap_page_is_mapped(m), ("vm_page_set_invalid: page %p is mapped", m)); m->valid &= ~bits; m->dirty &= ~bits; } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; VM_OBJECT_ASSERT_WLOCKED(m->object); /* * Scan the valid bits looking for invalid sections that * must be zerod. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zerod by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & ((vm_page_bits_t)1 << i))) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistancy * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) m->valid = VM_PAGE_BITS_ALL; } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. */ int vm_page_is_valid(vm_page_t m, int base, int size) { vm_page_bits_t bits; VM_OBJECT_ASSERT_LOCKED(m->object); bits = vm_page_bits(base, size); return (m->valid != 0 && (m->valid & bits) == bits); } /* * vm_page_ps_is_valid: * * Returns TRUE if the entire (super)page is valid and FALSE otherwise. */ boolean_t vm_page_ps_is_valid(vm_page_t m) { int i, npages; VM_OBJECT_ASSERT_LOCKED(m->object); npages = atop(pagesizes[m->psind]); /* * The physically contiguous pages that make up a superpage, i.e., a * page with a page size index ("psind") greater than zero, will * occupy adjacent entries in vm_page_array[]. */ for (i = 0; i < npages; i++) { if (m[i].valid != VM_PAGE_BITS_ALL) return (FALSE); } return (TRUE); } /* * Set the page's dirty bits if the page is modified. */ void vm_page_test_dirty(vm_page_t m) { VM_OBJECT_ASSERT_WLOCKED(m->object); if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) vm_page_dirty(m); } void vm_page_lock_KBI(vm_page_t m, const char *file, int line) { mtx_lock_flags_(vm_page_lockptr(m), 0, file, line); } void vm_page_unlock_KBI(vm_page_t m, const char *file, int line) { mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line); } int vm_page_trylock_KBI(vm_page_t m, const char *file, int line) { return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line)); } #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line) { vm_page_lock_assert_KBI(m, MA_OWNED, file, line); } void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line) { mtx_assert_(vm_page_lockptr(m), a, file, line); } #endif #ifdef INVARIANTS void vm_page_object_lock_assert(vm_page_t m) { /* * Certain of the page's fields may only be modified by the * holder of the containing object's lock or the exclusive busy. * holder. Unfortunately, the holder of the write busy is * not recorded, and thus cannot be checked here. */ if (m->object != NULL && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_WLOCKED(m->object); } #endif #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count); db_printf("vm_cnt.v_cache_count: %d\n", vm_cnt.v_cache_count); db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count); db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count); db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count); db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved); db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min); db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target); db_printf("vm_cnt.v_cache_min: %d\n", vm_cnt.v_cache_min); db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int dom; db_printf("pq_free %d pq_cache %d\n", vm_cnt.v_free_count, vm_cnt.v_cache_count); for (dom = 0; dom < vm_ndomains; dom++) { db_printf( "dom %d page_cnt %d free %d pq_act %d pq_inact %d pass %d\n", dom, vm_dom[dom].vmd_page_count, vm_dom[dom].vmd_free_count, vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, vm_dom[dom].vmd_pass); } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) { vm_page_t m; boolean_t phys; if (!have_addr) { db_printf("show pginfo addr\n"); return; } phys = strchr(modif, 'p') != NULL; if (phys) m = PHYS_TO_VM_PAGE(addr); else m = (vm_page_t)addr; db_printf( "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags, m->flags, m->act_count, m->busy_lock, m->valid, m->dirty); } #endif /* DDB */